xref: /linux/fs/ocfs2/alloc.c (revision c537b994505099b7197e7d3125b942ecbcc51eb6)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
33 
34 #include "ocfs2.h"
35 
36 #include "alloc.h"
37 #include "dlmglue.h"
38 #include "extent_map.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "sysfile.h"
44 #include "file.h"
45 #include "super.h"
46 #include "uptodate.h"
47 
48 #include "buffer_head_io.h"
49 
50 static int ocfs2_extent_contig(struct inode *inode,
51 			       struct ocfs2_extent_rec *ext,
52 			       u64 blkno);
53 
54 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55 				     handle_t *handle,
56 				     struct inode *inode,
57 				     int wanted,
58 				     struct ocfs2_alloc_context *meta_ac,
59 				     struct buffer_head *bhs[]);
60 
61 static int ocfs2_add_branch(struct ocfs2_super *osb,
62 			    handle_t *handle,
63 			    struct inode *inode,
64 			    struct buffer_head *fe_bh,
65 			    struct buffer_head *eb_bh,
66 			    struct buffer_head *last_eb_bh,
67 			    struct ocfs2_alloc_context *meta_ac);
68 
69 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70 				  handle_t *handle,
71 				  struct inode *inode,
72 				  struct buffer_head *fe_bh,
73 				  struct ocfs2_alloc_context *meta_ac,
74 				  struct buffer_head **ret_new_eb_bh);
75 
76 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77 				  handle_t *handle,
78 				  struct inode *inode,
79 				  struct buffer_head *fe_bh,
80 				  u64 blkno,
81 				  u32 new_clusters);
82 
83 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84 				    struct inode *inode,
85 				    struct buffer_head *fe_bh,
86 				    struct buffer_head **target_bh);
87 
88 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89 				       struct inode *inode,
90 				       struct ocfs2_dinode *fe,
91 				       unsigned int new_i_clusters,
92 				       struct buffer_head *old_last_eb,
93 				       struct buffer_head **new_last_eb);
94 
95 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96 
97 static int ocfs2_extent_contig(struct inode *inode,
98 			       struct ocfs2_extent_rec *ext,
99 			       u64 blkno)
100 {
101 	return blkno == (le64_to_cpu(ext->e_blkno) +
102 			 ocfs2_clusters_to_blocks(inode->i_sb,
103 						  le32_to_cpu(ext->e_clusters)));
104 }
105 
106 /*
107  * How many free extents have we got before we need more meta data?
108  */
109 int ocfs2_num_free_extents(struct ocfs2_super *osb,
110 			   struct inode *inode,
111 			   struct ocfs2_dinode *fe)
112 {
113 	int retval;
114 	struct ocfs2_extent_list *el;
115 	struct ocfs2_extent_block *eb;
116 	struct buffer_head *eb_bh = NULL;
117 
118 	mlog_entry_void();
119 
120 	if (!OCFS2_IS_VALID_DINODE(fe)) {
121 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122 		retval = -EIO;
123 		goto bail;
124 	}
125 
126 	if (fe->i_last_eb_blk) {
127 		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 					  &eb_bh, OCFS2_BH_CACHED, inode);
129 		if (retval < 0) {
130 			mlog_errno(retval);
131 			goto bail;
132 		}
133 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134 		el = &eb->h_list;
135 	} else
136 		el = &fe->id2.i_list;
137 
138 	BUG_ON(el->l_tree_depth != 0);
139 
140 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141 bail:
142 	if (eb_bh)
143 		brelse(eb_bh);
144 
145 	mlog_exit(retval);
146 	return retval;
147 }
148 
149 /* expects array to already be allocated
150  *
151  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152  * l_count for you
153  */
154 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155 				     handle_t *handle,
156 				     struct inode *inode,
157 				     int wanted,
158 				     struct ocfs2_alloc_context *meta_ac,
159 				     struct buffer_head *bhs[])
160 {
161 	int count, status, i;
162 	u16 suballoc_bit_start;
163 	u32 num_got;
164 	u64 first_blkno;
165 	struct ocfs2_extent_block *eb;
166 
167 	mlog_entry_void();
168 
169 	count = 0;
170 	while (count < wanted) {
171 		status = ocfs2_claim_metadata(osb,
172 					      handle,
173 					      meta_ac,
174 					      wanted - count,
175 					      &suballoc_bit_start,
176 					      &num_got,
177 					      &first_blkno);
178 		if (status < 0) {
179 			mlog_errno(status);
180 			goto bail;
181 		}
182 
183 		for(i = count;  i < (num_got + count); i++) {
184 			bhs[i] = sb_getblk(osb->sb, first_blkno);
185 			if (bhs[i] == NULL) {
186 				status = -EIO;
187 				mlog_errno(status);
188 				goto bail;
189 			}
190 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191 
192 			status = ocfs2_journal_access(handle, inode, bhs[i],
193 						      OCFS2_JOURNAL_ACCESS_CREATE);
194 			if (status < 0) {
195 				mlog_errno(status);
196 				goto bail;
197 			}
198 
199 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 			/* Ok, setup the minimal stuff here. */
202 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 			eb->h_blkno = cpu_to_le64(first_blkno);
204 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205 
206 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 			/* we always use slot zero's suballocator */
208 			eb->h_suballoc_slot = 0;
209 #else
210 			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211 #endif
212 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213 			eb->h_list.l_count =
214 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215 
216 			suballoc_bit_start++;
217 			first_blkno++;
218 
219 			/* We'll also be dirtied by the caller, so
220 			 * this isn't absolutely necessary. */
221 			status = ocfs2_journal_dirty(handle, bhs[i]);
222 			if (status < 0) {
223 				mlog_errno(status);
224 				goto bail;
225 			}
226 		}
227 
228 		count += num_got;
229 	}
230 
231 	status = 0;
232 bail:
233 	if (status < 0) {
234 		for(i = 0; i < wanted; i++) {
235 			if (bhs[i])
236 				brelse(bhs[i]);
237 			bhs[i] = NULL;
238 		}
239 	}
240 	mlog_exit(status);
241 	return status;
242 }
243 
244 /*
245  * Add an entire tree branch to our inode. eb_bh is the extent block
246  * to start at, if we don't want to start the branch at the dinode
247  * structure.
248  *
249  * last_eb_bh is required as we have to update it's next_leaf pointer
250  * for the new last extent block.
251  *
252  * the new branch will be 'empty' in the sense that every block will
253  * contain a single record with e_clusters == 0.
254  */
255 static int ocfs2_add_branch(struct ocfs2_super *osb,
256 			    handle_t *handle,
257 			    struct inode *inode,
258 			    struct buffer_head *fe_bh,
259 			    struct buffer_head *eb_bh,
260 			    struct buffer_head *last_eb_bh,
261 			    struct ocfs2_alloc_context *meta_ac)
262 {
263 	int status, new_blocks, i;
264 	u64 next_blkno, new_last_eb_blk;
265 	struct buffer_head *bh;
266 	struct buffer_head **new_eb_bhs = NULL;
267 	struct ocfs2_dinode *fe;
268 	struct ocfs2_extent_block *eb;
269 	struct ocfs2_extent_list  *eb_el;
270 	struct ocfs2_extent_list  *el;
271 
272 	mlog_entry_void();
273 
274 	BUG_ON(!last_eb_bh);
275 
276 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
277 
278 	if (eb_bh) {
279 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280 		el = &eb->h_list;
281 	} else
282 		el = &fe->id2.i_list;
283 
284 	/* we never add a branch to a leaf. */
285 	BUG_ON(!el->l_tree_depth);
286 
287 	new_blocks = le16_to_cpu(el->l_tree_depth);
288 
289 	/* allocate the number of new eb blocks we need */
290 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291 			     GFP_KERNEL);
292 	if (!new_eb_bhs) {
293 		status = -ENOMEM;
294 		mlog_errno(status);
295 		goto bail;
296 	}
297 
298 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 					   meta_ac, new_eb_bhs);
300 	if (status < 0) {
301 		mlog_errno(status);
302 		goto bail;
303 	}
304 
305 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 	 * linked with the rest of the tree.
307 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
308 	 *
309 	 * when we leave the loop, new_last_eb_blk will point to the
310 	 * newest leaf, and next_blkno will point to the topmost extent
311 	 * block. */
312 	next_blkno = new_last_eb_blk = 0;
313 	for(i = 0; i < new_blocks; i++) {
314 		bh = new_eb_bhs[i];
315 		eb = (struct ocfs2_extent_block *) bh->b_data;
316 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318 			status = -EIO;
319 			goto bail;
320 		}
321 		eb_el = &eb->h_list;
322 
323 		status = ocfs2_journal_access(handle, inode, bh,
324 					      OCFS2_JOURNAL_ACCESS_CREATE);
325 		if (status < 0) {
326 			mlog_errno(status);
327 			goto bail;
328 		}
329 
330 		eb->h_next_leaf_blk = 0;
331 		eb_el->l_tree_depth = cpu_to_le16(i);
332 		eb_el->l_next_free_rec = cpu_to_le16(1);
333 		eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 		if (!eb_el->l_tree_depth)
337 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338 
339 		status = ocfs2_journal_dirty(handle, bh);
340 		if (status < 0) {
341 			mlog_errno(status);
342 			goto bail;
343 		}
344 
345 		next_blkno = le64_to_cpu(eb->h_blkno);
346 	}
347 
348 	/* This is a bit hairy. We want to update up to three blocks
349 	 * here without leaving any of them in an inconsistent state
350 	 * in case of error. We don't have to worry about
351 	 * journal_dirty erroring as it won't unless we've aborted the
352 	 * handle (in which case we would never be here) so reserving
353 	 * the write with journal_access is all we need to do. */
354 	status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 				      OCFS2_JOURNAL_ACCESS_WRITE);
356 	if (status < 0) {
357 		mlog_errno(status);
358 		goto bail;
359 	}
360 	status = ocfs2_journal_access(handle, inode, fe_bh,
361 				      OCFS2_JOURNAL_ACCESS_WRITE);
362 	if (status < 0) {
363 		mlog_errno(status);
364 		goto bail;
365 	}
366 	if (eb_bh) {
367 		status = ocfs2_journal_access(handle, inode, eb_bh,
368 					      OCFS2_JOURNAL_ACCESS_WRITE);
369 		if (status < 0) {
370 			mlog_errno(status);
371 			goto bail;
372 		}
373 	}
374 
375 	/* Link the new branch into the rest of the tree (el will
376 	 * either be on the fe, or the extent block passed in. */
377 	i = le16_to_cpu(el->l_next_free_rec);
378 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 	el->l_recs[i].e_cpos = fe->i_clusters;
380 	el->l_recs[i].e_clusters = 0;
381 	le16_add_cpu(&el->l_next_free_rec, 1);
382 
383 	/* fe needs a new last extent block pointer, as does the
384 	 * next_leaf on the previously last-extent-block. */
385 	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386 
387 	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389 
390 	status = ocfs2_journal_dirty(handle, last_eb_bh);
391 	if (status < 0)
392 		mlog_errno(status);
393 	status = ocfs2_journal_dirty(handle, fe_bh);
394 	if (status < 0)
395 		mlog_errno(status);
396 	if (eb_bh) {
397 		status = ocfs2_journal_dirty(handle, eb_bh);
398 		if (status < 0)
399 			mlog_errno(status);
400 	}
401 
402 	status = 0;
403 bail:
404 	if (new_eb_bhs) {
405 		for (i = 0; i < new_blocks; i++)
406 			if (new_eb_bhs[i])
407 				brelse(new_eb_bhs[i]);
408 		kfree(new_eb_bhs);
409 	}
410 
411 	mlog_exit(status);
412 	return status;
413 }
414 
415 /*
416  * adds another level to the allocation tree.
417  * returns back the new extent block so you can add a branch to it
418  * after this call.
419  */
420 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421 				  handle_t *handle,
422 				  struct inode *inode,
423 				  struct buffer_head *fe_bh,
424 				  struct ocfs2_alloc_context *meta_ac,
425 				  struct buffer_head **ret_new_eb_bh)
426 {
427 	int status, i;
428 	struct buffer_head *new_eb_bh = NULL;
429 	struct ocfs2_dinode *fe;
430 	struct ocfs2_extent_block *eb;
431 	struct ocfs2_extent_list  *fe_el;
432 	struct ocfs2_extent_list  *eb_el;
433 
434 	mlog_entry_void();
435 
436 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437 					   &new_eb_bh);
438 	if (status < 0) {
439 		mlog_errno(status);
440 		goto bail;
441 	}
442 
443 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446 		status = -EIO;
447 		goto bail;
448 	}
449 
450 	eb_el = &eb->h_list;
451 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 	fe_el = &fe->id2.i_list;
453 
454 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 				      OCFS2_JOURNAL_ACCESS_CREATE);
456 	if (status < 0) {
457 		mlog_errno(status);
458 		goto bail;
459 	}
460 
461 	/* copy the fe data into the new extent block */
462 	eb_el->l_tree_depth = fe_el->l_tree_depth;
463 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 	}
469 
470 	status = ocfs2_journal_dirty(handle, new_eb_bh);
471 	if (status < 0) {
472 		mlog_errno(status);
473 		goto bail;
474 	}
475 
476 	status = ocfs2_journal_access(handle, inode, fe_bh,
477 				      OCFS2_JOURNAL_ACCESS_WRITE);
478 	if (status < 0) {
479 		mlog_errno(status);
480 		goto bail;
481 	}
482 
483 	/* update fe now */
484 	le16_add_cpu(&fe_el->l_tree_depth, 1);
485 	fe_el->l_recs[0].e_cpos = 0;
486 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 	fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 		fe_el->l_recs[i].e_cpos = 0;
490 		fe_el->l_recs[i].e_clusters = 0;
491 		fe_el->l_recs[i].e_blkno = 0;
492 	}
493 	fe_el->l_next_free_rec = cpu_to_le16(1);
494 
495 	/* If this is our 1st tree depth shift, then last_eb_blk
496 	 * becomes the allocated extent block */
497 	if (fe_el->l_tree_depth == cpu_to_le16(1))
498 		fe->i_last_eb_blk = eb->h_blkno;
499 
500 	status = ocfs2_journal_dirty(handle, fe_bh);
501 	if (status < 0) {
502 		mlog_errno(status);
503 		goto bail;
504 	}
505 
506 	*ret_new_eb_bh = new_eb_bh;
507 	new_eb_bh = NULL;
508 	status = 0;
509 bail:
510 	if (new_eb_bh)
511 		brelse(new_eb_bh);
512 
513 	mlog_exit(status);
514 	return status;
515 }
516 
517 /*
518  * Expects the tree to already have room in the rightmost leaf for the
519  * extent.  Updates all the extent blocks (and the dinode) on the way
520  * down.
521  */
522 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 				  handle_t *handle,
524 				  struct inode *inode,
525 				  struct buffer_head *fe_bh,
526 				  u64 start_blk,
527 				  u32 new_clusters)
528 {
529 	int status, i, num_bhs = 0;
530 	u64 next_blkno;
531 	u16 next_free;
532 	struct buffer_head **eb_bhs = NULL;
533 	struct ocfs2_dinode *fe;
534 	struct ocfs2_extent_block *eb;
535 	struct ocfs2_extent_list  *el;
536 
537 	mlog_entry_void();
538 
539 	status = ocfs2_journal_access(handle, inode, fe_bh,
540 				      OCFS2_JOURNAL_ACCESS_WRITE);
541 	if (status < 0) {
542 		mlog_errno(status);
543 		goto bail;
544 	}
545 
546 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 	el = &fe->id2.i_list;
548 	if (el->l_tree_depth) {
549 		/* This is another operation where we want to be
550 		 * careful about our tree updates. An error here means
551 		 * none of the previous changes we made should roll
552 		 * forward. As a result, we have to record the buffers
553 		 * for this part of the tree in an array and reserve a
554 		 * journal write to them before making any changes. */
555 		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 				 GFP_KERNEL);
558 		if (!eb_bhs) {
559 			status = -ENOMEM;
560 			mlog_errno(status);
561 			goto bail;
562 		}
563 
564 		i = 0;
565 		while(el->l_tree_depth) {
566 			next_free = le16_to_cpu(el->l_next_free_rec);
567 			if (next_free == 0) {
568 				ocfs2_error(inode->i_sb,
569 					    "Dinode %llu has a bad extent list",
570 					    (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 				status = -EIO;
572 				goto bail;
573 			}
574 			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575 
576 			BUG_ON(i >= num_bhs);
577 			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 						  OCFS2_BH_CACHED, inode);
579 			if (status < 0) {
580 				mlog_errno(status);
581 				goto bail;
582 			}
583 			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 								 eb);
587 				status = -EIO;
588 				goto bail;
589 			}
590 
591 			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 						      OCFS2_JOURNAL_ACCESS_WRITE);
593 			if (status < 0) {
594 				mlog_errno(status);
595 				goto bail;
596 			}
597 
598 			el = &eb->h_list;
599 			i++;
600 			/* When we leave this loop, eb_bhs[num_bhs - 1] will
601 			 * hold the bottom-most leaf extent block. */
602 		}
603 		BUG_ON(el->l_tree_depth);
604 
605 		el = &fe->id2.i_list;
606 		/* If we have tree depth, then the fe update is
607 		 * trivial, and we want to switch el out for the
608 		 * bottom-most leaf in order to update it with the
609 		 * actual extent data below. */
610 		next_free = le16_to_cpu(el->l_next_free_rec);
611 		if (next_free == 0) {
612 			ocfs2_error(inode->i_sb,
613 				    "Dinode %llu has a bad extent list",
614 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 			status = -EIO;
616 			goto bail;
617 		}
618 		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 			     new_clusters);
620 		/* (num_bhs - 1) to avoid the leaf */
621 		for(i = 0; i < (num_bhs - 1); i++) {
622 			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 			el = &eb->h_list;
624 
625 			/* finally, make our actual change to the
626 			 * intermediate extent blocks. */
627 			next_free = le16_to_cpu(el->l_next_free_rec);
628 			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 				     new_clusters);
630 
631 			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 			if (status < 0)
633 				mlog_errno(status);
634 		}
635 		BUG_ON(i != (num_bhs - 1));
636 		/* note that the leaf block wasn't touched in
637 		 * the loop above */
638 		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 		el = &eb->h_list;
640 		BUG_ON(el->l_tree_depth);
641 	}
642 
643 	/* yay, we can finally add the actual extent now! */
644 	i = le16_to_cpu(el->l_next_free_rec) - 1;
645 	if (le16_to_cpu(el->l_next_free_rec) &&
646 	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 	} else if (le16_to_cpu(el->l_next_free_rec) &&
649 		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 		/* having an empty extent at eof is legal. */
651 		if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 			ocfs2_error(inode->i_sb,
653 				    "Dinode %llu trailing extent is bad: "
654 				    "cpos (%u) != number of clusters (%u)",
655 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 				    le32_to_cpu(el->l_recs[i].e_cpos),
657 				    le32_to_cpu(fe->i_clusters));
658 			status = -EIO;
659 			goto bail;
660 		}
661 		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 	} else {
664 		/* No contiguous record, or no empty record at eof, so
665 		 * we add a new one. */
666 
667 		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 		       le16_to_cpu(el->l_count));
669 		i = le16_to_cpu(el->l_next_free_rec);
670 
671 		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 		el->l_recs[i].e_cpos = fe->i_clusters;
674 		le16_add_cpu(&el->l_next_free_rec, 1);
675 	}
676 
677 	/*
678 	 * extent_map errors are not fatal, so they are ignored outside
679 	 * of flushing the thing.
680 	 */
681 	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 					 new_clusters);
683 	if (status) {
684 		mlog_errno(status);
685 		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 	}
687 
688 	status = ocfs2_journal_dirty(handle, fe_bh);
689 	if (status < 0)
690 		mlog_errno(status);
691 	if (fe->id2.i_list.l_tree_depth) {
692 		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 		if (status < 0)
694 			mlog_errno(status);
695 	}
696 
697 	status = 0;
698 bail:
699 	if (eb_bhs) {
700 		for (i = 0; i < num_bhs; i++)
701 			if (eb_bhs[i])
702 				brelse(eb_bhs[i]);
703 		kfree(eb_bhs);
704 	}
705 
706 	mlog_exit(status);
707 	return status;
708 }
709 
710 /*
711  * Should only be called when there is no space left in any of the
712  * leaf nodes. What we want to do is find the lowest tree depth
713  * non-leaf extent block with room for new records. There are three
714  * valid results of this search:
715  *
716  * 1) a lowest extent block is found, then we pass it back in
717  *    *lowest_eb_bh and return '0'
718  *
719  * 2) the search fails to find anything, but the dinode has room. We
720  *    pass NULL back in *lowest_eb_bh, but still return '0'
721  *
722  * 3) the search fails to find anything AND the dinode is full, in
723  *    which case we return > 0
724  *
725  * return status < 0 indicates an error.
726  */
727 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
728 				    struct inode *inode,
729 				    struct buffer_head *fe_bh,
730 				    struct buffer_head **target_bh)
731 {
732 	int status = 0, i;
733 	u64 blkno;
734 	struct ocfs2_dinode *fe;
735 	struct ocfs2_extent_block *eb;
736 	struct ocfs2_extent_list  *el;
737 	struct buffer_head *bh = NULL;
738 	struct buffer_head *lowest_bh = NULL;
739 
740 	mlog_entry_void();
741 
742 	*target_bh = NULL;
743 
744 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
745 	el = &fe->id2.i_list;
746 
747 	while(le16_to_cpu(el->l_tree_depth) > 1) {
748 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
749 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
750 				    "extent list (next_free_rec == 0)",
751 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
752 			status = -EIO;
753 			goto bail;
754 		}
755 		i = le16_to_cpu(el->l_next_free_rec) - 1;
756 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
757 		if (!blkno) {
758 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
759 				    "list where extent # %d has no physical "
760 				    "block start",
761 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
762 			status = -EIO;
763 			goto bail;
764 		}
765 
766 		if (bh) {
767 			brelse(bh);
768 			bh = NULL;
769 		}
770 
771 		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
772 					  inode);
773 		if (status < 0) {
774 			mlog_errno(status);
775 			goto bail;
776 		}
777 
778 		eb = (struct ocfs2_extent_block *) bh->b_data;
779 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
780 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
781 			status = -EIO;
782 			goto bail;
783 		}
784 		el = &eb->h_list;
785 
786 		if (le16_to_cpu(el->l_next_free_rec) <
787 		    le16_to_cpu(el->l_count)) {
788 			if (lowest_bh)
789 				brelse(lowest_bh);
790 			lowest_bh = bh;
791 			get_bh(lowest_bh);
792 		}
793 	}
794 
795 	/* If we didn't find one and the fe doesn't have any room,
796 	 * then return '1' */
797 	if (!lowest_bh
798 	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
799 		status = 1;
800 
801 	*target_bh = lowest_bh;
802 bail:
803 	if (bh)
804 		brelse(bh);
805 
806 	mlog_exit(status);
807 	return status;
808 }
809 
810 /* the caller needs to update fe->i_clusters */
811 int ocfs2_insert_extent(struct ocfs2_super *osb,
812 			handle_t *handle,
813 			struct inode *inode,
814 			struct buffer_head *fe_bh,
815 			u64 start_blk,
816 			u32 new_clusters,
817 			struct ocfs2_alloc_context *meta_ac)
818 {
819 	int status, i, shift;
820 	struct buffer_head *last_eb_bh = NULL;
821 	struct buffer_head *bh = NULL;
822 	struct ocfs2_dinode *fe;
823 	struct ocfs2_extent_block *eb;
824 	struct ocfs2_extent_list  *el;
825 
826 	mlog_entry_void();
827 
828 	mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829 	     new_clusters, (unsigned long long)start_blk,
830 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 
832 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 	el = &fe->id2.i_list;
834 
835 	if (el->l_tree_depth) {
836 		/* jump to end of tree */
837 		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838 					  &last_eb_bh, OCFS2_BH_CACHED, inode);
839 		if (status < 0) {
840 			mlog_exit(status);
841 			goto bail;
842 		}
843 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
844 		el = &eb->h_list;
845 	}
846 
847 	/* Can we allocate without adding/shifting tree bits? */
848 	i = le16_to_cpu(el->l_next_free_rec) - 1;
849 	if (le16_to_cpu(el->l_next_free_rec) == 0
850 	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851 	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852 	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
853 		goto out_add;
854 
855 	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
856 	     "tree now.\n");
857 
858 	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859 	if (shift < 0) {
860 		status = shift;
861 		mlog_errno(status);
862 		goto bail;
863 	}
864 
865 	/* We traveled all the way to the bottom of the allocation tree
866 	 * and didn't find room for any more extents - we need to add
867 	 * another tree level */
868 	if (shift) {
869 		/* if we hit a leaf, we'd better be empty :) */
870 		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 		       le16_to_cpu(el->l_count));
872 		BUG_ON(bh);
873 		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
874 		     "(current = %u)\n",
875 		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
876 
877 		/* ocfs2_shift_tree_depth will return us a buffer with
878 		 * the new extent block (so we can pass that to
879 		 * ocfs2_add_branch). */
880 		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
881 						meta_ac, &bh);
882 		if (status < 0) {
883 			mlog_errno(status);
884 			goto bail;
885 		}
886 		/* Special case: we have room now if we shifted from
887 		 * tree_depth 0 */
888 		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
889 			goto out_add;
890 	}
891 
892 	/* call ocfs2_add_branch to add the final part of the tree with
893 	 * the new data. */
894 	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
895 	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896 				  meta_ac);
897 	if (status < 0) {
898 		mlog_errno(status);
899 		goto bail;
900 	}
901 
902 out_add:
903 	/* Finally, we can add clusters. */
904 	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
905 					start_blk, new_clusters);
906 	if (status < 0)
907 		mlog_errno(status);
908 
909 bail:
910 	if (bh)
911 		brelse(bh);
912 
913 	if (last_eb_bh)
914 		brelse(last_eb_bh);
915 
916 	mlog_exit(status);
917 	return status;
918 }
919 
920 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
921 {
922 	struct buffer_head *tl_bh = osb->osb_tl_bh;
923 	struct ocfs2_dinode *di;
924 	struct ocfs2_truncate_log *tl;
925 
926 	di = (struct ocfs2_dinode *) tl_bh->b_data;
927 	tl = &di->id2.i_dealloc;
928 
929 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
930 			"slot %d, invalid truncate log parameters: used = "
931 			"%u, count = %u\n", osb->slot_num,
932 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
933 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
934 }
935 
936 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
937 					   unsigned int new_start)
938 {
939 	unsigned int tail_index;
940 	unsigned int current_tail;
941 
942 	/* No records, nothing to coalesce */
943 	if (!le16_to_cpu(tl->tl_used))
944 		return 0;
945 
946 	tail_index = le16_to_cpu(tl->tl_used) - 1;
947 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
948 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
949 
950 	return current_tail == new_start;
951 }
952 
953 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
954 				     handle_t *handle,
955 				     u64 start_blk,
956 				     unsigned int num_clusters)
957 {
958 	int status, index;
959 	unsigned int start_cluster, tl_count;
960 	struct inode *tl_inode = osb->osb_tl_inode;
961 	struct buffer_head *tl_bh = osb->osb_tl_bh;
962 	struct ocfs2_dinode *di;
963 	struct ocfs2_truncate_log *tl;
964 
965 	mlog_entry("start_blk = %llu, num_clusters = %u\n",
966 		   (unsigned long long)start_blk, num_clusters);
967 
968 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
969 
970 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
971 
972 	di = (struct ocfs2_dinode *) tl_bh->b_data;
973 	tl = &di->id2.i_dealloc;
974 	if (!OCFS2_IS_VALID_DINODE(di)) {
975 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
976 		status = -EIO;
977 		goto bail;
978 	}
979 
980 	tl_count = le16_to_cpu(tl->tl_count);
981 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
982 			tl_count == 0,
983 			"Truncate record count on #%llu invalid "
984 			"wanted %u, actual %u\n",
985 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
986 			ocfs2_truncate_recs_per_inode(osb->sb),
987 			le16_to_cpu(tl->tl_count));
988 
989 	/* Caller should have known to flush before calling us. */
990 	index = le16_to_cpu(tl->tl_used);
991 	if (index >= tl_count) {
992 		status = -ENOSPC;
993 		mlog_errno(status);
994 		goto bail;
995 	}
996 
997 	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 				      OCFS2_JOURNAL_ACCESS_WRITE);
999 	if (status < 0) {
1000 		mlog_errno(status);
1001 		goto bail;
1002 	}
1003 
1004 	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 	     "%llu (index = %d)\n", num_clusters, start_cluster,
1006 	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
1007 
1008 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009 		/*
1010 		 * Move index back to the record we are coalescing with.
1011 		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012 		 */
1013 		index--;
1014 
1015 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 		     index, le32_to_cpu(tl->tl_recs[index].t_start),
1018 		     num_clusters);
1019 	} else {
1020 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 		tl->tl_used = cpu_to_le16(index + 1);
1022 	}
1023 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024 
1025 	status = ocfs2_journal_dirty(handle, tl_bh);
1026 	if (status < 0) {
1027 		mlog_errno(status);
1028 		goto bail;
1029 	}
1030 
1031 bail:
1032 	mlog_exit(status);
1033 	return status;
1034 }
1035 
1036 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 					 handle_t *handle,
1038 					 struct inode *data_alloc_inode,
1039 					 struct buffer_head *data_alloc_bh)
1040 {
1041 	int status = 0;
1042 	int i;
1043 	unsigned int num_clusters;
1044 	u64 start_blk;
1045 	struct ocfs2_truncate_rec rec;
1046 	struct ocfs2_dinode *di;
1047 	struct ocfs2_truncate_log *tl;
1048 	struct inode *tl_inode = osb->osb_tl_inode;
1049 	struct buffer_head *tl_bh = osb->osb_tl_bh;
1050 
1051 	mlog_entry_void();
1052 
1053 	di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 	tl = &di->id2.i_dealloc;
1055 	i = le16_to_cpu(tl->tl_used) - 1;
1056 	while (i >= 0) {
1057 		/* Caller has given us at least enough credits to
1058 		 * update the truncate log dinode */
1059 		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 					      OCFS2_JOURNAL_ACCESS_WRITE);
1061 		if (status < 0) {
1062 			mlog_errno(status);
1063 			goto bail;
1064 		}
1065 
1066 		tl->tl_used = cpu_to_le16(i);
1067 
1068 		status = ocfs2_journal_dirty(handle, tl_bh);
1069 		if (status < 0) {
1070 			mlog_errno(status);
1071 			goto bail;
1072 		}
1073 
1074 		/* TODO: Perhaps we can calculate the bulk of the
1075 		 * credits up front rather than extending like
1076 		 * this. */
1077 		status = ocfs2_extend_trans(handle,
1078 					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 		if (status < 0) {
1080 			mlog_errno(status);
1081 			goto bail;
1082 		}
1083 
1084 		rec = tl->tl_recs[i];
1085 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 						    le32_to_cpu(rec.t_start));
1087 		num_clusters = le32_to_cpu(rec.t_clusters);
1088 
1089 		/* if start_blk is not set, we ignore the record as
1090 		 * invalid. */
1091 		if (start_blk) {
1092 			mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 			     i, le32_to_cpu(rec.t_start), num_clusters);
1094 
1095 			status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 						     data_alloc_bh, start_blk,
1097 						     num_clusters);
1098 			if (status < 0) {
1099 				mlog_errno(status);
1100 				goto bail;
1101 			}
1102 		}
1103 		i--;
1104 	}
1105 
1106 bail:
1107 	mlog_exit(status);
1108 	return status;
1109 }
1110 
1111 /* Expects you to already be holding tl_inode->i_mutex */
1112 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113 {
1114 	int status;
1115 	unsigned int num_to_flush;
1116 	handle_t *handle;
1117 	struct inode *tl_inode = osb->osb_tl_inode;
1118 	struct inode *data_alloc_inode = NULL;
1119 	struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 	struct buffer_head *data_alloc_bh = NULL;
1121 	struct ocfs2_dinode *di;
1122 	struct ocfs2_truncate_log *tl;
1123 
1124 	mlog_entry_void();
1125 
1126 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
1127 
1128 	di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 	tl = &di->id2.i_dealloc;
1130 	if (!OCFS2_IS_VALID_DINODE(di)) {
1131 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 		status = -EIO;
1133 		goto out;
1134 	}
1135 
1136 	num_to_flush = le16_to_cpu(tl->tl_used);
1137 	mlog(0, "Flush %u records from truncate log #%llu\n",
1138 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
1139 	if (!num_to_flush) {
1140 		status = 0;
1141 		goto out;
1142 	}
1143 
1144 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
1145 						       GLOBAL_BITMAP_SYSTEM_INODE,
1146 						       OCFS2_INVALID_SLOT);
1147 	if (!data_alloc_inode) {
1148 		status = -EINVAL;
1149 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
1150 		goto out;
1151 	}
1152 
1153 	mutex_lock(&data_alloc_inode->i_mutex);
1154 
1155 	status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
1156 	if (status < 0) {
1157 		mlog_errno(status);
1158 		goto out_mutex;
1159 	}
1160 
1161 	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
1162 	if (IS_ERR(handle)) {
1163 		status = PTR_ERR(handle);
1164 		mlog_errno(status);
1165 		goto out_unlock;
1166 	}
1167 
1168 	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1169 					       data_alloc_bh);
1170 	if (status < 0)
1171 		mlog_errno(status);
1172 
1173 	ocfs2_commit_trans(osb, handle);
1174 
1175 out_unlock:
1176 	brelse(data_alloc_bh);
1177 	ocfs2_meta_unlock(data_alloc_inode, 1);
1178 
1179 out_mutex:
1180 	mutex_unlock(&data_alloc_inode->i_mutex);
1181 	iput(data_alloc_inode);
1182 
1183 out:
1184 	mlog_exit(status);
1185 	return status;
1186 }
1187 
1188 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1189 {
1190 	int status;
1191 	struct inode *tl_inode = osb->osb_tl_inode;
1192 
1193 	mutex_lock(&tl_inode->i_mutex);
1194 	status = __ocfs2_flush_truncate_log(osb);
1195 	mutex_unlock(&tl_inode->i_mutex);
1196 
1197 	return status;
1198 }
1199 
1200 static void ocfs2_truncate_log_worker(struct work_struct *work)
1201 {
1202 	int status;
1203 	struct ocfs2_super *osb =
1204 		container_of(work, struct ocfs2_super,
1205 			     osb_truncate_log_wq.work);
1206 
1207 	mlog_entry_void();
1208 
1209 	status = ocfs2_flush_truncate_log(osb);
1210 	if (status < 0)
1211 		mlog_errno(status);
1212 
1213 	mlog_exit(status);
1214 }
1215 
1216 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1217 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1218 				       int cancel)
1219 {
1220 	if (osb->osb_tl_inode) {
1221 		/* We want to push off log flushes while truncates are
1222 		 * still running. */
1223 		if (cancel)
1224 			cancel_delayed_work(&osb->osb_truncate_log_wq);
1225 
1226 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1227 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1228 	}
1229 }
1230 
1231 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1232 				       int slot_num,
1233 				       struct inode **tl_inode,
1234 				       struct buffer_head **tl_bh)
1235 {
1236 	int status;
1237 	struct inode *inode = NULL;
1238 	struct buffer_head *bh = NULL;
1239 
1240 	inode = ocfs2_get_system_file_inode(osb,
1241 					   TRUNCATE_LOG_SYSTEM_INODE,
1242 					   slot_num);
1243 	if (!inode) {
1244 		status = -EINVAL;
1245 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1246 		goto bail;
1247 	}
1248 
1249 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1250 				  OCFS2_BH_CACHED, inode);
1251 	if (status < 0) {
1252 		iput(inode);
1253 		mlog_errno(status);
1254 		goto bail;
1255 	}
1256 
1257 	*tl_inode = inode;
1258 	*tl_bh    = bh;
1259 bail:
1260 	mlog_exit(status);
1261 	return status;
1262 }
1263 
1264 /* called during the 1st stage of node recovery. we stamp a clean
1265  * truncate log and pass back a copy for processing later. if the
1266  * truncate log does not require processing, a *tl_copy is set to
1267  * NULL. */
1268 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1269 				      int slot_num,
1270 				      struct ocfs2_dinode **tl_copy)
1271 {
1272 	int status;
1273 	struct inode *tl_inode = NULL;
1274 	struct buffer_head *tl_bh = NULL;
1275 	struct ocfs2_dinode *di;
1276 	struct ocfs2_truncate_log *tl;
1277 
1278 	*tl_copy = NULL;
1279 
1280 	mlog(0, "recover truncate log from slot %d\n", slot_num);
1281 
1282 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1283 	if (status < 0) {
1284 		mlog_errno(status);
1285 		goto bail;
1286 	}
1287 
1288 	di = (struct ocfs2_dinode *) tl_bh->b_data;
1289 	tl = &di->id2.i_dealloc;
1290 	if (!OCFS2_IS_VALID_DINODE(di)) {
1291 		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1292 		status = -EIO;
1293 		goto bail;
1294 	}
1295 
1296 	if (le16_to_cpu(tl->tl_used)) {
1297 		mlog(0, "We'll have %u logs to recover\n",
1298 		     le16_to_cpu(tl->tl_used));
1299 
1300 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1301 		if (!(*tl_copy)) {
1302 			status = -ENOMEM;
1303 			mlog_errno(status);
1304 			goto bail;
1305 		}
1306 
1307 		/* Assuming the write-out below goes well, this copy
1308 		 * will be passed back to recovery for processing. */
1309 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1310 
1311 		/* All we need to do to clear the truncate log is set
1312 		 * tl_used. */
1313 		tl->tl_used = 0;
1314 
1315 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
1316 		if (status < 0) {
1317 			mlog_errno(status);
1318 			goto bail;
1319 		}
1320 	}
1321 
1322 bail:
1323 	if (tl_inode)
1324 		iput(tl_inode);
1325 	if (tl_bh)
1326 		brelse(tl_bh);
1327 
1328 	if (status < 0 && (*tl_copy)) {
1329 		kfree(*tl_copy);
1330 		*tl_copy = NULL;
1331 	}
1332 
1333 	mlog_exit(status);
1334 	return status;
1335 }
1336 
1337 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1338 					 struct ocfs2_dinode *tl_copy)
1339 {
1340 	int status = 0;
1341 	int i;
1342 	unsigned int clusters, num_recs, start_cluster;
1343 	u64 start_blk;
1344 	handle_t *handle;
1345 	struct inode *tl_inode = osb->osb_tl_inode;
1346 	struct ocfs2_truncate_log *tl;
1347 
1348 	mlog_entry_void();
1349 
1350 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1351 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1352 		return -EINVAL;
1353 	}
1354 
1355 	tl = &tl_copy->id2.i_dealloc;
1356 	num_recs = le16_to_cpu(tl->tl_used);
1357 	mlog(0, "cleanup %u records from %llu\n", num_recs,
1358 	     (unsigned long long)tl_copy->i_blkno);
1359 
1360 	mutex_lock(&tl_inode->i_mutex);
1361 	for(i = 0; i < num_recs; i++) {
1362 		if (ocfs2_truncate_log_needs_flush(osb)) {
1363 			status = __ocfs2_flush_truncate_log(osb);
1364 			if (status < 0) {
1365 				mlog_errno(status);
1366 				goto bail_up;
1367 			}
1368 		}
1369 
1370 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
1371 		if (IS_ERR(handle)) {
1372 			status = PTR_ERR(handle);
1373 			mlog_errno(status);
1374 			goto bail_up;
1375 		}
1376 
1377 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1378 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1379 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1380 
1381 		status = ocfs2_truncate_log_append(osb, handle,
1382 						   start_blk, clusters);
1383 		ocfs2_commit_trans(osb, handle);
1384 		if (status < 0) {
1385 			mlog_errno(status);
1386 			goto bail_up;
1387 		}
1388 	}
1389 
1390 bail_up:
1391 	mutex_unlock(&tl_inode->i_mutex);
1392 
1393 	mlog_exit(status);
1394 	return status;
1395 }
1396 
1397 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1398 {
1399 	int status;
1400 	struct inode *tl_inode = osb->osb_tl_inode;
1401 
1402 	mlog_entry_void();
1403 
1404 	if (tl_inode) {
1405 		cancel_delayed_work(&osb->osb_truncate_log_wq);
1406 		flush_workqueue(ocfs2_wq);
1407 
1408 		status = ocfs2_flush_truncate_log(osb);
1409 		if (status < 0)
1410 			mlog_errno(status);
1411 
1412 		brelse(osb->osb_tl_bh);
1413 		iput(osb->osb_tl_inode);
1414 	}
1415 
1416 	mlog_exit_void();
1417 }
1418 
1419 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1420 {
1421 	int status;
1422 	struct inode *tl_inode = NULL;
1423 	struct buffer_head *tl_bh = NULL;
1424 
1425 	mlog_entry_void();
1426 
1427 	status = ocfs2_get_truncate_log_info(osb,
1428 					     osb->slot_num,
1429 					     &tl_inode,
1430 					     &tl_bh);
1431 	if (status < 0)
1432 		mlog_errno(status);
1433 
1434 	/* ocfs2_truncate_log_shutdown keys on the existence of
1435 	 * osb->osb_tl_inode so we don't set any of the osb variables
1436 	 * until we're sure all is well. */
1437 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
1438 			  ocfs2_truncate_log_worker);
1439 	osb->osb_tl_bh    = tl_bh;
1440 	osb->osb_tl_inode = tl_inode;
1441 
1442 	mlog_exit(status);
1443 	return status;
1444 }
1445 
1446 /* This function will figure out whether the currently last extent
1447  * block will be deleted, and if it will, what the new last extent
1448  * block will be so we can update his h_next_leaf_blk field, as well
1449  * as the dinodes i_last_eb_blk */
1450 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1451 				       struct inode *inode,
1452 				       struct ocfs2_dinode *fe,
1453 				       u32 new_i_clusters,
1454 				       struct buffer_head *old_last_eb,
1455 				       struct buffer_head **new_last_eb)
1456 {
1457 	int i, status = 0;
1458 	u64 block = 0;
1459 	struct ocfs2_extent_block *eb;
1460 	struct ocfs2_extent_list *el;
1461 	struct buffer_head *bh = NULL;
1462 
1463 	*new_last_eb = NULL;
1464 
1465 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1466 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1467 		status = -EIO;
1468 		goto bail;
1469 	}
1470 
1471 	/* we have no tree, so of course, no last_eb. */
1472 	if (!fe->id2.i_list.l_tree_depth)
1473 		goto bail;
1474 
1475 	/* trunc to zero special case - this makes tree_depth = 0
1476 	 * regardless of what it is.  */
1477 	if (!new_i_clusters)
1478 		goto bail;
1479 
1480 	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1481 	el = &(eb->h_list);
1482 	BUG_ON(!el->l_next_free_rec);
1483 
1484 	/* Make sure that this guy will actually be empty after we
1485 	 * clear away the data. */
1486 	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1487 		goto bail;
1488 
1489 	/* Ok, at this point, we know that last_eb will definitely
1490 	 * change, so lets traverse the tree and find the second to
1491 	 * last extent block. */
1492 	el = &(fe->id2.i_list);
1493 	/* go down the tree, */
1494 	do {
1495 		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1496 			if (le32_to_cpu(el->l_recs[i].e_cpos) <
1497 			    new_i_clusters) {
1498 				block = le64_to_cpu(el->l_recs[i].e_blkno);
1499 				break;
1500 			}
1501 		}
1502 		BUG_ON(i < 0);
1503 
1504 		if (bh) {
1505 			brelse(bh);
1506 			bh = NULL;
1507 		}
1508 
1509 		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1510 					 inode);
1511 		if (status < 0) {
1512 			mlog_errno(status);
1513 			goto bail;
1514 		}
1515 		eb = (struct ocfs2_extent_block *) bh->b_data;
1516 		el = &eb->h_list;
1517 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1518 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1519 			status = -EIO;
1520 			goto bail;
1521 		}
1522 	} while (el->l_tree_depth);
1523 
1524 	*new_last_eb = bh;
1525 	get_bh(*new_last_eb);
1526 	mlog(0, "returning block %llu\n",
1527 	     (unsigned long long)le64_to_cpu(eb->h_blkno));
1528 bail:
1529 	if (bh)
1530 		brelse(bh);
1531 
1532 	return status;
1533 }
1534 
1535 static int ocfs2_do_truncate(struct ocfs2_super *osb,
1536 			     unsigned int clusters_to_del,
1537 			     struct inode *inode,
1538 			     struct buffer_head *fe_bh,
1539 			     struct buffer_head *old_last_eb_bh,
1540 			     handle_t *handle,
1541 			     struct ocfs2_truncate_context *tc)
1542 {
1543 	int status, i, depth;
1544 	struct ocfs2_dinode *fe;
1545 	struct ocfs2_extent_block *eb;
1546 	struct ocfs2_extent_block *last_eb = NULL;
1547 	struct ocfs2_extent_list *el;
1548 	struct buffer_head *eb_bh = NULL;
1549 	struct buffer_head *last_eb_bh = NULL;
1550 	u64 next_eb = 0;
1551 	u64 delete_blk = 0;
1552 
1553 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
1554 
1555 	status = ocfs2_find_new_last_ext_blk(osb,
1556 					     inode,
1557 					     fe,
1558 					     le32_to_cpu(fe->i_clusters) -
1559 					     		clusters_to_del,
1560 					     old_last_eb_bh,
1561 					     &last_eb_bh);
1562 	if (status < 0) {
1563 		mlog_errno(status);
1564 		goto bail;
1565 	}
1566 	if (last_eb_bh)
1567 		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1568 
1569 	status = ocfs2_journal_access(handle, inode, fe_bh,
1570 				      OCFS2_JOURNAL_ACCESS_WRITE);
1571 	if (status < 0) {
1572 		mlog_errno(status);
1573 		goto bail;
1574 	}
1575 	el = &(fe->id2.i_list);
1576 
1577 	spin_lock(&OCFS2_I(inode)->ip_lock);
1578 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1579 				      clusters_to_del;
1580 	spin_unlock(&OCFS2_I(inode)->ip_lock);
1581 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1582 	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1583 	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1584 
1585 	i = le16_to_cpu(el->l_next_free_rec) - 1;
1586 
1587 	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1588 	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1589 	/* tree depth zero, we can just delete the clusters, otherwise
1590 	 * we need to record the offset of the next level extent block
1591 	 * as we may overwrite it. */
1592 	if (!el->l_tree_depth)
1593 		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1594 			+ ocfs2_clusters_to_blocks(osb->sb,
1595 					le32_to_cpu(el->l_recs[i].e_clusters));
1596 	else
1597 		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1598 
1599 	if (!el->l_recs[i].e_clusters) {
1600 		/* if we deleted the whole extent record, then clear
1601 		 * out the other fields and update the extent
1602 		 * list. For depth > 0 trees, we've already recorded
1603 		 * the extent block in 'next_eb' */
1604 		el->l_recs[i].e_cpos = 0;
1605 		el->l_recs[i].e_blkno = 0;
1606 		BUG_ON(!el->l_next_free_rec);
1607 		le16_add_cpu(&el->l_next_free_rec, -1);
1608 	}
1609 
1610 	depth = le16_to_cpu(el->l_tree_depth);
1611 	if (!fe->i_clusters) {
1612 		/* trunc to zero is a special case. */
1613 		el->l_tree_depth = 0;
1614 		fe->i_last_eb_blk = 0;
1615 	} else if (last_eb)
1616 		fe->i_last_eb_blk = last_eb->h_blkno;
1617 
1618 	status = ocfs2_journal_dirty(handle, fe_bh);
1619 	if (status < 0) {
1620 		mlog_errno(status);
1621 		goto bail;
1622 	}
1623 
1624 	if (last_eb) {
1625 		/* If there will be a new last extent block, then by
1626 		 * definition, there cannot be any leaves to the right of
1627 		 * him. */
1628 		status = ocfs2_journal_access(handle, inode, last_eb_bh,
1629 					      OCFS2_JOURNAL_ACCESS_WRITE);
1630 		if (status < 0) {
1631 			mlog_errno(status);
1632 			goto bail;
1633 		}
1634 		last_eb->h_next_leaf_blk = 0;
1635 		status = ocfs2_journal_dirty(handle, last_eb_bh);
1636 		if (status < 0) {
1637 			mlog_errno(status);
1638 			goto bail;
1639 		}
1640 	}
1641 
1642 	/* if our tree depth > 0, update all the tree blocks below us. */
1643 	while (depth) {
1644 		mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
1645 		     depth,  (unsigned long long)next_eb);
1646 		status = ocfs2_read_block(osb, next_eb, &eb_bh,
1647 					  OCFS2_BH_CACHED, inode);
1648 		if (status < 0) {
1649 			mlog_errno(status);
1650 			goto bail;
1651 		}
1652 		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1653 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1654 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1655 			status = -EIO;
1656 			goto bail;
1657 		}
1658 		el = &(eb->h_list);
1659 
1660 		status = ocfs2_journal_access(handle, inode, eb_bh,
1661 					      OCFS2_JOURNAL_ACCESS_WRITE);
1662 		if (status < 0) {
1663 			mlog_errno(status);
1664 			goto bail;
1665 		}
1666 
1667 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1668 		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1669 
1670 		i = le16_to_cpu(el->l_next_free_rec) - 1;
1671 
1672 		mlog(0, "extent block %llu, before: record %d: "
1673 		     "(%u, %u, %llu), next = %u\n",
1674 		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1675 		     le32_to_cpu(el->l_recs[i].e_cpos),
1676 		     le32_to_cpu(el->l_recs[i].e_clusters),
1677 		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1678 		     le16_to_cpu(el->l_next_free_rec));
1679 
1680 		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1681 		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1682 
1683 		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1684 		/* bottom-most block requires us to delete data.*/
1685 		if (!el->l_tree_depth)
1686 			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1687 				+ ocfs2_clusters_to_blocks(osb->sb,
1688 					le32_to_cpu(el->l_recs[i].e_clusters));
1689 		if (!el->l_recs[i].e_clusters) {
1690 			el->l_recs[i].e_cpos = 0;
1691 			el->l_recs[i].e_blkno = 0;
1692 			BUG_ON(!el->l_next_free_rec);
1693 			le16_add_cpu(&el->l_next_free_rec, -1);
1694 		}
1695 		mlog(0, "extent block %llu, after: record %d: "
1696 		     "(%u, %u, %llu), next = %u\n",
1697 		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1698 		     le32_to_cpu(el->l_recs[i].e_cpos),
1699 		     le32_to_cpu(el->l_recs[i].e_clusters),
1700 		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1701 		     le16_to_cpu(el->l_next_free_rec));
1702 
1703 		status = ocfs2_journal_dirty(handle, eb_bh);
1704 		if (status < 0) {
1705 			mlog_errno(status);
1706 			goto bail;
1707 		}
1708 
1709 		if (!el->l_next_free_rec) {
1710 			mlog(0, "deleting this extent block.\n");
1711 
1712 			ocfs2_remove_from_cache(inode, eb_bh);
1713 
1714 			BUG_ON(el->l_recs[0].e_clusters);
1715 			BUG_ON(el->l_recs[0].e_cpos);
1716 			BUG_ON(el->l_recs[0].e_blkno);
1717 			if (eb->h_suballoc_slot == 0) {
1718 				/*
1719 				 * This code only understands how to
1720 				 * lock the suballocator in slot 0,
1721 				 * which is fine because allocation is
1722 				 * only ever done out of that
1723 				 * suballocator too. A future version
1724 				 * might change that however, so avoid
1725 				 * a free if we don't know how to
1726 				 * handle it. This way an fs incompat
1727 				 * bit will not be necessary.
1728 				 */
1729 				status = ocfs2_free_extent_block(handle,
1730 								 tc->tc_ext_alloc_inode,
1731 								 tc->tc_ext_alloc_bh,
1732 								 eb);
1733 				if (status < 0) {
1734 					mlog_errno(status);
1735 					goto bail;
1736 				}
1737 			}
1738 		}
1739 		brelse(eb_bh);
1740 		eb_bh = NULL;
1741 		depth--;
1742 	}
1743 
1744 	BUG_ON(!delete_blk);
1745 	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1746 					   clusters_to_del);
1747 	if (status < 0) {
1748 		mlog_errno(status);
1749 		goto bail;
1750 	}
1751 	status = 0;
1752 bail:
1753 	if (!status)
1754 		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1755 	else
1756 		ocfs2_extent_map_drop(inode, 0);
1757 	mlog_exit(status);
1758 	return status;
1759 }
1760 
1761 /*
1762  * It is expected, that by the time you call this function,
1763  * inode->i_size and fe->i_size have been adjusted.
1764  *
1765  * WARNING: This will kfree the truncate context
1766  */
1767 int ocfs2_commit_truncate(struct ocfs2_super *osb,
1768 			  struct inode *inode,
1769 			  struct buffer_head *fe_bh,
1770 			  struct ocfs2_truncate_context *tc)
1771 {
1772 	int status, i, credits, tl_sem = 0;
1773 	u32 clusters_to_del, target_i_clusters;
1774 	u64 last_eb = 0;
1775 	struct ocfs2_dinode *fe;
1776 	struct ocfs2_extent_block *eb;
1777 	struct ocfs2_extent_list *el;
1778 	struct buffer_head *last_eb_bh;
1779 	handle_t *handle = NULL;
1780 	struct inode *tl_inode = osb->osb_tl_inode;
1781 
1782 	mlog_entry_void();
1783 
1784 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 
1786 	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1787 						     i_size_read(inode));
1788 
1789 	last_eb_bh = tc->tc_last_eb_bh;
1790 	tc->tc_last_eb_bh = NULL;
1791 
1792 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
1793 
1794 	if (fe->id2.i_list.l_tree_depth) {
1795 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 		el = &eb->h_list;
1797 	} else
1798 		el = &fe->id2.i_list;
1799 	last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800 start:
1801 	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1802 	     "last_eb = %llu, fe->i_last_eb_blk = %llu, "
1803 	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1804 	     le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
1805 	     (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
1806 	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1807 
1808 	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 		mlog(0, "last_eb changed!\n");
1810 		BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 		last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 		/* i_last_eb_blk may have changed, read it if
1813 		 * necessary. We don't have to worry about the
1814 		 * truncate to zero case here (where there becomes no
1815 		 * last_eb) because we never loop back after our work
1816 		 * is done. */
1817 		if (last_eb_bh) {
1818 			brelse(last_eb_bh);
1819 			last_eb_bh = NULL;
1820 		}
1821 
1822 		status = ocfs2_read_block(osb, last_eb,
1823 					  &last_eb_bh, OCFS2_BH_CACHED,
1824 					  inode);
1825 		if (status < 0) {
1826 			mlog_errno(status);
1827 			goto bail;
1828 		}
1829 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1830 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1831 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1832 			status = -EIO;
1833 			goto bail;
1834 		}
1835 		el = &(eb->h_list);
1836 	}
1837 
1838 	/* by now, el will point to the extent list on the bottom most
1839 	 * portion of this tree. */
1840 	i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1842 		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1843 	else
1844 		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1845 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 				  target_i_clusters;
1847 
1848 	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1849 
1850 	mutex_lock(&tl_inode->i_mutex);
1851 	tl_sem = 1;
1852 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
1853 	 * record is free for use. If there isn't any, we flush to get
1854 	 * an empty truncate log.  */
1855 	if (ocfs2_truncate_log_needs_flush(osb)) {
1856 		status = __ocfs2_flush_truncate_log(osb);
1857 		if (status < 0) {
1858 			mlog_errno(status);
1859 			goto bail;
1860 		}
1861 	}
1862 
1863 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 						fe, el);
1865 	handle = ocfs2_start_trans(osb, credits);
1866 	if (IS_ERR(handle)) {
1867 		status = PTR_ERR(handle);
1868 		handle = NULL;
1869 		mlog_errno(status);
1870 		goto bail;
1871 	}
1872 
1873 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1874 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1875 	if (status < 0)
1876 		mlog_errno(status);
1877 
1878 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 				   last_eb_bh, handle, tc);
1880 	if (status < 0) {
1881 		mlog_errno(status);
1882 		goto bail;
1883 	}
1884 
1885 	mutex_unlock(&tl_inode->i_mutex);
1886 	tl_sem = 0;
1887 
1888 	ocfs2_commit_trans(osb, handle);
1889 	handle = NULL;
1890 
1891 	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1892 	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1893 		goto start;
1894 bail:
1895 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896 
1897 	ocfs2_schedule_truncate_log_flush(osb, 1);
1898 
1899 	if (tl_sem)
1900 		mutex_unlock(&tl_inode->i_mutex);
1901 
1902 	if (handle)
1903 		ocfs2_commit_trans(osb, handle);
1904 
1905 	if (last_eb_bh)
1906 		brelse(last_eb_bh);
1907 
1908 	/* This will drop the ext_alloc cluster lock for us */
1909 	ocfs2_free_truncate_context(tc);
1910 
1911 	mlog_exit(status);
1912 	return status;
1913 }
1914 
1915 
1916 /*
1917  * Expects the inode to already be locked. This will figure out which
1918  * inodes need to be locked and will put them on the returned truncate
1919  * context.
1920  */
1921 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1922 			   struct inode *inode,
1923 			   struct buffer_head *fe_bh,
1924 			   struct ocfs2_truncate_context **tc)
1925 {
1926 	int status, metadata_delete;
1927 	unsigned int new_i_clusters;
1928 	struct ocfs2_dinode *fe;
1929 	struct ocfs2_extent_block *eb;
1930 	struct ocfs2_extent_list *el;
1931 	struct buffer_head *last_eb_bh = NULL;
1932 	struct inode *ext_alloc_inode = NULL;
1933 	struct buffer_head *ext_alloc_bh = NULL;
1934 
1935 	mlog_entry_void();
1936 
1937 	*tc = NULL;
1938 
1939 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1940 						  i_size_read(inode));
1941 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
1942 
1943 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1944 	     "%llu\n", fe->i_clusters, new_i_clusters,
1945 	     (unsigned long long)fe->i_size);
1946 
1947 	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 		ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 			    "%u and size %llu whereas struct inode has "
1950 			    "cluster count %u and size %llu which caused an "
1951 			    "invalid truncate to %u clusters.",
1952 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1953 			    le32_to_cpu(fe->i_clusters),
1954 			    (unsigned long long)le64_to_cpu(fe->i_size),
1955 			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 			    new_i_clusters);
1957 		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 		status = -EIO;
1959 		goto bail;
1960 	}
1961 
1962 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1963 	if (!(*tc)) {
1964 		status = -ENOMEM;
1965 		mlog_errno(status);
1966 		goto bail;
1967 	}
1968 
1969 	metadata_delete = 0;
1970 	if (fe->id2.i_list.l_tree_depth) {
1971 		/* If we have a tree, then the truncate may result in
1972 		 * metadata deletes. Figure this out from the
1973 		 * rightmost leaf block.*/
1974 		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1975 					  &last_eb_bh, OCFS2_BH_CACHED, inode);
1976 		if (status < 0) {
1977 			mlog_errno(status);
1978 			goto bail;
1979 		}
1980 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1981 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1982 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1983 
1984 			brelse(last_eb_bh);
1985 			status = -EIO;
1986 			goto bail;
1987 		}
1988 		el = &(eb->h_list);
1989 		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1990 			metadata_delete = 1;
1991 	}
1992 
1993 	(*tc)->tc_last_eb_bh = last_eb_bh;
1994 
1995 	if (metadata_delete) {
1996 		mlog(0, "Will have to delete metadata for this trunc. "
1997 		     "locking allocator.\n");
1998 		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1999 		if (!ext_alloc_inode) {
2000 			status = -ENOMEM;
2001 			mlog_errno(status);
2002 			goto bail;
2003 		}
2004 
2005 		mutex_lock(&ext_alloc_inode->i_mutex);
2006 		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
2007 
2008 		status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
2009 		if (status < 0) {
2010 			mlog_errno(status);
2011 			goto bail;
2012 		}
2013 		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2014 		(*tc)->tc_ext_alloc_locked = 1;
2015 	}
2016 
2017 	status = 0;
2018 bail:
2019 	if (status < 0) {
2020 		if (*tc)
2021 			ocfs2_free_truncate_context(*tc);
2022 		*tc = NULL;
2023 	}
2024 	mlog_exit_void();
2025 	return status;
2026 }
2027 
2028 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2029 {
2030 	if (tc->tc_ext_alloc_inode) {
2031 		if (tc->tc_ext_alloc_locked)
2032 			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2033 
2034 		mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
2035 		iput(tc->tc_ext_alloc_inode);
2036 	}
2037 
2038 	if (tc->tc_ext_alloc_bh)
2039 		brelse(tc->tc_ext_alloc_bh);
2040 
2041 	if (tc->tc_last_eb_bh)
2042 		brelse(tc->tc_last_eb_bh);
2043 
2044 	kfree(tc);
2045 }
2046