xref: /linux/fs/ocfs2/alloc.c (revision 440d6635b20037bc9ad46b20817d7b61cef0fc1b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * alloc.c
4  *
5  * Extent allocs and frees
6  *
7  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
8  */
9 
10 #include <linux/fs.h>
11 #include <linux/types.h>
12 #include <linux/slab.h>
13 #include <linux/string.h>
14 #include <linux/highmem.h>
15 #include <linux/swap.h>
16 #include <linux/quotaops.h>
17 #include <linux/blkdev.h>
18 #include <linux/sched/signal.h>
19 
20 #include <cluster/masklog.h>
21 
22 #include "ocfs2.h"
23 
24 #include "alloc.h"
25 #include "aops.h"
26 #include "blockcheck.h"
27 #include "dlmglue.h"
28 #include "extent_map.h"
29 #include "inode.h"
30 #include "journal.h"
31 #include "localalloc.h"
32 #include "suballoc.h"
33 #include "sysfile.h"
34 #include "file.h"
35 #include "super.h"
36 #include "uptodate.h"
37 #include "xattr.h"
38 #include "refcounttree.h"
39 #include "ocfs2_trace.h"
40 
41 #include "buffer_head_io.h"
42 
/*
 * Outcome of a contiguity test between an existing extent record and a
 * record being inserted.  ocfs2_extent_rec_contig() reports
 * CONTIG_RIGHT when the new record directly follows the existing one
 * and CONTIG_LEFT when it directly precedes it.
 */
enum ocfs2_contig_type {
	CONTIG_NONE		= 0,
	CONTIG_LEFT		= 1,
	CONTIG_RIGHT		= 2,
	CONTIG_LEFTRIGHT	= 3,
};
49 
/*
 * Forward declaration: the default contiguity test is defined further
 * down but is needed by ocfs2_et_extent_contig().
 */
static enum ocfs2_contig_type
ocfs2_extent_rec_contig(struct super_block *sb,
			struct ocfs2_extent_rec *ext,
			struct ocfs2_extent_rec *insert_rec);
54 /*
55  * Operations for a specific extent tree type.
56  *
57  * To implement an on-disk btree (extent tree) type in ocfs2, add
58  * an ocfs2_extent_tree_operations structure and the matching
59  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
60  * for the allocation portion of the extent tree.
61  */
62 struct ocfs2_extent_tree_operations {
63 	/*
64 	 * last_eb_blk is the block number of the right most leaf extent
65 	 * block.  Most on-disk structures containing an extent tree store
66 	 * this value for fast access.  The ->eo_set_last_eb_blk() and
67 	 * ->eo_get_last_eb_blk() operations access this value.  They are
68 	 *  both required.
69 	 */
70 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
71 				   u64 blkno);
72 	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
73 
74 	/*
75 	 * The on-disk structure usually keeps track of how many total
76 	 * clusters are stored in this extent tree.  This function updates
77 	 * that value.  new_clusters is the delta, and must be
78 	 * added to the total.  Required.
79 	 */
80 	void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
81 				   u32 new_clusters);
82 
83 	/*
84 	 * If this extent tree is supported by an extent map, insert
85 	 * a record into the map.
86 	 */
87 	void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
88 				     struct ocfs2_extent_rec *rec);
89 
90 	/*
91 	 * If this extent tree is supported by an extent map, truncate the
92 	 * map to clusters,
93 	 */
94 	void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
95 				       u32 clusters);
96 
97 	/*
98 	 * If ->eo_insert_check() exists, it is called before rec is
99 	 * inserted into the extent tree.  It is optional.
100 	 */
101 	int (*eo_insert_check)(struct ocfs2_extent_tree *et,
102 			       struct ocfs2_extent_rec *rec);
103 	int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
104 
105 	/*
106 	 * --------------------------------------------------------------
107 	 * The remaining are internal to ocfs2_extent_tree and don't have
108 	 * accessor functions
109 	 */
110 
111 	/*
112 	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
113 	 * It is required.
114 	 */
115 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
116 
117 	/*
118 	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
119 	 * it exists.  If it does not, et->et_max_leaf_clusters is set
120 	 * to 0 (unlimited).  Optional.
121 	 */
122 	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
123 
124 	/*
125 	 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
126 	 * are contiguous or not. Optional. Don't need to set it if use
127 	 * ocfs2_extent_rec as the tree leaf.
128 	 */
129 	enum ocfs2_contig_type
130 		(*eo_extent_contig)(struct ocfs2_extent_tree *et,
131 				    struct ocfs2_extent_rec *ext,
132 				    struct ocfs2_extent_rec *insert_rec);
133 };
134 
135 
136 /*
137  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
138  * in the methods.
139  */
140 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
141 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
142 					 u64 blkno);
143 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
144 					 u32 clusters);
145 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
146 					   struct ocfs2_extent_rec *rec);
147 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
148 					     u32 clusters);
149 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
150 				     struct ocfs2_extent_rec *rec);
151 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
152 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
153 
154 static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
155 					struct ocfs2_extent_tree *et,
156 					struct buffer_head **new_eb_bh,
157 					int blk_wanted, int *blk_given);
158 static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
159 
160 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
161 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
162 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
163 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
164 	.eo_extent_map_insert	= ocfs2_dinode_extent_map_insert,
165 	.eo_extent_map_truncate	= ocfs2_dinode_extent_map_truncate,
166 	.eo_insert_check	= ocfs2_dinode_insert_check,
167 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
168 	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
169 };
170 
171 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
172 					 u64 blkno)
173 {
174 	struct ocfs2_dinode *di = et->et_object;
175 
176 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
177 	di->i_last_eb_blk = cpu_to_le64(blkno);
178 }
179 
180 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
181 {
182 	struct ocfs2_dinode *di = et->et_object;
183 
184 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
185 	return le64_to_cpu(di->i_last_eb_blk);
186 }
187 
188 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
189 					 u32 clusters)
190 {
191 	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
192 	struct ocfs2_dinode *di = et->et_object;
193 
194 	le32_add_cpu(&di->i_clusters, clusters);
195 	spin_lock(&oi->ip_lock);
196 	oi->ip_clusters = le32_to_cpu(di->i_clusters);
197 	spin_unlock(&oi->ip_lock);
198 }
199 
200 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
201 					   struct ocfs2_extent_rec *rec)
202 {
203 	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
204 
205 	ocfs2_extent_map_insert_rec(inode, rec);
206 }
207 
208 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
209 					     u32 clusters)
210 {
211 	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
212 
213 	ocfs2_extent_map_trunc(inode, clusters);
214 }
215 
216 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
217 				     struct ocfs2_extent_rec *rec)
218 {
219 	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
220 	struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
221 
222 	BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
223 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
224 			(oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
225 			"Device %s, asking for sparse allocation: inode %llu, "
226 			"cpos %u, clusters %u\n",
227 			osb->dev_str,
228 			(unsigned long long)oi->ip_blkno,
229 			rec->e_cpos, oi->ip_clusters);
230 
231 	return 0;
232 }
233 
234 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
235 {
236 	struct ocfs2_dinode *di = et->et_object;
237 
238 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
239 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
240 
241 	return 0;
242 }
243 
244 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
245 {
246 	struct ocfs2_dinode *di = et->et_object;
247 
248 	et->et_root_el = &di->id2.i_list;
249 }
250 
251 
252 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
253 {
254 	struct ocfs2_xattr_value_buf *vb = et->et_object;
255 
256 	et->et_root_el = &vb->vb_xv->xr_list;
257 }
258 
259 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
260 					      u64 blkno)
261 {
262 	struct ocfs2_xattr_value_buf *vb = et->et_object;
263 
264 	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
265 }
266 
267 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
268 {
269 	struct ocfs2_xattr_value_buf *vb = et->et_object;
270 
271 	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
272 }
273 
274 static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
275 					      u32 clusters)
276 {
277 	struct ocfs2_xattr_value_buf *vb = et->et_object;
278 
279 	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
280 }
281 
282 static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
283 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
284 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
285 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
286 	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
287 };
288 
289 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
290 {
291 	struct ocfs2_xattr_block *xb = et->et_object;
292 
293 	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
294 }
295 
296 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
297 {
298 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
299 	et->et_max_leaf_clusters =
300 		ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
301 }
302 
303 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
304 					     u64 blkno)
305 {
306 	struct ocfs2_xattr_block *xb = et->et_object;
307 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
308 
309 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
310 }
311 
312 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
313 {
314 	struct ocfs2_xattr_block *xb = et->et_object;
315 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
316 
317 	return le64_to_cpu(xt->xt_last_eb_blk);
318 }
319 
320 static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
321 					     u32 clusters)
322 {
323 	struct ocfs2_xattr_block *xb = et->et_object;
324 
325 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
326 }
327 
328 static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
329 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
330 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
331 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
332 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
333 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
334 };
335 
336 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
337 					  u64 blkno)
338 {
339 	struct ocfs2_dx_root_block *dx_root = et->et_object;
340 
341 	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
342 }
343 
344 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
345 {
346 	struct ocfs2_dx_root_block *dx_root = et->et_object;
347 
348 	return le64_to_cpu(dx_root->dr_last_eb_blk);
349 }
350 
351 static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
352 					  u32 clusters)
353 {
354 	struct ocfs2_dx_root_block *dx_root = et->et_object;
355 
356 	le32_add_cpu(&dx_root->dr_clusters, clusters);
357 }
358 
359 static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
360 {
361 	struct ocfs2_dx_root_block *dx_root = et->et_object;
362 
363 	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
364 
365 	return 0;
366 }
367 
368 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
369 {
370 	struct ocfs2_dx_root_block *dx_root = et->et_object;
371 
372 	et->et_root_el = &dx_root->dr_list;
373 }
374 
375 static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
376 	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
377 	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
378 	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
379 	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
380 	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
381 };
382 
383 static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
384 {
385 	struct ocfs2_refcount_block *rb = et->et_object;
386 
387 	et->et_root_el = &rb->rf_list;
388 }
389 
390 static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
391 						u64 blkno)
392 {
393 	struct ocfs2_refcount_block *rb = et->et_object;
394 
395 	rb->rf_last_eb_blk = cpu_to_le64(blkno);
396 }
397 
398 static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
399 {
400 	struct ocfs2_refcount_block *rb = et->et_object;
401 
402 	return le64_to_cpu(rb->rf_last_eb_blk);
403 }
404 
405 static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
406 						u32 clusters)
407 {
408 	struct ocfs2_refcount_block *rb = et->et_object;
409 
410 	le32_add_cpu(&rb->rf_clusters, clusters);
411 }
412 
413 static enum ocfs2_contig_type
414 ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
415 				  struct ocfs2_extent_rec *ext,
416 				  struct ocfs2_extent_rec *insert_rec)
417 {
418 	return CONTIG_NONE;
419 }
420 
421 static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
422 	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
423 	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
424 	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
425 	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
426 	.eo_extent_contig	= ocfs2_refcount_tree_extent_contig,
427 };
428 
429 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
430 				     struct ocfs2_caching_info *ci,
431 				     struct buffer_head *bh,
432 				     ocfs2_journal_access_func access,
433 				     void *obj,
434 				     const struct ocfs2_extent_tree_operations *ops)
435 {
436 	et->et_ops = ops;
437 	et->et_root_bh = bh;
438 	et->et_ci = ci;
439 	et->et_root_journal_access = access;
440 	if (!obj)
441 		obj = (void *)bh->b_data;
442 	et->et_object = obj;
443 	et->et_dealloc = NULL;
444 
445 	et->et_ops->eo_fill_root_el(et);
446 	if (!et->et_ops->eo_fill_max_leaf_clusters)
447 		et->et_max_leaf_clusters = 0;
448 	else
449 		et->et_ops->eo_fill_max_leaf_clusters(et);
450 }
451 
452 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
453 				   struct ocfs2_caching_info *ci,
454 				   struct buffer_head *bh)
455 {
456 	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
457 				 NULL, &ocfs2_dinode_et_ops);
458 }
459 
460 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
461 				       struct ocfs2_caching_info *ci,
462 				       struct buffer_head *bh)
463 {
464 	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
465 				 NULL, &ocfs2_xattr_tree_et_ops);
466 }
467 
468 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
469 					struct ocfs2_caching_info *ci,
470 					struct ocfs2_xattr_value_buf *vb)
471 {
472 	__ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
473 				 &ocfs2_xattr_value_et_ops);
474 }
475 
476 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
477 				    struct ocfs2_caching_info *ci,
478 				    struct buffer_head *bh)
479 {
480 	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
481 				 NULL, &ocfs2_dx_root_et_ops);
482 }
483 
484 void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
485 				     struct ocfs2_caching_info *ci,
486 				     struct buffer_head *bh)
487 {
488 	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
489 				 NULL, &ocfs2_refcount_tree_et_ops);
490 }
491 
492 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
493 					    u64 new_last_eb_blk)
494 {
495 	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
496 }
497 
498 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
499 {
500 	return et->et_ops->eo_get_last_eb_blk(et);
501 }
502 
503 static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
504 					    u32 clusters)
505 {
506 	et->et_ops->eo_update_clusters(et, clusters);
507 }
508 
509 static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
510 					      struct ocfs2_extent_rec *rec)
511 {
512 	if (et->et_ops->eo_extent_map_insert)
513 		et->et_ops->eo_extent_map_insert(et, rec);
514 }
515 
516 static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
517 						u32 clusters)
518 {
519 	if (et->et_ops->eo_extent_map_truncate)
520 		et->et_ops->eo_extent_map_truncate(et, clusters);
521 }
522 
523 static inline int ocfs2_et_root_journal_access(handle_t *handle,
524 					       struct ocfs2_extent_tree *et,
525 					       int type)
526 {
527 	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
528 					  type);
529 }
530 
531 static inline enum ocfs2_contig_type
532 	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
533 			       struct ocfs2_extent_rec *rec,
534 			       struct ocfs2_extent_rec *insert_rec)
535 {
536 	if (et->et_ops->eo_extent_contig)
537 		return et->et_ops->eo_extent_contig(et, rec, insert_rec);
538 
539 	return ocfs2_extent_rec_contig(
540 				ocfs2_metadata_cache_get_super(et->et_ci),
541 				rec, insert_rec);
542 }
543 
544 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
545 					struct ocfs2_extent_rec *rec)
546 {
547 	int ret = 0;
548 
549 	if (et->et_ops->eo_insert_check)
550 		ret = et->et_ops->eo_insert_check(et, rec);
551 	return ret;
552 }
553 
554 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
555 {
556 	int ret = 0;
557 
558 	if (et->et_ops->eo_sanity_check)
559 		ret = et->et_ops->eo_sanity_check(et);
560 	return ret;
561 }
562 
563 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
564 					 struct ocfs2_extent_block *eb);
565 static void ocfs2_adjust_rightmost_records(handle_t *handle,
566 					   struct ocfs2_extent_tree *et,
567 					   struct ocfs2_path *path,
568 					   struct ocfs2_extent_rec *insert_rec);
569 /*
570  * Reset the actual path elements so that we can reuse the structure
571  * to build another path. Generally, this involves freeing the buffer
572  * heads.
573  */
574 void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
575 {
576 	int i, start = 0, depth = 0;
577 	struct ocfs2_path_item *node;
578 
579 	if (keep_root)
580 		start = 1;
581 
582 	for(i = start; i < path_num_items(path); i++) {
583 		node = &path->p_node[i];
584 
585 		brelse(node->bh);
586 		node->bh = NULL;
587 		node->el = NULL;
588 	}
589 
590 	/*
591 	 * Tree depth may change during truncate, or insert. If we're
592 	 * keeping the root extent list, then make sure that our path
593 	 * structure reflects the proper depth.
594 	 */
595 	if (keep_root)
596 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
597 	else
598 		path_root_access(path) = NULL;
599 
600 	path->p_tree_depth = depth;
601 }
602 
/*
 * Release everything held by path (root included) and free the
 * structure itself.  A NULL path is ignored.
 */
void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
610 
611 /*
612  * All the elements of src into dest. After this call, src could be freed
613  * without affecting dest.
614  *
615  * Both paths should have the same root. Any non-root elements of dest
616  * will be freed.
617  */
618 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
619 {
620 	int i;
621 
622 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
623 	BUG_ON(path_root_el(dest) != path_root_el(src));
624 	BUG_ON(path_root_access(dest) != path_root_access(src));
625 
626 	ocfs2_reinit_path(dest, 1);
627 
628 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
629 		dest->p_node[i].bh = src->p_node[i].bh;
630 		dest->p_node[i].el = src->p_node[i].el;
631 
632 		if (dest->p_node[i].bh)
633 			get_bh(dest->p_node[i].bh);
634 	}
635 }
636 
637 /*
638  * Make the *dest path the same as src and re-initialize src path to
639  * have a root only.
640  */
641 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
642 {
643 	int i;
644 
645 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
646 	BUG_ON(path_root_access(dest) != path_root_access(src));
647 
648 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
649 		brelse(dest->p_node[i].bh);
650 
651 		dest->p_node[i].bh = src->p_node[i].bh;
652 		dest->p_node[i].el = src->p_node[i].el;
653 
654 		src->p_node[i].bh = NULL;
655 		src->p_node[i].el = NULL;
656 	}
657 }
658 
659 /*
660  * Insert an extent block at given index.
661  *
662  * This will not take an additional reference on eb_bh.
663  */
664 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
665 					struct buffer_head *eb_bh)
666 {
667 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
668 
669 	/*
670 	 * Right now, no root bh is an extent block, so this helps
671 	 * catch code errors with dinode trees. The assertion can be
672 	 * safely removed if we ever need to insert extent block
673 	 * structures at the root.
674 	 */
675 	BUG_ON(index == 0);
676 
677 	path->p_node[index].bh = eb_bh;
678 	path->p_node[index].el = &eb->h_list;
679 }
680 
681 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
682 					 struct ocfs2_extent_list *root_el,
683 					 ocfs2_journal_access_func access)
684 {
685 	struct ocfs2_path *path;
686 
687 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
688 
689 	path = kzalloc_obj(*path, GFP_NOFS);
690 	if (path) {
691 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
692 		get_bh(root_bh);
693 		path_root_bh(path) = root_bh;
694 		path_root_el(path) = root_el;
695 		path_root_access(path) = access;
696 	}
697 
698 	return path;
699 }
700 
/*
 * Allocate a new root-only path that shares the root (buffer, extent
 * list and access function) of an existing path.  Returns NULL on
 * allocation failure.
 */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	struct buffer_head *root_bh = path_root_bh(path);
	struct ocfs2_extent_list *root_el = path_root_el(path);

	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
}
706 
707 struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
708 {
709 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
710 			      et->et_root_journal_access);
711 }
712 
713 /*
714  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
715  * otherwise it's the root_access function.
716  *
717  * I don't like the way this function's name looks next to
718  * ocfs2_journal_access_path(), but I don't have a better one.
719  */
720 int ocfs2_path_bh_journal_access(handle_t *handle,
721 				 struct ocfs2_caching_info *ci,
722 				 struct ocfs2_path *path,
723 				 int idx)
724 {
725 	ocfs2_journal_access_func access = path_root_access(path);
726 
727 	if (!access)
728 		access = ocfs2_journal_access;
729 
730 	if (idx)
731 		access = ocfs2_journal_access_eb;
732 
733 	return access(handle, ci, path->p_node[idx].bh,
734 		      OCFS2_JOURNAL_ACCESS_WRITE);
735 }
736 
737 /*
738  * Convenience function to journal all components in a path.
739  */
740 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
741 			      handle_t *handle,
742 			      struct ocfs2_path *path)
743 {
744 	int i, ret = 0;
745 
746 	if (!path)
747 		goto out;
748 
749 	for(i = 0; i < path_num_items(path); i++) {
750 		ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
751 		if (ret < 0) {
752 			mlog_errno(ret);
753 			goto out;
754 		}
755 	}
756 
757 out:
758 	return ret;
759 }
760 
761 /*
762  * Return the index of the extent record which contains cluster #v_cluster.
763  * -1 is returned if it was not found.
764  *
765  * Should work fine on interior and exterior nodes.
766  */
767 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
768 {
769 	int ret = -1;
770 	int i;
771 	struct ocfs2_extent_rec *rec;
772 	u32 rec_end, rec_start, clusters;
773 
774 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
775 		rec = &el->l_recs[i];
776 
777 		rec_start = le32_to_cpu(rec->e_cpos);
778 		clusters = ocfs2_rec_clusters(el, rec);
779 
780 		rec_end = rec_start + clusters;
781 
782 		if (v_cluster >= rec_start && v_cluster < rec_end) {
783 			ret = i;
784 			break;
785 		}
786 	}
787 
788 	return ret;
789 }
790 
791 /*
792  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
793  * ocfs2_extent_rec_contig only work properly against leaf nodes!
794  */
795 static int ocfs2_block_extent_contig(struct super_block *sb,
796 				     struct ocfs2_extent_rec *ext,
797 				     u64 blkno)
798 {
799 	u64 blk_end = le64_to_cpu(ext->e_blkno);
800 
801 	blk_end += ocfs2_clusters_to_blocks(sb,
802 				    le16_to_cpu(ext->e_leaf_clusters));
803 
804 	return blkno == blk_end;
805 }
806 
807 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
808 				  struct ocfs2_extent_rec *right)
809 {
810 	u32 left_range;
811 
812 	left_range = le32_to_cpu(left->e_cpos) +
813 		le16_to_cpu(left->e_leaf_clusters);
814 
815 	return (left_range == le32_to_cpu(right->e_cpos));
816 }
817 
818 static enum ocfs2_contig_type
819 	ocfs2_extent_rec_contig(struct super_block *sb,
820 				struct ocfs2_extent_rec *ext,
821 				struct ocfs2_extent_rec *insert_rec)
822 {
823 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
824 
825 	/*
826 	 * Refuse to coalesce extent records with different flag
827 	 * fields - we don't want to mix unwritten extents with user
828 	 * data.
829 	 */
830 	if (ext->e_flags != insert_rec->e_flags)
831 		return CONTIG_NONE;
832 
833 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
834 	    ocfs2_block_extent_contig(sb, ext, blkno))
835 			return CONTIG_RIGHT;
836 
837 	blkno = le64_to_cpu(ext->e_blkno);
838 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
839 	    ocfs2_block_extent_contig(sb, insert_rec, blkno))
840 		return CONTIG_LEFT;
841 
842 	return CONTIG_NONE;
843 }
844 
/*
 * NOTE: Pretty much any combination of contiguousness and appending
 * is possible.
 *
 * APPEND_TAIL is mostly useful because it tells us that the path to
 * that leaf will have to be updated.
 */
enum ocfs2_append_type {
	APPEND_NONE	= 0,
	APPEND_TAIL	= 1,
};
856 
/* Split mode recorded in ocfs2_insert_type.ins_split. */
enum ocfs2_split_type {
	SPLIT_NONE	= 0,
	SPLIT_LEFT	= 1,
	SPLIT_RIGHT	= 2,
};
862 
863 struct ocfs2_insert_type {
864 	enum ocfs2_split_type	ins_split;
865 	enum ocfs2_append_type	ins_appending;
866 	enum ocfs2_contig_type	ins_contig;
867 	int			ins_contig_index;
868 	int			ins_tree_depth;
869 };
870 
871 struct ocfs2_merge_ctxt {
872 	enum ocfs2_contig_type	c_contig_type;
873 	int			c_has_empty_extent;
874 	int			c_split_covers_rec;
875 };
876 
877 static int ocfs2_validate_extent_block(struct super_block *sb,
878 				       struct buffer_head *bh)
879 {
880 	int rc;
881 	struct ocfs2_extent_block *eb =
882 		(struct ocfs2_extent_block *)bh->b_data;
883 
884 	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
885 
886 	BUG_ON(!buffer_uptodate(bh));
887 
888 	/*
889 	 * If the ecc fails, we return the error but otherwise
890 	 * leave the filesystem running.  We know any error is
891 	 * local to this block.
892 	 */
893 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
894 	if (rc) {
895 		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
896 		     (unsigned long long)bh->b_blocknr);
897 		return rc;
898 	}
899 
900 	/*
901 	 * Errors after here are fatal.
902 	 */
903 
904 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
905 		rc = ocfs2_error(sb,
906 				 "Extent block #%llu has bad signature %.*s\n",
907 				 (unsigned long long)bh->b_blocknr, 7,
908 				 eb->h_signature);
909 		goto bail;
910 	}
911 
912 	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
913 		rc = ocfs2_error(sb,
914 				 "Extent block #%llu has an invalid h_blkno of %llu\n",
915 				 (unsigned long long)bh->b_blocknr,
916 				 (unsigned long long)le64_to_cpu(eb->h_blkno));
917 		goto bail;
918 	}
919 
920 	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
921 		rc = ocfs2_error(sb,
922 				 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
923 				 (unsigned long long)bh->b_blocknr,
924 				 le32_to_cpu(eb->h_fs_generation));
925 		goto bail;
926 	}
927 
928 	if (le16_to_cpu(eb->h_list.l_count) != ocfs2_extent_recs_per_eb(sb)) {
929 		rc = ocfs2_error(sb,
930 				 "Extent block #%llu has invalid l_count %u (expected %u)\n",
931 				 (unsigned long long)bh->b_blocknr,
932 				 le16_to_cpu(eb->h_list.l_count),
933 				 ocfs2_extent_recs_per_eb(sb));
934 		goto bail;
935 	}
936 
937 	if (le16_to_cpu(eb->h_list.l_next_free_rec) > le16_to_cpu(eb->h_list.l_count)) {
938 		rc = ocfs2_error(sb,
939 				 "Extent block #%llu has invalid l_next_free_rec %u (l_count %u)\n",
940 				 (unsigned long long)bh->b_blocknr,
941 				 le16_to_cpu(eb->h_list.l_next_free_rec),
942 				 le16_to_cpu(eb->h_list.l_count));
943 		goto bail;
944 	}
945 
946 bail:
947 	return rc;
948 }
949 
950 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
951 			    struct buffer_head **bh)
952 {
953 	int rc;
954 	struct buffer_head *tmp = *bh;
955 
956 	rc = ocfs2_read_block(ci, eb_blkno, &tmp,
957 			      ocfs2_validate_extent_block);
958 
959 	/* If ocfs2_read_block() got us a new bh, pass it up. */
960 	if (!rc && !*bh)
961 		*bh = tmp;
962 
963 	return rc;
964 }
965 
966 
967 /*
968  * How many free extents have we got before we need more meta data?
969  */
970 int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
971 {
972 	int retval;
973 	struct ocfs2_extent_list *el = NULL;
974 	struct ocfs2_extent_block *eb;
975 	struct buffer_head *eb_bh = NULL;
976 	u64 last_eb_blk = 0;
977 
978 	el = et->et_root_el;
979 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
980 
981 	if (last_eb_blk) {
982 		retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
983 						 &eb_bh);
984 		if (retval < 0) {
985 			mlog_errno(retval);
986 			goto bail;
987 		}
988 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
989 		el = &eb->h_list;
990 	}
991 
992 	if (el->l_tree_depth != 0) {
993 		retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
994 				"Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
995 				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
996 				(unsigned long long)last_eb_blk,
997 				le16_to_cpu(el->l_tree_depth));
998 		goto bail;
999 	}
1000 
1001 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
1002 bail:
1003 	brelse(eb_bh);
1004 
1005 	trace_ocfs2_num_free_extents(retval);
1006 	return retval;
1007 }
1008 
1009 /* expects array to already be allocated
1010  *
1011  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
1012  * l_count for you
1013  */
1014 static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 				     struct ocfs2_extent_tree *et,
1016 				     int wanted,
1017 				     struct ocfs2_alloc_context *meta_ac,
1018 				     struct buffer_head *bhs[])
1019 {
1020 	int count, status, i;
1021 	u16 suballoc_bit_start;
1022 	u32 num_got;
1023 	u64 suballoc_loc, first_blkno;
1024 	struct ocfs2_super *osb =
1025 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1026 	struct ocfs2_extent_block *eb;
1027 
1028 	count = 0;
1029 	while (count < wanted) {
1030 		status = ocfs2_claim_metadata(handle,
1031 					      meta_ac,
1032 					      wanted - count,
1033 					      &suballoc_loc,
1034 					      &suballoc_bit_start,
1035 					      &num_got,
1036 					      &first_blkno);
1037 		if (status < 0) {
1038 			mlog_errno(status);
1039 			goto bail;
1040 		}
1041 
1042 		for(i = count;  i < (num_got + count); i++) {
1043 			bhs[i] = sb_getblk(osb->sb, first_blkno);
1044 			if (bhs[i] == NULL) {
1045 				status = -ENOMEM;
1046 				mlog_errno(status);
1047 				goto bail;
1048 			}
1049 			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1050 
1051 			status = ocfs2_journal_access_eb(handle, et->et_ci,
1052 							 bhs[i],
1053 							 OCFS2_JOURNAL_ACCESS_CREATE);
1054 			if (status < 0) {
1055 				mlog_errno(status);
1056 				goto bail;
1057 			}
1058 
1059 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1060 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1061 			/* Ok, setup the minimal stuff here. */
1062 			strscpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1063 			eb->h_blkno = cpu_to_le64(first_blkno);
1064 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1065 			eb->h_suballoc_slot =
1066 				cpu_to_le16(meta_ac->ac_alloc_slot);
1067 			eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1068 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1069 			eb->h_list.l_count =
1070 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1071 
1072 			suballoc_bit_start++;
1073 			first_blkno++;
1074 
1075 			/* We'll also be dirtied by the caller, so
1076 			 * this isn't absolutely necessary. */
1077 			ocfs2_journal_dirty(handle, bhs[i]);
1078 		}
1079 
1080 		count += num_got;
1081 	}
1082 
1083 	status = 0;
1084 bail:
1085 	if (status < 0) {
1086 		for(i = 0; i < wanted; i++) {
1087 			brelse(bhs[i]);
1088 			bhs[i] = NULL;
1089 		}
1090 	}
1091 	return status;
1092 }
1093 
1094 /*
1095  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1096  *
1097  * Returns the sum of the rightmost extent rec logical offset and
1098  * cluster count.
1099  *
1100  * ocfs2_add_branch() uses this to determine what logical cluster
1101  * value should be populated into the leftmost new branch records.
1102  *
1103  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1104  * value for the new topmost tree record.
1105  */
1106 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1107 {
1108 	int i;
1109 
1110 	i = le16_to_cpu(el->l_next_free_rec) - 1;
1111 
1112 	return le32_to_cpu(el->l_recs[i].e_cpos) +
1113 		ocfs2_rec_clusters(el, &el->l_recs[i]);
1114 }
1115 
1116 /*
1117  * Change range of the branches in the right most path according to the leaf
1118  * extent block's rightmost record.
1119  */
1120 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1121 					 struct ocfs2_extent_tree *et)
1122 {
1123 	int status;
1124 	struct ocfs2_path *path = NULL;
1125 	struct ocfs2_extent_list *el;
1126 	struct ocfs2_extent_rec *rec;
1127 
1128 	path = ocfs2_new_path_from_et(et);
1129 	if (!path) {
1130 		status = -ENOMEM;
1131 		return status;
1132 	}
1133 
1134 	status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1135 	if (status < 0) {
1136 		mlog_errno(status);
1137 		goto out;
1138 	}
1139 
1140 	status = ocfs2_extend_trans(handle, path_num_items(path));
1141 	if (status < 0) {
1142 		mlog_errno(status);
1143 		goto out;
1144 	}
1145 
1146 	status = ocfs2_journal_access_path(et->et_ci, handle, path);
1147 	if (status < 0) {
1148 		mlog_errno(status);
1149 		goto out;
1150 	}
1151 
1152 	el = path_leaf_el(path);
1153 	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
1154 
1155 	ocfs2_adjust_rightmost_records(handle, et, path, rec);
1156 
1157 out:
1158 	ocfs2_free_path(path);
1159 	return status;
1160 }
1161 
1162 /*
1163  * Add an entire tree branch to our inode. eb_bh is the extent block
1164  * to start at, if we don't want to start the branch at the root
1165  * structure.
1166  *
1167  * last_eb_bh is required as we have to update it's next_leaf pointer
1168  * for the new last extent block.
1169  *
1170  * the new branch will be 'empty' in the sense that every block will
1171  * contain a single record with cluster count == 0.
1172  */
1173 static int ocfs2_add_branch(handle_t *handle,
1174 			    struct ocfs2_extent_tree *et,
1175 			    struct buffer_head *eb_bh,
1176 			    struct buffer_head **last_eb_bh,
1177 			    struct ocfs2_alloc_context *meta_ac)
1178 {
1179 	int status, new_blocks, i, block_given = 0;
1180 	u64 next_blkno, new_last_eb_blk;
1181 	struct buffer_head *bh;
1182 	struct buffer_head **new_eb_bhs = NULL;
1183 	struct ocfs2_extent_block *eb;
1184 	struct ocfs2_extent_list  *eb_el;
1185 	struct ocfs2_extent_list  *el;
1186 	u32 new_cpos, root_end;
1187 
1188 	BUG_ON(!last_eb_bh || !*last_eb_bh);
1189 
1190 	if (eb_bh) {
1191 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1192 		el = &eb->h_list;
1193 	} else
1194 		el = et->et_root_el;
1195 
1196 	/* we never add a branch to a leaf. */
1197 	BUG_ON(!el->l_tree_depth);
1198 
1199 	new_blocks = le16_to_cpu(el->l_tree_depth);
1200 
1201 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1202 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1203 	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1204 
1205 	/*
1206 	 * If there is a gap before the root end and the real end
1207 	 * of the rightmost leaf block, we need to remove the gap
1208 	 * between new_cpos and root_end first so that the tree
1209 	 * is consistent after we add a new branch(it will start
1210 	 * from new_cpos).
1211 	 */
1212 	if (root_end > new_cpos) {
1213 		trace_ocfs2_adjust_rightmost_branch(
1214 			(unsigned long long)
1215 			ocfs2_metadata_cache_owner(et->et_ci),
1216 			root_end, new_cpos);
1217 
1218 		status = ocfs2_adjust_rightmost_branch(handle, et);
1219 		if (status) {
1220 			mlog_errno(status);
1221 			goto bail;
1222 		}
1223 	}
1224 
1225 	/* allocate the number of new eb blocks we need */
1226 	new_eb_bhs = kzalloc_objs(struct buffer_head *, new_blocks);
1227 	if (!new_eb_bhs) {
1228 		status = -ENOMEM;
1229 		mlog_errno(status);
1230 		goto bail;
1231 	}
1232 
1233 	/* Firstyly, try to reuse dealloc since we have already estimated how
1234 	 * many extent blocks we may use.
1235 	 */
1236 	if (!ocfs2_is_dealloc_empty(et)) {
1237 		status = ocfs2_reuse_blk_from_dealloc(handle, et,
1238 						      new_eb_bhs, new_blocks,
1239 						      &block_given);
1240 		if (status < 0) {
1241 			mlog_errno(status);
1242 			goto bail;
1243 		}
1244 	}
1245 
1246 	BUG_ON(block_given > new_blocks);
1247 
1248 	if (block_given < new_blocks) {
1249 		BUG_ON(!meta_ac);
1250 		status = ocfs2_create_new_meta_bhs(handle, et,
1251 						   new_blocks - block_given,
1252 						   meta_ac,
1253 						   &new_eb_bhs[block_given]);
1254 		if (status < 0) {
1255 			mlog_errno(status);
1256 			goto bail;
1257 		}
1258 	}
1259 
1260 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1261 	 * linked with the rest of the tree.
1262 	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
1263 	 *
1264 	 * when we leave the loop, new_last_eb_blk will point to the
1265 	 * newest leaf, and next_blkno will point to the topmost extent
1266 	 * block. */
1267 	next_blkno = new_last_eb_blk = 0;
1268 	for(i = 0; i < new_blocks; i++) {
1269 		bh = new_eb_bhs[i];
1270 		eb = (struct ocfs2_extent_block *) bh->b_data;
1271 		/* ocfs2_create_new_meta_bhs() should create it right! */
1272 		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1273 		eb_el = &eb->h_list;
1274 
1275 		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1276 						 OCFS2_JOURNAL_ACCESS_CREATE);
1277 		if (status < 0) {
1278 			mlog_errno(status);
1279 			goto bail;
1280 		}
1281 
1282 		eb->h_next_leaf_blk = 0;
1283 		eb_el->l_tree_depth = cpu_to_le16(i);
1284 		eb_el->l_next_free_rec = cpu_to_le16(1);
1285 		/*
1286 		 * This actually counts as an empty extent as
1287 		 * c_clusters == 0
1288 		 */
1289 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1290 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1291 		/*
1292 		 * eb_el isn't always an interior node, but even leaf
1293 		 * nodes want a zero'd flags and reserved field so
1294 		 * this gets the whole 32 bits regardless of use.
1295 		 */
1296 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1297 		if (!eb_el->l_tree_depth)
1298 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1299 
1300 		ocfs2_journal_dirty(handle, bh);
1301 		next_blkno = le64_to_cpu(eb->h_blkno);
1302 	}
1303 
1304 	/* This is a bit hairy. We want to update up to three blocks
1305 	 * here without leaving any of them in an inconsistent state
1306 	 * in case of error. We don't have to worry about
1307 	 * journal_dirty erroring as it won't unless we've aborted the
1308 	 * handle (in which case we would never be here) so reserving
1309 	 * the write with journal_access is all we need to do. */
1310 	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1311 					 OCFS2_JOURNAL_ACCESS_WRITE);
1312 	if (status < 0) {
1313 		mlog_errno(status);
1314 		goto bail;
1315 	}
1316 	status = ocfs2_et_root_journal_access(handle, et,
1317 					      OCFS2_JOURNAL_ACCESS_WRITE);
1318 	if (status < 0) {
1319 		mlog_errno(status);
1320 		goto bail;
1321 	}
1322 	if (eb_bh) {
1323 		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1324 						 OCFS2_JOURNAL_ACCESS_WRITE);
1325 		if (status < 0) {
1326 			mlog_errno(status);
1327 			goto bail;
1328 		}
1329 	}
1330 
1331 	/* Link the new branch into the rest of the tree (el will
1332 	 * either be on the root_bh, or the extent block passed in. */
1333 	i = le16_to_cpu(el->l_next_free_rec);
1334 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1335 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1336 	el->l_recs[i].e_int_clusters = 0;
1337 	le16_add_cpu(&el->l_next_free_rec, 1);
1338 
1339 	/* fe needs a new last extent block pointer, as does the
1340 	 * next_leaf on the previously last-extent-block. */
1341 	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1342 
1343 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1344 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1345 
1346 	ocfs2_journal_dirty(handle, *last_eb_bh);
1347 	ocfs2_journal_dirty(handle, et->et_root_bh);
1348 	if (eb_bh)
1349 		ocfs2_journal_dirty(handle, eb_bh);
1350 
1351 	/*
1352 	 * Some callers want to track the rightmost leaf so pass it
1353 	 * back here.
1354 	 */
1355 	brelse(*last_eb_bh);
1356 	get_bh(new_eb_bhs[0]);
1357 	*last_eb_bh = new_eb_bhs[0];
1358 
1359 	status = 0;
1360 bail:
1361 	if (new_eb_bhs) {
1362 		for (i = 0; i < new_blocks; i++)
1363 			brelse(new_eb_bhs[i]);
1364 		kfree(new_eb_bhs);
1365 	}
1366 
1367 	return status;
1368 }
1369 
1370 /*
1371  * adds another level to the allocation tree.
1372  * returns back the new extent block so you can add a branch to it
1373  * after this call.
1374  */
1375 static int ocfs2_shift_tree_depth(handle_t *handle,
1376 				  struct ocfs2_extent_tree *et,
1377 				  struct ocfs2_alloc_context *meta_ac,
1378 				  struct buffer_head **ret_new_eb_bh)
1379 {
1380 	int status, i, block_given = 0;
1381 	u32 new_clusters;
1382 	struct buffer_head *new_eb_bh = NULL;
1383 	struct ocfs2_extent_block *eb;
1384 	struct ocfs2_extent_list  *root_el;
1385 	struct ocfs2_extent_list  *eb_el;
1386 
1387 	if (!ocfs2_is_dealloc_empty(et)) {
1388 		status = ocfs2_reuse_blk_from_dealloc(handle, et,
1389 						      &new_eb_bh, 1,
1390 						      &block_given);
1391 	} else if (meta_ac) {
1392 		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1393 						   &new_eb_bh);
1394 
1395 	} else {
1396 		BUG();
1397 	}
1398 
1399 	if (status < 0) {
1400 		mlog_errno(status);
1401 		goto bail;
1402 	}
1403 
1404 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1405 	/* ocfs2_create_new_meta_bhs() should create it right! */
1406 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1407 
1408 	eb_el = &eb->h_list;
1409 	root_el = et->et_root_el;
1410 
1411 	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1412 					 OCFS2_JOURNAL_ACCESS_CREATE);
1413 	if (status < 0) {
1414 		mlog_errno(status);
1415 		goto bail;
1416 	}
1417 
1418 	/* copy the root extent list data into the new extent block */
1419 	eb_el->l_tree_depth = root_el->l_tree_depth;
1420 	eb_el->l_next_free_rec = root_el->l_next_free_rec;
1421 	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1422 		eb_el->l_recs[i] = root_el->l_recs[i];
1423 
1424 	ocfs2_journal_dirty(handle, new_eb_bh);
1425 
1426 	status = ocfs2_et_root_journal_access(handle, et,
1427 					      OCFS2_JOURNAL_ACCESS_WRITE);
1428 	if (status < 0) {
1429 		mlog_errno(status);
1430 		goto bail;
1431 	}
1432 
1433 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1434 
1435 	/* update root_bh now */
1436 	le16_add_cpu(&root_el->l_tree_depth, 1);
1437 	root_el->l_recs[0].e_cpos = 0;
1438 	root_el->l_recs[0].e_blkno = eb->h_blkno;
1439 	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1440 	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1441 		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1442 	root_el->l_next_free_rec = cpu_to_le16(1);
1443 
1444 	/* If this is our 1st tree depth shift, then last_eb_blk
1445 	 * becomes the allocated extent block */
1446 	if (root_el->l_tree_depth == cpu_to_le16(1))
1447 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1448 
1449 	ocfs2_journal_dirty(handle, et->et_root_bh);
1450 
1451 	*ret_new_eb_bh = new_eb_bh;
1452 	new_eb_bh = NULL;
1453 	status = 0;
1454 bail:
1455 	brelse(new_eb_bh);
1456 
1457 	return status;
1458 }
1459 
1460 /*
1461  * Should only be called when there is no space left in any of the
1462  * leaf nodes. What we want to do is find the lowest tree depth
1463  * non-leaf extent block with room for new records. There are three
1464  * valid results of this search:
1465  *
1466  * 1) a lowest extent block is found, then we pass it back in
1467  *    *lowest_eb_bh and return '0'
1468  *
1469  * 2) the search fails to find anything, but the root_el has room. We
1470  *    pass NULL back in *lowest_eb_bh, but still return '0'
1471  *
1472  * 3) the search fails to find anything AND the root_el is full, in
1473  *    which case we return > 0
1474  *
1475  * return status < 0 indicates an error.
1476  */
1477 static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1478 				    struct buffer_head **target_bh)
1479 {
1480 	int status = 0, i;
1481 	u64 blkno;
1482 	struct ocfs2_extent_block *eb;
1483 	struct ocfs2_extent_list  *el;
1484 	struct buffer_head *bh = NULL;
1485 	struct buffer_head *lowest_bh = NULL;
1486 
1487 	*target_bh = NULL;
1488 
1489 	el = et->et_root_el;
1490 
1491 	while(le16_to_cpu(el->l_tree_depth) > 1) {
1492 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1493 			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1494 					"Owner %llu has empty extent list (next_free_rec == 0)\n",
1495 					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1496 			goto bail;
1497 		}
1498 		i = le16_to_cpu(el->l_next_free_rec) - 1;
1499 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1500 		if (!blkno) {
1501 			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1502 					"Owner %llu has extent list where extent # %d has no physical block start\n",
1503 					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1504 			goto bail;
1505 		}
1506 
1507 		brelse(bh);
1508 		bh = NULL;
1509 
1510 		status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1511 		if (status < 0) {
1512 			mlog_errno(status);
1513 			goto bail;
1514 		}
1515 
1516 		eb = (struct ocfs2_extent_block *) bh->b_data;
1517 		el = &eb->h_list;
1518 
1519 		if (le16_to_cpu(el->l_next_free_rec) <
1520 		    le16_to_cpu(el->l_count)) {
1521 			brelse(lowest_bh);
1522 			lowest_bh = bh;
1523 			get_bh(lowest_bh);
1524 		}
1525 	}
1526 
1527 	/* If we didn't find one and the fe doesn't have any room,
1528 	 * then return '1' */
1529 	el = et->et_root_el;
1530 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1531 		status = 1;
1532 
1533 	*target_bh = lowest_bh;
1534 bail:
1535 	brelse(bh);
1536 
1537 	return status;
1538 }
1539 
1540 /*
1541  * Grow a b-tree so that it has more records.
1542  *
1543  * We might shift the tree depth in which case existing paths should
1544  * be considered invalid.
1545  *
1546  * Tree depth after the grow is returned via *final_depth.
1547  *
1548  * *last_eb_bh will be updated by ocfs2_add_branch().
1549  */
1550 static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1551 			   int *final_depth, struct buffer_head **last_eb_bh,
1552 			   struct ocfs2_alloc_context *meta_ac)
1553 {
1554 	int ret, shift;
1555 	struct ocfs2_extent_list *el = et->et_root_el;
1556 	int depth = le16_to_cpu(el->l_tree_depth);
1557 	struct buffer_head *bh = NULL;
1558 
1559 	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
1560 
1561 	shift = ocfs2_find_branch_target(et, &bh);
1562 	if (shift < 0) {
1563 		ret = shift;
1564 		mlog_errno(ret);
1565 		goto out;
1566 	}
1567 
1568 	/* We traveled all the way to the bottom of the allocation tree
1569 	 * and didn't find room for any more extents - we need to add
1570 	 * another tree level */
1571 	if (shift) {
1572 		BUG_ON(bh);
1573 		trace_ocfs2_grow_tree(
1574 			(unsigned long long)
1575 			ocfs2_metadata_cache_owner(et->et_ci),
1576 			depth);
1577 
1578 		/* ocfs2_shift_tree_depth will return us a buffer with
1579 		 * the new extent block (so we can pass that to
1580 		 * ocfs2_add_branch). */
1581 		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1582 		if (ret < 0) {
1583 			mlog_errno(ret);
1584 			goto out;
1585 		}
1586 		depth++;
1587 		if (depth == 1) {
1588 			/*
1589 			 * Special case: we have room now if we shifted from
1590 			 * tree_depth 0, so no more work needs to be done.
1591 			 *
1592 			 * We won't be calling add_branch, so pass
1593 			 * back *last_eb_bh as the new leaf. At depth
1594 			 * zero, it should always be null so there's
1595 			 * no reason to brelse.
1596 			 */
1597 			BUG_ON(*last_eb_bh);
1598 			get_bh(bh);
1599 			*last_eb_bh = bh;
1600 			goto out;
1601 		}
1602 	}
1603 
1604 	/* call ocfs2_add_branch to add the final part of the tree with
1605 	 * the new data. */
1606 	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1607 			       meta_ac);
1608 	if (ret < 0)
1609 		mlog_errno(ret);
1610 
1611 out:
1612 	if (final_depth)
1613 		*final_depth = depth;
1614 	brelse(bh);
1615 	return ret;
1616 }
1617 
1618 /*
1619  * This function will discard the rightmost extent record.
1620  */
1621 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1622 {
1623 	int next_free = le16_to_cpu(el->l_next_free_rec);
1624 	int count = le16_to_cpu(el->l_count);
1625 	unsigned int num_bytes;
1626 
1627 	BUG_ON(!next_free);
1628 	/* This will cause us to go off the end of our extent list. */
1629 	BUG_ON(next_free >= count);
1630 
1631 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1632 
1633 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1634 }
1635 
1636 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1637 			      struct ocfs2_extent_rec *insert_rec)
1638 {
1639 	int i, insert_index, next_free, has_empty, num_bytes;
1640 	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1641 	struct ocfs2_extent_rec *rec;
1642 
1643 	next_free = le16_to_cpu(el->l_next_free_rec);
1644 	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1645 
1646 	BUG_ON(!next_free);
1647 
1648 	/* The tree code before us didn't allow enough room in the leaf. */
1649 	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1650 
1651 	/*
1652 	 * The easiest way to approach this is to just remove the
1653 	 * empty extent and temporarily decrement next_free.
1654 	 */
1655 	if (has_empty) {
1656 		/*
1657 		 * If next_free was 1 (only an empty extent), this
1658 		 * loop won't execute, which is fine. We still want
1659 		 * the decrement above to happen.
1660 		 */
1661 		for(i = 0; i < (next_free - 1); i++)
1662 			el->l_recs[i] = el->l_recs[i+1];
1663 
1664 		next_free--;
1665 	}
1666 
1667 	/*
1668 	 * Figure out what the new record index should be.
1669 	 */
1670 	for(i = 0; i < next_free; i++) {
1671 		rec = &el->l_recs[i];
1672 
1673 		if (insert_cpos < le32_to_cpu(rec->e_cpos))
1674 			break;
1675 	}
1676 	insert_index = i;
1677 
1678 	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1679 				has_empty, next_free,
1680 				le16_to_cpu(el->l_count));
1681 
1682 	BUG_ON(insert_index < 0);
1683 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1684 	BUG_ON(insert_index > next_free);
1685 
1686 	/*
1687 	 * No need to memmove if we're just adding to the tail.
1688 	 */
1689 	if (insert_index != next_free) {
1690 		BUG_ON(next_free >= le16_to_cpu(el->l_count));
1691 
1692 		num_bytes = next_free - insert_index;
1693 		num_bytes *= sizeof(struct ocfs2_extent_rec);
1694 		memmove(&el->l_recs[insert_index + 1],
1695 			&el->l_recs[insert_index],
1696 			num_bytes);
1697 	}
1698 
1699 	/*
1700 	 * Either we had an empty extent, and need to re-increment or
1701 	 * there was no empty extent on a non full rightmost leaf node,
1702 	 * in which case we still need to increment.
1703 	 */
1704 	next_free++;
1705 	el->l_next_free_rec = cpu_to_le16(next_free);
1706 	/*
1707 	 * Make sure none of the math above just messed up our tree.
1708 	 */
1709 	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1710 
1711 	el->l_recs[insert_index] = *insert_rec;
1712 
1713 }
1714 
1715 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1716 {
1717 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1718 
1719 	BUG_ON(num_recs == 0);
1720 
1721 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1722 		num_recs--;
1723 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1724 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1725 		memset(&el->l_recs[num_recs], 0,
1726 		       sizeof(struct ocfs2_extent_rec));
1727 		el->l_next_free_rec = cpu_to_le16(num_recs);
1728 	}
1729 }
1730 
1731 /*
1732  * Create an empty extent record .
1733  *
1734  * l_next_free_rec may be updated.
1735  *
1736  * If an empty extent already exists do nothing.
1737  */
1738 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1739 {
1740 	int next_free = le16_to_cpu(el->l_next_free_rec);
1741 
1742 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1743 
1744 	if (next_free == 0)
1745 		goto set_and_inc;
1746 
1747 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1748 		return;
1749 
1750 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1751 			"Asked to create an empty extent in a full list:\n"
1752 			"count = %u, tree depth = %u",
1753 			le16_to_cpu(el->l_count),
1754 			le16_to_cpu(el->l_tree_depth));
1755 
1756 	ocfs2_shift_records_right(el);
1757 
1758 set_and_inc:
1759 	le16_add_cpu(&el->l_next_free_rec, 1);
1760 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1761 }
1762 
1763 /*
1764  * For a rotation which involves two leaf nodes, the "root node" is
1765  * the lowest level tree node which contains a path to both leafs. This
1766  * resulting set of information can be used to form a complete "subtree"
1767  *
1768  * This function is passed two full paths from the dinode down to a
1769  * pair of adjacent leaves. It's task is to figure out which path
1770  * index contains the subtree root - this can be the root index itself
1771  * in a worst-case rotation.
1772  *
1773  * The array index of the subtree root is passed back.
1774  */
1775 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1776 			    struct ocfs2_path *left,
1777 			    struct ocfs2_path *right)
1778 {
1779 	int i = 0;
1780 
1781 	/*
1782 	 * Check that the caller passed in two paths from the same tree.
1783 	 */
1784 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1785 
1786 	do {
1787 		i++;
1788 
1789 		/*
1790 		 * The caller didn't pass two adjacent paths.
1791 		 */
1792 		mlog_bug_on_msg(i > left->p_tree_depth,
1793 				"Owner %llu, left depth %u, right depth %u\n"
1794 				"left leaf blk %llu, right leaf blk %llu\n",
1795 				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1796 				left->p_tree_depth, right->p_tree_depth,
1797 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1798 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1799 	} while (left->p_node[i].bh->b_blocknr ==
1800 		 right->p_node[i].bh->b_blocknr);
1801 
1802 	return i - 1;
1803 }
1804 
1805 typedef void (path_insert_t)(void *, struct buffer_head *);
1806 
1807 /*
1808  * Traverse a btree path in search of cpos, starting at root_el.
1809  *
1810  * This code can be called with a cpos larger than the tree, in which
1811  * case it will return the rightmost path.
1812  */
1813 static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1814 			     struct ocfs2_extent_list *root_el, u32 cpos,
1815 			     path_insert_t *func, void *data)
1816 {
1817 	int i, ret = 0;
1818 	u32 range;
1819 	u64 blkno;
1820 	struct buffer_head *bh = NULL;
1821 	struct ocfs2_extent_block *eb;
1822 	struct ocfs2_extent_list *el;
1823 	struct ocfs2_extent_rec *rec;
1824 
1825 	el = root_el;
1826 	while (el->l_tree_depth) {
1827 		if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) {
1828 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1829 				    "Owner %llu has invalid tree depth %u in extent list\n",
1830 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1831 				    le16_to_cpu(el->l_tree_depth));
1832 			ret = -EROFS;
1833 			goto out;
1834 		}
1835 		if (!el->l_next_free_rec || !el->l_count) {
1836 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1837 				    "Owner %llu has empty extent list at depth %u\n"
1838 				    "(next free=%u count=%u)\n",
1839 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1840 				    le16_to_cpu(el->l_tree_depth),
1841 				    le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count));
1842 			ret = -EROFS;
1843 			goto out;
1844 		}
1845 
1846 		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1847 			rec = &el->l_recs[i];
1848 
1849 			/*
1850 			 * In the case that cpos is off the allocation
1851 			 * tree, this should just wind up returning the
1852 			 * rightmost record.
1853 			 */
1854 			range = le32_to_cpu(rec->e_cpos) +
1855 				ocfs2_rec_clusters(el, rec);
1856 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1857 			    break;
1858 		}
1859 
1860 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1861 		if (blkno == 0) {
1862 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1863 				    "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
1864 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1865 				    le16_to_cpu(el->l_tree_depth), i);
1866 			ret = -EROFS;
1867 			goto out;
1868 		}
1869 
1870 		brelse(bh);
1871 		bh = NULL;
1872 		ret = ocfs2_read_extent_block(ci, blkno, &bh);
1873 		if (ret) {
1874 			mlog_errno(ret);
1875 			goto out;
1876 		}
1877 
1878 		eb = (struct ocfs2_extent_block *) bh->b_data;
1879 		el = &eb->h_list;
1880 
1881 		if (func)
1882 			func(data, bh);
1883 	}
1884 
1885 out:
1886 	/*
1887 	 * Catch any trailing bh that the loop didn't handle.
1888 	 */
1889 	brelse(bh);
1890 
1891 	return ret;
1892 }
1893 
1894 /*
1895  * Given an initialized path (that is, it has a valid root extent
1896  * list), this function will traverse the btree in search of the path
1897  * which would contain cpos.
1898  *
1899  * The path traveled is recorded in the path structure.
1900  *
1901  * Note that this will not do any comparisons on leaf node extent
1902  * records, so it will work fine in the case that we just added a tree
1903  * branch.
1904  */
1905 struct find_path_data {
1906 	int index;
1907 	struct ocfs2_path *path;
1908 };
1909 static void find_path_ins(void *data, struct buffer_head *bh)
1910 {
1911 	struct find_path_data *fp = data;
1912 
1913 	get_bh(bh);
1914 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1915 	fp->index++;
1916 }
1917 int ocfs2_find_path(struct ocfs2_caching_info *ci,
1918 		    struct ocfs2_path *path, u32 cpos)
1919 {
1920 	struct find_path_data data;
1921 
1922 	data.index = 1;
1923 	data.path = path;
1924 	return __ocfs2_find_path(ci, path_root_el(path), cpos,
1925 				 find_path_ins, &data);
1926 }
1927 
1928 static void find_leaf_ins(void *data, struct buffer_head *bh)
1929 {
1930 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1931 	struct ocfs2_extent_list *el = &eb->h_list;
1932 	struct buffer_head **ret = data;
1933 
1934 	/* We want to retain only the leaf block. */
1935 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1936 		get_bh(bh);
1937 		*ret = bh;
1938 	}
1939 }
1940 /*
1941  * Find the leaf block in the tree which would contain cpos. No
1942  * checking of the actual leaf is done.
1943  *
1944  * Some paths want to call this instead of allocating a path structure
1945  * and calling ocfs2_find_path().
1946  *
1947  * This function doesn't handle non btree extent lists.
1948  */
1949 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1950 		    struct ocfs2_extent_list *root_el, u32 cpos,
1951 		    struct buffer_head **leaf_bh)
1952 {
1953 	int ret;
1954 	struct buffer_head *bh = NULL;
1955 
1956 	ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1957 	if (ret) {
1958 		mlog_errno(ret);
1959 		goto out;
1960 	}
1961 
1962 	*leaf_bh = bh;
1963 out:
1964 	return ret;
1965 }
1966 
1967 /*
1968  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1969  *
1970  * Basically, we've moved stuff around at the bottom of the tree and
1971  * we need to fix up the extent records above the changes to reflect
1972  * the new changes.
1973  *
1974  * left_rec: the record on the left.
1975  * right_rec: the record to the right of left_rec
1976  * right_child_el: is the child list pointed to by right_rec
1977  *
1978  * By definition, this only works on interior nodes.
1979  */
1980 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1981 				  struct ocfs2_extent_rec *right_rec,
1982 				  struct ocfs2_extent_list *right_child_el)
1983 {
1984 	u32 left_clusters, right_end;
1985 
1986 	/*
1987 	 * Interior nodes never have holes. Their cpos is the cpos of
1988 	 * the leftmost record in their child list. Their cluster
1989 	 * count covers the full theoretical range of their child list
1990 	 * - the range between their cpos and the cpos of the record
1991 	 * immediately to their right.
1992 	 */
1993 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1994 	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1995 		BUG_ON(right_child_el->l_tree_depth);
1996 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1997 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1998 	}
1999 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
2000 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
2001 
2002 	/*
2003 	 * Calculate the rightmost cluster count boundary before
2004 	 * moving cpos - we will need to adjust clusters after
2005 	 * updating e_cpos to keep the same highest cluster count.
2006 	 */
2007 	right_end = le32_to_cpu(right_rec->e_cpos);
2008 	right_end += le32_to_cpu(right_rec->e_int_clusters);
2009 
2010 	right_rec->e_cpos = left_rec->e_cpos;
2011 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
2012 
2013 	right_end -= le32_to_cpu(right_rec->e_cpos);
2014 	right_rec->e_int_clusters = cpu_to_le32(right_end);
2015 }
2016 
2017 /*
2018  * Adjust the adjacent root node records involved in a
2019  * rotation. left_el_blkno is passed in as a key so that we can easily
2020  * find it's index in the root list.
2021  */
2022 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2023 				      struct ocfs2_extent_list *left_el,
2024 				      struct ocfs2_extent_list *right_el,
2025 				      u64 left_el_blkno)
2026 {
2027 	int i;
2028 
2029 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2030 	       le16_to_cpu(left_el->l_tree_depth));
2031 
2032 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2033 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2034 			break;
2035 	}
2036 
2037 	/*
2038 	 * The path walking code should have never returned a root and
2039 	 * two paths which are not adjacent.
2040 	 */
2041 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2042 
2043 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
2044 				      &root_el->l_recs[i + 1], right_el);
2045 }
2046 
2047 /*
2048  * We've changed a leaf block (in right_path) and need to reflect that
2049  * change back up the subtree.
2050  *
2051  * This happens in multiple places:
2052  *   - When we've moved an extent record from the left path leaf to the right
2053  *     path leaf to make room for an empty extent in the left path leaf.
2054  *   - When our insert into the right path leaf is at the leftmost edge
2055  *     and requires an update of the path immediately to it's left. This
2056  *     can occur at the end of some types of rotation and appending inserts.
2057  *   - When we've adjusted the last extent record in the left path leaf and the
2058  *     1st extent record in the right path leaf during cross extent block merge.
2059  */
2060 static void ocfs2_complete_edge_insert(handle_t *handle,
2061 				       struct ocfs2_path *left_path,
2062 				       struct ocfs2_path *right_path,
2063 				       int subtree_index)
2064 {
2065 	int i, idx;
2066 	struct ocfs2_extent_list *el, *left_el, *right_el;
2067 	struct ocfs2_extent_rec *left_rec, *right_rec;
2068 	struct buffer_head *root_bh;
2069 
2070 	/*
2071 	 * Update the counts and position values within all the
2072 	 * interior nodes to reflect the leaf rotation we just did.
2073 	 *
2074 	 * The root node is handled below the loop.
2075 	 *
2076 	 * We begin the loop with right_el and left_el pointing to the
2077 	 * leaf lists and work our way up.
2078 	 *
2079 	 * NOTE: within this loop, left_el and right_el always refer
2080 	 * to the *child* lists.
2081 	 */
2082 	left_el = path_leaf_el(left_path);
2083 	right_el = path_leaf_el(right_path);
2084 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2085 		trace_ocfs2_complete_edge_insert(i);
2086 
2087 		/*
2088 		 * One nice property of knowing that all of these
2089 		 * nodes are below the root is that we only deal with
2090 		 * the leftmost right node record and the rightmost
2091 		 * left node record.
2092 		 */
2093 		el = left_path->p_node[i].el;
2094 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2095 		left_rec = &el->l_recs[idx];
2096 
2097 		el = right_path->p_node[i].el;
2098 		right_rec = &el->l_recs[0];
2099 
2100 		ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
2101 
2102 		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2103 		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2104 
2105 		/*
2106 		 * Setup our list pointers now so that the current
2107 		 * parents become children in the next iteration.
2108 		 */
2109 		left_el = left_path->p_node[i].el;
2110 		right_el = right_path->p_node[i].el;
2111 	}
2112 
2113 	/*
2114 	 * At the root node, adjust the two adjacent records which
2115 	 * begin our path to the leaves.
2116 	 */
2117 
2118 	el = left_path->p_node[subtree_index].el;
2119 	left_el = left_path->p_node[subtree_index + 1].el;
2120 	right_el = right_path->p_node[subtree_index + 1].el;
2121 
2122 	ocfs2_adjust_root_records(el, left_el, right_el,
2123 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2124 
2125 	root_bh = left_path->p_node[subtree_index].bh;
2126 
2127 	ocfs2_journal_dirty(handle, root_bh);
2128 }
2129 
2130 static int ocfs2_rotate_subtree_right(handle_t *handle,
2131 				      struct ocfs2_extent_tree *et,
2132 				      struct ocfs2_path *left_path,
2133 				      struct ocfs2_path *right_path,
2134 				      int subtree_index)
2135 {
2136 	int ret, i;
2137 	struct buffer_head *right_leaf_bh;
2138 	struct buffer_head *left_leaf_bh = NULL;
2139 	struct buffer_head *root_bh;
2140 	struct ocfs2_extent_list *right_el, *left_el;
2141 	struct ocfs2_extent_rec move_rec;
2142 
2143 	left_leaf_bh = path_leaf_bh(left_path);
2144 	left_el = path_leaf_el(left_path);
2145 
2146 	if (left_el->l_next_free_rec != left_el->l_count) {
2147 		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2148 			    "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
2149 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2150 			    (unsigned long long)left_leaf_bh->b_blocknr,
2151 			    le16_to_cpu(left_el->l_next_free_rec));
2152 		return -EROFS;
2153 	}
2154 
2155 	/*
2156 	 * This extent block may already have an empty record, so we
2157 	 * return early if so.
2158 	 */
2159 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2160 		return 0;
2161 
2162 	root_bh = left_path->p_node[subtree_index].bh;
2163 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2164 
2165 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2166 					   subtree_index);
2167 	if (ret) {
2168 		mlog_errno(ret);
2169 		goto out;
2170 	}
2171 
2172 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2173 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2174 						   right_path, i);
2175 		if (ret) {
2176 			mlog_errno(ret);
2177 			goto out;
2178 		}
2179 
2180 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2181 						   left_path, i);
2182 		if (ret) {
2183 			mlog_errno(ret);
2184 			goto out;
2185 		}
2186 	}
2187 
2188 	right_leaf_bh = path_leaf_bh(right_path);
2189 	right_el = path_leaf_el(right_path);
2190 
2191 	/* This is a code error, not a disk corruption. */
2192 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2193 			"because rightmost leaf block %llu is empty\n",
2194 			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2195 			(unsigned long long)right_leaf_bh->b_blocknr);
2196 
2197 	ocfs2_create_empty_extent(right_el);
2198 
2199 	ocfs2_journal_dirty(handle, right_leaf_bh);
2200 
2201 	/* Do the copy now. */
2202 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2203 	move_rec = left_el->l_recs[i];
2204 	right_el->l_recs[0] = move_rec;
2205 
2206 	/*
2207 	 * Clear out the record we just copied and shift everything
2208 	 * over, leaving an empty extent in the left leaf.
2209 	 *
2210 	 * We temporarily subtract from next_free_rec so that the
2211 	 * shift will lose the tail record (which is now defunct).
2212 	 */
2213 	le16_add_cpu(&left_el->l_next_free_rec, -1);
2214 	ocfs2_shift_records_right(left_el);
2215 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2216 	le16_add_cpu(&left_el->l_next_free_rec, 1);
2217 
2218 	ocfs2_journal_dirty(handle, left_leaf_bh);
2219 
2220 	ocfs2_complete_edge_insert(handle, left_path, right_path,
2221 				   subtree_index);
2222 
2223 out:
2224 	return ret;
2225 }
2226 
2227 /*
2228  * Given a full path, determine what cpos value would return us a path
2229  * containing the leaf immediately to the left of the current one.
2230  *
2231  * Will return zero if the path passed in is already the leftmost path.
2232  */
2233 int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2234 				  struct ocfs2_path *path, u32 *cpos)
2235 {
2236 	int i, j, ret = 0;
2237 	u64 blkno;
2238 	struct ocfs2_extent_list *el;
2239 
2240 	BUG_ON(path->p_tree_depth == 0);
2241 
2242 	*cpos = 0;
2243 
2244 	blkno = path_leaf_bh(path)->b_blocknr;
2245 
2246 	/* Start at the tree node just above the leaf and work our way up. */
2247 	i = path->p_tree_depth - 1;
2248 	while (i >= 0) {
2249 		el = path->p_node[i].el;
2250 
2251 		/*
2252 		 * Find the extent record just before the one in our
2253 		 * path.
2254 		 */
2255 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2256 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2257 				if (j == 0) {
2258 					if (i == 0) {
2259 						/*
2260 						 * We've determined that the
2261 						 * path specified is already
2262 						 * the leftmost one - return a
2263 						 * cpos of zero.
2264 						 */
2265 						goto out;
2266 					}
2267 					/*
2268 					 * The leftmost record points to our
2269 					 * leaf - we need to travel up the
2270 					 * tree one level.
2271 					 */
2272 					goto next_node;
2273 				}
2274 
2275 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2276 				*cpos = *cpos + ocfs2_rec_clusters(el,
2277 							   &el->l_recs[j - 1]);
2278 				*cpos = *cpos - 1;
2279 				goto out;
2280 			}
2281 		}
2282 
2283 		/*
2284 		 * If we got here, we never found a valid node where
2285 		 * the tree indicated one should be.
2286 		 */
2287 		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2288 			    (unsigned long long)blkno);
2289 		ret = -EROFS;
2290 		goto out;
2291 
2292 next_node:
2293 		blkno = path->p_node[i].bh->b_blocknr;
2294 		i--;
2295 	}
2296 
2297 out:
2298 	return ret;
2299 }
2300 
2301 /*
2302  * Extend the transaction by enough credits to complete the rotation,
2303  * and still leave at least the original number of credits allocated
2304  * to this transaction.
2305  */
2306 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2307 					   int op_credits,
2308 					   struct ocfs2_path *path)
2309 {
2310 	int ret = 0;
2311 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2312 
2313 	if (jbd2_handle_buffer_credits(handle) < credits)
2314 		ret = ocfs2_extend_trans(handle,
2315 				credits - jbd2_handle_buffer_credits(handle));
2316 
2317 	return ret;
2318 }
2319 
2320 /*
2321  * Trap the case where we're inserting into the theoretical range past
2322  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2323  * whose cpos is less than ours into the right leaf.
2324  *
2325  * It's only necessary to look at the rightmost record of the left
2326  * leaf because the logic that calls us should ensure that the
2327  * theoretical ranges in the path components above the leaves are
2328  * correct.
2329  */
2330 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2331 						 u32 insert_cpos)
2332 {
2333 	struct ocfs2_extent_list *left_el;
2334 	struct ocfs2_extent_rec *rec;
2335 	int next_free;
2336 
2337 	left_el = path_leaf_el(left_path);
2338 	next_free = le16_to_cpu(left_el->l_next_free_rec);
2339 	rec = &left_el->l_recs[next_free - 1];
2340 
2341 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
2342 		return 1;
2343 	return 0;
2344 }
2345 
2346 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2347 {
2348 	int next_free = le16_to_cpu(el->l_next_free_rec);
2349 	unsigned int range;
2350 	struct ocfs2_extent_rec *rec;
2351 
2352 	if (next_free == 0)
2353 		return 0;
2354 
2355 	rec = &el->l_recs[0];
2356 	if (ocfs2_is_empty_extent(rec)) {
2357 		/* Empty list. */
2358 		if (next_free == 1)
2359 			return 0;
2360 		rec = &el->l_recs[1];
2361 	}
2362 
2363 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2364 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2365 		return 1;
2366 	return 0;
2367 }
2368 
2369 /*
2370  * Rotate all the records in a btree right one record, starting at insert_cpos.
2371  *
2372  * The path to the rightmost leaf should be passed in.
2373  *
2374  * The array is assumed to be large enough to hold an entire path (tree depth).
2375  *
2376  * Upon successful return from this function:
2377  *
2378  * - The 'right_path' array will contain a path to the leaf block
2379  *   whose range contains e_cpos.
2380  * - That leaf block will have a single empty extent in list index 0.
2381  * - In the case that the rotation requires a post-insert update,
2382  *   *ret_left_path will contain a valid path which can be passed to
2383  *   ocfs2_insert_path().
2384  */
2385 static int ocfs2_rotate_tree_right(handle_t *handle,
2386 				   struct ocfs2_extent_tree *et,
2387 				   enum ocfs2_split_type split,
2388 				   u32 insert_cpos,
2389 				   struct ocfs2_path *right_path,
2390 				   struct ocfs2_path **ret_left_path)
2391 {
2392 	int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
2393 	u32 cpos;
2394 	struct ocfs2_path *left_path = NULL;
2395 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2396 
2397 	*ret_left_path = NULL;
2398 
2399 	left_path = ocfs2_new_path_from_path(right_path);
2400 	if (!left_path) {
2401 		ret = -ENOMEM;
2402 		mlog_errno(ret);
2403 		goto out;
2404 	}
2405 
2406 	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2407 	if (ret) {
2408 		mlog_errno(ret);
2409 		goto out;
2410 	}
2411 
2412 	trace_ocfs2_rotate_tree_right(
2413 		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2414 		insert_cpos, cpos);
2415 
2416 	/*
2417 	 * What we want to do here is:
2418 	 *
2419 	 * 1) Start with the rightmost path.
2420 	 *
2421 	 * 2) Determine a path to the leaf block directly to the left
2422 	 *    of that leaf.
2423 	 *
2424 	 * 3) Determine the 'subtree root' - the lowest level tree node
2425 	 *    which contains a path to both leaves.
2426 	 *
2427 	 * 4) Rotate the subtree.
2428 	 *
2429 	 * 5) Find the next subtree by considering the left path to be
2430 	 *    the new right path.
2431 	 *
2432 	 * The check at the top of this while loop also accepts
2433 	 * insert_cpos == cpos because cpos is only a _theoretical_
2434 	 * value to get us the left path - insert_cpos might very well
2435 	 * be filling that hole.
2436 	 *
2437 	 * Stop at a cpos of '0' because we either started at the
2438 	 * leftmost branch (i.e., a tree with one branch and a
2439 	 * rotation inside of it), or we've gone as far as we can in
2440 	 * rotating subtrees.
2441 	 */
2442 	while (cpos && insert_cpos <= cpos) {
2443 		trace_ocfs2_rotate_tree_right(
2444 			(unsigned long long)
2445 			ocfs2_metadata_cache_owner(et->et_ci),
2446 			insert_cpos, cpos);
2447 
2448 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2449 		if (ret) {
2450 			mlog_errno(ret);
2451 			goto out;
2452 		}
2453 
2454 		mlog_bug_on_msg(path_leaf_bh(left_path) ==
2455 				path_leaf_bh(right_path),
2456 				"Owner %llu: error during insert of %u "
2457 				"(left path cpos %u) results in two identical "
2458 				"paths ending at %llu\n",
2459 				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2460 				insert_cpos, cpos,
2461 				(unsigned long long)
2462 				path_leaf_bh(left_path)->b_blocknr);
2463 
2464 		if (split == SPLIT_NONE &&
2465 		    ocfs2_rotate_requires_path_adjustment(left_path,
2466 							  insert_cpos)) {
2467 
2468 			/*
2469 			 * We've rotated the tree as much as we
2470 			 * should. The rest is up to
2471 			 * ocfs2_insert_path() to complete, after the
2472 			 * record insertion. We indicate this
2473 			 * situation by returning the left path.
2474 			 *
2475 			 * The reason we don't adjust the records here
2476 			 * before the record insert is that an error
2477 			 * later might break the rule where a parent
2478 			 * record e_cpos will reflect the actual
2479 			 * e_cpos of the 1st nonempty record of the
2480 			 * child list.
2481 			 */
2482 			*ret_left_path = left_path;
2483 			goto out_ret_path;
2484 		}
2485 
2486 		start = ocfs2_find_subtree_root(et, left_path, right_path);
2487 
2488 		trace_ocfs2_rotate_subtree(start,
2489 			(unsigned long long)
2490 			right_path->p_node[start].bh->b_blocknr,
2491 			right_path->p_tree_depth);
2492 
2493 		ret = ocfs2_extend_rotate_transaction(handle, start,
2494 						      orig_credits, right_path);
2495 		if (ret) {
2496 			mlog_errno(ret);
2497 			goto out;
2498 		}
2499 
2500 		ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2501 						 right_path, start);
2502 		if (ret) {
2503 			mlog_errno(ret);
2504 			goto out;
2505 		}
2506 
2507 		if (split != SPLIT_NONE &&
2508 		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2509 						insert_cpos)) {
2510 			/*
2511 			 * A rotate moves the rightmost left leaf
2512 			 * record over to the leftmost right leaf
2513 			 * slot. If we're doing an extent split
2514 			 * instead of a real insert, then we have to
2515 			 * check that the extent to be split wasn't
2516 			 * just moved over. If it was, then we can
2517 			 * exit here, passing left_path back -
2518 			 * ocfs2_split_extent() is smart enough to
2519 			 * search both leaves.
2520 			 */
2521 			*ret_left_path = left_path;
2522 			goto out_ret_path;
2523 		}
2524 
2525 		/*
2526 		 * There is no need to re-read the next right path
2527 		 * as we know that it'll be our current left
2528 		 * path. Optimize by copying values instead.
2529 		 */
2530 		ocfs2_mv_path(right_path, left_path);
2531 
2532 		ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2533 		if (ret) {
2534 			mlog_errno(ret);
2535 			goto out;
2536 		}
2537 	}
2538 
2539 out:
2540 	ocfs2_free_path(left_path);
2541 
2542 out_ret_path:
2543 	return ret;
2544 }
2545 
2546 static int ocfs2_update_edge_lengths(handle_t *handle,
2547 				     struct ocfs2_extent_tree *et,
2548 				     struct ocfs2_path *path)
2549 {
2550 	int i, idx, ret;
2551 	struct ocfs2_extent_rec *rec;
2552 	struct ocfs2_extent_list *el;
2553 	struct ocfs2_extent_block *eb;
2554 	u32 range;
2555 
2556 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2557 	if (ret) {
2558 		mlog_errno(ret);
2559 		goto out;
2560 	}
2561 
2562 	/* Path should always be rightmost. */
2563 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2564 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2565 
2566 	el = &eb->h_list;
2567 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2568 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2569 	rec = &el->l_recs[idx];
2570 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2571 
2572 	for (i = 0; i < path->p_tree_depth; i++) {
2573 		el = path->p_node[i].el;
2574 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2575 		rec = &el->l_recs[idx];
2576 
2577 		rec->e_int_clusters = cpu_to_le32(range);
2578 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2579 
2580 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2581 	}
2582 out:
2583 	return ret;
2584 }
2585 
2586 static void ocfs2_unlink_path(handle_t *handle,
2587 			      struct ocfs2_extent_tree *et,
2588 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
2589 			      struct ocfs2_path *path, int unlink_start)
2590 {
2591 	int ret, i;
2592 	struct ocfs2_extent_block *eb;
2593 	struct ocfs2_extent_list *el;
2594 	struct buffer_head *bh;
2595 
2596 	for(i = unlink_start; i < path_num_items(path); i++) {
2597 		bh = path->p_node[i].bh;
2598 
2599 		eb = (struct ocfs2_extent_block *)bh->b_data;
2600 		/*
2601 		 * Not all nodes might have had their final count
2602 		 * decremented by the caller - handle this here.
2603 		 */
2604 		el = &eb->h_list;
2605 		if (le16_to_cpu(el->l_next_free_rec) > 1) {
2606 			mlog(ML_ERROR,
2607 			     "Inode %llu, attempted to remove extent block "
2608 			     "%llu with %u records\n",
2609 			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2610 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
2611 			     le16_to_cpu(el->l_next_free_rec));
2612 
2613 			ocfs2_journal_dirty(handle, bh);
2614 			ocfs2_remove_from_cache(et->et_ci, bh);
2615 			continue;
2616 		}
2617 
2618 		el->l_next_free_rec = 0;
2619 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2620 
2621 		ocfs2_journal_dirty(handle, bh);
2622 
2623 		ret = ocfs2_cache_extent_block_free(dealloc, eb);
2624 		if (ret)
2625 			mlog_errno(ret);
2626 
2627 		ocfs2_remove_from_cache(et->et_ci, bh);
2628 	}
2629 }
2630 
2631 static void ocfs2_unlink_subtree(handle_t *handle,
2632 				 struct ocfs2_extent_tree *et,
2633 				 struct ocfs2_path *left_path,
2634 				 struct ocfs2_path *right_path,
2635 				 int subtree_index,
2636 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2637 {
2638 	int i;
2639 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2640 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2641 	struct ocfs2_extent_block *eb;
2642 
2643 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2644 
2645 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2646 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2647 			break;
2648 
2649 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2650 
2651 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2652 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2653 
2654 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2655 	eb->h_next_leaf_blk = 0;
2656 
2657 	ocfs2_journal_dirty(handle, root_bh);
2658 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2659 
2660 	ocfs2_unlink_path(handle, et, dealloc, right_path,
2661 			  subtree_index + 1);
2662 }
2663 
2664 static int ocfs2_rotate_subtree_left(handle_t *handle,
2665 				     struct ocfs2_extent_tree *et,
2666 				     struct ocfs2_path *left_path,
2667 				     struct ocfs2_path *right_path,
2668 				     int subtree_index,
2669 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
2670 				     int *deleted)
2671 {
2672 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
2673 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2674 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2675 	struct ocfs2_extent_block *eb;
2676 
2677 	*deleted = 0;
2678 
2679 	right_leaf_el = path_leaf_el(right_path);
2680 	left_leaf_el = path_leaf_el(left_path);
2681 	root_bh = left_path->p_node[subtree_index].bh;
2682 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2683 
2684 	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2685 		return 0;
2686 
2687 	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2688 	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2689 		/*
2690 		 * It's legal for us to proceed if the right leaf is
2691 		 * the rightmost one and it has an empty extent. There
2692 		 * are two cases to handle - whether the leaf will be
2693 		 * empty after removal or not. If the leaf isn't empty
2694 		 * then just remove the empty extent up front. The
2695 		 * next block will handle empty leaves by flagging
2696 		 * them for unlink.
2697 		 *
2698 		 * Non rightmost leaves will throw -EAGAIN and the
2699 		 * caller can manually move the subtree and retry.
2700 		 */
2701 
2702 		if (eb->h_next_leaf_blk != 0ULL)
2703 			return -EAGAIN;
2704 
2705 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2706 			ret = ocfs2_journal_access_eb(handle, et->et_ci,
2707 						      path_leaf_bh(right_path),
2708 						      OCFS2_JOURNAL_ACCESS_WRITE);
2709 			if (ret) {
2710 				mlog_errno(ret);
2711 				goto out;
2712 			}
2713 
2714 			ocfs2_remove_empty_extent(right_leaf_el);
2715 		} else
2716 			right_has_empty = 1;
2717 	}
2718 
2719 	if (eb->h_next_leaf_blk == 0ULL &&
2720 	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2721 		/*
2722 		 * We have to update i_last_eb_blk during the meta
2723 		 * data delete.
2724 		 */
2725 		ret = ocfs2_et_root_journal_access(handle, et,
2726 						   OCFS2_JOURNAL_ACCESS_WRITE);
2727 		if (ret) {
2728 			mlog_errno(ret);
2729 			goto out;
2730 		}
2731 
2732 		del_right_subtree = 1;
2733 	}
2734 
2735 	/*
2736 	 * Getting here with an empty extent in the right path implies
2737 	 * that it's the rightmost path and will be deleted.
2738 	 */
2739 	BUG_ON(right_has_empty && !del_right_subtree);
2740 
2741 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2742 					   subtree_index);
2743 	if (ret) {
2744 		mlog_errno(ret);
2745 		goto out;
2746 	}
2747 
2748 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2749 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2750 						   right_path, i);
2751 		if (ret) {
2752 			mlog_errno(ret);
2753 			goto out;
2754 		}
2755 
2756 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2757 						   left_path, i);
2758 		if (ret) {
2759 			mlog_errno(ret);
2760 			goto out;
2761 		}
2762 	}
2763 
2764 	if (!right_has_empty) {
2765 		/*
2766 		 * Only do this if we're moving a real
2767 		 * record. Otherwise, the action is delayed until
2768 		 * after removal of the right path in which case we
2769 		 * can do a simple shift to remove the empty extent.
2770 		 */
2771 		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2772 		memset(&right_leaf_el->l_recs[0], 0,
2773 		       sizeof(struct ocfs2_extent_rec));
2774 	}
2775 	if (eb->h_next_leaf_blk == 0ULL) {
2776 		/*
2777 		 * Move recs over to get rid of empty extent, decrease
2778 		 * next_free. This is allowed to remove the last
2779 		 * extent in our leaf (setting l_next_free_rec to
2780 		 * zero) - the delete code below won't care.
2781 		 */
2782 		ocfs2_remove_empty_extent(right_leaf_el);
2783 	}
2784 
2785 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2786 	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2787 
2788 	if (del_right_subtree) {
2789 		ocfs2_unlink_subtree(handle, et, left_path, right_path,
2790 				     subtree_index, dealloc);
2791 		ret = ocfs2_update_edge_lengths(handle, et, left_path);
2792 		if (ret) {
2793 			mlog_errno(ret);
2794 			goto out;
2795 		}
2796 
2797 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2798 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2799 
2800 		/*
2801 		 * Removal of the extent in the left leaf was skipped
2802 		 * above so we could delete the right path
2803 		 * 1st.
2804 		 */
2805 		if (right_has_empty)
2806 			ocfs2_remove_empty_extent(left_leaf_el);
2807 
2808 		ocfs2_journal_dirty(handle, et_root_bh);
2809 
2810 		*deleted = 1;
2811 	} else
2812 		ocfs2_complete_edge_insert(handle, left_path, right_path,
2813 					   subtree_index);
2814 
2815 out:
2816 	return ret;
2817 }
2818 
2819 /*
2820  * Given a full path, determine what cpos value would return us a path
2821  * containing the leaf immediately to the right of the current one.
2822  *
2823  * Will return zero if the path passed in is already the rightmost path.
2824  *
2825  * This looks similar, but is subtly different to
2826  * ocfs2_find_cpos_for_left_leaf().
2827  */
2828 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2829 				   struct ocfs2_path *path, u32 *cpos)
2830 {
2831 	int i, j, ret = 0;
2832 	u64 blkno;
2833 	struct ocfs2_extent_list *el;
2834 
2835 	*cpos = 0;
2836 
2837 	if (path->p_tree_depth == 0)
2838 		return 0;
2839 
2840 	blkno = path_leaf_bh(path)->b_blocknr;
2841 
2842 	/* Start at the tree node just above the leaf and work our way up. */
2843 	i = path->p_tree_depth - 1;
2844 	while (i >= 0) {
2845 		int next_free;
2846 
2847 		el = path->p_node[i].el;
2848 
2849 		/*
2850 		 * Find the extent record just after the one in our
2851 		 * path.
2852 		 */
2853 		next_free = le16_to_cpu(el->l_next_free_rec);
2854 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2855 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2856 				if (j == (next_free - 1)) {
2857 					if (i == 0) {
2858 						/*
2859 						 * We've determined that the
2860 						 * path specified is already
2861 						 * the rightmost one - return a
2862 						 * cpos of zero.
2863 						 */
2864 						goto out;
2865 					}
2866 					/*
2867 					 * The rightmost record points to our
2868 					 * leaf - we need to travel up the
2869 					 * tree one level.
2870 					 */
2871 					goto next_node;
2872 				}
2873 
2874 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2875 				goto out;
2876 			}
2877 		}
2878 
2879 		/*
2880 		 * If we got here, we never found a valid node where
2881 		 * the tree indicated one should be.
2882 		 */
2883 		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2884 			    (unsigned long long)blkno);
2885 		ret = -EROFS;
2886 		goto out;
2887 
2888 next_node:
2889 		blkno = path->p_node[i].bh->b_blocknr;
2890 		i--;
2891 	}
2892 
2893 out:
2894 	return ret;
2895 }
2896 
2897 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2898 					    struct ocfs2_extent_tree *et,
2899 					    struct ocfs2_path *path)
2900 {
2901 	int ret;
2902 	struct buffer_head *bh = path_leaf_bh(path);
2903 	struct ocfs2_extent_list *el = path_leaf_el(path);
2904 
2905 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2906 		return 0;
2907 
2908 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2909 					   path_num_items(path) - 1);
2910 	if (ret) {
2911 		mlog_errno(ret);
2912 		goto out;
2913 	}
2914 
2915 	ocfs2_remove_empty_extent(el);
2916 	ocfs2_journal_dirty(handle, bh);
2917 
2918 out:
2919 	return ret;
2920 }
2921 
2922 static int __ocfs2_rotate_tree_left(handle_t *handle,
2923 				    struct ocfs2_extent_tree *et,
2924 				    int orig_credits,
2925 				    struct ocfs2_path *path,
2926 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
2927 				    struct ocfs2_path **empty_extent_path)
2928 {
2929 	int ret, subtree_root, deleted;
2930 	u32 right_cpos;
2931 	struct ocfs2_path *left_path = NULL;
2932 	struct ocfs2_path *right_path = NULL;
2933 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2934 
2935 	if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
2936 		return 0;
2937 
2938 	*empty_extent_path = NULL;
2939 
2940 	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2941 	if (ret) {
2942 		mlog_errno(ret);
2943 		goto out;
2944 	}
2945 
2946 	left_path = ocfs2_new_path_from_path(path);
2947 	if (!left_path) {
2948 		ret = -ENOMEM;
2949 		mlog_errno(ret);
2950 		goto out;
2951 	}
2952 
2953 	ocfs2_cp_path(left_path, path);
2954 
2955 	right_path = ocfs2_new_path_from_path(path);
2956 	if (!right_path) {
2957 		ret = -ENOMEM;
2958 		mlog_errno(ret);
2959 		goto out;
2960 	}
2961 
2962 	while (right_cpos) {
2963 		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2964 		if (ret) {
2965 			mlog_errno(ret);
2966 			goto out;
2967 		}
2968 
2969 		subtree_root = ocfs2_find_subtree_root(et, left_path,
2970 						       right_path);
2971 
2972 		trace_ocfs2_rotate_subtree(subtree_root,
2973 		     (unsigned long long)
2974 		     right_path->p_node[subtree_root].bh->b_blocknr,
2975 		     right_path->p_tree_depth);
2976 
2977 		ret = ocfs2_extend_rotate_transaction(handle, 0,
2978 						      orig_credits, left_path);
2979 		if (ret) {
2980 			mlog_errno(ret);
2981 			goto out;
2982 		}
2983 
2984 		/*
2985 		 * Caller might still want to make changes to the
2986 		 * tree root, so re-add it to the journal here.
2987 		 */
2988 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2989 						   left_path, 0);
2990 		if (ret) {
2991 			mlog_errno(ret);
2992 			goto out;
2993 		}
2994 
2995 		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2996 						right_path, subtree_root,
2997 						dealloc, &deleted);
2998 		if (ret == -EAGAIN) {
2999 			/*
3000 			 * The rotation has to temporarily stop due to
3001 			 * the right subtree having an empty
3002 			 * extent. Pass it back to the caller for a
3003 			 * fixup.
3004 			 */
3005 			*empty_extent_path = right_path;
3006 			right_path = NULL;
3007 			goto out;
3008 		}
3009 		if (ret) {
3010 			mlog_errno(ret);
3011 			goto out;
3012 		}
3013 
3014 		/*
3015 		 * The subtree rotate might have removed records on
3016 		 * the rightmost edge. If so, then rotation is
3017 		 * complete.
3018 		 */
3019 		if (deleted)
3020 			break;
3021 
3022 		ocfs2_mv_path(left_path, right_path);
3023 
3024 		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3025 						     &right_cpos);
3026 		if (ret) {
3027 			mlog_errno(ret);
3028 			goto out;
3029 		}
3030 	}
3031 
3032 out:
3033 	ocfs2_free_path(right_path);
3034 	ocfs2_free_path(left_path);
3035 
3036 	return ret;
3037 }
3038 
3039 static int ocfs2_remove_rightmost_path(handle_t *handle,
3040 				struct ocfs2_extent_tree *et,
3041 				struct ocfs2_path *path,
3042 				struct ocfs2_cached_dealloc_ctxt *dealloc)
3043 {
3044 	int ret, subtree_index;
3045 	u32 cpos;
3046 	struct ocfs2_path *left_path = NULL;
3047 	struct ocfs2_extent_block *eb;
3048 	struct ocfs2_extent_list *el;
3049 
3050 	ret = ocfs2_et_sanity_check(et);
3051 	if (ret)
3052 		goto out;
3053 
3054 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3055 	if (ret) {
3056 		mlog_errno(ret);
3057 		goto out;
3058 	}
3059 
3060 	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3061 					    path, &cpos);
3062 	if (ret) {
3063 		mlog_errno(ret);
3064 		goto out;
3065 	}
3066 
3067 	if (cpos) {
3068 		/*
3069 		 * We have a path to the left of this one - it needs
3070 		 * an update too.
3071 		 */
3072 		left_path = ocfs2_new_path_from_path(path);
3073 		if (!left_path) {
3074 			ret = -ENOMEM;
3075 			mlog_errno(ret);
3076 			goto out;
3077 		}
3078 
3079 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3080 		if (ret) {
3081 			mlog_errno(ret);
3082 			goto out;
3083 		}
3084 
3085 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3086 		if (ret) {
3087 			mlog_errno(ret);
3088 			goto out;
3089 		}
3090 
3091 		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3092 
3093 		ocfs2_unlink_subtree(handle, et, left_path, path,
3094 				     subtree_index, dealloc);
3095 		ret = ocfs2_update_edge_lengths(handle, et, left_path);
3096 		if (ret) {
3097 			mlog_errno(ret);
3098 			goto out;
3099 		}
3100 
3101 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3102 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3103 	} else {
3104 		/*
3105 		 * 'path' is also the leftmost path which
3106 		 * means it must be the only one. This gets
3107 		 * handled differently because we want to
3108 		 * revert the root back to having extents
3109 		 * in-line.
3110 		 */
3111 		ocfs2_unlink_path(handle, et, dealloc, path, 1);
3112 
3113 		el = et->et_root_el;
3114 		el->l_tree_depth = 0;
3115 		el->l_next_free_rec = 0;
3116 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3117 
3118 		ocfs2_et_set_last_eb_blk(et, 0);
3119 	}
3120 
3121 	ocfs2_journal_dirty(handle, path_root_bh(path));
3122 
3123 out:
3124 	ocfs2_free_path(left_path);
3125 	return ret;
3126 }
3127 
3128 static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
3129 				struct ocfs2_extent_tree *et,
3130 				struct ocfs2_path *path,
3131 				struct ocfs2_cached_dealloc_ctxt *dealloc)
3132 {
3133 	handle_t *handle;
3134 	int ret;
3135 	int credits = path->p_tree_depth * 2 + 1;
3136 
3137 	handle = ocfs2_start_trans(osb, credits);
3138 	if (IS_ERR(handle)) {
3139 		ret = PTR_ERR(handle);
3140 		mlog_errno(ret);
3141 		return ret;
3142 	}
3143 
3144 	ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
3145 	if (ret)
3146 		mlog_errno(ret);
3147 
3148 	ocfs2_commit_trans(osb, handle);
3149 	return ret;
3150 }
3151 
3152 /*
3153  * Left rotation of btree records.
3154  *
3155  * In many ways, this is (unsurprisingly) the opposite of right
3156  * rotation. We start at some non-rightmost path containing an empty
3157  * extent in the leaf block. The code works its way to the rightmost
3158  * path by rotating records to the left in every subtree.
3159  *
3160  * This is used by any code which reduces the number of extent records
3161  * in a leaf. After removal, an empty record should be placed in the
3162  * leftmost list position.
3163  *
3164  * This won't handle a length update of the rightmost path records if
3165  * the rightmost tree leaf record is removed so the caller is
3166  * responsible for detecting and correcting that.
3167  */
3168 static int ocfs2_rotate_tree_left(handle_t *handle,
3169 				  struct ocfs2_extent_tree *et,
3170 				  struct ocfs2_path *path,
3171 				  struct ocfs2_cached_dealloc_ctxt *dealloc)
3172 {
3173 	int ret, orig_credits = jbd2_handle_buffer_credits(handle);
3174 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3175 	struct ocfs2_extent_block *eb;
3176 	struct ocfs2_extent_list *el;
3177 
3178 	el = path_leaf_el(path);
3179 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3180 		return 0;
3181 
3182 	if (path->p_tree_depth == 0) {
3183 rightmost_no_delete:
3184 		/*
3185 		 * Inline extents. This is trivially handled, so do
3186 		 * it up front.
3187 		 */
3188 		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3189 		if (ret)
3190 			mlog_errno(ret);
3191 		goto out;
3192 	}
3193 
3194 	/*
3195 	 * Handle rightmost branch now. There's several cases:
3196 	 *  1) simple rotation leaving records in there. That's trivial.
3197 	 *  2) rotation requiring a branch delete - there's no more
3198 	 *     records left. Two cases of this:
3199 	 *     a) There are branches to the left.
3200 	 *     b) This is also the leftmost (the only) branch.
3201 	 *
3202 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3203 	 *  2a) we need the left branch so that we can update it with the unlink
3204 	 *  2b) we need to bring the root back to inline extents.
3205 	 */
3206 
3207 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3208 	el = &eb->h_list;
3209 	if (eb->h_next_leaf_blk == 0) {
3210 		/*
3211 		 * This gets a bit tricky if we're going to delete the
3212 		 * rightmost path. Get the other cases out of the way
3213 		 * 1st.
3214 		 */
3215 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3216 			goto rightmost_no_delete;
3217 
3218 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3219 			ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3220 					"Owner %llu has empty extent block at %llu\n",
3221 					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3222 					(unsigned long long)le64_to_cpu(eb->h_blkno));
3223 			goto out;
3224 		}
3225 
3226 		/*
3227 		 * XXX: The caller can not trust "path" any more after
3228 		 * this as it will have been deleted. What do we do?
3229 		 *
3230 		 * In theory the rotate-for-merge code will never get
3231 		 * here because it'll always ask for a rotate in a
3232 		 * nonempty list.
3233 		 */
3234 
3235 		ret = ocfs2_remove_rightmost_path(handle, et, path,
3236 						  dealloc);
3237 		if (ret)
3238 			mlog_errno(ret);
3239 		goto out;
3240 	}
3241 
3242 	/*
3243 	 * Now we can loop, remembering the path we get from -EAGAIN
3244 	 * and restarting from there.
3245 	 */
3246 try_rotate:
3247 	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3248 				       dealloc, &restart_path);
3249 	if (ret && ret != -EAGAIN) {
3250 		mlog_errno(ret);
3251 		goto out;
3252 	}
3253 
3254 	while (ret == -EAGAIN) {
3255 		tmp_path = restart_path;
3256 		restart_path = NULL;
3257 
3258 		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3259 					       tmp_path, dealloc,
3260 					       &restart_path);
3261 		if (ret && ret != -EAGAIN) {
3262 			mlog_errno(ret);
3263 			goto out;
3264 		}
3265 
3266 		ocfs2_free_path(tmp_path);
3267 		tmp_path = NULL;
3268 
3269 		if (ret == 0)
3270 			goto try_rotate;
3271 	}
3272 
3273 out:
3274 	ocfs2_free_path(tmp_path);
3275 	ocfs2_free_path(restart_path);
3276 	return ret;
3277 }
3278 
3279 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3280 				int index)
3281 {
3282 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
3283 	unsigned int size;
3284 
3285 	if (rec->e_leaf_clusters == 0) {
3286 		/*
3287 		 * We consumed all of the merged-from record. An empty
3288 		 * extent cannot exist anywhere but the 1st array
3289 		 * position, so move things over if the merged-from
3290 		 * record doesn't occupy that position.
3291 		 *
3292 		 * This creates a new empty extent so the caller
3293 		 * should be smart enough to have removed any existing
3294 		 * ones.
3295 		 */
3296 		if (index > 0) {
3297 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3298 			size = index * sizeof(struct ocfs2_extent_rec);
3299 			memmove(&el->l_recs[1], &el->l_recs[0], size);
3300 		}
3301 
3302 		/*
3303 		 * Always memset - the caller doesn't check whether it
3304 		 * created an empty extent, so there could be junk in
3305 		 * the other fields.
3306 		 */
3307 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3308 	}
3309 }
3310 
3311 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3312 				struct ocfs2_path *left_path,
3313 				struct ocfs2_path **ret_right_path)
3314 {
3315 	int ret;
3316 	u32 right_cpos;
3317 	struct ocfs2_path *right_path = NULL;
3318 	struct ocfs2_extent_list *left_el;
3319 
3320 	*ret_right_path = NULL;
3321 
3322 	/* This function shouldn't be called for non-trees. */
3323 	BUG_ON(left_path->p_tree_depth == 0);
3324 
3325 	left_el = path_leaf_el(left_path);
3326 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3327 
3328 	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3329 					     left_path, &right_cpos);
3330 	if (ret) {
3331 		mlog_errno(ret);
3332 		goto out;
3333 	}
3334 
3335 	/* This function shouldn't be called for the rightmost leaf. */
3336 	BUG_ON(right_cpos == 0);
3337 
3338 	right_path = ocfs2_new_path_from_path(left_path);
3339 	if (!right_path) {
3340 		ret = -ENOMEM;
3341 		mlog_errno(ret);
3342 		goto out;
3343 	}
3344 
3345 	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3346 	if (ret) {
3347 		mlog_errno(ret);
3348 		goto out;
3349 	}
3350 
3351 	*ret_right_path = right_path;
3352 out:
3353 	if (ret)
3354 		ocfs2_free_path(right_path);
3355 	return ret;
3356 }
3357 
3358 /*
3359  * Remove split_rec clusters from the record at index and merge them
3360  * onto the beginning of the record "next" to it.
3361  * For index < l_count - 1, the next means the extent rec at index + 1.
3362  * For index == l_count - 1, the "next" means the 1st extent rec of the
3363  * next extent block.
3364  */
3365 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3366 				 handle_t *handle,
3367 				 struct ocfs2_extent_tree *et,
3368 				 struct ocfs2_extent_rec *split_rec,
3369 				 int index)
3370 {
3371 	int ret, next_free, i;
3372 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3373 	struct ocfs2_extent_rec *left_rec;
3374 	struct ocfs2_extent_rec *right_rec;
3375 	struct ocfs2_extent_list *right_el;
3376 	struct ocfs2_path *right_path = NULL;
3377 	int subtree_index = 0;
3378 	struct ocfs2_extent_list *el = path_leaf_el(left_path);
3379 	struct buffer_head *bh = path_leaf_bh(left_path);
3380 	struct buffer_head *root_bh = NULL;
3381 
3382 	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3383 	left_rec = &el->l_recs[index];
3384 
3385 	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3386 	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3387 		/* we meet with a cross extent block merge. */
3388 		ret = ocfs2_get_right_path(et, left_path, &right_path);
3389 		if (ret) {
3390 			mlog_errno(ret);
3391 			return ret;
3392 		}
3393 
3394 		right_el = path_leaf_el(right_path);
3395 		next_free = le16_to_cpu(right_el->l_next_free_rec);
3396 		BUG_ON(next_free <= 0);
3397 		right_rec = &right_el->l_recs[0];
3398 		if (ocfs2_is_empty_extent(right_rec)) {
3399 			BUG_ON(next_free <= 1);
3400 			right_rec = &right_el->l_recs[1];
3401 		}
3402 
3403 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3404 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3405 		       le32_to_cpu(right_rec->e_cpos));
3406 
3407 		subtree_index = ocfs2_find_subtree_root(et, left_path,
3408 							right_path);
3409 
3410 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3411 					jbd2_handle_buffer_credits(handle),
3412 					right_path);
3413 		if (ret) {
3414 			mlog_errno(ret);
3415 			goto out;
3416 		}
3417 
3418 		root_bh = left_path->p_node[subtree_index].bh;
3419 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3420 
3421 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3422 						   subtree_index);
3423 		if (ret) {
3424 			mlog_errno(ret);
3425 			goto out;
3426 		}
3427 
3428 		for (i = subtree_index + 1;
3429 		     i < path_num_items(right_path); i++) {
3430 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3431 							   right_path, i);
3432 			if (ret) {
3433 				mlog_errno(ret);
3434 				goto out;
3435 			}
3436 
3437 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3438 							   left_path, i);
3439 			if (ret) {
3440 				mlog_errno(ret);
3441 				goto out;
3442 			}
3443 		}
3444 
3445 	} else {
3446 		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3447 		right_rec = &el->l_recs[index + 1];
3448 	}
3449 
3450 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3451 					   path_num_items(left_path) - 1);
3452 	if (ret) {
3453 		mlog_errno(ret);
3454 		goto out;
3455 	}
3456 
3457 	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3458 
3459 	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3460 	le64_add_cpu(&right_rec->e_blkno,
3461 		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3462 					       split_clusters));
3463 	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3464 
3465 	ocfs2_cleanup_merge(el, index);
3466 
3467 	ocfs2_journal_dirty(handle, bh);
3468 	if (right_path) {
3469 		ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3470 		ocfs2_complete_edge_insert(handle, left_path, right_path,
3471 					   subtree_index);
3472 	}
3473 out:
3474 	ocfs2_free_path(right_path);
3475 	return ret;
3476 }
3477 
3478 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3479 			       struct ocfs2_path *right_path,
3480 			       struct ocfs2_path **ret_left_path)
3481 {
3482 	int ret;
3483 	u32 left_cpos;
3484 	struct ocfs2_path *left_path = NULL;
3485 
3486 	*ret_left_path = NULL;
3487 
3488 	/* This function shouldn't be called for non-trees. */
3489 	BUG_ON(right_path->p_tree_depth == 0);
3490 
3491 	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3492 					    right_path, &left_cpos);
3493 	if (ret) {
3494 		mlog_errno(ret);
3495 		goto out;
3496 	}
3497 
3498 	/* This function shouldn't be called for the leftmost leaf. */
3499 	BUG_ON(left_cpos == 0);
3500 
3501 	left_path = ocfs2_new_path_from_path(right_path);
3502 	if (!left_path) {
3503 		ret = -ENOMEM;
3504 		mlog_errno(ret);
3505 		goto out;
3506 	}
3507 
3508 	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3509 	if (ret) {
3510 		mlog_errno(ret);
3511 		goto out;
3512 	}
3513 
3514 	*ret_left_path = left_path;
3515 out:
3516 	if (ret)
3517 		ocfs2_free_path(left_path);
3518 	return ret;
3519 }
3520 
3521 /*
3522  * Remove split_rec clusters from the record at index and merge them
3523  * onto the tail of the record "before" it.
3524  * For index > 0, the "before" means the extent rec at index - 1.
3525  *
3526  * For index == 0, the "before" means the last record of the previous
3527  * extent block. And there is also a situation that we may need to
3528  * remove the rightmost leaf extent block in the right_path and change
3529  * the right path to indicate the new rightmost path.
3530  */
3531 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3532 				handle_t *handle,
3533 				struct ocfs2_extent_tree *et,
3534 				struct ocfs2_extent_rec *split_rec,
3535 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3536 				int index)
3537 {
3538 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3539 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3540 	struct ocfs2_extent_rec *left_rec;
3541 	struct ocfs2_extent_rec *right_rec;
3542 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3543 	struct buffer_head *bh = path_leaf_bh(right_path);
3544 	struct buffer_head *root_bh = NULL;
3545 	struct ocfs2_path *left_path = NULL;
3546 	struct ocfs2_extent_list *left_el;
3547 
3548 	BUG_ON(index < 0);
3549 
3550 	right_rec = &el->l_recs[index];
3551 	if (index == 0) {
3552 		/* we meet with a cross extent block merge. */
3553 		ret = ocfs2_get_left_path(et, right_path, &left_path);
3554 		if (ret) {
3555 			mlog_errno(ret);
3556 			return ret;
3557 		}
3558 
3559 		left_el = path_leaf_el(left_path);
3560 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3561 		       le16_to_cpu(left_el->l_count));
3562 
3563 		left_rec = &left_el->l_recs[
3564 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3565 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3566 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3567 		       le32_to_cpu(split_rec->e_cpos));
3568 
3569 		subtree_index = ocfs2_find_subtree_root(et, left_path,
3570 							right_path);
3571 
3572 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3573 					jbd2_handle_buffer_credits(handle),
3574 					left_path);
3575 		if (ret) {
3576 			mlog_errno(ret);
3577 			goto out;
3578 		}
3579 
3580 		root_bh = left_path->p_node[subtree_index].bh;
3581 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3582 
3583 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3584 						   subtree_index);
3585 		if (ret) {
3586 			mlog_errno(ret);
3587 			goto out;
3588 		}
3589 
3590 		for (i = subtree_index + 1;
3591 		     i < path_num_items(right_path); i++) {
3592 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3593 							   right_path, i);
3594 			if (ret) {
3595 				mlog_errno(ret);
3596 				goto out;
3597 			}
3598 
3599 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3600 							   left_path, i);
3601 			if (ret) {
3602 				mlog_errno(ret);
3603 				goto out;
3604 			}
3605 		}
3606 	} else {
3607 		left_rec = &el->l_recs[index - 1];
3608 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3609 			has_empty_extent = 1;
3610 	}
3611 
3612 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3613 					   path_num_items(right_path) - 1);
3614 	if (ret) {
3615 		mlog_errno(ret);
3616 		goto out;
3617 	}
3618 
3619 	if (has_empty_extent && index == 1) {
3620 		/*
3621 		 * The easy case - we can just plop the record right in.
3622 		 */
3623 		*left_rec = *split_rec;
3624 	} else
3625 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3626 
3627 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3628 	le64_add_cpu(&right_rec->e_blkno,
3629 		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3630 					      split_clusters));
3631 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3632 
3633 	ocfs2_cleanup_merge(el, index);
3634 
3635 	ocfs2_journal_dirty(handle, bh);
3636 	if (left_path) {
3637 		ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3638 
3639 		/*
3640 		 * In the situation that the right_rec is empty and the extent
3641 		 * block is empty also,  ocfs2_complete_edge_insert can't handle
3642 		 * it and we need to delete the right extent block.
3643 		 */
3644 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3645 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3646 			/* extend credit for ocfs2_remove_rightmost_path */
3647 			ret = ocfs2_extend_rotate_transaction(handle, 0,
3648 					jbd2_handle_buffer_credits(handle),
3649 					right_path);
3650 			if (ret) {
3651 				mlog_errno(ret);
3652 				goto out;
3653 			}
3654 
3655 			ret = ocfs2_remove_rightmost_path(handle, et,
3656 							  right_path,
3657 							  dealloc);
3658 			if (ret) {
3659 				mlog_errno(ret);
3660 				goto out;
3661 			}
3662 
3663 			/* Now the rightmost extent block has been deleted.
3664 			 * So we use the new rightmost path.
3665 			 */
3666 			ocfs2_mv_path(right_path, left_path);
3667 		} else
3668 			ocfs2_complete_edge_insert(handle, left_path,
3669 						   right_path, subtree_index);
3670 	}
3671 out:
3672 	ocfs2_free_path(left_path);
3673 	return ret;
3674 }
3675 
3676 static int ocfs2_try_to_merge_extent(handle_t *handle,
3677 				     struct ocfs2_extent_tree *et,
3678 				     struct ocfs2_path *path,
3679 				     int split_index,
3680 				     struct ocfs2_extent_rec *split_rec,
3681 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
3682 				     struct ocfs2_merge_ctxt *ctxt)
3683 {
3684 	int ret = 0;
3685 	struct ocfs2_extent_list *el = path_leaf_el(path);
3686 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3687 
3688 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3689 
3690 	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3691 		/* extend credit for ocfs2_remove_rightmost_path */
3692 		ret = ocfs2_extend_rotate_transaction(handle, 0,
3693 				jbd2_handle_buffer_credits(handle),
3694 				path);
3695 		if (ret) {
3696 			mlog_errno(ret);
3697 			goto out;
3698 		}
3699 		/*
3700 		 * The merge code will need to create an empty
3701 		 * extent to take the place of the newly
3702 		 * emptied slot. Remove any pre-existing empty
3703 		 * extents - having more than one in a leaf is
3704 		 * illegal.
3705 		 */
3706 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3707 		if (ret) {
3708 			mlog_errno(ret);
3709 			goto out;
3710 		}
3711 		split_index--;
3712 		rec = &el->l_recs[split_index];
3713 	}
3714 
3715 	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3716 		/*
3717 		 * Left-right contig implies this.
3718 		 */
3719 		BUG_ON(!ctxt->c_split_covers_rec);
3720 
3721 		/*
3722 		 * Since the leftright insert always covers the entire
3723 		 * extent, this call will delete the insert record
3724 		 * entirely, resulting in an empty extent record added to
3725 		 * the extent block.
3726 		 *
3727 		 * Since the adding of an empty extent shifts
3728 		 * everything back to the right, there's no need to
3729 		 * update split_index here.
3730 		 *
3731 		 * When the split_index is zero, we need to merge it to the
3732 		 * previous extent block. It is more efficient and easier
3733 		 * if we do merge_right first and merge_left later.
3734 		 */
3735 		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3736 					    split_index);
3737 		if (ret) {
3738 			mlog_errno(ret);
3739 			goto out;
3740 		}
3741 
3742 		/*
3743 		 * We can only get this from logic error above.
3744 		 */
3745 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3746 
3747 		/* extend credit for ocfs2_remove_rightmost_path */
3748 		ret = ocfs2_extend_rotate_transaction(handle, 0,
3749 					jbd2_handle_buffer_credits(handle),
3750 					path);
3751 		if (ret) {
3752 			mlog_errno(ret);
3753 			goto out;
3754 		}
3755 
3756 		/* The merge left us with an empty extent, remove it. */
3757 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3758 		if (ret) {
3759 			mlog_errno(ret);
3760 			goto out;
3761 		}
3762 
3763 		rec = &el->l_recs[split_index];
3764 
3765 		/*
3766 		 * Note that we don't pass split_rec here on purpose -
3767 		 * we've merged it into the rec already.
3768 		 */
3769 		ret = ocfs2_merge_rec_left(path, handle, et, rec,
3770 					   dealloc, split_index);
3771 
3772 		if (ret) {
3773 			mlog_errno(ret);
3774 			goto out;
3775 		}
3776 
3777 		/* extend credit for ocfs2_remove_rightmost_path */
3778 		ret = ocfs2_extend_rotate_transaction(handle, 0,
3779 				jbd2_handle_buffer_credits(handle),
3780 				path);
3781 		if (ret) {
3782 			mlog_errno(ret);
3783 			goto out;
3784 		}
3785 
3786 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3787 		/*
3788 		 * Error from this last rotate is not critical, so
3789 		 * print but don't bubble it up.
3790 		 */
3791 		if (ret)
3792 			mlog_errno(ret);
3793 		ret = 0;
3794 	} else {
3795 		/*
3796 		 * Merge a record to the left or right.
3797 		 *
3798 		 * 'contig_type' is relative to the existing record,
3799 		 * so for example, if we're "right contig", it's to
3800 		 * the record on the left (hence the left merge).
3801 		 */
3802 		if (ctxt->c_contig_type == CONTIG_RIGHT) {
3803 			ret = ocfs2_merge_rec_left(path, handle, et,
3804 						   split_rec, dealloc,
3805 						   split_index);
3806 			if (ret) {
3807 				mlog_errno(ret);
3808 				goto out;
3809 			}
3810 		} else {
3811 			ret = ocfs2_merge_rec_right(path, handle,
3812 						    et, split_rec,
3813 						    split_index);
3814 			if (ret) {
3815 				mlog_errno(ret);
3816 				goto out;
3817 			}
3818 		}
3819 
3820 		if (ctxt->c_split_covers_rec) {
3821 			/* extend credit for ocfs2_remove_rightmost_path */
3822 			ret = ocfs2_extend_rotate_transaction(handle, 0,
3823 					jbd2_handle_buffer_credits(handle),
3824 					path);
3825 			if (ret) {
3826 				mlog_errno(ret);
3827 				ret = 0;
3828 				goto out;
3829 			}
3830 
3831 			/*
3832 			 * The merge may have left an empty extent in
3833 			 * our leaf. Try to rotate it away.
3834 			 */
3835 			ret = ocfs2_rotate_tree_left(handle, et, path,
3836 						     dealloc);
3837 			if (ret)
3838 				mlog_errno(ret);
3839 			ret = 0;
3840 		}
3841 	}
3842 
3843 out:
3844 	return ret;
3845 }
3846 
3847 static void ocfs2_subtract_from_rec(struct super_block *sb,
3848 				    enum ocfs2_split_type split,
3849 				    struct ocfs2_extent_rec *rec,
3850 				    struct ocfs2_extent_rec *split_rec)
3851 {
3852 	u64 len_blocks;
3853 
3854 	len_blocks = ocfs2_clusters_to_blocks(sb,
3855 				le16_to_cpu(split_rec->e_leaf_clusters));
3856 
3857 	if (split == SPLIT_LEFT) {
3858 		/*
3859 		 * Region is on the left edge of the existing
3860 		 * record.
3861 		 */
3862 		le32_add_cpu(&rec->e_cpos,
3863 			     le16_to_cpu(split_rec->e_leaf_clusters));
3864 		le64_add_cpu(&rec->e_blkno, len_blocks);
3865 		le16_add_cpu(&rec->e_leaf_clusters,
3866 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3867 	} else {
3868 		/*
3869 		 * Region is on the right edge of the existing
3870 		 * record.
3871 		 */
3872 		le16_add_cpu(&rec->e_leaf_clusters,
3873 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3874 	}
3875 }
3876 
3877 /*
3878  * Do the final bits of extent record insertion at the target leaf
3879  * list. If this leaf is part of an allocation tree, it is assumed
3880  * that the tree above has been prepared.
3881  */
3882 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3883 				 struct ocfs2_extent_rec *insert_rec,
3884 				 struct ocfs2_extent_list *el,
3885 				 struct ocfs2_insert_type *insert)
3886 {
3887 	int i = insert->ins_contig_index;
3888 	unsigned int range;
3889 	struct ocfs2_extent_rec *rec;
3890 
3891 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3892 
3893 	if (insert->ins_split != SPLIT_NONE) {
3894 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3895 		BUG_ON(i == -1);
3896 		rec = &el->l_recs[i];
3897 		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3898 					insert->ins_split, rec,
3899 					insert_rec);
3900 		goto rotate;
3901 	}
3902 
3903 	/*
3904 	 * Contiguous insert - either left or right.
3905 	 */
3906 	if (insert->ins_contig != CONTIG_NONE) {
3907 		rec = &el->l_recs[i];
3908 		if (insert->ins_contig == CONTIG_LEFT) {
3909 			rec->e_blkno = insert_rec->e_blkno;
3910 			rec->e_cpos = insert_rec->e_cpos;
3911 		}
3912 		le16_add_cpu(&rec->e_leaf_clusters,
3913 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3914 		return;
3915 	}
3916 
3917 	/*
3918 	 * Handle insert into an empty leaf.
3919 	 */
3920 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3921 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3922 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3923 		el->l_recs[0] = *insert_rec;
3924 		el->l_next_free_rec = cpu_to_le16(1);
3925 		return;
3926 	}
3927 
3928 	/*
3929 	 * Appending insert.
3930 	 */
3931 	if (insert->ins_appending == APPEND_TAIL) {
3932 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3933 		rec = &el->l_recs[i];
3934 		range = le32_to_cpu(rec->e_cpos)
3935 			+ le16_to_cpu(rec->e_leaf_clusters);
3936 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3937 
3938 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3939 				le16_to_cpu(el->l_count),
3940 				"owner %llu, depth %u, count %u, next free %u, "
3941 				"rec.cpos %u, rec.clusters %u, "
3942 				"insert.cpos %u, insert.clusters %u\n",
3943 				ocfs2_metadata_cache_owner(et->et_ci),
3944 				le16_to_cpu(el->l_tree_depth),
3945 				le16_to_cpu(el->l_count),
3946 				le16_to_cpu(el->l_next_free_rec),
3947 				le32_to_cpu(el->l_recs[i].e_cpos),
3948 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3949 				le32_to_cpu(insert_rec->e_cpos),
3950 				le16_to_cpu(insert_rec->e_leaf_clusters));
3951 		i++;
3952 		el->l_recs[i] = *insert_rec;
3953 		le16_add_cpu(&el->l_next_free_rec, 1);
3954 		return;
3955 	}
3956 
3957 rotate:
3958 	/*
3959 	 * Ok, we have to rotate.
3960 	 *
3961 	 * At this point, it is safe to assume that inserting into an
3962 	 * empty leaf and appending to a leaf have both been handled
3963 	 * above.
3964 	 *
3965 	 * This leaf needs to have space, either by the empty 1st
3966 	 * extent record, or by virtue of an l_next_free_rec < l_count.
3967 	 */
3968 	ocfs2_rotate_leaf(el, insert_rec);
3969 }
3970 
3971 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3972 					   struct ocfs2_extent_tree *et,
3973 					   struct ocfs2_path *path,
3974 					   struct ocfs2_extent_rec *insert_rec)
3975 {
3976 	int i, next_free;
3977 	struct buffer_head *bh;
3978 	struct ocfs2_extent_list *el;
3979 	struct ocfs2_extent_rec *rec;
3980 
3981 	/*
3982 	 * Update everything except the leaf block.
3983 	 */
3984 	for (i = 0; i < path->p_tree_depth; i++) {
3985 		bh = path->p_node[i].bh;
3986 		el = path->p_node[i].el;
3987 
3988 		next_free = le16_to_cpu(el->l_next_free_rec);
3989 		if (next_free == 0) {
3990 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3991 				    "Owner %llu has a bad extent list\n",
3992 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3993 			return;
3994 		}
3995 
3996 		rec = &el->l_recs[next_free - 1];
3997 
3998 		rec->e_int_clusters = insert_rec->e_cpos;
3999 		le32_add_cpu(&rec->e_int_clusters,
4000 			     le16_to_cpu(insert_rec->e_leaf_clusters));
4001 		le32_add_cpu(&rec->e_int_clusters,
4002 			     -le32_to_cpu(rec->e_cpos));
4003 
4004 		ocfs2_journal_dirty(handle, bh);
4005 	}
4006 }
4007 
4008 static int ocfs2_append_rec_to_path(handle_t *handle,
4009 				    struct ocfs2_extent_tree *et,
4010 				    struct ocfs2_extent_rec *insert_rec,
4011 				    struct ocfs2_path *right_path,
4012 				    struct ocfs2_path **ret_left_path)
4013 {
4014 	int ret, next_free;
4015 	struct ocfs2_extent_list *el;
4016 	struct ocfs2_path *left_path = NULL;
4017 
4018 	*ret_left_path = NULL;
4019 
4020 	/*
4021 	 * This shouldn't happen for non-trees. The extent rec cluster
4022 	 * count manipulation below only works for interior nodes.
4023 	 */
4024 	BUG_ON(right_path->p_tree_depth == 0);
4025 
4026 	/*
4027 	 * If our appending insert is at the leftmost edge of a leaf,
4028 	 * then we might need to update the rightmost records of the
4029 	 * neighboring path.
4030 	 */
4031 	el = path_leaf_el(right_path);
4032 	next_free = le16_to_cpu(el->l_next_free_rec);
4033 	if (next_free == 0 ||
4034 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
4035 		u32 left_cpos;
4036 
4037 		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
4038 						    right_path, &left_cpos);
4039 		if (ret) {
4040 			mlog_errno(ret);
4041 			goto out;
4042 		}
4043 
4044 		trace_ocfs2_append_rec_to_path(
4045 			(unsigned long long)
4046 			ocfs2_metadata_cache_owner(et->et_ci),
4047 			le32_to_cpu(insert_rec->e_cpos),
4048 			left_cpos);
4049 
4050 		/*
4051 		 * No need to worry if the append is already in the
4052 		 * leftmost leaf.
4053 		 */
4054 		if (left_cpos) {
4055 			left_path = ocfs2_new_path_from_path(right_path);
4056 			if (!left_path) {
4057 				ret = -ENOMEM;
4058 				mlog_errno(ret);
4059 				goto out;
4060 			}
4061 
4062 			ret = ocfs2_find_path(et->et_ci, left_path,
4063 					      left_cpos);
4064 			if (ret) {
4065 				mlog_errno(ret);
4066 				goto out;
4067 			}
4068 
4069 			/*
4070 			 * ocfs2_insert_path() will pass the left_path to the
4071 			 * journal for us.
4072 			 */
4073 		}
4074 	}
4075 
4076 	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4077 	if (ret) {
4078 		mlog_errno(ret);
4079 		goto out;
4080 	}
4081 
4082 	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4083 
4084 	*ret_left_path = left_path;
4085 	ret = 0;
4086 out:
4087 	if (ret != 0)
4088 		ocfs2_free_path(left_path);
4089 
4090 	return ret;
4091 }
4092 
4093 static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4094 			       struct ocfs2_path *left_path,
4095 			       struct ocfs2_path *right_path,
4096 			       struct ocfs2_extent_rec *split_rec,
4097 			       enum ocfs2_split_type split)
4098 {
4099 	int index;
4100 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
4101 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4102 	struct ocfs2_extent_rec *rec, *tmprec;
4103 
4104 	right_el = path_leaf_el(right_path);
4105 	if (left_path)
4106 		left_el = path_leaf_el(left_path);
4107 
4108 	el = right_el;
4109 	insert_el = right_el;
4110 	index = ocfs2_search_extent_list(el, cpos);
4111 	if (index != -1) {
4112 		if (index == 0 && left_path) {
4113 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4114 
4115 			/*
4116 			 * This typically means that the record
4117 			 * started in the left path but moved to the
4118 			 * right as a result of rotation. We either
4119 			 * move the existing record to the left, or we
4120 			 * do the later insert there.
4121 			 *
4122 			 * In this case, the left path should always
4123 			 * exist as the rotate code will have passed
4124 			 * it back for a post-insert update.
4125 			 */
4126 
4127 			if (split == SPLIT_LEFT) {
4128 				/*
4129 				 * It's a left split. Since we know
4130 				 * that the rotate code gave us an
4131 				 * empty extent in the left path, we
4132 				 * can just do the insert there.
4133 				 */
4134 				insert_el = left_el;
4135 			} else {
4136 				/*
4137 				 * Right split - we have to move the
4138 				 * existing record over to the left
4139 				 * leaf. The insert will be into the
4140 				 * newly created empty extent in the
4141 				 * right leaf.
4142 				 */
4143 				tmprec = &right_el->l_recs[index];
4144 				ocfs2_rotate_leaf(left_el, tmprec);
4145 				el = left_el;
4146 
4147 				memset(tmprec, 0, sizeof(*tmprec));
4148 				index = ocfs2_search_extent_list(left_el, cpos);
4149 				BUG_ON(index == -1);
4150 			}
4151 		}
4152 	} else {
4153 		BUG_ON(!left_path);
4154 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4155 		/*
4156 		 * Left path is easy - we can just allow the insert to
4157 		 * happen.
4158 		 */
4159 		el = left_el;
4160 		insert_el = left_el;
4161 		index = ocfs2_search_extent_list(el, cpos);
4162 		BUG_ON(index == -1);
4163 	}
4164 
4165 	rec = &el->l_recs[index];
4166 	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4167 				split, rec, split_rec);
4168 	ocfs2_rotate_leaf(insert_el, split_rec);
4169 }
4170 
4171 /*
4172  * This function only does inserts on an allocation b-tree. For tree
4173  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4174  *
4175  * right_path is the path we want to do the actual insert
4176  * in. left_path should only be passed in if we need to update that
4177  * portion of the tree after an edge insert.
4178  */
4179 static int ocfs2_insert_path(handle_t *handle,
4180 			     struct ocfs2_extent_tree *et,
4181 			     struct ocfs2_path *left_path,
4182 			     struct ocfs2_path *right_path,
4183 			     struct ocfs2_extent_rec *insert_rec,
4184 			     struct ocfs2_insert_type *insert)
4185 {
4186 	int ret, subtree_index;
4187 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4188 
4189 	if (left_path) {
4190 		/*
4191 		 * There's a chance that left_path got passed back to
4192 		 * us without being accounted for in the
4193 		 * journal. Extend our transaction here to be sure we
4194 		 * can change those blocks.
4195 		 */
4196 		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4197 		if (ret < 0) {
4198 			mlog_errno(ret);
4199 			goto out;
4200 		}
4201 
4202 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4203 		if (ret < 0) {
4204 			mlog_errno(ret);
4205 			goto out;
4206 		}
4207 	}
4208 
4209 	/*
4210 	 * Pass both paths to the journal. The majority of inserts
4211 	 * will be touching all components anyway.
4212 	 */
4213 	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4214 	if (ret < 0) {
4215 		mlog_errno(ret);
4216 		goto out;
4217 	}
4218 
4219 	if (insert->ins_split != SPLIT_NONE) {
4220 		/*
4221 		 * We could call ocfs2_insert_at_leaf() for some types
4222 		 * of splits, but it's easier to just let one separate
4223 		 * function sort it all out.
4224 		 */
4225 		ocfs2_split_record(et, left_path, right_path,
4226 				   insert_rec, insert->ins_split);
4227 
4228 		/*
4229 		 * Split might have modified either leaf and we don't
4230 		 * have a guarantee that the later edge insert will
4231 		 * dirty this for us.
4232 		 */
4233 		if (left_path)
4234 			ocfs2_journal_dirty(handle,
4235 					    path_leaf_bh(left_path));
4236 	} else
4237 		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4238 				     insert);
4239 
4240 	ocfs2_journal_dirty(handle, leaf_bh);
4241 
4242 	if (left_path) {
4243 		/*
4244 		 * The rotate code has indicated that we need to fix
4245 		 * up portions of the tree after the insert.
4246 		 *
4247 		 * XXX: Should we extend the transaction here?
4248 		 */
4249 		subtree_index = ocfs2_find_subtree_root(et, left_path,
4250 							right_path);
4251 		ocfs2_complete_edge_insert(handle, left_path, right_path,
4252 					   subtree_index);
4253 	}
4254 
4255 	ret = 0;
4256 out:
4257 	return ret;
4258 }
4259 
4260 static int ocfs2_do_insert_extent(handle_t *handle,
4261 				  struct ocfs2_extent_tree *et,
4262 				  struct ocfs2_extent_rec *insert_rec,
4263 				  struct ocfs2_insert_type *type)
4264 {
4265 	int ret, rotate = 0;
4266 	u32 cpos;
4267 	struct ocfs2_path *right_path = NULL;
4268 	struct ocfs2_path *left_path = NULL;
4269 	struct ocfs2_extent_list *el;
4270 
4271 	el = et->et_root_el;
4272 
4273 	ret = ocfs2_et_root_journal_access(handle, et,
4274 					   OCFS2_JOURNAL_ACCESS_WRITE);
4275 	if (ret) {
4276 		mlog_errno(ret);
4277 		goto out;
4278 	}
4279 
4280 	if (le16_to_cpu(el->l_tree_depth) == 0) {
4281 		ocfs2_insert_at_leaf(et, insert_rec, el, type);
4282 		goto out_update_clusters;
4283 	}
4284 
4285 	right_path = ocfs2_new_path_from_et(et);
4286 	if (!right_path) {
4287 		ret = -ENOMEM;
4288 		mlog_errno(ret);
4289 		goto out;
4290 	}
4291 
4292 	/*
4293 	 * Determine the path to start with. Rotations need the
4294 	 * rightmost path, everything else can go directly to the
4295 	 * target leaf.
4296 	 */
4297 	cpos = le32_to_cpu(insert_rec->e_cpos);
4298 	if (type->ins_appending == APPEND_NONE &&
4299 	    type->ins_contig == CONTIG_NONE) {
4300 		rotate = 1;
4301 		cpos = UINT_MAX;
4302 	}
4303 
4304 	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4305 	if (ret) {
4306 		mlog_errno(ret);
4307 		goto out;
4308 	}
4309 
4310 	/*
4311 	 * Rotations and appends need special treatment - they modify
4312 	 * parts of the tree's above them.
4313 	 *
4314 	 * Both might pass back a path immediate to the left of the
4315 	 * one being inserted to. This will be cause
4316 	 * ocfs2_insert_path() to modify the rightmost records of
4317 	 * left_path to account for an edge insert.
4318 	 *
4319 	 * XXX: When modifying this code, keep in mind that an insert
4320 	 * can wind up skipping both of these two special cases...
4321 	 */
4322 	if (rotate) {
4323 		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4324 					      le32_to_cpu(insert_rec->e_cpos),
4325 					      right_path, &left_path);
4326 		if (ret) {
4327 			mlog_errno(ret);
4328 			goto out;
4329 		}
4330 
4331 		/*
4332 		 * ocfs2_rotate_tree_right() might have extended the
4333 		 * transaction without re-journaling our tree root.
4334 		 */
4335 		ret = ocfs2_et_root_journal_access(handle, et,
4336 						   OCFS2_JOURNAL_ACCESS_WRITE);
4337 		if (ret) {
4338 			mlog_errno(ret);
4339 			goto out;
4340 		}
4341 	} else if (type->ins_appending == APPEND_TAIL
4342 		   && type->ins_contig != CONTIG_LEFT) {
4343 		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4344 					       right_path, &left_path);
4345 		if (ret) {
4346 			mlog_errno(ret);
4347 			goto out;
4348 		}
4349 	}
4350 
4351 	ret = ocfs2_insert_path(handle, et, left_path, right_path,
4352 				insert_rec, type);
4353 	if (ret) {
4354 		mlog_errno(ret);
4355 		goto out;
4356 	}
4357 
4358 out_update_clusters:
4359 	if (type->ins_split == SPLIT_NONE)
4360 		ocfs2_et_update_clusters(et,
4361 					 le16_to_cpu(insert_rec->e_leaf_clusters));
4362 
4363 	ocfs2_journal_dirty(handle, et->et_root_bh);
4364 
4365 out:
4366 	ocfs2_free_path(left_path);
4367 	ocfs2_free_path(right_path);
4368 
4369 	return ret;
4370 }
4371 
4372 static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4373 			       struct ocfs2_path *path,
4374 			       struct ocfs2_extent_list *el, int index,
4375 			       struct ocfs2_extent_rec *split_rec,
4376 			       struct ocfs2_merge_ctxt *ctxt)
4377 {
4378 	int status = 0;
4379 	enum ocfs2_contig_type ret = CONTIG_NONE;
4380 	u32 left_cpos, right_cpos;
4381 	struct ocfs2_extent_rec *rec = NULL;
4382 	struct ocfs2_extent_list *new_el;
4383 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
4384 	struct buffer_head *bh;
4385 	struct ocfs2_extent_block *eb;
4386 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4387 
4388 	if (index > 0) {
4389 		rec = &el->l_recs[index - 1];
4390 	} else if (path->p_tree_depth > 0) {
4391 		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4392 		if (status)
4393 			goto exit;
4394 
4395 		if (left_cpos != 0) {
4396 			left_path = ocfs2_new_path_from_path(path);
4397 			if (!left_path) {
4398 				status = -ENOMEM;
4399 				mlog_errno(status);
4400 				goto exit;
4401 			}
4402 
4403 			status = ocfs2_find_path(et->et_ci, left_path,
4404 						 left_cpos);
4405 			if (status)
4406 				goto free_left_path;
4407 
4408 			new_el = path_leaf_el(left_path);
4409 
4410 			if (le16_to_cpu(new_el->l_next_free_rec) !=
4411 			    le16_to_cpu(new_el->l_count)) {
4412 				bh = path_leaf_bh(left_path);
4413 				eb = (struct ocfs2_extent_block *)bh->b_data;
4414 				status = ocfs2_error(sb,
4415 						"Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
4416 						(unsigned long long)le64_to_cpu(eb->h_blkno),
4417 						le16_to_cpu(new_el->l_next_free_rec),
4418 						le16_to_cpu(new_el->l_count));
4419 				goto free_left_path;
4420 			}
4421 			rec = &new_el->l_recs[
4422 				le16_to_cpu(new_el->l_next_free_rec) - 1];
4423 		}
4424 	}
4425 
4426 	/*
4427 	 * We're careful to check for an empty extent record here -
4428 	 * the merge code will know what to do if it sees one.
4429 	 */
4430 	if (rec) {
4431 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
4432 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4433 				ret = CONTIG_RIGHT;
4434 		} else {
4435 			ret = ocfs2_et_extent_contig(et, rec, split_rec);
4436 		}
4437 	}
4438 
4439 	rec = NULL;
4440 	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4441 		rec = &el->l_recs[index + 1];
4442 	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4443 		 path->p_tree_depth > 0) {
4444 		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4445 		if (status)
4446 			goto free_left_path;
4447 
4448 		if (right_cpos == 0)
4449 			goto free_left_path;
4450 
4451 		right_path = ocfs2_new_path_from_path(path);
4452 		if (!right_path) {
4453 			status = -ENOMEM;
4454 			mlog_errno(status);
4455 			goto free_left_path;
4456 		}
4457 
4458 		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4459 		if (status)
4460 			goto free_right_path;
4461 
4462 		new_el = path_leaf_el(right_path);
4463 		rec = &new_el->l_recs[0];
4464 		if (ocfs2_is_empty_extent(rec)) {
4465 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4466 				bh = path_leaf_bh(right_path);
4467 				eb = (struct ocfs2_extent_block *)bh->b_data;
4468 				status = ocfs2_error(sb,
4469 						"Extent block #%llu has an invalid l_next_free_rec of %d\n",
4470 						(unsigned long long)le64_to_cpu(eb->h_blkno),
4471 						le16_to_cpu(new_el->l_next_free_rec));
4472 				goto free_right_path;
4473 			}
4474 			rec = &new_el->l_recs[1];
4475 		}
4476 	}
4477 
4478 	if (rec) {
4479 		enum ocfs2_contig_type contig_type;
4480 
4481 		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4482 
4483 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4484 			ret = CONTIG_LEFTRIGHT;
4485 		else if (ret == CONTIG_NONE)
4486 			ret = contig_type;
4487 	}
4488 
4489 free_right_path:
4490 	ocfs2_free_path(right_path);
4491 free_left_path:
4492 	ocfs2_free_path(left_path);
4493 exit:
4494 	if (status == 0)
4495 		ctxt->c_contig_type = ret;
4496 
4497 	return status;
4498 }
4499 
4500 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4501 				     struct ocfs2_insert_type *insert,
4502 				     struct ocfs2_extent_list *el,
4503 				     struct ocfs2_extent_rec *insert_rec)
4504 {
4505 	int i;
4506 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4507 
4508 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4509 
4510 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4511 		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4512 						     insert_rec);
4513 		if (contig_type != CONTIG_NONE) {
4514 			insert->ins_contig_index = i;
4515 			break;
4516 		}
4517 	}
4518 	insert->ins_contig = contig_type;
4519 
4520 	if (insert->ins_contig != CONTIG_NONE) {
4521 		struct ocfs2_extent_rec *rec =
4522 				&el->l_recs[insert->ins_contig_index];
4523 		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4524 				   le16_to_cpu(insert_rec->e_leaf_clusters);
4525 
4526 		/*
4527 		 * Caller might want us to limit the size of extents, don't
4528 		 * calculate contiguousness if we might exceed that limit.
4529 		 */
4530 		if (et->et_max_leaf_clusters &&
4531 		    (len > et->et_max_leaf_clusters))
4532 			insert->ins_contig = CONTIG_NONE;
4533 	}
4534 }
4535 
4536 /*
4537  * This should only be called against the rightmost leaf extent list.
4538  *
4539  * ocfs2_figure_appending_type() will figure out whether we'll have to
4540  * insert at the tail of the rightmost leaf.
4541  *
4542  * This should also work against the root extent list for tree's with 0
4543  * depth. If we consider the root extent list to be the rightmost leaf node
4544  * then the logic here makes sense.
4545  */
4546 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4547 					struct ocfs2_extent_list *el,
4548 					struct ocfs2_extent_rec *insert_rec)
4549 {
4550 	int i;
4551 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4552 	struct ocfs2_extent_rec *rec;
4553 
4554 	insert->ins_appending = APPEND_NONE;
4555 
4556 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4557 
4558 	if (!el->l_next_free_rec)
4559 		goto set_tail_append;
4560 
4561 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4562 		/* Were all records empty? */
4563 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4564 			goto set_tail_append;
4565 	}
4566 
4567 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4568 	rec = &el->l_recs[i];
4569 
4570 	if (cpos >=
4571 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4572 		goto set_tail_append;
4573 
4574 	return;
4575 
4576 set_tail_append:
4577 	insert->ins_appending = APPEND_TAIL;
4578 }
4579 
4580 /*
4581  * Helper function called at the beginning of an insert.
4582  *
4583  * This computes a few things that are commonly used in the process of
4584  * inserting into the btree:
4585  *   - Whether the new extent is contiguous with an existing one.
4586  *   - The current tree depth.
4587  *   - Whether the insert is an appending one.
4588  *   - The total # of free records in the tree.
4589  *
4590  * All of the information is stored on the ocfs2_insert_type
4591  * structure.
4592  */
4593 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4594 				    struct buffer_head **last_eb_bh,
4595 				    struct ocfs2_extent_rec *insert_rec,
4596 				    int *free_records,
4597 				    struct ocfs2_insert_type *insert)
4598 {
4599 	int ret;
4600 	struct ocfs2_extent_block *eb;
4601 	struct ocfs2_extent_list *el;
4602 	struct ocfs2_path *path = NULL;
4603 	struct buffer_head *bh = NULL;
4604 
4605 	insert->ins_split = SPLIT_NONE;
4606 
4607 	el = et->et_root_el;
4608 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4609 
4610 	if (el->l_tree_depth) {
4611 		/*
4612 		 * If we have tree depth, we read in the
4613 		 * rightmost extent block ahead of time as
4614 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4615 		 * may want it later.
4616 		 */
4617 		ret = ocfs2_read_extent_block(et->et_ci,
4618 					      ocfs2_et_get_last_eb_blk(et),
4619 					      &bh);
4620 		if (ret) {
4621 			mlog_errno(ret);
4622 			goto out;
4623 		}
4624 		eb = (struct ocfs2_extent_block *) bh->b_data;
4625 		el = &eb->h_list;
4626 	}
4627 
4628 	/*
4629 	 * Unless we have a contiguous insert, we'll need to know if
4630 	 * there is room left in our allocation tree for another
4631 	 * extent record.
4632 	 *
4633 	 * XXX: This test is simplistic, we can search for empty
4634 	 * extent records too.
4635 	 */
4636 	*free_records = le16_to_cpu(el->l_count) -
4637 		le16_to_cpu(el->l_next_free_rec);
4638 
4639 	if (!insert->ins_tree_depth) {
4640 		ocfs2_figure_contig_type(et, insert, el, insert_rec);
4641 		ocfs2_figure_appending_type(insert, el, insert_rec);
4642 		return 0;
4643 	}
4644 
4645 	path = ocfs2_new_path_from_et(et);
4646 	if (!path) {
4647 		ret = -ENOMEM;
4648 		mlog_errno(ret);
4649 		goto out;
4650 	}
4651 
4652 	/*
4653 	 * In the case that we're inserting past what the tree
4654 	 * currently accounts for, ocfs2_find_path() will return for
4655 	 * us the rightmost tree path. This is accounted for below in
4656 	 * the appending code.
4657 	 */
4658 	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4659 	if (ret) {
4660 		mlog_errno(ret);
4661 		goto out;
4662 	}
4663 
4664 	el = path_leaf_el(path);
4665 
4666 	/*
4667 	 * Now that we have the path, there's two things we want to determine:
4668 	 * 1) Contiguousness (also set contig_index if this is so)
4669 	 *
4670 	 * 2) Are we doing an append? We can trivially break this up
4671          *     into two types of appends: simple record append, or a
4672          *     rotate inside the tail leaf.
4673 	 */
4674 	ocfs2_figure_contig_type(et, insert, el, insert_rec);
4675 
4676 	/*
4677 	 * The insert code isn't quite ready to deal with all cases of
4678 	 * left contiguousness. Specifically, if it's an insert into
4679 	 * the 1st record in a leaf, it will require the adjustment of
4680 	 * cluster count on the last record of the path directly to it's
4681 	 * left. For now, just catch that case and fool the layers
4682 	 * above us. This works just fine for tree_depth == 0, which
4683 	 * is why we allow that above.
4684 	 */
4685 	if (insert->ins_contig == CONTIG_LEFT &&
4686 	    insert->ins_contig_index == 0)
4687 		insert->ins_contig = CONTIG_NONE;
4688 
4689 	/*
4690 	 * Ok, so we can simply compare against last_eb to figure out
4691 	 * whether the path doesn't exist. This will only happen in
4692 	 * the case that we're doing a tail append, so maybe we can
4693 	 * take advantage of that information somehow.
4694 	 */
4695 	if (ocfs2_et_get_last_eb_blk(et) ==
4696 	    path_leaf_bh(path)->b_blocknr) {
4697 		/*
4698 		 * Ok, ocfs2_find_path() returned us the rightmost
4699 		 * tree path. This might be an appending insert. There are
4700 		 * two cases:
4701 		 *    1) We're doing a true append at the tail:
4702 		 *	-This might even be off the end of the leaf
4703 		 *    2) We're "appending" by rotating in the tail
4704 		 */
4705 		ocfs2_figure_appending_type(insert, el, insert_rec);
4706 	}
4707 
4708 out:
4709 	ocfs2_free_path(path);
4710 
4711 	if (ret == 0)
4712 		*last_eb_bh = bh;
4713 	else
4714 		brelse(bh);
4715 	return ret;
4716 }
4717 
4718 /*
4719  * Insert an extent into a btree.
4720  *
4721  * The caller needs to update the owning btree's cluster count.
4722  */
4723 int ocfs2_insert_extent(handle_t *handle,
4724 			struct ocfs2_extent_tree *et,
4725 			u32 cpos,
4726 			u64 start_blk,
4727 			u32 new_clusters,
4728 			u8 flags,
4729 			struct ocfs2_alloc_context *meta_ac)
4730 {
4731 	int status;
4732 	int free_records;
4733 	struct buffer_head *last_eb_bh = NULL;
4734 	struct ocfs2_insert_type insert = {0, };
4735 	struct ocfs2_extent_rec rec;
4736 
4737 	trace_ocfs2_insert_extent_start(
4738 		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4739 		cpos, new_clusters);
4740 
4741 	memset(&rec, 0, sizeof(rec));
4742 	rec.e_cpos = cpu_to_le32(cpos);
4743 	rec.e_blkno = cpu_to_le64(start_blk);
4744 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4745 	rec.e_flags = flags;
4746 	status = ocfs2_et_insert_check(et, &rec);
4747 	if (status) {
4748 		mlog_errno(status);
4749 		goto bail;
4750 	}
4751 
4752 	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4753 					  &free_records, &insert);
4754 	if (status < 0) {
4755 		mlog_errno(status);
4756 		goto bail;
4757 	}
4758 
4759 	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4760 				  insert.ins_contig_index, free_records,
4761 				  insert.ins_tree_depth);
4762 
4763 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4764 		status = ocfs2_grow_tree(handle, et,
4765 					 &insert.ins_tree_depth, &last_eb_bh,
4766 					 meta_ac);
4767 		if (status) {
4768 			mlog_errno(status);
4769 			goto bail;
4770 		}
4771 	}
4772 
4773 	/* Finally, we can add clusters. This might rotate the tree for us. */
4774 	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4775 	if (status < 0)
4776 		mlog_errno(status);
4777 	else
4778 		ocfs2_et_extent_map_insert(et, &rec);
4779 
4780 bail:
4781 	brelse(last_eb_bh);
4782 
4783 	return status;
4784 }
4785 
4786 /*
4787  * Allocate and add clusters into the extent b-tree.
4788  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4789  * The extent b-tree's root is specified by et, and
4790  * it is not limited to the file storage. Any extent tree can use this
4791  * function if it implements the proper ocfs2_extent_tree.
4792  */
4793 int ocfs2_add_clusters_in_btree(handle_t *handle,
4794 				struct ocfs2_extent_tree *et,
4795 				u32 *logical_offset,
4796 				u32 clusters_to_add,
4797 				int mark_unwritten,
4798 				struct ocfs2_alloc_context *data_ac,
4799 				struct ocfs2_alloc_context *meta_ac,
4800 				enum ocfs2_alloc_restarted *reason_ret)
4801 {
4802 	int status = 0, err = 0;
4803 	int need_free = 0;
4804 	int free_extents;
4805 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
4806 	u32 bit_off, num_bits;
4807 	u64 block;
4808 	u8 flags = 0;
4809 	struct ocfs2_super *osb =
4810 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4811 
4812 	BUG_ON(!clusters_to_add);
4813 
4814 	if (mark_unwritten)
4815 		flags = OCFS2_EXT_UNWRITTEN;
4816 
4817 	free_extents = ocfs2_num_free_extents(et);
4818 	if (free_extents < 0) {
4819 		status = free_extents;
4820 		mlog_errno(status);
4821 		goto leave;
4822 	}
4823 
4824 	/* there are two cases which could cause us to EAGAIN in the
4825 	 * we-need-more-metadata case:
4826 	 * 1) we haven't reserved *any*
4827 	 * 2) we are so fragmented, we've needed to add metadata too
4828 	 *    many times. */
4829 	if (!free_extents && !meta_ac) {
4830 		err = -1;
4831 		status = -EAGAIN;
4832 		reason = RESTART_META;
4833 		goto leave;
4834 	} else if ((!free_extents)
4835 		   && (ocfs2_alloc_context_bits_left(meta_ac)
4836 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
4837 		err = -2;
4838 		status = -EAGAIN;
4839 		reason = RESTART_META;
4840 		goto leave;
4841 	}
4842 
4843 	status = __ocfs2_claim_clusters(handle, data_ac, 1,
4844 					clusters_to_add, &bit_off, &num_bits);
4845 	if (status < 0) {
4846 		if (status != -ENOSPC)
4847 			mlog_errno(status);
4848 		goto leave;
4849 	}
4850 
4851 	BUG_ON(num_bits > clusters_to_add);
4852 
4853 	/* reserve our write early -- insert_extent may update the tree root */
4854 	status = ocfs2_et_root_journal_access(handle, et,
4855 					      OCFS2_JOURNAL_ACCESS_WRITE);
4856 	if (status < 0) {
4857 		mlog_errno(status);
4858 		need_free = 1;
4859 		goto bail;
4860 	}
4861 
4862 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4863 	trace_ocfs2_add_clusters_in_btree(
4864 	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4865 	     bit_off, num_bits);
4866 	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4867 				     num_bits, flags, meta_ac);
4868 	if (status < 0) {
4869 		mlog_errno(status);
4870 		need_free = 1;
4871 		goto bail;
4872 	}
4873 
4874 	ocfs2_journal_dirty(handle, et->et_root_bh);
4875 
4876 	clusters_to_add -= num_bits;
4877 	*logical_offset += num_bits;
4878 
4879 	if (clusters_to_add) {
4880 		err = clusters_to_add;
4881 		status = -EAGAIN;
4882 		reason = RESTART_TRANS;
4883 	}
4884 
4885 bail:
4886 	if (need_free) {
4887 		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
4888 			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
4889 					bit_off, num_bits);
4890 		else
4891 			ocfs2_free_clusters(handle,
4892 					data_ac->ac_inode,
4893 					data_ac->ac_bh,
4894 					ocfs2_clusters_to_blocks(osb->sb, bit_off),
4895 					num_bits);
4896 	}
4897 
4898 leave:
4899 	if (reason_ret)
4900 		*reason_ret = reason;
4901 	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4902 	return status;
4903 }
4904 
4905 static void ocfs2_make_right_split_rec(struct super_block *sb,
4906 				       struct ocfs2_extent_rec *split_rec,
4907 				       u32 cpos,
4908 				       struct ocfs2_extent_rec *rec)
4909 {
4910 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4911 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4912 
4913 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4914 
4915 	split_rec->e_cpos = cpu_to_le32(cpos);
4916 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4917 
4918 	split_rec->e_blkno = rec->e_blkno;
4919 	le64_add_cpu(&split_rec->e_blkno,
4920 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4921 
4922 	split_rec->e_flags = rec->e_flags;
4923 }
4924 
4925 static int ocfs2_split_and_insert(handle_t *handle,
4926 				  struct ocfs2_extent_tree *et,
4927 				  struct ocfs2_path *path,
4928 				  struct buffer_head **last_eb_bh,
4929 				  int split_index,
4930 				  struct ocfs2_extent_rec *orig_split_rec,
4931 				  struct ocfs2_alloc_context *meta_ac)
4932 {
4933 	int ret = 0, depth;
4934 	unsigned int insert_range, rec_range, do_leftright = 0;
4935 	struct ocfs2_extent_rec tmprec;
4936 	struct ocfs2_extent_list *rightmost_el;
4937 	struct ocfs2_extent_rec rec;
4938 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
4939 	struct ocfs2_insert_type insert;
4940 	struct ocfs2_extent_block *eb;
4941 
4942 leftright:
4943 	/*
4944 	 * Store a copy of the record on the stack - it might move
4945 	 * around as the tree is manipulated below.
4946 	 */
4947 	rec = path_leaf_el(path)->l_recs[split_index];
4948 
4949 	rightmost_el = et->et_root_el;
4950 
4951 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
4952 	if (depth) {
4953 		BUG_ON(!(*last_eb_bh));
4954 		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4955 		rightmost_el = &eb->h_list;
4956 	}
4957 
4958 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4959 	    le16_to_cpu(rightmost_el->l_count)) {
4960 		ret = ocfs2_grow_tree(handle, et,
4961 				      &depth, last_eb_bh, meta_ac);
4962 		if (ret) {
4963 			mlog_errno(ret);
4964 			goto out;
4965 		}
4966 	}
4967 
4968 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4969 	insert.ins_appending = APPEND_NONE;
4970 	insert.ins_contig = CONTIG_NONE;
4971 	insert.ins_tree_depth = depth;
4972 
4973 	insert_range = le32_to_cpu(split_rec.e_cpos) +
4974 		le16_to_cpu(split_rec.e_leaf_clusters);
4975 	rec_range = le32_to_cpu(rec.e_cpos) +
4976 		le16_to_cpu(rec.e_leaf_clusters);
4977 
4978 	if (split_rec.e_cpos == rec.e_cpos) {
4979 		insert.ins_split = SPLIT_LEFT;
4980 	} else if (insert_range == rec_range) {
4981 		insert.ins_split = SPLIT_RIGHT;
4982 	} else {
4983 		/*
4984 		 * Left/right split. We fake this as a right split
4985 		 * first and then make a second pass as a left split.
4986 		 */
4987 		insert.ins_split = SPLIT_RIGHT;
4988 
4989 		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4990 					   &tmprec, insert_range, &rec);
4991 
4992 		split_rec = tmprec;
4993 
4994 		BUG_ON(do_leftright);
4995 		do_leftright = 1;
4996 	}
4997 
4998 	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4999 	if (ret) {
5000 		mlog_errno(ret);
5001 		goto out;
5002 	}
5003 
5004 	if (do_leftright == 1) {
5005 		u32 cpos;
5006 		struct ocfs2_extent_list *el;
5007 
5008 		do_leftright++;
5009 		split_rec = *orig_split_rec;
5010 
5011 		ocfs2_reinit_path(path, 1);
5012 
5013 		cpos = le32_to_cpu(split_rec.e_cpos);
5014 		ret = ocfs2_find_path(et->et_ci, path, cpos);
5015 		if (ret) {
5016 			mlog_errno(ret);
5017 			goto out;
5018 		}
5019 
5020 		el = path_leaf_el(path);
5021 		split_index = ocfs2_search_extent_list(el, cpos);
5022 		if (split_index == -1) {
5023 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5024 				    "Owner %llu has an extent at cpos %u which can no longer be found\n",
5025 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5026 				    cpos);
5027 			ret = -EROFS;
5028 			goto out;
5029 		}
5030 		goto leftright;
5031 	}
5032 out:
5033 
5034 	return ret;
5035 }
5036 
5037 static int ocfs2_replace_extent_rec(handle_t *handle,
5038 				    struct ocfs2_extent_tree *et,
5039 				    struct ocfs2_path *path,
5040 				    struct ocfs2_extent_list *el,
5041 				    int split_index,
5042 				    struct ocfs2_extent_rec *split_rec)
5043 {
5044 	int ret;
5045 
5046 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
5047 					   path_num_items(path) - 1);
5048 	if (ret) {
5049 		mlog_errno(ret);
5050 		goto out;
5051 	}
5052 
5053 	el->l_recs[split_index] = *split_rec;
5054 
5055 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5056 out:
5057 	return ret;
5058 }
5059 
5060 /*
5061  * Split part or all of the extent record at split_index in the leaf
5062  * pointed to by path. Merge with the contiguous extent record if needed.
5063  *
5064  * Care is taken to handle contiguousness so as to not grow the tree.
5065  *
5066  * meta_ac is not strictly necessary - we only truly need it if growth
5067  * of the tree is required. All other cases will degrade into a less
5068  * optimal tree layout.
5069  *
5070  * last_eb_bh should be the rightmost leaf block for any extent
5071  * btree. Since a split may grow the tree or a merge might shrink it,
5072  * the caller cannot trust the contents of that buffer after this call.
5073  *
5074  * This code is optimized for readability - several passes might be
5075  * made over certain portions of the tree. All of those blocks will
5076  * have been brought into cache (and pinned via the journal), so the
5077  * extra overhead is not expressed in terms of disk reads.
5078  */
5079 int ocfs2_split_extent(handle_t *handle,
5080 		       struct ocfs2_extent_tree *et,
5081 		       struct ocfs2_path *path,
5082 		       int split_index,
5083 		       struct ocfs2_extent_rec *split_rec,
5084 		       struct ocfs2_alloc_context *meta_ac,
5085 		       struct ocfs2_cached_dealloc_ctxt *dealloc)
5086 {
5087 	int ret = 0;
5088 	struct ocfs2_extent_list *el = path_leaf_el(path);
5089 	struct buffer_head *last_eb_bh = NULL;
5090 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5091 	struct ocfs2_merge_ctxt ctxt;
5092 
5093 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5094 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5095 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5096 		ret = -EIO;
5097 		mlog_errno(ret);
5098 		goto out;
5099 	}
5100 
5101 	ret = ocfs2_figure_merge_contig_type(et, path, el,
5102 					     split_index,
5103 					     split_rec,
5104 					     &ctxt);
5105 	if (ret) {
5106 		mlog_errno(ret);
5107 		goto out;
5108 	}
5109 
5110 	/*
5111 	 * The core merge / split code wants to know how much room is
5112 	 * left in this allocation tree, so we pass the
5113 	 * rightmost extent list.
5114 	 */
5115 	if (path->p_tree_depth) {
5116 		ret = ocfs2_read_extent_block(et->et_ci,
5117 					      ocfs2_et_get_last_eb_blk(et),
5118 					      &last_eb_bh);
5119 		if (ret) {
5120 			mlog_errno(ret);
5121 			goto out;
5122 		}
5123 	}
5124 
5125 	if (rec->e_cpos == split_rec->e_cpos &&
5126 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5127 		ctxt.c_split_covers_rec = 1;
5128 	else
5129 		ctxt.c_split_covers_rec = 0;
5130 
5131 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5132 
5133 	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5134 				 ctxt.c_has_empty_extent,
5135 				 ctxt.c_split_covers_rec);
5136 
5137 	if (ctxt.c_contig_type == CONTIG_NONE) {
5138 		if (ctxt.c_split_covers_rec)
5139 			ret = ocfs2_replace_extent_rec(handle, et, path, el,
5140 						       split_index, split_rec);
5141 		else
5142 			ret = ocfs2_split_and_insert(handle, et, path,
5143 						     &last_eb_bh, split_index,
5144 						     split_rec, meta_ac);
5145 		if (ret)
5146 			mlog_errno(ret);
5147 	} else {
5148 		ret = ocfs2_try_to_merge_extent(handle, et, path,
5149 						split_index, split_rec,
5150 						dealloc, &ctxt);
5151 		if (ret)
5152 			mlog_errno(ret);
5153 	}
5154 
5155 out:
5156 	brelse(last_eb_bh);
5157 	return ret;
5158 }
5159 
5160 /*
5161  * Change the flags of the already-existing extent at cpos for len clusters.
5162  *
5163  * new_flags: the flags we want to set.
5164  * clear_flags: the flags we want to clear.
5165  * phys: the new physical offset we want this new extent starts from.
5166  *
5167  * If the existing extent is larger than the request, initiate a
5168  * split. An attempt will be made at merging with adjacent extents.
5169  *
5170  * The caller is responsible for passing down meta_ac if we'll need it.
5171  */
5172 int ocfs2_change_extent_flag(handle_t *handle,
5173 			     struct ocfs2_extent_tree *et,
5174 			     u32 cpos, u32 len, u32 phys,
5175 			     struct ocfs2_alloc_context *meta_ac,
5176 			     struct ocfs2_cached_dealloc_ctxt *dealloc,
5177 			     int new_flags, int clear_flags)
5178 {
5179 	int ret, index;
5180 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5181 	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5182 	struct ocfs2_extent_rec split_rec;
5183 	struct ocfs2_path *left_path = NULL;
5184 	struct ocfs2_extent_list *el;
5185 	struct ocfs2_extent_rec *rec;
5186 
5187 	left_path = ocfs2_new_path_from_et(et);
5188 	if (!left_path) {
5189 		ret = -ENOMEM;
5190 		mlog_errno(ret);
5191 		goto out;
5192 	}
5193 
5194 	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5195 	if (ret) {
5196 		mlog_errno(ret);
5197 		goto out;
5198 	}
5199 	el = path_leaf_el(left_path);
5200 
5201 	index = ocfs2_search_extent_list(el, cpos);
5202 	if (index == -1) {
5203 		ocfs2_error(sb,
5204 			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
5205 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5206 			    cpos);
5207 		ret = -EROFS;
5208 		goto out;
5209 	}
5210 
5211 	ret = -EIO;
5212 	rec = &el->l_recs[index];
5213 	if (new_flags && (rec->e_flags & new_flags)) {
5214 		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5215 		     "extent that already had them\n",
5216 		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5217 		     new_flags);
5218 		goto out;
5219 	}
5220 
5221 	if (clear_flags && !(rec->e_flags & clear_flags)) {
5222 		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5223 		     "extent that didn't have them\n",
5224 		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5225 		     clear_flags);
5226 		goto out;
5227 	}
5228 
5229 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5230 	split_rec.e_cpos = cpu_to_le32(cpos);
5231 	split_rec.e_leaf_clusters = cpu_to_le16(len);
5232 	split_rec.e_blkno = cpu_to_le64(start_blkno);
5233 	split_rec.e_flags = rec->e_flags;
5234 	if (new_flags)
5235 		split_rec.e_flags |= new_flags;
5236 	if (clear_flags)
5237 		split_rec.e_flags &= ~clear_flags;
5238 
5239 	ret = ocfs2_split_extent(handle, et, left_path,
5240 				 index, &split_rec, meta_ac,
5241 				 dealloc);
5242 	if (ret)
5243 		mlog_errno(ret);
5244 
5245 out:
5246 	ocfs2_free_path(left_path);
5247 	return ret;
5248 
5249 }
5250 
5251 /*
5252  * Mark the already-existing extent at cpos as written for len clusters.
5253  * This removes the unwritten extent flag.
5254  *
5255  * If the existing extent is larger than the request, initiate a
5256  * split. An attempt will be made at merging with adjacent extents.
5257  *
5258  * The caller is responsible for passing down meta_ac if we'll need it.
5259  */
5260 int ocfs2_mark_extent_written(struct inode *inode,
5261 			      struct ocfs2_extent_tree *et,
5262 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
5263 			      struct ocfs2_alloc_context *meta_ac,
5264 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
5265 {
5266 	int ret;
5267 
5268 	trace_ocfs2_mark_extent_written(
5269 		(unsigned long long)OCFS2_I(inode)->ip_blkno,
5270 		cpos, len, phys);
5271 
5272 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5273 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
5274 			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
5275 		ret = -EROFS;
5276 		goto out;
5277 	}
5278 
5279 	/*
5280 	 * XXX: This should be fixed up so that we just re-insert the
5281 	 * next extent records.
5282 	 */
5283 	ocfs2_et_extent_map_truncate(et, 0);
5284 
5285 	ret = ocfs2_change_extent_flag(handle, et, cpos,
5286 				       len, phys, meta_ac, dealloc,
5287 				       0, OCFS2_EXT_UNWRITTEN);
5288 	if (ret)
5289 		mlog_errno(ret);
5290 
5291 out:
5292 	return ret;
5293 }
5294 
5295 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5296 			    struct ocfs2_path *path,
5297 			    int index, u32 new_range,
5298 			    struct ocfs2_alloc_context *meta_ac)
5299 {
5300 	int ret, depth, credits;
5301 	struct buffer_head *last_eb_bh = NULL;
5302 	struct ocfs2_extent_block *eb;
5303 	struct ocfs2_extent_list *rightmost_el, *el;
5304 	struct ocfs2_extent_rec split_rec;
5305 	struct ocfs2_extent_rec *rec;
5306 	struct ocfs2_insert_type insert;
5307 
5308 	/*
5309 	 * Setup the record to split before we grow the tree.
5310 	 */
5311 	el = path_leaf_el(path);
5312 	rec = &el->l_recs[index];
5313 	ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5314 				   &split_rec, new_range, rec);
5315 
5316 	depth = path->p_tree_depth;
5317 	if (depth > 0) {
5318 		ret = ocfs2_read_extent_block(et->et_ci,
5319 					      ocfs2_et_get_last_eb_blk(et),
5320 					      &last_eb_bh);
5321 		if (ret < 0) {
5322 			mlog_errno(ret);
5323 			goto out;
5324 		}
5325 
5326 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5327 		rightmost_el = &eb->h_list;
5328 	} else
5329 		rightmost_el = path_leaf_el(path);
5330 
5331 	credits = path->p_tree_depth +
5332 		  ocfs2_extend_meta_needed(et->et_root_el);
5333 	ret = ocfs2_extend_trans(handle, credits);
5334 	if (ret) {
5335 		mlog_errno(ret);
5336 		goto out;
5337 	}
5338 
5339 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5340 	    le16_to_cpu(rightmost_el->l_count)) {
5341 		ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5342 				      meta_ac);
5343 		if (ret) {
5344 			mlog_errno(ret);
5345 			goto out;
5346 		}
5347 	}
5348 
5349 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5350 	insert.ins_appending = APPEND_NONE;
5351 	insert.ins_contig = CONTIG_NONE;
5352 	insert.ins_split = SPLIT_RIGHT;
5353 	insert.ins_tree_depth = depth;
5354 
5355 	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5356 	if (ret)
5357 		mlog_errno(ret);
5358 
5359 out:
5360 	brelse(last_eb_bh);
5361 	return ret;
5362 }
5363 
5364 static int ocfs2_truncate_rec(handle_t *handle,
5365 			      struct ocfs2_extent_tree *et,
5366 			      struct ocfs2_path *path, int index,
5367 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
5368 			      u32 cpos, u32 len)
5369 {
5370 	int ret;
5371 	u32 left_cpos, rec_range, trunc_range;
5372 	int is_rightmost_tree_rec = 0;
5373 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5374 	struct ocfs2_path *left_path = NULL;
5375 	struct ocfs2_extent_list *el = path_leaf_el(path);
5376 	struct ocfs2_extent_rec *rec;
5377 	struct ocfs2_extent_block *eb;
5378 
5379 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5380 		/* extend credit for ocfs2_remove_rightmost_path */
5381 		ret = ocfs2_extend_rotate_transaction(handle, 0,
5382 				jbd2_handle_buffer_credits(handle),
5383 				path);
5384 		if (ret) {
5385 			mlog_errno(ret);
5386 			goto out;
5387 		}
5388 
5389 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5390 		if (ret) {
5391 			mlog_errno(ret);
5392 			goto out;
5393 		}
5394 
5395 		index--;
5396 	}
5397 
5398 	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5399 	    path->p_tree_depth) {
5400 		/*
5401 		 * Check whether this is the rightmost tree record. If
5402 		 * we remove all of this record or part of its right
5403 		 * edge then an update of the record lengths above it
5404 		 * will be required.
5405 		 */
5406 		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5407 		if (eb->h_next_leaf_blk == 0)
5408 			is_rightmost_tree_rec = 1;
5409 	}
5410 
5411 	rec = &el->l_recs[index];
5412 	if (index == 0 && path->p_tree_depth &&
5413 	    le32_to_cpu(rec->e_cpos) == cpos) {
5414 		/*
5415 		 * Changing the leftmost offset (via partial or whole
5416 		 * record truncate) of an interior (or rightmost) path
5417 		 * means we have to update the subtree that is formed
5418 		 * by this leaf and the one to it's left.
5419 		 *
5420 		 * There are two cases we can skip:
5421 		 *   1) Path is the leftmost one in our btree.
5422 		 *   2) The leaf is rightmost and will be empty after
5423 		 *      we remove the extent record - the rotate code
5424 		 *      knows how to update the newly formed edge.
5425 		 */
5426 
5427 		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5428 		if (ret) {
5429 			mlog_errno(ret);
5430 			goto out;
5431 		}
5432 
5433 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5434 			left_path = ocfs2_new_path_from_path(path);
5435 			if (!left_path) {
5436 				ret = -ENOMEM;
5437 				mlog_errno(ret);
5438 				goto out;
5439 			}
5440 
5441 			ret = ocfs2_find_path(et->et_ci, left_path,
5442 					      left_cpos);
5443 			if (ret) {
5444 				mlog_errno(ret);
5445 				goto out;
5446 			}
5447 		}
5448 	}
5449 
5450 	ret = ocfs2_extend_rotate_transaction(handle, 0,
5451 					jbd2_handle_buffer_credits(handle),
5452 					path);
5453 	if (ret) {
5454 		mlog_errno(ret);
5455 		goto out;
5456 	}
5457 
5458 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5459 	if (ret) {
5460 		mlog_errno(ret);
5461 		goto out;
5462 	}
5463 
5464 	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5465 	if (ret) {
5466 		mlog_errno(ret);
5467 		goto out;
5468 	}
5469 
5470 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5471 	trunc_range = cpos + len;
5472 
5473 	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5474 		int next_free;
5475 
5476 		memset(rec, 0, sizeof(*rec));
5477 		ocfs2_cleanup_merge(el, index);
5478 
5479 		next_free = le16_to_cpu(el->l_next_free_rec);
5480 		if (is_rightmost_tree_rec && next_free > 1) {
5481 			/*
5482 			 * We skip the edge update if this path will
5483 			 * be deleted by the rotate code.
5484 			 */
5485 			rec = &el->l_recs[next_free - 1];
5486 			ocfs2_adjust_rightmost_records(handle, et, path,
5487 						       rec);
5488 		}
5489 	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
5490 		/* Remove leftmost portion of the record. */
5491 		le32_add_cpu(&rec->e_cpos, len);
5492 		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5493 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5494 	} else if (rec_range == trunc_range) {
5495 		/* Remove rightmost portion of the record */
5496 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5497 		if (is_rightmost_tree_rec)
5498 			ocfs2_adjust_rightmost_records(handle, et, path, rec);
5499 	} else {
5500 		/* Caller should have trapped this. */
5501 		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5502 		     "(%u, %u)\n",
5503 		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5504 		     le32_to_cpu(rec->e_cpos),
5505 		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5506 		BUG();
5507 	}
5508 
5509 	if (left_path) {
5510 		int subtree_index;
5511 
5512 		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5513 		ocfs2_complete_edge_insert(handle, left_path, path,
5514 					   subtree_index);
5515 	}
5516 
5517 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5518 
5519 	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5520 	if (ret)
5521 		mlog_errno(ret);
5522 
5523 out:
5524 	ocfs2_free_path(left_path);
5525 	return ret;
5526 }
5527 
5528 int ocfs2_remove_extent(handle_t *handle,
5529 			struct ocfs2_extent_tree *et,
5530 			u32 cpos, u32 len,
5531 			struct ocfs2_alloc_context *meta_ac,
5532 			struct ocfs2_cached_dealloc_ctxt *dealloc)
5533 {
5534 	int ret, index;
5535 	u32 rec_range, trunc_range;
5536 	struct ocfs2_extent_rec *rec;
5537 	struct ocfs2_extent_list *el;
5538 	struct ocfs2_path *path = NULL;
5539 
5540 	/*
5541 	 * XXX: Why are we truncating to 0 instead of wherever this
5542 	 * affects us?
5543 	 */
5544 	ocfs2_et_extent_map_truncate(et, 0);
5545 
5546 	path = ocfs2_new_path_from_et(et);
5547 	if (!path) {
5548 		ret = -ENOMEM;
5549 		mlog_errno(ret);
5550 		goto out;
5551 	}
5552 
5553 	ret = ocfs2_find_path(et->et_ci, path, cpos);
5554 	if (ret) {
5555 		mlog_errno(ret);
5556 		goto out;
5557 	}
5558 
5559 	el = path_leaf_el(path);
5560 	index = ocfs2_search_extent_list(el, cpos);
5561 	if (index == -1) {
5562 		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5563 			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
5564 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5565 			    cpos);
5566 		ret = -EROFS;
5567 		goto out;
5568 	}
5569 
5570 	/*
5571 	 * We have 3 cases of extent removal:
5572 	 *   1) Range covers the entire extent rec
5573 	 *   2) Range begins or ends on one edge of the extent rec
5574 	 *   3) Range is in the middle of the extent rec (no shared edges)
5575 	 *
5576 	 * For case 1 we remove the extent rec and left rotate to
5577 	 * fill the hole.
5578 	 *
5579 	 * For case 2 we just shrink the existing extent rec, with a
5580 	 * tree update if the shrinking edge is also the edge of an
5581 	 * extent block.
5582 	 *
5583 	 * For case 3 we do a right split to turn the extent rec into
5584 	 * something case 2 can handle.
5585 	 */
5586 	rec = &el->l_recs[index];
5587 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5588 	trunc_range = cpos + len;
5589 
5590 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5591 
5592 	trace_ocfs2_remove_extent(
5593 		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5594 		cpos, len, index, le32_to_cpu(rec->e_cpos),
5595 		ocfs2_rec_clusters(el, rec));
5596 
5597 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5598 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5599 					 cpos, len);
5600 		if (ret) {
5601 			mlog_errno(ret);
5602 			goto out;
5603 		}
5604 	} else {
5605 		ret = ocfs2_split_tree(handle, et, path, index,
5606 				       trunc_range, meta_ac);
5607 		if (ret) {
5608 			mlog_errno(ret);
5609 			goto out;
5610 		}
5611 
5612 		/*
5613 		 * The split could have manipulated the tree enough to
5614 		 * move the record location, so we have to look for it again.
5615 		 */
5616 		ocfs2_reinit_path(path, 1);
5617 
5618 		ret = ocfs2_find_path(et->et_ci, path, cpos);
5619 		if (ret) {
5620 			mlog_errno(ret);
5621 			goto out;
5622 		}
5623 
5624 		el = path_leaf_el(path);
5625 		index = ocfs2_search_extent_list(el, cpos);
5626 		if (index == -1) {
5627 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5628 				    "Owner %llu: split at cpos %u lost record\n",
5629 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5630 				    cpos);
5631 			ret = -EROFS;
5632 			goto out;
5633 		}
5634 
5635 		/*
5636 		 * Double check our values here. If anything is fishy,
5637 		 * it's easier to catch it at the top level.
5638 		 */
5639 		rec = &el->l_recs[index];
5640 		rec_range = le32_to_cpu(rec->e_cpos) +
5641 			ocfs2_rec_clusters(el, rec);
5642 		if (rec_range != trunc_range) {
5643 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5644 				    "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
5645 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5646 				    cpos, len, le32_to_cpu(rec->e_cpos),
5647 				    ocfs2_rec_clusters(el, rec));
5648 			ret = -EROFS;
5649 			goto out;
5650 		}
5651 
5652 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5653 					 cpos, len);
5654 		if (ret)
5655 			mlog_errno(ret);
5656 	}
5657 
5658 out:
5659 	ocfs2_free_path(path);
5660 	return ret;
5661 }
5662 
5663 /*
5664  * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5665  * same as ocfs2_lock_alloctors(), except for it accepts a blocks
5666  * number to reserve some extra blocks, and it only handles meta
5667  * data allocations.
5668  *
5669  * Currently, only ocfs2_remove_btree_range() uses it for truncating
5670  * and punching holes.
5671  */
5672 static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5673 					      struct ocfs2_extent_tree *et,
5674 					      u32 extents_to_split,
5675 					      struct ocfs2_alloc_context **ac,
5676 					      int extra_blocks)
5677 {
5678 	int ret = 0, num_free_extents;
5679 	unsigned int max_recs_needed = 2 * extents_to_split;
5680 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5681 
5682 	*ac = NULL;
5683 
5684 	num_free_extents = ocfs2_num_free_extents(et);
5685 	if (num_free_extents < 0) {
5686 		ret = num_free_extents;
5687 		mlog_errno(ret);
5688 		goto out;
5689 	}
5690 
5691 	if (!num_free_extents ||
5692 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5693 		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5694 
5695 	if (extra_blocks) {
5696 		ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5697 		if (ret < 0) {
5698 			if (ret != -ENOSPC)
5699 				mlog_errno(ret);
5700 		}
5701 	}
5702 
5703 out:
5704 	if (ret) {
5705 		if (*ac) {
5706 			ocfs2_free_alloc_context(*ac);
5707 			*ac = NULL;
5708 		}
5709 	}
5710 
5711 	return ret;
5712 }
5713 
5714 int ocfs2_remove_btree_range(struct inode *inode,
5715 			     struct ocfs2_extent_tree *et,
5716 			     u32 cpos, u32 phys_cpos, u32 len, int flags,
5717 			     struct ocfs2_cached_dealloc_ctxt *dealloc,
5718 			     u64 refcount_loc, bool refcount_tree_locked)
5719 {
5720 	int ret, credits = 0, extra_blocks = 0;
5721 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5722 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5723 	struct inode *tl_inode = osb->osb_tl_inode;
5724 	handle_t *handle;
5725 	struct ocfs2_alloc_context *meta_ac = NULL;
5726 	struct ocfs2_refcount_tree *ref_tree = NULL;
5727 
5728 	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5729 		BUG_ON(!ocfs2_is_refcount_inode(inode));
5730 
5731 		if (!refcount_tree_locked) {
5732 			ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5733 						       &ref_tree, NULL);
5734 			if (ret) {
5735 				mlog_errno(ret);
5736 				goto bail;
5737 			}
5738 		}
5739 
5740 		ret = ocfs2_prepare_refcount_change_for_del(inode,
5741 							    refcount_loc,
5742 							    phys_blkno,
5743 							    len,
5744 							    &credits,
5745 							    &extra_blocks);
5746 		if (ret < 0) {
5747 			mlog_errno(ret);
5748 			goto bail;
5749 		}
5750 	}
5751 
5752 	ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5753 						 extra_blocks);
5754 	if (ret) {
5755 		mlog_errno(ret);
5756 		goto bail;
5757 	}
5758 
5759 	inode_lock(tl_inode);
5760 
5761 	if (ocfs2_truncate_log_needs_flush(osb)) {
5762 		ret = __ocfs2_flush_truncate_log(osb);
5763 		if (ret < 0) {
5764 			mlog_errno(ret);
5765 			goto out;
5766 		}
5767 	}
5768 
5769 	handle = ocfs2_start_trans(osb,
5770 			ocfs2_remove_extent_credits(osb->sb) + credits);
5771 	if (IS_ERR(handle)) {
5772 		ret = PTR_ERR(handle);
5773 		mlog_errno(ret);
5774 		goto out;
5775 	}
5776 
5777 	ret = ocfs2_et_root_journal_access(handle, et,
5778 					   OCFS2_JOURNAL_ACCESS_WRITE);
5779 	if (ret) {
5780 		mlog_errno(ret);
5781 		goto out_commit;
5782 	}
5783 
5784 	dquot_free_space_nodirty(inode,
5785 				  ocfs2_clusters_to_bytes(inode->i_sb, len));
5786 
5787 	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5788 	if (ret) {
5789 		mlog_errno(ret);
5790 		goto out_commit;
5791 	}
5792 
5793 	ocfs2_et_update_clusters(et, -len);
5794 	ocfs2_update_inode_fsync_trans(handle, inode, 1);
5795 
5796 	ocfs2_journal_dirty(handle, et->et_root_bh);
5797 
5798 	if (phys_blkno) {
5799 		if (flags & OCFS2_EXT_REFCOUNTED)
5800 			ret = ocfs2_decrease_refcount(inode, handle,
5801 					ocfs2_blocks_to_clusters(osb->sb,
5802 								 phys_blkno),
5803 					len, meta_ac,
5804 					dealloc, 1);
5805 		else
5806 			ret = ocfs2_truncate_log_append(osb, handle,
5807 							phys_blkno, len);
5808 		if (ret)
5809 			mlog_errno(ret);
5810 
5811 	}
5812 
5813 out_commit:
5814 	ocfs2_commit_trans(osb, handle);
5815 out:
5816 	inode_unlock(tl_inode);
5817 bail:
5818 	if (meta_ac)
5819 		ocfs2_free_alloc_context(meta_ac);
5820 
5821 	if (ref_tree)
5822 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5823 
5824 	return ret;
5825 }
5826 
5827 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5828 {
5829 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5830 	struct ocfs2_dinode *di;
5831 	struct ocfs2_truncate_log *tl;
5832 
5833 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5834 	tl = &di->id2.i_dealloc;
5835 
5836 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5837 			"slot %d, invalid truncate log parameters: used = "
5838 			"%u, count = %u\n", osb->slot_num,
5839 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5840 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5841 }
5842 
5843 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5844 					   unsigned int new_start)
5845 {
5846 	unsigned int tail_index;
5847 	unsigned int current_tail;
5848 
5849 	/* No records, nothing to coalesce */
5850 	if (!le16_to_cpu(tl->tl_used))
5851 		return 0;
5852 
5853 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5854 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5855 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5856 
5857 	return current_tail == new_start;
5858 }
5859 
5860 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5861 			      handle_t *handle,
5862 			      u64 start_blk,
5863 			      unsigned int num_clusters)
5864 {
5865 	int status, index;
5866 	unsigned int start_cluster, tl_count;
5867 	struct inode *tl_inode = osb->osb_tl_inode;
5868 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5869 	struct ocfs2_dinode *di;
5870 	struct ocfs2_truncate_log *tl;
5871 
5872 	BUG_ON(inode_trylock(tl_inode));
5873 
5874 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5875 
5876 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5877 
5878 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5879 	 * by the underlying call to ocfs2_read_inode_block(), so any
5880 	 * corruption is a code bug */
5881 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5882 
5883 	tl = &di->id2.i_dealloc;
5884 	tl_count = le16_to_cpu(tl->tl_count);
5885 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5886 			tl_count == 0,
5887 			"Truncate record count on #%llu invalid "
5888 			"wanted %u, actual %u\n",
5889 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5890 			ocfs2_truncate_recs_per_inode(osb->sb),
5891 			le16_to_cpu(tl->tl_count));
5892 
5893 	/* Caller should have known to flush before calling us. */
5894 	index = le16_to_cpu(tl->tl_used);
5895 	if (index >= tl_count) {
5896 		status = -ENOSPC;
5897 		mlog_errno(status);
5898 		goto bail;
5899 	}
5900 
5901 	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5902 					 OCFS2_JOURNAL_ACCESS_WRITE);
5903 	if (status < 0) {
5904 		mlog_errno(status);
5905 		goto bail;
5906 	}
5907 
5908 	trace_ocfs2_truncate_log_append(
5909 		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5910 		start_cluster, num_clusters);
5911 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5912 		/*
5913 		 * Move index back to the record we are coalescing with.
5914 		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5915 		 */
5916 		index--;
5917 
5918 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5919 		trace_ocfs2_truncate_log_append(
5920 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5921 			index, le32_to_cpu(tl->tl_recs[index].t_start),
5922 			num_clusters);
5923 	} else {
5924 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5925 		tl->tl_used = cpu_to_le16(index + 1);
5926 	}
5927 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5928 
5929 	ocfs2_journal_dirty(handle, tl_bh);
5930 
5931 	osb->truncated_clusters += num_clusters;
5932 bail:
5933 	return status;
5934 }
5935 
5936 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5937 					 struct inode *data_alloc_inode,
5938 					 struct buffer_head *data_alloc_bh)
5939 {
5940 	int status = 0;
5941 	int i;
5942 	unsigned int num_clusters;
5943 	u64 start_blk;
5944 	struct ocfs2_truncate_rec rec;
5945 	struct ocfs2_dinode *di;
5946 	struct ocfs2_truncate_log *tl;
5947 	struct inode *tl_inode = osb->osb_tl_inode;
5948 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5949 	handle_t *handle;
5950 
5951 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5952 	tl = &di->id2.i_dealloc;
5953 	i = le16_to_cpu(tl->tl_used) - 1;
5954 	while (i >= 0) {
5955 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5956 		if (IS_ERR(handle)) {
5957 			status = PTR_ERR(handle);
5958 			mlog_errno(status);
5959 			goto bail;
5960 		}
5961 
5962 		/* Caller has given us at least enough credits to
5963 		 * update the truncate log dinode */
5964 		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5965 						 OCFS2_JOURNAL_ACCESS_WRITE);
5966 		if (status < 0) {
5967 			ocfs2_commit_trans(osb, handle);
5968 			mlog_errno(status);
5969 			goto bail;
5970 		}
5971 
5972 		tl->tl_used = cpu_to_le16(i);
5973 
5974 		ocfs2_journal_dirty(handle, tl_bh);
5975 
5976 		rec = tl->tl_recs[i];
5977 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5978 						    le32_to_cpu(rec.t_start));
5979 		num_clusters = le32_to_cpu(rec.t_clusters);
5980 
5981 		/* if start_blk is not set, we ignore the record as
5982 		 * invalid. */
5983 		if (start_blk) {
5984 			trace_ocfs2_replay_truncate_records(
5985 				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5986 				i, le32_to_cpu(rec.t_start), num_clusters);
5987 
5988 			status = ocfs2_free_clusters(handle, data_alloc_inode,
5989 						     data_alloc_bh, start_blk,
5990 						     num_clusters);
5991 			if (status < 0) {
5992 				ocfs2_commit_trans(osb, handle);
5993 				mlog_errno(status);
5994 				goto bail;
5995 			}
5996 		}
5997 
5998 		ocfs2_commit_trans(osb, handle);
5999 		i--;
6000 	}
6001 
6002 	osb->truncated_clusters = 0;
6003 
6004 bail:
6005 	return status;
6006 }
6007 
6008 /* Expects you to already be holding tl_inode->i_rwsem */
6009 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6010 {
6011 	int status;
6012 	unsigned int num_to_flush;
6013 	struct inode *tl_inode = osb->osb_tl_inode;
6014 	struct inode *data_alloc_inode = NULL;
6015 	struct buffer_head *tl_bh = osb->osb_tl_bh;
6016 	struct buffer_head *data_alloc_bh = NULL;
6017 	struct ocfs2_dinode *di;
6018 	struct ocfs2_truncate_log *tl;
6019 	struct ocfs2_journal *journal = osb->journal;
6020 
6021 	BUG_ON(inode_trylock(tl_inode));
6022 
6023 	di = (struct ocfs2_dinode *) tl_bh->b_data;
6024 
6025 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
6026 	 * by the underlying call to ocfs2_read_inode_block(), so any
6027 	 * corruption is a code bug */
6028 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6029 
6030 	tl = &di->id2.i_dealloc;
6031 	num_to_flush = le16_to_cpu(tl->tl_used);
6032 	trace_ocfs2_flush_truncate_log(
6033 		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
6034 		num_to_flush);
6035 	if (!num_to_flush) {
6036 		status = 0;
6037 		goto out;
6038 	}
6039 
6040 	/* Appending truncate log(TA) and flushing truncate log(TF) are
6041 	 * two separated transactions. They can be both committed but not
6042 	 * checkpointed. If crash occurs then, both two transaction will be
6043 	 * replayed with several already released to global bitmap clusters.
6044 	 * Then truncate log will be replayed resulting in cluster double free.
6045 	 */
6046 	jbd2_journal_lock_updates(journal->j_journal);
6047 	status = jbd2_journal_flush(journal->j_journal, 0);
6048 	jbd2_journal_unlock_updates(journal->j_journal);
6049 	if (status < 0) {
6050 		mlog_errno(status);
6051 		goto out;
6052 	}
6053 
6054 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
6055 						       GLOBAL_BITMAP_SYSTEM_INODE,
6056 						       OCFS2_INVALID_SLOT);
6057 	if (!data_alloc_inode) {
6058 		status = -EINVAL;
6059 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
6060 		goto out;
6061 	}
6062 
6063 	inode_lock(data_alloc_inode);
6064 
6065 	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
6066 	if (status < 0) {
6067 		mlog_errno(status);
6068 		goto out_mutex;
6069 	}
6070 
6071 	status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
6072 					       data_alloc_bh);
6073 	if (status < 0)
6074 		mlog_errno(status);
6075 
6076 	brelse(data_alloc_bh);
6077 	ocfs2_inode_unlock(data_alloc_inode, 1);
6078 
6079 out_mutex:
6080 	inode_unlock(data_alloc_inode);
6081 	iput(data_alloc_inode);
6082 
6083 out:
6084 	return status;
6085 }
6086 
6087 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6088 {
6089 	int status;
6090 	struct inode *tl_inode = osb->osb_tl_inode;
6091 
6092 	inode_lock(tl_inode);
6093 	status = __ocfs2_flush_truncate_log(osb);
6094 	inode_unlock(tl_inode);
6095 
6096 	return status;
6097 }
6098 
6099 static void ocfs2_truncate_log_worker(struct work_struct *work)
6100 {
6101 	int status;
6102 	struct ocfs2_super *osb =
6103 		container_of(work, struct ocfs2_super,
6104 			     osb_truncate_log_wq.work);
6105 
6106 	status = ocfs2_flush_truncate_log(osb);
6107 	if (status < 0)
6108 		mlog_errno(status);
6109 	else
6110 		ocfs2_init_steal_slots(osb);
6111 }
6112 
6113 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
6114 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6115 				       int cancel)
6116 {
6117 	if (osb->osb_tl_inode &&
6118 			atomic_read(&osb->osb_tl_disable) == 0) {
6119 		/* We want to push off log flushes while truncates are
6120 		 * still running. */
6121 		if (cancel)
6122 			cancel_delayed_work(&osb->osb_truncate_log_wq);
6123 
6124 		queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6125 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6126 	}
6127 }
6128 
6129 /*
6130  * Try to flush truncate logs if we can free enough clusters from it.
6131  * As for return value, "< 0" means error, "0" no space and "1" means
6132  * we have freed enough spaces and let the caller try to allocate again.
6133  */
6134 int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
6135 					unsigned int needed)
6136 {
6137 	tid_t target;
6138 	int ret = 0;
6139 	unsigned int truncated_clusters;
6140 
6141 	inode_lock(osb->osb_tl_inode);
6142 	truncated_clusters = osb->truncated_clusters;
6143 	inode_unlock(osb->osb_tl_inode);
6144 
6145 	/*
6146 	 * Check whether we can succeed in allocating if we free
6147 	 * the truncate log.
6148 	 */
6149 	if (truncated_clusters < needed)
6150 		goto out;
6151 
6152 	ret = ocfs2_flush_truncate_log(osb);
6153 	if (ret) {
6154 		mlog_errno(ret);
6155 		goto out;
6156 	}
6157 
6158 	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
6159 		jbd2_log_wait_commit(osb->journal->j_journal, target);
6160 		ret = 1;
6161 	}
6162 out:
6163 	return ret;
6164 }
6165 
6166 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6167 				       int slot_num,
6168 				       struct inode **tl_inode,
6169 				       struct buffer_head **tl_bh)
6170 {
6171 	int status;
6172 	struct inode *inode = NULL;
6173 	struct buffer_head *bh = NULL;
6174 	struct ocfs2_dinode *di;
6175 	struct ocfs2_truncate_log *tl;
6176 	unsigned int tl_count, tl_used;
6177 
6178 	inode = ocfs2_get_system_file_inode(osb,
6179 					   TRUNCATE_LOG_SYSTEM_INODE,
6180 					   slot_num);
6181 	if (!inode) {
6182 		status = -EINVAL;
6183 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
6184 		goto bail;
6185 	}
6186 
6187 	status = ocfs2_read_inode_block(inode, &bh);
6188 	if (status < 0) {
6189 		iput(inode);
6190 		mlog_errno(status);
6191 		goto bail;
6192 	}
6193 
6194 	di = (struct ocfs2_dinode *)bh->b_data;
6195 	tl = &di->id2.i_dealloc;
6196 	tl_count = le16_to_cpu(tl->tl_count);
6197 	tl_used = le16_to_cpu(tl->tl_used);
6198 	if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
6199 		     tl_count == 0 ||
6200 		     tl_used > tl_count)) {
6201 		status = -EFSCORRUPTED;
6202 		iput(inode);
6203 		brelse(bh);
6204 		mlog_errno(status);
6205 		goto bail;
6206 	}
6207 
6208 	*tl_inode = inode;
6209 	*tl_bh    = bh;
6210 bail:
6211 	return status;
6212 }
6213 
6214 /* called during the 1st stage of node recovery. we stamp a clean
6215  * truncate log and pass back a copy for processing later. if the
6216  * truncate log does not require processing, a *tl_copy is set to
6217  * NULL. */
6218 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6219 				      int slot_num,
6220 				      struct ocfs2_dinode **tl_copy)
6221 {
6222 	int status;
6223 	struct inode *tl_inode = NULL;
6224 	struct buffer_head *tl_bh = NULL;
6225 	struct ocfs2_dinode *di;
6226 	struct ocfs2_truncate_log *tl;
6227 
6228 	*tl_copy = NULL;
6229 
6230 	trace_ocfs2_begin_truncate_log_recovery(slot_num);
6231 
6232 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6233 	if (status < 0) {
6234 		mlog_errno(status);
6235 		goto bail;
6236 	}
6237 
6238 	di = (struct ocfs2_dinode *) tl_bh->b_data;
6239 
6240 	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6241 	 * validated by the underlying call to ocfs2_read_inode_block(),
6242 	 * so any corruption is a code bug */
6243 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6244 
6245 	tl = &di->id2.i_dealloc;
6246 	if (le16_to_cpu(tl->tl_used)) {
6247 		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6248 
6249 		/*
6250 		 * Assuming the write-out below goes well, this copy will be
6251 		 * passed back to recovery for processing.
6252 		 */
6253 		*tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL);
6254 		if (!(*tl_copy)) {
6255 			status = -ENOMEM;
6256 			mlog_errno(status);
6257 			goto bail;
6258 		}
6259 
6260 		/* All we need to do to clear the truncate log is set
6261 		 * tl_used. */
6262 		tl->tl_used = 0;
6263 
6264 		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6265 		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6266 		if (status < 0) {
6267 			mlog_errno(status);
6268 			goto bail;
6269 		}
6270 	}
6271 
6272 bail:
6273 	iput(tl_inode);
6274 	brelse(tl_bh);
6275 
6276 	if (status < 0) {
6277 		kfree(*tl_copy);
6278 		*tl_copy = NULL;
6279 		mlog_errno(status);
6280 	}
6281 
6282 	return status;
6283 }
6284 
6285 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6286 					 struct ocfs2_dinode *tl_copy)
6287 {
6288 	int status = 0;
6289 	int i;
6290 	unsigned int clusters, num_recs, start_cluster;
6291 	u64 start_blk;
6292 	handle_t *handle;
6293 	struct inode *tl_inode = osb->osb_tl_inode;
6294 	struct ocfs2_truncate_log *tl;
6295 
6296 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6297 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6298 		return -EINVAL;
6299 	}
6300 
6301 	tl = &tl_copy->id2.i_dealloc;
6302 	num_recs = le16_to_cpu(tl->tl_used);
6303 	trace_ocfs2_complete_truncate_log_recovery(
6304 		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6305 		num_recs);
6306 
6307 	inode_lock(tl_inode);
6308 	for(i = 0; i < num_recs; i++) {
6309 		if (ocfs2_truncate_log_needs_flush(osb)) {
6310 			status = __ocfs2_flush_truncate_log(osb);
6311 			if (status < 0) {
6312 				mlog_errno(status);
6313 				goto bail_up;
6314 			}
6315 		}
6316 
6317 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6318 		if (IS_ERR(handle)) {
6319 			status = PTR_ERR(handle);
6320 			mlog_errno(status);
6321 			goto bail_up;
6322 		}
6323 
6324 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6325 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6326 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6327 
6328 		status = ocfs2_truncate_log_append(osb, handle,
6329 						   start_blk, clusters);
6330 		ocfs2_commit_trans(osb, handle);
6331 		if (status < 0) {
6332 			mlog_errno(status);
6333 			goto bail_up;
6334 		}
6335 	}
6336 
6337 bail_up:
6338 	inode_unlock(tl_inode);
6339 
6340 	return status;
6341 }
6342 
6343 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6344 {
6345 	int status;
6346 	struct inode *tl_inode = osb->osb_tl_inode;
6347 
6348 	atomic_set(&osb->osb_tl_disable, 1);
6349 
6350 	if (tl_inode) {
6351 		cancel_delayed_work(&osb->osb_truncate_log_wq);
6352 		flush_workqueue(osb->ocfs2_wq);
6353 
6354 		status = ocfs2_flush_truncate_log(osb);
6355 		if (status < 0)
6356 			mlog_errno(status);
6357 
6358 		brelse(osb->osb_tl_bh);
6359 		iput(osb->osb_tl_inode);
6360 	}
6361 }
6362 
6363 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6364 {
6365 	int status;
6366 	struct inode *tl_inode = NULL;
6367 	struct buffer_head *tl_bh = NULL;
6368 
6369 	status = ocfs2_get_truncate_log_info(osb,
6370 					     osb->slot_num,
6371 					     &tl_inode,
6372 					     &tl_bh);
6373 	if (status < 0)
6374 		mlog_errno(status);
6375 
6376 	/* ocfs2_truncate_log_shutdown keys on the existence of
6377 	 * osb->osb_tl_inode so we don't set any of the osb variables
6378 	 * until we're sure all is well. */
6379 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6380 			  ocfs2_truncate_log_worker);
6381 	atomic_set(&osb->osb_tl_disable, 0);
6382 	osb->osb_tl_bh    = tl_bh;
6383 	osb->osb_tl_inode = tl_inode;
6384 
6385 	return status;
6386 }
6387 
6388 /*
6389  * Delayed de-allocation of suballocator blocks.
6390  *
6391  * Some sets of block de-allocations might involve multiple suballocator inodes.
6392  *
6393  * The locking for this can get extremely complicated, especially when
6394  * the suballocator inodes to delete from aren't known until deep
6395  * within an unrelated codepath.
6396  *
6397  * ocfs2_extent_block structures are a good example of this - an inode
6398  * btree could have been grown by any number of nodes each allocating
6399  * out of their own suballoc inode.
6400  *
6401  * These structures allow the delay of block de-allocation until a
6402  * later time, when locking of multiple cluster inodes won't cause
6403  * deadlock.
6404  */
6405 
/*
 * Describe a single bit freed from a suballocator.  For the block
 * suballocators, it represents one block.  For the global cluster
 * allocator, it represents a run of clusters and free_bit holds the
 * number of clusters.
 */
struct ocfs2_cached_block_free {
	/* Next queued item on the same list; NULL terminates the list. */
	struct ocfs2_cached_block_free		*free_next;
	/*
	 * Block number of the owning suballocator group.  When zero, the
	 * group is derived from free_blk and free_bit at free time.
	 */
	u64					free_bg;
	/* Block number being freed (start block for cluster frees). */
	u64					free_blk;
	/* Bit within the group, or the cluster count for cluster frees. */
	unsigned int				free_bit;
};
6418 
/*
 * All cached frees destined for one suballocator system inode,
 * identified by its (f_inode_type, f_slot) pair.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list in the dealloc context. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System inode type passed to ocfs2_get_system_file_inode(). */
	int					f_inode_type;
	/* Slot number passed to ocfs2_get_system_file_inode(). */
	int					f_slot;
	/* Head of the queued frees; may be NULL when nothing is queued. */
	struct ocfs2_cached_block_free		*f_first;
};
6425 
6426 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6427 				    int sysfile_type,
6428 				    int slot,
6429 				    struct ocfs2_cached_block_free *head)
6430 {
6431 	int ret;
6432 	u64 bg_blkno;
6433 	handle_t *handle;
6434 	struct inode *inode;
6435 	struct buffer_head *di_bh = NULL;
6436 	struct ocfs2_cached_block_free *tmp;
6437 
6438 	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6439 	if (!inode) {
6440 		ret = -EINVAL;
6441 		mlog_errno(ret);
6442 		goto out;
6443 	}
6444 
6445 	inode_lock(inode);
6446 
6447 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
6448 	if (ret) {
6449 		mlog_errno(ret);
6450 		goto out_mutex;
6451 	}
6452 
6453 	while (head) {
6454 		if (head->free_bg)
6455 			bg_blkno = head->free_bg;
6456 		else
6457 			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6458 							      head->free_bit);
6459 		handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6460 		if (IS_ERR(handle)) {
6461 			ret = PTR_ERR(handle);
6462 			mlog_errno(ret);
6463 			goto out_unlock;
6464 		}
6465 
6466 		trace_ocfs2_free_cached_blocks(
6467 		     (unsigned long long)head->free_blk, head->free_bit);
6468 
6469 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6470 					       head->free_bit, bg_blkno, 1);
6471 		if (ret)
6472 			mlog_errno(ret);
6473 
6474 		ocfs2_commit_trans(osb, handle);
6475 
6476 		tmp = head;
6477 		head = head->free_next;
6478 		kfree(tmp);
6479 	}
6480 
6481 out_unlock:
6482 	ocfs2_inode_unlock(inode, 1);
6483 	brelse(di_bh);
6484 out_mutex:
6485 	inode_unlock(inode);
6486 	iput(inode);
6487 out:
6488 	while(head) {
6489 		/* Premature exit may have left some dangling items. */
6490 		tmp = head;
6491 		head = head->free_next;
6492 		kfree(tmp);
6493 	}
6494 
6495 	return ret;
6496 }
6497 
6498 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6499 				u64 blkno, unsigned int bit)
6500 {
6501 	int ret = 0;
6502 	struct ocfs2_cached_block_free *item;
6503 
6504 	item = kzalloc_obj(*item, GFP_NOFS);
6505 	if (item == NULL) {
6506 		ret = -ENOMEM;
6507 		mlog_errno(ret);
6508 		return ret;
6509 	}
6510 
6511 	trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6512 
6513 	item->free_blk = blkno;
6514 	item->free_bit = bit;
6515 	item->free_next = ctxt->c_global_allocator;
6516 
6517 	ctxt->c_global_allocator = item;
6518 	return ret;
6519 }
6520 
6521 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6522 				      struct ocfs2_cached_block_free *head)
6523 {
6524 	struct ocfs2_cached_block_free *tmp;
6525 	struct inode *tl_inode = osb->osb_tl_inode;
6526 	handle_t *handle;
6527 	int ret = 0;
6528 
6529 	inode_lock(tl_inode);
6530 
6531 	while (head) {
6532 		if (ocfs2_truncate_log_needs_flush(osb)) {
6533 			ret = __ocfs2_flush_truncate_log(osb);
6534 			if (ret < 0) {
6535 				mlog_errno(ret);
6536 				break;
6537 			}
6538 		}
6539 
6540 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6541 		if (IS_ERR(handle)) {
6542 			ret = PTR_ERR(handle);
6543 			mlog_errno(ret);
6544 			break;
6545 		}
6546 
6547 		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6548 						head->free_bit);
6549 
6550 		ocfs2_commit_trans(osb, handle);
6551 		tmp = head;
6552 		head = head->free_next;
6553 		kfree(tmp);
6554 
6555 		if (ret < 0) {
6556 			mlog_errno(ret);
6557 			break;
6558 		}
6559 	}
6560 
6561 	inode_unlock(tl_inode);
6562 
6563 	while (head) {
6564 		/* Premature exit may have left some dangling items. */
6565 		tmp = head;
6566 		head = head->free_next;
6567 		kfree(tmp);
6568 	}
6569 
6570 	return ret;
6571 }
6572 
6573 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6574 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
6575 {
6576 	int ret = 0, ret2;
6577 	struct ocfs2_per_slot_free_list *fl;
6578 
6579 	if (!ctxt)
6580 		return 0;
6581 
6582 	while (ctxt->c_first_suballocator) {
6583 		fl = ctxt->c_first_suballocator;
6584 
6585 		if (fl->f_first) {
6586 			trace_ocfs2_run_deallocs(fl->f_inode_type,
6587 						 fl->f_slot);
6588 			ret2 = ocfs2_free_cached_blocks(osb,
6589 							fl->f_inode_type,
6590 							fl->f_slot,
6591 							fl->f_first);
6592 			if (ret2)
6593 				mlog_errno(ret2);
6594 			if (!ret)
6595 				ret = ret2;
6596 		}
6597 
6598 		ctxt->c_first_suballocator = fl->f_next_suballocator;
6599 		kfree(fl);
6600 	}
6601 
6602 	if (ctxt->c_global_allocator) {
6603 		ret2 = ocfs2_free_cached_clusters(osb,
6604 						  ctxt->c_global_allocator);
6605 		if (ret2)
6606 			mlog_errno(ret2);
6607 		if (!ret)
6608 			ret = ret2;
6609 
6610 		ctxt->c_global_allocator = NULL;
6611 	}
6612 
6613 	return ret;
6614 }
6615 
6616 static struct ocfs2_per_slot_free_list *
6617 ocfs2_find_per_slot_free_list(int type,
6618 			      int slot,
6619 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
6620 {
6621 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6622 
6623 	while (fl) {
6624 		if (fl->f_inode_type == type && fl->f_slot == slot)
6625 			return fl;
6626 
6627 		fl = fl->f_next_suballocator;
6628 	}
6629 
6630 	fl = kmalloc_obj(*fl, GFP_NOFS);
6631 	if (fl) {
6632 		fl->f_inode_type = type;
6633 		fl->f_slot = slot;
6634 		fl->f_first = NULL;
6635 		fl->f_next_suballocator = ctxt->c_first_suballocator;
6636 
6637 		ctxt->c_first_suballocator = fl;
6638 	}
6639 	return fl;
6640 }
6641 
6642 static struct ocfs2_per_slot_free_list *
6643 ocfs2_find_preferred_free_list(int type,
6644 			       int preferred_slot,
6645 			       int *real_slot,
6646 			       struct ocfs2_cached_dealloc_ctxt *ctxt)
6647 {
6648 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6649 
6650 	while (fl) {
6651 		if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
6652 			*real_slot = fl->f_slot;
6653 			return fl;
6654 		}
6655 
6656 		fl = fl->f_next_suballocator;
6657 	}
6658 
6659 	/* If we can't find any free list matching preferred slot, just use
6660 	 * the first one.
6661 	 */
6662 	fl = ctxt->c_first_suballocator;
6663 	*real_slot = fl->f_slot;
6664 
6665 	return fl;
6666 }
6667 
6668 /* Return Value 1 indicates empty */
6669 static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
6670 {
6671 	struct ocfs2_per_slot_free_list *fl = NULL;
6672 
6673 	if (!et->et_dealloc)
6674 		return 1;
6675 
6676 	fl = et->et_dealloc->c_first_suballocator;
6677 	if (!fl)
6678 		return 1;
6679 
6680 	if (!fl->f_first)
6681 		return 1;
6682 
6683 	return 0;
6684 }
6685 
6686 /* If extent was deleted from tree due to extent rotation and merging, and
6687  * no metadata is reserved ahead of time. Try to reuse some extents
6688  * just deleted. This is only used to reuse extent blocks.
6689  * It is supposed to find enough extent blocks in dealloc if our estimation
6690  * on metadata is accurate.
6691  */
6692 static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
6693 					struct ocfs2_extent_tree *et,
6694 					struct buffer_head **new_eb_bh,
6695 					int blk_wanted, int *blk_given)
6696 {
6697 	int i, status = 0, real_slot;
6698 	struct ocfs2_cached_dealloc_ctxt *dealloc;
6699 	struct ocfs2_per_slot_free_list *fl;
6700 	struct ocfs2_cached_block_free *bf;
6701 	struct ocfs2_extent_block *eb;
6702 	struct ocfs2_super *osb =
6703 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
6704 
6705 	*blk_given = 0;
6706 
6707 	/* If extent tree doesn't have a dealloc, this is not faulty. Just
6708 	 * tell upper caller dealloc can't provide any block and it should
6709 	 * ask for alloc to claim more space.
6710 	 */
6711 	dealloc = et->et_dealloc;
6712 	if (!dealloc)
6713 		goto bail;
6714 
6715 	for (i = 0; i < blk_wanted; i++) {
6716 		/* Prefer to use local slot */
6717 		fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
6718 						    osb->slot_num, &real_slot,
6719 						    dealloc);
6720 		/* If no more block can be reused, we should claim more
6721 		 * from alloc. Just return here normally.
6722 		 */
6723 		if (!fl) {
6724 			status = 0;
6725 			break;
6726 		}
6727 
6728 		bf = fl->f_first;
6729 		fl->f_first = bf->free_next;
6730 
6731 		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
6732 		if (new_eb_bh[i] == NULL) {
6733 			status = -ENOMEM;
6734 			mlog_errno(status);
6735 			goto bail;
6736 		}
6737 
6738 		mlog(0, "Reusing block(%llu) from "
6739 		     "dealloc(local slot:%d, real slot:%d)\n",
6740 		     bf->free_blk, osb->slot_num, real_slot);
6741 
6742 		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
6743 
6744 		status = ocfs2_journal_access_eb(handle, et->et_ci,
6745 						 new_eb_bh[i],
6746 						 OCFS2_JOURNAL_ACCESS_CREATE);
6747 		if (status < 0) {
6748 			mlog_errno(status);
6749 			goto bail;
6750 		}
6751 
6752 		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
6753 		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
6754 
6755 		/* We can't guarantee that buffer head is still cached, so
6756 		 * polutlate the extent block again.
6757 		 */
6758 		strscpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
6759 		eb->h_blkno = cpu_to_le64(bf->free_blk);
6760 		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
6761 		eb->h_suballoc_slot = cpu_to_le16(real_slot);
6762 		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
6763 		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
6764 		eb->h_list.l_count =
6765 			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
6766 
6767 		/* We'll also be dirtied by the caller, so
6768 		 * this isn't absolutely necessary.
6769 		 */
6770 		ocfs2_journal_dirty(handle, new_eb_bh[i]);
6771 
6772 		if (!fl->f_first) {
6773 			dealloc->c_first_suballocator = fl->f_next_suballocator;
6774 			kfree(fl);
6775 		}
6776 		kfree(bf);
6777 	}
6778 
6779 	*blk_given = i;
6780 
6781 bail:
6782 	if (unlikely(status < 0)) {
6783 		for (i = 0; i < blk_wanted; i++)
6784 			brelse(new_eb_bh[i]);
6785 	}
6786 
6787 	return status;
6788 }
6789 
6790 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6791 			      int type, int slot, u64 suballoc,
6792 			      u64 blkno, unsigned int bit)
6793 {
6794 	int ret;
6795 	struct ocfs2_per_slot_free_list *fl;
6796 	struct ocfs2_cached_block_free *item;
6797 
6798 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6799 	if (fl == NULL) {
6800 		ret = -ENOMEM;
6801 		mlog_errno(ret);
6802 		goto out;
6803 	}
6804 
6805 	item = kzalloc_obj(*item, GFP_NOFS);
6806 	if (item == NULL) {
6807 		ret = -ENOMEM;
6808 		mlog_errno(ret);
6809 		goto out;
6810 	}
6811 
6812 	trace_ocfs2_cache_block_dealloc(type, slot,
6813 					(unsigned long long)suballoc,
6814 					(unsigned long long)blkno, bit);
6815 
6816 	item->free_bg = suballoc;
6817 	item->free_blk = blkno;
6818 	item->free_bit = bit;
6819 	item->free_next = fl->f_first;
6820 
6821 	fl->f_first = item;
6822 
6823 	ret = 0;
6824 out:
6825 	return ret;
6826 }
6827 
6828 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6829 					 struct ocfs2_extent_block *eb)
6830 {
6831 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6832 					 le16_to_cpu(eb->h_suballoc_slot),
6833 					 le64_to_cpu(eb->h_suballoc_loc),
6834 					 le64_to_cpu(eb->h_blkno),
6835 					 le16_to_cpu(eb->h_suballoc_bit));
6836 }
6837 
/*
 * Per-buffer callback handed to walk_page_buffers() by
 * ocfs2_map_and_dirty_folio(): marks @bh up to date and dirty.
 * @handle is unused.  Always returns 0.
 */
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
6844 
6845 void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle,
6846 		size_t from, size_t to, struct folio *folio, int zero,
6847 		u64 *phys)
6848 {
6849 	int ret, partial = 0;
6850 	loff_t start_byte = folio_pos(folio) + from;
6851 	loff_t length = to - from;
6852 
6853 	ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0);
6854 	if (ret)
6855 		mlog_errno(ret);
6856 
6857 	if (zero)
6858 		folio_zero_segment(folio, from, to);
6859 
6860 	/*
6861 	 * Need to set the buffers we zero'd into uptodate
6862 	 * here if they aren't - ocfs2_map_page_blocks()
6863 	 * might've skipped some
6864 	 */
6865 	ret = walk_page_buffers(handle, folio_buffers(folio),
6866 				from, to, &partial,
6867 				ocfs2_zero_func);
6868 	if (ret < 0)
6869 		mlog_errno(ret);
6870 	else if (ocfs2_should_order_data(inode)) {
6871 		ret = ocfs2_jbd2_inode_add_write(handle, inode,
6872 						 start_byte, length);
6873 		if (ret < 0)
6874 			mlog_errno(ret);
6875 	}
6876 
6877 	if (!partial)
6878 		folio_mark_uptodate(folio);
6879 
6880 	flush_dcache_folio(folio);
6881 }
6882 
6883 static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
6884 		loff_t end, struct folio **folios, int numfolios,
6885 		u64 phys, handle_t *handle)
6886 {
6887 	int i;
6888 	struct super_block *sb = inode->i_sb;
6889 
6890 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6891 
6892 	if (numfolios == 0)
6893 		goto out;
6894 
6895 	for (i = 0; i < numfolios; i++) {
6896 		struct folio *folio = folios[i];
6897 		size_t to = folio_size(folio);
6898 		size_t from = offset_in_folio(folio, start);
6899 
6900 		if (to > end - folio_pos(folio))
6901 			to = end - folio_pos(folio);
6902 
6903 		ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
6904 				&phys);
6905 
6906 		start = folio_next_pos(folio);
6907 	}
6908 out:
6909 	if (folios)
6910 		ocfs2_unlock_and_free_folios(folios, numfolios);
6911 }
6912 
6913 static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
6914 		struct folio **folios, int *num)
6915 {
6916 	int numfolios, ret = 0;
6917 	struct address_space *mapping = inode->i_mapping;
6918 	unsigned long index;
6919 	loff_t last_page_bytes;
6920 
6921 	BUG_ON(start > end);
6922 
6923 	numfolios = 0;
6924 	last_page_bytes = PAGE_ALIGN(end);
6925 	index = start >> PAGE_SHIFT;
6926 	do {
6927 		folios[numfolios] = __filemap_get_folio(mapping, index,
6928 				FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
6929 		if (IS_ERR(folios[numfolios])) {
6930 			ret = PTR_ERR(folios[numfolios]);
6931 			mlog_errno(ret);
6932 			folios[numfolios] = NULL;
6933 			goto out;
6934 		}
6935 
6936 		index = folio_next_index(folios[numfolios]);
6937 		numfolios++;
6938 	} while (index < (last_page_bytes >> PAGE_SHIFT));
6939 
6940 out:
6941 	if (ret != 0) {
6942 		ocfs2_unlock_and_free_folios(folios, numfolios);
6943 		numfolios = 0;
6944 	}
6945 
6946 	*num = numfolios;
6947 
6948 	return ret;
6949 }
6950 
6951 static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end,
6952 				struct folio **folios, int *num)
6953 {
6954 	struct super_block *sb = inode->i_sb;
6955 
6956 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6957 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6958 
6959 	return ocfs2_grab_folios(inode, start, end, folios, num);
6960 }
6961 
6962 /*
6963  * Zero partial cluster for a hole punch or truncate. This avoids exposing
6964  * nonzero data on subsequent file extends.
6965  *
6966  * We need to call this before i_size is updated on the inode because
6967  * otherwise block_write_full_folio() will skip writeout of pages past
6968  * i_size.
6969  */
6970 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6971 				  u64 range_start, u64 range_end)
6972 {
6973 	int ret = 0, numfolios;
6974 	struct folio **folios = NULL;
6975 	u64 phys;
6976 	unsigned int ext_flags;
6977 	struct super_block *sb = inode->i_sb;
6978 
6979 	/*
6980 	 * File systems which don't support sparse files zero on every
6981 	 * extend.
6982 	 */
6983 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6984 		return 0;
6985 
6986 	/*
6987 	 * Avoid zeroing folios fully beyond current i_size. It is pointless as
6988 	 * underlying blocks of those folios should be already zeroed out and
6989 	 * page writeback will skip them anyway.
6990 	 */
6991 	range_end = min_t(u64, range_end, i_size_read(inode));
6992 	if (range_start >= range_end)
6993 		return 0;
6994 
6995 	folios = kzalloc_objs(struct folio *, ocfs2_pages_per_cluster(sb),
6996 			      GFP_NOFS);
6997 	if (folios == NULL) {
6998 		ret = -ENOMEM;
6999 		mlog_errno(ret);
7000 		goto out;
7001 	}
7002 
7003 	ret = ocfs2_extent_map_get_blocks(inode,
7004 					  range_start >> sb->s_blocksize_bits,
7005 					  &phys, NULL, &ext_flags);
7006 	if (ret) {
7007 		mlog_errno(ret);
7008 		goto out;
7009 	}
7010 
7011 	/*
7012 	 * Tail is a hole, or is marked unwritten. In either case, we
7013 	 * can count on read and write to return/push zero's.
7014 	 */
7015 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
7016 		goto out;
7017 
7018 	ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios,
7019 				   &numfolios);
7020 	if (ret) {
7021 		mlog_errno(ret);
7022 		goto out;
7023 	}
7024 
7025 	ocfs2_zero_cluster_folios(inode, range_start, range_end, folios,
7026 				 numfolios, phys, handle);
7027 
7028 	/*
7029 	 * Initiate writeout of the folios we zero'd here. We don't
7030 	 * wait on them - the truncate_inode_pages() call later will
7031 	 * do that for us.
7032 	 */
7033 	ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7034 				       range_end - 1);
7035 	if (ret)
7036 		mlog_errno(ret);
7037 
7038 out:
7039 	kfree(folios);
7040 
7041 	return ret;
7042 }
7043 
7044 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
7045 					     struct ocfs2_dinode *di)
7046 {
7047 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
7048 	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
7049 
7050 	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
7051 		memset(&di->id2, 0, blocksize -
7052 				    offsetof(struct ocfs2_dinode, id2) -
7053 				    xattrsize);
7054 	else
7055 		memset(&di->id2, 0, blocksize -
7056 				    offsetof(struct ocfs2_dinode, id2));
7057 }
7058 
7059 void ocfs2_dinode_new_extent_list(struct inode *inode,
7060 				  struct ocfs2_dinode *di)
7061 {
7062 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7063 	di->id2.i_list.l_tree_depth = 0;
7064 	di->id2.i_list.l_next_free_rec = 0;
7065 	di->id2.i_list.l_count = cpu_to_le16(
7066 		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
7067 }
7068 
7069 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
7070 {
7071 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7072 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7073 
7074 	spin_lock(&oi->ip_lock);
7075 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
7076 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7077 	spin_unlock(&oi->ip_lock);
7078 
7079 	/*
7080 	 * We clear the entire i_data structure here so that all
7081 	 * fields can be properly initialized.
7082 	 */
7083 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7084 
7085 	idata->id_count = cpu_to_le16(
7086 			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
7087 }
7088 
/*
 * Convert an inline-data inode into one with an extent list.
 *
 * If the inode holds data (i_size != 0), one cluster is reserved,
 * charged to quota and claimed; the inline bytes are read into the
 * first page cache folio, which is mapped to the new cluster and
 * dirtied; and a single extent covering that cluster is inserted.
 * In every case OCFS2_INLINE_DATA_FL is cleared and the dinode's id2
 * area is reinitialised as an empty extent list.
 *
 * @inode: inode being converted
 * @di_bh: buffer head holding the inode's dinode
 *
 * Returns 0 on success or a negative error code.  On the failure paths
 * that set need_free, the claimed cluster is given back; the quota
 * charge is undone whenever ret < 0 after it was taken.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, has_data, num_folios = 0;
	int need_free = 0;
	u32 bit_off, num;
	handle_t *handle;
	u64 block;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct folio *folio = NULL;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	has_data = i_size_read(inode) ? 1 : 0;

	/* An empty inline inode needs no cluster at all. */
	if (has_data) {
		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		/* Only the first page, capped at one cluster, is populated. */
		unsigned int page_end = min_t(unsigned, PAGE_SIZE,
							osb->s_clustersize);
		u64 phys;

		ret = dquot_alloc_space_nodirty(inode,
				       ocfs2_clusters_to_bytes(osb->sb, 1));
		if (ret)
			goto out_commit;
		did_quota = 1;

		data_ac->ac_resv = &oi->ip_la_data_resv;

		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_folio() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/* From here on a failure must hand the claimed cluster back. */
		ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio,
					   &num_folios);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, folio, di_bh);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0,
				&phys);
	}

	/* Drop the inline flag in memory and on disk together. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	/* Wipes the id2 area (the old inline bytes) and sets up an empty list. */
	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_unlock:
	if (folio)
		ocfs2_unlock_and_free_folios(&folio, num_folios);

out_commit:
	/* Undo the quota charge if anything after it failed. */
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(inode,
					  ocfs2_clusters_to_bytes(osb->sb, 1));

	/* Return the claimed cluster to wherever it was allocated from. */
	if (need_free) {
		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
					bit_off, num);
		else
			ocfs2_free_clusters(handle,
					data_ac->ac_inode,
					data_ac->ac_bh,
					ocfs2_clusters_to_blocks(osb->sb, bit_off),
					num);
	}

	ocfs2_commit_trans(osb, handle);

out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	return ret;
}
7234 
7235 /*
7236  * It is expected, that by the time you call this function,
7237  * inode->i_size and fe->i_size have been adjusted.
7238  *
7239  * WARNING: This will kfree the truncate context
7240  */
7241 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7242 			  struct inode *inode,
7243 			  struct buffer_head *di_bh)
7244 {
7245 	int status = 0, i, flags = 0;
7246 	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7247 	u64 blkno = 0;
7248 	struct ocfs2_extent_list *el;
7249 	struct ocfs2_extent_rec *rec;
7250 	struct ocfs2_path *path = NULL;
7251 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7252 	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7253 	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7254 	struct ocfs2_extent_tree et;
7255 	struct ocfs2_cached_dealloc_ctxt dealloc;
7256 	struct ocfs2_refcount_tree *ref_tree = NULL;
7257 
7258 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7259 	ocfs2_init_dealloc_ctxt(&dealloc);
7260 
7261 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7262 						     i_size_read(inode));
7263 
7264 	path = ocfs2_new_path(di_bh, &di->id2.i_list,
7265 			      ocfs2_journal_access_di);
7266 	if (!path) {
7267 		status = -ENOMEM;
7268 		mlog_errno(status);
7269 		goto bail;
7270 	}
7271 
7272 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
7273 
7274 start:
7275 	/*
7276 	 * Check that we still have allocation to delete.
7277 	 */
7278 	if (OCFS2_I(inode)->ip_clusters == 0) {
7279 		status = 0;
7280 		goto bail;
7281 	}
7282 
7283 	/*
7284 	 * Truncate always works against the rightmost tree branch.
7285 	 */
7286 	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7287 	if (status) {
7288 		mlog_errno(status);
7289 		goto bail;
7290 	}
7291 
7292 	trace_ocfs2_commit_truncate(
7293 		(unsigned long long)OCFS2_I(inode)->ip_blkno,
7294 		new_highest_cpos,
7295 		OCFS2_I(inode)->ip_clusters,
7296 		path->p_tree_depth);
7297 
7298 	/*
7299 	 * By now, el will point to the extent list on the bottom most
7300 	 * portion of this tree. Only the tail record is considered in
7301 	 * each pass.
7302 	 *
7303 	 * We handle the following cases, in order:
7304 	 * - empty extent: delete the remaining branch
7305 	 * - remove the entire record
7306 	 * - remove a partial record
7307 	 * - no record needs to be removed (truncate has completed)
7308 	 */
7309 	el = path_leaf_el(path);
7310 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
7311 		ocfs2_error(inode->i_sb,
7312 			    "Inode %llu has empty extent block at %llu\n",
7313 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7314 			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
7315 		status = -EROFS;
7316 		goto bail;
7317 	}
7318 
7319 	i = le16_to_cpu(el->l_next_free_rec) - 1;
7320 	rec = &el->l_recs[i];
7321 	flags = rec->e_flags;
7322 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7323 
7324 	if (i == 0 && ocfs2_is_empty_extent(rec)) {
7325 		/*
7326 		 * Lower levels depend on this never happening, but it's best
7327 		 * to check it up here before changing the tree.
7328 		*/
7329 		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7330 			mlog(ML_ERROR, "Inode %llu has an empty "
7331 				    "extent record, depth %u\n", inode->i_ino,
7332 				    le16_to_cpu(root_el->l_tree_depth));
7333 			status = ocfs2_remove_rightmost_empty_extent(osb,
7334 					&et, path, &dealloc);
7335 			if (status) {
7336 				mlog_errno(status);
7337 				goto bail;
7338 			}
7339 
7340 			ocfs2_reinit_path(path, 1);
7341 			goto start;
7342 		} else {
7343 			trunc_cpos = le32_to_cpu(rec->e_cpos);
7344 			trunc_len = 0;
7345 			blkno = 0;
7346 		}
7347 	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7348 		/*
7349 		 * Truncate entire record.
7350 		 */
7351 		trunc_cpos = le32_to_cpu(rec->e_cpos);
7352 		trunc_len = ocfs2_rec_clusters(el, rec);
7353 		blkno = le64_to_cpu(rec->e_blkno);
7354 	} else if (range > new_highest_cpos) {
7355 		/*
7356 		 * Partial truncate. it also should be
7357 		 * the last truncate we're doing.
7358 		 */
7359 		trunc_cpos = new_highest_cpos;
7360 		trunc_len = range - new_highest_cpos;
7361 		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7362 		blkno = le64_to_cpu(rec->e_blkno) +
7363 				ocfs2_clusters_to_blocks(inode->i_sb, coff);
7364 	} else {
7365 		/*
7366 		 * Truncate completed, leave happily.
7367 		 */
7368 		status = 0;
7369 		goto bail;
7370 	}
7371 
7372 	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7373 
7374 	if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
7375 		status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
7376 				&ref_tree, NULL);
7377 		if (status) {
7378 			mlog_errno(status);
7379 			goto bail;
7380 		}
7381 	}
7382 
7383 	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7384 					  phys_cpos, trunc_len, flags, &dealloc,
7385 					  refcount_loc, true);
7386 	if (status < 0) {
7387 		mlog_errno(status);
7388 		goto bail;
7389 	}
7390 
7391 	ocfs2_reinit_path(path, 1);
7392 
7393 	/*
7394 	 * The check above will catch the case where we've truncated
7395 	 * away all allocation.
7396 	 */
7397 	goto start;
7398 
7399 bail:
7400 	if (ref_tree)
7401 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7402 
7403 	ocfs2_schedule_truncate_log_flush(osb, 1);
7404 
7405 	ocfs2_run_deallocs(osb, &dealloc);
7406 
7407 	ocfs2_free_path(path);
7408 
7409 	return status;
7410 }
7411 
7412 /*
7413  * 'start' is inclusive, 'end' is not.
7414  */
7415 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7416 			  unsigned int start, unsigned int end, int trunc)
7417 {
7418 	int ret;
7419 	unsigned int numbytes;
7420 	handle_t *handle;
7421 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7422 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7423 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7424 
7425 	/* No need to punch hole beyond i_size. */
7426 	if (start >= i_size_read(inode))
7427 		return 0;
7428 
7429 	if (end > i_size_read(inode))
7430 		end = i_size_read(inode);
7431 
7432 	BUG_ON(start > end);
7433 
7434 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7435 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7436 	    !ocfs2_supports_inline_data(osb)) {
7437 		ocfs2_error(inode->i_sb,
7438 			    "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7439 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7440 			    le16_to_cpu(di->i_dyn_features),
7441 			    OCFS2_I(inode)->ip_dyn_features,
7442 			    osb->s_feature_incompat);
7443 		ret = -EROFS;
7444 		goto out;
7445 	}
7446 
7447 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7448 	if (IS_ERR(handle)) {
7449 		ret = PTR_ERR(handle);
7450 		mlog_errno(ret);
7451 		goto out;
7452 	}
7453 
7454 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7455 				      OCFS2_JOURNAL_ACCESS_WRITE);
7456 	if (ret) {
7457 		mlog_errno(ret);
7458 		goto out_commit;
7459 	}
7460 
7461 	numbytes = end - start;
7462 	memset(idata->id_data + start, 0, numbytes);
7463 
7464 	/*
7465 	 * No need to worry about the data page here - it's been
7466 	 * truncated already and inline data doesn't need it for
7467 	 * pushing zero's to disk, so we'll let read_folio pick it up
7468 	 * later.
7469 	 */
7470 	if (trunc) {
7471 		i_size_write(inode, start);
7472 		di->i_size = cpu_to_le64(start);
7473 	}
7474 
7475 	inode->i_blocks = ocfs2_inode_sector_count(inode);
7476 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
7477 
7478 	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
7479 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
7480 
7481 	ocfs2_update_inode_fsync_trans(handle, inode, 1);
7482 	ocfs2_journal_dirty(handle, di_bh);
7483 
7484 out_commit:
7485 	ocfs2_commit_trans(osb, handle);
7486 
7487 out:
7488 	return ret;
7489 }
7490 
7491 static int ocfs2_trim_extent(struct super_block *sb,
7492 			     struct ocfs2_group_desc *gd,
7493 			     u64 group, u32 start, u32 count)
7494 {
7495 	u64 discard, bcount;
7496 	struct ocfs2_super *osb = OCFS2_SB(sb);
7497 
7498 	bcount = ocfs2_clusters_to_blocks(sb, count);
7499 	discard = ocfs2_clusters_to_blocks(sb, start);
7500 
7501 	/*
7502 	 * For the first cluster group, the gd->bg_blkno is not at the start
7503 	 * of the group, but at an offset from the start. If we add it while
7504 	 * calculating discard for first group, we will wrongly start fstrim a
7505 	 * few blocks after the desried start block and the range can cross
7506 	 * over into the next cluster group. So, add it only if this is not
7507 	 * the first cluster group.
7508 	 */
7509 	if (group != osb->first_cluster_group_blkno)
7510 		discard += le64_to_cpu(gd->bg_blkno);
7511 
7512 	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7513 
7514 	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7515 }
7516 
7517 static int ocfs2_trim_group(struct super_block *sb,
7518 			    struct ocfs2_group_desc *gd, u64 group,
7519 			    u32 start, u32 max, u32 minbits)
7520 {
7521 	int ret = 0, count = 0, next;
7522 	void *bitmap = gd->bg_bitmap;
7523 
7524 	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7525 		return 0;
7526 
7527 	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7528 			       start, max, minbits);
7529 
7530 	while (start < max) {
7531 		start = ocfs2_find_next_zero_bit(bitmap, max, start);
7532 		if (start >= max)
7533 			break;
7534 		next = ocfs2_find_next_bit(bitmap, max, start);
7535 
7536 		if ((next - start) >= minbits) {
7537 			ret = ocfs2_trim_extent(sb, gd, group,
7538 						start, next - start);
7539 			if (ret < 0) {
7540 				mlog_errno(ret);
7541 				break;
7542 			}
7543 			count += next - start;
7544 		}
7545 		start = next + 1;
7546 
7547 		if (fatal_signal_pending(current)) {
7548 			count = -ERESTARTSYS;
7549 			break;
7550 		}
7551 
7552 		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7553 			break;
7554 	}
7555 
7556 	if (ret < 0)
7557 		count = ret;
7558 
7559 	return count;
7560 }
7561 
7562 static
7563 int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
7564 {
7565 	struct ocfs2_super *osb = OCFS2_SB(sb);
7566 	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
7567 	int ret, cnt;
7568 	u32 first_bit, last_bit, minlen;
7569 	struct buffer_head *main_bm_bh = NULL;
7570 	struct inode *main_bm_inode = NULL;
7571 	struct buffer_head *gd_bh = NULL;
7572 	struct ocfs2_dinode *main_bm;
7573 	struct ocfs2_group_desc *gd = NULL;
7574 
7575 	start = range->start >> osb->s_clustersize_bits;
7576 	len = range->len >> osb->s_clustersize_bits;
7577 	minlen = range->minlen >> osb->s_clustersize_bits;
7578 
7579 	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7580 		return -EINVAL;
7581 
7582 	trace_ocfs2_trim_mainbm(start, len, minlen);
7583 
7584 next_group:
7585 	main_bm_inode = ocfs2_get_system_file_inode(osb,
7586 						    GLOBAL_BITMAP_SYSTEM_INODE,
7587 						    OCFS2_INVALID_SLOT);
7588 	if (!main_bm_inode) {
7589 		ret = -EIO;
7590 		mlog_errno(ret);
7591 		goto out;
7592 	}
7593 
7594 	inode_lock(main_bm_inode);
7595 
7596 	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7597 	if (ret < 0) {
7598 		mlog_errno(ret);
7599 		goto out_mutex;
7600 	}
7601 	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7602 
7603 	/*
7604 	 * Do some check before trim the first group.
7605 	 */
7606 	if (!group) {
7607 		if (start >= le32_to_cpu(main_bm->i_clusters)) {
7608 			ret = -EINVAL;
7609 			goto out_unlock;
7610 		}
7611 
7612 		if (start + len > le32_to_cpu(main_bm->i_clusters))
7613 			len = le32_to_cpu(main_bm->i_clusters) - start;
7614 
7615 		/*
7616 		 * Determine first and last group to examine based on
7617 		 * start and len
7618 		 */
7619 		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7620 		if (first_group == osb->first_cluster_group_blkno)
7621 			first_bit = start;
7622 		else
7623 			first_bit = start - ocfs2_blocks_to_clusters(sb,
7624 								first_group);
7625 		last_group = ocfs2_which_cluster_group(main_bm_inode,
7626 						       start + len - 1);
7627 		group = first_group;
7628 	}
7629 
7630 	do {
7631 		if (first_bit + len >= osb->bitmap_cpg)
7632 			last_bit = osb->bitmap_cpg;
7633 		else
7634 			last_bit = first_bit + len;
7635 
7636 		ret = ocfs2_read_group_descriptor(main_bm_inode,
7637 						  main_bm, group,
7638 						  &gd_bh);
7639 		if (ret < 0) {
7640 			mlog_errno(ret);
7641 			break;
7642 		}
7643 
7644 		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7645 		cnt = ocfs2_trim_group(sb, gd, group,
7646 				       first_bit, last_bit, minlen);
7647 		brelse(gd_bh);
7648 		gd_bh = NULL;
7649 		if (cnt < 0) {
7650 			ret = cnt;
7651 			mlog_errno(ret);
7652 			break;
7653 		}
7654 
7655 		trimmed += cnt;
7656 		len -= osb->bitmap_cpg - first_bit;
7657 		first_bit = 0;
7658 		if (group == osb->first_cluster_group_blkno)
7659 			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7660 		else
7661 			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7662 	} while (0);
7663 
7664 out_unlock:
7665 	ocfs2_inode_unlock(main_bm_inode, 0);
7666 	brelse(main_bm_bh);
7667 	main_bm_bh = NULL;
7668 out_mutex:
7669 	inode_unlock(main_bm_inode);
7670 	iput(main_bm_inode);
7671 
7672 	/*
7673 	 * If all the groups trim are not done or failed, but we should release
7674 	 * main_bm related locks for avoiding the current IO starve, then go to
7675 	 * trim the next group
7676 	 */
7677 	if (ret >= 0 && group <= last_group) {
7678 		cond_resched();
7679 		goto next_group;
7680 	}
7681 out:
7682 	range->len = trimmed * osb->s_clustersize;
7683 	return ret;
7684 }
7685 
7686 int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7687 {
7688 	int ret;
7689 	struct ocfs2_super *osb = OCFS2_SB(sb);
7690 	struct ocfs2_trim_fs_info info, *pinfo = NULL;
7691 
7692 	ocfs2_trim_fs_lock_res_init(osb);
7693 
7694 	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
7695 
7696 	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7697 	if (ret < 0) {
7698 		if (ret != -EAGAIN) {
7699 			mlog_errno(ret);
7700 			ocfs2_trim_fs_lock_res_uninit(osb);
7701 			return ret;
7702 		}
7703 
7704 		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7705 		     "finish, which is running from another node.\n",
7706 		     osb->dev_str);
7707 		ret = ocfs2_trim_fs_lock(osb, &info, 0);
7708 		if (ret < 0) {
7709 			mlog_errno(ret);
7710 			ocfs2_trim_fs_lock_res_uninit(osb);
7711 			return ret;
7712 		}
7713 
7714 		if (info.tf_valid && info.tf_success &&
7715 		    info.tf_start == range->start &&
7716 		    info.tf_len == range->len &&
7717 		    info.tf_minlen == range->minlen) {
7718 			/* Avoid sending duplicated trim to a shared device */
7719 			mlog(ML_NOTICE, "The same trim on device (%s) was "
7720 			     "just done from node (%u), return.\n",
7721 			     osb->dev_str, info.tf_nodenum);
7722 			range->len = info.tf_trimlen;
7723 			goto out;
7724 		}
7725 	}
7726 
7727 	info.tf_nodenum = osb->node_num;
7728 	info.tf_start = range->start;
7729 	info.tf_len = range->len;
7730 	info.tf_minlen = range->minlen;
7731 
7732 	ret = ocfs2_trim_mainbm(sb, range);
7733 
7734 	info.tf_trimlen = range->len;
7735 	info.tf_success = (ret < 0 ? 0 : 1);
7736 	pinfo = &info;
7737 out:
7738 	ocfs2_trim_fs_unlock(osb, pinfo);
7739 	ocfs2_trim_fs_lock_res_uninit(osb);
7740 	return ret;
7741 }
7742