xref: /linux/fs/ext4/extents.c (revision a1087ef6abedf0bfd60e5e3fddf33192cb2c1325)
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
 18  * You should have received a copy of the GNU General Public License
 19  * along with this program; if not, write to the Free Software
 20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  */
22 
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31 
32 #include <linux/module.h>
33 #include <linux/fs.h>
34 #include <linux/time.h>
35 #include <linux/jbd2.h>
36 #include <linux/highuid.h>
37 #include <linux/pagemap.h>
38 #include <linux/quotaops.h>
39 #include <linux/string.h>
40 #include <linux/slab.h>
41 #include <linux/falloc.h>
42 #include <asm/uaccess.h>
43 #include <linux/fiemap.h>
44 #include "ext4_jbd2.h"
45 #include "ext4_extents.h"
46 
47 static int ext4_ext_truncate_extend_restart(handle_t *handle,
48 					    struct inode *inode,
49 					    int needed)
50 {
51 	int err;
52 
53 	if (!ext4_handle_valid(handle))
54 		return 0;
55 	if (handle->h_buffer_credits > needed)
56 		return 0;
57 	err = ext4_journal_extend(handle, needed);
58 	if (err <= 0)
59 		return err;
60 	err = ext4_truncate_restart_trans(handle, inode, needed);
61 	if (err == 0)
62 		err = -EAGAIN;
63 
64 	return err;
65 }
66 
67 /*
68  * could return:
69  *  - EROFS
70  *  - ENOMEM
71  */
72 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
73 				struct ext4_ext_path *path)
74 {
75 	if (path->p_bh) {
76 		/* path points to block */
77 		return ext4_journal_get_write_access(handle, path->p_bh);
78 	}
79 	/* path points to leaf/index in inode body */
80 	/* we use in-core data, no need to protect them */
81 	return 0;
82 }
83 
84 /*
85  * could return:
86  *  - EROFS
87  *  - ENOMEM
88  *  - EIO
89  */
90 static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
91 				struct ext4_ext_path *path)
92 {
93 	int err;
94 	if (path->p_bh) {
95 		/* path points to block */
96 		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
97 	} else {
98 		/* path points to leaf/index in inode body */
99 		err = ext4_mark_inode_dirty(handle, inode);
100 	}
101 	return err;
102 }
103 
/*
 * ext4_ext_find_goal:
 * Pick a goal physical block for an allocation at logical block
 * @block of @inode, using @path (may be NULL) as a placement hint.
 *
 * Preference order: right after the extent found in @path, then the
 * index block holding that (empty) leaf, then a heuristic based on
 * the inode's block group with a per-pid "colour" offset (skipped
 * under delayed allocation, where placement is decided later).
 */
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/* try to predict block placement */
		ex = path[depth].p_ext;
		if (ex)
			return (ext4_ext_pblock(ex) +
				(block - le32_to_cpu(ex->ee_block)));

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size-1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	/* spread allocations of different processes across the group */
	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		/* last (partial) group: scale the colour range down */
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}
164 
165 /*
166  * Allocation for a meta data block
167  */
168 static ext4_fsblk_t
169 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
170 			struct ext4_ext_path *path,
171 			struct ext4_extent *ex, int *err)
172 {
173 	ext4_fsblk_t goal, newblock;
174 
175 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
176 	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
177 	return newblock;
178 }
179 
180 static inline int ext4_ext_space_block(struct inode *inode, int check)
181 {
182 	int size;
183 
184 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
185 			/ sizeof(struct ext4_extent);
186 	if (!check) {
187 #ifdef AGGRESSIVE_TEST
188 		if (size > 6)
189 			size = 6;
190 #endif
191 	}
192 	return size;
193 }
194 
195 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
196 {
197 	int size;
198 
199 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
200 			/ sizeof(struct ext4_extent_idx);
201 	if (!check) {
202 #ifdef AGGRESSIVE_TEST
203 		if (size > 5)
204 			size = 5;
205 #endif
206 	}
207 	return size;
208 }
209 
210 static inline int ext4_ext_space_root(struct inode *inode, int check)
211 {
212 	int size;
213 
214 	size = sizeof(EXT4_I(inode)->i_data);
215 	size -= sizeof(struct ext4_extent_header);
216 	size /= sizeof(struct ext4_extent);
217 	if (!check) {
218 #ifdef AGGRESSIVE_TEST
219 		if (size > 3)
220 			size = 3;
221 #endif
222 	}
223 	return size;
224 }
225 
226 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
227 {
228 	int size;
229 
230 	size = sizeof(EXT4_I(inode)->i_data);
231 	size -= sizeof(struct ext4_extent_header);
232 	size /= sizeof(struct ext4_extent_idx);
233 	if (!check) {
234 #ifdef AGGRESSIVE_TEST
235 		if (size > 4)
236 			size = 4;
237 #endif
238 	}
239 	return size;
240 }
241 
/*
 * Calculate the number of metadata blocks needed
 * to reserve for the delayed-allocation block @lblock.
 * Worst case is one index block per extent tree level.
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;

	/* index entries that fit in one index block */
	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			/* three levels filled: restart run accounting */
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}
286 
/*
 * Maximum number of entries a node at @depth may claim.  The root
 * (stored in the inode body) and on-disk blocks have different
 * capacities, as do leaf and index nodes.
 */
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	if (depth == ext_depth(inode))
		return depth == 0 ? ext4_ext_space_root(inode, 1)
				  : ext4_ext_space_root_idx(inode, 1);

	return depth == 0 ? ext4_ext_space_block(inode, 1)
			  : ext4_ext_space_block_idx(inode, 1);
}
306 
307 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
308 {
309 	ext4_fsblk_t block = ext4_ext_pblock(ext);
310 	int len = ext4_ext_get_actual_len(ext);
311 
312 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
313 }
314 
315 static int ext4_valid_extent_idx(struct inode *inode,
316 				struct ext4_extent_idx *ext_idx)
317 {
318 	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
319 
320 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
321 }
322 
323 static int ext4_valid_extent_entries(struct inode *inode,
324 				struct ext4_extent_header *eh,
325 				int depth)
326 {
327 	struct ext4_extent *ext;
328 	struct ext4_extent_idx *ext_idx;
329 	unsigned short entries;
330 	if (eh->eh_entries == 0)
331 		return 1;
332 
333 	entries = le16_to_cpu(eh->eh_entries);
334 
335 	if (depth == 0) {
336 		/* leaf entries */
337 		ext = EXT_FIRST_EXTENT(eh);
338 		while (entries) {
339 			if (!ext4_valid_extent(inode, ext))
340 				return 0;
341 			ext++;
342 			entries--;
343 		}
344 	} else {
345 		ext_idx = EXT_FIRST_INDEX(eh);
346 		while (entries) {
347 			if (!ext4_valid_extent_idx(inode, ext_idx))
348 				return 0;
349 			ext_idx++;
350 			entries--;
351 		}
352 	}
353 	return 1;
354 }
355 
/*
 * __ext4_ext_check:
 * Validate an extent tree node header @eh claimed to sit at @depth in
 * @inode's tree.  Returns 0 when sane; reports the corruption against
 * @inode (tagged with the caller's @function/@line) and returns -EIO
 * otherwise.  Use via the ext4_ext_check() wrapper.
 */
static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	/* eh_max may not exceed what physically fits in this node */
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	/* finally validate each entry's physical block range */
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			"bad header/extent: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}

#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
403 
/*
 * Validate the extent tree root stored directly in @inode's body.
 * Returns 0 if sane, -EIO if corrupted.
 */
int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}
408 
#ifdef EXT_DEBUG
/* Dump every level of @path: the index or extent chosen at each depth. */
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
			    ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug("  %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_uninitialized(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug("  []");
	}
	ext_debug("\n");
}

/* Dump every extent in the leaf that @path points at. */
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#endif
457 
458 void ext4_ext_drop_refs(struct ext4_ext_path *path)
459 {
460 	int depth = path->p_depth;
461 	int i;
462 
463 	for (i = 0; i <= depth; i++, path++)
464 		if (path->p_bh) {
465 			brelse(path->p_bh);
466 			path->p_bh = NULL;
467 		}
468 }
469 
/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 *
 * On return path->p_idx points at the last index entry whose
 * ei_block is <= @block, i.e. the subtree that may contain @block.
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx):  ", block);

	/* search [first+1, last]: the first entry is always a candidate */
	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	/* l is the first entry with ei_block > block; answer is l - 1 */
	path->p_idx = l - 1;
	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		/* cross-check the result against a linear scan */
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
		  if (k != 0 &&
		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}
529 
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 *
 * On return path->p_ext points at the last extent whose ee_block is
 * <= @block; it is left untouched when the leaf is empty.
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	/* search [first+1, last]: the first extent is always a candidate */
	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	/* l is the first extent with ee_block > block; answer is l - 1 */
	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		/* cross-check the result against a linear scan */
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}
591 
592 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
593 {
594 	struct ext4_extent_header *eh;
595 
596 	eh = ext_inode_hdr(inode);
597 	eh->eh_depth = 0;
598 	eh->eh_entries = 0;
599 	eh->eh_magic = EXT4_EXT_MAGIC;
600 	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
601 	ext4_mark_inode_dirty(handle, inode);
602 	ext4_ext_invalidate_cache(inode);
603 	return 0;
604 }
605 
/*
 * ext4_ext_find_extent:
 * Walk @inode's extent tree from the root down to the leaf that may
 * contain logical block @block, recording one ext4_ext_path entry per
 * level.  @path may be a caller-supplied array to reuse (it must have
 * room for depth + 2 entries, to allow for a later depth increase) or
 * NULL, in which case one is allocated here and freed again on error.
 *
 * Returns the filled path array, or an ERR_PTR on failure.
 */
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		int need_to_validate = 0;

		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		/* pick the child index covering @block at this level */
		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			/* only blocks freshly read from disk need checking */
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
			/* validate the extent entries */
			need_to_validate = 1;
		}
		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			goto err;
		}
		/* path now owns the bh reference; dropped via drop_refs */
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (need_to_validate && ext4_ext_check(inode, eh, i))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}
688 
689 /*
690  * ext4_ext_insert_index:
691  * insert new index [@logical;@ptr] into the block at @curp;
692  * check where to insert: before @curp or after @curp
693  */
694 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
695 				 struct ext4_ext_path *curp,
696 				 int logical, ext4_fsblk_t ptr)
697 {
698 	struct ext4_extent_idx *ix;
699 	int len, err;
700 
701 	err = ext4_ext_get_access(handle, inode, curp);
702 	if (err)
703 		return err;
704 
705 	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
706 		EXT4_ERROR_INODE(inode,
707 				 "logical %d == ei_block %d!",
708 				 logical, le32_to_cpu(curp->p_idx->ei_block));
709 		return -EIO;
710 	}
711 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
712 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
713 		/* insert after */
714 		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
715 			len = (len - 1) * sizeof(struct ext4_extent_idx);
716 			len = len < 0 ? 0 : len;
717 			ext_debug("insert new index %d after: %llu. "
718 					"move %d from 0x%p to 0x%p\n",
719 					logical, ptr, len,
720 					(curp->p_idx + 1), (curp->p_idx + 2));
721 			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
722 		}
723 		ix = curp->p_idx + 1;
724 	} else {
725 		/* insert before */
726 		len = len * sizeof(struct ext4_extent_idx);
727 		len = len < 0 ? 0 : len;
728 		ext_debug("insert new index %d before: %llu. "
729 				"move %d from 0x%p to 0x%p\n",
730 				logical, ptr, len,
731 				curp->p_idx, (curp->p_idx + 1));
732 		memmove(curp->p_idx + 1, curp->p_idx, len);
733 		ix = curp->p_idx;
734 	}
735 
736 	ix->ei_block = cpu_to_le32(logical);
737 	ext4_idx_store_pblock(ix, ptr);
738 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
739 
740 	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
741 			     > le16_to_cpu(curp->p_hdr->eh_max))) {
742 		EXT4_ERROR_INODE(inode,
743 				 "logical %d == ei_block %d!",
744 				 logical, le32_to_cpu(curp->p_idx->ei_block));
745 		return -EIO;
746 	}
747 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
748 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
749 		return -EIO;
750 	}
751 
752 	err = ext4_ext_dirty(handle, inode, curp);
753 	ext4_std_error(inode->i_sb, err);
754 
755 	return err;
756 }
757 
/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 *
 * Returns 0 on success or a negative error code; on error all blocks
 * allocated here are freed again before returning.
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct ext4_extent *ex;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If error occurs, then we break processing
	 * and mark filesystem read-only. index won't
	 * be inserted and tree will be in consistent
	 * state. Next mount will repair buffers too.
	 */

	/*
	 * Get array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	ex = EXT_FIRST_EXTENT(neh);

	/* move remainder of path[depth] to the new leaf */
	/* NOTE(review): a split is only requested for a completely full
	 * leaf, so eh_entries == eh_max is the expected state here */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	/* TODO: we could do it by single memmove */
	m = 0;
	path[depth].p_ext++;
	/* count (m) how many extents follow the split point */
	while (path[depth].p_ext <=
			EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(path[depth].p_ext->ee_block),
				ext4_ext_pblock(path[depth].p_ext),
				ext4_ext_is_uninitialized(path[depth].p_ext),
				ext4_ext_get_actual_len(path[depth].p_ext),
				newblock);
		/*memmove(ex++, path[depth].p_ext++,
				sizeof(struct ext4_extent));
		neh->eh_entries++;*/
		path[depth].p_ext++;
		m++;
	}
	if (m) {
		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		/* each new index block points at the node built just below */
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);
		/* copy indexes */
		m = 0;
		path[i].p_idx++;

		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		/* count (m) how many index entries follow the split point */
		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", i,
					le32_to_cpu(path[i].p_idx->ei_block),
					ext4_idx_pblock(path[i].p_idx),
					newblock);
			/*memmove(++fidx, path[i].p_idx++,
					sizeof(struct ext4_extent_idx));
			neh->eh_entries++;
			BUG_ON(neh->eh_entries > neh->eh_max);*/
			path[i].p_idx++;
			m++;
		}
		if (m) {
			memmove(++fidx, path[i].p_idx - m,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}
1023 
/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate e_max right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	/* the sole index entry covers the whole old tree: take its
	 * starting logical block from the old root's first entry */
	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}
1110 
1111 /*
1112  * ext4_ext_create_new_leaf:
1113  * finds empty index and adds new leaf.
1114  * if no free index is found, then it requests in-depth growing.
1115  */
1116 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1117 					struct ext4_ext_path *path,
1118 					struct ext4_extent *newext)
1119 {
1120 	struct ext4_ext_path *curp;
1121 	int depth, i, err = 0;
1122 
1123 repeat:
1124 	i = depth = ext_depth(inode);
1125 
1126 	/* walk up to the tree and look for free index entry */
1127 	curp = path + depth;
1128 	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1129 		i--;
1130 		curp--;
1131 	}
1132 
1133 	/* we use already allocated block for index block,
1134 	 * so subsequent data blocks should be contiguous */
1135 	if (EXT_HAS_FREE_INDEX(curp)) {
1136 		/* if we found index with free entry, then use that
1137 		 * entry: create all needed subtree and add new leaf */
1138 		err = ext4_ext_split(handle, inode, path, newext, i);
1139 		if (err)
1140 			goto out;
1141 
1142 		/* refill path */
1143 		ext4_ext_drop_refs(path);
1144 		path = ext4_ext_find_extent(inode,
1145 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1146 				    path);
1147 		if (IS_ERR(path))
1148 			err = PTR_ERR(path);
1149 	} else {
1150 		/* tree is full, time to grow in depth */
1151 		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1152 		if (err)
1153 			goto out;
1154 
1155 		/* refill path */
1156 		ext4_ext_drop_refs(path);
1157 		path = ext4_ext_find_extent(inode,
1158 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1159 				    path);
1160 		if (IS_ERR(path)) {
1161 			err = PTR_ERR(path);
1162 			goto out;
1163 		}
1164 
1165 		/*
1166 		 * only first (depth 0 -> 1) produces free space;
1167 		 * in all other cases we have to split the grown tree
1168 		 */
1169 		depth = ext_depth(inode);
1170 		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1171 			/* now we need to split */
1172 			goto repeat;
1173 		}
1174 	}
1175 
1176 out:
1177 	return err;
1178 }
1179 
1180 /*
1181  * search the closest allocated block to the left for *logical
1182  * and returns it at @logical + it's physical address at @phys
1183  * if *logical is the smallest allocated block, the function
1184  * returns 0 at @phys
1185  * return value contains 0 (success) or error code
1186  */
1187 static int ext4_ext_search_left(struct inode *inode,
1188 				struct ext4_ext_path *path,
1189 				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1190 {
1191 	struct ext4_extent_idx *ix;
1192 	struct ext4_extent *ex;
1193 	int depth, ee_len;
1194 
1195 	if (unlikely(path == NULL)) {
1196 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1197 		return -EIO;
1198 	}
1199 	depth = path->p_depth;
1200 	*phys = 0;
1201 
1202 	if (depth == 0 && path->p_ext == NULL)
1203 		return 0;
1204 
1205 	/* usually extent in the path covers blocks smaller
1206 	 * then *logical, but it can be that extent is the
1207 	 * first one in the file */
1208 
1209 	ex = path[depth].p_ext;
1210 	ee_len = ext4_ext_get_actual_len(ex);
1211 	if (*logical < le32_to_cpu(ex->ee_block)) {
1212 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1213 			EXT4_ERROR_INODE(inode,
1214 					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1215 					 *logical, le32_to_cpu(ex->ee_block));
1216 			return -EIO;
1217 		}
1218 		while (--depth >= 0) {
1219 			ix = path[depth].p_idx;
1220 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1221 				EXT4_ERROR_INODE(inode,
1222 				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1223 				  ix != NULL ? ix->ei_block : 0,
1224 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1225 				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1226 				  depth);
1227 				return -EIO;
1228 			}
1229 		}
1230 		return 0;
1231 	}
1232 
1233 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1234 		EXT4_ERROR_INODE(inode,
1235 				 "logical %d < ee_block %d + ee_len %d!",
1236 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1237 		return -EIO;
1238 	}
1239 
1240 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1241 	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1242 	return 0;
1243 }
1244 
1245 /*
1246  * search the closest allocated block to the right for *logical
1247  * and returns it at @logical + it's physical address at @phys
1248  * if *logical is the smallest allocated block, the function
1249  * returns 0 at @phys
1250  * return value contains 0 (success) or error code
1251  */
1252 static int ext4_ext_search_right(struct inode *inode,
1253 				 struct ext4_ext_path *path,
1254 				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1255 {
1256 	struct buffer_head *bh = NULL;
1257 	struct ext4_extent_header *eh;
1258 	struct ext4_extent_idx *ix;
1259 	struct ext4_extent *ex;
1260 	ext4_fsblk_t block;
1261 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1262 	int ee_len;
1263 
1264 	if (unlikely(path == NULL)) {
1265 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1266 		return -EIO;
1267 	}
1268 	depth = path->p_depth;
1269 	*phys = 0;
1270 
1271 	if (depth == 0 && path->p_ext == NULL)
1272 		return 0;
1273 
1274 	/* usually extent in the path covers blocks smaller
1275 	 * then *logical, but it can be that extent is the
1276 	 * first one in the file */
1277 
1278 	ex = path[depth].p_ext;
1279 	ee_len = ext4_ext_get_actual_len(ex);
1280 	if (*logical < le32_to_cpu(ex->ee_block)) {
1281 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1282 			EXT4_ERROR_INODE(inode,
1283 					 "first_extent(path[%d].p_hdr) != ex",
1284 					 depth);
1285 			return -EIO;
1286 		}
1287 		while (--depth >= 0) {
1288 			ix = path[depth].p_idx;
1289 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1290 				EXT4_ERROR_INODE(inode,
1291 						 "ix != EXT_FIRST_INDEX *logical %d!",
1292 						 *logical);
1293 				return -EIO;
1294 			}
1295 		}
1296 		*logical = le32_to_cpu(ex->ee_block);
1297 		*phys = ext4_ext_pblock(ex);
1298 		return 0;
1299 	}
1300 
1301 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1302 		EXT4_ERROR_INODE(inode,
1303 				 "logical %d < ee_block %d + ee_len %d!",
1304 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1305 		return -EIO;
1306 	}
1307 
1308 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1309 		/* next allocated block in this leaf */
1310 		ex++;
1311 		*logical = le32_to_cpu(ex->ee_block);
1312 		*phys = ext4_ext_pblock(ex);
1313 		return 0;
1314 	}
1315 
1316 	/* go up and search for index to the right */
1317 	while (--depth >= 0) {
1318 		ix = path[depth].p_idx;
1319 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1320 			goto got_index;
1321 	}
1322 
1323 	/* we've gone up to the root and found no index to the right */
1324 	return 0;
1325 
1326 got_index:
1327 	/* we've found index to the right, let's
1328 	 * follow it and find the closest allocated
1329 	 * block to the right */
1330 	ix++;
1331 	block = ext4_idx_pblock(ix);
1332 	while (++depth < path->p_depth) {
1333 		bh = sb_bread(inode->i_sb, block);
1334 		if (bh == NULL)
1335 			return -EIO;
1336 		eh = ext_block_hdr(bh);
1337 		/* subtract from p_depth to get proper eh_depth */
1338 		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1339 			put_bh(bh);
1340 			return -EIO;
1341 		}
1342 		ix = EXT_FIRST_INDEX(eh);
1343 		block = ext4_idx_pblock(ix);
1344 		put_bh(bh);
1345 	}
1346 
1347 	bh = sb_bread(inode->i_sb, block);
1348 	if (bh == NULL)
1349 		return -EIO;
1350 	eh = ext_block_hdr(bh);
1351 	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1352 		put_bh(bh);
1353 		return -EIO;
1354 	}
1355 	ex = EXT_FIRST_EXTENT(eh);
1356 	*logical = le32_to_cpu(ex->ee_block);
1357 	*phys = ext4_ext_pblock(ex);
1358 	put_bh(bh);
1359 	return 0;
1360 }
1361 
1362 /*
1363  * ext4_ext_next_allocated_block:
1364  * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1365  * NOTE: it considers block number from index entry as
1366  * allocated block. Thus, index entries have to be consistent
1367  * with leaves.
1368  */
1369 static ext4_lblk_t
1370 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1371 {
1372 	int depth;
1373 
1374 	BUG_ON(path == NULL);
1375 	depth = path->p_depth;
1376 
1377 	if (depth == 0 && path->p_ext == NULL)
1378 		return EXT_MAX_BLOCK;
1379 
1380 	while (depth >= 0) {
1381 		if (depth == path->p_depth) {
1382 			/* leaf */
1383 			if (path[depth].p_ext !=
1384 					EXT_LAST_EXTENT(path[depth].p_hdr))
1385 			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1386 		} else {
1387 			/* index */
1388 			if (path[depth].p_idx !=
1389 					EXT_LAST_INDEX(path[depth].p_hdr))
1390 			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1391 		}
1392 		depth--;
1393 	}
1394 
1395 	return EXT_MAX_BLOCK;
1396 }
1397 
1398 /*
1399  * ext4_ext_next_leaf_block:
1400  * returns first allocated block from next leaf or EXT_MAX_BLOCK
1401  */
1402 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1403 					struct ext4_ext_path *path)
1404 {
1405 	int depth;
1406 
1407 	BUG_ON(path == NULL);
1408 	depth = path->p_depth;
1409 
1410 	/* zero-tree has no leaf blocks at all */
1411 	if (depth == 0)
1412 		return EXT_MAX_BLOCK;
1413 
1414 	/* go to index block */
1415 	depth--;
1416 
1417 	while (depth >= 0) {
1418 		if (path[depth].p_idx !=
1419 				EXT_LAST_INDEX(path[depth].p_hdr))
1420 			return (ext4_lblk_t)
1421 				le32_to_cpu(path[depth].p_idx[1].ei_block);
1422 		depth--;
1423 	}
1424 
1425 	return EXT_MAX_BLOCK;
1426 }
1427 
1428 /*
1429  * ext4_ext_correct_indexes:
1430  * if leaf gets modified and modified extent is first in the leaf,
1431  * then we have to correct all indexes above.
1432  * TODO: do we need to correct tree in all cases?
1433  */
1434 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1435 				struct ext4_ext_path *path)
1436 {
1437 	struct ext4_extent_header *eh;
1438 	int depth = ext_depth(inode);
1439 	struct ext4_extent *ex;
1440 	__le32 border;
1441 	int k, err = 0;
1442 
1443 	eh = path[depth].p_hdr;
1444 	ex = path[depth].p_ext;
1445 
1446 	if (unlikely(ex == NULL || eh == NULL)) {
1447 		EXT4_ERROR_INODE(inode,
1448 				 "ex %p == NULL or eh %p == NULL", ex, eh);
1449 		return -EIO;
1450 	}
1451 
1452 	if (depth == 0) {
1453 		/* there is no tree at all */
1454 		return 0;
1455 	}
1456 
1457 	if (ex != EXT_FIRST_EXTENT(eh)) {
1458 		/* we correct tree if first leaf got modified only */
1459 		return 0;
1460 	}
1461 
1462 	/*
1463 	 * TODO: we need correction if border is smaller than current one
1464 	 */
1465 	k = depth - 1;
1466 	border = path[depth].p_ext->ee_block;
1467 	err = ext4_ext_get_access(handle, inode, path + k);
1468 	if (err)
1469 		return err;
1470 	path[k].p_idx->ei_block = border;
1471 	err = ext4_ext_dirty(handle, inode, path + k);
1472 	if (err)
1473 		return err;
1474 
1475 	while (k--) {
1476 		/* change all left-side indexes */
1477 		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1478 			break;
1479 		err = ext4_ext_get_access(handle, inode, path + k);
1480 		if (err)
1481 			break;
1482 		path[k].p_idx->ei_block = border;
1483 		err = ext4_ext_dirty(handle, inode, path + k);
1484 		if (err)
1485 			break;
1486 	}
1487 
1488 	return err;
1489 }
1490 
1491 int
1492 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1493 				struct ext4_extent *ex2)
1494 {
1495 	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1496 
1497 	/*
1498 	 * Make sure that either both extents are uninitialized, or
1499 	 * both are _not_.
1500 	 */
1501 	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1502 		return 0;
1503 
1504 	if (ext4_ext_is_uninitialized(ex1))
1505 		max_len = EXT_UNINIT_MAX_LEN;
1506 	else
1507 		max_len = EXT_INIT_MAX_LEN;
1508 
1509 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1510 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1511 
1512 	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1513 			le32_to_cpu(ex2->ee_block))
1514 		return 0;
1515 
1516 	/*
1517 	 * To allow future support for preallocated extents to be added
1518 	 * as an RO_COMPAT feature, refuse to merge to extents if
1519 	 * this can result in the top bit of ee_len being set.
1520 	 */
1521 	if (ext1_ee_len + ext2_ee_len > max_len)
1522 		return 0;
1523 #ifdef AGGRESSIVE_TEST
1524 	if (ext1_ee_len >= 4)
1525 		return 0;
1526 #endif
1527 
1528 	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1529 		return 1;
1530 	return 0;
1531 }
1532 
1533 /*
1534  * This function tries to merge the "ex" extent to the next extent in the tree.
1535  * It always tries to merge towards right. If you want to merge towards
1536  * left, pass "ex - 1" as argument instead of "ex".
1537  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1538  * 1 if they got merged.
1539  */
1540 static int ext4_ext_try_to_merge(struct inode *inode,
1541 				 struct ext4_ext_path *path,
1542 				 struct ext4_extent *ex)
1543 {
1544 	struct ext4_extent_header *eh;
1545 	unsigned int depth, len;
1546 	int merge_done = 0;
1547 	int uninitialized = 0;
1548 
1549 	depth = ext_depth(inode);
1550 	BUG_ON(path[depth].p_hdr == NULL);
1551 	eh = path[depth].p_hdr;
1552 
1553 	while (ex < EXT_LAST_EXTENT(eh)) {
1554 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1555 			break;
1556 		/* merge with next extent! */
1557 		if (ext4_ext_is_uninitialized(ex))
1558 			uninitialized = 1;
1559 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1560 				+ ext4_ext_get_actual_len(ex + 1));
1561 		if (uninitialized)
1562 			ext4_ext_mark_uninitialized(ex);
1563 
1564 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1565 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1566 				* sizeof(struct ext4_extent);
1567 			memmove(ex + 1, ex + 2, len);
1568 		}
1569 		le16_add_cpu(&eh->eh_entries, -1);
1570 		merge_done = 1;
1571 		WARN_ON(eh->eh_entries == 0);
1572 		if (!eh->eh_entries)
1573 			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1574 	}
1575 
1576 	return merge_done;
1577 }
1578 
1579 /*
1580  * check if a portion of the "newext" extent overlaps with an
1581  * existing extent.
1582  *
1583  * If there is an overlap discovered, it updates the length of the newext
1584  * such that there will be no overlap, and then returns 1.
1585  * If there is no overlap found, it returns 0.
1586  */
1587 static unsigned int ext4_ext_check_overlap(struct inode *inode,
1588 					   struct ext4_extent *newext,
1589 					   struct ext4_ext_path *path)
1590 {
1591 	ext4_lblk_t b1, b2;
1592 	unsigned int depth, len1;
1593 	unsigned int ret = 0;
1594 
1595 	b1 = le32_to_cpu(newext->ee_block);
1596 	len1 = ext4_ext_get_actual_len(newext);
1597 	depth = ext_depth(inode);
1598 	if (!path[depth].p_ext)
1599 		goto out;
1600 	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1601 
1602 	/*
1603 	 * get the next allocated block if the extent in the path
1604 	 * is before the requested block(s)
1605 	 */
1606 	if (b2 < b1) {
1607 		b2 = ext4_ext_next_allocated_block(path);
1608 		if (b2 == EXT_MAX_BLOCK)
1609 			goto out;
1610 	}
1611 
1612 	/* check for wrap through zero on extent logical start block*/
1613 	if (b1 + len1 < b1) {
1614 		len1 = EXT_MAX_BLOCK - b1;
1615 		newext->ee_len = cpu_to_le16(len1);
1616 		ret = 1;
1617 	}
1618 
1619 	/* check for overlap */
1620 	if (b1 + len1 > b2) {
1621 		newext->ee_len = cpu_to_le16(b2 - b1);
1622 		ret = 1;
1623 	}
1624 out:
1625 	return ret;
1626 }
1627 
1628 /*
1629  * ext4_ext_insert_extent:
1630  * tries to merge requsted extent into the existing extent or
1631  * inserts requested extent as new one into the tree,
1632  * creating new leaf in the no-space case.
1633  */
1634 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1635 				struct ext4_ext_path *path,
1636 				struct ext4_extent *newext, int flag)
1637 {
1638 	struct ext4_extent_header *eh;
1639 	struct ext4_extent *ex, *fex;
1640 	struct ext4_extent *nearex; /* nearest extent */
1641 	struct ext4_ext_path *npath = NULL;
1642 	int depth, len, err;
1643 	ext4_lblk_t next;
1644 	unsigned uninitialized = 0;
1645 
1646 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1647 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1648 		return -EIO;
1649 	}
1650 	depth = ext_depth(inode);
1651 	ex = path[depth].p_ext;
1652 	if (unlikely(path[depth].p_hdr == NULL)) {
1653 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1654 		return -EIO;
1655 	}
1656 
1657 	/* try to insert block into found extent and return */
1658 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1659 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1660 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1661 			  ext4_ext_is_uninitialized(newext),
1662 			  ext4_ext_get_actual_len(newext),
1663 			  le32_to_cpu(ex->ee_block),
1664 			  ext4_ext_is_uninitialized(ex),
1665 			  ext4_ext_get_actual_len(ex),
1666 			  ext4_ext_pblock(ex));
1667 		err = ext4_ext_get_access(handle, inode, path + depth);
1668 		if (err)
1669 			return err;
1670 
1671 		/*
1672 		 * ext4_can_extents_be_merged should have checked that either
1673 		 * both extents are uninitialized, or both aren't. Thus we
1674 		 * need to check only one of them here.
1675 		 */
1676 		if (ext4_ext_is_uninitialized(ex))
1677 			uninitialized = 1;
1678 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1679 					+ ext4_ext_get_actual_len(newext));
1680 		if (uninitialized)
1681 			ext4_ext_mark_uninitialized(ex);
1682 		eh = path[depth].p_hdr;
1683 		nearex = ex;
1684 		goto merge;
1685 	}
1686 
1687 repeat:
1688 	depth = ext_depth(inode);
1689 	eh = path[depth].p_hdr;
1690 	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1691 		goto has_space;
1692 
1693 	/* probably next leaf has space for us? */
1694 	fex = EXT_LAST_EXTENT(eh);
1695 	next = ext4_ext_next_leaf_block(inode, path);
1696 	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1697 	    && next != EXT_MAX_BLOCK) {
1698 		ext_debug("next leaf block - %d\n", next);
1699 		BUG_ON(npath != NULL);
1700 		npath = ext4_ext_find_extent(inode, next, NULL);
1701 		if (IS_ERR(npath))
1702 			return PTR_ERR(npath);
1703 		BUG_ON(npath->p_depth != path->p_depth);
1704 		eh = npath[depth].p_hdr;
1705 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1706 			ext_debug("next leaf isnt full(%d)\n",
1707 				  le16_to_cpu(eh->eh_entries));
1708 			path = npath;
1709 			goto repeat;
1710 		}
1711 		ext_debug("next leaf has no free space(%d,%d)\n",
1712 			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1713 	}
1714 
1715 	/*
1716 	 * There is no free space in the found leaf.
1717 	 * We're gonna add a new leaf in the tree.
1718 	 */
1719 	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1720 	if (err)
1721 		goto cleanup;
1722 	depth = ext_depth(inode);
1723 	eh = path[depth].p_hdr;
1724 
1725 has_space:
1726 	nearex = path[depth].p_ext;
1727 
1728 	err = ext4_ext_get_access(handle, inode, path + depth);
1729 	if (err)
1730 		goto cleanup;
1731 
1732 	if (!nearex) {
1733 		/* there is no extent in this leaf, create first one */
1734 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1735 				le32_to_cpu(newext->ee_block),
1736 				ext4_ext_pblock(newext),
1737 				ext4_ext_is_uninitialized(newext),
1738 				ext4_ext_get_actual_len(newext));
1739 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1740 	} else if (le32_to_cpu(newext->ee_block)
1741 			   > le32_to_cpu(nearex->ee_block)) {
1742 /*		BUG_ON(newext->ee_block == nearex->ee_block); */
1743 		if (nearex != EXT_LAST_EXTENT(eh)) {
1744 			len = EXT_MAX_EXTENT(eh) - nearex;
1745 			len = (len - 1) * sizeof(struct ext4_extent);
1746 			len = len < 0 ? 0 : len;
1747 			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1748 					"move %d from 0x%p to 0x%p\n",
1749 					le32_to_cpu(newext->ee_block),
1750 					ext4_ext_pblock(newext),
1751 					ext4_ext_is_uninitialized(newext),
1752 					ext4_ext_get_actual_len(newext),
1753 					nearex, len, nearex + 1, nearex + 2);
1754 			memmove(nearex + 2, nearex + 1, len);
1755 		}
1756 		path[depth].p_ext = nearex + 1;
1757 	} else {
1758 		BUG_ON(newext->ee_block == nearex->ee_block);
1759 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1760 		len = len < 0 ? 0 : len;
1761 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1762 				"move %d from 0x%p to 0x%p\n",
1763 				le32_to_cpu(newext->ee_block),
1764 				ext4_ext_pblock(newext),
1765 				ext4_ext_is_uninitialized(newext),
1766 				ext4_ext_get_actual_len(newext),
1767 				nearex, len, nearex + 1, nearex + 2);
1768 		memmove(nearex + 1, nearex, len);
1769 		path[depth].p_ext = nearex;
1770 	}
1771 
1772 	le16_add_cpu(&eh->eh_entries, 1);
1773 	nearex = path[depth].p_ext;
1774 	nearex->ee_block = newext->ee_block;
1775 	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1776 	nearex->ee_len = newext->ee_len;
1777 
1778 merge:
1779 	/* try to merge extents to the right */
1780 	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1781 		ext4_ext_try_to_merge(inode, path, nearex);
1782 
1783 	/* try to merge extents to the left */
1784 
1785 	/* time to correct all indexes above */
1786 	err = ext4_ext_correct_indexes(handle, inode, path);
1787 	if (err)
1788 		goto cleanup;
1789 
1790 	err = ext4_ext_dirty(handle, inode, path + depth);
1791 
1792 cleanup:
1793 	if (npath) {
1794 		ext4_ext_drop_refs(npath);
1795 		kfree(npath);
1796 	}
1797 	ext4_ext_invalidate_cache(inode);
1798 	return err;
1799 }
1800 
/*
 * ext4_ext_walk_space:
 * iterate over the logical range [@block, @block + @num) and invoke
 * @func once per region, where each region is either an existing
 * extent (EXT4_EXT_CACHE_EXTENT) or a hole (EXT4_EXT_CACHE_GAP).
 * The callback may return EXT_REPEAT to re-examine the same region,
 * EXT_BREAK to stop early, or a negative errno to abort.
 * Returns 0 on success or a negative errno.
 */
static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			       ext4_lblk_t num, ext_prepare_callback func,
			       void *cbdata)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_ext_cache cbex;
	struct ext4_extent *ex;
	ext4_lblk_t next, start = 0, end = 0;
	ext4_lblk_t last = block + num;
	int depth, exists, err = 0;

	BUG_ON(func == NULL);
	BUG_ON(inode == NULL);

	while (block < last && block != EXT_MAX_BLOCK) {
		num = last - block;
		/* find extent for this block */
		down_read(&EXT4_I(inode)->i_data_sem);
		path = ext4_ext_find_extent(inode, block, path);
		up_read(&EXT4_I(inode)->i_data_sem);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			path = NULL;
			break;
		}

		depth = ext_depth(inode);
		if (unlikely(path[depth].p_hdr == NULL)) {
			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
			err = -EIO;
			break;
		}
		ex = path[depth].p_ext;
		next = ext4_ext_next_allocated_block(path);

		/* classify the region at @block: hole or existing extent */
		exists = 0;
		if (!ex) {
			/* there is no extent yet, so try to allocate
			 * all requested space */
			start = block;
			end = block + num;
		} else if (le32_to_cpu(ex->ee_block) > block) {
			/* need to allocate space before found extent */
			start = block;
			end = le32_to_cpu(ex->ee_block);
			if (block + num < end)
				end = block + num;
		} else if (block >= le32_to_cpu(ex->ee_block)
					+ ext4_ext_get_actual_len(ex)) {
			/* need to allocate space after found extent */
			start = block;
			end = block + num;
			if (end >= next)
				end = next;
		} else if (block >= le32_to_cpu(ex->ee_block)) {
			/*
			 * some part of requested space is covered
			 * by found extent
			 */
			start = block;
			end = le32_to_cpu(ex->ee_block)
				+ ext4_ext_get_actual_len(ex);
			if (block + num < end)
				end = block + num;
			exists = 1;
		} else {
			BUG();
		}
		BUG_ON(end <= start);

		if (!exists) {
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
			cbex.ec_type = EXT4_EXT_CACHE_GAP;
		} else {
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext4_ext_pblock(ex);
			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
		}

		if (unlikely(cbex.ec_len == 0)) {
			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
			err = -EIO;
			break;
		}
		err = func(inode, path, &cbex, ex, cbdata);
		ext4_ext_drop_refs(path);

		if (err < 0)
			break;

		if (err == EXT_REPEAT)
			continue;
		else if (err == EXT_BREAK) {
			err = 0;
			break;
		}

		if (ext_depth(inode) != depth) {
			/* depth was changed. we have to realloc path */
			kfree(path);
			path = NULL;
		}

		/* advance past the region just reported */
		block = cbex.ec_block + cbex.ec_len;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

	return err;
}
1917 
1918 static void
1919 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1920 			__u32 len, ext4_fsblk_t start, int type)
1921 {
1922 	struct ext4_ext_cache *cex;
1923 	BUG_ON(len == 0);
1924 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1925 	cex = &EXT4_I(inode)->i_cached_extent;
1926 	cex->ec_type = type;
1927 	cex->ec_block = block;
1928 	cex->ec_len = len;
1929 	cex->ec_start = start;
1930 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1931 }
1932 
1933 /*
1934  * ext4_ext_put_gap_in_cache:
1935  * calculate boundaries of the gap that the requested block fits into
1936  * and cache this gap
1937  */
1938 static void
1939 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1940 				ext4_lblk_t block)
1941 {
1942 	int depth = ext_depth(inode);
1943 	unsigned long len;
1944 	ext4_lblk_t lblock;
1945 	struct ext4_extent *ex;
1946 
1947 	ex = path[depth].p_ext;
1948 	if (ex == NULL) {
1949 		/* there is no extent yet, so gap is [0;-] */
1950 		lblock = 0;
1951 		len = EXT_MAX_BLOCK;
1952 		ext_debug("cache gap(whole file):");
1953 	} else if (block < le32_to_cpu(ex->ee_block)) {
1954 		lblock = block;
1955 		len = le32_to_cpu(ex->ee_block) - block;
1956 		ext_debug("cache gap(before): %u [%u:%u]",
1957 				block,
1958 				le32_to_cpu(ex->ee_block),
1959 				 ext4_ext_get_actual_len(ex));
1960 	} else if (block >= le32_to_cpu(ex->ee_block)
1961 			+ ext4_ext_get_actual_len(ex)) {
1962 		ext4_lblk_t next;
1963 		lblock = le32_to_cpu(ex->ee_block)
1964 			+ ext4_ext_get_actual_len(ex);
1965 
1966 		next = ext4_ext_next_allocated_block(path);
1967 		ext_debug("cache gap(after): [%u:%u] %u",
1968 				le32_to_cpu(ex->ee_block),
1969 				ext4_ext_get_actual_len(ex),
1970 				block);
1971 		BUG_ON(next == lblock);
1972 		len = next - lblock;
1973 	} else {
1974 		lblock = len = 0;
1975 		BUG();
1976 	}
1977 
1978 	ext_debug(" -> %u:%lu\n", lblock, len);
1979 	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1980 }
1981 
1982 static int
1983 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1984 			struct ext4_extent *ex)
1985 {
1986 	struct ext4_ext_cache *cex;
1987 	int ret = EXT4_EXT_CACHE_NO;
1988 
1989 	/*
1990 	 * We borrow i_block_reservation_lock to protect i_cached_extent
1991 	 */
1992 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1993 	cex = &EXT4_I(inode)->i_cached_extent;
1994 
1995 	/* has cache valid data? */
1996 	if (cex->ec_type == EXT4_EXT_CACHE_NO)
1997 		goto errout;
1998 
1999 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2000 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2001 	if (in_range(block, cex->ec_block, cex->ec_len)) {
2002 		ex->ee_block = cpu_to_le32(cex->ec_block);
2003 		ext4_ext_store_pblock(ex, cex->ec_start);
2004 		ex->ee_len = cpu_to_le16(cex->ec_len);
2005 		ext_debug("%u cached by %u:%u:%llu\n",
2006 				block,
2007 				cex->ec_block, cex->ec_len, cex->ec_start);
2008 		ret = cex->ec_type;
2009 	}
2010 errout:
2011 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2012 	return ret;
2013 }
2014 
2015 /*
2016  * ext4_ext_rm_idx:
2017  * removes index from the index block.
2018  * It's used in truncate case only, thus all requests are for
2019  * last index in the block only.
2020  */
2021 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2022 			struct ext4_ext_path *path)
2023 {
2024 	int err;
2025 	ext4_fsblk_t leaf;
2026 
2027 	/* free index block */
2028 	path--;
2029 	leaf = ext4_idx_pblock(path->p_idx);
2030 	if (unlikely(path->p_hdr->eh_entries == 0)) {
2031 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2032 		return -EIO;
2033 	}
2034 	err = ext4_ext_get_access(handle, inode, path);
2035 	if (err)
2036 		return err;
2037 	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2038 	err = ext4_ext_dirty(handle, inode, path);
2039 	if (err)
2040 		return err;
2041 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2042 	ext4_free_blocks(handle, inode, 0, leaf, 1,
2043 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2044 	return err;
2045 }
2046 
2047 /*
2048  * ext4_ext_calc_credits_for_single_extent:
2049  * This routine returns max. credits that needed to insert an extent
2050  * to the extent tree.
2051  * When pass the actual path, the caller should calculate credits
2052  * under i_data_sem.
2053  */
2054 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2055 						struct ext4_ext_path *path)
2056 {
2057 	if (path) {
2058 		int depth = ext_depth(inode);
2059 		int ret = 0;
2060 
2061 		/* probably there is space in leaf? */
2062 		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2063 				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2064 
2065 			/*
2066 			 *  There are some space in the leaf tree, no
2067 			 *  need to account for leaf block credit
2068 			 *
2069 			 *  bitmaps and block group descriptor blocks
2070 			 *  and other metadat blocks still need to be
2071 			 *  accounted.
2072 			 */
2073 			/* 1 bitmap, 1 block group descriptor */
2074 			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2075 			return ret;
2076 		}
2077 	}
2078 
2079 	return ext4_chunk_trans_blocks(inode, nrblocks);
2080 }
2081 
2082 /*
2083  * How many index/leaf blocks need to change/allocate to modify nrblocks?
2084  *
2085  * if nrblocks are fit in a single extent (chunk flag is 1), then
2086  * in the worse case, each tree level index/leaf need to be changed
2087  * if the tree split due to insert a new extent, then the old tree
2088  * index/leaf need to be updated too
2089  *
2090  * If the nrblocks are discontiguous, they could cause
2091  * the whole tree split more than once, but this is really rare.
2092  */
2093 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2094 {
2095 	int index;
2096 	int depth = ext_depth(inode);
2097 
2098 	if (chunk)
2099 		index = depth * 2;
2100 	else
2101 		index = depth * 3;
2102 
2103 	return index;
2104 }
2105 
/*
 * ext4_remove_blocks:
 * free the physical blocks backing the logical range [@from, @to]
 * within extent @ex.  Only tail removal is supported (truncate);
 * head or interior removal is reported and ignored.
 * Always returns 0.
 */
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
				struct ext4_extent *ex,
				ext4_lblk_t from, ext4_lblk_t to)
{
	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
	int flags = EXT4_FREE_BLOCKS_FORGET;

	/* for directories/symlinks the data blocks count as metadata
	 * for journalling purposes */
	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		flags |= EXT4_FREE_BLOCKS_METADATA;
#ifdef EXTENTS_STATS
	{
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
		spin_lock(&sbi->s_ext_stats_lock);
		sbi->s_ext_blocks += ee_len;
		sbi->s_ext_extents++;
		if (ee_len < sbi->s_ext_min)
			sbi->s_ext_min = ee_len;
		if (ee_len > sbi->s_ext_max)
			sbi->s_ext_max = ee_len;
		if (ext_depth(inode) > sbi->s_depth_max)
			sbi->s_depth_max = ext_depth(inode);
		spin_unlock(&sbi->s_ext_stats_lock);
	}
#endif
	if (from >= le32_to_cpu(ex->ee_block)
	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
		/* tail removal */
		ext4_lblk_t num;
		ext4_fsblk_t start;

		num = le32_to_cpu(ex->ee_block) + ee_len - from;
		start = ext4_ext_pblock(ex) + ee_len - num;
		ext_debug("free last %u blocks starting %llu\n", num, start);
		ext4_free_blocks(handle, inode, 0, start, num, flags);
	} else if (from == le32_to_cpu(ex->ee_block)
		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
		/* head removal: not implemented, just log it */
		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
			from, to, le32_to_cpu(ex->ee_block), ee_len);
	} else {
		/* interior removal: not implemented, just log it */
		printk(KERN_INFO "strange request: removal(2) "
				"%u-%u from %u:%u\n",
				from, to, le32_to_cpu(ex->ee_block), ee_len);
	}
	return 0;
}
2151 
/*
 * ext4_ext_rm_leaf:
 * Remove (truncate) all extents in the leaf pointed to by @path that
 * cover logical blocks >= @start, walking the leaf from right to left.
 * The journal handle is extended/restarted with enough credits for each
 * extent, and the parent index entries are corrected when the leaf's
 * first extent changes.  Returns 0 on success or a negative error code.
 */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t start)
{
	int err = 0, correct_index = 0;
	int depth = ext_depth(inode), credits;
	struct ext4_extent_header *eh;
	ext4_lblk_t a, b, block;
	unsigned num;
	ext4_lblk_t ex_ee_block;
	unsigned short ex_ee_len;
	unsigned uninitialized = 0;
	struct ext4_extent *ex;

	/* the header must be checked already in ext4_ext_remove_space() */
	ext_debug("truncate since %u in leaf\n", start);
	if (!path[depth].p_hdr)
		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
	eh = path[depth].p_hdr;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}
	/* find where to start removing */
	ex = EXT_LAST_EXTENT(eh);

	ex_ee_block = le32_to_cpu(ex->ee_block);
	ex_ee_len = ext4_ext_get_actual_len(ex);

	while (ex >= EXT_FIRST_EXTENT(eh) &&
			ex_ee_block + ex_ee_len > start) {

		/* remember whether the extent was uninitialized so the
		 * flag can be restored if part of it survives */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		else
			uninitialized = 0;

		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
			 uninitialized, ex_ee_len);
		path[depth].p_ext = ex;

		/* [a, b] is the part of this extent being removed */
		a = ex_ee_block > start ? ex_ee_block : start;
		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;

		ext_debug("  border %u:%u\n", a, b);

		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
			/* punching a hole in the middle: unsupported */
			block = 0;
			num = 0;
			BUG();
		} else if (a != ex_ee_block) {
			/* remove tail of the extent */
			block = ex_ee_block;
			num = a - block;
		} else if (b != ex_ee_block + ex_ee_len - 1) {
			/* remove head of the extent */
			block = a;
			num = b - a;
			/* there is no "make a hole" API yet */
			BUG();
		} else {
			/* remove whole extent: excellent! */
			block = ex_ee_block;
			num = 0;
			BUG_ON(a != ex_ee_block);
			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
		}

		/*
		 * 3 for leaf, sb, and inode plus 2 (bmap and group
		 * descriptor) for each block group; assume two block
		 * groups plus ex_ee_len/blocks_per_block_group for
		 * the worst case
		 */
		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
		if (ex == EXT_FIRST_EXTENT(eh)) {
			correct_index = 1;
			credits += (ext_depth(inode)) + 1;
		}
		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);

		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
		if (err)
			goto out;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		err = ext4_remove_blocks(handle, inode, ex, a, b);
		if (err)
			goto out;

		if (num == 0) {
			/* this extent is removed; mark slot entirely unused */
			ext4_ext_store_pblock(ex, 0);
			le16_add_cpu(&eh->eh_entries, -1);
		}

		ex->ee_block = cpu_to_le32(block);
		ex->ee_len = cpu_to_le16(num);
		/*
		 * Do not mark uninitialized if all the blocks in the
		 * extent have been removed.
		 */
		if (uninitialized && num)
			ext4_ext_mark_uninitialized(ex);

		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto out;

		ext_debug("new extent: %u:%u:%llu\n", block, num,
				ext4_ext_pblock(ex));
		/* step left to the previous extent in this leaf */
		ex--;
		ex_ee_block = le32_to_cpu(ex->ee_block);
		ex_ee_len = ext4_ext_get_actual_len(ex);
	}

	if (correct_index && eh->eh_entries)
		err = ext4_ext_correct_indexes(handle, inode, path);

	/* if this leaf is free, then we should
	 * remove it from index block above */
	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
		err = ext4_ext_rm_idx(handle, inode, path + depth);

out:
	return err;
}
2283 
2284 /*
2285  * ext4_ext_more_to_rm:
2286  * returns 1 if current index has to be freed (even partial)
2287  */
2288 static int
2289 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2290 {
2291 	BUG_ON(path->p_idx == NULL);
2292 
2293 	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2294 		return 0;
2295 
2296 	/*
2297 	 * if truncate on deeper level happened, it wasn't partial,
2298 	 * so we have to consider current index for truncation
2299 	 */
2300 	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2301 		return 0;
2302 	return 1;
2303 }
2304 
/*
 * ext4_ext_remove_space:
 * Free all data blocks and extent-tree metadata covering logical blocks
 * >= @start, walking the extent tree depth-first from the rightmost
 * path.  Used by truncate.  Returns 0 on success or a negative errno.
 */
static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
{
	struct super_block *sb = inode->i_sb;
	int depth = ext_depth(inode);
	struct ext4_ext_path *path;
	handle_t *handle;
	int i, err;

	ext_debug("truncate since %u\n", start);

	/* probably first extent we're gonna free will be last in block */
	handle = ext4_journal_start(inode, depth + 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

again:
	ext4_ext_invalidate_cache(inode);

	/*
	 * We start scanning from right side, freeing all the blocks
	 * after i_size and walking into the tree depth-wise.
	 */
	depth = ext_depth(inode);
	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
	if (path == NULL) {
		ext4_journal_stop(handle);
		return -ENOMEM;
	}
	path[0].p_depth = depth;
	path[0].p_hdr = ext_inode_hdr(inode);
	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
		err = -EIO;
		goto out;
	}
	i = err = 0;

	/* i is the tree level being processed; i == depth is a leaf */
	while (i >= 0 && err == 0) {
		if (i == depth) {
			/* this is leaf block */
			err = ext4_ext_rm_leaf(handle, inode, path, start);
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			continue;
		}

		/* this is index block */
		if (!path[i].p_hdr) {
			ext_debug("initialize header\n");
			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
		}

		if (!path[i].p_idx) {
			/* this level hasn't been touched yet */
			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
			ext_debug("init index ptr: hdr 0x%p, num %d\n",
				  path[i].p_hdr,
				  le16_to_cpu(path[i].p_hdr->eh_entries));
		} else {
			/* we were already here, see at next index */
			path[i].p_idx--;
		}

		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
				i, EXT_FIRST_INDEX(path[i].p_hdr),
				path[i].p_idx);
		if (ext4_ext_more_to_rm(path + i)) {
			struct buffer_head *bh;
			/* go to the next level */
			ext_debug("move to level %d (block %llu)\n",
				  i + 1, ext4_idx_pblock(path[i].p_idx));
			memset(path + i + 1, 0, sizeof(*path));
			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
			if (!bh) {
				/* should we reset i_size? */
				err = -EIO;
				break;
			}
			if (WARN_ON(i + 1 > depth)) {
				err = -EIO;
				break;
			}
			if (ext4_ext_check(inode, ext_block_hdr(bh),
							depth - i - 1)) {
				err = -EIO;
				break;
			}
			path[i + 1].p_bh = bh;

			/* save actual number of indexes since this
			 * number is changed at the next iteration */
			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
			i++;
		} else {
			/* we finished processing this index, go up */
			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
				/* index is empty, remove it;
				 * handle must be already prepared by the
				 * ext4_ext_rm_leaf() */
				err = ext4_ext_rm_idx(handle, inode, path + i);
			}
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			ext_debug("return to level %d\n", i);
		}
	}

	/* TODO: flexible tree reduction should be here */
	if (path->p_hdr->eh_entries == 0) {
		/*
		 * truncate to zero freed all the tree,
		 * so we need to correct eh_depth
		 */
		err = ext4_ext_get_access(handle, inode, path);
		if (err == 0) {
			ext_inode_hdr(inode)->eh_depth = 0;
			ext_inode_hdr(inode)->eh_max =
				cpu_to_le16(ext4_ext_space_root(inode, 0));
			err = ext4_ext_dirty(handle, inode, path);
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
	/* -EAGAIN from rm_leaf means retry with a fresh tree walk */
	if (err == -EAGAIN)
		goto again;
	ext4_journal_stop(handle);

	return err;
}
2439 
/*
 * called at mount time
 */
void ext4_ext_init(struct super_block *sb)
{
	/*
	 * possible initialization would be here
	 */

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
		/* announce which debug options are compiled in */
		printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
		printk(", aggressive tests");
#endif
#ifdef CHECK_BINSEARCH
		printk(", check binsearch");
#endif
#ifdef EXTENTS_STATS
		printk(", stats");
#endif
		printk("\n");
#endif
#ifdef EXTENTS_STATS
		/* extent-size stats; s_ext_min starts at "infinity" */
		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
		EXT4_SB(sb)->s_ext_min = 1 << 30;
		EXT4_SB(sb)->s_ext_max = 0;
#endif
	}
}
2470 
/*
 * called at umount time
 */
void ext4_ext_release(struct super_block *sb)
{
	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
		return;

#ifdef EXTENTS_STATS
	/* dump the extent statistics gathered while mounted */
	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
		struct ext4_sb_info *sbi = EXT4_SB(sb);
		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
			sbi->s_ext_blocks, sbi->s_ext_extents,
			sbi->s_ext_blocks / sbi->s_ext_extents);
		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
	}
#endif
}
2490 
2491 /* FIXME!! we need to try to merge to left or right after zero-out  */
2492 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2493 {
2494 	ext4_fsblk_t ee_pblock;
2495 	unsigned int ee_len;
2496 	int ret;
2497 
2498 	ee_len    = ext4_ext_get_actual_len(ex);
2499 	ee_pblock = ext4_ext_pblock(ex);
2500 
2501 	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2502 	if (ret > 0)
2503 		ret = 0;
2504 
2505 	return ret;
2506 }
2507 
2508 #define EXT4_EXT_ZERO_LEN 7
/*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
 * extent into multiple extents (up to three - one initialized and two
 * uninitialized).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * Returns the number of initialized blocks available from map->m_lblk on
 * success, or a negative error code.
 */
static int ext4_ext_convert_to_initialized(handle_t *handle,
					   struct inode *inode,
					   struct ext4_map_blocks *map,
					   struct ext4_ext_path *path)
{
	struct ext4_extent *ex, newex, orig_ex;
	struct ext4_extent *ex1 = NULL;
	struct ext4_extent *ex2 = NULL;
	struct ext4_extent *ex3 = NULL;
	struct ext4_extent_header *eh;
	ext4_lblk_t ee_block, eof_block;
	unsigned int allocated, ee_len, depth;
	ext4_fsblk_t newblock;
	int err = 0;
	int ret = 0;
	int may_zeroout;

	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
		"block %llu, max_blocks %u\n", inode->i_ino,
		(unsigned long long)map->m_lblk, map->m_len);

	/* first logical block past EOF, rounded up to a whole fs block */
	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
		inode->i_sb->s_blocksize_bits;
	if (eof_block < map->m_lblk + map->m_len)
		eof_block = map->m_lblk + map->m_len;

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* blocks of this extent from map->m_lblk to its end */
	allocated = ee_len - (map->m_lblk - ee_block);
	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);

	ex2 = ex;
	/* keep a copy of the original extent for error recovery */
	orig_ex.ee_block = ex->ee_block;
	orig_ex.ee_len   = cpu_to_le16(ee_len);
	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));

	/*
	 * It is safe to convert extent to initialized via explicit
	 * zeroout only if extent is fully inside i_size or new_size.
	 */
	may_zeroout = ee_block + ee_len <= eof_block;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zeroed the full extent */
		return allocated;
	}

	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
	if (map->m_lblk > ee_block) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * for sanity, update the length of the ex2 extent before
	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
	 * overlap of blocks.
	 */
	if (!ex1 && allocated > map->m_len)
		ex2->ee_len = cpu_to_le16(map->m_len);
	/* ex3: to ee_block + ee_len : uninitialised */
	if (allocated > map->m_len) {
		unsigned int newdepth;
		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
			/*
			 * map->m_lblk == ee_block is handled by the zeroout
			 * at the beginning.
			 * Mark first half uninitialized.
			 * Mark second half initialized and zero out the
			 * initialized extent
			 */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = cpu_to_le16(ee_len - allocated);
			ext4_ext_mark_uninitialized(ex);
			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);

			ex3 = &newex;
			ex3->ee_block = cpu_to_le32(map->m_lblk);
			ext4_ext_store_pblock(ex3, newblock);
			ex3->ee_len = cpu_to_le16(allocated);
			err = ext4_ext_insert_extent(handle, inode, path,
							ex3, 0);
			if (err == -ENOSPC) {
				/* no room to split: fall back to zeroing
				 * the whole original extent */
				err =  ext4_ext_zeroout(inode, &orig_ex);
				if (err)
					goto fix_extent_len;
				ex->ee_block = orig_ex.ee_block;
				ex->ee_len   = orig_ex.ee_len;
				ext4_ext_store_pblock(ex,
					ext4_ext_pblock(&orig_ex));
				ext4_ext_dirty(handle, inode, path + depth);
				/* blocks available from map->m_lblk */
				return allocated;

			} else if (err)
				goto fix_extent_len;

			/*
			 * We need to zero out the second half because
			 * a fallocate request can update file size and
			 * converting the second half to initialized extent
			 * implies that we can leak some junk data to user
			 * space.
			 */
			err =  ext4_ext_zeroout(inode, ex3);
			if (err) {
				/*
				 * We should actually mark the
				 * second half as uninit and return error
				 * Insert would have changed the extent
				 */
				depth = ext_depth(inode);
				ext4_ext_drop_refs(path);
				path = ext4_ext_find_extent(inode, map->m_lblk,
							    path);
				if (IS_ERR(path)) {
					err = PTR_ERR(path);
					return err;
				}
				/* get the second half extent details */
				ex = path[depth].p_ext;
				err = ext4_ext_get_access(handle, inode,
								path + depth);
				if (err)
					return err;
				ext4_ext_mark_uninitialized(ex);
				ext4_ext_dirty(handle, inode, path + depth);
				return err;
			}

			/* zeroed the second half */
			return allocated;
		}
		ex3 = &newex;
		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
		ext4_ext_store_pblock(ex3, newblock + map->m_len);
		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
		if (err == -ENOSPC && may_zeroout) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zeroed the full extent */
			/* blocks available from map->m_lblk */
			return allocated;

		} else if (err)
			goto fix_extent_len;
		/*
		 * The depth, and hence eh & ex might change
		 * as part of the insert above.
		 */
		newdepth = ext_depth(inode);
		/*
		 * update the extent length after successful insert of the
		 * split extent
		 */
		ee_len -= ext4_ext_get_actual_len(ex3);
		orig_ex.ee_len = cpu_to_le16(ee_len);
		may_zeroout = ee_block + ee_len <= eof_block;

		/* re-walk the tree to get fresh eh/ex pointers */
		depth = newdepth;
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode, map->m_lblk, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}
		eh = path[depth].p_hdr;
		ex = path[depth].p_ext;
		if (ex2 != &newex)
			ex2 = ex;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		allocated = map->m_len;

		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
		 * to insert an extent in the middle zerout directly
		 * otherwise give the extent a chance to merge to left
		 */
		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
			map->m_lblk != ee_block && may_zeroout) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zero out the first half */
			/* blocks available from map->m_lblk */
			return allocated;
		}
	}
	/*
	 * If there was a change of depth as part of the
	 * insertion of ex3 above, we need to update the length
	 * of the ex1 extent again here
	 */
	if (ex1 && ex1 != ex) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
	ex2->ee_block = cpu_to_le32(map->m_lblk);
	ext4_ext_store_pblock(ex2, newblock);
	ex2->ee_len = cpu_to_le16(allocated);
	if (ex2 != ex)
		goto insert;
	/*
	 * New (initialized) extent starts from the first block
	 * in the current extent. i.e., ex2 == ex
	 * We have to see if it can be merged with the extent
	 * on the left.
	 */
	if (ex2 > EXT_FIRST_EXTENT(eh)) {
		/*
		 * To merge left, pass "ex2 - 1" to try_to_merge(),
		 * since it merges towards right _only_.
		 */
		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
			depth = ext_depth(inode);
			ex2--;
		}
	}
	/*
	 * Try to Merge towards right. This might be required
	 * only when the whole extent is being written to.
	 * i.e. ex2 == ex and ex3 == NULL.
	 */
	if (!ex3) {
		ret = ext4_ext_try_to_merge(inode, path, ex2);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
		}
	}
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
	if (err == -ENOSPC && may_zeroout) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	/* restore the original (uninitialized) extent and give up */
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len   = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}
2821 
/*
 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO to write
 * to an uninitialized extent.
 *
 * Writing to an uninitialized extent may result in splitting the
 * uninitialized extent into multiple initialized/uninitialized extents
 * (up to three).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the uninitialized extent split. To prevent ENOSPC occurring at the IO
 * complete, we need to split the uninitialized extent before DIO submits
 * the IO. The uninitialized extent called at this time will be split
 * into three uninitialized extents (at most). After IO complete, the part
 * being filled will be converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 *
 * Returns the size of uninitialized extent to be written on success.
 */
static int ext4_split_unwritten_extents(handle_t *handle,
					struct inode *inode,
					struct ext4_map_blocks *map,
					struct ext4_ext_path *path,
					int flags)
{
	struct ext4_extent *ex, newex, orig_ex;
	struct ext4_extent *ex1 = NULL;
	struct ext4_extent *ex2 = NULL;
	struct ext4_extent *ex3 = NULL;
	ext4_lblk_t ee_block, eof_block;
	unsigned int allocated, ee_len, depth;
	ext4_fsblk_t newblock;
	int err = 0;
	int may_zeroout;

	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
		"block %llu, max_blocks %u\n", inode->i_ino,
		(unsigned long long)map->m_lblk, map->m_len);

	/* first logical block past EOF, rounded up to a whole fs block */
	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
		inode->i_sb->s_blocksize_bits;
	if (eof_block < map->m_lblk + map->m_len)
		eof_block = map->m_lblk + map->m_len;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* blocks of this extent from map->m_lblk to its end */
	allocated = ee_len - (map->m_lblk - ee_block);
	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);

	ex2 = ex;
	/* keep a copy of the original extent for error recovery */
	orig_ex.ee_block = ex->ee_block;
	orig_ex.ee_len   = cpu_to_le16(ee_len);
	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));

	/*
	 * It is safe to convert extent to initialized via explicit
	 * zeroout only if extent is fully inside i_size or new_size.
	 */
	may_zeroout = ee_block + ee_len <= eof_block;

	/*
	 * If the uninitialized extent begins at the same logical
	 * block where the write begins, and the write completely
	 * covers the extent, then we don't need to split it.
	 */
	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
		return allocated;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
	if (map->m_lblk > ee_block) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * for sanity, update the length of the ex2 extent before
	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
	 * overlap of blocks.
	 */
	if (!ex1 && allocated > map->m_len)
		ex2->ee_len = cpu_to_le16(map->m_len);
	/* ex3: to ee_block + ee_len : uninitialised */
	if (allocated > map->m_len) {
		unsigned int newdepth;
		ex3 = &newex;
		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
		ext4_ext_store_pblock(ex3, newblock + map->m_len);
		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
		if (err == -ENOSPC && may_zeroout) {
			/* no room to split: zero the original extent
			 * and hand it back fully initialized */
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zeroed the full extent */
			/* blocks available from map->m_lblk */
			return allocated;

		} else if (err)
			goto fix_extent_len;
		/*
		 * The depth, and hence eh & ex might change
		 * as part of the insert above.
		 */
		newdepth = ext_depth(inode);
		/*
		 * update the extent length after successful insert of the
		 * split extent
		 */
		ee_len -= ext4_ext_get_actual_len(ex3);
		orig_ex.ee_len = cpu_to_le16(ee_len);
		may_zeroout = ee_block + ee_len <= eof_block;

		/* re-walk the tree to get a fresh ex pointer */
		depth = newdepth;
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode, map->m_lblk, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}
		ex = path[depth].p_ext;
		if (ex2 != &newex)
			ex2 = ex;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		allocated = map->m_len;
	}
	/*
	 * If there was a change of depth as part of the
	 * insertion of ex3 above, we need to update the length
	 * of the ex1 extent again here
	 */
	if (ex1 && ex1 != ex) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
	 * using direct I/O, uninitialised still.
	 */
	ex2->ee_block = cpu_to_le32(map->m_lblk);
	ext4_ext_store_pblock(ex2, newblock);
	ex2->ee_len = cpu_to_le16(allocated);
	ext4_ext_mark_uninitialized(ex2);
	if (ex2 != ex)
		goto insert;
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	ext_debug("out here\n");
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
	if (err == -ENOSPC && may_zeroout) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	/* restore the original (uninitialized) extent and give up */
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len   = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}
/*
 * Mark an uninitialized extent (found at path[depth].p_ext) as
 * initialized once the I/O covering it has completed, then try to merge
 * it with its left and right neighbours.  Returns 0 on success or a
 * negative error code.
 */
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
					      struct inode *inode,
					      struct ext4_ext_path *path)
{
	struct ext4_extent *ex;
	struct ext4_extent_header *eh;
	int depth;
	int err = 0;
	int ret = 0;

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* first mark the extent as initialized */
	ext4_ext_mark_initialized(ex);

	/*
	 * We have to see if it can be merged with the extent
	 * on the left.
	 */
	if (ex > EXT_FIRST_EXTENT(eh)) {
		/*
		 * To merge left, pass "ex - 1" to try_to_merge(),
		 * since it merges towards right _only_.
		 */
		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
			depth = ext_depth(inode);
			/* ex was absorbed into its left neighbour */
			ex--;
		}
	}
	/*
	 * Try to Merge towards right.
	 */
	ret = ext4_ext_try_to_merge(inode, path, ex);
	if (ret) {
		err = ext4_ext_correct_indexes(handle, inode, path);
		if (err)
			goto out;
		depth = ext_depth(inode);
	}
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
out:
	ext4_ext_show_leaf(inode, path);
	return err;
}
3072 
3073 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3074 			sector_t block, int count)
3075 {
3076 	int i;
3077 	for (i = 0; i < count; i++)
3078                 unmap_underlying_metadata(bdev, block + i);
3079 }
3080 
/*
 * Handle EOFBLOCKS_FL flag, clearing it if necessary
 *
 * @handle: journal handle used to mark the inode dirty
 * @inode:  inode being written to
 * @map:    the block-mapping request that triggered this check
 * @path:   extent path for map->m_lblk
 * @len:    number of blocks being written at map->m_lblk
 *
 * Returns 0 (flag untouched or not set), the result of marking the
 * inode dirty after clearing the flag, or -EIO on a corrupt tree.
 */
static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
			      struct ext4_map_blocks *map,
			      struct ext4_ext_path *path,
			      unsigned int len)
{
	int i, depth;
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *last_ex;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
		return 0;

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	/* flag set but no extents at all: on-disk inconsistency */
	if (unlikely(!eh->eh_entries)) {
		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
				 "EOFBLOCKS_FL set");
		return -EIO;
	}
	last_ex = EXT_LAST_EXTENT(eh);
	/*
	 * We should clear the EOFBLOCKS_FL flag if we are writing the
	 * last block in the last extent in the file.  We test this by
	 * first checking to see if the caller to
	 * ext4_ext_get_blocks() was interested in the last block (or
	 * a block beyond the last block) in the current extent.  If
	 * this turns out to be false, we can bail out from this
	 * function immediately.
	 */
	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
	    ext4_ext_get_actual_len(last_ex))
		return 0;
	/*
	 * If the caller does appear to be planning to write at or
	 * beyond the end of the current extent, we then test to see
	 * if the current extent is the last extent in the file, by
	 * checking to make sure it was reached via the rightmost node
	 * at each level of the tree.
	 */
	for (i = depth-1; i >= 0; i--)
		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
			return 0;
	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
	return ext4_mark_inode_dirty(handle, inode);
}
3131 
/*
 * Handle a map request that landed inside an uninitialized (unwritten)
 * extent.  Depending on @flags this:
 *   - PRE_IO:    splits the extent before direct IO is submitted and flags
 *                the io_end/inode for later conversion;
 *   - CONVERT:   converts the now-written extent at end-of-IO;
 *   - UNINIT_EXT: a repeated fallocate request - return the existing extent;
 *   - no CREATE: buffered read/lookup - report blocks as unwritten;
 *   - otherwise: buffered writepage - convert to initialized now.
 *
 * Returns the number of blocks mapped (> 0) or a negative error.  Always
 * drops the reference on @path and frees it before returning.
 */
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map,
			struct ext4_ext_path *path, int flags,
			unsigned int allocated, ext4_fsblk_t newblock)
{
	int ret = 0;
	int err = 0;
	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;

	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
		  "block %llu, max_blocks %u, flags %d, allocated %u",
		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
		  flags, allocated);
	ext4_ext_show_leaf(inode, path);

	/* get_block() before submit the IO, split the extent */
	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
		ret = ext4_split_unwritten_extents(handle, inode, map,
						   path, flags);
		/*
		 * Flag the inode (non-AIO case) or the end_io struct (AIO
		 * case) that this IO needs conversion to written when the
		 * IO is completed.
		 */
		if (io)
			io->flag = EXT4_IO_END_UNWRITTEN;
		else
			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
		if (ext4_should_dioread_nolock(inode))
			map->m_flags |= EXT4_MAP_UNINIT;
		goto out;
	}
	/* IO end_io complete, convert the filled extent to written */
	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
		ret = ext4_convert_unwritten_extents_endio(handle, inode,
							path);
		if (ret >= 0) {
			ext4_update_inode_fsync_trans(handle, inode, 1);
			err = check_eofblocks_fl(handle, inode, map, path,
						 map->m_len);
		} else
			err = ret;
		goto out2;
	}
	/* buffered IO case */
	/*
	 * repeat fallocate creation request
	 * we already have an unwritten extent
	 */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
		goto map_out;

	/* buffered READ or buffered write_begin() lookup */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		/*
		 * We have blocks reserved already.  We
		 * return allocated blocks so that delalloc
		 * won't do block reservation for us.  But
		 * the buffer head will be unmapped so that
		 * a read from the block returns 0s.
		 */
		map->m_flags |= EXT4_MAP_UNWRITTEN;
		goto out1;
	}

	/* buffered write, writepage time, convert*/
	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
	if (ret >= 0) {
		ext4_update_inode_fsync_trans(handle, inode, 1);
		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
		if (err < 0)
			goto out2;
	}

out:
	if (ret <= 0) {
		err = ret;
		goto out2;
	} else
		allocated = ret;
	map->m_flags |= EXT4_MAP_NEW;
	/*
	 * if we allocated more blocks than requested
	 * we need to make sure we unmap the extra block
	 * allocated. The actual needed block will get
	 * unmapped later when we find the buffer_head marked
	 * new.
	 */
	if (allocated > map->m_len) {
		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
					newblock + map->m_len,
					allocated - map->m_len);
		allocated = map->m_len;
	}

	/*
	 * If we have done fallocate with the offset that is already
	 * delayed allocated, we would have block reservation
	 * and quota reservation done in the delayed write path.
	 * But fallocate would have already updated quota and block
	 * count for this offset. So cancel these reservation
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		ext4_da_update_reserve_space(inode, allocated, 0);

map_out:
	map->m_flags |= EXT4_MAP_MAPPED;
out1:
	if (allocated > map->m_len)
		allocated = map->m_len;
	ext4_ext_show_leaf(inode, path);
	map->m_pblk = newblock;
	map->m_len = allocated;
out2:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}
	return err ? err : allocated;
}
3253 
3254 /*
3255  * Block allocation/map/preallocation routine for extents based files
3256  *
3257  *
3258  * Need to be called with
3259  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3260  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3261  *
3262  * return > 0, number of of blocks already mapped/allocated
3263  *          if create == 0 and these are pre-allocated blocks
3264  *          	buffer head is unmapped
3265  *          otherwise blocks are mapped
3266  *
3267  * return = 0, if plain look up failed (blocks have not been allocated)
3268  *          buffer head is unmapped
3269  *
3270  * return < 0, error case.
3271  */
3272 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3273 			struct ext4_map_blocks *map, int flags)
3274 {
3275 	struct ext4_ext_path *path = NULL;
3276 	struct ext4_extent_header *eh;
3277 	struct ext4_extent newex, *ex;
3278 	ext4_fsblk_t newblock;
3279 	int err = 0, depth, ret, cache_type;
3280 	unsigned int allocated = 0;
3281 	struct ext4_allocation_request ar;
3282 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3283 
3284 	ext_debug("blocks %u/%u requested for inode %lu\n",
3285 		  map->m_lblk, map->m_len, inode->i_ino);
3286 
3287 	/* check in cache */
3288 	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3289 	if (cache_type) {
3290 		if (cache_type == EXT4_EXT_CACHE_GAP) {
3291 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3292 				/*
3293 				 * block isn't allocated yet and
3294 				 * user doesn't want to allocate it
3295 				 */
3296 				goto out2;
3297 			}
3298 			/* we should allocate requested block */
3299 		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3300 			/* block is already allocated */
3301 			newblock = map->m_lblk
3302 				   - le32_to_cpu(newex.ee_block)
3303 				   + ext4_ext_pblock(&newex);
3304 			/* number of remaining blocks in the extent */
3305 			allocated = ext4_ext_get_actual_len(&newex) -
3306 				(map->m_lblk - le32_to_cpu(newex.ee_block));
3307 			goto out;
3308 		} else {
3309 			BUG();
3310 		}
3311 	}
3312 
3313 	/* find extent for this block */
3314 	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3315 	if (IS_ERR(path)) {
3316 		err = PTR_ERR(path);
3317 		path = NULL;
3318 		goto out2;
3319 	}
3320 
3321 	depth = ext_depth(inode);
3322 
3323 	/*
3324 	 * consistent leaf must not be empty;
3325 	 * this situation is possible, though, _during_ tree modification;
3326 	 * this is why assert can't be put in ext4_ext_find_extent()
3327 	 */
3328 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3329 		EXT4_ERROR_INODE(inode, "bad extent address "
3330 				 "lblock: %lu, depth: %d pblock %lld",
3331 				 (unsigned long) map->m_lblk, depth,
3332 				 path[depth].p_block);
3333 		err = -EIO;
3334 		goto out2;
3335 	}
3336 	eh = path[depth].p_hdr;
3337 
3338 	ex = path[depth].p_ext;
3339 	if (ex) {
3340 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3341 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3342 		unsigned short ee_len;
3343 
3344 		/*
3345 		 * Uninitialized extents are treated as holes, except that
3346 		 * we split out initialized portions during a write.
3347 		 */
3348 		ee_len = ext4_ext_get_actual_len(ex);
3349 		/* if found extent covers block, simply return it */
3350 		if (in_range(map->m_lblk, ee_block, ee_len)) {
3351 			newblock = map->m_lblk - ee_block + ee_start;
3352 			/* number of remaining blocks in the extent */
3353 			allocated = ee_len - (map->m_lblk - ee_block);
3354 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3355 				  ee_block, ee_len, newblock);
3356 
3357 			/* Do not put uninitialized extent in the cache */
3358 			if (!ext4_ext_is_uninitialized(ex)) {
3359 				ext4_ext_put_in_cache(inode, ee_block,
3360 							ee_len, ee_start,
3361 							EXT4_EXT_CACHE_EXTENT);
3362 				goto out;
3363 			}
3364 			ret = ext4_ext_handle_uninitialized_extents(handle,
3365 					inode, map, path, flags, allocated,
3366 					newblock);
3367 			return ret;
3368 		}
3369 	}
3370 
3371 	/*
3372 	 * requested block isn't allocated yet;
3373 	 * we couldn't try to create block if create flag is zero
3374 	 */
3375 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3376 		/*
3377 		 * put just found gap into cache to speed up
3378 		 * subsequent requests
3379 		 */
3380 		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3381 		goto out2;
3382 	}
3383 	/*
3384 	 * Okay, we need to do block allocation.
3385 	 */
3386 
3387 	/* find neighbour allocated blocks */
3388 	ar.lleft = map->m_lblk;
3389 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3390 	if (err)
3391 		goto out2;
3392 	ar.lright = map->m_lblk;
3393 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3394 	if (err)
3395 		goto out2;
3396 
3397 	/*
3398 	 * See if request is beyond maximum number of blocks we can have in
3399 	 * a single extent. For an initialized extent this limit is
3400 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3401 	 * EXT_UNINIT_MAX_LEN.
3402 	 */
3403 	if (map->m_len > EXT_INIT_MAX_LEN &&
3404 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3405 		map->m_len = EXT_INIT_MAX_LEN;
3406 	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3407 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3408 		map->m_len = EXT_UNINIT_MAX_LEN;
3409 
3410 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3411 	newex.ee_block = cpu_to_le32(map->m_lblk);
3412 	newex.ee_len = cpu_to_le16(map->m_len);
3413 	err = ext4_ext_check_overlap(inode, &newex, path);
3414 	if (err)
3415 		allocated = ext4_ext_get_actual_len(&newex);
3416 	else
3417 		allocated = map->m_len;
3418 
3419 	/* allocate new block */
3420 	ar.inode = inode;
3421 	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3422 	ar.logical = map->m_lblk;
3423 	ar.len = allocated;
3424 	if (S_ISREG(inode->i_mode))
3425 		ar.flags = EXT4_MB_HINT_DATA;
3426 	else
3427 		/* disable in-core preallocation for non-regular files */
3428 		ar.flags = 0;
3429 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3430 	if (!newblock)
3431 		goto out2;
3432 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3433 		  ar.goal, newblock, allocated);
3434 
3435 	/* try to insert new extent into found leaf and return */
3436 	ext4_ext_store_pblock(&newex, newblock);
3437 	newex.ee_len = cpu_to_le16(ar.len);
3438 	/* Mark uninitialized */
3439 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3440 		ext4_ext_mark_uninitialized(&newex);
3441 		/*
3442 		 * io_end structure was created for every IO write to an
3443 		 * uninitialized extent. To avoid unecessary conversion,
3444 		 * here we flag the IO that really needs the conversion.
3445 		 * For non asycn direct IO case, flag the inode state
3446 		 * that we need to perform convertion when IO is done.
3447 		 */
3448 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3449 			if (io)
3450 				io->flag = EXT4_IO_END_UNWRITTEN;
3451 			else
3452 				ext4_set_inode_state(inode,
3453 						     EXT4_STATE_DIO_UNWRITTEN);
3454 		}
3455 		if (ext4_should_dioread_nolock(inode))
3456 			map->m_flags |= EXT4_MAP_UNINIT;
3457 	}
3458 
3459 	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
3460 	if (err)
3461 		goto out2;
3462 
3463 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3464 	if (err) {
3465 		/* free data blocks we just allocated */
3466 		/* not a good idea to call discard here directly,
3467 		 * but otherwise we'd need to call it every free() */
3468 		ext4_discard_preallocations(inode);
3469 		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
3470 				 ext4_ext_get_actual_len(&newex), 0);
3471 		goto out2;
3472 	}
3473 
3474 	/* previous routine could use block we allocated */
3475 	newblock = ext4_ext_pblock(&newex);
3476 	allocated = ext4_ext_get_actual_len(&newex);
3477 	if (allocated > map->m_len)
3478 		allocated = map->m_len;
3479 	map->m_flags |= EXT4_MAP_NEW;
3480 
3481 	/*
3482 	 * Update reserved blocks/metadata blocks after successful
3483 	 * block allocation which had been deferred till now.
3484 	 */
3485 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3486 		ext4_da_update_reserve_space(inode, allocated, 1);
3487 
3488 	/*
3489 	 * Cache the extent and update transaction to commit on fdatasync only
3490 	 * when it is _not_ an uninitialized extent.
3491 	 */
3492 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3493 		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3494 						EXT4_EXT_CACHE_EXTENT);
3495 		ext4_update_inode_fsync_trans(handle, inode, 1);
3496 	} else
3497 		ext4_update_inode_fsync_trans(handle, inode, 0);
3498 out:
3499 	if (allocated > map->m_len)
3500 		allocated = map->m_len;
3501 	ext4_ext_show_leaf(inode, path);
3502 	map->m_flags |= EXT4_MAP_MAPPED;
3503 	map->m_pblk = newblock;
3504 	map->m_len = allocated;
3505 out2:
3506 	if (path) {
3507 		ext4_ext_drop_refs(path);
3508 		kfree(path);
3509 	}
3510 	return err ? err : allocated;
3511 }
3512 
3513 void ext4_ext_truncate(struct inode *inode)
3514 {
3515 	struct address_space *mapping = inode->i_mapping;
3516 	struct super_block *sb = inode->i_sb;
3517 	ext4_lblk_t last_block;
3518 	handle_t *handle;
3519 	int err = 0;
3520 
3521 	/*
3522 	 * probably first extent we're gonna free will be last in block
3523 	 */
3524 	err = ext4_writepage_trans_blocks(inode);
3525 	handle = ext4_journal_start(inode, err);
3526 	if (IS_ERR(handle))
3527 		return;
3528 
3529 	if (inode->i_size & (sb->s_blocksize - 1))
3530 		ext4_block_truncate_page(handle, mapping, inode->i_size);
3531 
3532 	if (ext4_orphan_add(handle, inode))
3533 		goto out_stop;
3534 
3535 	down_write(&EXT4_I(inode)->i_data_sem);
3536 	ext4_ext_invalidate_cache(inode);
3537 
3538 	ext4_discard_preallocations(inode);
3539 
3540 	/*
3541 	 * TODO: optimization is possible here.
3542 	 * Probably we need not scan at all,
3543 	 * because page truncation is enough.
3544 	 */
3545 
3546 	/* we have to know where to truncate from in crash case */
3547 	EXT4_I(inode)->i_disksize = inode->i_size;
3548 	ext4_mark_inode_dirty(handle, inode);
3549 
3550 	last_block = (inode->i_size + sb->s_blocksize - 1)
3551 			>> EXT4_BLOCK_SIZE_BITS(sb);
3552 	err = ext4_ext_remove_space(inode, last_block);
3553 
3554 	/* In a multi-transaction truncate, we only make the final
3555 	 * transaction synchronous.
3556 	 */
3557 	if (IS_SYNC(inode))
3558 		ext4_handle_sync(handle);
3559 
3560 out_stop:
3561 	up_write(&EXT4_I(inode)->i_data_sem);
3562 	/*
3563 	 * If this was a simple ftruncate() and the file will remain alive,
3564 	 * then we need to clear up the orphan record which we created above.
3565 	 * However, if this was a real unlink then we were called by
3566 	 * ext4_delete_inode(), and we allow that function to clean up the
3567 	 * orphan info for us.
3568 	 */
3569 	if (inode->i_nlink)
3570 		ext4_orphan_del(handle, inode);
3571 
3572 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3573 	ext4_mark_inode_dirty(handle, inode);
3574 	ext4_journal_stop(handle);
3575 }
3576 
3577 static void ext4_falloc_update_inode(struct inode *inode,
3578 				int mode, loff_t new_size, int update_ctime)
3579 {
3580 	struct timespec now;
3581 
3582 	if (update_ctime) {
3583 		now = current_fs_time(inode->i_sb);
3584 		if (!timespec_equal(&inode->i_ctime, &now))
3585 			inode->i_ctime = now;
3586 	}
3587 	/*
3588 	 * Update only when preallocation was requested beyond
3589 	 * the file size.
3590 	 */
3591 	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3592 		if (new_size > i_size_read(inode))
3593 			i_size_write(inode, new_size);
3594 		if (new_size > EXT4_I(inode)->i_disksize)
3595 			ext4_update_i_disksize(inode, new_size);
3596 	} else {
3597 		/*
3598 		 * Mark that we allocate beyond EOF so the subsequent truncate
3599 		 * can proceed even if the new size is the same as i_size.
3600 		 */
3601 		if (new_size > i_size_read(inode))
3602 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3603 	}
3604 
3605 }
3606 
3607 /*
3608  * preallocate space for a file. This implements ext4's fallocate inode
3609  * operation, which gets called from sys_fallocate system call.
3610  * For block-mapped files, posix_fallocate should fall back to the method
3611  * of writing zeroes to the required new blocks (the same behavior which is
3612  * expected for file systems which do not support fallocate() system call).
3613  */
3614 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3615 {
3616 	handle_t *handle;
3617 	loff_t new_size;
3618 	unsigned int max_blocks;
3619 	int ret = 0;
3620 	int ret2 = 0;
3621 	int retries = 0;
3622 	struct ext4_map_blocks map;
3623 	unsigned int credits, blkbits = inode->i_blkbits;
3624 
3625 	/*
3626 	 * currently supporting (pre)allocate mode for extent-based
3627 	 * files _only_
3628 	 */
3629 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3630 		return -EOPNOTSUPP;
3631 
3632 	/* preallocation to directories is currently not supported */
3633 	if (S_ISDIR(inode->i_mode))
3634 		return -ENODEV;
3635 
3636 	map.m_lblk = offset >> blkbits;
3637 	/*
3638 	 * We can't just convert len to max_blocks because
3639 	 * If blocksize = 4096 offset = 3072 and len = 2048
3640 	 */
3641 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3642 		- map.m_lblk;
3643 	/*
3644 	 * credits to insert 1 extent into extent tree
3645 	 */
3646 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3647 	mutex_lock(&inode->i_mutex);
3648 	ret = inode_newsize_ok(inode, (len + offset));
3649 	if (ret) {
3650 		mutex_unlock(&inode->i_mutex);
3651 		return ret;
3652 	}
3653 retry:
3654 	while (ret >= 0 && ret < max_blocks) {
3655 		map.m_lblk = map.m_lblk + ret;
3656 		map.m_len = max_blocks = max_blocks - ret;
3657 		handle = ext4_journal_start(inode, credits);
3658 		if (IS_ERR(handle)) {
3659 			ret = PTR_ERR(handle);
3660 			break;
3661 		}
3662 		ret = ext4_map_blocks(handle, inode, &map,
3663 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3664 		if (ret <= 0) {
3665 #ifdef EXT4FS_DEBUG
3666 			WARN_ON(ret <= 0);
3667 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3668 				    "returned error inode#%lu, block=%u, "
3669 				    "max_blocks=%u", __func__,
3670 				    inode->i_ino, map.m_lblk, max_blocks);
3671 #endif
3672 			ext4_mark_inode_dirty(handle, inode);
3673 			ret2 = ext4_journal_stop(handle);
3674 			break;
3675 		}
3676 		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3677 						blkbits) >> blkbits))
3678 			new_size = offset + len;
3679 		else
3680 			new_size = (map.m_lblk + ret) << blkbits;
3681 
3682 		ext4_falloc_update_inode(inode, mode, new_size,
3683 					 (map.m_flags & EXT4_MAP_NEW));
3684 		ext4_mark_inode_dirty(handle, inode);
3685 		ret2 = ext4_journal_stop(handle);
3686 		if (ret2)
3687 			break;
3688 	}
3689 	if (ret == -ENOSPC &&
3690 			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3691 		ret = 0;
3692 		goto retry;
3693 	}
3694 	mutex_unlock(&inode->i_mutex);
3695 	return ret > 0 ? ret2 : ret;
3696 }
3697 
3698 /*
3699  * This function convert a range of blocks to written extents
3700  * The caller of this function will pass the start offset and the size.
3701  * all unwritten extents within this range will be converted to
3702  * written extents.
3703  *
3704  * This function is called from the direct IO end io call back
3705  * function, to convert the fallocated extents after IO is completed.
3706  * Returns 0 on success.
3707  */
3708 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3709 				    ssize_t len)
3710 {
3711 	handle_t *handle;
3712 	unsigned int max_blocks;
3713 	int ret = 0;
3714 	int ret2 = 0;
3715 	struct ext4_map_blocks map;
3716 	unsigned int credits, blkbits = inode->i_blkbits;
3717 
3718 	map.m_lblk = offset >> blkbits;
3719 	/*
3720 	 * We can't just convert len to max_blocks because
3721 	 * If blocksize = 4096 offset = 3072 and len = 2048
3722 	 */
3723 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3724 		      map.m_lblk);
3725 	/*
3726 	 * credits to insert 1 extent into extent tree
3727 	 */
3728 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3729 	while (ret >= 0 && ret < max_blocks) {
3730 		map.m_lblk += ret;
3731 		map.m_len = (max_blocks -= ret);
3732 		handle = ext4_journal_start(inode, credits);
3733 		if (IS_ERR(handle)) {
3734 			ret = PTR_ERR(handle);
3735 			break;
3736 		}
3737 		ret = ext4_map_blocks(handle, inode, &map,
3738 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3739 		if (ret <= 0) {
3740 			WARN_ON(ret <= 0);
3741 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3742 				    "returned error inode#%lu, block=%u, "
3743 				    "max_blocks=%u", __func__,
3744 				    inode->i_ino, map.m_lblk, map.m_len);
3745 		}
3746 		ext4_mark_inode_dirty(handle, inode);
3747 		ret2 = ext4_journal_stop(handle);
3748 		if (ret <= 0 || ret2 )
3749 			break;
3750 	}
3751 	return ret > 0 ? ret2 : ret;
3752 }
3753 /*
3754  * Callback function called for each extent to gather FIEMAP information.
3755  */
3756 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3757 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3758 		       void *data)
3759 {
3760 	struct fiemap_extent_info *fieinfo = data;
3761 	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3762 	__u64	logical;
3763 	__u64	physical;
3764 	__u64	length;
3765 	__u32	flags = 0;
3766 	int	error;
3767 
3768 	logical =  (__u64)newex->ec_block << blksize_bits;
3769 
3770 	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3771 		pgoff_t offset;
3772 		struct page *page;
3773 		struct buffer_head *bh = NULL;
3774 
3775 		offset = logical >> PAGE_SHIFT;
3776 		page = find_get_page(inode->i_mapping, offset);
3777 		if (!page || !page_has_buffers(page))
3778 			return EXT_CONTINUE;
3779 
3780 		bh = page_buffers(page);
3781 
3782 		if (!bh)
3783 			return EXT_CONTINUE;
3784 
3785 		if (buffer_delay(bh)) {
3786 			flags |= FIEMAP_EXTENT_DELALLOC;
3787 			page_cache_release(page);
3788 		} else {
3789 			page_cache_release(page);
3790 			return EXT_CONTINUE;
3791 		}
3792 	}
3793 
3794 	physical = (__u64)newex->ec_start << blksize_bits;
3795 	length =   (__u64)newex->ec_len << blksize_bits;
3796 
3797 	if (ex && ext4_ext_is_uninitialized(ex))
3798 		flags |= FIEMAP_EXTENT_UNWRITTEN;
3799 
3800 	/*
3801 	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3802 	 *
3803 	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3804 	 * this also indicates no more allocated blocks.
3805 	 *
3806 	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3807 	 */
3808 	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3809 	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3810 		loff_t size = i_size_read(inode);
3811 		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3812 
3813 		flags |= FIEMAP_EXTENT_LAST;
3814 		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3815 		    logical+length > size)
3816 			length = (size - logical + bs - 1) & ~(bs-1);
3817 	}
3818 
3819 	error = fiemap_fill_next_extent(fieinfo, logical, physical,
3820 					length, flags);
3821 	if (error < 0)
3822 		return error;
3823 	if (error == 1)
3824 		return EXT_BREAK;
3825 
3826 	return EXT_CONTINUE;
3827 }
3828 
3829 /* fiemap flags we can handle specified here */
3830 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3831 
/*
 * Report the location of an inode's extended-attribute data for FIEMAP.
 *
 * xattrs either live inside the inode body after the extra fields
 * (reported with FIEMAP_EXTENT_DATA_INLINE) or in the separate
 * i_file_acl block.  Emits at most one fiemap extent, always flagged
 * FIEMAP_EXTENT_LAST.  Returns 0 or a negative error.
 */
static int ext4_xattr_fiemap(struct inode *inode,
				struct fiemap_extent_info *fieinfo)
{
	__u64 physical = 0;
	__u64 length;
	__u32 flags = FIEMAP_EXTENT_LAST;
	int blockbits = inode->i_sb->s_blocksize_bits;
	int error = 0;

	/* in-inode? */
	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct ext4_iloc iloc;
		int offset;	/* offset of xattr in inode */

		error = ext4_get_inode_loc(inode, &iloc);
		if (error)
			return error;
		physical = iloc.bh->b_blocknr << blockbits;
		/* xattr space starts right after the extended inode fields */
		offset = EXT4_GOOD_OLD_INODE_SIZE +
				EXT4_I(inode)->i_extra_isize;
		physical += offset;
		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
		flags |= FIEMAP_EXTENT_DATA_INLINE;
		brelse(iloc.bh);
	} else { /* external block */
		physical = EXT4_I(inode)->i_file_acl << blockbits;
		length = inode->i_sb->s_blocksize;
	}

	/* physical == 0 means there is no xattr block: report nothing */
	if (physical)
		error = fiemap_fill_next_extent(fieinfo, 0, physical,
						length, flags);
	return (error < 0 ? error : 0);
}
3866 
3867 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3868 		__u64 start, __u64 len)
3869 {
3870 	ext4_lblk_t start_blk;
3871 	int error = 0;
3872 
3873 	/* fallback to generic here if not in extents fmt */
3874 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3875 		return generic_block_fiemap(inode, fieinfo, start, len,
3876 			ext4_get_block);
3877 
3878 	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3879 		return -EBADR;
3880 
3881 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3882 		error = ext4_xattr_fiemap(inode, fieinfo);
3883 	} else {
3884 		ext4_lblk_t len_blks;
3885 		__u64 last_blk;
3886 
3887 		start_blk = start >> inode->i_sb->s_blocksize_bits;
3888 		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3889 		if (last_blk >= EXT_MAX_BLOCK)
3890 			last_blk = EXT_MAX_BLOCK-1;
3891 		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3892 
3893 		/*
3894 		 * Walk the extent tree gathering extent information.
3895 		 * ext4_ext_fiemap_cb will push extents back to user.
3896 		 */
3897 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
3898 					  ext4_ext_fiemap_cb, fieinfo);
3899 	}
3900 
3901 	return error;
3902 }
3903 
3904