xref: /freebsd/sys/fs/ext2fs/ext2_extents.c (revision 5def4c47d4bd90b209b9b4a4ba9faec15846d8fd)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 Zheng Liu <lz@freebsd.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/types.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/vnode.h>
37 #include <sys/bio.h>
38 #include <sys/buf.h>
39 #include <sys/endian.h>
40 #include <sys/conf.h>
41 #include <sys/sdt.h>
42 #include <sys/stat.h>
43 
44 #include <fs/ext2fs/ext2_mount.h>
45 #include <fs/ext2fs/fs.h>
46 #include <fs/ext2fs/inode.h>
47 #include <fs/ext2fs/ext2fs.h>
48 #include <fs/ext2fs/ext2_extents.h>
49 #include <fs/ext2fs/ext2_extern.h>
50 
51 SDT_PROVIDER_DECLARE(ext2fs);
52 /*
53  * ext2fs trace probe:
54  * arg0: verbosity. Higher numbers give more verbose messages
55  * arg1: Textual message
56  */
57 SDT_PROBE_DEFINE2(ext2fs, , trace, extents, "int", "char*");
58 
59 static MALLOC_DEFINE(M_EXT2EXTENTS, "ext2_extents", "EXT2 extents");
60 
61 #ifdef EXT2FS_PRINT_EXTENTS
62 static const bool print_extents_walk = true;
63 
64 static int ext4_ext_check_header(struct inode *, struct ext4_extent_header *);
65 static int ext4_ext_walk_header(struct inode *, struct ext4_extent_header *);
66 static inline e4fs_daddr_t ext4_ext_index_pblock(struct ext4_extent_index *);
67 static inline e4fs_daddr_t ext4_ext_extent_pblock(struct ext4_extent *);
68 
69 static int
70 ext4_ext_blk_check(struct inode *ip, e4fs_daddr_t blk)
71 {
72 	struct m_ext2fs *fs;
73 
74 	fs = ip->i_e2fs;
75 
76 	if (blk < fs->e2fs->e2fs_first_dblock || blk >= fs->e2fs_bcount)
77 		return (EIO);
78 
79 	return (0);
80 }
81 
/*
 * Debug helper: print one index entry and, when do_walk is true, read the
 * child tree node it points to and recursively walk it.
 */
static int
ext4_ext_walk_index(struct inode *ip, struct ext4_extent_index *ex, bool do_walk)
{
	struct m_ext2fs *fs;
	struct buf *bp;
	e4fs_daddr_t blk;
	int error;

	fs = ip->i_e2fs;

	if (print_extents_walk)
		printf("    index %p => (blk %u pblk %ju)\n", ex,
		    le32toh(ex->ei_blk), (uint64_t)le16toh(ex->ei_leaf_hi) << 32 |
		    le32toh(ex->ei_leaf_lo));

	if(!do_walk)
		return (0);

	/* Range-check the child block number before issuing I/O on it. */
	blk = ext4_ext_index_pblock(ex);
	error = ext4_ext_blk_check(ip, blk);
	if (error)
		return (error);

	/* Read the child node and recurse through its header. */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, blk), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
		brelse(bp);
		return (error);
	}

	error = ext4_ext_walk_header(ip, (struct ext4_extent_header *)bp->b_data);

	brelse(bp);

	return (error);
}
117 
118 static int
119 ext4_ext_walk_extent(struct inode *ip, struct ext4_extent *ep)
120 {
121 	e4fs_daddr_t blk;
122 	int error;
123 
124 	blk = ext4_ext_extent_pblock(ep);
125 	error = ext4_ext_blk_check(ip, blk);
126 	if (error)
127 		return (error);
128 
129 	if (print_extents_walk)
130 		printf("    ext %p => (blk %u len %u start %ju)\n",
131 		    ep, le32toh(ep->e_blk), le16toh(ep->e_len),
132 		    (uint64_t)blk);
133 
134 	return (0);
135 }
136 
137 static int
138 ext4_ext_walk_header(struct inode *ip, struct ext4_extent_header *eh)
139 {
140 	int i, error = 0;
141 
142 	error = ext4_ext_check_header(ip, eh);
143 	if (error)
144 		return (error);
145 
146 	if (print_extents_walk)
147 		printf("header %p => (entries %d max %d depth %d gen %d)\n",
148 		    eh, le16toh(eh->eh_ecount),
149 		    le16toh(eh->eh_max), le16toh(eh->eh_depth), le32toh(eh->eh_gen));
150 
151 	for (i = 0; i < le16toh(eh->eh_ecount) && error == 0; i++)
152 		if (eh->eh_depth != 0)
153 			error = ext4_ext_walk_index(ip,
154 			    (struct ext4_extent_index *)(eh + 1 + i), true);
155 		else
156 			error = ext4_ext_walk_extent(ip, (struct ext4_extent *)(eh + 1 + i));
157 
158 	return (error);
159 }
160 
161 static int
162 ext4_ext_print_path(struct inode *ip, struct ext4_extent_path *path)
163 {
164 	int k, l, error = 0;
165 
166 	l = path->ep_depth;
167 
168 	if (print_extents_walk)
169 		printf("ip=%ju, Path:\n", ip->i_number);
170 
171 	for (k = 0; k <= l && error == 0; k++, path++) {
172 		if (path->ep_index) {
173 			error = ext4_ext_walk_index(ip, path->ep_index, false);
174 		} else if (path->ep_ext) {
175 			error = ext4_ext_walk_extent(ip, path->ep_ext);
176 		}
177 	}
178 
179 	return (error);
180 }
181 
182 int
183 ext4_ext_walk(struct inode *ip)
184 {
185 	struct ext4_extent_header *ehp;
186 
187 	ehp = (struct ext4_extent_header *)ip->i_db;
188 
189 	if (print_extents_walk)
190 		printf("Extent status:ip=%ju\n", ip->i_number);
191 
192 	if (!(ip->i_flag & IN_E4EXTENTS))
193 		return (0);
194 
195 	return (ext4_ext_walk_header(ip, ehp));
196 }
197 #endif
198 
199 static inline struct ext4_extent_header *
200 ext4_ext_inode_header(struct inode *ip)
201 {
202 
203 	return ((struct ext4_extent_header *)ip->i_db);
204 }
205 
/*
 * Return the node header at the start of an on-disk tree block.
 */
static inline struct ext4_extent_header *
ext4_ext_block_header(char *bdata)
{
	return ((struct ext4_extent_header *)bdata);
}
212 
213 static inline unsigned short
214 ext4_ext_inode_depth(struct inode *ip)
215 {
216 	struct ext4_extent_header *ehp;
217 
218 	ehp = (struct ext4_extent_header *)ip->i_data;
219 	return (le16toh(ehp->eh_depth));
220 }
221 
222 static inline e4fs_daddr_t
223 ext4_ext_index_pblock(struct ext4_extent_index *index)
224 {
225 	e4fs_daddr_t blk;
226 
227 	blk = le32toh(index->ei_leaf_lo);
228 	blk |= (e4fs_daddr_t)le16toh(index->ei_leaf_hi) << 32;
229 
230 	return (blk);
231 }
232 
233 static inline void
234 ext4_index_store_pblock(struct ext4_extent_index *index, e4fs_daddr_t pb)
235 {
236 
237 	index->ei_leaf_lo = htole32(pb & 0xffffffff);
238 	index->ei_leaf_hi = htole16((pb >> 32) & 0xffff);
239 }
240 
241 static inline e4fs_daddr_t
242 ext4_ext_extent_pblock(struct ext4_extent *extent)
243 {
244 	e4fs_daddr_t blk;
245 
246 	blk = le32toh(extent->e_start_lo);
247 	blk |= (e4fs_daddr_t)le16toh(extent->e_start_hi) << 32;
248 
249 	return (blk);
250 }
251 
252 static inline void
253 ext4_ext_store_pblock(struct ext4_extent *ex, e4fs_daddr_t pb)
254 {
255 
256 	ex->e_start_lo = htole32(pb & 0xffffffff);
257 	ex->e_start_hi = htole16((pb >> 32) & 0xffff);
258 }
259 
260 int
261 ext4_ext_in_cache(struct inode *ip, daddr_t lbn, struct ext4_extent *ep)
262 {
263 	struct ext4_extent_cache *ecp;
264 	int ret = EXT4_EXT_CACHE_NO;
265 
266 	ecp = &ip->i_ext_cache;
267 	if (ecp->ec_type == EXT4_EXT_CACHE_NO)
268 		return (ret);
269 
270 	if (lbn >= ecp->ec_blk && lbn < ecp->ec_blk + ecp->ec_len) {
271 		ep->e_blk = htole32(ecp->ec_blk);
272 		ep->e_start_lo = htole32(ecp->ec_start & 0xffffffff);
273 		ep->e_start_hi = htole16(ecp->ec_start >> 32 & 0xffff);
274 		ep->e_len = htole16(ecp->ec_len);
275 		ret = ecp->ec_type;
276 	}
277 	return (ret);
278 }
279 
280 static int
281 ext4_ext_check_header(struct inode *ip, struct ext4_extent_header *eh)
282 {
283 	struct m_ext2fs *fs;
284 	char *error_msg;
285 
286 	fs = ip->i_e2fs;
287 
288 	if (le16toh(eh->eh_magic) != EXT4_EXT_MAGIC) {
289 		error_msg = "header: invalid magic";
290 		goto corrupted;
291 	}
292 	if (eh->eh_max == 0) {
293 		error_msg = "header: invalid eh_max";
294 		goto corrupted;
295 	}
296 	if (le16toh(eh->eh_ecount) > le16toh(eh->eh_max)) {
297 		error_msg = "header: invalid eh_entries";
298 		goto corrupted;
299 	}
300 
301 	return (0);
302 
303 corrupted:
304 	SDT_PROBE2(ext2fs, , trace, extents, 1, error_msg);
305 	return (EIO);
306 }
307 
308 static void
309 ext4_ext_binsearch_index(struct ext4_extent_path *path, int blk)
310 {
311 	struct ext4_extent_header *eh;
312 	struct ext4_extent_index *r, *l, *m;
313 
314 	eh = path->ep_header;
315 
316 	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max) &&
317 	    le16toh(eh->eh_ecount) > 0,
318 	    ("ext4_ext_binsearch_index: bad args"));
319 
320 	l = EXT_FIRST_INDEX(eh) + 1;
321 	r = EXT_FIRST_INDEX(eh) + le16toh(eh->eh_ecount) - 1;
322 	while (l <= r) {
323 		m = l + (r - l) / 2;
324 		if (blk < le32toh(m->ei_blk))
325 			r = m - 1;
326 		else
327 			l = m + 1;
328 	}
329 
330 	path->ep_index = l - 1;
331 }
332 
333 static void
334 ext4_ext_binsearch_ext(struct ext4_extent_path *path, int blk)
335 {
336 	struct ext4_extent_header *eh;
337 	struct ext4_extent *r, *l, *m;
338 
339 	eh = path->ep_header;
340 
341 	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max),
342 	    ("ext4_ext_binsearch_ext: bad args"));
343 
344 	if (eh->eh_ecount == 0)
345 		return;
346 
347 	l = EXT_FIRST_EXTENT(eh) + 1;
348 	r = EXT_FIRST_EXTENT(eh) + le16toh(eh->eh_ecount) - 1;
349 
350 	while (l <= r) {
351 		m = l + (r - l) / 2;
352 		if (blk < le32toh(m->e_blk))
353 			r = m - 1;
354 		else
355 			l = m + 1;
356 	}
357 
358 	path->ep_ext = l - 1;
359 }
360 
361 static int
362 ext4_ext_fill_path_bdata(struct ext4_extent_path *path,
363     struct buf *bp, uint64_t blk)
364 {
365 
366 	KASSERT(path->ep_data == NULL,
367 	    ("ext4_ext_fill_path_bdata: bad ep_data"));
368 
369 	path->ep_data = malloc(bp->b_bufsize, M_EXT2EXTENTS, M_WAITOK);
370 	memcpy(path->ep_data, bp->b_data, bp->b_bufsize);
371 	path->ep_blk = blk;
372 
373 	return (0);
374 }
375 
376 static void
377 ext4_ext_fill_path_buf(struct ext4_extent_path *path, struct buf *bp)
378 {
379 
380 	KASSERT(path->ep_data != NULL,
381 	    ("ext4_ext_fill_path_buf: bad ep_data"));
382 
383 	memcpy(bp->b_data, path->ep_data, bp->b_bufsize);
384 }
385 
386 static void
387 ext4_ext_drop_refs(struct ext4_extent_path *path)
388 {
389 	int depth, i;
390 
391 	if (!path)
392 		return;
393 
394 	depth = path->ep_depth;
395 	for (i = 0; i <= depth; i++, path++)
396 		if (path->ep_data) {
397 			free(path->ep_data, M_EXT2EXTENTS);
398 			path->ep_data = NULL;
399 		}
400 }
401 
402 void
403 ext4_ext_path_free(struct ext4_extent_path *path)
404 {
405 
406 	if (!path)
407 		return;
408 
409 	ext4_ext_drop_refs(path);
410 	free(path, M_EXT2EXTENTS);
411 }
412 
/*
 * Walk the extent tree from the in-inode root down to the leaf covering
 * logical block "block", recording each visited node in *ppath.
 *
 * If *ppath is NULL a path array of EXT4_EXT_DEPTH_MAX levels is
 * allocated and returned through ppath; the caller frees it with
 * ext4_ext_path_free().  On error the path's cached buffers are dropped,
 * a caller-supplied path is NOT freed but *ppath is set to NULL
 * regardless, and an errno is returned.
 */
int
ext4_ext_find_extent(struct inode *ip, daddr_t block,
    struct ext4_extent_path **ppath)
{
	struct m_ext2fs *fs;
	struct ext4_extent_header *eh;
	struct ext4_extent_path *path;
	struct buf *bp;
	uint64_t blk;
	int error, depth, i, ppos, alloc;

	fs = ip->i_e2fs;
	eh = ext4_ext_inode_header(ip);
	depth = ext4_ext_inode_depth(ip);
	ppos = 0;
	alloc = 0;

	/* Validate the root header before trusting its depth. */
	error = ext4_ext_check_header(ip, eh);
	if (error)
		return (error);

	if (ppath == NULL)
		return (EINVAL);

	path = *ppath;
	if (path == NULL) {
		path = malloc(EXT4_EXT_DEPTH_MAX *
		    sizeof(struct ext4_extent_path),
		    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
		*ppath = path;
		alloc = 1;
	}

	path[0].ep_header = eh;
	path[0].ep_data = NULL;

	/* Walk through the tree. */
	i = depth;
	while (i) {
		/* Pick the child index covering "block" at this level. */
		ext4_ext_binsearch_index(&path[ppos], block);
		blk = ext4_ext_index_pblock(path[ppos].ep_index);
		path[ppos].ep_depth = i;
		path[ppos].ep_ext = NULL;

		error = bread(ip->i_devvp, fsbtodb(ip->i_e2fs, blk),
		    ip->i_e2fs->e2fs_bsize, NOCRED, &bp);
		if (error) {
			goto error;
		}

		ppos++;
		/* An on-disk depth larger than the root claims is corruption. */
		if (ppos > depth) {
			SDT_PROBE2(ext2fs, , trace, extents, 1,
			    "ppos > depth => extent corrupted");
			error = EIO;
			brelse(bp);
			goto error;
		}

		/* Cache a private copy and release the buffer early. */
		ext4_ext_fill_path_bdata(&path[ppos], bp, blk);
		bqrelse(bp);

		/* Verify header and checksum of the child node. */
		eh = ext4_ext_block_header(path[ppos].ep_data);
		if (ext4_ext_check_header(ip, eh) ||
		    ext2_extent_blk_csum_verify(ip, path[ppos].ep_data)) {
			error = EIO;
			goto error;
		}

		path[ppos].ep_header = eh;

		i--;
	}

	error = ext4_ext_check_header(ip, eh);
	if (error)
		goto error;

	/* Find extent. */
	path[ppos].ep_depth = i;
	path[ppos].ep_header = eh;
	path[ppos].ep_ext = NULL;
	path[ppos].ep_index = NULL;
	ext4_ext_binsearch_ext(&path[ppos], block);
	return (0);

error:
	/* Drop cached buffers; free the array only if we allocated it. */
	ext4_ext_drop_refs(path);
	if (alloc)
		free(path, M_EXT2EXTENTS);

	*ppath = NULL;

	return (error);
}
508 
509 static inline int
510 ext4_ext_space_root(struct inode *ip)
511 {
512 	int size;
513 
514 	size = sizeof(ip->i_data);
515 	size -= sizeof(struct ext4_extent_header);
516 	size /= sizeof(struct ext4_extent);
517 
518 	return (size);
519 }
520 
521 static inline int
522 ext4_ext_space_block(struct inode *ip)
523 {
524 	struct m_ext2fs *fs;
525 	int size;
526 
527 	fs = ip->i_e2fs;
528 
529 	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
530 	    sizeof(struct ext4_extent);
531 
532 	return (size);
533 }
534 
535 static inline int
536 ext4_ext_space_block_index(struct inode *ip)
537 {
538 	struct m_ext2fs *fs;
539 	int size;
540 
541 	fs = ip->i_e2fs;
542 
543 	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
544 	    sizeof(struct ext4_extent_index);
545 
546 	return (size);
547 }
548 
549 void
550 ext4_ext_tree_init(struct inode *ip)
551 {
552 	struct ext4_extent_header *ehp;
553 
554 	ip->i_flag |= IN_E4EXTENTS;
555 
556 	memset(ip->i_data, 0, EXT2_NDADDR + EXT2_NIADDR);
557 	ehp = (struct ext4_extent_header *)ip->i_data;
558 	ehp->eh_magic = htole16(EXT4_EXT_MAGIC);
559 	ehp->eh_max = htole16(ext4_ext_space_root(ip));
560 	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
561 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
562 	ext2_update(ip->i_vnode, 1);
563 }
564 
565 static inline void
566 ext4_ext_put_in_cache(struct inode *ip, uint32_t blk,
567 			uint32_t len, uint32_t start, int type)
568 {
569 
570 	KASSERT(len != 0, ("ext4_ext_put_in_cache: bad input"));
571 
572 	ip->i_ext_cache.ec_type = type;
573 	ip->i_ext_cache.ec_blk = blk;
574 	ip->i_ext_cache.ec_len = len;
575 	ip->i_ext_cache.ec_start = start;
576 }
577 
/*
 * Compute a preferred physical block for allocating logical block
 * "block", to keep allocations close to existing data: use the nearest
 * extent on the path if there is one, otherwise the path node's own
 * block, and as a last resort the start of the inode's block group.
 */
static e4fs_daddr_t
ext4_ext_blkpref(struct inode *ip, struct ext4_extent_path *path,
    e4fs_daddr_t block)
{
	struct m_ext2fs *fs;
	struct ext4_extent *ex;
	e4fs_daddr_t bg_start;
	int depth;

	fs = ip->i_e2fs;

	if (path) {
		depth = path->ep_depth;
		ex = path[depth].ep_ext;
		if (ex) {
			e4fs_daddr_t pblk = ext4_ext_extent_pblock(ex);
			e2fs_daddr_t blk = le32toh(ex->e_blk);

			/* Preserve the logical-to-physical offset. */
			if (block > blk)
				return (pblk + (block - blk));
			else
				return (pblk - (blk - block));
		}

		/* Try to get block from index itself. */
		if (path[depth].ep_data)
			return (path[depth].ep_blk);
	}

	/* Use inode's group. */
	bg_start = (ip->i_block_group * EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) +
	    le32toh(fs->e2fs->e2fs_first_dblock);

	return (bg_start + block);
}
613 
614 static int inline
615 ext4_can_extents_be_merged(struct ext4_extent *ex1,
616     struct ext4_extent *ex2)
617 {
618 
619 	if (le32toh(ex1->e_blk) + le16toh(ex1->e_len) != le32toh(ex2->e_blk))
620 		return (0);
621 
622 	if (le16toh(ex1->e_len) + le16toh(ex2->e_len) > EXT4_MAX_LEN)
623 		return (0);
624 
625 	if (ext4_ext_extent_pblock(ex1) + le16toh(ex1->e_len) ==
626 	    ext4_ext_extent_pblock(ex2))
627 		return (1);
628 
629 	return (0);
630 }
631 
632 static unsigned
633 ext4_ext_next_leaf_block(struct inode *ip, struct ext4_extent_path *path)
634 {
635 	int depth = path->ep_depth;
636 
637 	/* Empty tree */
638 	if (depth == 0)
639 		return (EXT4_MAX_BLOCKS);
640 
641 	/* Go to indexes. */
642 	depth--;
643 
644 	while (depth >= 0) {
645 		if (path[depth].ep_index !=
646 		    EXT_LAST_INDEX(path[depth].ep_header))
647 			return (le32toh(path[depth].ep_index[1].ei_blk));
648 
649 		depth--;
650 	}
651 
652 	return (EXT4_MAX_BLOCKS);
653 }
654 
/*
 * Write a modified tree node back to stable storage.  Nodes held in
 * separate blocks go through the buffer cache with a refreshed checksum;
 * the root node lives inside the inode, so it is flushed by marking the
 * inode dirty and updating it instead.
 */
static int
ext4_ext_dirty(struct inode *ip, struct ext4_extent_path *path)
{
	struct m_ext2fs *fs;
	struct buf *bp;
	uint64_t blk;
	int error;

	fs = ip->i_e2fs;

	if (!path)
		return (EINVAL);

	if (path->ep_data) {
		/* On-disk node: copy the cached data into a buffer. */
		blk = path->ep_blk;
		bp = getblk(ip->i_devvp, fsbtodb(fs, blk),
		    fs->e2fs_bsize, 0, 0, 0);
		if (!bp)
			return (EIO);
		ext4_ext_fill_path_buf(path, bp);
		ext2_extent_blk_csum_set(ip, bp->b_data);
		error = bwrite(bp);
	} else {
		/* Root node: stored in the inode itself. */
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		error = ext2_update(ip->i_vnode, 1);
	}

	return (error);
}
684 
685 static int
686 ext4_ext_insert_index(struct inode *ip, struct ext4_extent_path *path,
687     uint32_t lblk, e4fs_daddr_t blk)
688 {
689 	struct m_ext2fs *fs;
690 	struct ext4_extent_index *idx;
691 	int len;
692 
693 	fs = ip->i_e2fs;
694 
695 	if (lblk == le32toh(path->ep_index->ei_blk)) {
696 		SDT_PROBE2(ext2fs, , trace, extents, 1,
697 		    "lblk == index blk => extent corrupted");
698 		return (EIO);
699 	}
700 
701 	if (le16toh(path->ep_header->eh_ecount) >=
702 	    le16toh(path->ep_header->eh_max)) {
703 		SDT_PROBE2(ext2fs, , trace, extents, 1,
704 		    "ecout > maxcount => extent corrupted");
705 		return (EIO);
706 	}
707 
708 	if (lblk > le32toh(path->ep_index->ei_blk)) {
709 		/* Insert after. */
710 		idx = path->ep_index + 1;
711 	} else {
712 		/* Insert before. */
713 		idx = path->ep_index;
714 	}
715 
716 	len = EXT_LAST_INDEX(path->ep_header) - idx + 1;
717 	if (len > 0)
718 		memmove(idx + 1, idx, len * sizeof(struct ext4_extent_index));
719 
720 	if (idx > EXT_MAX_INDEX(path->ep_header)) {
721 		SDT_PROBE2(ext2fs, , trace, extents, 1,
722 		    "index is out of range => extent corrupted");
723 		return (EIO);
724 	}
725 
726 	idx->ei_blk = htole32(lblk);
727 	ext4_index_store_pblock(idx, blk);
728 	path->ep_header->eh_ecount =
729 	    htole16(le16toh(path->ep_header->eh_ecount) + 1);
730 
731 	return (ext4_ext_dirty(ip, path));
732 }
733 
734 static e4fs_daddr_t
735 ext4_ext_alloc_meta(struct inode *ip)
736 {
737 	e4fs_daddr_t blk = ext2_alloc_meta(ip);
738 	if (blk) {
739 		ip->i_blocks += btodb(ip->i_e2fs->e2fs_bsize);
740 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
741 		ext2_update(ip->i_vnode, 1);
742 	}
743 
744 	return (blk);
745 }
746 
747 static void
748 ext4_ext_blkfree(struct inode *ip, uint64_t blk, int count, int flags)
749 {
750 	struct m_ext2fs *fs;
751 	int i, blocksreleased;
752 
753 	fs = ip->i_e2fs;
754 	blocksreleased = count;
755 
756 	for(i = 0; i < count; i++)
757 		ext2_blkfree(ip, blk + i, fs->e2fs_bsize);
758 
759 	if (ip->i_blocks >= blocksreleased)
760 		ip->i_blocks -= (btodb(fs->e2fs_bsize)*blocksreleased);
761 	else
762 		ip->i_blocks = 0;
763 
764 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
765 	ext2_update(ip->i_vnode, 1);
766 }
767 
768 static int
769 ext4_ext_split(struct inode *ip, struct ext4_extent_path *path,
770     struct ext4_extent *newext, int at)
771 {
772 	struct m_ext2fs *fs;
773 	struct  buf *bp;
774 	int depth = ext4_ext_inode_depth(ip);
775 	struct ext4_extent_header *neh;
776 	struct ext4_extent_index *fidx;
777 	struct ext4_extent *ex;
778 	int i = at, k, m, a;
779 	e4fs_daddr_t newblk, oldblk;
780 	uint32_t border;
781 	e4fs_daddr_t *ablks = NULL;
782 	int error = 0;
783 
784 	fs = ip->i_e2fs;
785 	bp = NULL;
786 
787 	/*
788 	 * We will split at current extent for now.
789 	 */
790 	if (path[depth].ep_ext > EXT_MAX_EXTENT(path[depth].ep_header)) {
791 		SDT_PROBE2(ext2fs, , trace, extents, 1,
792 		    "extent is out of range => extent corrupted");
793 		return (EIO);
794 	}
795 
796 	if (path[depth].ep_ext != EXT_MAX_EXTENT(path[depth].ep_header))
797 		border = le32toh(path[depth].ep_ext[1].e_blk);
798 	else
799 		border = le32toh(newext->e_blk);
800 
801 	/* Allocate new blocks. */
802 	ablks = malloc(sizeof(e4fs_daddr_t) * depth,
803 	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
804 	for (a = 0; a < depth - at; a++) {
805 		newblk = ext4_ext_alloc_meta(ip);
806 		if (newblk == 0)
807 			goto cleanup;
808 		ablks[a] = newblk;
809 	}
810 
811 	newblk = ablks[--a];
812 	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
813 	if (!bp) {
814 		error = EIO;
815 		goto cleanup;
816 	}
817 
818 	neh = ext4_ext_block_header(bp->b_data);
819 	neh->eh_ecount = 0;
820 	neh->eh_max = le16toh(ext4_ext_space_block(ip));
821 	neh->eh_magic = le16toh(EXT4_EXT_MAGIC);
822 	neh->eh_depth = 0;
823 	ex = EXT_FIRST_EXTENT(neh);
824 
825 	if (le16toh(path[depth].ep_header->eh_ecount) !=
826 	    le16toh(path[depth].ep_header->eh_max)) {
827 		SDT_PROBE2(ext2fs, , trace, extents, 1,
828 		    "extents count out of range => extent corrupted");
829 		error = EIO;
830 		goto cleanup;
831 	}
832 
833 	/* Start copy from next extent. */
834 	m = 0;
835 	path[depth].ep_ext++;
836 	while (path[depth].ep_ext <= EXT_MAX_EXTENT(path[depth].ep_header)) {
837 		path[depth].ep_ext++;
838 		m++;
839 	}
840 	if (m) {
841 		memmove(ex, path[depth].ep_ext - m,
842 		    sizeof(struct ext4_extent) * m);
843 		neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
844 	}
845 
846 	ext2_extent_blk_csum_set(ip, bp->b_data);
847 	bwrite(bp);
848 	bp = NULL;
849 
850 	/* Fix old leaf. */
851 	if (m) {
852 		path[depth].ep_header->eh_ecount =
853 		    htole16(le16toh(path[depth].ep_header->eh_ecount) - m);
854 		ext4_ext_dirty(ip, path + depth);
855 	}
856 
857 	/* Create intermediate indexes. */
858 	k = depth - at - 1;
859 	KASSERT(k >= 0, ("ext4_ext_split: negative k"));
860 
861 	/* Insert new index into current index block. */
862 	i = depth - 1;
863 	while (k--) {
864 		oldblk = newblk;
865 		newblk = ablks[--a];
866 		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
867 		    (int)fs->e2fs_bsize, NOCRED, &bp);
868 		if (error) {
869 			goto cleanup;
870 		}
871 
872 		neh = (struct ext4_extent_header *)bp->b_data;
873 		neh->eh_ecount = htole16(1);
874 		neh->eh_magic = htole16(EXT4_EXT_MAGIC);
875 		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
876 		neh->eh_depth = htole16(depth - i);
877 		fidx = EXT_FIRST_INDEX(neh);
878 		fidx->ei_blk = htole32(border);
879 		ext4_index_store_pblock(fidx, oldblk);
880 
881 		m = 0;
882 		path[i].ep_index++;
883 		while (path[i].ep_index <= EXT_MAX_INDEX(path[i].ep_header)) {
884 			path[i].ep_index++;
885 			m++;
886 		}
887 		if (m) {
888 			memmove(++fidx, path[i].ep_index - m,
889 			    sizeof(struct ext4_extent_index) * m);
890 			neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
891 		}
892 
893 		ext2_extent_blk_csum_set(ip, bp->b_data);
894 		bwrite(bp);
895 		bp = NULL;
896 
897 		/* Fix old index. */
898 		if (m) {
899 			path[i].ep_header->eh_ecount =
900 			    htole16(le16toh(path[i].ep_header->eh_ecount) - m);
901 			ext4_ext_dirty(ip, path + i);
902 		}
903 
904 		i--;
905 	}
906 
907 	error = ext4_ext_insert_index(ip, path + at, border, newblk);
908 
909 cleanup:
910 	if (bp)
911 		brelse(bp);
912 
913 	if (error) {
914 		for (i = 0; i < depth; i++) {
915 			if (!ablks[i])
916 				continue;
917 			ext4_ext_blkfree(ip, ablks[i], 1, 0);
918 		}
919 	}
920 
921 	free(ablks, M_EXT2EXTENTS);
922 
923 	return (error);
924 }
925 
926 static int
927 ext4_ext_grow_indepth(struct inode *ip, struct ext4_extent_path *path,
928     struct ext4_extent *newext)
929 {
930 	struct m_ext2fs *fs;
931 	struct ext4_extent_path *curpath;
932 	struct ext4_extent_header *neh;
933 	struct buf *bp;
934 	e4fs_daddr_t newblk;
935 	int error = 0;
936 
937 	fs = ip->i_e2fs;
938 	curpath = path;
939 
940 	newblk = ext4_ext_alloc_meta(ip);
941 	if (newblk == 0)
942 		return (error);
943 
944 	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
945 	if (!bp)
946 		return (EIO);
947 
948 	/* Move top-level index/leaf into new block. */
949 	memmove(bp->b_data, curpath->ep_header, sizeof(ip->i_data));
950 
951 	/* Set size of new block */
952 	neh = ext4_ext_block_header(bp->b_data);
953 	neh->eh_magic = htole16(EXT4_EXT_MAGIC);
954 
955 	if (ext4_ext_inode_depth(ip))
956 		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
957 	else
958 		neh->eh_max = htole16(ext4_ext_space_block(ip));
959 
960 	ext2_extent_blk_csum_set(ip, bp->b_data);
961 	error = bwrite(bp);
962 	if (error)
963 		goto out;
964 
965 	bp = NULL;
966 
967 	curpath->ep_header->eh_magic = htole16(EXT4_EXT_MAGIC);
968 	curpath->ep_header->eh_max = htole16(ext4_ext_space_root(ip));
969 	curpath->ep_header->eh_ecount = htole16(1);
970 	curpath->ep_index = EXT_FIRST_INDEX(curpath->ep_header);
971 	curpath->ep_index->ei_blk = EXT_FIRST_EXTENT(path[0].ep_header)->e_blk;
972 	ext4_index_store_pblock(curpath->ep_index, newblk);
973 
974 	neh = ext4_ext_inode_header(ip);
975 	neh->eh_depth = htole16(path->ep_depth + 1);
976 	ext4_ext_dirty(ip, curpath);
977 out:
978 	brelse(bp);
979 
980 	return (error);
981 }
982 
/*
 * Make room in the tree for newext when its target leaf is full: either
 * split at the shallowest level that has a free index slot, or, when no
 * level has room, grow the tree by one level and retry.  The caller's
 * path is refilled afterwards so it reflects the new tree shape.
 */
static int
ext4_ext_create_new_leaf(struct inode *ip, struct ext4_extent_path *path,
    struct ext4_extent *newext)
{
	struct ext4_extent_path *curpath;
	int depth, i, error;

repeat:
	i = depth = ext4_ext_inode_depth(ip);

	/* Look for free index entry int the tree */
	curpath = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curpath)) {
		i--;
		curpath--;
	}

	/*
	 * We use already allocated block for index block,
	 * so subsequent data blocks should be contiguous.
	 */
	if (EXT_HAS_FREE_INDEX(curpath)) {
		/* Level i has room: split the tree there. */
		error = ext4_ext_split(ip, path, newext, i);
		if (error)
			goto out;

		/* Refill path. */
		ext4_ext_drop_refs(path);
		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
		if (error)
			goto out;
	} else {
		/* Tree is full, do grow in depth. */
		error = ext4_ext_grow_indepth(ip, path, newext);
		if (error)
			goto out;

		/* Refill path. */
		ext4_ext_drop_refs(path);
		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
		if (error)
			goto out;

		/* Check and split tree if required. */
		depth = ext4_ext_inode_depth(ip);
		if (le16toh(path[depth].ep_header->eh_ecount) ==
		    le16toh(path[depth].ep_header->eh_max))
			goto repeat;
	}

out:
	return (error);
}
1036 
/*
 * Propagate a changed first-extent logical block up the tree.  When the
 * first extent of a leaf has been given a new starting block, every
 * ancestor index entry that sits in the first slot of its node must be
 * updated so lookups for that range still reach the leaf.
 */
static int
ext4_ext_correct_indexes(struct inode *ip, struct ext4_extent_path *path)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int32_t border;
	int depth, k;

	depth = ext4_ext_inode_depth(ip);
	eh = path[depth].ep_header;
	ex = path[depth].ep_ext;

	if (ex == NULL || eh == NULL)
		return (EIO);

	/* Depth 0: the root is the leaf, nothing above to fix. */
	if (!depth)
		return (0);

	/* We will correct tree if first leaf got modified only. */
	if (ex != EXT_FIRST_EXTENT(eh))
		return (0);

	/* Update the parent, then keep walking towards the root. */
	k = depth - 1;
	border = le32toh(path[depth].ep_ext->e_blk);
	path[k].ep_index->ei_blk = htole32(border);
	ext4_ext_dirty(ip, path + k);
	while (k--) {
		/* Change all left-side indexes. */
		if (path[k+1].ep_index != EXT_FIRST_INDEX(path[k+1].ep_header))
			break;

		path[k].ep_index->ei_blk = htole32(border);
		ext4_ext_dirty(ip, path + k);
	}

	return (0);
}
1074 
/*
 * Insert newext into the tree at the leaf referenced by path, merging
 * with neighbours when possible.  If the leaf is full the next leaf is
 * tried, and failing that the tree is split/grown.  Invalidates the
 * inode's extent cache on the way out.
 */
static int
ext4_ext_insert_extent(struct inode *ip, struct ext4_extent_path *path,
    struct ext4_extent *newext)
{
	struct ext4_extent_header * eh;
	struct ext4_extent *ex, *nex, *nearex;
	struct ext4_extent_path *npath;
	int depth, len, error, next;

	depth = ext4_ext_inode_depth(ip);
	ex = path[depth].ep_ext;
	npath = NULL;

	/*
	 * NOTE(review): htole16() on e_len looks like it was meant to be
	 * le16toh(); the comparison against zero is endian-neutral either
	 * way, so the check behaves correctly.
	 */
	if (htole16(newext->e_len) == 0 || path[depth].ep_header == NULL)
		return (EINVAL);

	/* Insert block into found extent. */
	if (ex && ext4_can_extents_be_merged(ex, newext)) {
		ex->e_len = htole16(le16toh(ex->e_len) + le16toh(newext->e_len));
		eh = path[depth].ep_header;
		nearex = ex;
		goto merge;
	}

repeat:
	depth = ext4_ext_inode_depth(ip);
	eh = path[depth].ep_header;
	if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max))
		goto has_space;

	/* Try next leaf */
	nex = EXT_LAST_EXTENT(eh);
	next = ext4_ext_next_leaf_block(ip, path);
	if (le32toh(newext->e_blk) > le32toh(nex->e_blk) && next !=
	    EXT4_MAX_BLOCKS) {
		KASSERT(npath == NULL,
		    ("ext4_ext_insert_extent: bad path"));

		error = ext4_ext_find_extent(ip, next, &npath);
		if (error)
			goto cleanup;

		if (npath->ep_depth != path->ep_depth) {
			error = EIO;
			goto cleanup;
		}

		/* If the next leaf has room, continue insertion there. */
		eh = npath[depth].ep_header;
		if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max)) {
			path = npath;
			goto repeat;
		}
	}

	/*
	 * There is no free space in the found leaf,
	 * try to add a new leaf to the tree.
	 */
	error = ext4_ext_create_new_leaf(ip, path, newext);
	if (error)
		goto cleanup;

	/* The tree changed shape; re-read depth and header. */
	depth = ext4_ext_inode_depth(ip);
	eh = path[depth].ep_header;

has_space:
	nearex = path[depth].ep_ext;
	if (!nearex) {
		/* Create new extent in the leaf. */
		path[depth].ep_ext = EXT_FIRST_EXTENT(eh);
	} else if (le32toh(newext->e_blk) > le32toh(nearex->e_blk)) {
		/* Insert after nearex, shifting later entries right. */
		if (nearex != EXT_LAST_EXTENT(eh)) {
			len = EXT_MAX_EXTENT(eh) - nearex;
			len = (len - 1) * sizeof(struct ext4_extent);
			len = len < 0 ? 0 : len;
			memmove(nearex + 2, nearex + 1, len);
		}
		path[depth].ep_ext = nearex + 1;
	} else {
		/* Insert before nearex, shifting it and later ones right. */
		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
		len = len < 0 ? 0 : len;
		memmove(nearex + 1, nearex, len);
		path[depth].ep_ext = nearex;
	}

	/* Copy newext into the reserved slot. */
	eh->eh_ecount = htole16(le16toh(eh->eh_ecount) + 1);
	nearex = path[depth].ep_ext;
	nearex->e_blk = newext->e_blk;
	nearex->e_start_lo = newext->e_start_lo;
	nearex->e_start_hi = newext->e_start_hi;
	nearex->e_len = newext->e_len;

merge:
	/* Try to merge extents to the right. */
	while (nearex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(nearex, nearex + 1))
			break;

		/* Merge with next extent. */
		nearex->e_len = htole16(le16toh(nearex->e_len) +
		    le16toh(nearex[1].e_len));
		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
			    sizeof(struct ext4_extent);
			memmove(nearex + 1, nearex + 2, len);
		}

		eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
		KASSERT(le16toh(eh->eh_ecount) != 0,
		    ("ext4_ext_insert_extent: bad ecount"));
	}

	/*
	 * Try to merge extents to the left,
	 * start from indexes correction.
	 */
	error = ext4_ext_correct_indexes(ip, path);
	if (error)
		goto cleanup;

	ext4_ext_dirty(ip, path + depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		free(npath, M_EXT2EXTENTS);
	}

	/* The tree changed: any cached mapping may now be stale. */
	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
	return (error);
}
1206 
1207 static e4fs_daddr_t
1208 ext4_new_blocks(struct inode *ip, daddr_t lbn, e4fs_daddr_t pref,
1209     struct ucred *cred, unsigned long *count, int *perror)
1210 {
1211 	struct m_ext2fs *fs;
1212 	e4fs_daddr_t newblk;
1213 
1214 	/*
1215 	 * We will allocate only single block for now.
1216 	 */
1217 	if (*count > 1)
1218 		return (0);
1219 
1220 	fs = ip->i_e2fs;
1221 	EXT2_LOCK(ip->i_ump);
1222 	*perror = ext2_alloc(ip, lbn, pref, (int)fs->e2fs_bsize, cred, &newblk);
1223 	if (*perror)
1224 		return (0);
1225 
1226 	if (newblk) {
1227 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1228 		ext2_update(ip->i_vnode, 1);
1229 	}
1230 
1231 	return (newblk);
1232 }
1233 
/*
 * Map the logical block 'iblk' of inode 'ip' to a physical block,
 * allocating a new block from the extent tree when no mapping exists.
 *
 * max_blocks  - number of blocks the caller wants mapped; only a single
 *               block is actually allocated here (see ext4_new_blocks()).
 * cred        - credentials used for allocation and the buffer read.
 * bpp         - if non-NULL, *bpp is set to a buffer holding the block.
 * pallocated  - set to 1 when a fresh block was allocated, 0 otherwise.
 * nb          - if non-NULL, *nb receives the physical block number.
 *
 * Returns 0 on success or an errno value on failure.
 */
int
ext4_ext_get_blocks(struct inode *ip, e4fs_daddr_t iblk,
    unsigned long max_blocks, struct ucred *cred, struct buf **bpp,
    int *pallocated, daddr_t *nb)
{
	struct m_ext2fs *fs;
	struct buf *bp = NULL;
	struct ext4_extent_path *path;
	struct ext4_extent newex, *ex;
	e4fs_daddr_t bpref, newblk = 0;
	unsigned long allocated = 0;
	int error = 0, depth;

	if(bpp)
		*bpp = NULL;
	*pallocated = 0;

	/* Check cache. */
	path = NULL;
	if ((bpref = ext4_ext_in_cache(ip, iblk, &newex))) {
		if (bpref == EXT4_EXT_CACHE_IN) {
			/* Block is already allocated. */
			newblk = iblk - le32toh(newex.e_blk) +
			    ext4_ext_extent_pblock(&newex);
			allocated = le16toh(newex.e_len) - (iblk - le32toh(newex.e_blk));
			goto out;
		} else {
			/* Any other cache type is treated as an error here. */
			error = EIO;
			goto out2;
		}
	}

	/* Cache miss: walk the extent tree down to the leaf for iblk. */
	error = ext4_ext_find_extent(ip, iblk, &path);
	if (error) {
		goto out2;
	}

	depth = ext4_ext_inode_depth(ip);
	/* A non-root leaf without any extent record means corruption. */
	if (path[depth].ep_ext == NULL && depth != 0) {
		error = EIO;
		goto out2;
	}

	if ((ex = path[depth].ep_ext)) {
		uint64_t lblk = le32toh(ex->e_blk);
		uint16_t e_len  = le16toh(ex->e_len);
		e4fs_daddr_t e_start = ext4_ext_extent_pblock(ex);

		/*
		 * NOTE(review): this bails out with error still 0, so the
		 * caller sees "success" with newblk == 0 -- confirm that
		 * is the intended handling of an over-long extent.
		 */
		if (e_len > EXT4_MAX_LEN)
			goto out2;

		/* If we found extent covers block, simply return it. */
		if (iblk >= lblk && iblk < lblk + e_len) {
			newblk = iblk - lblk + e_start;
			allocated = e_len - (iblk - lblk);
			ext4_ext_put_in_cache(ip, lblk, e_len,
			    e_start, EXT4_EXT_CACHE_IN);
			goto out;
		}
	}

	/* Allocate the new block. */
	if (S_ISREG(ip->i_mode) && (!ip->i_next_alloc_block)) {
		ip->i_next_alloc_goal = 0;
	}

	bpref = ext4_ext_blkpref(ip, path, iblk);
	allocated = max_blocks;
	newblk = ext4_new_blocks(ip, iblk, bpref, cred, &allocated, &error);
	if (!newblk)
		goto out2;

	/* Try to insert new extent into found leaf and return. */
	newex.e_blk = htole32(iblk);
	ext4_ext_store_pblock(&newex, newblk);
	newex.e_len = htole16(allocated);
	error = ext4_ext_insert_extent(ip, path, &newex);
	if (error)
		goto out2;

	/* Derive the final physical block from the new extent record. */
	newblk = ext4_ext_extent_pblock(&newex);
	ext4_ext_put_in_cache(ip, iblk, allocated, newblk, EXT4_EXT_CACHE_IN);
	*pallocated = 1;

out:
	/* Never report more blocks than the caller asked for. */
	if (allocated > max_blocks)
		allocated = max_blocks;

	if (bpp)
	{
		fs = ip->i_e2fs;
		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
		    fs->e2fs_bsize, cred, &bp);
		if (error) {
			brelse(bp);
		} else {
			*bpp = bp;
		}
	}

out2:
	if (path) {
		ext4_ext_drop_refs(path);
		free(path, M_EXT2EXTENTS);
	}

	if (nb)
		*nb = newblk;

	return (error);
}
1345 
1346 static inline uint16_t
1347 ext4_ext_get_actual_len(struct ext4_extent *ext)
1348 {
1349 
1350 	return (le16toh(ext->e_len) <= EXT_INIT_MAX_LEN ?
1351 	    le16toh(ext->e_len) : (le16toh(ext->e_len) - EXT_INIT_MAX_LEN));
1352 }
1353 
1354 static inline struct ext4_extent_header *
1355 ext4_ext_header(struct inode *ip)
1356 {
1357 
1358 	return ((struct ext4_extent_header *)ip->i_db);
1359 }
1360 
1361 static int
1362 ext4_remove_blocks(struct inode *ip, struct ext4_extent *ex,
1363     unsigned long from, unsigned long to)
1364 {
1365 	unsigned long num, start;
1366 
1367 	if (from >= le32toh(ex->e_blk) &&
1368 	    to == le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - 1) {
1369 		/* Tail cleanup. */
1370 		num = le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - from;
1371 		start = ext4_ext_extent_pblock(ex) +
1372 		    ext4_ext_get_actual_len(ex) - num;
1373 		ext4_ext_blkfree(ip, start, num, 0);
1374 	}
1375 
1376 	return (0);
1377 }
1378 
1379 static int
1380 ext4_ext_rm_index(struct inode *ip, struct ext4_extent_path *path)
1381 {
1382 	e4fs_daddr_t leaf;
1383 
1384 	/* Free index block. */
1385 	path--;
1386 	leaf = ext4_ext_index_pblock(path->ep_index);
1387 	KASSERT(path->ep_header->eh_ecount != 0,
1388 	    ("ext4_ext_rm_index: bad ecount"));
1389 	path->ep_header->eh_ecount =
1390 	    htole16(le16toh(path->ep_header->eh_ecount) - 1);
1391 	ext4_ext_dirty(ip, path);
1392 	ext4_ext_blkfree(ip, leaf, 1, 0);
1393 	return (0);
1394 }
1395 
1396 static int
1397 ext4_ext_rm_leaf(struct inode *ip, struct ext4_extent_path *path,
1398     uint64_t start)
1399 {
1400 	struct ext4_extent_header *eh;
1401 	struct ext4_extent *ex;
1402 	unsigned int a, b, block, num;
1403 	unsigned long ex_blk;
1404 	unsigned short ex_len;
1405 	int depth;
1406 	int error, correct_index;
1407 
1408 	depth = ext4_ext_inode_depth(ip);
1409 	if (!path[depth].ep_header) {
1410 		if (path[depth].ep_data == NULL)
1411 			return (EINVAL);
1412 		path[depth].ep_header =
1413 		    (struct ext4_extent_header* )path[depth].ep_data;
1414 	}
1415 
1416 	eh = path[depth].ep_header;
1417 	if (!eh) {
1418 		SDT_PROBE2(ext2fs, , trace, extents, 1,
1419 		    "bad header => extent corrupted");
1420 		return (EIO);
1421 	}
1422 
1423 	ex = EXT_LAST_EXTENT(eh);
1424 	ex_blk = le32toh(ex->e_blk);
1425 	ex_len = ext4_ext_get_actual_len(ex);
1426 
1427 	error = 0;
1428 	correct_index = 0;
1429 	while (ex >= EXT_FIRST_EXTENT(eh) && ex_blk + ex_len > start) {
1430 		path[depth].ep_ext = ex;
1431 		a = ex_blk > start ? ex_blk : start;
1432 		b = (uint64_t)ex_blk + ex_len - 1 <
1433 		    EXT4_MAX_BLOCKS ? ex_blk + ex_len - 1 : EXT4_MAX_BLOCKS;
1434 
1435 		if (a != ex_blk && b != ex_blk + ex_len - 1)
1436 			return (EINVAL);
1437 		else if (a != ex_blk) {
1438 			/* Remove tail of the extent. */
1439 			block = ex_blk;
1440 			num = a - block;
1441 		} else if (b != ex_blk + ex_len - 1) {
1442 			/* Remove head of the extent, not implemented. */
1443 			return (EINVAL);
1444 		} else {
1445 			/* Remove whole extent. */
1446 			block = ex_blk;
1447 			num = 0;
1448 		}
1449 
1450 		if (ex == EXT_FIRST_EXTENT(eh))
1451 			correct_index = 1;
1452 
1453 		error = ext4_remove_blocks(ip, ex, a, b);
1454 		if (error)
1455 			goto out;
1456 
1457 		if (num == 0) {
1458 			ext4_ext_store_pblock(ex, 0);
1459 			eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
1460 		}
1461 
1462 		ex->e_blk = htole32(block);
1463 		ex->e_len = htole16(num);
1464 
1465 		ext4_ext_dirty(ip, path + depth);
1466 
1467 		ex--;
1468 		ex_blk = htole32(ex->e_blk);
1469 		ex_len = ext4_ext_get_actual_len(ex);
1470 	};
1471 
1472 	if (correct_index && le16toh(eh->eh_ecount))
1473 		error = ext4_ext_correct_indexes(ip, path);
1474 
1475 	/*
1476 	 * If this leaf is free, we should
1477 	 * remove it from index block above.
1478 	 */
1479 	if (error == 0 && eh->eh_ecount == 0 &&
1480 	    path[depth].ep_data != NULL)
1481 		error = ext4_ext_rm_index(ip, path + depth);
1482 
1483 out:
1484 	return (error);
1485 }
1486 
1487 static struct buf *
1488 ext4_read_extent_tree_block(struct inode *ip, e4fs_daddr_t pblk,
1489     int depth, int flags)
1490 {
1491 	struct m_ext2fs *fs;
1492 	struct ext4_extent_header *eh;
1493 	struct buf *bp;
1494 	int error;
1495 
1496 	fs = ip->i_e2fs;
1497 	error = bread(ip->i_devvp, fsbtodb(fs, pblk),
1498 	    fs->e2fs_bsize, NOCRED, &bp);
1499 	if (error) {
1500 		return (NULL);
1501 	}
1502 
1503 	eh = ext4_ext_block_header(bp->b_data);
1504 	if (le16toh(eh->eh_depth) != depth) {
1505 		SDT_PROBE2(ext2fs, , trace, extents, 1,
1506 		    "unexpected eh_depth");
1507 		goto err;
1508 	}
1509 
1510 	error = ext4_ext_check_header(ip, eh);
1511 	if (error)
1512 		goto err;
1513 
1514 	return (bp);
1515 
1516 err:
1517 	brelse(bp);
1518 	return (NULL);
1519 
1520 }
1521 
1522 static int inline
1523 ext4_ext_more_to_rm(struct ext4_extent_path *path)
1524 {
1525 
1526 	KASSERT(path->ep_index != NULL,
1527 	    ("ext4_ext_more_to_rm: bad index from path"));
1528 
1529 	if (path->ep_index < EXT_FIRST_INDEX(path->ep_header))
1530 		return (0);
1531 
1532 	if (le16toh(path->ep_header->eh_ecount) == path->index_count)
1533 		return (0);
1534 
1535 	return (1);
1536 }
1537 
/*
 * Remove the extent mappings at or beyond logical block 'length' from
 * the inode's extent tree.  The tree is walked iteratively with an
 * explicit per-level path stack: leaves are trimmed bottom-up and
 * index entries that become empty are removed on the way back up.
 *
 * NOTE(review): 'length' is forwarded to ext4_ext_rm_leaf() as the
 * first logical block to remove -- presumably callers pass a block
 * index despite the off_t type; confirm against the call sites.
 * 'flags', 'cred' and 'td' are unused here.
 */
int
ext4_ext_remove_space(struct inode *ip, off_t length, int flags,
    struct ucred *cred, struct thread *td)
{
	struct buf *bp;
	struct ext4_extent_header *ehp;
	struct ext4_extent_path *path;
	int depth;
	int i, error;

	/* The root header is stored in-inode in the i_db area. */
	ehp = (struct ext4_extent_header *)ip->i_db;
	depth = ext4_ext_inode_depth(ip);

	error = ext4_ext_check_header(ip, ehp);
	if(error)
		return (error);

	/* Path stack: one entry per tree level, root at index 0. */
	path = malloc(sizeof(struct ext4_extent_path) * (depth + 1),
	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
	path[0].ep_header = ehp;
	path[0].ep_depth = depth;
	i = 0;
	/* i is the current level; i == depth means we are at a leaf. */
	while (error == 0 && i >= 0) {
		if (i == depth) {
			/* This is leaf. */
			error = ext4_ext_rm_leaf(ip, path, length);
			if (error)
				break;
			free(path[i].ep_data, M_EXT2EXTENTS);
			path[i].ep_data = NULL;
			i--;
			continue;
		}

		/* This is index. */
		if (!path[i].ep_header)
			path[i].ep_header =
			    (struct ext4_extent_header *)path[i].ep_data;

		if (!path[i].ep_index) {
			/* This level hasn't touched yet. */
			path[i].ep_index = EXT_LAST_INDEX(path[i].ep_header);
			path[i].index_count =
			    le16toh(path[i].ep_header->eh_ecount) + 1;
		} else {
			/* We've already was here, see at next index. */
			path[i].ep_index--;
		}

		if (ext4_ext_more_to_rm(path + i)) {
			/* Descend: read and validate the child block. */
			memset(path + i + 1, 0, sizeof(*path));
			bp = ext4_read_extent_tree_block(ip,
			    ext4_ext_index_pblock(path[i].ep_index),
			    path[0].ep_depth - (i + 1), 0);
			if (!bp) {
				error = EIO;
				break;
			}

			ext4_ext_fill_path_bdata(&path[i+1], bp,
			    ext4_ext_index_pblock(path[i].ep_index));
			brelse(bp);
			path[i].index_count =
			    le16toh(path[i].ep_header->eh_ecount);
			i++;
		} else {
			/* Level exhausted: drop it and back up one level. */
			if (path[i].ep_header->eh_ecount == 0 && i > 0) {
				/* Index is empty, remove it. */
				error = ext4_ext_rm_index(ip, path + i);
			}
			free(path[i].ep_data, M_EXT2EXTENTS);
			path[i].ep_data = NULL;
			i--;
		}
	}

	if (path->ep_header->eh_ecount == 0) {
		/*
		 * Truncate the tree to zero.
		 */
		 ext4_ext_header(ip)->eh_depth = 0;
		 ext4_ext_header(ip)->eh_max = htole16(ext4_ext_space_root(ip));
		 ext4_ext_dirty(ip, path);
	}

	ext4_ext_drop_refs(path);
	free(path, M_EXT2EXTENTS);

	/* The extent mapping cache is stale after removal. */
	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
	return (error);
}
1629