xref: /freebsd/sys/fs/ext2fs/ext2_extents.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 Zheng Liu <lz@freebsd.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/types.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/vnode.h>
37 #include <sys/bio.h>
38 #include <sys/buf.h>
39 #include <sys/endian.h>
40 #include <sys/conf.h>
41 #include <sys/sdt.h>
42 #include <sys/stat.h>
43 
44 #include <fs/ext2fs/ext2_mount.h>
45 #include <fs/ext2fs/fs.h>
46 #include <fs/ext2fs/inode.h>
47 #include <fs/ext2fs/ext2fs.h>
48 #include <fs/ext2fs/ext2_extents.h>
49 #include <fs/ext2fs/ext2_extern.h>
50 
51 SDT_PROVIDER_DECLARE(ext2fs);
52 /*
53  * ext2fs trace probe:
54  * arg0: verbosity. Higher numbers give more verbose messages
55  * arg1: Textual message
56  */
57 SDT_PROBE_DEFINE2(ext2fs, , trace, extents, "int", "char*");
58 
59 static MALLOC_DEFINE(M_EXT2EXTENTS, "ext2_extents", "EXT2 extents");
60 
61 #ifdef EXT2FS_PRINT_EXTENTS
62 static const bool print_extents_walk = true;
63 
64 static int ext4_ext_check_header(struct inode *, struct ext4_extent_header *,
65     int);
66 static int ext4_ext_walk_header(struct inode *, struct ext4_extent_header *,
67     int);
68 static inline e4fs_daddr_t ext4_ext_index_pblock(struct ext4_extent_index *);
69 static inline e4fs_daddr_t ext4_ext_extent_pblock(struct ext4_extent *);
70 
71 static int
72 ext4_ext_blk_check(struct inode *ip, e4fs_daddr_t blk)
73 {
74 	struct m_ext2fs *fs;
75 
76 	fs = ip->i_e2fs;
77 
78 	if (blk < fs->e2fs->e2fs_first_dblock || blk >= fs->e2fs_bcount)
79 		return (EIO);
80 
81 	return (0);
82 }
83 
84 static int
85 ext4_ext_walk_index(struct inode *ip, struct ext4_extent_index *ex, int depth,
86     bool do_walk)
87 {
88 	struct m_ext2fs *fs;
89 	struct buf *bp;
90 	e4fs_daddr_t blk;
91 	int error;
92 
93 	fs = ip->i_e2fs;
94 
95 	if (print_extents_walk)
96 		printf("    index %p => (blk %u pblk %ju)\n", ex,
97 		    le32toh(ex->ei_blk),
98 		    (uint64_t)le16toh(ex->ei_leaf_hi) << 32 |
99 		    le32toh(ex->ei_leaf_lo));
100 
101 	if(!do_walk)
102 		return (0);
103 
104 	blk = ext4_ext_index_pblock(ex);
105 	error = ext4_ext_blk_check(ip, blk);
106 	if (error)
107 		return (error);
108 
109 	if ((error = bread(ip->i_devvp,
110 	    fsbtodb(fs, blk), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
111 		brelse(bp);
112 		return (error);
113 	}
114 
115 	error = ext4_ext_walk_header(ip,
116 	    (struct ext4_extent_header *)bp->b_data, depth);
117 
118 	brelse(bp);
119 
120 	return (error);
121 }
122 
123 static int
124 ext4_ext_walk_extent(struct inode *ip, struct ext4_extent *ep)
125 {
126 	e4fs_daddr_t blk;
127 	int error;
128 
129 	blk = ext4_ext_extent_pblock(ep);
130 	error = ext4_ext_blk_check(ip, blk);
131 	if (error)
132 		return (error);
133 
134 	if (print_extents_walk)
135 		printf("    ext %p => (blk %u len %u start %ju)\n",
136 		    ep, le32toh(ep->e_blk), le16toh(ep->e_len),
137 		    (uint64_t)blk);
138 
139 	return (0);
140 }
141 
142 static int
143 ext4_ext_walk_header(struct inode *ip, struct ext4_extent_header *eh, int depth)
144 {
145 	int i, error = 0;
146 
147 	error = ext4_ext_check_header(ip, eh, depth);
148 	if (error)
149 		return (error);
150 
151 	if (print_extents_walk)
152 		printf("header %p => (entries %d max %d depth %d gen %d)\n",
153 		    eh, le16toh(eh->eh_ecount),
154 		    le16toh(eh->eh_max), le16toh(eh->eh_depth),
155 		    le32toh(eh->eh_gen));
156 
157 	for (i = 0; i < le16toh(eh->eh_ecount) && error == 0; i++)
158 		if (eh->eh_depth != 0)
159 			error = ext4_ext_walk_index(ip,
160 			    (struct ext4_extent_index *)(eh + 1 + i), depth - 1,
161 			    true);
162 		else
163 			error = ext4_ext_walk_extent(ip,
164 			    (struct ext4_extent *)(eh + 1 + i));
165 
166 	return (error);
167 }
168 
169 int
170 ext4_ext_walk(struct inode *ip)
171 {
172 	struct ext4_extent_header *ehp;
173 
174 	ehp = (struct ext4_extent_header *)ip->i_db;
175 
176 	if (print_extents_walk)
177 		printf("Extent status:ip=%ju\n", ip->i_number);
178 
179 	if (!(ip->i_flag & IN_E4EXTENTS))
180 		return (0);
181 
182 	return (ext4_ext_walk_header(ip, ehp, 0));
183 }
184 
185 static int
186 ext4_ext_print_path(struct inode *ip, struct ext4_extent_path *path)
187 {
188 	int k, depth, error = 0;
189 
190 	depth = path->ep_depth;
191 
192 	if (print_extents_walk)
193 		printf("ip=%ju, Path:\n", ip->i_number);
194 
195 	for (k = 0; k <= depth && error == 0; k++, path++) {
196 		if (path->ep_index) {
197 			error = ext4_ext_walk_index(ip, path->ep_index,
198 			    depth - 1, false);
199 		} else if (path->ep_ext) {
200 			error = ext4_ext_walk_extent(ip, path->ep_ext);
201 		}
202 	}
203 
204 	return (error);
205 }
206 #endif
207 
208 static inline struct ext4_extent_header *
209 ext4_ext_inode_header(struct inode *ip)
210 {
211 
212 	return ((struct ext4_extent_header *)ip->i_db);
213 }
214 
215 static inline struct ext4_extent_header *
216 ext4_ext_block_header(char *bdata)
217 {
218 
219 	return ((struct ext4_extent_header *)bdata);
220 }
221 
222 static inline unsigned short
223 ext4_ext_inode_depth(struct inode *ip)
224 {
225 	struct ext4_extent_header *ehp;
226 
227 	ehp = (struct ext4_extent_header *)ip->i_data;
228 	return (le16toh(ehp->eh_depth));
229 }
230 
231 static inline e4fs_daddr_t
232 ext4_ext_index_pblock(struct ext4_extent_index *index)
233 {
234 	e4fs_daddr_t blk;
235 
236 	blk = le32toh(index->ei_leaf_lo);
237 	blk |= (e4fs_daddr_t)le16toh(index->ei_leaf_hi) << 32;
238 
239 	return (blk);
240 }
241 
242 static inline void
243 ext4_index_store_pblock(struct ext4_extent_index *index, e4fs_daddr_t pb)
244 {
245 
246 	index->ei_leaf_lo = htole32(pb & 0xffffffff);
247 	index->ei_leaf_hi = htole16((pb >> 32) & 0xffff);
248 }
249 
250 static inline e4fs_daddr_t
251 ext4_ext_extent_pblock(struct ext4_extent *extent)
252 {
253 	e4fs_daddr_t blk;
254 
255 	blk = le32toh(extent->e_start_lo);
256 	blk |= (e4fs_daddr_t)le16toh(extent->e_start_hi) << 32;
257 
258 	return (blk);
259 }
260 
261 static inline void
262 ext4_ext_store_pblock(struct ext4_extent *ex, e4fs_daddr_t pb)
263 {
264 
265 	ex->e_start_lo = htole32(pb & 0xffffffff);
266 	ex->e_start_hi = htole16((pb >> 32) & 0xffff);
267 }
268 
269 int
270 ext4_ext_in_cache(struct inode *ip, daddr_t lbn, struct ext4_extent *ep)
271 {
272 	struct ext4_extent_cache *ecp;
273 	int ret = EXT4_EXT_CACHE_NO;
274 
275 	ecp = &ip->i_ext_cache;
276 	if (ecp->ec_type == EXT4_EXT_CACHE_NO)
277 		return (ret);
278 
279 	if (lbn >= ecp->ec_blk && lbn < ecp->ec_blk + ecp->ec_len) {
280 		ep->e_blk = htole32(ecp->ec_blk);
281 		ep->e_start_lo = htole32(ecp->ec_start & 0xffffffff);
282 		ep->e_start_hi = htole16(ecp->ec_start >> 32 & 0xffff);
283 		ep->e_len = htole16(ecp->ec_len);
284 		ret = ecp->ec_type;
285 	}
286 	return (ret);
287 }
288 
289 static inline int
290 ext4_ext_space_root(struct inode *ip)
291 {
292 	int size;
293 
294 	size = sizeof(ip->i_data);
295 	size -= sizeof(struct ext4_extent_header);
296 	size /= sizeof(struct ext4_extent);
297 
298 	return (size);
299 }
300 
301 static inline int
302 ext4_ext_space_block(struct inode *ip)
303 {
304 	struct m_ext2fs *fs;
305 	int size;
306 
307 	fs = ip->i_e2fs;
308 
309 	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
310 	    sizeof(struct ext4_extent);
311 
312 	return (size);
313 }
314 
315 static inline int
316 ext4_ext_space_root_idx(struct inode *ip)
317 {
318 	int size;
319 
320 	size = sizeof(ip->i_data);
321 	size -= sizeof(struct ext4_extent_header);
322 	size /= sizeof(struct ext4_extent_index);
323 
324 	return (size);
325 }
326 
327 static inline int
328 ext4_ext_space_block_idx(struct inode *ip)
329 {
330 	struct m_ext2fs *fs;
331 	int size;
332 
333 	fs = ip->i_e2fs;
334 
335 	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
336 	    sizeof(struct ext4_extent_index);
337 
338 	return (size);
339 }
340 
341 static int
342 ext4_ext_max_entries(struct inode *ip, int depth)
343 {
344 
345 	if (depth == ext4_ext_inode_depth(ip)) {
346 		if (depth == 0)
347 			return (ext4_ext_space_root(ip));
348 		else
349 			return (ext4_ext_space_root_idx(ip));
350 	} else {
351 		if (depth == 0)
352 			return (ext4_ext_space_block(ip));
353 		else
354 			return (ext4_ext_space_block_idx(ip));
355 	}
356 }
357 
358 static inline uint16_t
359 ext4_ext_get_actual_len(struct ext4_extent *ext)
360 {
361 
362 	return (le16toh(ext->e_len) <= EXT_INIT_MAX_LEN ?
363 	    le16toh(ext->e_len) : (le16toh(ext->e_len) - EXT_INIT_MAX_LEN));
364 }
365 
366 
367 static int
368 ext4_inode_block_validate(struct inode *ip, e4fs_daddr_t start_blk,
369     unsigned int count)
370 {
371 	struct m_ext2fs *fs;
372 
373 	fs = ip->i_e2fs;
374 
375 	if ((start_blk <= le32toh(fs->e2fs->e2fs_first_dblock)) ||
376 	    (start_blk + count < start_blk) ||
377 	    (start_blk + count > fs->e2fs_bcount))
378 		return (EIO);
379 
380 	return (0);
381 }
382 
383 static int
384 ext4_validate_extent(struct inode *ip, struct ext4_extent *ext)
385 {
386 	e4fs_daddr_t blk = ext4_ext_extent_pblock(ext);
387 	uint32_t lblk = le32toh(ext->e_blk);
388 	int len = ext4_ext_get_actual_len(ext);
389 
390 	if (lblk + len <= lblk)
391 		return (EIO);
392 
393 	return (ext4_inode_block_validate(ip, blk, len));
394 }
395 
396 static int
397 ext4_validate_extent_idx(struct inode *ip, struct ext4_extent_index *ext_idx)
398 {
399 	e4fs_daddr_t blk = ext4_ext_index_pblock(ext_idx);
400 
401 	return (ext4_inode_block_validate(ip, blk, 1));
402 }
403 
404 static int
405 ext4_validate_extent_entries(struct inode *ip, struct ext4_extent_header *eh,
406     int depth)
407 {
408 	unsigned int count;
409 
410 	count = le16toh(eh->eh_ecount);
411 	if (count == 0)
412 		return (0);
413 
414 	if (depth == 0) {
415 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
416 		uint32_t lblk = 0;
417 		uint32_t prev = 0;
418 		int len = 0;
419 		while (count) {
420 			/* leaf entries */
421 			if (ext4_validate_extent(ip, ext))
422 				return (EIO);
423 
424 			/* Check for overlapping extents */
425 			lblk = le32toh(ext->e_blk);
426 			len = ext4_ext_get_actual_len(ext);
427 			if ((lblk <= prev) && prev)
428 				return (EIO);
429 
430 			ext++;
431 			count--;
432 			prev = lblk + len - 1;
433 		}
434 	} else {
435 		struct ext4_extent_index *ext_idx = EXT_FIRST_INDEX(eh);
436 		while (count) {
437 			if (ext4_validate_extent_idx(ip, ext_idx))
438 				return (EIO);
439 
440 			ext_idx++;
441 			count--;
442 		}
443 	}
444 
445 	return (0);
446 }
447 
448 static int
449 ext4_ext_check_header(struct inode *ip, struct ext4_extent_header *eh,
450     int depth)
451 {
452 	char *error_msg;
453 
454 	if (le16toh(eh->eh_magic) != EXT4_EXT_MAGIC) {
455 		error_msg = "header: invalid magic";
456 		goto corrupted;
457 	}
458 	if (le16toh(eh->eh_depth) != depth ||
459 	    le16toh(eh->eh_depth) > EXT4_EXT_DEPTH_MAX)
460 	{
461 		error_msg = "header: invalid eh_depth";
462 		goto corrupted;
463 	}
464 	if (eh->eh_max == 0) {
465 		error_msg = "header: invalid eh_max";
466 		goto corrupted;
467 	}
468 	if (le16toh(eh->eh_max) > ext4_ext_max_entries(ip, depth)) {
469 		error_msg = "header: too large eh_max";
470 		goto corrupted;
471 	}
472 	if (le16toh(eh->eh_ecount) > le16toh(eh->eh_max)) {
473 		error_msg = "header: invalid eh_entries";
474 		goto corrupted;
475 	}
476 	if (le16toh(eh->eh_depth) > EXT4_EXT_DEPTH_MAX) {
477 		error_msg = "header: invalid eh_depth";
478 		goto corrupted;
479 	}
480 	if (ext4_validate_extent_entries(ip, eh, depth)) {
481 		error_msg = "header: invalid extent entries";
482 		goto corrupted;
483 	}
484 
485 	return (0);
486 
487 corrupted:
488 	SDT_PROBE2(ext2fs, , trace, extents, 1, error_msg);
489 	return (EIO);
490 }
491 
492 static void
493 ext4_ext_binsearch_index(struct ext4_extent_path *path, int blk)
494 {
495 	struct ext4_extent_header *eh;
496 	struct ext4_extent_index *r, *l, *m;
497 
498 	eh = path->ep_header;
499 
500 	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max) &&
501 	    le16toh(eh->eh_ecount) > 0,
502 	    ("ext4_ext_binsearch_index: bad args"));
503 
504 	l = EXT_FIRST_INDEX(eh) + 1;
505 	r = EXT_FIRST_INDEX(eh) + le16toh(eh->eh_ecount) - 1;
506 	while (l <= r) {
507 		m = l + (r - l) / 2;
508 		if (blk < le32toh(m->ei_blk))
509 			r = m - 1;
510 		else
511 			l = m + 1;
512 	}
513 
514 	path->ep_index = l - 1;
515 }
516 
517 static void
518 ext4_ext_binsearch_ext(struct ext4_extent_path *path, int blk)
519 {
520 	struct ext4_extent_header *eh;
521 	struct ext4_extent *r, *l, *m;
522 
523 	eh = path->ep_header;
524 
525 	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max),
526 	    ("ext4_ext_binsearch_ext: bad args"));
527 
528 	if (eh->eh_ecount == 0)
529 		return;
530 
531 	l = EXT_FIRST_EXTENT(eh) + 1;
532 	r = EXT_FIRST_EXTENT(eh) + le16toh(eh->eh_ecount) - 1;
533 
534 	while (l <= r) {
535 		m = l + (r - l) / 2;
536 		if (blk < le32toh(m->e_blk))
537 			r = m - 1;
538 		else
539 			l = m + 1;
540 	}
541 
542 	path->ep_ext = l - 1;
543 }
544 
545 static int
546 ext4_ext_fill_path_bdata(struct ext4_extent_path *path,
547     struct buf *bp, uint64_t blk)
548 {
549 
550 	KASSERT(path->ep_data == NULL,
551 	    ("ext4_ext_fill_path_bdata: bad ep_data"));
552 
553 	path->ep_data = malloc(bp->b_bufsize, M_EXT2EXTENTS, M_WAITOK);
554 	memcpy(path->ep_data, bp->b_data, bp->b_bufsize);
555 	path->ep_blk = blk;
556 
557 	return (0);
558 }
559 
560 static void
561 ext4_ext_fill_path_buf(struct ext4_extent_path *path, struct buf *bp)
562 {
563 
564 	KASSERT(path->ep_data != NULL,
565 	    ("ext4_ext_fill_path_buf: bad ep_data"));
566 
567 	memcpy(bp->b_data, path->ep_data, bp->b_bufsize);
568 }
569 
570 static void
571 ext4_ext_drop_refs(struct ext4_extent_path *path)
572 {
573 	int depth, i;
574 
575 	if (!path)
576 		return;
577 
578 	depth = path->ep_depth;
579 	for (i = 0; i <= depth; i++, path++)
580 		if (path->ep_data) {
581 			free(path->ep_data, M_EXT2EXTENTS);
582 			path->ep_data = NULL;
583 		}
584 }
585 
586 void
587 ext4_ext_path_free(struct ext4_extent_path *path)
588 {
589 
590 	if (!path)
591 		return;
592 
593 	ext4_ext_drop_refs(path);
594 	free(path, M_EXT2EXTENTS);
595 }
596 
597 int
598 ext4_ext_find_extent(struct inode *ip, daddr_t block,
599     struct ext4_extent_path **ppath)
600 {
601 	struct ext4_extent_header *eh;
602 	struct ext4_extent_path *path;
603 	struct buf *bp;
604 	uint64_t blk;
605 	int error, depth, i, ppos, alloc;
606 
607 	eh = ext4_ext_inode_header(ip);
608 	depth = ext4_ext_inode_depth(ip);
609 	ppos = 0;
610 	alloc = 0;
611 
612 	error = ext4_ext_check_header(ip, eh, depth);
613 	if (error)
614 		return (error);
615 
616 	if (ppath == NULL)
617 		return (EINVAL);
618 
619 	path = *ppath;
620 	if (path == NULL) {
621 		path = malloc(EXT4_EXT_DEPTH_MAX *
622 		    sizeof(struct ext4_extent_path),
623 		    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
624 		*ppath = path;
625 		alloc = 1;
626 	}
627 
628 	path[0].ep_header = eh;
629 	path[0].ep_data = NULL;
630 
631 	/* Walk through the tree. */
632 	i = depth;
633 	while (i) {
634 		ext4_ext_binsearch_index(&path[ppos], block);
635 		blk = ext4_ext_index_pblock(path[ppos].ep_index);
636 		path[ppos].ep_depth = i;
637 		path[ppos].ep_ext = NULL;
638 
639 		error = bread(ip->i_devvp, fsbtodb(ip->i_e2fs, blk),
640 		    ip->i_e2fs->e2fs_bsize, NOCRED, &bp);
641 		if (error) {
642 			goto error;
643 		}
644 
645 		ppos++;
646 		if (ppos > depth) {
647 			SDT_PROBE2(ext2fs, , trace, extents, 1,
648 			    "ppos > depth => extent corrupted");
649 			error = EIO;
650 			brelse(bp);
651 			goto error;
652 		}
653 
654 		ext4_ext_fill_path_bdata(&path[ppos], bp, blk);
655 		bqrelse(bp);
656 
657 		eh = ext4_ext_block_header(path[ppos].ep_data);
658 		if (ext4_ext_check_header(ip, eh, i - 1) ||
659 		    ext2_extent_blk_csum_verify(ip, path[ppos].ep_data)) {
660 			error = EIO;
661 			goto error;
662 		}
663 
664 		path[ppos].ep_header = eh;
665 
666 		i--;
667 	}
668 
669 	error = ext4_ext_check_header(ip, eh, 0);
670 	if (error)
671 		goto error;
672 
673 	/* Find extent. */
674 	path[ppos].ep_depth = i;
675 	path[ppos].ep_header = eh;
676 	path[ppos].ep_ext = NULL;
677 	path[ppos].ep_index = NULL;
678 	ext4_ext_binsearch_ext(&path[ppos], block);
679 	return (0);
680 
681 error:
682 	ext4_ext_drop_refs(path);
683 	if (alloc)
684 		free(path, M_EXT2EXTENTS);
685 
686 	*ppath = NULL;
687 
688 	return (error);
689 }
690 
691 static inline int
692 ext4_ext_space_block_index(struct inode *ip)
693 {
694 	struct m_ext2fs *fs;
695 	int size;
696 
697 	fs = ip->i_e2fs;
698 
699 	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
700 	    sizeof(struct ext4_extent_index);
701 
702 	return (size);
703 }
704 
705 void
706 ext4_ext_tree_init(struct inode *ip)
707 {
708 	struct ext4_extent_header *ehp;
709 
710 	ip->i_flag |= IN_E4EXTENTS;
711 
712 	memset(ip->i_data, 0, EXT2_NDADDR + EXT2_NIADDR);
713 	ehp = (struct ext4_extent_header *)ip->i_data;
714 	ehp->eh_magic = htole16(EXT4_EXT_MAGIC);
715 	ehp->eh_max = htole16(ext4_ext_space_root(ip));
716 	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
717 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
718 	ext2_update(ip->i_vnode, 1);
719 }
720 
721 static inline void
722 ext4_ext_put_in_cache(struct inode *ip, uint32_t blk,
723 			uint32_t len, uint32_t start, int type)
724 {
725 
726 	KASSERT(len != 0, ("ext4_ext_put_in_cache: bad input"));
727 
728 	ip->i_ext_cache.ec_type = type;
729 	ip->i_ext_cache.ec_blk = blk;
730 	ip->i_ext_cache.ec_len = len;
731 	ip->i_ext_cache.ec_start = start;
732 }
733 
734 static e4fs_daddr_t
735 ext4_ext_blkpref(struct inode *ip, struct ext4_extent_path *path,
736     e4fs_daddr_t block)
737 {
738 	struct m_ext2fs *fs;
739 	struct ext4_extent *ex;
740 	e4fs_daddr_t bg_start;
741 	int depth;
742 
743 	fs = ip->i_e2fs;
744 
745 	if (path) {
746 		depth = path->ep_depth;
747 		ex = path[depth].ep_ext;
748 		if (ex) {
749 			e4fs_daddr_t pblk = ext4_ext_extent_pblock(ex);
750 			e2fs_daddr_t blk = le32toh(ex->e_blk);
751 
752 			if (block > blk)
753 				return (pblk + (block - blk));
754 			else
755 				return (pblk - (blk - block));
756 		}
757 
758 		/* Try to get block from index itself. */
759 		if (path[depth].ep_data)
760 			return (path[depth].ep_blk);
761 	}
762 
763 	/* Use inode's group. */
764 	bg_start = (ip->i_block_group * EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) +
765 	    le32toh(fs->e2fs->e2fs_first_dblock);
766 
767 	return (bg_start + block);
768 }
769 
770 static int inline
771 ext4_can_extents_be_merged(struct ext4_extent *ex1,
772     struct ext4_extent *ex2)
773 {
774 
775 	if (le32toh(ex1->e_blk) + le16toh(ex1->e_len) != le32toh(ex2->e_blk))
776 		return (0);
777 
778 	if (le16toh(ex1->e_len) + le16toh(ex2->e_len) > EXT4_MAX_LEN)
779 		return (0);
780 
781 	if (ext4_ext_extent_pblock(ex1) + le16toh(ex1->e_len) ==
782 	    ext4_ext_extent_pblock(ex2))
783 		return (1);
784 
785 	return (0);
786 }
787 
788 static unsigned
789 ext4_ext_next_leaf_block(struct inode *ip, struct ext4_extent_path *path)
790 {
791 	int depth = path->ep_depth;
792 
793 	/* Empty tree */
794 	if (depth == 0)
795 		return (EXT4_MAX_BLOCKS);
796 
797 	/* Go to indexes. */
798 	depth--;
799 
800 	while (depth >= 0) {
801 		if (path[depth].ep_index !=
802 		    EXT_LAST_INDEX(path[depth].ep_header))
803 			return (le32toh(path[depth].ep_index[1].ei_blk));
804 
805 		depth--;
806 	}
807 
808 	return (EXT4_MAX_BLOCKS);
809 }
810 
811 static int
812 ext4_ext_dirty(struct inode *ip, struct ext4_extent_path *path)
813 {
814 	struct m_ext2fs *fs;
815 	struct buf *bp;
816 	uint64_t blk;
817 	int error;
818 
819 	fs = ip->i_e2fs;
820 
821 	if (!path)
822 		return (EINVAL);
823 
824 	if (path->ep_data) {
825 		blk = path->ep_blk;
826 		bp = getblk(ip->i_devvp, fsbtodb(fs, blk),
827 		    fs->e2fs_bsize, 0, 0, 0);
828 		if (!bp)
829 			return (EIO);
830 		ext4_ext_fill_path_buf(path, bp);
831 		ext2_extent_blk_csum_set(ip, bp->b_data);
832 		error = bwrite(bp);
833 	} else {
834 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
835 		error = ext2_update(ip->i_vnode, 1);
836 	}
837 
838 	return (error);
839 }
840 
841 static int
842 ext4_ext_insert_index(struct inode *ip, struct ext4_extent_path *path,
843     uint32_t lblk, e4fs_daddr_t blk)
844 {
845 	struct ext4_extent_index *idx;
846 	int len;
847 
848 	if (lblk == le32toh(path->ep_index->ei_blk)) {
849 		SDT_PROBE2(ext2fs, , trace, extents, 1,
850 		    "lblk == index blk => extent corrupted");
851 		return (EIO);
852 	}
853 
854 	if (le16toh(path->ep_header->eh_ecount) >=
855 	    le16toh(path->ep_header->eh_max)) {
856 		SDT_PROBE2(ext2fs, , trace, extents, 1,
857 		    "ecout > maxcount => extent corrupted");
858 		return (EIO);
859 	}
860 
861 	if (lblk > le32toh(path->ep_index->ei_blk)) {
862 		/* Insert after. */
863 		idx = path->ep_index + 1;
864 	} else {
865 		/* Insert before. */
866 		idx = path->ep_index;
867 	}
868 
869 	len = EXT_LAST_INDEX(path->ep_header) - idx + 1;
870 	if (len > 0)
871 		memmove(idx + 1, idx, len * sizeof(struct ext4_extent_index));
872 
873 	if (idx > EXT_MAX_INDEX(path->ep_header)) {
874 		SDT_PROBE2(ext2fs, , trace, extents, 1,
875 		    "index is out of range => extent corrupted");
876 		return (EIO);
877 	}
878 
879 	idx->ei_blk = htole32(lblk);
880 	ext4_index_store_pblock(idx, blk);
881 	path->ep_header->eh_ecount =
882 	    htole16(le16toh(path->ep_header->eh_ecount) + 1);
883 
884 	return (ext4_ext_dirty(ip, path));
885 }
886 
887 static e4fs_daddr_t
888 ext4_ext_alloc_meta(struct inode *ip)
889 {
890 	e4fs_daddr_t blk = ext2_alloc_meta(ip);
891 	if (blk) {
892 		ip->i_blocks += btodb(ip->i_e2fs->e2fs_bsize);
893 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
894 		ext2_update(ip->i_vnode, 1);
895 	}
896 
897 	return (blk);
898 }
899 
900 static void
901 ext4_ext_blkfree(struct inode *ip, uint64_t blk, int count, int flags)
902 {
903 	struct m_ext2fs *fs;
904 	int i, blocksreleased;
905 
906 	fs = ip->i_e2fs;
907 	blocksreleased = count;
908 
909 	for(i = 0; i < count; i++)
910 		ext2_blkfree(ip, blk + i, fs->e2fs_bsize);
911 
912 	if (ip->i_blocks >= blocksreleased)
913 		ip->i_blocks -= (btodb(fs->e2fs_bsize)*blocksreleased);
914 	else
915 		ip->i_blocks = 0;
916 
917 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
918 	ext2_update(ip->i_vnode, 1);
919 }
920 
921 static int
922 ext4_ext_split(struct inode *ip, struct ext4_extent_path *path,
923     struct ext4_extent *newext, int at)
924 {
925 	struct m_ext2fs *fs;
926 	struct  buf *bp;
927 	int depth = ext4_ext_inode_depth(ip);
928 	struct ext4_extent_header *neh;
929 	struct ext4_extent_index *fidx;
930 	struct ext4_extent *ex;
931 	int i = at, k, m, a;
932 	e4fs_daddr_t newblk, oldblk;
933 	uint32_t border;
934 	e4fs_daddr_t *ablks = NULL;
935 	int error = 0;
936 
937 	fs = ip->i_e2fs;
938 	bp = NULL;
939 
940 	/*
941 	 * We will split at current extent for now.
942 	 */
943 	if (path[depth].ep_ext > EXT_MAX_EXTENT(path[depth].ep_header)) {
944 		SDT_PROBE2(ext2fs, , trace, extents, 1,
945 		    "extent is out of range => extent corrupted");
946 		return (EIO);
947 	}
948 
949 	if (path[depth].ep_ext != EXT_MAX_EXTENT(path[depth].ep_header))
950 		border = le32toh(path[depth].ep_ext[1].e_blk);
951 	else
952 		border = le32toh(newext->e_blk);
953 
954 	/* Allocate new blocks. */
955 	ablks = malloc(sizeof(e4fs_daddr_t) * depth,
956 	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
957 	for (a = 0; a < depth - at; a++) {
958 		newblk = ext4_ext_alloc_meta(ip);
959 		if (newblk == 0)
960 			goto cleanup;
961 		ablks[a] = newblk;
962 	}
963 
964 	newblk = ablks[--a];
965 	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
966 	if (!bp) {
967 		error = EIO;
968 		goto cleanup;
969 	}
970 
971 	neh = ext4_ext_block_header(bp->b_data);
972 	neh->eh_ecount = 0;
973 	neh->eh_max = le16toh(ext4_ext_space_block(ip));
974 	neh->eh_magic = le16toh(EXT4_EXT_MAGIC);
975 	neh->eh_depth = 0;
976 	ex = EXT_FIRST_EXTENT(neh);
977 
978 	if (le16toh(path[depth].ep_header->eh_ecount) !=
979 	    le16toh(path[depth].ep_header->eh_max)) {
980 		SDT_PROBE2(ext2fs, , trace, extents, 1,
981 		    "extents count out of range => extent corrupted");
982 		error = EIO;
983 		goto cleanup;
984 	}
985 
986 	/* Start copy from next extent. */
987 	m = 0;
988 	path[depth].ep_ext++;
989 	while (path[depth].ep_ext <= EXT_MAX_EXTENT(path[depth].ep_header)) {
990 		path[depth].ep_ext++;
991 		m++;
992 	}
993 	if (m) {
994 		memmove(ex, path[depth].ep_ext - m,
995 		    sizeof(struct ext4_extent) * m);
996 		neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
997 	}
998 
999 	ext2_extent_blk_csum_set(ip, bp->b_data);
1000 	bwrite(bp);
1001 	bp = NULL;
1002 
1003 	/* Fix old leaf. */
1004 	if (m) {
1005 		path[depth].ep_header->eh_ecount =
1006 		    htole16(le16toh(path[depth].ep_header->eh_ecount) - m);
1007 		ext4_ext_dirty(ip, path + depth);
1008 	}
1009 
1010 	/* Create intermediate indexes. */
1011 	k = depth - at - 1;
1012 	KASSERT(k >= 0, ("ext4_ext_split: negative k"));
1013 
1014 	/* Insert new index into current index block. */
1015 	i = depth - 1;
1016 	while (k--) {
1017 		oldblk = newblk;
1018 		newblk = ablks[--a];
1019 		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
1020 		    (int)fs->e2fs_bsize, NOCRED, &bp);
1021 		if (error) {
1022 			goto cleanup;
1023 		}
1024 
1025 		neh = (struct ext4_extent_header *)bp->b_data;
1026 		neh->eh_ecount = htole16(1);
1027 		neh->eh_magic = htole16(EXT4_EXT_MAGIC);
1028 		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
1029 		neh->eh_depth = htole16(depth - i);
1030 		fidx = EXT_FIRST_INDEX(neh);
1031 		fidx->ei_blk = htole32(border);
1032 		ext4_index_store_pblock(fidx, oldblk);
1033 
1034 		m = 0;
1035 		path[i].ep_index++;
1036 		while (path[i].ep_index <= EXT_MAX_INDEX(path[i].ep_header)) {
1037 			path[i].ep_index++;
1038 			m++;
1039 		}
1040 		if (m) {
1041 			memmove(++fidx, path[i].ep_index - m,
1042 			    sizeof(struct ext4_extent_index) * m);
1043 			neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
1044 		}
1045 
1046 		ext2_extent_blk_csum_set(ip, bp->b_data);
1047 		bwrite(bp);
1048 		bp = NULL;
1049 
1050 		/* Fix old index. */
1051 		if (m) {
1052 			path[i].ep_header->eh_ecount =
1053 			    htole16(le16toh(path[i].ep_header->eh_ecount) - m);
1054 			ext4_ext_dirty(ip, path + i);
1055 		}
1056 
1057 		i--;
1058 	}
1059 
1060 	error = ext4_ext_insert_index(ip, path + at, border, newblk);
1061 
1062 cleanup:
1063 	if (bp)
1064 		brelse(bp);
1065 
1066 	if (error) {
1067 		for (i = 0; i < depth; i++) {
1068 			if (!ablks[i])
1069 				continue;
1070 			ext4_ext_blkfree(ip, ablks[i], 1, 0);
1071 		}
1072 	}
1073 
1074 	free(ablks, M_EXT2EXTENTS);
1075 
1076 	return (error);
1077 }
1078 
1079 static int
1080 ext4_ext_grow_indepth(struct inode *ip, struct ext4_extent_path *path,
1081     struct ext4_extent *newext)
1082 {
1083 	struct m_ext2fs *fs;
1084 	struct ext4_extent_path *curpath;
1085 	struct ext4_extent_header *neh;
1086 	struct buf *bp;
1087 	e4fs_daddr_t newblk;
1088 	int error = 0;
1089 
1090 	fs = ip->i_e2fs;
1091 	curpath = path;
1092 
1093 	newblk = ext4_ext_alloc_meta(ip);
1094 	if (newblk == 0)
1095 		return (error);
1096 
1097 	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
1098 	if (!bp)
1099 		return (EIO);
1100 
1101 	/* Move top-level index/leaf into new block. */
1102 	memmove(bp->b_data, curpath->ep_header, sizeof(ip->i_data));
1103 
1104 	/* Set size of new block */
1105 	neh = ext4_ext_block_header(bp->b_data);
1106 	neh->eh_magic = htole16(EXT4_EXT_MAGIC);
1107 
1108 	if (ext4_ext_inode_depth(ip))
1109 		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
1110 	else
1111 		neh->eh_max = htole16(ext4_ext_space_block(ip));
1112 
1113 	ext2_extent_blk_csum_set(ip, bp->b_data);
1114 	error = bwrite(bp);
1115 	if (error)
1116 		goto out;
1117 
1118 	bp = NULL;
1119 
1120 	curpath->ep_header->eh_magic = htole16(EXT4_EXT_MAGIC);
1121 	curpath->ep_header->eh_max = htole16(ext4_ext_space_root(ip));
1122 	curpath->ep_header->eh_ecount = htole16(1);
1123 	curpath->ep_index = EXT_FIRST_INDEX(curpath->ep_header);
1124 	curpath->ep_index->ei_blk = EXT_FIRST_EXTENT(path[0].ep_header)->e_blk;
1125 	ext4_index_store_pblock(curpath->ep_index, newblk);
1126 
1127 	neh = ext4_ext_inode_header(ip);
1128 	neh->eh_depth = htole16(path->ep_depth + 1);
1129 	ext4_ext_dirty(ip, curpath);
1130 out:
1131 	brelse(bp);
1132 
1133 	return (error);
1134 }
1135 
1136 static int
1137 ext4_ext_create_new_leaf(struct inode *ip, struct ext4_extent_path *path,
1138     struct ext4_extent *newext)
1139 {
1140 	struct ext4_extent_path *curpath;
1141 	int depth, i, error;
1142 
1143 repeat:
1144 	i = depth = ext4_ext_inode_depth(ip);
1145 
1146 	/* Look for free index entry int the tree */
1147 	curpath = path + depth;
1148 	while (i > 0 && !EXT_HAS_FREE_INDEX(curpath)) {
1149 		i--;
1150 		curpath--;
1151 	}
1152 
1153 	/*
1154 	 * We use already allocated block for index block,
1155 	 * so subsequent data blocks should be contiguous.
1156 	 */
1157 	if (EXT_HAS_FREE_INDEX(curpath)) {
1158 		error = ext4_ext_split(ip, path, newext, i);
1159 		if (error)
1160 			goto out;
1161 
1162 		/* Refill path. */
1163 		ext4_ext_drop_refs(path);
1164 		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
1165 		if (error)
1166 			goto out;
1167 	} else {
1168 		/* Tree is full, do grow in depth. */
1169 		error = ext4_ext_grow_indepth(ip, path, newext);
1170 		if (error)
1171 			goto out;
1172 
1173 		/* Refill path. */
1174 		ext4_ext_drop_refs(path);
1175 		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
1176 		if (error)
1177 			goto out;
1178 
1179 		/* Check and split tree if required. */
1180 		depth = ext4_ext_inode_depth(ip);
1181 		if (le16toh(path[depth].ep_header->eh_ecount) ==
1182 		    le16toh(path[depth].ep_header->eh_max))
1183 			goto repeat;
1184 	}
1185 
1186 out:
1187 	return (error);
1188 }
1189 
1190 static int
1191 ext4_ext_correct_indexes(struct inode *ip, struct ext4_extent_path *path)
1192 {
1193 	struct ext4_extent_header *eh;
1194 	struct ext4_extent *ex;
1195 	int32_t border;
1196 	int depth, k;
1197 
1198 	depth = ext4_ext_inode_depth(ip);
1199 	eh = path[depth].ep_header;
1200 	ex = path[depth].ep_ext;
1201 
1202 	if (ex == NULL || eh == NULL)
1203 		return (EIO);
1204 
1205 	if (!depth)
1206 		return (0);
1207 
1208 	/* We will correct tree if first leaf got modified only. */
1209 	if (ex != EXT_FIRST_EXTENT(eh))
1210 		return (0);
1211 
1212 	k = depth - 1;
1213 	border = le32toh(path[depth].ep_ext->e_blk);
1214 	path[k].ep_index->ei_blk = htole32(border);
1215 	ext4_ext_dirty(ip, path + k);
1216 	while (k--) {
1217 		/* Change all left-side indexes. */
1218 		if (path[k+1].ep_index != EXT_FIRST_INDEX(path[k+1].ep_header))
1219 			break;
1220 
1221 		path[k].ep_index->ei_blk = htole32(border);
1222 		ext4_ext_dirty(ip, path + k);
1223 	}
1224 
1225 	return (0);
1226 }
1227 
1228 static int
1229 ext4_ext_insert_extent(struct inode *ip, struct ext4_extent_path *path,
1230     struct ext4_extent *newext)
1231 {
1232 	struct ext4_extent_header * eh;
1233 	struct ext4_extent *ex, *nex, *nearex;
1234 	struct ext4_extent_path *npath;
1235 	int depth, len, error, next;
1236 
1237 	depth = ext4_ext_inode_depth(ip);
1238 	ex = path[depth].ep_ext;
1239 	npath = NULL;
1240 
1241 	if (htole16(newext->e_len) == 0 || path[depth].ep_header == NULL)
1242 		return (EINVAL);
1243 
1244 	/* Insert block into found extent. */
1245 	if (ex && ext4_can_extents_be_merged(ex, newext)) {
1246 		ex->e_len = htole16(le16toh(ex->e_len) + le16toh(newext->e_len));
1247 		eh = path[depth].ep_header;
1248 		nearex = ex;
1249 		goto merge;
1250 	}
1251 
1252 repeat:
1253 	depth = ext4_ext_inode_depth(ip);
1254 	eh = path[depth].ep_header;
1255 	if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max))
1256 		goto has_space;
1257 
1258 	/* Try next leaf */
1259 	nex = EXT_LAST_EXTENT(eh);
1260 	next = ext4_ext_next_leaf_block(ip, path);
1261 	if (le32toh(newext->e_blk) > le32toh(nex->e_blk) && next !=
1262 	    EXT4_MAX_BLOCKS) {
1263 		KASSERT(npath == NULL,
1264 		    ("ext4_ext_insert_extent: bad path"));
1265 
1266 		error = ext4_ext_find_extent(ip, next, &npath);
1267 		if (error)
1268 			goto cleanup;
1269 
1270 		if (npath->ep_depth != path->ep_depth) {
1271 			error = EIO;
1272 			goto cleanup;
1273 		}
1274 
1275 		eh = npath[depth].ep_header;
1276 		if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max)) {
1277 			path = npath;
1278 			goto repeat;
1279 		}
1280 	}
1281 
1282 	/*
1283 	 * There is no free space in the found leaf,
1284 	 * try to add a new leaf to the tree.
1285 	 */
1286 	error = ext4_ext_create_new_leaf(ip, path, newext);
1287 	if (error)
1288 		goto cleanup;
1289 
1290 	depth = ext4_ext_inode_depth(ip);
1291 	eh = path[depth].ep_header;
1292 
1293 has_space:
1294 	nearex = path[depth].ep_ext;
1295 	if (!nearex) {
1296 		/* Create new extent in the leaf. */
1297 		path[depth].ep_ext = EXT_FIRST_EXTENT(eh);
1298 	} else if (le32toh(newext->e_blk) > le32toh(nearex->e_blk)) {
1299 		if (nearex != EXT_LAST_EXTENT(eh)) {
1300 			len = EXT_MAX_EXTENT(eh) - nearex;
1301 			len = (len - 1) * sizeof(struct ext4_extent);
1302 			len = len < 0 ? 0 : len;
1303 			memmove(nearex + 2, nearex + 1, len);
1304 		}
1305 		path[depth].ep_ext = nearex + 1;
1306 	} else {
1307 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1308 		len = len < 0 ? 0 : len;
1309 		memmove(nearex + 1, nearex, len);
1310 		path[depth].ep_ext = nearex;
1311 	}
1312 
1313 	eh->eh_ecount = htole16(le16toh(eh->eh_ecount) + 1);
1314 	nearex = path[depth].ep_ext;
1315 	nearex->e_blk = newext->e_blk;
1316 	nearex->e_start_lo = newext->e_start_lo;
1317 	nearex->e_start_hi = newext->e_start_hi;
1318 	nearex->e_len = newext->e_len;
1319 
1320 merge:
1321 	/* Try to merge extents to the right. */
1322 	while (nearex < EXT_LAST_EXTENT(eh)) {
1323 		if (!ext4_can_extents_be_merged(nearex, nearex + 1))
1324 			break;
1325 
1326 		/* Merge with next extent. */
1327 		nearex->e_len = htole16(le16toh(nearex->e_len) +
1328 		    le16toh(nearex[1].e_len));
1329 		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1330 			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
1331 			    sizeof(struct ext4_extent);
1332 			memmove(nearex + 1, nearex + 2, len);
1333 		}
1334 
1335 		eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
1336 		KASSERT(le16toh(eh->eh_ecount) != 0,
1337 		    ("ext4_ext_insert_extent: bad ecount"));
1338 	}
1339 
1340 	/*
1341 	 * Try to merge extents to the left,
1342 	 * start from inexes correction.
1343 	 */
1344 	error = ext4_ext_correct_indexes(ip, path);
1345 	if (error)
1346 		goto cleanup;
1347 
1348 	ext4_ext_dirty(ip, path + depth);
1349 
1350 cleanup:
1351 	if (npath) {
1352 		ext4_ext_drop_refs(npath);
1353 		free(npath, M_EXT2EXTENTS);
1354 	}
1355 
1356 	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
1357 	return (error);
1358 }
1359 
1360 static e4fs_daddr_t
1361 ext4_new_blocks(struct inode *ip, daddr_t lbn, e4fs_daddr_t pref,
1362     struct ucred *cred, unsigned long *count, int *perror)
1363 {
1364 	struct m_ext2fs *fs;
1365 	e4fs_daddr_t newblk;
1366 
1367 	/*
1368 	 * We will allocate only single block for now.
1369 	 */
1370 	if (*count > 1)
1371 		return (0);
1372 
1373 	fs = ip->i_e2fs;
1374 	EXT2_LOCK(ip->i_ump);
1375 	*perror = ext2_alloc(ip, lbn, pref, (int)fs->e2fs_bsize, cred, &newblk);
1376 	if (*perror)
1377 		return (0);
1378 
1379 	if (newblk) {
1380 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1381 		ext2_update(ip->i_vnode, 1);
1382 	}
1383 
1384 	return (newblk);
1385 }
1386 
1387 int
1388 ext4_ext_get_blocks(struct inode *ip, e4fs_daddr_t iblk,
1389     unsigned long max_blocks, struct ucred *cred, struct buf **bpp,
1390     int *pallocated, daddr_t *nb)
1391 {
1392 	struct m_ext2fs *fs;
1393 	struct buf *bp = NULL;
1394 	struct ext4_extent_path *path;
1395 	struct ext4_extent newex, *ex;
1396 	e4fs_daddr_t bpref, newblk = 0;
1397 	unsigned long allocated = 0;
1398 	int error = 0, depth;
1399 
1400 	if(bpp)
1401 		*bpp = NULL;
1402 	*pallocated = 0;
1403 
1404 	/* Check cache. */
1405 	path = NULL;
1406 	if ((bpref = ext4_ext_in_cache(ip, iblk, &newex))) {
1407 		if (bpref == EXT4_EXT_CACHE_IN) {
1408 			/* Block is already allocated. */
1409 			newblk = iblk - le32toh(newex.e_blk) +
1410 			    ext4_ext_extent_pblock(&newex);
1411 			allocated = le16toh(newex.e_len) - (iblk - le32toh(newex.e_blk));
1412 			goto out;
1413 		} else {
1414 			error = EIO;
1415 			goto out2;
1416 		}
1417 	}
1418 
1419 	error = ext4_ext_find_extent(ip, iblk, &path);
1420 	if (error) {
1421 		goto out2;
1422 	}
1423 
1424 	depth = ext4_ext_inode_depth(ip);
1425 	if (path[depth].ep_ext == NULL && depth != 0) {
1426 		error = EIO;
1427 		goto out2;
1428 	}
1429 
1430 	if ((ex = path[depth].ep_ext)) {
1431 		uint64_t lblk = le32toh(ex->e_blk);
1432 		uint16_t e_len  = le16toh(ex->e_len);
1433 		e4fs_daddr_t e_start = ext4_ext_extent_pblock(ex);
1434 
1435 		if (e_len > EXT4_MAX_LEN)
1436 			goto out2;
1437 
1438 		/* If we found extent covers block, simply return it. */
1439 		if (iblk >= lblk && iblk < lblk + e_len) {
1440 			newblk = iblk - lblk + e_start;
1441 			allocated = e_len - (iblk - lblk);
1442 			ext4_ext_put_in_cache(ip, lblk, e_len,
1443 			    e_start, EXT4_EXT_CACHE_IN);
1444 			goto out;
1445 		}
1446 	}
1447 
1448 	/* Allocate the new block. */
1449 	if (S_ISREG(ip->i_mode) && (!ip->i_next_alloc_block)) {
1450 		ip->i_next_alloc_goal = 0;
1451 	}
1452 
1453 	bpref = ext4_ext_blkpref(ip, path, iblk);
1454 	allocated = max_blocks;
1455 	newblk = ext4_new_blocks(ip, iblk, bpref, cred, &allocated, &error);
1456 	if (!newblk)
1457 		goto out2;
1458 
1459 	/* Try to insert new extent into found leaf and return. */
1460 	newex.e_blk = htole32(iblk);
1461 	ext4_ext_store_pblock(&newex, newblk);
1462 	newex.e_len = htole16(allocated);
1463 	error = ext4_ext_insert_extent(ip, path, &newex);
1464 	if (error)
1465 		goto out2;
1466 
1467 	newblk = ext4_ext_extent_pblock(&newex);
1468 	ext4_ext_put_in_cache(ip, iblk, allocated, newblk, EXT4_EXT_CACHE_IN);
1469 	*pallocated = 1;
1470 
1471 out:
1472 	if (allocated > max_blocks)
1473 		allocated = max_blocks;
1474 
1475 	if (bpp)
1476 	{
1477 		fs = ip->i_e2fs;
1478 		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
1479 		    fs->e2fs_bsize, cred, &bp);
1480 		if (error) {
1481 			brelse(bp);
1482 		} else {
1483 			*bpp = bp;
1484 		}
1485 	}
1486 
1487 out2:
1488 	if (path) {
1489 		ext4_ext_drop_refs(path);
1490 		free(path, M_EXT2EXTENTS);
1491 	}
1492 
1493 	if (nb)
1494 		*nb = newblk;
1495 
1496 	return (error);
1497 }
1498 
1499 static inline struct ext4_extent_header *
1500 ext4_ext_header(struct inode *ip)
1501 {
1502 
1503 	return ((struct ext4_extent_header *)ip->i_db);
1504 }
1505 
1506 static int
1507 ext4_remove_blocks(struct inode *ip, struct ext4_extent *ex,
1508     unsigned long from, unsigned long to)
1509 {
1510 	unsigned long num, start;
1511 
1512 	if (from >= le32toh(ex->e_blk) &&
1513 	    to == le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - 1) {
1514 		/* Tail cleanup. */
1515 		num = le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - from;
1516 		start = ext4_ext_extent_pblock(ex) +
1517 		    ext4_ext_get_actual_len(ex) - num;
1518 		ext4_ext_blkfree(ip, start, num, 0);
1519 	}
1520 
1521 	return (0);
1522 }
1523 
1524 static int
1525 ext4_ext_rm_index(struct inode *ip, struct ext4_extent_path *path)
1526 {
1527 	e4fs_daddr_t leaf;
1528 
1529 	/* Free index block. */
1530 	path--;
1531 	leaf = ext4_ext_index_pblock(path->ep_index);
1532 	KASSERT(path->ep_header->eh_ecount != 0,
1533 	    ("ext4_ext_rm_index: bad ecount"));
1534 	path->ep_header->eh_ecount =
1535 	    htole16(le16toh(path->ep_header->eh_ecount) - 1);
1536 	ext4_ext_dirty(ip, path);
1537 	ext4_ext_blkfree(ip, leaf, 1, 0);
1538 	return (0);
1539 }
1540 
1541 static int
1542 ext4_ext_rm_leaf(struct inode *ip, struct ext4_extent_path *path,
1543     uint64_t start)
1544 {
1545 	struct ext4_extent_header *eh;
1546 	struct ext4_extent *ex;
1547 	unsigned int a, b, block, num;
1548 	unsigned long ex_blk;
1549 	unsigned short ex_len;
1550 	int depth;
1551 	int error, correct_index;
1552 
1553 	depth = ext4_ext_inode_depth(ip);
1554 	if (!path[depth].ep_header) {
1555 		if (path[depth].ep_data == NULL)
1556 			return (EINVAL);
1557 		path[depth].ep_header =
1558 		    (struct ext4_extent_header* )path[depth].ep_data;
1559 	}
1560 
1561 	eh = path[depth].ep_header;
1562 	if (!eh) {
1563 		SDT_PROBE2(ext2fs, , trace, extents, 1,
1564 		    "bad header => extent corrupted");
1565 		return (EIO);
1566 	}
1567 
1568 	ex = EXT_LAST_EXTENT(eh);
1569 	ex_blk = le32toh(ex->e_blk);
1570 	ex_len = ext4_ext_get_actual_len(ex);
1571 
1572 	error = 0;
1573 	correct_index = 0;
1574 	while (ex >= EXT_FIRST_EXTENT(eh) && ex_blk + ex_len > start) {
1575 		path[depth].ep_ext = ex;
1576 		a = ex_blk > start ? ex_blk : start;
1577 		b = (uint64_t)ex_blk + ex_len - 1 <
1578 		    EXT4_MAX_BLOCKS ? ex_blk + ex_len - 1 : EXT4_MAX_BLOCKS;
1579 
1580 		if (a != ex_blk && b != ex_blk + ex_len - 1)
1581 			return (EINVAL);
1582 		else if (a != ex_blk) {
1583 			/* Remove tail of the extent. */
1584 			block = ex_blk;
1585 			num = a - block;
1586 		} else if (b != ex_blk + ex_len - 1) {
1587 			/* Remove head of the extent, not implemented. */
1588 			return (EINVAL);
1589 		} else {
1590 			/* Remove whole extent. */
1591 			block = ex_blk;
1592 			num = 0;
1593 		}
1594 
1595 		if (ex == EXT_FIRST_EXTENT(eh))
1596 			correct_index = 1;
1597 
1598 		error = ext4_remove_blocks(ip, ex, a, b);
1599 		if (error)
1600 			goto out;
1601 
1602 		if (num == 0) {
1603 			ext4_ext_store_pblock(ex, 0);
1604 			eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
1605 		}
1606 
1607 		ex->e_blk = htole32(block);
1608 		ex->e_len = htole16(num);
1609 
1610 		ext4_ext_dirty(ip, path + depth);
1611 
1612 		ex--;
1613 		ex_blk = htole32(ex->e_blk);
1614 		ex_len = ext4_ext_get_actual_len(ex);
1615 	};
1616 
1617 	if (correct_index && le16toh(eh->eh_ecount))
1618 		error = ext4_ext_correct_indexes(ip, path);
1619 
1620 	/*
1621 	 * If this leaf is free, we should
1622 	 * remove it from index block above.
1623 	 */
1624 	if (error == 0 && eh->eh_ecount == 0 &&
1625 	    path[depth].ep_data != NULL)
1626 		error = ext4_ext_rm_index(ip, path + depth);
1627 
1628 out:
1629 	return (error);
1630 }
1631 
1632 static struct buf *
1633 ext4_read_extent_tree_block(struct inode *ip, e4fs_daddr_t pblk,
1634     int depth, int flags)
1635 {
1636 	struct m_ext2fs *fs;
1637 	struct ext4_extent_header *eh;
1638 	struct buf *bp;
1639 	int error;
1640 
1641 	fs = ip->i_e2fs;
1642 	error = bread(ip->i_devvp, fsbtodb(fs, pblk),
1643 	    fs->e2fs_bsize, NOCRED, &bp);
1644 	if (error) {
1645 		return (NULL);
1646 	}
1647 
1648 	eh = ext4_ext_block_header(bp->b_data);
1649 	if (le16toh(eh->eh_depth) != depth) {
1650 		SDT_PROBE2(ext2fs, , trace, extents, 1,
1651 		    "unexpected eh_depth");
1652 		goto err;
1653 	}
1654 
1655 	error = ext4_ext_check_header(ip, eh, depth);
1656 	if (error)
1657 		goto err;
1658 
1659 	return (bp);
1660 
1661 err:
1662 	brelse(bp);
1663 	return (NULL);
1664 
1665 }
1666 
1667 static int inline
1668 ext4_ext_more_to_rm(struct ext4_extent_path *path)
1669 {
1670 
1671 	KASSERT(path->ep_index != NULL,
1672 	    ("ext4_ext_more_to_rm: bad index from path"));
1673 
1674 	if (path->ep_index < EXT_FIRST_INDEX(path->ep_header))
1675 		return (0);
1676 
1677 	if (le16toh(path->ep_header->eh_ecount) == path->index_count)
1678 		return (0);
1679 
1680 	return (1);
1681 }
1682 
1683 int
1684 ext4_ext_remove_space(struct inode *ip, off_t length, int flags,
1685     struct ucred *cred, struct thread *td)
1686 {
1687 	struct buf *bp;
1688 	struct ext4_extent_header *ehp;
1689 	struct ext4_extent_path *path;
1690 	int depth;
1691 	int i, error;
1692 
1693 	ehp = (struct ext4_extent_header *)ip->i_db;
1694 	depth = ext4_ext_inode_depth(ip);
1695 
1696 	error = ext4_ext_check_header(ip, ehp, depth);
1697 	if(error)
1698 		return (error);
1699 
1700 	path = malloc(sizeof(struct ext4_extent_path) * (depth + 1),
1701 	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
1702 	path[0].ep_header = ehp;
1703 	path[0].ep_depth = depth;
1704 	i = 0;
1705 	while (error == 0 && i >= 0) {
1706 		if (i == depth) {
1707 			/* This is leaf. */
1708 			error = ext4_ext_rm_leaf(ip, path, length);
1709 			if (error)
1710 				break;
1711 			free(path[i].ep_data, M_EXT2EXTENTS);
1712 			path[i].ep_data = NULL;
1713 			i--;
1714 			continue;
1715 		}
1716 
1717 		/* This is index. */
1718 		if (!path[i].ep_header)
1719 			path[i].ep_header =
1720 			    (struct ext4_extent_header *)path[i].ep_data;
1721 
1722 		if (!path[i].ep_index) {
1723 			/* This level hasn't touched yet. */
1724 			path[i].ep_index = EXT_LAST_INDEX(path[i].ep_header);
1725 			path[i].index_count =
1726 			    le16toh(path[i].ep_header->eh_ecount) + 1;
1727 		} else {
1728 			/* We've already was here, see at next index. */
1729 			path[i].ep_index--;
1730 		}
1731 
1732 		if (ext4_ext_more_to_rm(path + i)) {
1733 			memset(path + i + 1, 0, sizeof(*path));
1734 			bp = ext4_read_extent_tree_block(ip,
1735 			    ext4_ext_index_pblock(path[i].ep_index),
1736 			    path[0].ep_depth - (i + 1), 0);
1737 			if (!bp) {
1738 				error = EIO;
1739 				break;
1740 			}
1741 
1742 			ext4_ext_fill_path_bdata(&path[i+1], bp,
1743 			    ext4_ext_index_pblock(path[i].ep_index));
1744 			brelse(bp);
1745 			path[i].index_count =
1746 			    le16toh(path[i].ep_header->eh_ecount);
1747 			i++;
1748 		} else {
1749 			if (path[i].ep_header->eh_ecount == 0 && i > 0) {
1750 				/* Index is empty, remove it. */
1751 				error = ext4_ext_rm_index(ip, path + i);
1752 			}
1753 			free(path[i].ep_data, M_EXT2EXTENTS);
1754 			path[i].ep_data = NULL;
1755 			i--;
1756 		}
1757 	}
1758 
1759 	if (path->ep_header->eh_ecount == 0) {
1760 		/*
1761 		 * Truncate the tree to zero.
1762 		 */
1763 		 ext4_ext_header(ip)->eh_depth = 0;
1764 		 ext4_ext_header(ip)->eh_max = htole16(ext4_ext_space_root(ip));
1765 		 ext4_ext_dirty(ip, path);
1766 	}
1767 
1768 	ext4_ext_drop_refs(path);
1769 	free(path, M_EXT2EXTENTS);
1770 
1771 	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
1772 	return (error);
1773 }
1774