xref: /linux/fs/gfs2/bmap.c (revision 5c35a02c545a7bbe77f3a1ae337d9e29beed079b)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
16 #include <linux/iomap.h>
17 
18 #include "gfs2.h"
19 #include "incore.h"
20 #include "bmap.h"
21 #include "glock.h"
22 #include "inode.h"
23 #include "meta_io.h"
24 #include "quota.h"
25 #include "rgrp.h"
26 #include "log.h"
27 #include "super.h"
28 #include "trans.h"
29 #include "dir.h"
30 #include "util.h"
31 #include "trace_gfs2.h"
32 
33 /* This doesn't need to be that large: the maximum number of 64-bit
34  * pointers in a 4k block is 512, so a __u16 is fine for the indices.
35  * It saves stack space to keep the structure small.
36  */
37 struct metapath {
38 	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
39 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
40 	int mp_fheight; /* find_metapath height */
41 	int mp_aheight; /* actual height (lookup height) */
42 };
43 
44 /**
45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
46  * @ip: the inode
47  * @dibh: the dinode buffer
48  * @block: the block number that was allocated
49  * @page: The (optional) page. This is looked up if @page is NULL
50  *
51  * Returns: errno
52  */
53 
54 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
55 			       u64 block, struct page *page)
56 {
57 	struct inode *inode = &ip->i_inode;
58 	struct buffer_head *bh;
59 	int release = 0;
60 
61 	if (!page || page->index) {
62 		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
63 		if (!page)
64 			return -ENOMEM;
65 		release = 1;
66 	}
67 
68 	if (!PageUptodate(page)) {
69 		void *kaddr = kmap(page);
70 		u64 dsize = i_size_read(inode);
71 
72 		if (dsize > gfs2_max_stuffed_size(ip))
73 			dsize = gfs2_max_stuffed_size(ip);
74 
75 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
76 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
77 		kunmap(page);
78 
79 		SetPageUptodate(page);
80 	}
81 
82 	if (!page_has_buffers(page))
83 		create_empty_buffers(page, BIT(inode->i_blkbits),
84 				     BIT(BH_Uptodate));
85 
86 	bh = page_buffers(page);
87 
88 	if (!buffer_mapped(bh))
89 		map_bh(bh, inode->i_sb, block);
90 
91 	set_buffer_uptodate(bh);
92 	if (gfs2_is_jdata(ip))
93 		gfs2_trans_add_data(ip->i_gl, bh);
94 	else {
95 		mark_buffer_dirty(bh);
96 		gfs2_ordered_add_inode(ip);
97 	}
98 
99 	if (release) {
100 		unlock_page(page);
101 		put_page(page);
102 	}
103 
104 	return 0;
105 }
106 
107 /**
108  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
109  * @ip: The GFS2 inode to unstuff
110  * @page: The (optional) page. This is looked up if @page is NULL
111  *
112  * This routine unstuffs a dinode and returns it to a "normal" state such
113  * that the height can be grown in the traditional way.
114  *
115  * Returns: errno
116  */
117 
118 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
119 {
120 	struct buffer_head *bh, *dibh;
121 	struct gfs2_dinode *di;
122 	u64 block = 0;
123 	int isdir = gfs2_is_dir(ip);
124 	int error;
125 
126 	down_write(&ip->i_rw_mutex);
127 
128 	error = gfs2_meta_inode_buffer(ip, &dibh);
129 	if (error)
130 		goto out;
131 
132 	if (i_size_read(&ip->i_inode)) {
133 		/* Get a free block, fill it with the stuffed data,
134 		   and write it out to disk */
135 
136 		unsigned int n = 1;
137 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
138 		if (error)
139 			goto out_brelse;
140 		if (isdir) {
141 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
142 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
143 			if (error)
144 				goto out_brelse;
145 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
146 					      dibh, sizeof(struct gfs2_dinode));
147 			brelse(bh);
148 		} else {
149 			error = gfs2_unstuffer_page(ip, dibh, block, page);
150 			if (error)
151 				goto out_brelse;
152 		}
153 	}
154 
155 	/*  Set up the pointer to the new block  */
156 
157 	gfs2_trans_add_meta(ip->i_gl, dibh);
158 	di = (struct gfs2_dinode *)dibh->b_data;
159 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
160 
161 	if (i_size_read(&ip->i_inode)) {
162 		*(__be64 *)(di + 1) = cpu_to_be64(block);
163 		gfs2_add_inode_blocks(&ip->i_inode, 1);
164 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
165 	}
166 
167 	ip->i_height = 1;
168 	di->di_height = cpu_to_be16(1);
169 
170 out_brelse:
171 	brelse(dibh);
172 out:
173 	up_write(&ip->i_rw_mutex);
174 	return error;
175 }
176 
177 
178 /**
179  * find_metapath - Find path through the metadata tree
180  * @sdp: The superblock
181  * @block: The disk block to look up
182  * @mp: The metapath to return the result in
183  * @height: The pre-calculated height of the metadata tree
184  *
185  *   This routine fills in the metapath structure @mp with a path
186  *   through the metadata of the inode to get to block @block.
187  *
188  *   Example:
189  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
190  *   filesystem with a blocksize of 4096.
191  *
192  *   find_metapath() would return a struct metapath structure set to:
193  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
194  *
195  *   That means that in order to get to the block containing the byte at
196  *   offset 101342453, we would load the indirect block pointed to by pointer
197  *   0 in the dinode.  We would then load the indirect block pointed to by
198  *   pointer 48 in that indirect block.  We would then load the data block
199  *   pointed to by pointer 165 in that indirect block.
200  *
201  *             ----------------------------------------
202  *             | Dinode |                             |
203  *             |        |                            4|
204  *             |        |0 1 2 3 4 5                 9|
205  *             |        |                            6|
206  *             ----------------------------------------
207  *                       |
208  *                       |
209  *                       V
210  *             ----------------------------------------
211  *             | Indirect Block                       |
212  *             |                                     5|
213  *             |            4 4 4 4 4 5 5            1|
214  *             |0           5 6 7 8 9 0 1            2|
215  *             ----------------------------------------
216  *                                |
217  *                                |
218  *                                V
219  *             ----------------------------------------
220  *             | Indirect Block                       |
221  *             |                         1 1 1 1 1   5|
222  *             |                         6 6 6 6 6   1|
223  *             |0                        3 4 5 6 7   2|
224  *             ----------------------------------------
225  *                                           |
226  *                                           |
227  *                                           V
228  *             ----------------------------------------
229  *             | Data block containing offset         |
230  *             |            101342453                 |
231  *             |                                      |
232  *             |                                      |
233  *             ----------------------------------------
234  *
235  */
236 
237 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
238 			  struct metapath *mp, unsigned int height)
239 {
240 	unsigned int i;
241 
242 	mp->mp_fheight = height;
243 	for (i = height; i--;)
244 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
245 }
246 
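/*
 * When the tree height is grown, the old top-level content moves down
 * to become pointer 0 of the new top level.  If mp_list[0] == 0, the
 * new branch shares that existing subtree, so extra indirect blocks
 * are only needed from height 2 downwards; otherwise the branch
 * diverges already at height 1.  (This is how gfs2_iomap_alloc()
 * below appears to use the value.)
 */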
247 static inline unsigned int metapath_branch_start(const struct metapath *mp)
248 {
249 	if (mp->mp_list[0] == 0)
250 		return 2;
251 	return 1;
252 }
253 
254 /**
255  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
256  * @height: The metadata height (0 = dinode)
257  * @mp: The metapath
258  */
259 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
260 {
261 	struct buffer_head *bh = mp->mp_bh[height];
262 	if (height == 0)
263 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
264 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
265 }
266 
267 /**
268  * metapointer - Return pointer to start of metadata in a buffer
269  * @height: The metadata height (0 = dinode)
270  * @mp: The metapath
271  *
272  * Return a pointer to the block number of the next height of the metadata
273  * tree given a buffer containing the pointer to the current height of the
274  * metadata tree.
275  */
276 
277 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
278 {
279 	__be64 *p = metaptr1(height, mp);
280 	return p + mp->mp_list[height];
281 }
282 
283 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
284 {
285 	const struct buffer_head *bh = mp->mp_bh[height];
286 	return (const __be64 *)(bh->b_data + bh->b_size);
287 }
288 
289 static void clone_metapath(struct metapath *clone, struct metapath *mp)
290 {
291 	unsigned int hgt;
292 
293 	*clone = *mp;
294 	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
295 		get_bh(clone->mp_bh[hgt]);
296 }
297 
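/*
 * gfs2_metapath_ra - issue read-ahead for a range of indirect pointers
 *
 * Starts asynchronous reads for the metadata blocks referenced by the
 * pointers in [start, end); buffers which are already up to date or
 * locked by someone else are skipped.
 */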
298 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
299 {
300 	const __be64 *t;
301 
302 	for (t = start; t < end; t++) {
303 		struct buffer_head *rabh;
304 
305 		if (!*t)
306 			continue;
307 
308 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
309 		if (trylock_buffer(rabh)) {
310 			if (!buffer_uptodate(rabh)) {
311 				rabh->b_end_io = end_buffer_read_sync;
312 				submit_bh(REQ_OP_READ,
313 					  REQ_RAHEAD | REQ_META | REQ_PRIO,
314 					  rabh);
315 				continue;
316 			}
317 			unlock_buffer(rabh);
318 		}
319 		brelse(rabh);
320 	}
321 }
322 
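/*
 * __fillup_metapath - read in metadata buffers between heights x and h
 *
 * Walks down the tree reading one indirect block per height, stopping
 * early at the first NULL pointer, and records the height actually
 * reached in mp->mp_aheight.
 */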
323 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
324 			     unsigned int x, unsigned int h)
325 {
326 	for (; x < h; x++) {
327 		__be64 *ptr = metapointer(x, mp);
328 		u64 dblock = be64_to_cpu(*ptr);
329 		int ret;
330 
331 		if (!dblock)
332 			break;
333 		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
334 		if (ret)
335 			return ret;
336 	}
337 	mp->mp_aheight = x + 1;
338 	return 0;
339 }
340 
341 /**
342  * lookup_metapath - Walk the metadata tree to a specific point
343  * @ip: The inode
344  * @mp: The metapath
345  *
346  * Assumes that the inode's buffer has already been looked up and
347  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
348  * by find_metapath().
349  *
350  * If this function encounters part of the tree which has not been
351  * allocated, it records the tree height at the point at which it
352  * found the unallocated block in mp->mp_aheight. Blocks which are
353  * found are added to the mp->mp_bh[] list.
354  *
355  * Returns: error
356  */
357 
358 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
359 {
360 	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
361 }
362 
363 /**
364  * fillup_metapath - fill up buffers for the metadata path to a specific height
365  * @ip: The inode
366  * @mp: The metapath
367  * @h: The height to which it should be mapped
368  *
369  * Similar to lookup_metapath, but does lookups for a range of heights
370  *
371  * Returns: error or the number of buffers filled
372  */
373 
374 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
375 {
376 	unsigned int x = 0;
377 	int ret;
378 
379 	if (h) {
380 		/* find the first buffer we need to look up. */
381 		for (x = h - 1; x > 0; x--) {
382 			if (mp->mp_bh[x])
383 				break;
384 		}
385 	}
386 	ret = __fillup_metapath(ip, mp, x, h);
387 	if (ret)
388 		return ret;
389 	return mp->mp_aheight - x - 1;
390 }
391 
392 static inline void release_metapath(struct metapath *mp)
393 {
394 	int i;
395 
396 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
397 		if (mp->mp_bh[i] == NULL)
398 			break;
399 		brelse(mp->mp_bh[i]);
400 	}
401 }
402 
403 /**
404  * gfs2_extent_length - Returns length of an extent of blocks
405  * @start: Start of the buffer
406  * @len: Length of the buffer in bytes
407  * @ptr: Current position in the buffer
408  * @limit: Max extent length to return (0 = unlimited)
409  * @eob: Set to 1 if we hit "end of block"
410  *
411  * If the first block is zero (unallocated) it will return the number of
412  * unallocated blocks in the extent, otherwise it will return the number
413  * of contiguous blocks in the extent.
414  *
415  * Returns: The length of the extent (minimum of one block)
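 * Example (illustrative): given pointers {100, 101, 102, 0, ...} with
 * @ptr at the first entry, the extent length is 3; given {0, 0, 0, 7}
 * with @ptr at the first zero, the unallocated extent length is also 3.
 *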
416  */
417 
418 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
419 {
420 	const __be64 *end = (start + len);
421 	const __be64 *first = ptr;
422 	u64 d = be64_to_cpu(*ptr);
423 
424 	*eob = 0;
425 	do {
426 		ptr++;
427 		if (ptr >= end)
428 			break;
429 		if (limit && --limit == 0)
430 			break;
431 		if (d)
432 			d++;
433 	} while (be64_to_cpu(*ptr) == d);
434 	if (ptr >= end)
435 		*eob = 1;
436 	return (ptr - first);
437 }
438 
439 typedef const __be64 *(*gfs2_metadata_walker)(
440 		struct metapath *mp,
441 		const __be64 *start, const __be64 *end,
442 		u64 factor, void *data);
443 
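/*
 * A gfs2_metadata_walker may return WALK_STOP to terminate the walk,
 * WALK_NEXT to continue with the following pointers at the same height,
 * or a pointer within [start, end) to make gfs2_walk_metadata() descend
 * into the tree at that position.
 */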
444 #define WALK_STOP ((__be64 *)0)
445 #define WALK_NEXT ((__be64 *)1)
446 
447 static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
448 		u64 len, struct metapath *mp, gfs2_metadata_walker walker,
449 		void *data)
450 {
451 	struct metapath clone;
452 	struct gfs2_inode *ip = GFS2_I(inode);
453 	struct gfs2_sbd *sdp = GFS2_SB(inode);
454 	const __be64 *start, *end, *ptr;
455 	u64 factor = 1;
456 	unsigned int hgt;
457 	int ret = 0;
458 
459 	for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
460 		factor *= sdp->sd_inptrs;
461 
462 	for (;;) {
463 		u64 step;
464 
465 		/* Walk indirect block. */
466 		start = metapointer(hgt, mp);
467 		end = metaend(hgt, mp);
468 
469 		step = (end - start) * factor;
470 		if (step > len)
471 			end = start + DIV_ROUND_UP_ULL(len, factor);
472 
473 		ptr = walker(mp, start, end, factor, data);
474 		if (ptr == WALK_STOP)
475 			break;
476 		if (step >= len)
477 			break;
478 		len -= step;
479 		if (ptr != WALK_NEXT) {
480 			BUG_ON(!*ptr);
481 			mp->mp_list[hgt] += ptr - start;
482 			goto fill_up_metapath;
483 		}
484 
485 lower_metapath:
486 		/* Decrease height of metapath. */
487 		if (mp != &clone) {
488 			clone_metapath(&clone, mp);
489 			mp = &clone;
490 		}
491 		brelse(mp->mp_bh[hgt]);
492 		mp->mp_bh[hgt] = NULL;
493 		if (!hgt)
494 			break;
495 		hgt--;
496 		factor *= sdp->sd_inptrs;
497 
498 		/* Advance in metadata tree. */
499 		(mp->mp_list[hgt])++;
500 		start = metapointer(hgt, mp);
501 		end = metaend(hgt, mp);
502 		if (start >= end) {
503 			mp->mp_list[hgt] = 0;
504 			if (!hgt)
505 				break;
506 			goto lower_metapath;
507 		}
508 
509 fill_up_metapath:
510 		/* Increase height of metapath. */
511 		if (mp != &clone) {
512 			clone_metapath(&clone, mp);
513 			mp = &clone;
514 		}
515 		ret = fillup_metapath(ip, mp, ip->i_height - 1);
516 		if (ret < 0)
517 			break;
518 		hgt += ret;
519 		for (; ret; ret--)
520 			do_div(factor, sdp->sd_inptrs);
521 		mp->mp_aheight = hgt + 1;
522 	}
523 	if (mp == &clone)
524 		release_metapath(mp);
525 	return ret;
526 }
527 
528 struct gfs2_hole_walker_args {
529 	u64 blocks;
530 };
531 
532 static const __be64 *gfs2_hole_walker(struct metapath *mp,
533 		const __be64 *start, const __be64 *end,
534 		u64 factor, void *data)
535 {
536 	struct gfs2_hole_walker_args *args = data;
537 	const __be64 *ptr;
538 
539 	for (ptr = start; ptr < end; ptr++) {
540 		if (*ptr) {
541 			args->blocks += (ptr - start) * factor;
542 			if (mp->mp_aheight == mp->mp_fheight)
543 				return WALK_STOP;
544 			return ptr;  /* increase height */
545 		}
546 	}
547 	args->blocks += (end - start) * factor;
548 	return WALK_NEXT;
549 }
550 
551 /**
552  * gfs2_hole_size - figure out the size of a hole
553  * @inode: The inode
554  * @lblock: The logical starting block number
555  * @len: How far to look (in blocks)
556  * @mp: The metapath at lblock
557  * @iomap: The iomap to store the hole size in
558  *
559  * This function modifies @mp.
560  *
561  * Returns: errno on error
562  */
563 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
564 			  struct metapath *mp, struct iomap *iomap)
565 {
566 	struct gfs2_hole_walker_args args = { };
567 	int ret = 0;
568 
569 	ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
570 	if (!ret)
571 		iomap->length = args.blocks << inode->i_blkbits;
572 	return ret;
573 }
574 
575 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
576 					 struct gfs2_glock *gl, unsigned int i,
577 					 unsigned offset, u64 bn)
578 {
579 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
580 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
581 				 sizeof(struct gfs2_dinode)));
582 	BUG_ON(i < 1);
583 	BUG_ON(mp->mp_bh[i] != NULL);
584 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
585 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
586 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
587 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
588 	ptr += offset;
589 	*ptr = cpu_to_be64(bn);
590 	return ptr;
591 }
592 
593 enum alloc_state {
594 	ALLOC_DATA = 0,
595 	ALLOC_GROW_DEPTH = 1,
596 	ALLOC_GROW_HEIGHT = 2,
597 	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
598 };
599 
600 /**
601  * gfs2_iomap_alloc - Build a metadata tree of the requested height
602  * @inode: The GFS2 inode
603  * @iomap: The iomap structure
604  * @flags: iomap flags
605  * @mp: The metapath, with proper height information calculated
606  *
607  * In this routine we may have to alloc:
608  *   i) Indirect blocks to grow the metadata tree height
609  *  ii) Indirect blocks to fill in lower part of the metadata tree
610  * iii) Data blocks
611  *
612  * The function is in two parts. The first part works out the total
613  * number of blocks which we need. The second part does the actual
614  * allocation asking for an extent at a time (if enough contiguous free
615  * blocks are available, there will only be one request per bmap call)
616  * and uses the state machine to initialise the blocks in order.
617  *
618  * Right now, this function will allocate at most one indirect block
619  * worth of data -- with a default block size of 4K, that's slightly
620  * less than 2M.  If this limitation is ever removed to allow huge
621  * allocations, we would probably still want to limit the iomap size we
622  * return to avoid stalling other tasks during huge writes; the next
623  * iomap iteration would then find the blocks already allocated.
624  *
625  * Returns: errno on error
626  */
627 
628 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
629 			    unsigned flags, struct metapath *mp)
630 {
631 	struct gfs2_inode *ip = GFS2_I(inode);
632 	struct gfs2_sbd *sdp = GFS2_SB(inode);
633 	struct buffer_head *dibh = mp->mp_bh[0];
634 	u64 bn;
635 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
636 	unsigned dblks = 0;
637 	unsigned ptrs_per_blk;
638 	const unsigned end_of_metadata = mp->mp_fheight - 1;
639 	int ret;
640 	enum alloc_state state;
641 	__be64 *ptr;
642 	__be64 zero_bn = 0;
643 	size_t maxlen = iomap->length >> inode->i_blkbits;
644 
645 	BUG_ON(mp->mp_aheight < 1);
646 	BUG_ON(dibh == NULL);
647 
648 	gfs2_trans_add_meta(ip->i_gl, dibh);
649 
650 	down_write(&ip->i_rw_mutex);
651 
652 	if (mp->mp_fheight == mp->mp_aheight) {
653 		struct buffer_head *bh;
654 		int eob;
655 
656 		/* Bottom indirect block exists, find unalloced extent size */
657 		ptr = metapointer(end_of_metadata, mp);
658 		bh = mp->mp_bh[end_of_metadata];
659 		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
660 					   maxlen, &eob);
661 		BUG_ON(dblks < 1);
662 		state = ALLOC_DATA;
663 	} else {
664 		/* Need to allocate indirect blocks */
665 		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
666 			sdp->sd_diptrs;
667 		dblks = min(maxlen, (size_t)(ptrs_per_blk -
668 					     mp->mp_list[end_of_metadata]));
669 		if (mp->mp_fheight == ip->i_height) {
670 			/* Writing into existing tree, extend tree down */
671 			iblks = mp->mp_fheight - mp->mp_aheight;
672 			state = ALLOC_GROW_DEPTH;
673 		} else {
674 			/* Building up tree height */
675 			state = ALLOC_GROW_HEIGHT;
676 			iblks = mp->mp_fheight - ip->i_height;
677 			branch_start = metapath_branch_start(mp);
678 			iblks += (mp->mp_fheight - branch_start);
679 		}
680 	}
681 
682 	/* start of the second part of the function (state machine) */
683 
684 	blks = dblks + iblks;
685 	i = mp->mp_aheight;
686 	do {
687 		n = blks - alloced;
688 		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
689 		if (ret)
690 			goto out;
691 		alloced += n;
692 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
693 			gfs2_trans_add_unrevoke(sdp, bn, n);
694 		switch (state) {
695 		/* Growing height of tree */
696 		case ALLOC_GROW_HEIGHT:
697 			if (i == 1) {
698 				ptr = (__be64 *)(dibh->b_data +
699 						 sizeof(struct gfs2_dinode));
700 				zero_bn = *ptr;
701 			}
702 			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
703 			     i++, n--)
704 				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
705 			if (i - 1 == mp->mp_fheight - ip->i_height) {
706 				i--;
707 				gfs2_buffer_copy_tail(mp->mp_bh[i],
708 						sizeof(struct gfs2_meta_header),
709 						dibh, sizeof(struct gfs2_dinode));
710 				gfs2_buffer_clear_tail(dibh,
711 						sizeof(struct gfs2_dinode) +
712 						sizeof(__be64));
713 				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
714 					sizeof(struct gfs2_meta_header));
715 				*ptr = zero_bn;
716 				state = ALLOC_GROW_DEPTH;
717 				for (i = branch_start; i < mp->mp_fheight; i++) {
718 					if (mp->mp_bh[i] == NULL)
719 						break;
720 					brelse(mp->mp_bh[i]);
721 					mp->mp_bh[i] = NULL;
722 				}
723 				i = branch_start;
724 			}
725 			if (n == 0)
726 				break;
727 		/* Branching from existing tree */
728 		case ALLOC_GROW_DEPTH:
729 			if (i > 1 && i < mp->mp_fheight)
730 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
731 			for (; i < mp->mp_fheight && n > 0; i++, n--)
732 				gfs2_indirect_init(mp, ip->i_gl, i,
733 						   mp->mp_list[i-1], bn++);
734 			if (i == mp->mp_fheight)
735 				state = ALLOC_DATA;
736 			if (n == 0)
737 				break;
738 		/* Tree complete, adding data blocks */
739 		case ALLOC_DATA:
740 			BUG_ON(n > dblks);
741 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
742 			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
743 			dblks = n;
744 			ptr = metapointer(end_of_metadata, mp);
745 			iomap->addr = bn << inode->i_blkbits;
746 			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
747 			while (n-- > 0)
748 				*ptr++ = cpu_to_be64(bn++);
749 			break;
750 		}
751 	} while (iomap->addr == IOMAP_NULL_ADDR);
752 
753 	iomap->length = (u64)dblks << inode->i_blkbits;
754 	ip->i_height = mp->mp_fheight;
755 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
756 	gfs2_dinode_out(ip, dibh->b_data);
757 out:
758 	up_write(&ip->i_rw_mutex);
759 	return ret;
760 }
761 
762 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
763 {
764 	struct gfs2_inode *ip = GFS2_I(inode);
765 
766 	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
767 		      sizeof(struct gfs2_dinode);
768 	iomap->offset = 0;
769 	iomap->length = i_size_read(inode);
770 	iomap->type = IOMAP_INLINE;
771 }
772 
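/*
 * The iomap "private" flag is used to carry the metadata boundary
 * condition from gfs2_iomap_get() back to gfs2_block_map(), where it
 * is translated into buffer_boundary() on the mapped buffer head.
 */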
773 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
774 
775 /**
776  * gfs2_iomap_get - Map blocks from an inode to disk blocks
777  * @inode: The inode
778  * @pos: Starting position in bytes
779  * @length: Length to map, in bytes
780  * @flags: iomap flags
781  * @iomap: The iomap structure
782  * @mp: The metapath
783  *
784  * Returns: errno
785  */
786 static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
787 			  unsigned flags, struct iomap *iomap,
788 			  struct metapath *mp)
789 {
790 	struct gfs2_inode *ip = GFS2_I(inode);
791 	struct gfs2_sbd *sdp = GFS2_SB(inode);
792 	__be64 *ptr;
793 	sector_t lblock;
794 	sector_t lblock_stop;
795 	int ret;
796 	int eob;
797 	u64 len;
798 	struct buffer_head *bh;
799 	u8 height;
800 
801 	if (!length)
802 		return -EINVAL;
803 
804 	if (gfs2_is_stuffed(ip)) {
805 		if (flags & IOMAP_REPORT) {
806 			if (pos >= i_size_read(inode))
807 				return -ENOENT;
808 			gfs2_stuffed_iomap(inode, iomap);
809 			return 0;
810 		}
811 		BUG_ON(!(flags & IOMAP_WRITE));
812 	}
813 	lblock = pos >> inode->i_blkbits;
814 	iomap->offset = lblock << inode->i_blkbits;
815 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
816 	len = lblock_stop - lblock + 1;
817 
818 	down_read(&ip->i_rw_mutex);
819 
820 	ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]);
821 	if (ret)
822 		goto unlock;
823 
824 	height = ip->i_height;
825 	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
826 		height++;
827 	find_metapath(sdp, lblock, mp, height);
828 	if (height > ip->i_height || gfs2_is_stuffed(ip))
829 		goto do_alloc;
830 
831 	ret = lookup_metapath(ip, mp);
832 	if (ret)
833 		goto unlock;
834 
835 	if (mp->mp_aheight != ip->i_height)
836 		goto do_alloc;
837 
838 	ptr = metapointer(ip->i_height - 1, mp);
839 	if (*ptr == 0)
840 		goto do_alloc;
841 
842 	bh = mp->mp_bh[ip->i_height - 1];
843 	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob);
844 
845 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
846 	iomap->length = len << inode->i_blkbits;
847 	iomap->type = IOMAP_MAPPED;
848 	iomap->flags = IOMAP_F_MERGED;
849 	if (eob)
850 		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
851 
852 out:
853 	iomap->bdev = inode->i_sb->s_bdev;
854 unlock:
855 	up_read(&ip->i_rw_mutex);
856 	return ret;
857 
858 do_alloc:
859 	iomap->addr = IOMAP_NULL_ADDR;
860 	iomap->length = len << inode->i_blkbits;
861 	iomap->type = IOMAP_HOLE;
862 	iomap->flags = 0;
863 	if (flags & IOMAP_REPORT) {
864 		loff_t size = i_size_read(inode);
865 		if (pos >= size)
866 			ret = -ENOENT;
867 		else if (height == ip->i_height)
868 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
869 		else
870 			iomap->length = size - pos;
871 	}
872 	goto out;
873 }
874 
875 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
876 			    unsigned flags, struct iomap *iomap)
877 {
878 	struct gfs2_inode *ip = GFS2_I(inode);
879 	struct metapath mp = { .mp_aheight = 1, };
880 	int ret;
881 
882 	trace_gfs2_iomap_start(ip, pos, length, flags);
883 	if (flags & IOMAP_WRITE) {
884 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
885 		if (!ret && iomap->type == IOMAP_HOLE)
886 			ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
887 		release_metapath(&mp);
888 	} else {
889 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
890 		release_metapath(&mp);
891 	}
892 	trace_gfs2_iomap_end(ip, iomap, ret);
893 	return ret;
894 }
895 
896 const struct iomap_ops gfs2_iomap_ops = {
897 	.iomap_begin = gfs2_iomap_begin,
898 };
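/*
 * gfs2_iomap_ops is handed to the generic iomap helpers from other
 * parts of gfs2.  An illustrative (not exhaustive) call site would
 * look like:
 *
 *	ret = iomap_fiemap(inode, fieinfo, start, len, &gfs2_iomap_ops);
 */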
899 
900 /**
901  * gfs2_block_map - Map one or more blocks of an inode to a disk block
902  * @inode: The inode
903  * @lblock: The logical block number
904  * @bh_map: The bh to be mapped
905  * @create: True if it's ok to allocate blocks to satisfy the request
906  *
907  * The size of the requested mapping is defined in bh_map->b_size.
908  *
909  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
910  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
911  * bh_map->b_size to indicate the size of the mapping when @lblock and
912  * successive blocks are mapped, up to the requested size.
913  *
914  * Sets buffer_boundary() if a read of metadata will be required
915  * before the next block can be mapped. Sets buffer_new() if new
916  * blocks were allocated.
917  *
918  * Returns: errno
919  */
920 
921 int gfs2_block_map(struct inode *inode, sector_t lblock,
922 		   struct buffer_head *bh_map, int create)
923 {
924 	struct gfs2_inode *ip = GFS2_I(inode);
925 	loff_t pos = (loff_t)lblock << inode->i_blkbits;
926 	loff_t length = bh_map->b_size;
927 	struct metapath mp = { .mp_aheight = 1, };
928 	struct iomap iomap = { };
929 	int ret;
930 
931 	clear_buffer_mapped(bh_map);
932 	clear_buffer_new(bh_map);
933 	clear_buffer_boundary(bh_map);
934 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
935 
936 	if (create) {
937 		ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
938 		if (!ret && iomap.type == IOMAP_HOLE)
939 			ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
940 		release_metapath(&mp);
941 	} else {
942 		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
943 		release_metapath(&mp);
944 
945 		/* Return unmapped buffer beyond the end of file. */
946 		if (ret == -ENOENT) {
947 			ret = 0;
948 			goto out;
949 		}
950 	}
951 	if (ret)
952 		goto out;
953 
954 	if (iomap.length > bh_map->b_size) {
955 		iomap.length = bh_map->b_size;
956 		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
957 	}
958 	if (iomap.addr != IOMAP_NULL_ADDR)
959 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
960 	bh_map->b_size = iomap.length;
961 	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
962 		set_buffer_boundary(bh_map);
963 	if (iomap.flags & IOMAP_F_NEW)
964 		set_buffer_new(bh_map);
965 
966 out:
967 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
968 	return ret;
969 }
970 
971 /*
972  * Deprecated: do not use in new code
973  */
974 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
975 {
976 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
977 	int ret;
978 	int create = *new;
979 
980 	BUG_ON(!extlen);
981 	BUG_ON(!dblock);
982 	BUG_ON(!new);
983 
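	/* When not allocating, request a mapping of up to 32 blocks
	   (BIT(i_blkbits + 5)) so that the extent length is returned;
	   when allocating, map exactly one block. */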
984 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
985 	ret = gfs2_block_map(inode, lblock, &bh, create);
986 	*extlen = bh.b_size >> inode->i_blkbits;
987 	*dblock = bh.b_blocknr;
988 	if (buffer_new(&bh))
989 		*new = 1;
990 	else
991 		*new = 0;
992 	return ret;
993 }
994 
995 /**
996  * gfs2_block_zero_range - Deal with zeroing out data
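 * @inode: The inode
 * @from: The starting byte offset within the file
 * @length: The number of bytes to zero out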
997  *
998  * This is partly borrowed from ext3.
999  */
1000 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1001 				 unsigned int length)
1002 {
1003 	struct address_space *mapping = inode->i_mapping;
1004 	struct gfs2_inode *ip = GFS2_I(inode);
1005 	unsigned long index = from >> PAGE_SHIFT;
1006 	unsigned offset = from & (PAGE_SIZE-1);
1007 	unsigned blocksize, iblock, pos;
1008 	struct buffer_head *bh;
1009 	struct page *page;
1010 	int err;
1011 
1012 	page = find_or_create_page(mapping, index, GFP_NOFS);
1013 	if (!page)
1014 		return 0;
1015 
1016 	blocksize = inode->i_sb->s_blocksize;
1017 	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
1018 
1019 	if (!page_has_buffers(page))
1020 		create_empty_buffers(page, blocksize, 0);
1021 
1022 	/* Find the buffer that contains "offset" */
1023 	bh = page_buffers(page);
1024 	pos = blocksize;
1025 	while (offset >= pos) {
1026 		bh = bh->b_this_page;
1027 		iblock++;
1028 		pos += blocksize;
1029 	}
1030 
1031 	err = 0;
1032 
1033 	if (!buffer_mapped(bh)) {
1034 		gfs2_block_map(inode, iblock, bh, 0);
1035 		/* unmapped? It's a hole - nothing to do */
1036 		if (!buffer_mapped(bh))
1037 			goto unlock;
1038 	}
1039 
1040 	/* Ok, it's mapped. Make sure it's up-to-date */
1041 	if (PageUptodate(page))
1042 		set_buffer_uptodate(bh);
1043 
1044 	if (!buffer_uptodate(bh)) {
1045 		err = -EIO;
1046 		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1047 		wait_on_buffer(bh);
1048 		/* Uhhuh. Read error. Complain and punt. */
1049 		if (!buffer_uptodate(bh))
1050 			goto unlock;
1051 		err = 0;
1052 	}
1053 
1054 	if (gfs2_is_jdata(ip))
1055 		gfs2_trans_add_data(ip->i_gl, bh);
1056 	else
1057 		gfs2_ordered_add_inode(ip);
1058 
1059 	zero_user(page, offset, length);
1060 	mark_buffer_dirty(bh);
1061 unlock:
1062 	unlock_page(page);
1063 	put_page(page);
1064 	return err;
1065 }
1066 
1067 #define GFS2_JTRUNC_REVOKES 8192
1068 
1069 /**
1070  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1071  * @inode: The inode being truncated
1072  * @oldsize: The original (larger) size
1073  * @newsize: The new smaller size
1074  *
1075  * With jdata files, we have to journal a revoke for each block which is
1076  * truncated. As a result, we need to split this into separate transactions
1077  * if the number of pages being truncated gets too large.
1078  */
1079 
1080 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1081 {
1082 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1083 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1084 	u64 chunk;
1085 	int error;
1086 
1087 	while (oldsize != newsize) {
1088 		struct gfs2_trans *tr;
1089 		unsigned int offs;
1090 
1091 		chunk = oldsize - newsize;
1092 		if (chunk > max_chunk)
1093 			chunk = max_chunk;
1094 
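		/* Align each intermediate size (oldsize - chunk) to a
		   page boundary; only the final chunk down to newsize
		   may end inside a page. */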
1095 		offs = oldsize & ~PAGE_MASK;
1096 		if (offs && chunk > PAGE_SIZE)
1097 			chunk = offs + ((chunk - offs) & PAGE_MASK);
1098 
1099 		truncate_pagecache(inode, oldsize - chunk);
1100 		oldsize -= chunk;
1101 
1102 		tr = current->journal_info;
1103 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1104 			continue;
1105 
1106 		gfs2_trans_end(sdp);
1107 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1108 		if (error)
1109 			return error;
1110 	}
1111 
1112 	return 0;
1113 }
1114 
1115 static int trunc_start(struct inode *inode, u64 newsize)
1116 {
1117 	struct gfs2_inode *ip = GFS2_I(inode);
1118 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1119 	struct buffer_head *dibh = NULL;
1120 	int journaled = gfs2_is_jdata(ip);
1121 	u64 oldsize = inode->i_size;
1122 	int error;
1123 
1124 	if (journaled)
1125 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1126 	else
1127 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1128 	if (error)
1129 		return error;
1130 
1131 	error = gfs2_meta_inode_buffer(ip, &dibh);
1132 	if (error)
1133 		goto out;
1134 
1135 	gfs2_trans_add_meta(ip->i_gl, dibh);
1136 
1137 	if (gfs2_is_stuffed(ip)) {
1138 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1139 	} else {
1140 		unsigned int blocksize = i_blocksize(inode);
1141 		unsigned int offs = newsize & (blocksize - 1);
1142 		if (offs) {
1143 			error = gfs2_block_zero_range(inode, newsize,
1144 						      blocksize - offs);
1145 			if (error)
1146 				goto out;
1147 		}
1148 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1149 	}
1150 
1151 	i_size_write(inode, newsize);
1152 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1153 	gfs2_dinode_out(ip, dibh->b_data);
1154 
1155 	if (journaled)
1156 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1157 	else
1158 		truncate_pagecache(inode, newsize);
1159 
1160 out:
1161 	brelse(dibh);
1162 	if (current->journal_info)
1163 		gfs2_trans_end(sdp);
1164 	return error;
1165 }
1166 
1167 int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
1168 			 struct iomap *iomap)
1169 {
1170 	struct metapath mp = { .mp_aheight = 1, };
1171 	int ret;
1172 
1173 	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1174 	if (!ret && iomap->type == IOMAP_HOLE)
1175 		ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
1176 	release_metapath(&mp);
1177 	return ret;
1178 }
1179 
1180 /**
1181  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1182  * @ip: inode
1183  * @rd_gh: holder of the resource group glock
1184  * @bh: buffer head to sweep
1185  * @start: starting point in bh
1186  * @end: end point in bh
1187  * @meta: true if bh points to metadata (rather than data)
1188  * @btotal: place to keep count of total blocks freed
1189  *
1190  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1191  * free, and free them all. However, we do it one rgrp at a time. If this
1192  * block has references to multiple rgrps, we break it into individual
1193  * transactions. This allows other processes to use the rgrps while we're
1194  * focused on a single one, for better concurrency / performance.
1195  * At every transaction boundary, we rewrite the inode into the journal.
1196  * That way the bitmaps are kept consistent with the inode and we can recover
1197  * if we're interrupted by power-outages.
1198  *
1199  * Returns: 0, or return code if an error occurred.
1200  *          *btotal has the total number of blocks freed
1201  */
1202 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1203 			      struct buffer_head *bh, __be64 *start, __be64 *end,
1204 			      bool meta, u32 *btotal)
1205 {
1206 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1207 	struct gfs2_rgrpd *rgd;
1208 	struct gfs2_trans *tr;
1209 	__be64 *p;
1210 	int blks_outside_rgrp;
1211 	u64 bn, bstart, isize_blks;
1212 	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1213 	int ret = 0;
1214 	bool buf_in_tr = false; /* buffer was added to transaction */
1215 
1216 more_rgrps:
1217 	rgd = NULL;
1218 	if (gfs2_holder_initialized(rd_gh)) {
1219 		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1220 		gfs2_assert_withdraw(sdp,
1221 			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1222 	}
1223 	blks_outside_rgrp = 0;
1224 	bstart = 0;
1225 	blen = 0;
1226 
1227 	for (p = start; p < end; p++) {
1228 		if (!*p)
1229 			continue;
1230 		bn = be64_to_cpu(*p);
1231 
1232 		if (rgd) {
1233 			if (!rgrp_contains_block(rgd, bn)) {
1234 				blks_outside_rgrp++;
1235 				continue;
1236 			}
1237 		} else {
1238 			rgd = gfs2_blk2rgrpd(sdp, bn, true);
1239 			if (unlikely(!rgd)) {
1240 				ret = -EIO;
1241 				goto out;
1242 			}
1243 			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1244 						 0, rd_gh);
1245 			if (ret)
1246 				goto out;
1247 
1248 			/* Must be done with the rgrp glock held: */
1249 			if (gfs2_rs_active(&ip->i_res) &&
1250 			    rgd == ip->i_res.rs_rbm.rgd)
1251 				gfs2_rs_deltree(&ip->i_res);
1252 		}
1253 
1254 		/* The size of our transactions will be unknown until we
1255 		   actually process all the metadata blocks that relate to
1256 		   the rgrp. So we estimate. We know it can't be more than
1257 		   the dinode's i_blocks and we don't want to exceed the
1258 		   journal flush threshold, sd_log_thresh2. */
1259 		if (current->journal_info == NULL) {
1260 			unsigned int jblocks_rqsted, revokes;
1261 
1262 			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1263 				RES_INDIRECT;
1264 			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1265 			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1266 				jblocks_rqsted +=
1267 					atomic_read(&sdp->sd_log_thresh2);
1268 			else
1269 				jblocks_rqsted += isize_blks;
1270 			revokes = jblocks_rqsted;
1271 			if (meta)
1272 				revokes += end - start;
1273 			else if (ip->i_depth)
1274 				revokes += sdp->sd_inptrs;
1275 			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1276 			if (ret)
1277 				goto out_unlock;
1278 			down_write(&ip->i_rw_mutex);
1279 		}
1280 		/* check if we will exceed the transaction blocks requested */
1281 		tr = current->journal_info;
1282 		if (tr->tr_num_buf_new + RES_STATFS +
1283 		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1284 			/* We set blks_outside_rgrp to ensure the loop will
1285 			   be repeated for the same rgrp, but with a new
1286 			   transaction. */
1287 			blks_outside_rgrp++;
1288 			/* This next part is tricky. If the buffer was added
1289 			   to the transaction, we've already set some block
1290 			   pointers to 0, so we better follow through and free
1291 			   them, or we will introduce corruption (so break).
1292 			   This may be impossible, or at least rare, but I
1293 			   decided to cover the case regardless.
1294 
1295 			   If the buffer was not added to the transaction
1296 			   (this call), doing so would exceed our transaction
1297 			   size, so we need to end the transaction and start a
1298 			   new one (so goto). */
1299 
1300 			if (buf_in_tr)
1301 				break;
1302 			goto out_unlock;
1303 		}
1304 
1305 		gfs2_trans_add_meta(ip->i_gl, bh);
1306 		buf_in_tr = true;
1307 		*p = 0;
1308 		if (bstart + blen == bn) {
1309 			blen++;
1310 			continue;
1311 		}
1312 		if (bstart) {
1313 			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1314 			(*btotal) += blen;
1315 			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1316 		}
1317 		bstart = bn;
1318 		blen = 1;
1319 	}
1320 	if (bstart) {
1321 		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1322 		(*btotal) += blen;
1323 		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1324 	}
1325 out_unlock:
1326 	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1327 					    outside the rgrp we just processed,
1328 					    do it all over again. */
1329 		if (current->journal_info) {
1330 			struct buffer_head *dibh;
1331 
1332 			ret = gfs2_meta_inode_buffer(ip, &dibh);
1333 			if (ret)
1334 				goto out;
1335 
1336 			/* Every transaction boundary, we rewrite the dinode
1337 			   to keep its di_blocks current in case of failure. */
1338 			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1339 				current_time(&ip->i_inode);
1340 			gfs2_trans_add_meta(ip->i_gl, dibh);
1341 			gfs2_dinode_out(ip, dibh->b_data);
1342 			brelse(dibh);
1343 			up_write(&ip->i_rw_mutex);
1344 			gfs2_trans_end(sdp);
1345 		}
1346 		gfs2_glock_dq_uninit(rd_gh);
1347 		cond_resched();
1348 		goto more_rgrps;
1349 	}
1350 out:
1351 	return ret;
1352 }
1353 
1354 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1355 {
1356 	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1357 		return false;
1358 	return true;
1359 }
1360 
1361 /**
1362  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1363  * @mp: starting metapath
1364  * @h: desired height to search
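 * @sdp: The superblock (currently unused here)
 * @end_list: metapath indices of the end of the range, or NULL
 * @end_aligned: lowest height at which the end position is aligned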
1365  *
1366  * Assumes the metapath is valid (with buffers) out to height h.
1367  * Returns: true if a non-null pointer was found in the metapath buffer
1368  *          false if all remaining pointers are NULL in the buffer
1369  */
1370 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1371 			     unsigned int h,
1372 			     __u16 *end_list, unsigned int end_aligned)
1373 {
1374 	struct buffer_head *bh = mp->mp_bh[h];
1375 	__be64 *first, *ptr, *end;
1376 
1377 	first = metaptr1(h, mp);
1378 	ptr = first + mp->mp_list[h];
1379 	end = (__be64 *)(bh->b_data + bh->b_size);
1380 	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1381 		bool keep_end = h < end_aligned;
1382 		end = first + end_list[h] + keep_end;
1383 	}
1384 
1385 	while (ptr < end) {
1386 		if (*ptr) { /* if we have a non-null pointer */
1387 			mp->mp_list[h] = ptr - first;
1388 			h++;
1389 			if (h < GFS2_MAX_META_HEIGHT)
1390 				mp->mp_list[h] = 0;
1391 			return true;
1392 		}
1393 		ptr++;
1394 	}
1395 	return false;
1396 }
1397 
1398 enum dealloc_states {
1399 	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
1400 	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
1401 	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
1402 	DEALLOC_DONE = 3,       /* process complete */
1403 };
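/*
 * Rough flow of the punch_hole() state machine below: DEALLOC_FILL_MP
 * reads buffers in until the current strip height is reached,
 * DEALLOC_MP_FULL sweeps and frees one buffer's worth of pointers, and
 * DEALLOC_MP_LOWER advances to the next branch (or lowers the strip
 * height once a level is exhausted) until DEALLOC_DONE.
 */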
1404 
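/*
 * metapointer_range - return the range of pointers to process at @height
 *
 * Clamps the pointer range of the buffer at @height to the start_list
 * and end_list positions where the metapath matches those boundaries.
 * start_aligned and end_aligned control whether the boundary pointers
 * themselves fall inside or outside the returned [start, end) range.
 */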
1405 static inline void
1406 metapointer_range(struct metapath *mp, int height,
1407 		  __u16 *start_list, unsigned int start_aligned,
1408 		  __u16 *end_list, unsigned int end_aligned,
1409 		  __be64 **start, __be64 **end)
1410 {
1411 	struct buffer_head *bh = mp->mp_bh[height];
1412 	__be64 *first;
1413 
1414 	first = metaptr1(height, mp);
1415 	*start = first;
1416 	if (mp_eq_to_hgt(mp, start_list, height)) {
1417 		bool keep_start = height < start_aligned;
1418 		*start = first + start_list[height] + keep_start;
1419 	}
1420 	*end = (__be64 *)(bh->b_data + bh->b_size);
1421 	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1422 		bool keep_end = height < end_aligned;
1423 		*end = first + end_list[height] + keep_end;
1424 	}
1425 }
1426 
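/*
 * walk_done - check whether mp_list[height] has advanced past the last
 * pointer to process at this height (either the end_list boundary or
 * the end of the block).
 */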
1427 static inline bool walk_done(struct gfs2_sbd *sdp,
1428 			     struct metapath *mp, int height,
1429 			     __u16 *end_list, unsigned int end_aligned)
1430 {
1431 	__u16 end;
1432 
1433 	if (end_list) {
1434 		bool keep_end = height < end_aligned;
1435 		if (!mp_eq_to_hgt(mp, end_list, height))
1436 			return false;
1437 		end = end_list[height] + keep_end;
1438 	} else
1439 		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1440 	return mp->mp_list[height] >= end;
1441 }
1442 
1443 /**
1444  * punch_hole - deallocate blocks in a file
1445  * @ip: inode to truncate
1446  * @offset: the start of the hole
1447  * @length: the size of the hole (or 0 for truncate)
1448  *
1449  * Punch a hole into a file or truncate a file at a given position.  This
1450  * function operates in whole blocks (@offset and @length are rounded
1451  * accordingly); partially filled blocks must be cleared otherwise.
1452  *
1453  * This function works from the bottom up, and from the right to the left. In
1454  * other words, it strips off the highest layer (data) before stripping any of
1455  * the metadata. Doing it this way is best in case the operation is interrupted
1456  * by power failure, etc.  The dinode is rewritten in every transaction to
1457  * guarantee integrity.
1458  */
1459 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1460 {
1461 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1462 	u64 maxsize = sdp->sd_heightsize[ip->i_height];
1463 	struct metapath mp = {};
1464 	struct buffer_head *dibh, *bh;
1465 	struct gfs2_holder rd_gh;
1466 	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1467 	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1468 	__u16 start_list[GFS2_MAX_META_HEIGHT];
1469 	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1470 	unsigned int start_aligned, uninitialized_var(end_aligned);
1471 	unsigned int strip_h = ip->i_height - 1;
1472 	u32 btotal = 0;
1473 	int ret, state;
1474 	int mp_h; /* metapath buffers are read in to this height */
1475 	u64 prev_bnr = 0;
1476 	__be64 *start, *end;
1477 
1478 	if (offset >= maxsize) {
1479 		/*
1480 		 * The starting point lies beyond the allocated meta-data;
1481 		 * there are no blocks to deallocate.
1482 		 */
1483 		return 0;
1484 	}
1485 
1486 	/*
1487 	 * The start position of the hole is defined by lblock, start_list, and
1488 	 * start_aligned.  The end position of the hole is defined by lend,
1489 	 * end_list, and end_aligned.
1490 	 *
1491 	 * start_aligned and end_aligned define down to which height the start
1492 	 * and end positions are aligned to the metadata tree (i.e., the
1493 	 * position is a multiple of the metadata granularity at the height
1494 	 * above).  This determines at which heights additional meta pointers
1495 	 * needs to be preserved for the remaining data.
1496 	 * need to be preserved for the remaining data.
1497 
1498 	if (length) {
1499 		u64 end_offset = offset + length;
1500 		u64 lend;
1501 
1502 		/*
1503 		 * Clip the end at the maximum file size for the given height:
1504 		 * that's how far the metadata goes; files bigger than that
1505 		 * will have additional layers of indirection.
1506 		 */
1507 		if (end_offset > maxsize)
1508 			end_offset = maxsize;
1509 		lend = end_offset >> bsize_shift;
1510 
1511 		if (lblock >= lend)
1512 			return 0;
1513 
1514 		find_metapath(sdp, lend, &mp, ip->i_height);
1515 		end_list = __end_list;
1516 		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1517 
1518 		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1519 			if (end_list[mp_h])
1520 				break;
1521 		}
1522 		end_aligned = mp_h;
1523 	}
1524 
1525 	find_metapath(sdp, lblock, &mp, ip->i_height);
1526 	memcpy(start_list, mp.mp_list, sizeof(start_list));
1527 
1528 	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1529 		if (start_list[mp_h])
1530 			break;
1531 	}
1532 	start_aligned = mp_h;
1533 
1534 	ret = gfs2_meta_inode_buffer(ip, &dibh);
1535 	if (ret)
1536 		return ret;
1537 
1538 	mp.mp_bh[0] = dibh;
1539 	ret = lookup_metapath(ip, &mp);
1540 	if (ret)
1541 		goto out_metapath;
1542 
1543 	/* issue read-ahead on metadata */
1544 	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1545 		metapointer_range(&mp, mp_h, start_list, start_aligned,
1546 				  end_list, end_aligned, &start, &end);
1547 		gfs2_metapath_ra(ip->i_gl, start, end);
1548 	}
1549 
1550 	if (mp.mp_aheight == ip->i_height)
1551 		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1552 	else
1553 		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1554 
1555 	ret = gfs2_rindex_update(sdp);
1556 	if (ret)
1557 		goto out_metapath;
1558 
1559 	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1560 	if (ret)
1561 		goto out_metapath;
1562 	gfs2_holder_mark_uninitialized(&rd_gh);
1563 
1564 	mp_h = strip_h;
1565 
1566 	while (state != DEALLOC_DONE) {
1567 		switch (state) {
1568 		/* Truncate a full metapath at the given strip height.
1569 		 * Note that strip_h == mp_h in order to be in this state. */
1570 		case DEALLOC_MP_FULL:
1571 			bh = mp.mp_bh[mp_h];
1572 			gfs2_assert_withdraw(sdp, bh);
1573 			if (gfs2_assert_withdraw(sdp,
1574 						 prev_bnr != bh->b_blocknr)) {
1575 				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1576 				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1577 				       sdp->sd_fsname,
1578 				       (unsigned long long)ip->i_no_addr,
1579 				       prev_bnr, ip->i_height, strip_h, mp_h);
1580 			}
1581 			prev_bnr = bh->b_blocknr;
1582 
1583 			if (gfs2_metatype_check(sdp, bh,
1584 						(mp_h ? GFS2_METATYPE_IN :
1585 							GFS2_METATYPE_DI))) {
1586 				ret = -EIO;
1587 				goto out;
1588 			}
1589 
1590 			/*
1591 			 * Below, passing end_aligned as 0 gives us the
1592 			 * metapointer range excluding the end point: the end
1593 			 * point is the first metapath we must not deallocate!
1594 			 */
1595 
1596 			metapointer_range(&mp, mp_h, start_list, start_aligned,
1597 					  end_list, 0 /* end_aligned */,
1598 					  &start, &end);
1599 			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1600 						 start, end,
1601 						 mp_h != ip->i_height - 1,
1602 						 &btotal);
1603 
1604 			/* If we hit an error or have just swept the
1605 			   dinode buffer, exit. */
1606 			if (ret || !mp_h) {
1607 				state = DEALLOC_DONE;
1608 				break;
1609 			}
1610 			state = DEALLOC_MP_LOWER;
1611 			break;
1612 
1613 		/* lower the metapath strip height */
1614 		case DEALLOC_MP_LOWER:
1615 			/* We're done with the current buffer, so release it,
1616 			   unless it's the dinode buffer. Then back up to the
1617 			   previous pointer. */
1618 			if (mp_h) {
1619 				brelse(mp.mp_bh[mp_h]);
1620 				mp.mp_bh[mp_h] = NULL;
1621 			}
1622 			/* If we can't get any lower in height, we've stripped
1623 			   off all we can. Next step is to back up and start
1624 			   stripping the previous level of metadata. */
1625 			if (mp_h == 0) {
1626 				strip_h--;
1627 				memcpy(mp.mp_list, start_list, sizeof(start_list));
1628 				mp_h = strip_h;
1629 				state = DEALLOC_FILL_MP;
1630 				break;
1631 			}
1632 			mp.mp_list[mp_h] = 0;
1633 			mp_h--; /* search one metadata height down */
1634 			mp.mp_list[mp_h]++;
1635 			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1636 				break;
1637 			/* Here we've found a part of the metapath that is not
1638 			 * allocated. We need to search at that height for the
1639 			 * next non-null pointer. */
1640 			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1641 				state = DEALLOC_FILL_MP;
1642 				mp_h++;
1643 			}
1644 			/* No more non-null pointers at this height. Back up
1645 			   to the previous height and try again. */
1646 			break; /* loop around in the same state */
1647 
1648 		/* Fill the metapath with buffers to the given height. */
1649 		case DEALLOC_FILL_MP:
1650 			/* Fill the buffers out to the current height. */
1651 			ret = fillup_metapath(ip, &mp, mp_h);
1652 			if (ret < 0)
1653 				goto out;
1654 
1655 			/* issue read-ahead on metadata */
1656 			if (mp.mp_aheight > 1) {
1657 				for (; ret > 1; ret--) {
1658 					metapointer_range(&mp, mp.mp_aheight - ret,
1659 							  start_list, start_aligned,
1660 							  end_list, end_aligned,
1661 							  &start, &end);
1662 					gfs2_metapath_ra(ip->i_gl, start, end);
1663 				}
1664 			}
1665 
1666 			/* If buffers found for the entire strip height */
1667 			if (mp.mp_aheight - 1 == strip_h) {
1668 				state = DEALLOC_MP_FULL;
1669 				break;
1670 			}
1671 			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1672 				mp_h = mp.mp_aheight - 1;
1673 
1674 			/* If we find a non-null block pointer, crawl a bit
1675 			   higher up in the metapath and try again, otherwise
1676 			   we need to look lower for a new starting point. */
1677 			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1678 				mp_h++;
1679 			else
1680 				state = DEALLOC_MP_LOWER;
1681 			break;
1682 		}
1683 	}
1684 
1685 	if (btotal) {
1686 		if (current->journal_info == NULL) {
1687 			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1688 					       RES_QUOTA, 0);
1689 			if (ret)
1690 				goto out;
1691 			down_write(&ip->i_rw_mutex);
1692 		}
1693 		gfs2_statfs_change(sdp, 0, +btotal, 0);
1694 		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1695 				  ip->i_inode.i_gid);
1696 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1697 		gfs2_trans_add_meta(ip->i_gl, dibh);
1698 		gfs2_dinode_out(ip, dibh->b_data);
1699 		up_write(&ip->i_rw_mutex);
1700 		gfs2_trans_end(sdp);
1701 	}
1702 
1703 out:
1704 	if (gfs2_holder_initialized(&rd_gh))
1705 		gfs2_glock_dq_uninit(&rd_gh);
1706 	if (current->journal_info) {
1707 		up_write(&ip->i_rw_mutex);
1708 		gfs2_trans_end(sdp);
1709 		cond_resched();
1710 	}
1711 	gfs2_quota_unhold(ip);
1712 out_metapath:
1713 	release_metapath(&mp);
1714 	return ret;
1715 }
1716 
1717 static int trunc_end(struct gfs2_inode *ip)
1718 {
1719 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1720 	struct buffer_head *dibh;
1721 	int error;
1722 
1723 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1724 	if (error)
1725 		return error;
1726 
1727 	down_write(&ip->i_rw_mutex);
1728 
1729 	error = gfs2_meta_inode_buffer(ip, &dibh);
1730 	if (error)
1731 		goto out;
1732 
1733 	if (!i_size_read(&ip->i_inode)) {
1734 		ip->i_height = 0;
1735 		ip->i_goal = ip->i_no_addr;
1736 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1737 		gfs2_ordered_del_inode(ip);
1738 	}
1739 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1740 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1741 
1742 	gfs2_trans_add_meta(ip->i_gl, dibh);
1743 	gfs2_dinode_out(ip, dibh->b_data);
1744 	brelse(dibh);
1745 
1746 out:
1747 	up_write(&ip->i_rw_mutex);
1748 	gfs2_trans_end(sdp);
1749 	return error;
1750 }
1751 
1752 /**
1753  * do_shrink - make a file smaller
1754  * @inode: the inode
1755  * @newsize: the size to make the file
1756  *
1757  * Called with an exclusive lock on @inode. The @newsize must be
1758  * equal to or smaller than the current inode size.
1759  *
1760  * Returns: errno
1761  */
1762 
1763 static int do_shrink(struct inode *inode, u64 newsize)
1764 {
1765 	struct gfs2_inode *ip = GFS2_I(inode);
1766 	int error;
1767 
1768 	error = trunc_start(inode, newsize);
1769 	if (error < 0)
1770 		return error;
1771 	if (gfs2_is_stuffed(ip))
1772 		return 0;
1773 
1774 	error = punch_hole(ip, newsize, 0);
1775 	if (error == 0)
1776 		error = trunc_end(ip);
1777 
1778 	return error;
1779 }
1780 
1781 void gfs2_trim_blocks(struct inode *inode)
1782 {
1783 	int ret;
1784 
1785 	ret = do_shrink(inode, inode->i_size);
1786 	WARN_ON(ret != 0);
1787 }
1788 
1789 /**
1790  * do_grow - Touch and update inode size
1791  * @inode: The inode
1792  * @size: The new size
1793  *
1794  * This function updates the timestamps on the inode and
1795  * may also increase the size of the inode. It must not be
1796  * called with @size smaller than the current
1797  * inode size.
1798  *
1799  * Although it is not strictly required to unstuff files here,
1800  * earlier versions of GFS2 had a bug in the stuffed-file reading
1801  * code that would cause a buffer overrun if the size was larger
1802  * than the maximum stuffed file size. To prevent this from
1803  * occurring, such files are unstuffed here; in other cases we can
1804  * just update the inode size directly.
1805  *
1806  * Returns: 0 on success, or a negative errno on error
1807  */
1808 
1809 static int do_grow(struct inode *inode, u64 size)
1810 {
1811 	struct gfs2_inode *ip = GFS2_I(inode);
1812 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1813 	struct gfs2_alloc_parms ap = { .target = 1, };
1814 	struct buffer_head *dibh;
1815 	int error;
1816 	int unstuff = 0;
1817 
1818 	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
1819 		error = gfs2_quota_lock_check(ip, &ap);
1820 		if (error)
1821 			return error;
1822 
1823 		error = gfs2_inplace_reserve(ip, &ap);
1824 		if (error)
1825 			goto do_grow_qunlock;
1826 		unstuff = 1;
1827 	}
1828 
1829 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1830 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1831 				  0 : RES_QUOTA), 0);
1832 	if (error)
1833 		goto do_grow_release;
1834 
1835 	if (unstuff) {
1836 		error = gfs2_unstuff_dinode(ip, NULL);
1837 		if (error)
1838 			goto do_end_trans;
1839 	}
1840 
1841 	error = gfs2_meta_inode_buffer(ip, &dibh);
1842 	if (error)
1843 		goto do_end_trans;
1844 
1845 	i_size_write(inode, size);
1846 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1847 	gfs2_trans_add_meta(ip->i_gl, dibh);
1848 	gfs2_dinode_out(ip, dibh->b_data);
1849 	brelse(dibh);
1850 
1851 do_end_trans:
1852 	gfs2_trans_end(sdp);
1853 do_grow_release:
1854 	if (unstuff) {
1855 		gfs2_inplace_release(ip);
1856 do_grow_qunlock:
1857 		gfs2_quota_unlock(ip);
1858 	}
1859 	return error;
1860 }
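
/*
 * Worked example of the unstuff threshold above, assuming 4096-byte
 * blocks and the usual 232-byte on-disk dinode header, so that
 * gfs2_max_stuffed_size(ip) == 4096 - 232 == 3864: growing a stuffed
 * file to 3000 bytes only updates i_size, while growing it to 4000
 * bytes first reserves a block and then, inside the transaction,
 * unstuffs the inode before updating i_size.
 */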
1861 
1862 /**
1863  * gfs2_setattr_size - make a file a given size
1864  * @inode: the inode
1865  * @newsize: the size to make the file
1866  *
1867  * The file size can grow, shrink, or stay the same. This
1868  * is called holding i_rwsem and an exclusive glock on the inode
1869  * in question.
1870  *
1871  * Returns: errno
1872  */
1873 
1874 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1875 {
1876 	struct gfs2_inode *ip = GFS2_I(inode);
1877 	int ret;
1878 
1879 	BUG_ON(!S_ISREG(inode->i_mode));
1880 
1881 	ret = inode_newsize_ok(inode, newsize);
1882 	if (ret)
1883 		return ret;
1884 
1885 	inode_dio_wait(inode);
1886 
1887 	ret = gfs2_rsqa_alloc(ip);
1888 	if (ret)
1889 		goto out;
1890 
1891 	if (newsize >= inode->i_size) {
1892 		ret = do_grow(inode, newsize);
1893 		goto out;
1894 	}
1895 
1896 	ret = do_shrink(inode, newsize);
1897 out:
1898 	gfs2_rsqa_delete(ip, NULL);
1899 	return ret;
1900 }
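
/*
 * For example, ftruncate(fd, 0) on a populated file reaches
 * do_shrink() above, while extending with ftruncate(fd, larger_size)
 * reaches do_grow(); truncating to the current size also takes the
 * do_grow() path, which then only updates the timestamps and rewrites
 * the dinode.
 */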
1901 
1902 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1903 {
1904 	int error;
1905 	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
1906 	if (!error)
1907 		error = trunc_end(ip);
1908 	return error;
1909 }
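
/*
 * Context note: this is the recovery-time counterpart of do_shrink().
 * If a node crashes mid-truncate, the GFS2_DIF_TRUNC_IN_PROG flag
 * (cleared by trunc_end()) is still set on disk, and the truncate is
 * resumed here by punching out everything past the current i_size.
 */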
1910 
1911 int gfs2_file_dealloc(struct gfs2_inode *ip)
1912 {
1913 	return punch_hole(ip, 0, 0);
1914 }
1915 
1916 /**
1917  * gfs2_free_journal_extents - Free cached journal bmap info
1918  * @jd: The journal
1919  *
1920  */
1921 
1922 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1923 {
1924 	struct gfs2_journal_extent *jext;
1925 
1926 	while (!list_empty(&jd->extent_list)) {
1927 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1928 		list_del(&jext->list);
1929 		kfree(jext);
1930 	}
1931 }
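
/*
 * A minimal, hypothetical equivalent of the loop above using the
 * list_for_each_entry_safe() iterator; the while/list_empty() form is
 * what this file actually uses, and both are idiomatic for a
 * destructive list walk.
 */
static void example_free_journal_extents(struct gfs2_jdesc *jd)
{
	struct gfs2_journal_extent *jext, *tmp;

	list_for_each_entry_safe(jext, tmp, &jd->extent_list, list) {
		list_del(&jext->list);
		kfree(jext);
	}
}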
1932 
1933 /**
1934  * gfs2_add_jextent - Add or merge a new extent to extent cache
1935  * @jd: The journal descriptor
1936  * @lblock: The logical block at start of new extent
1937  * @dblock: The physical block at start of new extent
1938  * @blocks: Size of extent in fs blocks
1939  *
1940  * Returns: 0 on success or -ENOMEM
1941  */
1942 
1943 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1944 {
1945 	struct gfs2_journal_extent *jext;
1946 
1947 	if (!list_empty(&jd->extent_list)) {
1948 		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1949 		if ((jext->dblock + jext->blocks) == dblock) {
1950 			jext->blocks += blocks;
1951 			return 0;
1952 		}
1953 	}
1954 
1955 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1956 	if (jext == NULL)
1957 		return -ENOMEM;
1958 	jext->dblock = dblock;
1959 	jext->lblock = lblock;
1960 	jext->blocks = blocks;
1961 	list_add_tail(&jext->list, &jd->extent_list);
1962 	jd->nr_extents++;
1963 	return 0;
1964 }
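
/*
 * Worked example (hypothetical numbers): after
 *   gfs2_add_jextent(jd, 0, 5000, 8);
 *   gfs2_add_jextent(jd, 8, 5008, 4);
 * the second call finds that the last extent ends at disk block
 * 5000 + 8 == 5008, so it merges: one extent with lblock == 0,
 * dblock == 5000, blocks == 12, and jd->nr_extents stays 1.
 */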
1965 
1966 /**
1967  * gfs2_map_journal_extents - Cache journal bmap info
1968  * @sdp: The super block
1969  * @jd: The journal to map
1970  *
1971  * Create a reusable "extent" mapping from all logical
1972  * blocks to all physical blocks for the given journal.  This saves
1973  * time when writing journal blocks.  Most journals will have only
1974  * one extent that maps all their logical blocks, because mkfs.gfs2
1975  * arranges the journal blocks sequentially to maximize performance,
1976  * so a single extent usually maps the entire file length.
1977  * However, gfs2_jadd can run while other file activity is ongoing,
1978  * so journals added later may not be laid out sequentially.  Less
1979  * likely, but still possible, is the case where users create their
1980  * own journals by mounting the metafs and laying them out by hand.
1981  * Such journals might have several extents.
1982  *
1983  * Returns: 0 on success, or error on failure
1984  */
1985 
1986 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1987 {
1988 	u64 lblock = 0;
1989 	u64 lblock_stop;
1990 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1991 	struct buffer_head bh;
1992 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1993 	u64 size;
1994 	int rc;
1995 
1996 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
1997 	size = (lblock_stop - lblock) << shift;
1998 	jd->nr_extents = 0;
1999 	WARN_ON(!list_empty(&jd->extent_list));
2000 
2001 	do {
2002 		bh.b_state = 0;
2003 		bh.b_blocknr = 0;
2004 		bh.b_size = size;
2005 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2006 		if (rc || !buffer_mapped(&bh))
2007 			goto fail;
2008 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2009 		if (rc)
2010 			goto fail;
2011 		size -= bh.b_size;
2012 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2013 	} while (size > 0);
2014 
2015 	fs_info(sdp, "journal %u mapped with %u extents\n", jd->jd_jid,
2016 		jd->nr_extents);
2017 	return 0;
2018 
2019 fail:
2020 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2021 		rc, jd->jd_jid,
2022 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
2023 		jd->nr_extents);
2024 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2025 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2026 		bh.b_state, (unsigned long long)bh.b_size);
2027 	gfs2_free_journal_extents(jd);
2028 	return rc;
2029 }
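
/*
 * Sketch of how a consumer might resolve a journal block through the
 * cache built above (the journal writing code does essentially this);
 * example_jext_lookup() is a hypothetical name, not part of this file.
 */
static u64 example_jext_lookup(struct gfs2_jdesc *jd, u64 lblock)
{
	struct gfs2_journal_extent *jext;

	list_for_each_entry(jext, &jd->extent_list, list) {
		if (lblock >= jext->lblock &&
		    lblock < jext->lblock + jext->blocks)
			return jext->dblock + (lblock - jext->lblock);
	}
	return 0;	/* no mapping cached */
}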
2030 
2031 /**
2032  * gfs2_write_alloc_required - figure out if a write will require an allocation
2033  * @ip: the file being written to
2034  * @offset: the offset to write to
2035  * @len: the number of bytes being written
2036  *
2037  * Returns: 1 if an alloc is required, 0 otherwise
2038  */
2039 
2040 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2041 			      unsigned int len)
2042 {
2043 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2044 	struct buffer_head bh;
2045 	unsigned int shift;
2046 	u64 lblock, lblock_stop, size;
2047 	u64 end_of_file;
2048 
2049 	if (!len)
2050 		return 0;
2051 
2052 	if (gfs2_is_stuffed(ip)) {
2053 		if (offset + len > gfs2_max_stuffed_size(ip))
2054 			return 1;
2055 		return 0;
2056 	}
2057 
2058 	shift = sdp->sd_sb.sb_bsize_shift;
2059 	BUG_ON(gfs2_is_dir(ip));
2060 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2061 	lblock = offset >> shift;
2062 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2063 	if (lblock_stop > end_of_file)
2064 		return 1;
2065 
2066 	size = (lblock_stop - lblock) << shift;
2067 	do {
2068 		bh.b_state = 0;
2069 		bh.b_size = size;
2070 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2071 		if (!buffer_mapped(&bh))
2072 			return 1;
2073 		size -= bh.b_size;
2074 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2075 	} while (size > 0);
2076 
2077 	return 0;
2078 }
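
/*
 * Worked example for the arithmetic above, with 4096-byte blocks
 * (shift == 12) and i_size == 10000, i.e. end_of_file == 3:
 * - offset == 9000, len == 2000: lblock == 2, lblock_stop == 3, so
 *   only block 2 is checked; if it is already mapped, no allocation
 *   is needed.
 * - offset == 9000, len == 4000: lblock_stop == 4 > end_of_file, so
 *   the function returns 1 without mapping anything.
 */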
2079 
2080 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2081 {
2082 	struct gfs2_inode *ip = GFS2_I(inode);
2083 	struct buffer_head *dibh;
2084 	int error;
2085 
2086 	if (offset >= inode->i_size)
2087 		return 0;
2088 	if (offset + length > inode->i_size)
2089 		length = inode->i_size - offset;
2090 
2091 	error = gfs2_meta_inode_buffer(ip, &dibh);
2092 	if (error)
2093 		return error;
2094 	gfs2_trans_add_meta(ip->i_gl, dibh);
2095 	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2096 	       length);
2097 	brelse(dibh);
2098 	return 0;
2099 }
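
/*
 * Example with hypothetical numbers: for a stuffed file whose i_size
 * is 100, stuffed_zero_range(inode, 40, 200) clamps the length to
 * 100 - 40 == 60 and zeroes bytes 40..99 of the inline data, which
 * lives in the dinode block just after the gfs2_dinode header.
 */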
2100 
2101 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2102 					 loff_t length)
2103 {
2104 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2105 	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2106 	int error;
2107 
2108 	while (length) {
2109 		struct gfs2_trans *tr;
2110 		loff_t chunk;
2111 		unsigned int offs;
2112 
2113 		chunk = length;
2114 		if (chunk > max_chunk)
2115 			chunk = max_chunk;
2116 
2117 		offs = offset & ~PAGE_MASK;
2118 		if (offs && chunk > PAGE_SIZE)
2119 			chunk = offs + ((chunk - offs) & PAGE_MASK);
2120 
2121 		truncate_pagecache_range(inode, offset, offset + chunk - 1);
2122 		offset += chunk;
2123 		length -= chunk;
2124 
2125 		tr = current->journal_info;
2126 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2127 			continue;
2128 
2129 		gfs2_trans_end(sdp);
2130 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2131 		if (error)
2132 			return error;
2133 	}
2134 	return 0;
2135 }
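
/*
 * Worked example, assuming 4 KiB blocks and a GFS2_JTRUNC_REVOKES of
 * 8192: max_chunk == 8192 * 4096 == 32 MiB, so truncating 100 MiB of
 * journaled data runs four iterations (32 + 32 + 32 + 4 MiB), ending
 * and restarting the transaction between touched chunks so no single
 * transaction needs more than GFS2_JTRUNC_REVOKES revokes.
 */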
2136 
2137 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2138 {
2139 	struct inode *inode = file_inode(file);
2140 	struct gfs2_inode *ip = GFS2_I(inode);
2141 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2142 	int error;
2143 
2144 	if (gfs2_is_jdata(ip))
2145 		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2146 					 GFS2_JTRUNC_REVOKES);
2147 	else
2148 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2149 	if (error)
2150 		return error;
2151 
2152 	if (gfs2_is_stuffed(ip)) {
2153 		error = stuffed_zero_range(inode, offset, length);
2154 		if (error)
2155 			goto out;
2156 	} else {
2157 		unsigned int start_off, end_off, blocksize;
2158 
2159 		blocksize = i_blocksize(inode);
2160 		start_off = offset & (blocksize - 1);
2161 		end_off = (offset + length) & (blocksize - 1);
2162 		if (start_off) {
2163 			unsigned int len = length;
2164 			if (length > blocksize - start_off)
2165 				len = blocksize - start_off;
2166 			error = gfs2_block_zero_range(inode, offset, len);
2167 			if (error)
2168 				goto out;
2169 			if (start_off + length < blocksize)
2170 				end_off = 0;
2171 		}
2172 		if (end_off) {
2173 			error = gfs2_block_zero_range(inode,
2174 				offset + length - end_off, end_off);
2175 			if (error)
2176 				goto out;
2177 		}
2178 	}
2179 
2180 	if (gfs2_is_jdata(ip)) {
2181 		BUG_ON(!current->journal_info);
2182 		error = gfs2_journaled_truncate_range(inode, offset, length);
		if (error)
			goto out;
2183 	} else
2184 		truncate_pagecache_range(inode, offset, offset + length - 1);
2185 
2186 	file_update_time(file);
2187 	mark_inode_dirty(inode);
2188 
2189 	if (current->journal_info)
2190 		gfs2_trans_end(sdp);
2191 
2192 	if (!gfs2_is_stuffed(ip))
2193 		error = punch_hole(ip, offset, length);
2194 
2195 out:
2196 	if (current->journal_info)
2197 		gfs2_trans_end(sdp);
2198 	return error;
2199 }
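
/*
 * Worked example of the edge-block zeroing above, with 4096-byte
 * blocks: punching offset == 5000, length == 3000 gives
 * start_off == 904 and end_off == (8000 & 4095) == 3904; the head
 * call zeroes all 3000 bytes, and since start_off + length < 4096
 * the hole never leaves the first block, so end_off is reset to 0
 * and no tail zeroing is needed.
 */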
2200