xref: /linux/fs/gfs2/bmap.c (revision e60e1ee60630cafef5e430c2ae364877e061d980)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
16 #include <linux/iomap.h>
17 
18 #include "gfs2.h"
19 #include "incore.h"
20 #include "bmap.h"
21 #include "glock.h"
22 #include "inode.h"
23 #include "meta_io.h"
24 #include "quota.h"
25 #include "rgrp.h"
26 #include "log.h"
27 #include "super.h"
28 #include "trans.h"
29 #include "dir.h"
30 #include "util.h"
31 #include "trace_gfs2.h"
32 
33 /* This doesn't need to be that large as max 64 bit pointers in a 4k
34  * block is 512, so __u16 is fine for that. It saves stack space to
35  * keep it small.
36  */
37 struct metapath {
38 	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
39 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
40 	int mp_fheight; /* find_metapath height */
41 	int mp_aheight; /* actual height (lookup height) */
42 };
43 
44 /**
45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
46  * @ip: the inode
47  * @dibh: the dinode buffer
48  * @block: the block number that was allocated
49  * @page: The (optional) page. This is looked up if @page is NULL
50  *
51  * Returns: errno
52  */
53 
54 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
55 			       u64 block, struct page *page)
56 {
57 	struct inode *inode = &ip->i_inode;
58 	struct buffer_head *bh;
59 	int release = 0;
60 
61 	if (!page || page->index) {
62 		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
63 		if (!page)
64 			return -ENOMEM;
65 		release = 1;
66 	}
67 
68 	if (!PageUptodate(page)) {
69 		void *kaddr = kmap(page);
70 		u64 dsize = i_size_read(inode);
71 
72 		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
73 			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
74 
75 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
76 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
77 		kunmap(page);
78 
79 		SetPageUptodate(page);
80 	}
81 
82 	if (!page_has_buffers(page))
83 		create_empty_buffers(page, BIT(inode->i_blkbits),
84 				     BIT(BH_Uptodate));
85 
86 	bh = page_buffers(page);
87 
88 	if (!buffer_mapped(bh))
89 		map_bh(bh, inode->i_sb, block);
90 
91 	set_buffer_uptodate(bh);
92 	if (!gfs2_is_jdata(ip))
93 		mark_buffer_dirty(bh);
94 	if (!gfs2_is_writeback(ip))
95 		gfs2_trans_add_data(ip->i_gl, bh);
96 
97 	if (release) {
98 		unlock_page(page);
99 		put_page(page);
100 	}
101 
102 	return 0;
103 }
104 
105 /**
106  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
107  * @ip: The GFS2 inode to unstuff
108  * @page: The (optional) page. This is looked up if the @page is NULL
109  *
110  * This routine unstuffs a dinode and returns it to a "normal" state such
111  * that the height can be grown in the traditional way.
112  *
113  * Returns: errno
114  */
115 
116 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
117 {
118 	struct buffer_head *bh, *dibh;
119 	struct gfs2_dinode *di;
120 	u64 block = 0;
121 	int isdir = gfs2_is_dir(ip);
122 	int error;
123 
124 	down_write(&ip->i_rw_mutex);
125 
126 	error = gfs2_meta_inode_buffer(ip, &dibh);
127 	if (error)
128 		goto out;
129 
130 	if (i_size_read(&ip->i_inode)) {
131 		/* Get a free block, fill it with the stuffed data,
132 		   and write it out to disk */
133 
134 		unsigned int n = 1;
135 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
136 		if (error)
137 			goto out_brelse;
138 		if (isdir) {
139 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
140 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
141 			if (error)
142 				goto out_brelse;
143 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
144 					      dibh, sizeof(struct gfs2_dinode));
145 			brelse(bh);
146 		} else {
147 			error = gfs2_unstuffer_page(ip, dibh, block, page);
148 			if (error)
149 				goto out_brelse;
150 		}
151 	}
152 
153 	/*  Set up the pointer to the new block  */
154 
155 	gfs2_trans_add_meta(ip->i_gl, dibh);
156 	di = (struct gfs2_dinode *)dibh->b_data;
157 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
158 
159 	if (i_size_read(&ip->i_inode)) {
160 		*(__be64 *)(di + 1) = cpu_to_be64(block);
161 		gfs2_add_inode_blocks(&ip->i_inode, 1);
162 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
163 	}
164 
165 	ip->i_height = 1;
166 	di->di_height = cpu_to_be16(1);
167 
168 out_brelse:
169 	brelse(dibh);
170 out:
171 	up_write(&ip->i_rw_mutex);
172 	return error;
173 }
174 
175 
176 /**
 * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The logical block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 *   This routine fills in a struct metapath that defines a path through
 *   the metadata of an inode to get to logical block "block".
 *
 *   Example:
 *   Given:  a height 3 file, a byte offset of 101342453 (logical block
 *   24741), and a filesystem with a blocksize of 4096.
 *
 *   find_metapath() would fill in the struct metapath with:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165
 *   (the numbers in this example assume 512 pointers per indirect block).
193  *
194  *   That means that in order to get to the block containing the byte at
195  *   offset 101342453, we would load the indirect block pointed to by pointer
196  *   0 in the dinode.  We would then load the indirect block pointed to by
197  *   pointer 48 in that indirect block.  We would then load the data block
198  *   pointed to by pointer 165 in that indirect block.
199  *
200  *             ----------------------------------------
201  *             | Dinode |                             |
202  *             |        |                            4|
203  *             |        |0 1 2 3 4 5                 9|
204  *             |        |                            6|
205  *             ----------------------------------------
206  *                       |
207  *                       |
208  *                       V
209  *             ----------------------------------------
210  *             | Indirect Block                       |
211  *             |                                     5|
212  *             |            4 4 4 4 4 5 5            1|
213  *             |0           5 6 7 8 9 0 1            2|
214  *             ----------------------------------------
215  *                                |
216  *                                |
217  *                                V
218  *             ----------------------------------------
219  *             | Indirect Block                       |
220  *             |                         1 1 1 1 1   5|
221  *             |                         6 6 6 6 6   1|
222  *             |0                        3 4 5 6 7   2|
223  *             ----------------------------------------
224  *                                           |
225  *                                           |
226  *                                           V
227  *             ----------------------------------------
228  *             | Data block containing offset         |
229  *             |            101342453                 |
230  *             |                                      |
231  *             |                                      |
232  *             ----------------------------------------
233  *
234  */
235 
236 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
237 			  struct metapath *mp, unsigned int height)
238 {
239 	unsigned int i;
240 
241 	mp->mp_fheight = height;
242 	for (i = height; i--;)
243 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
244 }
245 
246 static inline unsigned int metapath_branch_start(const struct metapath *mp)
247 {
248 	if (mp->mp_list[0] == 0)
249 		return 2;
250 	return 1;
251 }
252 
253 /**
254  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
255  * @height: The metadata height (0 = dinode)
256  * @mp: The metapath
257  */
258 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
259 {
260 	struct buffer_head *bh = mp->mp_bh[height];
261 	if (height == 0)
262 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
263 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 }
265 
266 /**
267  * metapointer - Return pointer to start of metadata in a buffer
268  * @height: The metadata height (0 = dinode)
269  * @mp: The metapath
270  *
271  * Return a pointer to the block number of the next height of the metadata
272  * tree given a buffer containing the pointer to the current height of the
273  * metadata tree.
274  */
275 
276 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
277 {
278 	__be64 *p = metaptr1(height, mp);
279 	return p + mp->mp_list[height];
280 }
281 
282 static void gfs2_metapath_ra(struct gfs2_glock *gl,
283 			     const struct buffer_head *bh, const __be64 *pos)
284 {
285 	struct buffer_head *rabh;
286 	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
287 	const __be64 *t;
288 
289 	for (t = pos; t < endp; t++) {
290 		if (!*t)
291 			continue;
292 
293 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
294 		if (trylock_buffer(rabh)) {
295 			if (!buffer_uptodate(rabh)) {
296 				rabh->b_end_io = end_buffer_read_sync;
297 				submit_bh(REQ_OP_READ,
298 					  REQ_RAHEAD | REQ_META | REQ_PRIO,
299 					  rabh);
300 				continue;
301 			}
302 			unlock_buffer(rabh);
303 		}
304 		brelse(rabh);
305 	}
306 }
307 
308 /**
309  * lookup_mp_height - helper function for lookup_metapath
310  * @ip: the inode
311  * @mp: the metapath
312  * @h: the height which needs looking up
313  */
314 static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
315 {
316 	__be64 *ptr = metapointer(h, mp);
317 	u64 dblock = be64_to_cpu(*ptr);
318 
319 	if (!dblock)
320 		return h + 1;
321 
322 	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
323 }
324 
325 /**
326  * lookup_metapath - Walk the metadata tree to a specific point
327  * @ip: The inode
328  * @mp: The metapath
329  *
330  * Assumes that the inode's buffer has already been looked up and
331  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
332  * by find_metapath().
333  *
334  * If this function encounters part of the tree which has not been
335  * allocated, it returns the current height of the tree at the point
336  * at which it found the unallocated block. Blocks which are found are
337  * added to the mp->mp_bh[] list.
338  *
339  * Returns: error or height of metadata tree
340  */
341 
342 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
343 {
344 	unsigned int end_of_metadata = ip->i_height - 1;
345 	unsigned int x;
346 	int ret;
347 
348 	for (x = 0; x < end_of_metadata; x++) {
349 		ret = lookup_mp_height(ip, mp, x);
350 		if (ret)
351 			goto out;
352 	}
353 
354 	ret = ip->i_height;
355 out:
356 	mp->mp_aheight = ret;
357 	return ret;
358 }
359 
360 /**
361  * fillup_metapath - fill up buffers for the metadata path to a specific height
362  * @ip: The inode
363  * @mp: The metapath
364  * @h: The height to which it should be mapped
365  *
366  * Similar to lookup_metapath, but does lookups for a range of heights
367  *
368  * Returns: error or height of metadata tree
369  */
370 
371 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
372 {
373 	unsigned int start_h = h - 1;
374 	int ret;
375 
376 	if (h) {
377 		/* find the first buffer we need to look up. */
378 		while (start_h > 0 && mp->mp_bh[start_h] == NULL)
379 			start_h--;
380 		for (; start_h < h; start_h++) {
381 			ret = lookup_mp_height(ip, mp, start_h);
382 			if (ret)
383 				return ret;
384 		}
385 	}
386 	return ip->i_height;
387 }
388 
389 static inline void release_metapath(struct metapath *mp)
390 {
391 	int i;
392 
393 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
394 		if (mp->mp_bh[i] == NULL)
395 			break;
396 		brelse(mp->mp_bh[i]);
397 	}
398 }
399 
400 /**
401  * gfs2_extent_length - Returns length of an extent of blocks
402  * @start: Start of the buffer
403  * @len: Length of the buffer in bytes
404  * @ptr: Current position in the buffer
405  * @limit: Max extent length to return (0 = unlimited)
406  * @eob: Set to 1 if we hit "end of block"
407  *
408  * If the first block is zero (unallocated) it will return the number of
409  * unallocated blocks in the extent, otherwise it will return the number
410  * of contiguous blocks in the extent.
411  *
412  * Returns: The length of the extent (minimum of one block)
413  */
414 
415 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
416 {
417 	const __be64 *end = (start + len);
418 	const __be64 *first = ptr;
419 	u64 d = be64_to_cpu(*ptr);
420 
421 	*eob = 0;
422 	do {
423 		ptr++;
424 		if (ptr >= end)
425 			break;
426 		if (limit && --limit == 0)
427 			break;
428 		if (d)
429 			d++;
430 	} while(be64_to_cpu(*ptr) == d);
431 	if (ptr >= end)
432 		*eob = 1;
433 	return (ptr - first);
434 }
435 
436 static inline void bmap_lock(struct gfs2_inode *ip, int create)
437 {
438 	if (create)
439 		down_write(&ip->i_rw_mutex);
440 	else
441 		down_read(&ip->i_rw_mutex);
442 }
443 
444 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
445 {
446 	if (create)
447 		up_write(&ip->i_rw_mutex);
448 	else
449 		up_read(&ip->i_rw_mutex);
450 }
451 
452 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
453 					 struct gfs2_glock *gl, unsigned int i,
454 					 unsigned offset, u64 bn)
455 {
456 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
457 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
458 				 sizeof(struct gfs2_dinode)));
459 	BUG_ON(i < 1);
460 	BUG_ON(mp->mp_bh[i] != NULL);
461 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
462 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
463 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
464 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
465 	ptr += offset;
466 	*ptr = cpu_to_be64(bn);
467 	return ptr;
468 }
469 
/* States of the allocation state machine in gfs2_iomap_alloc() */
enum alloc_state {
	/* Tree complete, adding data blocks */
	ALLOC_DATA = 0,
	/* Branching from existing tree */
	ALLOC_GROW_DEPTH = 1,
	/* Growing height of tree */
	ALLOC_GROW_HEIGHT = 2,
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
476 
477 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
478 {
479 	if (hgt)
480 		return sdp->sd_inptrs;
481 	return sdp->sd_diptrs;
482 }
483 
484 /**
485  * gfs2_bmap_alloc - Build a metadata tree of the requested height
486  * @inode: The GFS2 inode
487  * @lblock: The logical starting block of the extent
488  * @bh_map: This is used to return the mapping details
489  * @zero_new: True if newly allocated blocks should be zeroed
490  * @mp: The metapath, with proper height information calculated
491  * @maxlen: The max number of data blocks to alloc
492  * @dblock: Pointer to return the resulting new block
493  * @dblks: Pointer to return the number of blocks allocated
494  *
495  * In this routine we may have to alloc:
496  *   i) Indirect blocks to grow the metadata tree height
497  *  ii) Indirect blocks to fill in lower part of the metadata tree
498  * iii) Data blocks
499  *
500  * The function is in two parts. The first part works out the total
501  * number of blocks which we need. The second part does the actual
502  * allocation asking for an extent at a time (if enough contiguous free
503  * blocks are available, there will only be one request per bmap call)
504  * and uses the state machine to initialise the blocks in order.
505  *
506  * Returns: errno on error
507  */
508 
509 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
510 			    unsigned flags, struct metapath *mp)
511 {
512 	struct gfs2_inode *ip = GFS2_I(inode);
513 	struct gfs2_sbd *sdp = GFS2_SB(inode);
514 	struct super_block *sb = sdp->sd_vfs;
515 	struct buffer_head *dibh = mp->mp_bh[0];
516 	u64 bn;
517 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
518 	unsigned dblks = 0;
519 	unsigned ptrs_per_blk;
520 	const unsigned end_of_metadata = mp->mp_fheight - 1;
521 	int ret;
522 	enum alloc_state state;
523 	__be64 *ptr;
524 	__be64 zero_bn = 0;
525 	size_t maxlen = iomap->length >> inode->i_blkbits;
526 
527 	BUG_ON(mp->mp_aheight < 1);
528 	BUG_ON(dibh == NULL);
529 
530 	gfs2_trans_add_meta(ip->i_gl, dibh);
531 
532 	if (mp->mp_fheight == mp->mp_aheight) {
533 		struct buffer_head *bh;
534 		int eob;
535 
536 		/* Bottom indirect block exists, find unalloced extent size */
537 		ptr = metapointer(end_of_metadata, mp);
538 		bh = mp->mp_bh[end_of_metadata];
539 		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
540 					   maxlen, &eob);
541 		BUG_ON(dblks < 1);
542 		state = ALLOC_DATA;
543 	} else {
544 		/* Need to allocate indirect blocks */
545 		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
546 			sdp->sd_diptrs;
547 		dblks = min(maxlen, (size_t)(ptrs_per_blk -
548 					     mp->mp_list[end_of_metadata]));
549 		if (mp->mp_fheight == ip->i_height) {
550 			/* Writing into existing tree, extend tree down */
551 			iblks = mp->mp_fheight - mp->mp_aheight;
552 			state = ALLOC_GROW_DEPTH;
553 		} else {
554 			/* Building up tree height */
555 			state = ALLOC_GROW_HEIGHT;
556 			iblks = mp->mp_fheight - ip->i_height;
557 			branch_start = metapath_branch_start(mp);
558 			iblks += (mp->mp_fheight - branch_start);
559 		}
560 	}
561 
562 	/* start of the second part of the function (state machine) */
563 
564 	blks = dblks + iblks;
565 	i = mp->mp_aheight;
566 	do {
567 		int error;
568 		n = blks - alloced;
569 		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
570 		if (error)
571 			return error;
572 		alloced += n;
573 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
574 			gfs2_trans_add_unrevoke(sdp, bn, n);
575 		switch (state) {
576 		/* Growing height of tree */
577 		case ALLOC_GROW_HEIGHT:
578 			if (i == 1) {
579 				ptr = (__be64 *)(dibh->b_data +
580 						 sizeof(struct gfs2_dinode));
581 				zero_bn = *ptr;
582 			}
583 			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
584 			     i++, n--)
585 				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
586 			if (i - 1 == mp->mp_fheight - ip->i_height) {
587 				i--;
588 				gfs2_buffer_copy_tail(mp->mp_bh[i],
589 						sizeof(struct gfs2_meta_header),
590 						dibh, sizeof(struct gfs2_dinode));
591 				gfs2_buffer_clear_tail(dibh,
592 						sizeof(struct gfs2_dinode) +
593 						sizeof(__be64));
594 				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
595 					sizeof(struct gfs2_meta_header));
596 				*ptr = zero_bn;
597 				state = ALLOC_GROW_DEPTH;
598 				for(i = branch_start; i < mp->mp_fheight; i++) {
599 					if (mp->mp_bh[i] == NULL)
600 						break;
601 					brelse(mp->mp_bh[i]);
602 					mp->mp_bh[i] = NULL;
603 				}
604 				i = branch_start;
605 			}
606 			if (n == 0)
607 				break;
608 		/* Branching from existing tree */
609 		case ALLOC_GROW_DEPTH:
610 			if (i > 1 && i < mp->mp_fheight)
611 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
612 			for (; i < mp->mp_fheight && n > 0; i++, n--)
613 				gfs2_indirect_init(mp, ip->i_gl, i,
614 						   mp->mp_list[i-1], bn++);
615 			if (i == mp->mp_fheight)
616 				state = ALLOC_DATA;
617 			if (n == 0)
618 				break;
619 		/* Tree complete, adding data blocks */
620 		case ALLOC_DATA:
621 			BUG_ON(n > dblks);
622 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
623 			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
624 			dblks = n;
625 			ptr = metapointer(end_of_metadata, mp);
626 			iomap->addr = bn << inode->i_blkbits;
627 			iomap->flags |= IOMAP_F_NEW;
628 			while (n-- > 0)
629 				*ptr++ = cpu_to_be64(bn++);
630 			if (flags & IOMAP_ZERO) {
631 				ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
632 						       dblks, GFP_NOFS);
633 				if (ret) {
634 					fs_err(sdp,
635 					       "Failed to zero data buffers\n");
636 					flags &= ~IOMAP_ZERO;
637 				}
638 			}
639 			break;
640 		}
641 	} while (iomap->addr == IOMAP_NULL_ADDR);
642 
643 	iomap->length = (u64)dblks << inode->i_blkbits;
644 	ip->i_height = mp->mp_fheight;
645 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
646 	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
647 	return 0;
648 }
649 
650 /**
651  * hole_size - figure out the size of a hole
652  * @inode: The inode
653  * @lblock: The logical starting block number
654  * @mp: The metapath
655  *
656  * Returns: The hole size in bytes
657  *
658  */
659 static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
660 {
661 	struct gfs2_inode *ip = GFS2_I(inode);
662 	struct gfs2_sbd *sdp = GFS2_SB(inode);
663 	struct metapath mp_eof;
664 	u64 factor = 1;
665 	int hgt;
666 	u64 holesz = 0;
667 	const __be64 *first, *end, *ptr;
668 	const struct buffer_head *bh;
669 	u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
670 	int zeroptrs;
671 	bool done = false;
672 
673 	/* Get another metapath, to the very last byte */
674 	find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
675 	for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
676 		bh = mp->mp_bh[hgt];
677 		if (bh) {
678 			zeroptrs = 0;
679 			first = metapointer(hgt, mp);
680 			end = (const __be64 *)(bh->b_data + bh->b_size);
681 
682 			for (ptr = first; ptr < end; ptr++) {
683 				if (*ptr) {
684 					done = true;
685 					break;
686 				} else {
687 					zeroptrs++;
688 				}
689 			}
690 		} else {
691 			zeroptrs = sdp->sd_inptrs;
692 		}
693 		if (factor * zeroptrs >= lblock_stop - lblock + 1) {
694 			holesz = lblock_stop - lblock + 1;
695 			break;
696 		}
697 		holesz += factor * zeroptrs;
698 
699 		factor *= sdp->sd_inptrs;
700 		if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
701 			(mp->mp_list[hgt - 1])++;
702 	}
703 	return holesz << inode->i_blkbits;
704 }
705 
706 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
707 {
708 	struct gfs2_inode *ip = GFS2_I(inode);
709 
710 	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
711 		      sizeof(struct gfs2_dinode);
712 	iomap->offset = 0;
713 	iomap->length = i_size_read(inode);
714 	iomap->type = IOMAP_MAPPED;
715 	iomap->flags = IOMAP_F_DATA_INLINE;
716 }
717 
718 /**
719  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
720  * @inode: The inode
721  * @pos: Starting position in bytes
722  * @length: Length to map, in bytes
723  * @flags: iomap flags
724  * @iomap: The iomap structure
725  *
726  * Returns: errno
727  */
728 int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
729 		     unsigned flags, struct iomap *iomap)
730 {
731 	struct gfs2_inode *ip = GFS2_I(inode);
732 	struct gfs2_sbd *sdp = GFS2_SB(inode);
733 	struct metapath mp = { .mp_aheight = 1, };
734 	unsigned int factor = sdp->sd_sb.sb_bsize;
735 	const u64 *arr = sdp->sd_heightsize;
736 	__be64 *ptr;
737 	sector_t lblock;
738 	sector_t lend;
739 	int ret;
740 	int eob;
741 	unsigned int len;
742 	struct buffer_head *bh;
743 	u8 height;
744 
745 	trace_gfs2_iomap_start(ip, pos, length, flags);
746 	if (!length) {
747 		ret = -EINVAL;
748 		goto out;
749 	}
750 
751 	if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
752 		gfs2_stuffed_iomap(inode, iomap);
753 		if (pos >= iomap->length)
754 			return -ENOENT;
755 		ret = 0;
756 		goto out;
757 	}
758 
759 	lblock = pos >> inode->i_blkbits;
760 	lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
761 
762 	iomap->offset = lblock << inode->i_blkbits;
763 	iomap->addr = IOMAP_NULL_ADDR;
764 	iomap->type = IOMAP_HOLE;
765 	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
766 	iomap->flags = IOMAP_F_MERGED;
767 	bmap_lock(ip, 0);
768 
769 	/*
770 	 * Directory data blocks have a struct gfs2_meta_header header, so the
771 	 * remaining size is smaller than the filesystem block size.  Logical
772 	 * block numbers for directories are in units of this remaining size!
773 	 */
774 	if (gfs2_is_dir(ip)) {
775 		factor = sdp->sd_jbsize;
776 		arr = sdp->sd_jheightsize;
777 	}
778 
779 	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
780 	if (ret)
781 		goto out_release;
782 
783 	height = ip->i_height;
784 	while ((lblock + 1) * factor > arr[height])
785 		height++;
786 	find_metapath(sdp, lblock, &mp, height);
787 	if (height > ip->i_height || gfs2_is_stuffed(ip))
788 		goto do_alloc;
789 
790 	ret = lookup_metapath(ip, &mp);
791 	if (ret < 0)
792 		goto out_release;
793 
794 	if (mp.mp_aheight != ip->i_height)
795 		goto do_alloc;
796 
797 	ptr = metapointer(ip->i_height - 1, &mp);
798 	if (*ptr == 0)
799 		goto do_alloc;
800 
801 	iomap->type = IOMAP_MAPPED;
802 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
803 
804 	bh = mp.mp_bh[ip->i_height - 1];
805 	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
806 	if (eob)
807 		iomap->flags |= IOMAP_F_BOUNDARY;
808 	iomap->length = (u64)len << inode->i_blkbits;
809 
810 	ret = 0;
811 
812 out_release:
813 	release_metapath(&mp);
814 	bmap_unlock(ip, 0);
815 out:
816 	trace_gfs2_iomap_end(ip, iomap, ret);
817 	return ret;
818 
819 do_alloc:
820 	if (!(flags & IOMAP_WRITE)) {
821 		if (pos >= i_size_read(inode)) {
822 			ret = -ENOENT;
823 			goto out_release;
824 		}
825 		ret = 0;
826 		iomap->length = hole_size(inode, lblock, &mp);
827 		goto out_release;
828 	}
829 
830 	ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
831 	goto out_release;
832 }
833 
834 /**
835  * gfs2_block_map - Map a block from an inode to a disk block
836  * @inode: The inode
837  * @lblock: The logical block number
838  * @bh_map: The bh to be mapped
839  * @create: True if its ok to alloc blocks to satify the request
840  *
841  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
842  * read of metadata will be required before the next block can be
843  * mapped. Sets buffer_new() if new blocks were allocated.
844  *
845  * Returns: errno
846  */
847 
848 int gfs2_block_map(struct inode *inode, sector_t lblock,
849 		   struct buffer_head *bh_map, int create)
850 {
851 	struct gfs2_inode *ip = GFS2_I(inode);
852 	struct iomap iomap;
853 	int ret, flags = 0;
854 
855 	clear_buffer_mapped(bh_map);
856 	clear_buffer_new(bh_map);
857 	clear_buffer_boundary(bh_map);
858 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
859 
860 	if (create)
861 		flags |= IOMAP_WRITE;
862 	if (buffer_zeronew(bh_map))
863 		flags |= IOMAP_ZERO;
864 	ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
865 			       bh_map->b_size, flags, &iomap);
866 	if (ret) {
867 		if (!create && ret == -ENOENT) {
868 			/* Return unmapped buffer beyond the end of file.  */
869 			ret = 0;
870 		}
871 		goto out;
872 	}
873 
874 	if (iomap.length > bh_map->b_size) {
875 		iomap.length = bh_map->b_size;
876 		iomap.flags &= ~IOMAP_F_BOUNDARY;
877 	}
878 	if (iomap.addr != IOMAP_NULL_ADDR)
879 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
880 	bh_map->b_size = iomap.length;
881 	if (iomap.flags & IOMAP_F_BOUNDARY)
882 		set_buffer_boundary(bh_map);
883 	if (iomap.flags & IOMAP_F_NEW)
884 		set_buffer_new(bh_map);
885 
886 out:
887 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
888 	return ret;
889 }
890 
891 /*
892  * Deprecated: do not use in new code
893  */
894 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
895 {
896 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
897 	int ret;
898 	int create = *new;
899 
900 	BUG_ON(!extlen);
901 	BUG_ON(!dblock);
902 	BUG_ON(!new);
903 
904 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
905 	ret = gfs2_block_map(inode, lblock, &bh, create);
906 	*extlen = bh.b_size >> inode->i_blkbits;
907 	*dblock = bh.b_blocknr;
908 	if (buffer_new(&bh))
909 		*new = 1;
910 	else
911 		*new = 0;
912 	return ret;
913 }
914 
915 /**
916  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
917  *
918  * This is partly borrowed from ext3.
919  */
920 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
921 {
922 	struct inode *inode = mapping->host;
923 	struct gfs2_inode *ip = GFS2_I(inode);
924 	unsigned long index = from >> PAGE_SHIFT;
925 	unsigned offset = from & (PAGE_SIZE-1);
926 	unsigned blocksize, iblock, length, pos;
927 	struct buffer_head *bh;
928 	struct page *page;
929 	int err;
930 
931 	page = find_or_create_page(mapping, index, GFP_NOFS);
932 	if (!page)
933 		return 0;
934 
935 	blocksize = inode->i_sb->s_blocksize;
936 	length = blocksize - (offset & (blocksize - 1));
937 	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
938 
939 	if (!page_has_buffers(page))
940 		create_empty_buffers(page, blocksize, 0);
941 
942 	/* Find the buffer that contains "offset" */
943 	bh = page_buffers(page);
944 	pos = blocksize;
945 	while (offset >= pos) {
946 		bh = bh->b_this_page;
947 		iblock++;
948 		pos += blocksize;
949 	}
950 
951 	err = 0;
952 
953 	if (!buffer_mapped(bh)) {
954 		gfs2_block_map(inode, iblock, bh, 0);
955 		/* unmapped? It's a hole - nothing to do */
956 		if (!buffer_mapped(bh))
957 			goto unlock;
958 	}
959 
960 	/* Ok, it's mapped. Make sure it's up-to-date */
961 	if (PageUptodate(page))
962 		set_buffer_uptodate(bh);
963 
964 	if (!buffer_uptodate(bh)) {
965 		err = -EIO;
966 		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
967 		wait_on_buffer(bh);
968 		/* Uhhuh. Read error. Complain and punt. */
969 		if (!buffer_uptodate(bh))
970 			goto unlock;
971 		err = 0;
972 	}
973 
974 	if (!gfs2_is_writeback(ip))
975 		gfs2_trans_add_data(ip->i_gl, bh);
976 
977 	zero_user(page, offset, length);
978 	mark_buffer_dirty(bh);
979 unlock:
980 	unlock_page(page);
981 	put_page(page);
982 	return err;
983 }
984 
985 #define GFS2_JTRUNC_REVOKES 8192
986 
987 /**
988  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
989  * @inode: The inode being truncated
990  * @oldsize: The original (larger) size
991  * @newsize: The new smaller size
992  *
993  * With jdata files, we have to journal a revoke for each block which is
994  * truncated. As a result, we need to split this into separate transactions
995  * if the number of pages being truncated gets too large.
996  */
997 
998 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
999 {
1000 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1001 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1002 	u64 chunk;
1003 	int error;
1004 
1005 	while (oldsize != newsize) {
1006 		chunk = oldsize - newsize;
1007 		if (chunk > max_chunk)
1008 			chunk = max_chunk;
1009 		truncate_pagecache(inode, oldsize - chunk);
1010 		oldsize -= chunk;
1011 		gfs2_trans_end(sdp);
1012 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1013 		if (error)
1014 			return error;
1015 	}
1016 
1017 	return 0;
1018 }
1019 
1020 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1021 {
1022 	struct gfs2_inode *ip = GFS2_I(inode);
1023 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1024 	struct address_space *mapping = inode->i_mapping;
1025 	struct buffer_head *dibh;
1026 	int journaled = gfs2_is_jdata(ip);
1027 	int error;
1028 
1029 	if (journaled)
1030 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1031 	else
1032 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1033 	if (error)
1034 		return error;
1035 
1036 	error = gfs2_meta_inode_buffer(ip, &dibh);
1037 	if (error)
1038 		goto out;
1039 
1040 	gfs2_trans_add_meta(ip->i_gl, dibh);
1041 
1042 	if (gfs2_is_stuffed(ip)) {
1043 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 	} else {
1045 		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1046 			error = gfs2_block_truncate_page(mapping, newsize);
1047 			if (error)
1048 				goto out_brelse;
1049 		}
1050 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051 	}
1052 
1053 	i_size_write(inode, newsize);
1054 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1055 	gfs2_dinode_out(ip, dibh->b_data);
1056 
1057 	if (journaled)
1058 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1059 	else
1060 		truncate_pagecache(inode, newsize);
1061 
1062 	if (error) {
1063 		brelse(dibh);
1064 		return error;
1065 	}
1066 
1067 out_brelse:
1068 	brelse(dibh);
1069 out:
1070 	gfs2_trans_end(sdp);
1071 	return error;
1072 }
1073 
1074 /**
1075  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1076  * @ip: inode
1077  * @rg_gh: holder of resource group glock
1078  * @mp: current metapath fully populated with buffers
1079  * @btotal: place to keep count of total blocks freed
1080  * @hgt: height we're processing
1081  * @first: true if this is the first call to this function for this height
1082  *
1083  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1084  * free, and free them all. However, we do it one rgrp at a time. If this
1085  * block has references to multiple rgrps, we break it into individual
1086  * transactions. This allows other processes to use the rgrps while we're
1087  * focused on a single one, for better concurrency / performance.
1088  * At every transaction boundary, we rewrite the inode into the journal.
1089  * That way the bitmaps are kept consistent with the inode and we can recover
1090  * if we're interrupted by power-outages.
1091  *
1092  * Returns: 0, or return code if an error occurred.
1093  *          *btotal has the total number of blocks freed
1094  */
1095 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1096 			      const struct metapath *mp, u32 *btotal, int hgt,
1097 			      bool preserve1)
1098 {
1099 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1100 	struct gfs2_rgrpd *rgd;
1101 	struct gfs2_trans *tr;
1102 	struct buffer_head *bh = mp->mp_bh[hgt];
1103 	__be64 *top, *bottom, *p;
1104 	int blks_outside_rgrp;
1105 	u64 bn, bstart, isize_blks;
1106 	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1107 	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
1108 	int ret = 0;
1109 	bool buf_in_tr = false; /* buffer was added to transaction */
1110 
1111 	if (gfs2_metatype_check(sdp, bh,
1112 				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
1113 		return -EIO;
1114 
1115 more_rgrps:
1116 	blks_outside_rgrp = 0;
1117 	bstart = 0;
1118 	blen = 0;
1119 	top = metapointer(hgt, mp); /* first ptr from metapath */
1120 	/* If we're keeping some data at the truncation point, we've got to
1121 	   preserve the metadata tree by adding 1 to the starting metapath. */
1122 	if (preserve1)
1123 		top++;
1124 
1125 	bottom = (__be64 *)(bh->b_data + bh->b_size);
1126 
1127 	for (p = top; p < bottom; p++) {
1128 		if (!*p)
1129 			continue;
1130 		bn = be64_to_cpu(*p);
1131 		if (gfs2_holder_initialized(rd_gh)) {
1132 			rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1133 			gfs2_assert_withdraw(sdp,
1134 				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1135 		} else {
1136 			rgd = gfs2_blk2rgrpd(sdp, bn, false);
1137 			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1138 						 0, rd_gh);
1139 			if (ret)
1140 				goto out;
1141 
1142 			/* Must be done with the rgrp glock held: */
1143 			if (gfs2_rs_active(&ip->i_res) &&
1144 			    rgd == ip->i_res.rs_rbm.rgd)
1145 				gfs2_rs_deltree(&ip->i_res);
1146 		}
1147 
1148 		if (!rgrp_contains_block(rgd, bn)) {
1149 			blks_outside_rgrp++;
1150 			continue;
1151 		}
1152 
1153 		/* The size of our transactions will be unknown until we
1154 		   actually process all the metadata blocks that relate to
1155 		   the rgrp. So we estimate. We know it can't be more than
1156 		   the dinode's i_blocks and we don't want to exceed the
1157 		   journal flush threshold, sd_log_thresh2. */
1158 		if (current->journal_info == NULL) {
1159 			unsigned int jblocks_rqsted, revokes;
1160 
1161 			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1162 				RES_INDIRECT;
1163 			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1164 			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1165 				jblocks_rqsted +=
1166 					atomic_read(&sdp->sd_log_thresh2);
1167 			else
1168 				jblocks_rqsted += isize_blks;
1169 			revokes = jblocks_rqsted;
1170 			if (meta)
1171 				revokes += hptrs(sdp, hgt);
1172 			else if (ip->i_depth)
1173 				revokes += sdp->sd_inptrs;
1174 			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1175 			if (ret)
1176 				goto out_unlock;
1177 			down_write(&ip->i_rw_mutex);
1178 		}
1179 		/* check if we will exceed the transaction blocks requested */
1180 		tr = current->journal_info;
1181 		if (tr->tr_num_buf_new + RES_STATFS +
1182 		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1183 			/* We set blks_outside_rgrp to ensure the loop will
1184 			   be repeated for the same rgrp, but with a new
1185 			   transaction. */
1186 			blks_outside_rgrp++;
1187 			/* This next part is tricky. If the buffer was added
1188 			   to the transaction, we've already set some block
1189 			   pointers to 0, so we better follow through and free
1190 			   them, or we will introduce corruption (so break).
1191 			   This may be impossible, or at least rare, but I
1192 			   decided to cover the case regardless.
1193 
1194 			   If the buffer was not added to the transaction
1195 			   (this call), doing so would exceed our transaction
1196 			   size, so we need to end the transaction and start a
1197 			   new one (so goto). */
1198 
1199 			if (buf_in_tr)
1200 				break;
1201 			goto out_unlock;
1202 		}
1203 
1204 		gfs2_trans_add_meta(ip->i_gl, bh);
1205 		buf_in_tr = true;
1206 		*p = 0;
1207 		if (bstart + blen == bn) {
1208 			blen++;
1209 			continue;
1210 		}
1211 		if (bstart) {
1212 			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1213 			(*btotal) += blen;
1214 			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1215 		}
1216 		bstart = bn;
1217 		blen = 1;
1218 	}
1219 	if (bstart) {
1220 		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1221 		(*btotal) += blen;
1222 		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1223 	}
1224 out_unlock:
1225 	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1226 					    outside the rgrp we just processed,
1227 					    do it all over again. */
1228 		if (current->journal_info) {
1229 			struct buffer_head *dibh = mp->mp_bh[0];
1230 
1231 			/* Every transaction boundary, we rewrite the dinode
1232 			   to keep its di_blocks current in case of failure. */
1233 			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1234 				current_time(&ip->i_inode);
1235 			gfs2_trans_add_meta(ip->i_gl, dibh);
1236 			gfs2_dinode_out(ip, dibh->b_data);
1237 			up_write(&ip->i_rw_mutex);
1238 			gfs2_trans_end(sdp);
1239 		}
1240 		gfs2_glock_dq_uninit(rd_gh);
1241 		cond_resched();
1242 		goto more_rgrps;
1243 	}
1244 out:
1245 	return ret;
1246 }
1247 
1248 /**
1249  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1250  * assumes the metapath is valid (with buffers) out to height h
1251  * @mp: starting metapath
1252  * @h: desired height to search
1253  *
1254  * Returns: true if a non-null pointer was found in the metapath buffer
1255  *          false if all remaining pointers are NULL in the buffer
1256  */
1257 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1258 			     unsigned int h)
1259 {
1260 	__be64 *ptr;
1261 	unsigned int ptrs = hptrs(sdp, h) - 1;
1262 
1263 	while (true) {
1264 		ptr = metapointer(h, mp);
1265 		if (*ptr) { /* if we have a non-null pointer */
1266 			/* Now zero the metapath after the current height. */
1267 			h++;
1268 			if (h < GFS2_MAX_META_HEIGHT)
1269 				memset(&mp->mp_list[h], 0,
1270 				       (GFS2_MAX_META_HEIGHT - h) *
1271 				       sizeof(mp->mp_list[0]));
1272 			return true;
1273 		}
1274 
1275 		if (mp->mp_list[h] < ptrs)
1276 			mp->mp_list[h]++;
1277 		else
1278 			return false; /* no more pointers in this buffer */
1279 	}
1280 }
1281 
/* States of the trunc_dealloc() state machine. */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL = 0,
	/* lower the metapath strip height */
	DEALLOC_MP_LOWER = 1,
	/* Fill in the metapath to the given height. */
	DEALLOC_FILL_MP = 2,
	/* process complete */
	DEALLOC_DONE = 3,
};
1288 
1289 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h)
1290 {
1291 	if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0])))
1292 		return false;
1293 	return true;
1294 }
1295 
/**
 * trunc_dealloc - truncate a file down to a desired size
 * @ip: inode to truncate
 * @newsize: The desired size of the file
 *
 * This function truncates a file to newsize. It works from the
 * bottom up, and from the right to the left. In other words, it strips off
 * the highest layer (data) before stripping any of the metadata. Doing it
 * this way is best in case the operation is interrupted by power failure, etc.
 * The dinode is rewritten in every transaction to guarantee integrity.
 *
 * The work is driven by a small state machine (see enum dealloc_states):
 * DEALLOC_FILL_MP reads buffers into the metapath, DEALLOC_MP_FULL sweeps the
 * buffer at the current strip height, and DEALLOC_MP_LOWER moves the metapath
 * to the next position or lowers the strip height.
 *
 * Returns: 0 on success, or errno
 */
static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct metapath mp;
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	u64 lblock;
	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
	unsigned int strip_h = ip->i_height - 1;
	u32 btotal = 0;
	int ret, state;
	int mp_h; /* metapath buffers are read in to this height */
	sector_t last_ra = 0;
	u64 prev_bnr = 0;
	bool preserve1; /* need to preserve the first meta pointer? */

	/* Logical block holding the last byte that is kept (0 if none). */
	if (!newsize)
		lblock = 0;
	else
		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;

	memset(&mp, 0, sizeof(mp));
	find_metapath(sdp, lblock, &mp, ip->i_height);

	/* Remember the metapath of the truncation point. */
	memcpy(&nbof, &mp.mp_list, sizeof(nbof));

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		return ret;

	/* The dinode buffer is the root (height 0) of the metapath. */
	mp.mp_bh[0] = dibh;
	ret = lookup_metapath(ip, &mp);
	if (ret == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
	else
		state = DEALLOC_FILL_MP; /* deal with partial metapath */

	ret = gfs2_rindex_update(sdp);
	if (ret)
		goto out_metapath;

	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	if (ret)
		goto out_metapath;
	/* sweep_bh_for_rgrps() initializes this holder on first use. */
	gfs2_holder_mark_uninitialized(&rd_gh);

	mp_h = strip_h;

	while (state != DEALLOC_DONE) {
		switch (state) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			if (mp_h > 0) { /* issue read-ahead on metadata */
				__be64 *top;

				bh = mp.mp_bh[mp_h - 1];
				/* Skip buffers read-ahead was last issued for. */
				if (bh->b_blocknr != last_ra) {
					last_ra = bh->b_blocknr;
					top = metaptr1(mp_h - 1, &mp);
					gfs2_metapath_ra(ip->i_gl, bh, top);
				}
			}
			/* If we're truncating to a non-zero size and the mp is
			   at the beginning of file for the strip height, we
			   need to preserve the first metadata pointer. */
			preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h));
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			/* Sweeping the same block twice in a row is a bug. */
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				       sdp->sd_fsname,
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			}
			prev_bnr = bh->b_blocknr;
			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
						 mp_h, preserve1);
			/* If we hit an error or just swept dinode buffer,
			   just exit. */
			if (ret || !mp_h) {
				state = DEALLOC_DONE;
				break;
			}
			state = DEALLOC_MP_LOWER;
			break;

		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
			if (mp_h) {
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			}
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
			if (mp_h == 0) {
				strip_h--;
				/* Restart from the truncation point. */
				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
				mp_h = strip_h;
				state = DEALLOC_FILL_MP;
				break;
			}
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			/* Already at the last pointer of this buffer? */
			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
				break; /* loop around in the same state */
			mp.mp_list[mp_h]++;
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
				state = DEALLOC_FILL_MP;
				mp_h++;
			}
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */

		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			if (ret < 0)
				goto out;

			/* If buffers found for the entire strip height */
			if ((ret == ip->i_height) && (mp_h == strip_h)) {
				state = DEALLOC_MP_FULL;
				break;
			}
			if (ret < ip->i_height) /* We have a partial height */
				mp_h = ret - 1;

			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h))
				mp_h++;
			else
				state = DEALLOC_MP_LOWER;
			break;
		}
	}

	/* Account for the freed blocks in statfs and quota, and write the
	   dinode one last time. A transaction may still be open here. */
	if (btotal) {
		if (current->journal_info == NULL) {
			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
					       RES_QUOTA, 0);
			if (ret)
				goto out;
			down_write(&ip->i_rw_mutex);
		}
		gfs2_statfs_change(sdp, 0, +btotal, 0);
		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				  ip->i_inode.i_gid);
		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
		gfs2_trans_add_meta(ip->i_gl, dibh);
		gfs2_dinode_out(ip, dibh->b_data);
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	}

out:
	/* Drop the rgrp glock and any transaction that is still open. */
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
		cond_resched();
	}
	gfs2_quota_unhold(ip);
out_metapath:
	release_metapath(&mp);
	return ret;
}
1488 
1489 static int trunc_end(struct gfs2_inode *ip)
1490 {
1491 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1492 	struct buffer_head *dibh;
1493 	int error;
1494 
1495 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1496 	if (error)
1497 		return error;
1498 
1499 	down_write(&ip->i_rw_mutex);
1500 
1501 	error = gfs2_meta_inode_buffer(ip, &dibh);
1502 	if (error)
1503 		goto out;
1504 
1505 	if (!i_size_read(&ip->i_inode)) {
1506 		ip->i_height = 0;
1507 		ip->i_goal = ip->i_no_addr;
1508 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1509 		gfs2_ordered_del_inode(ip);
1510 	}
1511 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1512 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1513 
1514 	gfs2_trans_add_meta(ip->i_gl, dibh);
1515 	gfs2_dinode_out(ip, dibh->b_data);
1516 	brelse(dibh);
1517 
1518 out:
1519 	up_write(&ip->i_rw_mutex);
1520 	gfs2_trans_end(sdp);
1521 	return error;
1522 }
1523 
1524 /**
1525  * do_shrink - make a file smaller
1526  * @inode: the inode
1527  * @oldsize: the current inode size
1528  * @newsize: the size to make the file
1529  *
1530  * Called with an exclusive lock on @inode. The @size must
1531  * be equal to or smaller than the current inode size.
1532  *
1533  * Returns: errno
1534  */
1535 
1536 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1537 {
1538 	struct gfs2_inode *ip = GFS2_I(inode);
1539 	int error;
1540 
1541 	error = trunc_start(inode, oldsize, newsize);
1542 	if (error < 0)
1543 		return error;
1544 	if (gfs2_is_stuffed(ip))
1545 		return 0;
1546 
1547 	error = trunc_dealloc(ip, newsize);
1548 	if (error == 0)
1549 		error = trunc_end(ip);
1550 
1551 	return error;
1552 }
1553 
1554 void gfs2_trim_blocks(struct inode *inode)
1555 {
1556 	u64 size = inode->i_size;
1557 	int ret;
1558 
1559 	ret = do_shrink(inode, size, size);
1560 	WARN_ON(ret != 0);
1561 }
1562 
1563 /**
1564  * do_grow - Touch and update inode size
1565  * @inode: The inode
1566  * @size: The new size
1567  *
1568  * This function updates the timestamps on the inode and
1569  * may also increase the size of the inode. This function
1570  * must not be called with @size any smaller than the current
1571  * inode size.
1572  *
1573  * Although it is not strictly required to unstuff files here,
1574  * earlier versions of GFS2 have a bug in the stuffed file reading
1575  * code which will result in a buffer overrun if the size is larger
1576  * than the max stuffed file size. In order to prevent this from
1577  * occurring, such files are unstuffed, but in other cases we can
1578  * just update the inode size directly.
1579  *
1580  * Returns: 0 on success, or -ve on error
1581  */
1582 
1583 static int do_grow(struct inode *inode, u64 size)
1584 {
1585 	struct gfs2_inode *ip = GFS2_I(inode);
1586 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1587 	struct gfs2_alloc_parms ap = { .target = 1, };
1588 	struct buffer_head *dibh;
1589 	int error;
1590 	int unstuff = 0;
1591 
1592 	if (gfs2_is_stuffed(ip) &&
1593 	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1594 		error = gfs2_quota_lock_check(ip, &ap);
1595 		if (error)
1596 			return error;
1597 
1598 		error = gfs2_inplace_reserve(ip, &ap);
1599 		if (error)
1600 			goto do_grow_qunlock;
1601 		unstuff = 1;
1602 	}
1603 
1604 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1605 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1606 				  0 : RES_QUOTA), 0);
1607 	if (error)
1608 		goto do_grow_release;
1609 
1610 	if (unstuff) {
1611 		error = gfs2_unstuff_dinode(ip, NULL);
1612 		if (error)
1613 			goto do_end_trans;
1614 	}
1615 
1616 	error = gfs2_meta_inode_buffer(ip, &dibh);
1617 	if (error)
1618 		goto do_end_trans;
1619 
1620 	i_size_write(inode, size);
1621 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1622 	gfs2_trans_add_meta(ip->i_gl, dibh);
1623 	gfs2_dinode_out(ip, dibh->b_data);
1624 	brelse(dibh);
1625 
1626 do_end_trans:
1627 	gfs2_trans_end(sdp);
1628 do_grow_release:
1629 	if (unstuff) {
1630 		gfs2_inplace_release(ip);
1631 do_grow_qunlock:
1632 		gfs2_quota_unlock(ip);
1633 	}
1634 	return error;
1635 }
1636 
1637 /**
1638  * gfs2_setattr_size - make a file a given size
1639  * @inode: the inode
1640  * @newsize: the size to make the file
1641  *
1642  * The file size can grow, shrink, or stay the same size. This
1643  * is called holding i_mutex and an exclusive glock on the inode
1644  * in question.
1645  *
1646  * Returns: errno
1647  */
1648 
1649 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1650 {
1651 	struct gfs2_inode *ip = GFS2_I(inode);
1652 	int ret;
1653 	u64 oldsize;
1654 
1655 	BUG_ON(!S_ISREG(inode->i_mode));
1656 
1657 	ret = inode_newsize_ok(inode, newsize);
1658 	if (ret)
1659 		return ret;
1660 
1661 	inode_dio_wait(inode);
1662 
1663 	ret = gfs2_rsqa_alloc(ip);
1664 	if (ret)
1665 		goto out;
1666 
1667 	oldsize = inode->i_size;
1668 	if (newsize >= oldsize) {
1669 		ret = do_grow(inode, newsize);
1670 		goto out;
1671 	}
1672 
1673 	ret = do_shrink(inode, oldsize, newsize);
1674 out:
1675 	gfs2_rsqa_delete(ip, NULL);
1676 	return ret;
1677 }
1678 
1679 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1680 {
1681 	int error;
1682 	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1683 	if (!error)
1684 		error = trunc_end(ip);
1685 	return error;
1686 }
1687 
1688 int gfs2_file_dealloc(struct gfs2_inode *ip)
1689 {
1690 	return trunc_dealloc(ip, 0);
1691 }
1692 
1693 /**
1694  * gfs2_free_journal_extents - Free cached journal bmap info
1695  * @jd: The journal
1696  *
1697  */
1698 
1699 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1700 {
1701 	struct gfs2_journal_extent *jext;
1702 
1703 	while(!list_empty(&jd->extent_list)) {
1704 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1705 		list_del(&jext->list);
1706 		kfree(jext);
1707 	}
1708 }
1709 
1710 /**
1711  * gfs2_add_jextent - Add or merge a new extent to extent cache
1712  * @jd: The journal descriptor
1713  * @lblock: The logical block at start of new extent
1714  * @dblock: The physical block at start of new extent
1715  * @blocks: Size of extent in fs blocks
1716  *
1717  * Returns: 0 on success or -ENOMEM
1718  */
1719 
1720 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1721 {
1722 	struct gfs2_journal_extent *jext;
1723 
1724 	if (!list_empty(&jd->extent_list)) {
1725 		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1726 		if ((jext->dblock + jext->blocks) == dblock) {
1727 			jext->blocks += blocks;
1728 			return 0;
1729 		}
1730 	}
1731 
1732 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1733 	if (jext == NULL)
1734 		return -ENOMEM;
1735 	jext->dblock = dblock;
1736 	jext->lblock = lblock;
1737 	jext->blocks = blocks;
1738 	list_add_tail(&jext->list, &jd->extent_list);
1739 	jd->nr_extents++;
1740 	return 0;
1741 }
1742 
1743 /**
1744  * gfs2_map_journal_extents - Cache journal bmap info
1745  * @sdp: The super block
1746  * @jd: The journal to map
1747  *
1748  * Create a reusable "extent" mapping from all logical
1749  * blocks to all physical blocks for the given journal.  This will save
1750  * us time when writing journal blocks.  Most journals will have only one
1751  * extent that maps all their logical blocks.  That's because gfs2.mkfs
1752  * arranges the journal blocks sequentially to maximize performance.
1753  * So the extent would map the first block for the entire file length.
1754  * However, gfs2_jadd can happen while file activity is happening, so
1755  * those journals may not be sequential.  Less likely is the case where
1756  * the users created their own journals by mounting the metafs and
1757  * laying it out.  But it's still possible.  These journals might have
1758  * several extents.
1759  *
1760  * Returns: 0 on success, or error on failure
1761  */
1762 
1763 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1764 {
1765 	u64 lblock = 0;
1766 	u64 lblock_stop;
1767 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1768 	struct buffer_head bh;
1769 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1770 	u64 size;
1771 	int rc;
1772 
1773 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
1774 	size = (lblock_stop - lblock) << shift;
1775 	jd->nr_extents = 0;
1776 	WARN_ON(!list_empty(&jd->extent_list));
1777 
1778 	do {
1779 		bh.b_state = 0;
1780 		bh.b_blocknr = 0;
1781 		bh.b_size = size;
1782 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1783 		if (rc || !buffer_mapped(&bh))
1784 			goto fail;
1785 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1786 		if (rc)
1787 			goto fail;
1788 		size -= bh.b_size;
1789 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1790 	} while(size > 0);
1791 
1792 	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1793 		jd->nr_extents);
1794 	return 0;
1795 
1796 fail:
1797 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1798 		rc, jd->jd_jid,
1799 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
1800 		jd->nr_extents);
1801 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1802 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1803 		bh.b_state, (unsigned long long)bh.b_size);
1804 	gfs2_free_journal_extents(jd);
1805 	return rc;
1806 }
1807 
1808 /**
1809  * gfs2_write_alloc_required - figure out if a write will require an allocation
1810  * @ip: the file being written to
1811  * @offset: the offset to write to
1812  * @len: the number of bytes being written
1813  *
1814  * Returns: 1 if an alloc is required, 0 otherwise
1815  */
1816 
1817 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1818 			      unsigned int len)
1819 {
1820 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1821 	struct buffer_head bh;
1822 	unsigned int shift;
1823 	u64 lblock, lblock_stop, size;
1824 	u64 end_of_file;
1825 
1826 	if (!len)
1827 		return 0;
1828 
1829 	if (gfs2_is_stuffed(ip)) {
1830 		if (offset + len >
1831 		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1832 			return 1;
1833 		return 0;
1834 	}
1835 
1836 	shift = sdp->sd_sb.sb_bsize_shift;
1837 	BUG_ON(gfs2_is_dir(ip));
1838 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1839 	lblock = offset >> shift;
1840 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1841 	if (lblock_stop > end_of_file)
1842 		return 1;
1843 
1844 	size = (lblock_stop - lblock) << shift;
1845 	do {
1846 		bh.b_state = 0;
1847 		bh.b_size = size;
1848 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1849 		if (!buffer_mapped(&bh))
1850 			return 1;
1851 		size -= bh.b_size;
1852 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1853 	} while(size > 0);
1854 
1855 	return 0;
1856 }
1857 
1858