xref: /linux/fs/ext4/inode.c (revision a1087ef6abedf0bfd60e5e3fddf33192cb2c1325)
1 /*
2  *  linux/fs/ext4/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *	(sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *	(jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23  */
24 
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/jbd2.h>
29 #include <linux/highuid.h>
30 #include <linux/pagemap.h>
31 #include <linux/quotaops.h>
32 #include <linux/string.h>
33 #include <linux/buffer_head.h>
34 #include <linux/writeback.h>
35 #include <linux/pagevec.h>
36 #include <linux/mpage.h>
37 #include <linux/namei.h>
38 #include <linux/uio.h>
39 #include <linux/bio.h>
40 #include <linux/workqueue.h>
41 #include <linux/kernel.h>
42 #include <linux/slab.h>
43 
44 #include "ext4_jbd2.h"
45 #include "xattr.h"
46 #include "acl.h"
47 #include "ext4_extents.h"
48 
49 #include <trace/events/ext4.h>
50 
51 #define MPAGE_DA_EXTENT_TAIL 0x01
52 
53 static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 					      loff_t new_size)
55 {
56 	return jbd2_journal_begin_ordered_truncate(
57 					EXT4_SB(inode->i_sb)->s_journal,
58 					&EXT4_I(inode)->jinode,
59 					new_size);
60 }
61 
62 static void ext4_invalidatepage(struct page *page, unsigned long offset);
63 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
64 				   struct buffer_head *bh_result, int create);
65 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
66 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
67 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
68 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
69 
70 /*
71  * Test whether an inode is a fast symlink.
72  */
73 static int ext4_inode_is_fast_symlink(struct inode *inode)
74 {
75 	int ea_blocks = EXT4_I(inode)->i_file_acl ?
76 		(inode->i_sb->s_blocksize >> 9) : 0;
77 
78 	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
79 }
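/*
 * Example (assuming a 4KB block size): i_blocks is counted in 512-byte
 * units, so an external xattr block contributes 4096 >> 9 == 8 to
 * i_blocks and ea_blocks is likewise 8.  A symlink whose target lives
 * entirely in i_data then has i_blocks - ea_blocks == 0 and still
 * qualifies as a fast symlink even though it owns an xattr block.
 */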
80 
81 /*
82  * Work out how many blocks we need to proceed with the next chunk of a
83  * truncate transaction.
84  */
85 static unsigned long blocks_for_truncate(struct inode *inode)
86 {
87 	ext4_lblk_t needed;
88 
89 	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
90 
91 	/* Give ourselves just enough room to cope with inodes in which
92 	 * i_blocks is corrupt: we've seen disk corruptions in the past
93 	 * which resulted in random data in an inode which looked enough
94 	 * like a regular file for ext4 to try to delete it.  Things
95 	 * will go a bit crazy if that happens, but at least we should
96 	 * try not to panic the whole kernel. */
97 	if (needed < 2)
98 		needed = 2;
99 
100 	/* But we need to bound the transaction so we don't overflow the
101 	 * journal. */
102 	if (needed > EXT4_MAX_TRANS_DATA)
103 		needed = EXT4_MAX_TRANS_DATA;
104 
105 	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
106 }
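/*
 * Example (assuming a 4KB block size, so s_blocksize_bits == 12): a file
 * with i_blocks == 80 (eighty 512-byte units, i.e. ten 4KB blocks) gives
 * needed == 80 >> 3 == 10, so the estimate returned is
 * EXT4_DATA_TRANS_BLOCKS(sb) + 10 credits, after clamping needed to at
 * least 2 and at most EXT4_MAX_TRANS_DATA.
 */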
107 
108 /*
109  * Truncate transactions can be complex and absolutely huge.  So we need to
110  * be able to restart the transaction at a convenient checkpoint to make
111  * sure we don't overflow the journal.
112  *
113  * start_transaction gets us a new handle for a truncate transaction,
114  * and extend_transaction tries to extend the existing one a bit.  If
115  * extend fails, we need to propagate the failure up and restart the
116  * transaction in the top-level truncate loop. --sct
117  */
118 static handle_t *start_transaction(struct inode *inode)
119 {
120 	handle_t *result;
121 
122 	result = ext4_journal_start(inode, blocks_for_truncate(inode));
123 	if (!IS_ERR(result))
124 		return result;
125 
126 	ext4_std_error(inode->i_sb, PTR_ERR(result));
127 	return result;
128 }
129 
130 /*
131  * Try to extend this transaction for the purposes of truncation.
132  *
133  * Returns 0 if we managed to create more room.  If we can't create more
134  * room and the transaction must be restarted, we return 1.
135  */
136 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
137 {
138 	if (!ext4_handle_valid(handle))
139 		return 0;
140 	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
141 		return 0;
142 	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
143 		return 0;
144 	return 1;
145 }
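/*
 * A minimal sketch of how the truncate path uses the two helpers above:
 * the caller first makes sure everything it dirtied is consistently
 * recorded against the handle, then restarts with a fresh credit
 * estimate.
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		ext4_mark_inode_dirty(handle, inode);
 *		ext4_truncate_restart_trans(handle, inode,
 *					    blocks_for_truncate(inode));
 *	}
 */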
146 
147 /*
148  * Restart the transaction associated with *handle.  This does a commit,
149  * so before we call here everything must be consistently dirtied against
150  * this transaction.
151  */
152 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
153 				 int nblocks)
154 {
155 	int ret;
156 
157 	/*
158 	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
159 	 * moment, get_block can be called only for blocks inside i_size since
160 	 * page cache has been already dropped and writes are blocked by
161 	 * i_mutex. So we can safely drop the i_data_sem here.
162 	 */
163 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
164 	jbd_debug(2, "restarting handle %p\n", handle);
165 	up_write(&EXT4_I(inode)->i_data_sem);
166 	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
167 	down_write(&EXT4_I(inode)->i_data_sem);
168 	ext4_discard_preallocations(inode);
169 
170 	return ret;
171 }
172 
173 /*
174  * Called at the last iput(); deletes the inode if i_nlink is zero.
175  */
176 void ext4_evict_inode(struct inode *inode)
177 {
178 	handle_t *handle;
179 	int err;
180 
181 	if (inode->i_nlink) {
182 		truncate_inode_pages(&inode->i_data, 0);
183 		goto no_delete;
184 	}
185 
186 	if (!is_bad_inode(inode))
187 		dquot_initialize(inode);
188 
189 	if (ext4_should_order_data(inode))
190 		ext4_begin_ordered_truncate(inode, 0);
191 	truncate_inode_pages(&inode->i_data, 0);
192 
193 	if (is_bad_inode(inode))
194 		goto no_delete;
195 
196 	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
197 	if (IS_ERR(handle)) {
198 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
199 		/*
200 		 * If we're going to skip the normal cleanup, we still need to
201 		 * make sure that the in-core orphan linked list is properly
202 		 * cleaned up.
203 		 */
204 		ext4_orphan_del(NULL, inode);
205 		goto no_delete;
206 	}
207 
208 	if (IS_SYNC(inode))
209 		ext4_handle_sync(handle);
210 	inode->i_size = 0;
211 	err = ext4_mark_inode_dirty(handle, inode);
212 	if (err) {
213 		ext4_warning(inode->i_sb,
214 			     "couldn't mark inode dirty (err %d)", err);
215 		goto stop_handle;
216 	}
217 	if (inode->i_blocks)
218 		ext4_truncate(inode);
219 
220 	/*
221 	 * ext4_ext_truncate() doesn't reserve any slop when it
222 	 * restarts journal transactions; therefore there may not be
223 	 * enough credits left in the handle to remove the inode from
224 	 * the orphan list and set the dtime field.
225 	 */
226 	if (!ext4_handle_has_enough_credits(handle, 3)) {
227 		err = ext4_journal_extend(handle, 3);
228 		if (err > 0)
229 			err = ext4_journal_restart(handle, 3);
230 		if (err != 0) {
231 			ext4_warning(inode->i_sb,
232 				     "couldn't extend journal (err %d)", err);
233 		stop_handle:
234 			ext4_journal_stop(handle);
235 			ext4_orphan_del(NULL, inode);
236 			goto no_delete;
237 		}
238 	}
239 
240 	/*
241 	 * Kill off the orphan record which ext4_truncate created.
242 	 * AKPM: I think this can be inside the above `if'.
243 	 * Note that ext4_orphan_del() has to be able to cope with the
244 	 * deletion of a non-existent orphan - this is because we don't
245 	 * know if ext4_truncate() actually created an orphan record.
246 	 * (Well, we could do this if we need to, but heck - it works)
247 	 */
248 	ext4_orphan_del(handle, inode);
249 	EXT4_I(inode)->i_dtime	= get_seconds();
250 
251 	/*
252 	 * One subtle ordering requirement: if anything has gone wrong
253 	 * (transaction abort, IO errors, whatever), then we can still
254 	 * do these next steps (the fs will already have been marked as
255 	 * having errors), but we can't free the inode if the mark_dirty
256 	 * fails.
257 	 */
258 	if (ext4_mark_inode_dirty(handle, inode))
259 		/* If that failed, just do the required in-core inode clear. */
260 		ext4_clear_inode(inode);
261 	else
262 		ext4_free_inode(handle, inode);
263 	ext4_journal_stop(handle);
264 	return;
265 no_delete:
266 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
267 }
268 
269 typedef struct {
270 	__le32	*p;
271 	__le32	key;
272 	struct buffer_head *bh;
273 } Indirect;
274 
275 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
276 {
277 	p->key = *(p->p = v);
278 	p->bh = bh;
279 }
280 
281 /**
282  *	ext4_block_to_path - parse the block number into array of offsets
283  *	@inode: inode in question (we are only interested in its superblock)
284  *	@i_block: block number to be parsed
285  *	@offsets: array to store the offsets in
286  *	@boundary: set this non-zero if the referred-to block is likely to be
287  *	       followed (on disk) by an indirect block.
288  *
289  *	To store the locations of a file's data, ext4 uses a data structure
290  *	common to UNIX filesystems - a tree of pointers anchored in the inode,
291  *	with data blocks at the leaves and indirect blocks in intermediate nodes.
292  *	This function translates the block number into a path in that tree -
293  *	the return value is the path length and @offsets[n] is the offset of
294  *	the pointer to the (n+1)th node in the nth one. If @block is out of
295  *	range (negative or too large), a warning is printed and zero is returned.
296  *
297  *	Note: function doesn't find node addresses, so no IO is needed. All
298  *	we need to know is the capacity of indirect blocks (taken from the
299  *	inode->i_sb).
300  */
301 
302 /*
303  * Portability note: the last comparison (check that we fit into triple
304  * indirect block) is spelled differently, because otherwise on an
305  * architecture with 32-bit longs and 8Kb pages we might get into trouble
306  * if our filesystem had 8Kb blocks. We might use long long, but that would
307  * kill us on x86. Oh, well, at least the sign propagation does not matter -
308  * i_block would have to be negative in the very beginning, so we would not
309  * get there at all.
310  */
311 
312 static int ext4_block_to_path(struct inode *inode,
313 			      ext4_lblk_t i_block,
314 			      ext4_lblk_t offsets[4], int *boundary)
315 {
316 	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
317 	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
318 	const long direct_blocks = EXT4_NDIR_BLOCKS,
319 		indirect_blocks = ptrs,
320 		double_blocks = (1 << (ptrs_bits * 2));
321 	int n = 0;
322 	int final = 0;
323 
324 	if (i_block < direct_blocks) {
325 		offsets[n++] = i_block;
326 		final = direct_blocks;
327 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
328 		offsets[n++] = EXT4_IND_BLOCK;
329 		offsets[n++] = i_block;
330 		final = ptrs;
331 	} else if ((i_block -= indirect_blocks) < double_blocks) {
332 		offsets[n++] = EXT4_DIND_BLOCK;
333 		offsets[n++] = i_block >> ptrs_bits;
334 		offsets[n++] = i_block & (ptrs - 1);
335 		final = ptrs;
336 	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
337 		offsets[n++] = EXT4_TIND_BLOCK;
338 		offsets[n++] = i_block >> (ptrs_bits * 2);
339 		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
340 		offsets[n++] = i_block & (ptrs - 1);
341 		final = ptrs;
342 	} else {
343 		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
344 			     i_block + direct_blocks +
345 			     indirect_blocks + double_blocks, inode->i_ino);
346 	}
347 	if (boundary)
348 		*boundary = final - 1 - (i_block & (ptrs - 1));
349 	return n;
350 }
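/*
 * Worked example (assuming a 4KB block size, so ptrs == 1024 and
 * ptrs_bits == 10): logical block 5000 is past the 12 direct slots
 * (5000 - 12 == 4988) and past the single-indirect range
 * (4988 - 1024 == 3964, which is < 1024 * 1024), so it lands in the
 * double-indirect tree:
 * offsets[] == { EXT4_DIND_BLOCK, 3964 >> 10 == 3, 3964 & 1023 == 892 }
 * and the returned depth is 3.
 */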
351 
352 static int __ext4_check_blockref(const char *function, unsigned int line,
353 				 struct inode *inode,
354 				 __le32 *p, unsigned int max)
355 {
356 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
357 	__le32 *bref = p;
358 	unsigned int blk;
359 
360 	while (bref < p+max) {
361 		blk = le32_to_cpu(*bref++);
362 		if (blk &&
363 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
364 						    blk, 1))) {
365 			es->s_last_error_block = cpu_to_le64(blk);
366 			ext4_error_inode(inode, function, line, blk,
367 					 "invalid block");
368 			return -EIO;
369 		}
370 	}
371 	return 0;
372 }
373 
374 
375 #define ext4_check_indirect_blockref(inode, bh)                         \
376 	__ext4_check_blockref(__func__, __LINE__, inode,		\
377 			      (__le32 *)(bh)->b_data,			\
378 			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
379 
380 #define ext4_check_inode_blockref(inode)                                \
381 	__ext4_check_blockref(__func__, __LINE__, inode,		\
382 			      EXT4_I(inode)->i_data,			\
383 			      EXT4_NDIR_BLOCKS)
384 
385 /**
386  *	ext4_get_branch - read the chain of indirect blocks leading to data
387  *	@inode: inode in question
388  *	@depth: depth of the chain (1 - direct pointer, etc.)
389  *	@offsets: offsets of pointers in inode/indirect blocks
390  *	@chain: place to store the result
391  *	@err: here we store the error value
392  *
393  *	Function fills the array of triples <key, p, bh> and returns %NULL
394  *	if everything went OK or the pointer to the last filled triple
395  *	(incomplete one) otherwise. Upon the return chain[i].key contains
396  *	the number of (i+1)-th block in the chain (as it is stored in memory,
397  *	i.e. little-endian 32-bit), chain[i].p contains the address of that
398  *	number (it points into struct inode for i==0 and into the bh->b_data
399  *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
400  *	block for i>0 and NULL for i==0. In other words, it holds the block
401  *	numbers of the chain, addresses they were taken from (and where we can
402  *	verify that chain did not change) and buffer_heads hosting these
403  *	numbers.
404  *
405  *	Function stops when it stumbles upon zero pointer (absent block)
406  *		(pointer to last triple returned, *@err == 0)
407  *	or when it gets an IO error reading an indirect block
408  *		(ditto, *@err == -EIO)
409  *	or when it reads all @depth-1 indirect blocks successfully and finds
410  *	the whole chain, all the way to the data (returns %NULL, *err == 0).
411  *
412  *      Needs to be called with
413  *      down_read(&EXT4_I(inode)->i_data_sem)
414  */
415 static Indirect *ext4_get_branch(struct inode *inode, int depth,
416 				 ext4_lblk_t  *offsets,
417 				 Indirect chain[4], int *err)
418 {
419 	struct super_block *sb = inode->i_sb;
420 	Indirect *p = chain;
421 	struct buffer_head *bh;
422 
423 	*err = 0;
424 	/* i_data is not going away, no lock needed */
425 	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
426 	if (!p->key)
427 		goto no_block;
428 	while (--depth) {
429 		bh = sb_getblk(sb, le32_to_cpu(p->key));
430 		if (unlikely(!bh))
431 			goto failure;
432 
433 		if (!bh_uptodate_or_lock(bh)) {
434 			if (bh_submit_read(bh) < 0) {
435 				put_bh(bh);
436 				goto failure;
437 			}
438 			/* validate block references */
439 			if (ext4_check_indirect_blockref(inode, bh)) {
440 				put_bh(bh);
441 				goto failure;
442 			}
443 		}
444 
445 		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
446 		/* Reader: end */
447 		if (!p->key)
448 			goto no_block;
449 	}
450 	return NULL;
451 
452 failure:
453 	*err = -EIO;
454 no_block:
455 	return p;
456 }
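/*
 * Example (assuming a 4KB block size): looking up logical block 100 of
 * an indirect-mapped file gives depth == 2 with offsets {EXT4_IND_BLOCK,
 * 88}.  On success, chain[0].p points at i_data[EXT4_IND_BLOCK] in the
 * inode (chain[0].bh == NULL) and chain[1].p points at slot 88 inside
 * the indirect block's b_data, with chain[1].bh holding that block.
 */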
457 
458 /**
459  *	ext4_find_near - find a place for allocation with sufficient locality
460  *	@inode: owner
461  *	@ind: descriptor of indirect block.
462  *
463  *	This function returns the preferred place for block allocation.
464  *	It is used when heuristic for sequential allocation fails.
465  *	Rules are:
466  *	  + if there is a block to the left of our position - allocate near it.
467  *	  + if pointer will live in indirect block - allocate near that block.
468  *	  + if pointer will live in inode - allocate in the same
469  *	    cylinder group.
470  *
471  * In the latter case we colour the starting block by the caller's PID to
472  * prevent it from clashing with concurrent allocations for a different inode
473  * in the same block group.  The PID is used here so that functionally related
474  * files will be close by on disk.
475  *
476  *	Caller must make sure that @ind is valid and will stay that way.
477  */
478 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
479 {
480 	struct ext4_inode_info *ei = EXT4_I(inode);
481 	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
482 	__le32 *p;
483 	ext4_fsblk_t bg_start;
484 	ext4_fsblk_t last_block;
485 	ext4_grpblk_t colour;
486 	ext4_group_t block_group;
487 	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
488 
489 	/* Try to find previous block */
490 	for (p = ind->p - 1; p >= start; p--) {
491 		if (*p)
492 			return le32_to_cpu(*p);
493 	}
494 
495 	/* No such thing, so let's try location of indirect block */
496 	if (ind->bh)
497 		return ind->bh->b_blocknr;
498 
499 	/*
500 	 * Is it going to be referred to from the inode itself? OK, just put it
501 	 * into the same cylinder group then.
502 	 */
503 	block_group = ei->i_block_group;
504 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
505 		block_group &= ~(flex_size-1);
506 		if (S_ISREG(inode->i_mode))
507 			block_group++;
508 	}
509 	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
510 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
511 
512 	/*
513 	 * If we are doing delayed allocation, we don't need to take
514 	 * colour into account.
515 	 */
516 	if (test_opt(inode->i_sb, DELALLOC))
517 		return bg_start;
518 
519 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
520 		colour = (current->pid % 16) *
521 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
522 	else
523 		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
524 	return bg_start + colour;
525 }
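/*
 * Example of the colouring above (illustrative numbers): with 32768
 * blocks per group and delayed allocation disabled, a task with pid 1234
 * gets colour == (1234 % 16) * (32768 / 16) == 2 * 2048 == 4096, so its
 * allocations start 4096 blocks into the goal group, away from blocks
 * handed to unrelated tasks.
 */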
526 
527 /**
528  *	ext4_find_goal - find a preferred place for allocation.
529  *	@inode: owner
530  *	@block:  block we want
531  *	@partial: pointer to the last triple within a chain
532  *
533  *	Normally this function finds the preferred place for block allocation
534  *	and returns it.
535  *	Because this is only used for non-extent files, we limit the block nr
536  *	to 32 bits.
537  */
538 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
539 				   Indirect *partial)
540 {
541 	ext4_fsblk_t goal;
542 
543 	/*
544 	 * XXX need to get goal block from mballoc's data structures
545 	 */
546 
547 	goal = ext4_find_near(inode, partial);
548 	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
549 	return goal;
550 }
551 
552 /**
553  *	ext4_blks_to_allocate: Look up the block map and count the number
554  *	of direct blocks that need to be allocated for the given branch.
555  *
556  *	@branch: chain of indirect blocks
557  *	@k: number of blocks need for indirect blocks
558  *	@blks: number of data blocks to be mapped.
559  *	@blocks_to_boundary:  the offset in the indirect block
560  *
561  *	return the total number of blocks to be allocated, including the
562  *	direct and indirect blocks.
563  */
564 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
565 				 int blocks_to_boundary)
566 {
567 	unsigned int count = 0;
568 
569 	/*
570 	 * Simple case: the [t,d]indirect block(s) have not been allocated
571 	 * yet, so clearly no blocks on that path have been allocated either.
572 	 */
573 	if (k > 0) {
574 		/* right now we don't handle cross boundary allocation */
575 		if (blks < blocks_to_boundary + 1)
576 			count += blks;
577 		else
578 			count += blocks_to_boundary + 1;
579 		return count;
580 	}
581 
582 	count++;
583 	while (count < blks && count <= blocks_to_boundary &&
584 		le32_to_cpu(*(branch[0].p + count)) == 0) {
585 		count++;
586 	}
587 	return count;
588 }
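/*
 * Example: with k == 1 (one indirect block still missing), blks == 20
 * and blocks_to_boundary == 7, we may allocate at most up to the
 * boundary, so the function returns 8 direct blocks (slots 0..7 of the
 * new indirect block); the rest of the request is handled by a later
 * call.
 */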
589 
590 /**
591  *	ext4_alloc_blocks: multiple allocate blocks needed for a branch
592  *	@indirect_blks: the number of blocks we need to allocate for indirect
593  *			blocks
594  *
595  *	@new_blocks: on return it will store the new block numbers for
596  *	the indirect blocks (if needed) and the first direct block
597  *	@blks:	on return it will store the total number of allocated
598  *		direct blocks
599  */
600 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
601 			     ext4_lblk_t iblock, ext4_fsblk_t goal,
602 			     int indirect_blks, int blks,
603 			     ext4_fsblk_t new_blocks[4], int *err)
604 {
605 	struct ext4_allocation_request ar;
606 	int target, i;
607 	unsigned long count = 0, blk_allocated = 0;
608 	int index = 0;
609 	ext4_fsblk_t current_block = 0;
610 	int ret = 0;
611 
612 	/*
613 	 * Here we try to allocate the requested multiple blocks at once,
614 	 * on a best-effort basis.
615 	 * To build a branch, we must allocate blocks for the
616 	 * indirect blocks (if not allocated yet), and at least the
617 	 * first direct block of this branch.  That's the minimum
618 	 * number of blocks that must be allocated (the required count).
619 	 */
620 	/* first we try to allocate the indirect blocks */
621 	target = indirect_blks;
622 	while (target > 0) {
623 		count = target;
624 		/* allocating blocks for indirect blocks and direct blocks */
625 		current_block = ext4_new_meta_blocks(handle, inode,
626 							goal, &count, err);
627 		if (*err)
628 			goto failed_out;
629 
630 		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
631 			EXT4_ERROR_INODE(inode,
632 					 "current_block %llu + count %lu > %d!",
633 					 current_block, count,
634 					 EXT4_MAX_BLOCK_FILE_PHYS);
635 			*err = -EIO;
636 			goto failed_out;
637 		}
638 
639 		target -= count;
640 		/* allocate blocks for indirect blocks */
641 		while (index < indirect_blks && count) {
642 			new_blocks[index++] = current_block++;
643 			count--;
644 		}
645 		if (count > 0) {
646 			/*
647 			 * save the new block number
648 			 * for the first direct block
649 			 */
650 			new_blocks[index] = current_block;
651 			printk(KERN_INFO "%s returned more blocks than "
652 						"requested\n", __func__);
653 			WARN_ON(1);
654 			break;
655 		}
656 	}
657 
658 	target = blks - count;
659 	blk_allocated = count;
660 	if (!target)
661 		goto allocated;
662 	/* Now allocate data blocks */
663 	memset(&ar, 0, sizeof(ar));
664 	ar.inode = inode;
665 	ar.goal = goal;
666 	ar.len = target;
667 	ar.logical = iblock;
668 	if (S_ISREG(inode->i_mode))
669 		/* enable in-core preallocation only for regular files */
670 		ar.flags = EXT4_MB_HINT_DATA;
671 
672 	current_block = ext4_mb_new_blocks(handle, &ar, err);
673 	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
674 		EXT4_ERROR_INODE(inode,
675 				 "current_block %llu + ar.len %d > %d!",
676 				 current_block, ar.len,
677 				 EXT4_MAX_BLOCK_FILE_PHYS);
678 		*err = -EIO;
679 		goto failed_out;
680 	}
681 
682 	if (*err && (target == blks)) {
683 		/*
684 		 * if the allocation failed and we didn't allocate
685 		 * any blocks before
686 		 */
687 		goto failed_out;
688 	}
689 	if (!*err) {
690 		if (target == blks) {
691 			/*
692 			 * save the new block number
693 			 * for the first direct block
694 			 */
695 			new_blocks[index] = current_block;
696 		}
697 		blk_allocated += ar.len;
698 	}
699 allocated:
700 	/* total number of blocks allocated for direct blocks */
701 	ret = blk_allocated;
702 	*err = 0;
703 	return ret;
704 failed_out:
705 	for (i = 0; i < index; i++)
706 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
707 	return ret;
708 }
709 
710 /**
711  *	ext4_alloc_branch - allocate and set up a chain of blocks.
712  *	@inode: owner
713  *	@indirect_blks: number of allocated indirect blocks
714  *	@blks: number of allocated direct blocks
715  *	@offsets: offsets (in the blocks) to store the pointers to next.
716  *	@branch: place to store the chain in.
717  *
718  *	This function allocates blocks, zeroes out all but the last one,
719  *	links them into a chain and (if we are synchronous) writes them to disk.
720  *	In other words, it prepares a branch that can be spliced onto the
721  *	inode. It stores the information about that chain in the branch[], in
722  *	the same format as ext4_get_branch() would do. We are calling it after
723  *	we had read the existing part of chain and partial points to the last
724  *	triple of that (one with zero ->key). Upon the exit we have the same
725  *	picture as after the successful ext4_get_block(), except that in one
726  *	place chain is disconnected - *branch->p is still zero (we did not
727  *	set the last link), but branch->key contains the number that should
728  *	be placed into *branch->p to fill that gap.
729  *
730  *	If allocation fails we free all blocks we've allocated (and forget
731  *	their buffer_heads) and return the error value from the failed
732  *	ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
733  *	as described above and return 0.
734  */
735 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
736 			     ext4_lblk_t iblock, int indirect_blks,
737 			     int *blks, ext4_fsblk_t goal,
738 			     ext4_lblk_t *offsets, Indirect *branch)
739 {
740 	int blocksize = inode->i_sb->s_blocksize;
741 	int i, n = 0;
742 	int err = 0;
743 	struct buffer_head *bh;
744 	int num;
745 	ext4_fsblk_t new_blocks[4];
746 	ext4_fsblk_t current_block;
747 
748 	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
749 				*blks, new_blocks, &err);
750 	if (err)
751 		return err;
752 
753 	branch[0].key = cpu_to_le32(new_blocks[0]);
754 	/*
755 	 * metadata blocks and data blocks are allocated.
756 	 */
757 	for (n = 1; n <= indirect_blks;  n++) {
758 		/*
759 		 * Get buffer_head for parent block, zero it out
760 		 * and set the pointer to new one, then send
761 		 * parent to disk.
762 		 */
763 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
764 		if (unlikely(!bh)) {
765 			err = -EIO;
766 			goto failed;
767 		}
768 
769 		branch[n].bh = bh;
770 		lock_buffer(bh);
771 		BUFFER_TRACE(bh, "call get_create_access");
772 		err = ext4_journal_get_create_access(handle, bh);
773 		if (err) {
774 			/* Don't brelse(bh) here; it's done in
775 			 * ext4_journal_forget() below */
776 			unlock_buffer(bh);
777 			goto failed;
778 		}
779 
780 		memset(bh->b_data, 0, blocksize);
781 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
782 		branch[n].key = cpu_to_le32(new_blocks[n]);
783 		*branch[n].p = branch[n].key;
784 		if (n == indirect_blks) {
785 			current_block = new_blocks[n];
786 			/*
787 			 * End of chain, update the last new metablock of
788 			 * the chain to point to the newly allocated
789 			 * data block numbers.
790 			 */
791 			for (i = 1; i < num; i++)
792 				*(branch[n].p + i) = cpu_to_le32(++current_block);
793 		}
794 		BUFFER_TRACE(bh, "marking uptodate");
795 		set_buffer_uptodate(bh);
796 		unlock_buffer(bh);
797 
798 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
799 		err = ext4_handle_dirty_metadata(handle, inode, bh);
800 		if (err)
801 			goto failed;
802 	}
803 	*blks = num;
804 	return err;
805 failed:
806 	/* Allocation failed, free what we already allocated */
807 	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
808 	for (i = 1; i <= n; i++) {
809 		/*
810 		 * branch[i].bh is newly allocated, so there is no
811 		 * need to revoke the block, which is why we don't
812 		 * need to set EXT4_FREE_BLOCKS_METADATA.
813 		 */
814 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
815 				 EXT4_FREE_BLOCKS_FORGET);
816 	}
817 	for (i = n+1; i < indirect_blks; i++)
818 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
819 
820 	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
821 
822 	return err;
823 }
824 
825 /**
826  * ext4_splice_branch - splice the allocated branch onto inode.
827  * @inode: owner
828  * @block: (logical) number of block we are adding
829  * @chain: chain of indirect blocks (with a missing link - see
830  *	ext4_alloc_branch)
831  * @where: location of missing link
832  * @num:   number of indirect blocks we are adding
833  * @blks:  number of direct blocks we are adding
834  *
835  * This function fills the missing link and does all housekeeping needed in
836  * inode (->i_blocks, etc.). In case of success we end up with the full
837  * chain to the new block and return 0.
838  */
839 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
840 			      ext4_lblk_t block, Indirect *where, int num,
841 			      int blks)
842 {
843 	int i;
844 	int err = 0;
845 	ext4_fsblk_t current_block;
846 
847 	/*
848 	 * If we're splicing into a [td]indirect block (as opposed to the
849 	 * inode) then we need to get write access to the [td]indirect block
850 	 * before the splice.
851 	 */
852 	if (where->bh) {
853 		BUFFER_TRACE(where->bh, "get_write_access");
854 		err = ext4_journal_get_write_access(handle, where->bh);
855 		if (err)
856 			goto err_out;
857 	}
858 	/* That's it */
859 
860 	*where->p = where->key;
861 
862 	/*
863 	 * Update the host buffer_head or inode to point to the rest of
864 	 * the just-allocated direct blocks.
865 	 */
866 	if (num == 0 && blks > 1) {
867 		current_block = le32_to_cpu(where->key) + 1;
868 		for (i = 1; i < blks; i++)
869 			*(where->p + i) = cpu_to_le32(current_block++);
870 	}
871 
872 	/* We are done with atomic stuff, now do the rest of housekeeping */
873 	/* had we spliced it onto indirect block? */
874 	if (where->bh) {
875 		/*
876 		 * If we spliced it onto an indirect block, we haven't
877 		 * altered the inode.  Note however that if it is being spliced
878 		 * onto an indirect block at the very end of the file (the
879 		 * file is growing) then we *will* alter the inode to reflect
880 		 * the new i_size.  But that is not done here - it is done in
881 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
882 		 */
883 		jbd_debug(5, "splicing indirect only\n");
884 		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
885 		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
886 		if (err)
887 			goto err_out;
888 	} else {
889 		/*
890 		 * OK, we spliced it into the inode itself on a direct block.
891 		 */
892 		ext4_mark_inode_dirty(handle, inode);
893 		jbd_debug(5, "splicing direct\n");
894 	}
895 	return err;
896 
897 err_out:
898 	for (i = 1; i <= num; i++) {
899 		/*
900 		 * branch[i].bh is newly allocated, so there is no
901 		 * need to revoke the block, which is why we don't
902 		 * need to set EXT4_FREE_BLOCKS_METADATA.
903 		 */
904 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
905 				 EXT4_FREE_BLOCKS_FORGET);
906 	}
907 	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
908 			 blks, 0);
909 
910 	return err;
911 }
912 
913 /*
914  * The ext4_ind_map_blocks() function handles non-extent inodes
915  * (i.e., using the traditional indirect/double-indirect i_blocks
916  * scheme) for ext4_map_blocks().
917  *
918  * Allocation strategy is simple: if we have to allocate something, we will
919  * have to go the whole way to leaf. So let's do it before attaching anything
920  * to tree, set linkage between the newborn blocks, write them if sync is
921  * required, recheck the path, free and repeat if check fails, otherwise
922  * set the last missing link (that will protect us from any truncate-generated
923  * removals - all blocks on the path are immune now) and possibly force the
924  * write on the parent block.
925  * That has a nice additional property: no special recovery from the failed
926  * allocations is needed - we simply release blocks and do not touch anything
927  * reachable from inode.
928  *
929  * `handle' can be NULL if create == 0.
930  *
931  * return > 0, # of blocks mapped or allocated.
932  * return = 0, if plain lookup failed.
933  * return < 0, error case.
934  *
935  * The ext4_ind_map_blocks() function should be called with
936  * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
937  * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
938  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
939  * blocks.
940  */
941 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
942 			       struct ext4_map_blocks *map,
943 			       int flags)
944 {
945 	int err = -EIO;
946 	ext4_lblk_t offsets[4];
947 	Indirect chain[4];
948 	Indirect *partial;
949 	ext4_fsblk_t goal;
950 	int indirect_blks;
951 	int blocks_to_boundary = 0;
952 	int depth;
953 	int count = 0;
954 	ext4_fsblk_t first_block = 0;
955 
956 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
957 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
958 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
959 				   &blocks_to_boundary);
960 
961 	if (depth == 0)
962 		goto out;
963 
964 	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
965 
966 	/* Simplest case - block found, no allocation needed */
967 	if (!partial) {
968 		first_block = le32_to_cpu(chain[depth - 1].key);
969 		count++;
970 		/* map more blocks */
971 		while (count < map->m_len && count <= blocks_to_boundary) {
972 			ext4_fsblk_t blk;
973 
974 			blk = le32_to_cpu(*(chain[depth-1].p + count));
975 
976 			if (blk == first_block + count)
977 				count++;
978 			else
979 				break;
980 		}
981 		goto got_it;
982 	}
983 
984 	/* Next simple case - plain lookup or failed read of indirect block */
985 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
986 		goto cleanup;
987 
988 	/*
989 	 * Okay, we need to do block allocation.
990 	 */
991 	goal = ext4_find_goal(inode, map->m_lblk, partial);
992 
993 	/* the number of blocks we need to allocate for [d,t]indirect blocks */
994 	indirect_blks = (chain + depth) - partial - 1;
995 
996 	/*
997 	 * Next look up the indirect map to count the total number of
998 	 * direct blocks to allocate for this branch.
999 	 */
1000 	count = ext4_blks_to_allocate(partial, indirect_blks,
1001 				      map->m_len, blocks_to_boundary);
1002 	/*
1003 	 * Block out ext4_truncate while we alter the tree
1004 	 */
1005 	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1006 				&count, goal,
1007 				offsets + (partial - chain), partial);
1008 
1009 	/*
1010 	 * The ext4_splice_branch call will free and forget any buffers
1011 	 * on the new chain if there is a failure, but that risks using
1012 	 * up transaction credits, especially for bitmaps where the
1013 	 * credits cannot be returned.  Can we handle this somehow?  We
1014 	 * may need to return -EAGAIN upwards in the worst case.  --sct
1015 	 */
1016 	if (!err)
1017 		err = ext4_splice_branch(handle, inode, map->m_lblk,
1018 					 partial, indirect_blks, count);
1019 	if (err)
1020 		goto cleanup;
1021 
1022 	map->m_flags |= EXT4_MAP_NEW;
1023 
1024 	ext4_update_inode_fsync_trans(handle, inode, 1);
1025 got_it:
1026 	map->m_flags |= EXT4_MAP_MAPPED;
1027 	map->m_pblk = le32_to_cpu(chain[depth-1].key);
1028 	map->m_len = count;
1029 	if (count > blocks_to_boundary)
1030 		map->m_flags |= EXT4_MAP_BOUNDARY;
1031 	err = count;
1032 	/* Clean up and exit */
1033 	partial = chain + depth - 1;	/* the whole chain */
1034 cleanup:
1035 	while (partial > chain) {
1036 		BUFFER_TRACE(partial->bh, "call brelse");
1037 		brelse(partial->bh);
1038 		partial--;
1039 	}
1040 out:
1041 	return err;
1042 }
1043 
1044 #ifdef CONFIG_QUOTA
1045 qsize_t *ext4_get_reserved_space(struct inode *inode)
1046 {
1047 	return &EXT4_I(inode)->i_reserved_quota;
1048 }
1049 #endif
1050 
1051 /*
1052  * Calculate the number of metadata blocks we need to reserve
1053  * to allocate a new block at @lblock for a non-extent file
1054  */
1055 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1056 					      sector_t lblock)
1057 {
1058 	struct ext4_inode_info *ei = EXT4_I(inode);
1059 	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1060 	int blk_bits;
1061 
1062 	if (lblock < EXT4_NDIR_BLOCKS)
1063 		return 0;
1064 
1065 	lblock -= EXT4_NDIR_BLOCKS;
1066 
1067 	if (ei->i_da_metadata_calc_len &&
1068 	    (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1069 		ei->i_da_metadata_calc_len++;
1070 		return 0;
1071 	}
1072 	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1073 	ei->i_da_metadata_calc_len = 1;
1074 	blk_bits = order_base_2(lblock);
1075 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1076 }
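/*
 * Example (assuming a 4KB block size, so EXT4_ADDR_PER_BLOCK_BITS == 10):
 * the first write into the single-indirect range (lblock == 5 after
 * EXT4_NDIR_BLOCKS is subtracted) gives order_base_2(5) == 3 and
 * reserves 3 / 10 + 1 == 1 metadata block; a write at lblock == 2000
 * gives order_base_2(2000) == 11 and reserves 11 / 10 + 1 == 2 blocks
 * (an indirect and a double-indirect block).
 */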
1077 
1078 /*
1079  * Calculate the number of metadata blocks we need to reserve
1080  * to allocate a block located at @lblock
1081  */
1082 static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1083 {
1084 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1085 		return ext4_ext_calc_metadata_amount(inode, lblock);
1086 
1087 	return ext4_indirect_calc_metadata_amount(inode, lblock);
1088 }
1089 
1090 /*
1091  * Called with i_data_sem down, which is important since we can call
1092  * ext4_discard_preallocations() from here.
1093  */
1094 void ext4_da_update_reserve_space(struct inode *inode,
1095 					int used, int quota_claim)
1096 {
1097 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1098 	struct ext4_inode_info *ei = EXT4_I(inode);
1099 
1100 	spin_lock(&ei->i_block_reservation_lock);
1101 	trace_ext4_da_update_reserve_space(inode, used);
1102 	if (unlikely(used > ei->i_reserved_data_blocks)) {
1103 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1104 			 "with only %d reserved data blocks\n",
1105 			 __func__, inode->i_ino, used,
1106 			 ei->i_reserved_data_blocks);
1107 		WARN_ON(1);
1108 		used = ei->i_reserved_data_blocks;
1109 	}
1110 
1111 	/* Update per-inode reservations */
1112 	ei->i_reserved_data_blocks -= used;
1113 	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1114 	percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1115 			   used + ei->i_allocated_meta_blocks);
1116 	ei->i_allocated_meta_blocks = 0;
1117 
1118 	if (ei->i_reserved_data_blocks == 0) {
1119 		/*
1120 		 * We can release all of the reserved metadata blocks
1121 		 * only when we have written all of the delayed
1122 		 * allocation blocks.
1123 		 */
1124 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1125 				   ei->i_reserved_meta_blocks);
1126 		ei->i_reserved_meta_blocks = 0;
1127 		ei->i_da_metadata_calc_len = 0;
1128 	}
1129 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1130 
1131 	/* Update quota subsystem for data blocks */
1132 	if (quota_claim)
1133 		dquot_claim_block(inode, used);
1134 	else {
1135 		/*
1136 		 * We did fallocate with an offset that is already delayed
1137 		 * allocated. So on delayed-allocation writeback we should
1138 		 * not re-claim the quota for fallocated blocks.
1139 		 */
1140 		dquot_release_reservation_block(inode, used);
1141 	}
1142 
1143 	/*
1144 	 * If we have done all the pending block allocations and if
1145 	 * there aren't any writers on the inode, we can discard the
1146 	 * inode's preallocations.
1147 	 */
1148 	if ((ei->i_reserved_data_blocks == 0) &&
1149 	    (atomic_read(&inode->i_writecount) == 0))
1150 		ext4_discard_preallocations(inode);
1151 }
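/*
 * Example of the bookkeeping above (illustrative numbers): if 10 data
 * blocks were reserved and a writeback allocates used == 4 of them with
 * i_allocated_meta_blocks == 1, then i_reserved_data_blocks drops to 6,
 * s_dirtyblocks_counter is reduced by 4 + 1 == 5, and the quota code
 * either claims (quota_claim != 0) or releases the reservation for the
 * 4 data blocks.
 */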
1152 
1153 static int __check_block_validity(struct inode *inode, const char *func,
1154 				unsigned int line,
1155 				struct ext4_map_blocks *map)
1156 {
1157 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1158 				   map->m_len)) {
1159 		ext4_error_inode(inode, func, line, map->m_pblk,
1160 				 "lblock %lu mapped to illegal pblock "
1161 				 "(length %d)", (unsigned long) map->m_lblk,
1162 				 map->m_len);
1163 		return -EIO;
1164 	}
1165 	return 0;
1166 }
1167 
1168 #define check_block_validity(inode, map)	\
1169 	__check_block_validity((inode), __func__, __LINE__, (map))
1170 
1171 /*
1172  * Return the number of contiguous dirty pages in a given inode
1173  * starting at page frame idx.
1174  */
1175 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1176 				    unsigned int max_pages)
1177 {
1178 	struct address_space *mapping = inode->i_mapping;
1179 	pgoff_t	index;
1180 	struct pagevec pvec;
1181 	pgoff_t num = 0;
1182 	int i, nr_pages, done = 0;
1183 
1184 	if (max_pages == 0)
1185 		return 0;
1186 	pagevec_init(&pvec, 0);
1187 	while (!done) {
1188 		index = idx;
1189 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1190 					      PAGECACHE_TAG_DIRTY,
1191 					      (pgoff_t)PAGEVEC_SIZE);
1192 		if (nr_pages == 0)
1193 			break;
1194 		for (i = 0; i < nr_pages; i++) {
1195 			struct page *page = pvec.pages[i];
1196 			struct buffer_head *bh, *head;
1197 
1198 			lock_page(page);
1199 			if (unlikely(page->mapping != mapping) ||
1200 			    !PageDirty(page) ||
1201 			    PageWriteback(page) ||
1202 			    page->index != idx) {
1203 				done = 1;
1204 				unlock_page(page);
1205 				break;
1206 			}
1207 			if (page_has_buffers(page)) {
1208 				bh = head = page_buffers(page);
1209 				do {
1210 					if (!buffer_delay(bh) &&
1211 					    !buffer_unwritten(bh))
1212 						done = 1;
1213 					bh = bh->b_this_page;
1214 				} while (!done && (bh != head));
1215 			}
1216 			unlock_page(page);
1217 			if (done)
1218 				break;
1219 			idx++;
1220 			num++;
1221 			if (num >= max_pages) {
1222 				done = 1;
1223 				break;
1224 			}
1225 		}
1226 		pagevec_release(&pvec);
1227 	}
1228 	return num;
1229 }
1230 
1231 /*
1232  * The ext4_map_blocks() function tries to look up the requested blocks,
1233  * and returns if the blocks are already mapped.
1234  *
1235  * Otherwise it takes the write lock of i_data_sem, allocates blocks,
1236  * stores the allocated blocks in the result buffer head, and marks it
1237  * mapped.
1238  *
1239  * If the file is extent-based, it will call ext4_ext_map_blocks();
1240  * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapping
1241  * based files.
1242  *
1243  * On success, it returns the number of blocks mapped or allocated.
1244  * If create==0 and the blocks are pre-allocated and uninitialized,
1245  * the result buffer head is unmapped. If create==1, it will make sure
1246  * the buffer head is mapped.
1247  *
1248  * It returns 0 if a plain lookup failed (blocks have not been allocated);
1249  * in that case, the buffer head is unmapped.
1250  *
1251  * It returns the error in case of allocation failure.
1252  */
1253 int ext4_map_blocks(handle_t *handle, struct inode *inode,
1254 		    struct ext4_map_blocks *map, int flags)
1255 {
1256 	int retval;
1257 
1258 	map->m_flags = 0;
1259 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1260 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
1261 		  (unsigned long) map->m_lblk);
1262 	/*
1263 	 * Try to see if we can get the block without requesting a new
1264 	 * file system block.
1265 	 */
1266 	down_read((&EXT4_I(inode)->i_data_sem));
1267 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1268 		retval = ext4_ext_map_blocks(handle, inode, map, 0);
1269 	} else {
1270 		retval = ext4_ind_map_blocks(handle, inode, map, 0);
1271 	}
1272 	up_read((&EXT4_I(inode)->i_data_sem));
1273 
1274 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1275 		int ret = check_block_validity(inode, map);
1276 		if (ret != 0)
1277 			return ret;
1278 	}
1279 
1280 	/* If it is only a block(s) look up */
1281 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1282 		return retval;
1283 
1284 	/*
1285 	 * Returns if the blocks have already allocated
1286 	 *
1287 	 * Note that if blocks have been preallocated
1288 	 * ext4_ext_get_block() returns th create = 0
1289 	 * with buffer head unmapped.
1290 	 */
1291 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1292 		return retval;
1293 
1294 	/*
1295 	 * When we call get_blocks without the create flag, the
1296 	 * BH_Unwritten flag could have gotten set if the blocks
1297 	 * requested were part of an uninitialized extent.  We need to
1298 	 * clear this flag now that we are committed to convert all or
1299 	 * part of the uninitialized extent to be an initialized
1300 	 * extent.  This is because we need to avoid the combination
1301 	 * of BH_Unwritten and BH_Mapped flags being simultaneously
1302 	 * set on the buffer_head.
1303 	 */
1304 	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1305 
1306 	/*
1307 	 * New block allocation and/or writing to an uninitialized extent
1308 	 * will possibly result in updating i_data, so we take
1309 	 * the write lock of i_data_sem, and call get_blocks()
1310 	 * with the create == 1 flag.
1311 	 */
1312 	down_write((&EXT4_I(inode)->i_data_sem));
1313 
1314 	/*
1315 	 * If the caller is from the delayed allocation writeout path,
1316 	 * we have already reserved fs blocks for the allocation;
1317 	 * let the underlying get_block() function know, to
1318 	 * avoid double accounting.
1319 	 */
1320 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1321 		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
1322 	/*
1323 	 * We need to check for EXT4 here because migrate
1324 	 * could have changed the inode type in between
1325 	 */
1326 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1327 		retval = ext4_ext_map_blocks(handle, inode, map, flags);
1328 	} else {
1329 		retval = ext4_ind_map_blocks(handle, inode, map, flags);
1330 
1331 		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1332 			/*
1333 			 * We allocated new blocks which will result in
1334 			 * i_data's format changing.  Force the migrate
1335 			 * to fail by clearing migrate flags
1336 			 */
1337 			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1338 		}
1339 
1340 		/*
1341 		 * Update reserved blocks/metadata blocks after successful
1342 		 * block allocation which had been deferred till now. We don't
1343 		 * support fallocate for non-extent files, so we can update
1344 		 * the reserved space here.
1345 		 */
1346 		if ((retval > 0) &&
1347 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1348 			ext4_da_update_reserve_space(inode, retval, 1);
1349 	}
1350 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1351 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1352 
1353 	up_write((&EXT4_I(inode)->i_data_sem));
1354 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1355 		int ret = check_block_validity(inode, map);
1356 		if (ret != 0)
1357 			return ret;
1358 	}
1359 	return retval;
1360 }
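/*
 * A minimal lookup sketch (no allocation requested, so a NULL handle is
 * allowed per the rules above; lblk and inode are assumed to be in
 * scope):
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
 *	int ret = ext4_map_blocks(NULL, inode, &map, 0);
 *
 * On ret > 0 with EXT4_MAP_MAPPED set, physical blocks map.m_pblk ..
 * map.m_pblk + map.m_len - 1 back the file starting at lblk; ret == 0
 * means a hole.
 */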
1361 
1362 /* Maximum number of blocks we map for direct IO at once. */
1363 #define DIO_MAX_BLOCKS 4096
1364 
1365 static int _ext4_get_block(struct inode *inode, sector_t iblock,
1366 			   struct buffer_head *bh, int flags)
1367 {
1368 	handle_t *handle = ext4_journal_current_handle();
1369 	struct ext4_map_blocks map;
1370 	int ret = 0, started = 0;
1371 	int dio_credits;
1372 
1373 	map.m_lblk = iblock;
1374 	map.m_len = bh->b_size >> inode->i_blkbits;
1375 
1376 	if (flags && !handle) {
1377 		/* Direct IO write... */
1378 		if (map.m_len > DIO_MAX_BLOCKS)
1379 			map.m_len = DIO_MAX_BLOCKS;
1380 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1381 		handle = ext4_journal_start(inode, dio_credits);
1382 		if (IS_ERR(handle)) {
1383 			ret = PTR_ERR(handle);
1384 			return ret;
1385 		}
1386 		started = 1;
1387 	}
1388 
1389 	ret = ext4_map_blocks(handle, inode, &map, flags);
1390 	if (ret > 0) {
1391 		map_bh(bh, inode->i_sb, map.m_pblk);
1392 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1393 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1394 		ret = 0;
1395 	}
1396 	if (started)
1397 		ext4_journal_stop(handle);
1398 	return ret;
1399 }
1400 
1401 int ext4_get_block(struct inode *inode, sector_t iblock,
1402 		   struct buffer_head *bh, int create)
1403 {
1404 	return _ext4_get_block(inode, iblock, bh,
1405 			       create ? EXT4_GET_BLOCKS_CREATE : 0);
1406 }
1407 
1408 /*
1409  * `handle' can be NULL if create is zero
1410  */
1411 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1412 				ext4_lblk_t block, int create, int *errp)
1413 {
1414 	struct ext4_map_blocks map;
1415 	struct buffer_head *bh;
1416 	int fatal = 0, err;
1417 
1418 	J_ASSERT(handle != NULL || create == 0);
1419 
1420 	map.m_lblk = block;
1421 	map.m_len = 1;
1422 	err = ext4_map_blocks(handle, inode, &map,
1423 			      create ? EXT4_GET_BLOCKS_CREATE : 0);
1424 
1425 	if (err < 0)
1426 		*errp = err;
1427 	if (err <= 0)
1428 		return NULL;
1429 	*errp = 0;
1430 
1431 	bh = sb_getblk(inode->i_sb, map.m_pblk);
1432 	if (!bh) {
1433 		*errp = -EIO;
1434 		return NULL;
1435 	}
1436 	if (map.m_flags & EXT4_MAP_NEW) {
1437 		J_ASSERT(create != 0);
1438 		J_ASSERT(handle != NULL);
1439 
1440 		/*
1441 		 * Now that we do not always journal data, we should
1442 		 * keep in mind whether this should always journal the
1443 		 * new buffer as metadata.  For now, regular file
1444 		 * writes use ext4_get_block instead, so it's not a
1445 		 * problem.
1446 		 */
1447 		lock_buffer(bh);
1448 		BUFFER_TRACE(bh, "call get_create_access");
1449 		fatal = ext4_journal_get_create_access(handle, bh);
1450 		if (!fatal && !buffer_uptodate(bh)) {
1451 			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1452 			set_buffer_uptodate(bh);
1453 		}
1454 		unlock_buffer(bh);
1455 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1456 		err = ext4_handle_dirty_metadata(handle, inode, bh);
1457 		if (!fatal)
1458 			fatal = err;
1459 	} else {
1460 		BUFFER_TRACE(bh, "not a new buffer");
1461 	}
1462 	if (fatal) {
1463 		*errp = fatal;
1464 		brelse(bh);
1465 		bh = NULL;
1466 	}
1467 	return bh;
1468 }
1469 
1470 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1471 			       ext4_lblk_t block, int create, int *err)
1472 {
1473 	struct buffer_head *bh;
1474 
1475 	bh = ext4_getblk(handle, inode, block, create, err);
1476 	if (!bh)
1477 		return bh;
1478 	if (buffer_uptodate(bh))
1479 		return bh;
1480 	ll_rw_block(READ_META, 1, &bh);
1481 	wait_on_buffer(bh);
1482 	if (buffer_uptodate(bh))
1483 		return bh;
1484 	put_bh(bh);
1485 	*err = -EIO;
1486 	return NULL;
1487 }
1488 
1489 static int walk_page_buffers(handle_t *handle,
1490 			     struct buffer_head *head,
1491 			     unsigned from,
1492 			     unsigned to,
1493 			     int *partial,
1494 			     int (*fn)(handle_t *handle,
1495 				       struct buffer_head *bh))
1496 {
1497 	struct buffer_head *bh;
1498 	unsigned block_start, block_end;
1499 	unsigned blocksize = head->b_size;
1500 	int err, ret = 0;
1501 	struct buffer_head *next;
1502 
1503 	for (bh = head, block_start = 0;
1504 	     ret == 0 && (bh != head || !block_start);
1505 	     block_start = block_end, bh = next) {
1506 		next = bh->b_this_page;
1507 		block_end = block_start + blocksize;
1508 		if (block_end <= from || block_start >= to) {
1509 			if (partial && !buffer_uptodate(bh))
1510 				*partial = 1;
1511 			continue;
1512 		}
1513 		err = (*fn)(handle, bh);
1514 		if (!ret)
1515 			ret = err;
1516 	}
1517 	return ret;
1518 }
1519 
1520 /*
1521  * To preserve ordering, it is essential that the hole instantiation and
1522  * the data write be encapsulated in a single transaction.  We cannot
1523  * close off a transaction and start a new one between the ext4_get_block()
1524  * and the commit_write().  So doing the jbd2_journal_start at the start of
1525  * prepare_write() is the right place.
1526  *
1527  * Also, this function can nest inside ext4_writepage() ->
1528  * block_write_full_page(). In that case, we *know* that ext4_writepage()
1529  * has generated enough buffer credits to do the whole page.  So we won't
1530  * block on the journal in that case, which is good, because the caller may
1531  * be PF_MEMALLOC.
1532  *
1533  * By accident, ext4 can be reentered when a transaction is open via
1534  * quota file writes.  If we were to commit the transaction while thus
1535  * reentered, there can be a deadlock - we would be holding a quota
1536  * lock, and the commit would never complete if another thread had a
1537  * transaction open and was blocking on the quota lock - a ranking
1538  * violation.
1539  *
1540  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1541  * will _not_ run commit under these circumstances because handle->h_ref
1542  * is elevated.  We'll still have enough credits for the tiny quotafile
1543  * write.
1544  */
1545 static int do_journal_get_write_access(handle_t *handle,
1546 				       struct buffer_head *bh)
1547 {
1548 	int dirty = buffer_dirty(bh);
1549 	int ret;
1550 
1551 	if (!buffer_mapped(bh) || buffer_freed(bh))
1552 		return 0;
1553 	/*
1554 	 * __block_write_begin() could have dirtied some buffers. Clean
1555 	 * the dirty bit as jbd2_journal_get_write_access() could complain
1556 	 * otherwise about fs integrity issues. Setting of the dirty bit
1557 	 * by __block_write_begin() isn't a real problem here as we clear
1558 	 * the bit before releasing a page lock and thus writeback cannot
1559 	 * ever write the buffer.
1560 	 */
1561 	if (dirty)
1562 		clear_buffer_dirty(bh);
1563 	ret = ext4_journal_get_write_access(handle, bh);
1564 	if (!ret && dirty)
1565 		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1566 	return ret;
1567 }
1568 
1569 /*
1570  * Truncate blocks that were not used by the write. We have to truncate the
1571  * pagecache as well so that corresponding buffers get properly unmapped.
1572  */
1573 static void ext4_truncate_failed_write(struct inode *inode)
1574 {
1575 	truncate_inode_pages(inode->i_mapping, inode->i_size);
1576 	ext4_truncate(inode);
1577 }
1578 
1579 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1580 		   struct buffer_head *bh_result, int create);
1581 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1582 			    loff_t pos, unsigned len, unsigned flags,
1583 			    struct page **pagep, void **fsdata)
1584 {
1585 	struct inode *inode = mapping->host;
1586 	int ret, needed_blocks;
1587 	handle_t *handle;
1588 	int retries = 0;
1589 	struct page *page;
1590 	pgoff_t index;
1591 	unsigned from, to;
1592 
1593 	trace_ext4_write_begin(inode, pos, len, flags);
1594 	/*
1595 	 * Reserve one block more for addition to orphan list in case
1596 	 * we allocate blocks but the write fails for some reason
1597 	 */
1598 	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1599 	index = pos >> PAGE_CACHE_SHIFT;
1600 	from = pos & (PAGE_CACHE_SIZE - 1);
1601 	to = from + len;
1602 
1603 retry:
1604 	handle = ext4_journal_start(inode, needed_blocks);
1605 	if (IS_ERR(handle)) {
1606 		ret = PTR_ERR(handle);
1607 		goto out;
1608 	}
1609 
1610 	/* We cannot recurse into the filesystem as the transaction is already
1611 	 * started */
1612 	flags |= AOP_FLAG_NOFS;
1613 
1614 	page = grab_cache_page_write_begin(mapping, index, flags);
1615 	if (!page) {
1616 		ext4_journal_stop(handle);
1617 		ret = -ENOMEM;
1618 		goto out;
1619 	}
1620 	*pagep = page;
1621 
1622 	if (ext4_should_dioread_nolock(inode))
1623 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1624 	else
1625 		ret = __block_write_begin(page, pos, len, ext4_get_block);
1626 
1627 	if (!ret && ext4_should_journal_data(inode)) {
1628 		ret = walk_page_buffers(handle, page_buffers(page),
1629 				from, to, NULL, do_journal_get_write_access);
1630 	}
1631 
1632 	if (ret) {
1633 		unlock_page(page);
1634 		page_cache_release(page);
1635 		/*
1636 		 * __block_write_begin may have instantiated a few blocks
1637 		 * outside i_size.  Trim these off again. Don't need
1638 		 * i_size_read because we hold i_mutex.
1639 		 *
1640 		 * Add inode to orphan list in case we crash before
1641 		 * truncate finishes
1642 		 */
1643 		if (pos + len > inode->i_size && ext4_can_truncate(inode))
1644 			ext4_orphan_add(handle, inode);
1645 
1646 		ext4_journal_stop(handle);
1647 		if (pos + len > inode->i_size) {
1648 			ext4_truncate_failed_write(inode);
1649 			/*
1650 			 * If truncate failed early the inode might
1651 			 * still be on the orphan list; we need to
1652 			 * make sure the inode is removed from the
1653 			 * orphan list in that case.
1654 			 */
1655 			if (inode->i_nlink)
1656 				ext4_orphan_del(NULL, inode);
1657 		}
1658 	}
1659 
1660 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1661 		goto retry;
1662 out:
1663 	return ret;
1664 }
1665 
1666 /* For write_end() in data=journal mode */
1667 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1668 {
1669 	if (!buffer_mapped(bh) || buffer_freed(bh))
1670 		return 0;
1671 	set_buffer_uptodate(bh);
1672 	return ext4_handle_dirty_metadata(handle, NULL, bh);
1673 }
1674 
1675 static int ext4_generic_write_end(struct file *file,
1676 				  struct address_space *mapping,
1677 				  loff_t pos, unsigned len, unsigned copied,
1678 				  struct page *page, void *fsdata)
1679 {
1680 	int i_size_changed = 0;
1681 	struct inode *inode = mapping->host;
1682 	handle_t *handle = ext4_journal_current_handle();
1683 
1684 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1685 
1686 	/*
1687 	 * No need to use i_size_read() here, the i_size
1688 	 * cannot change under us because we hold i_mutex.
1689 	 *
1690 	 * But it's important to update i_size while still holding page lock:
1691 	 * page writeout could otherwise come in and zero beyond i_size.
1692 	 */
1693 	if (pos + copied > inode->i_size) {
1694 		i_size_write(inode, pos + copied);
1695 		i_size_changed = 1;
1696 	}
1697 
1698 	if (pos + copied > EXT4_I(inode)->i_disksize) {
1699 		/* We need to mark the inode dirty even if
1700 		 * new_i_size is less than inode->i_size
1701 		 * but greater than i_disksize (hint: delalloc).
1702 		 */
1703 		ext4_update_i_disksize(inode, (pos + copied));
1704 		i_size_changed = 1;
1705 	}
1706 	unlock_page(page);
1707 	page_cache_release(page);
1708 
1709 	/*
1710 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
1711 	 * makes the holding time of page lock longer. Second, it forces lock
1712 	 * ordering of page lock and transaction start for journaling
1713 	 * filesystems.
1714 	 */
1715 	if (i_size_changed)
1716 		ext4_mark_inode_dirty(handle, inode);
1717 
1718 	return copied;
1719 }
1720 
1721 /*
1722  * We need to pick up the new inode size which generic_commit_write gave us;
1723  * `file' can be NULL - eg, when called from page_symlink().
1724  *
1725  * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
1726  * buffers are managed internally.
1727  */
1728 static int ext4_ordered_write_end(struct file *file,
1729 				  struct address_space *mapping,
1730 				  loff_t pos, unsigned len, unsigned copied,
1731 				  struct page *page, void *fsdata)
1732 {
1733 	handle_t *handle = ext4_journal_current_handle();
1734 	struct inode *inode = mapping->host;
1735 	int ret = 0, ret2;
1736 
1737 	trace_ext4_ordered_write_end(inode, pos, len, copied);
1738 	ret = ext4_jbd2_file_inode(handle, inode);
1739 
1740 	if (ret == 0) {
1741 		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1742 							page, fsdata);
1743 		copied = ret2;
1744 		if (pos + len > inode->i_size && ext4_can_truncate(inode))
1745 			/* If we have allocated more blocks than we
1746 			 * copied, we will have blocks allocated
1747 			 * outside inode->i_size, so truncate them.
1748 			 */
1749 			ext4_orphan_add(handle, inode);
1750 		if (ret2 < 0)
1751 			ret = ret2;
1752 	}
1753 	ret2 = ext4_journal_stop(handle);
1754 	if (!ret)
1755 		ret = ret2;
1756 
1757 	if (pos + len > inode->i_size) {
1758 		ext4_truncate_failed_write(inode);
1759 		/*
1760 		 * If truncate failed early the inode might still be
1761 		 * on the orphan list; we need to make sure the inode
1762 		 * is removed from the orphan list in that case.
1763 		 */
1764 		if (inode->i_nlink)
1765 			ext4_orphan_del(NULL, inode);
1766 	}
1767 
1768 
1769 	return ret ? ret : copied;
1770 }
1771 
1772 static int ext4_writeback_write_end(struct file *file,
1773 				    struct address_space *mapping,
1774 				    loff_t pos, unsigned len, unsigned copied,
1775 				    struct page *page, void *fsdata)
1776 {
1777 	handle_t *handle = ext4_journal_current_handle();
1778 	struct inode *inode = mapping->host;
1779 	int ret = 0, ret2;
1780 
1781 	trace_ext4_writeback_write_end(inode, pos, len, copied);
1782 	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1783 							page, fsdata);
1784 	copied = ret2;
1785 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
1786 		/* If we have allocated more blocks than we copied,
1787 		 * we will have blocks allocated outside
1788 		 * inode->i_size, so truncate them.
1789 		 */
1790 		ext4_orphan_add(handle, inode);
1791 
1792 	if (ret2 < 0)
1793 		ret = ret2;
1794 
1795 	ret2 = ext4_journal_stop(handle);
1796 	if (!ret)
1797 		ret = ret2;
1798 
1799 	if (pos + len > inode->i_size) {
1800 		ext4_truncate_failed_write(inode);
1801 		/*
1802 		 * If truncate failed early the inode might still be
1803 		 * on the orphan list; we need to make sure the inode
1804 		 * is removed from the orphan list in that case.
1805 		 */
1806 		if (inode->i_nlink)
1807 			ext4_orphan_del(NULL, inode);
1808 	}
1809 
1810 	return ret ? ret : copied;
1811 }
1812 
1813 static int ext4_journalled_write_end(struct file *file,
1814 				     struct address_space *mapping,
1815 				     loff_t pos, unsigned len, unsigned copied,
1816 				     struct page *page, void *fsdata)
1817 {
1818 	handle_t *handle = ext4_journal_current_handle();
1819 	struct inode *inode = mapping->host;
1820 	int ret = 0, ret2;
1821 	int partial = 0;
1822 	unsigned from, to;
1823 	loff_t new_i_size;
1824 
1825 	trace_ext4_journalled_write_end(inode, pos, len, copied);
1826 	from = pos & (PAGE_CACHE_SIZE - 1);
1827 	to = from + len;
1828 
1829 	if (copied < len) {
1830 		if (!PageUptodate(page))
1831 			copied = 0;
1832 		page_zero_new_buffers(page, from+copied, to);
1833 	}
1834 
1835 	ret = walk_page_buffers(handle, page_buffers(page), from,
1836 				to, &partial, write_end_fn);
1837 	if (!partial)
1838 		SetPageUptodate(page);
1839 	new_i_size = pos + copied;
1840 	if (new_i_size > inode->i_size)
1841 		i_size_write(inode, pos+copied);
1842 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1843 	if (new_i_size > EXT4_I(inode)->i_disksize) {
1844 		ext4_update_i_disksize(inode, new_i_size);
1845 		ret2 = ext4_mark_inode_dirty(handle, inode);
1846 		if (!ret)
1847 			ret = ret2;
1848 	}
1849 
1850 	unlock_page(page);
1851 	page_cache_release(page);
1852 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
1853 		/* If we have allocated more blocks than we copied,
1854 		 * we will have blocks allocated outside
1855 		 * inode->i_size, so truncate them.
1856 		 */
1857 		ext4_orphan_add(handle, inode);
1858 
1859 	ret2 = ext4_journal_stop(handle);
1860 	if (!ret)
1861 		ret = ret2;
1862 	if (pos + len > inode->i_size) {
1863 		ext4_truncate_failed_write(inode);
1864 		/*
1865 		 * If truncate failed early the inode might still be
1866 		 * on the orphan list; we need to make sure the inode
1867 		 * is removed from the orphan list in that case.
1868 		 */
1869 		if (inode->i_nlink)
1870 			ext4_orphan_del(NULL, inode);
1871 	}
1872 
1873 	return ret ? ret : copied;
1874 }
1875 
1876 /*
1877  * Reserve a single block located at lblock
1878  */
1879 static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1880 {
1881 	int retries = 0;
1882 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1883 	struct ext4_inode_info *ei = EXT4_I(inode);
1884 	unsigned long md_needed;
1885 	int ret;
1886 
1887 	/*
1888 	 * Recalculate the number of metadata blocks to reserve
1889 	 * in order to allocate this block; the worst case is
1890 	 * one extent per block.
1891 	 */
1892 repeat:
1893 	spin_lock(&ei->i_block_reservation_lock);
1894 	md_needed = ext4_calc_metadata_amount(inode, lblock);
1895 	trace_ext4_da_reserve_space(inode, md_needed);
1896 	spin_unlock(&ei->i_block_reservation_lock);
1897 
1898 	/*
1899 	 * We will charge metadata quota at writeout time; this saves
1900 	 * us from metadata over-estimation, though we may go over by
1901 	 * a small amount in the end.  Here we just reserve for data.
1902 	 */
1903 	ret = dquot_reserve_block(inode, 1);
1904 	if (ret)
1905 		return ret;
1906 	/*
1907 	 * We do still charge estimated metadata to the sb though;
1908 	 * we cannot afford to run out of free blocks.
1909 	 */
1910 	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1911 		dquot_release_reservation_block(inode, 1);
1912 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1913 			yield();
1914 			goto repeat;
1915 		}
1916 		return -ENOSPC;
1917 	}
1918 	spin_lock(&ei->i_block_reservation_lock);
1919 	ei->i_reserved_data_blocks++;
1920 	ei->i_reserved_meta_blocks += md_needed;
1921 	spin_unlock(&ei->i_block_reservation_lock);
1922 
1923 	return 0;       /* success */
1924 }
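
/*
 * Illustrative example (hedged; the exact estimate comes from
 * ext4_calc_metadata_amount(), not shown here): for an extent-mapped
 * file the metadata estimate is typically 0 or 1, so a delayed write
 * claims one data block plus at most one metadata block from the free
 * space -- ext4_claim_free_blocks(sbi, md_needed + 1) -- while quota is
 * charged only for the data block, with metadata quota deferred to
 * writeout time as the comment above notes.
 */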
1925 
1926 static void ext4_da_release_space(struct inode *inode, int to_free)
1927 {
1928 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1929 	struct ext4_inode_info *ei = EXT4_I(inode);
1930 
1931 	if (!to_free)
1932 		return;		/* Nothing to release, exit */
1933 
1934 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1935 
1936 	trace_ext4_da_release_space(inode, to_free);
1937 	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1938 		/*
1939 		 * if there aren't enough reserved blocks, then the
1940 		 * counter is messed up somewhere.  Since this
1941 		 * function is called from invalidate page, it's
1942 		 * harmless to return without any action.
1943 		 */
1944 		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1945 			 "ino %lu, to_free %d with only %d reserved "
1946 			 "data blocks\n", inode->i_ino, to_free,
1947 			 ei->i_reserved_data_blocks);
1948 		WARN_ON(1);
1949 		to_free = ei->i_reserved_data_blocks;
1950 	}
1951 	ei->i_reserved_data_blocks -= to_free;
1952 
1953 	if (ei->i_reserved_data_blocks == 0) {
1954 		/*
1955 		 * We can release all of the reserved metadata blocks
1956 		 * only when we have written all of the delayed
1957 		 * allocation blocks.
1958 		 */
1959 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1960 				   ei->i_reserved_meta_blocks);
1961 		ei->i_reserved_meta_blocks = 0;
1962 		ei->i_da_metadata_calc_len = 0;
1963 	}
1964 
1965 	/* update fs dirty data blocks counter */
1966 	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1967 
1968 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1969 
1970 	dquot_release_reservation_block(inode, to_free);
1971 }
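
/*
 * Example (illustrative): if an inode holds 3 reserved data blocks and
 * 1 estimated metadata block, a truncate that invalidates all 3 delayed
 * buffers ends up here with to_free = 3.  The data counter drops to
 * zero, so the metadata reservation is returned to
 * s_dirtyblocks_counter as well (4 blocks in total), and 3 blocks of
 * quota reservation are released via dquot_release_reservation_block().
 */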
1972 
1973 static void ext4_da_page_release_reservation(struct page *page,
1974 					     unsigned long offset)
1975 {
1976 	int to_release = 0;
1977 	struct buffer_head *head, *bh;
1978 	unsigned int curr_off = 0;
1979 
1980 	head = page_buffers(page);
1981 	bh = head;
1982 	do {
1983 		unsigned int next_off = curr_off + bh->b_size;
1984 
1985 		if ((offset <= curr_off) && (buffer_delay(bh))) {
1986 			to_release++;
1987 			clear_buffer_delay(bh);
1988 		}
1989 		curr_off = next_off;
1990 	} while ((bh = bh->b_this_page) != head);
1991 	ext4_da_release_space(page->mapping->host, to_release);
1992 }
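
/*
 * Example of the walk above (illustrative, 1K blocks on a 4K page):
 * invalidating from offset 0 visits buffers at offsets 0, 1024, 2048
 * and 3072, counting and clearing each delayed one, so a fully delayed
 * page releases 4 reserved blocks; invalidating from offset 2048 can
 * release at most the last 2.
 */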
1993 
1994 /*
1995  * Delayed allocation stuff
1996  */
1997 
1998 /*
1999  * mpage_da_submit_io - walks through the extent of pages and tries to
2000  * write them out with the writepage() callback
2001  *
2002  * @mpd->inode: inode
2003  * @mpd->first_page: first page of the extent
2004  * @mpd->next_page: page after the last page of the extent
2005  *
2006  * By the time mpage_da_submit_io() is called we expect all blocks
2007  * to be allocated; this may be wrong if allocation failed.
2008  *
2009  * As pages are already locked by write_cache_pages(), we can't use it
2010  */
2011 static int mpage_da_submit_io(struct mpage_da_data *mpd,
2012 			      struct ext4_map_blocks *map)
2013 {
2014 	struct pagevec pvec;
2015 	unsigned long index, end;
2016 	int ret = 0, err, nr_pages, i;
2017 	struct inode *inode = mpd->inode;
2018 	struct address_space *mapping = inode->i_mapping;
2019 	loff_t size = i_size_read(inode);
2020 	unsigned int len, block_start;
2021 	struct buffer_head *bh, *page_bufs = NULL;
2022 	int journal_data = ext4_should_journal_data(inode);
2023 	sector_t pblock = 0, cur_logical = 0;
2024 	struct ext4_io_submit io_submit;
2025 
2026 	BUG_ON(mpd->next_page <= mpd->first_page);
2027 	memset(&io_submit, 0, sizeof(io_submit));
2028 	/*
2029 	 * We need to start from the first_page to the next_page - 1
2030 	 * to make sure we also write the mapped dirty buffer_heads.
2031 	 * If we look at mpd->b_blocknr we would only be looking
2032 	 * at the currently mapped buffer_heads.
2033 	 */
2034 	index = mpd->first_page;
2035 	end = mpd->next_page - 1;
2036 
2037 	pagevec_init(&pvec, 0);
2038 	while (index <= end) {
2039 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2040 		if (nr_pages == 0)
2041 			break;
2042 		for (i = 0; i < nr_pages; i++) {
2043 			int commit_write = 0, redirty_page = 0;
2044 			struct page *page = pvec.pages[i];
2045 
2046 			index = page->index;
2047 			if (index > end)
2048 				break;
2049 
2050 			if (index == size >> PAGE_CACHE_SHIFT)
2051 				len = size & ~PAGE_CACHE_MASK;
2052 			else
2053 				len = PAGE_CACHE_SIZE;
2054 			if (map) {
2055 				cur_logical = index << (PAGE_CACHE_SHIFT -
2056 							inode->i_blkbits);
2057 				pblock = map->m_pblk + (cur_logical -
2058 							map->m_lblk);
2059 			}
2060 			index++;
2061 
2062 			BUG_ON(!PageLocked(page));
2063 			BUG_ON(PageWriteback(page));
2064 
2065 			/*
2066 			 * If the page does not have buffers (for
2067 			 * whatever reason), try to create them using
2068 			 * __block_write_begin.  If this fails,
2069 			 * redirty the page and move on.
2070 			 */
2071 			if (!page_has_buffers(page)) {
2072 				if (__block_write_begin(page, 0, len,
2073 						noalloc_get_block_write)) {
2074 				redirty_page:
2075 					redirty_page_for_writepage(mpd->wbc,
2076 								   page);
2077 					unlock_page(page);
2078 					continue;
2079 				}
2080 				commit_write = 1;
2081 			}
2082 
2083 			bh = page_bufs = page_buffers(page);
2084 			block_start = 0;
2085 			do {
2086 				if (!bh)
2087 					goto redirty_page;
2088 				if (map && (cur_logical >= map->m_lblk) &&
2089 				    (cur_logical <= (map->m_lblk +
2090 						     (map->m_len - 1)))) {
2091 					if (buffer_delay(bh)) {
2092 						clear_buffer_delay(bh);
2093 						bh->b_blocknr = pblock;
2094 					}
2095 					if (buffer_unwritten(bh) ||
2096 					    buffer_mapped(bh))
2097 						BUG_ON(bh->b_blocknr != pblock);
2098 					if (map->m_flags & EXT4_MAP_UNINIT)
2099 						set_buffer_uninit(bh);
2100 					clear_buffer_unwritten(bh);
2101 				}
2102 
2103 				/* redirty page if block allocation undone */
2104 				if (buffer_delay(bh) || buffer_unwritten(bh))
2105 					redirty_page = 1;
2106 				bh = bh->b_this_page;
2107 				block_start += bh->b_size;
2108 				cur_logical++;
2109 				pblock++;
2110 			} while (bh != page_bufs);
2111 
2112 			if (redirty_page)
2113 				goto redirty_page;
2114 
2115 			if (commit_write)
2116 				/* mark the buffer_heads as dirty & uptodate */
2117 				block_commit_write(page, 0, len);
2118 
2119 			/*
2120 			 * Delalloc doesn't support data journalling,
2121 			 * but eventually maybe we'll lift this
2122 			 * restriction.
2123 			 */
2124 			if (unlikely(journal_data && PageChecked(page)))
2125 				err = __ext4_journalled_writepage(page, len);
2126 			else
2127 				err = ext4_bio_write_page(&io_submit, page,
2128 							  len, mpd->wbc);
2129 
2130 			if (!err)
2131 				mpd->pages_written++;
2132 			/*
2133 			 * In error case, we have to continue because
2134 			 * remaining pages are still locked
2135 			 */
2136 			if (ret == 0)
2137 				ret = err;
2138 		}
2139 		pagevec_release(&pvec);
2140 	}
2141 	ext4_io_submit(&io_submit);
2142 	return ret;
2143 }
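
/*
 * Example of the partial-page length math above (illustrative, 4K
 * pages): for i_size = 10000, size >> PAGE_CACHE_SHIFT == 2, so the
 * page at index 2 is written with len = 10000 & ~PAGE_CACHE_MASK ==
 * 1808 bytes, while pages 0 and 1 use the full PAGE_CACHE_SIZE.
 */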
2144 
2145 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2146 					sector_t logical, long blk_cnt)
2147 {
2148 	int nr_pages, i;
2149 	pgoff_t index, end;
2150 	struct pagevec pvec;
2151 	struct inode *inode = mpd->inode;
2152 	struct address_space *mapping = inode->i_mapping;
2153 
2154 	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2155 	end   = (logical + blk_cnt - 1) >>
2156 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
2157 	while (index <= end) {
2158 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2159 		if (nr_pages == 0)
2160 			break;
2161 		for (i = 0; i < nr_pages; i++) {
2162 			struct page *page = pvec.pages[i];
2163 			if (page->index > end)
2164 				break;
2165 			BUG_ON(!PageLocked(page));
2166 			BUG_ON(PageWriteback(page));
2167 			block_invalidatepage(page, 0);
2168 			ClearPageUptodate(page);
2169 			unlock_page(page);
2170 		}
2171 		index = pvec.pages[nr_pages - 1]->index + 1;
2172 		pagevec_release(&pvec);
2173 	}
2174 	return;
2175 }
2176 
2177 static void ext4_print_free_blocks(struct inode *inode)
2178 {
2179 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2180 	printk(KERN_CRIT "Total free blocks count %lld\n",
2181 	       ext4_count_free_blocks(inode->i_sb));
2182 	printk(KERN_CRIT "Free/Dirty block details\n");
2183 	printk(KERN_CRIT "free_blocks=%lld\n",
2184 	       (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2185 	printk(KERN_CRIT "dirty_blocks=%lld\n",
2186 	       (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2187 	printk(KERN_CRIT "Block reservation details\n");
2188 	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2189 	       EXT4_I(inode)->i_reserved_data_blocks);
2190 	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2191 	       EXT4_I(inode)->i_reserved_meta_blocks);
2192 	return;
2193 }
2194 
2195 /*
2196  * mpage_da_map_and_submit - go through the given space, map the blocks
2197  *       if necessary, and then submit them for I/O
2198  *
2199  * @mpd - bh describing space
2200  *
2201  * The function skips space we know is already mapped to disk blocks.
2202  *
2203  */
2204 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2205 {
2206 	int err, blks, get_blocks_flags;
2207 	struct ext4_map_blocks map, *mapp = NULL;
2208 	sector_t next = mpd->b_blocknr;
2209 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2210 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2211 	handle_t *handle = NULL;
2212 
2213 	/*
2214 	 * If the blocks are mapped already, or we couldn't accumulate
2215 	 * any blocks, then proceed immediately to the submission stage.
2216 	 */
2217 	if ((mpd->b_size == 0) ||
2218 	    ((mpd->b_state  & (1 << BH_Mapped)) &&
2219 	     !(mpd->b_state & (1 << BH_Delay)) &&
2220 	     !(mpd->b_state & (1 << BH_Unwritten))))
2221 		goto submit_io;
2222 
2223 	handle = ext4_journal_current_handle();
2224 	BUG_ON(!handle);
2225 
2226 	/*
2227 	 * Call ext4_map_blocks() to allocate any delayed allocation
2228 	 * blocks, or to convert an uninitialized extent to be
2229 	 * initialized (in the case where we have written into
2230 	 * one or more preallocated blocks).
2231 	 *
2232 	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2233 	 * indicate that we are on the delayed allocation path.  This
2234 	 * affects functions in many different parts of the allocation
2235 	 * call path.  This flag exists primarily because we don't
2236 	 * want to change *many* call functions, so ext4_map_blocks()
2237 	 * will set the magic i_delalloc_reserved_flag once the
2238 	 * inode's allocation semaphore is taken.
2239 	 *
2240 	 * If the blocks in question were delalloc blocks, set
2241 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2242 	 * variables are updated after the blocks have been allocated.
2243 	 */
2244 	map.m_lblk = next;
2245 	map.m_len = max_blocks;
2246 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2247 	if (ext4_should_dioread_nolock(mpd->inode))
2248 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2249 	if (mpd->b_state & (1 << BH_Delay))
2250 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2251 
2252 	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2253 	if (blks < 0) {
2254 		struct super_block *sb = mpd->inode->i_sb;
2255 
2256 		err = blks;
2257 		/*
2258 		 * If get block returns EAGAIN or ENOSPC and there
2259 		 * appear to be free blocks, we will call
2260 		 * ext4_writepage() for all of the pages, which will
2261 		 * just redirty them.
2262 		 */
2263 		if (err == -EAGAIN)
2264 			goto submit_io;
2265 
2266 		if (err == -ENOSPC &&
2267 		    ext4_count_free_blocks(sb)) {
2268 			mpd->retval = err;
2269 			goto submit_io;
2270 		}
2271 
2272 		/*
2273 		 * A get block failure will cause us to loop in
2274 		 * writepages, because a_ops->writepage won't be able
2275 		 * to make progress. The page will be redirtied by
2276 		 * writepage and writepages will again try to write
2277 		 * the same page.
2278 		 */
2279 		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2280 			ext4_msg(sb, KERN_CRIT,
2281 				 "delayed block allocation failed for inode %lu "
2282 				 "at logical offset %llu with max blocks %zd "
2283 				 "with error %d", mpd->inode->i_ino,
2284 				 (unsigned long long) next,
2285 				 mpd->b_size >> mpd->inode->i_blkbits, err);
2286 			ext4_msg(sb, KERN_CRIT,
2287 				"This should not happen!! Data will be lost\n");
2288 			if (err == -ENOSPC)
2289 				ext4_print_free_blocks(mpd->inode);
2290 		}
2291 		/* invalidate all the pages */
2292 		ext4_da_block_invalidatepages(mpd, next,
2293 				mpd->b_size >> mpd->inode->i_blkbits);
2294 		return;
2295 	}
2296 	BUG_ON(blks == 0);
2297 
2298 	mapp = &map;
2299 	if (map.m_flags & EXT4_MAP_NEW) {
2300 		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2301 		int i;
2302 
2303 		for (i = 0; i < map.m_len; i++)
2304 			unmap_underlying_metadata(bdev, map.m_pblk + i);
2305 	}
2306 
2307 	if (ext4_should_order_data(mpd->inode)) {
2308 		err = ext4_jbd2_file_inode(handle, mpd->inode);
2309 		if (err)
2310 			/* This only happens if the journal is aborted */
2311 			return;
2312 	}
2313 
2314 	/*
2315 	 * Update on-disk size along with block allocation.
2316 	 */
2317 	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2318 	if (disksize > i_size_read(mpd->inode))
2319 		disksize = i_size_read(mpd->inode);
2320 	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2321 		ext4_update_i_disksize(mpd->inode, disksize);
2322 		err = ext4_mark_inode_dirty(handle, mpd->inode);
2323 		if (err)
2324 			ext4_error(mpd->inode->i_sb,
2325 				   "Failed to mark inode %lu dirty",
2326 				   mpd->inode->i_ino);
2327 	}
2328 
2329 submit_io:
2330 	mpage_da_submit_io(mpd, mapp);
2331 	mpd->io_done = 1;
2332 }
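
/*
 * Example of the disksize update above (illustrative, 4K blocks): if
 * the mapped extent starts at logical block next = 10 and blks = 6
 * blocks were allocated, disksize = (10 + 6) << 12 == 65536 bytes; it
 * is then clamped to i_size and pushed to i_disksize only if it grew,
 * so filling a hole in the middle of a fully-written file (i_disksize
 * == i_size) leaves i_disksize untouched.
 */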
2333 
2334 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2335 		(1 << BH_Delay) | (1 << BH_Unwritten))
2336 
2337 /*
2338  * mpage_add_bh_to_extent - try to add one more block to the extent of blocks
2339  *
2340  * @mpd->lbh - extent of blocks
2341  * @logical - logical number of the block in the file
2342  * @bh - bh of the block (used to access block's state)
2343  *
2344  * The function is used to collect contiguous blocks in the same state.
2345  */
2346 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2347 				   sector_t logical, size_t b_size,
2348 				   unsigned long b_state)
2349 {
2350 	sector_t next;
2351 	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2352 
2353 	/*
2354 	 * XXX Don't go larger than mballoc is willing to allocate
2355 	 * This is a stopgap solution.  We eventually need to fold
2356 	 * mpage_da_submit_io() into this function and then call
2357 	 * ext4_map_blocks() multiple times in a loop
2358 	 */
2359 	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2360 		goto flush_it;
2361 
2362 	/* Check if the reserved journal credits might overflow */
2363 	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2364 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2365 			/*
2366 			 * With the non-extent format we are limited by the
2367 			 * available journal credits.  The total credits needed
2368 			 * to insert nrblocks contiguous blocks depend on
2369 			 * nrblocks, so limit nrblocks.
2370 			 */
2371 			goto flush_it;
2372 		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2373 				EXT4_MAX_TRANS_DATA) {
2374 			/*
2375 			 * Adding the new buffer_head would make it cross the
2376 			 * allowed limit for which we have journal credit
2377 			 * reserved. So limit the new bh->b_size
2378 			 */
2379 			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2380 						mpd->inode->i_blkbits;
2381 			/* we will do mpage_da_submit_io in the next loop */
2382 		}
2383 	}
2384 	/*
2385 	 * First block in the extent
2386 	 */
2387 	if (mpd->b_size == 0) {
2388 		mpd->b_blocknr = logical;
2389 		mpd->b_size = b_size;
2390 		mpd->b_state = b_state & BH_FLAGS;
2391 		return;
2392 	}
2393 
2394 	next = mpd->b_blocknr + nrblocks;
2395 	/*
2396 	 * Can we merge the block to our big extent?
2397 	 */
2398 	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2399 		mpd->b_size += b_size;
2400 		return;
2401 	}
2402 
2403 flush_it:
2404 	/*
2405 	 * We couldn't merge the block into our extent, so we
2406 	 * need to flush the current extent and start a new one.
2407 	 */
2408 	mpage_da_map_and_submit(mpd);
2409 	return;
2410 }
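
/*
 * Example of the cap above (illustrative): with a 4K block size the
 * stopgap limit works out to 8*1024*1024/4096 == 2048 blocks, matching
 * the most mballoc will allocate in one call; an extent that has
 * already accumulated 2048 blocks is therefore flushed rather than
 * grown further.
 */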
2411 
2412 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2413 {
2414 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2415 }
2416 
2417 /*
2418  * __mpage_da_writepage - finds the extent of pages and blocks
2419  *
2420  * @page: page to consider
2421  * @wbc: not used, we just follow rules
2422  * @data: context
2423  *
2424  * The function finds extents of pages and scans them for all blocks.
2425  */
2426 static int __mpage_da_writepage(struct page *page,
2427 				struct writeback_control *wbc,
2428 				struct mpage_da_data *mpd)
2429 {
2430 	struct inode *inode = mpd->inode;
2431 	struct buffer_head *bh, *head;
2432 	sector_t logical;
2433 
2434 	/*
2435 	 * Can we merge this page into the current extent?
2436 	 */
2437 	if (mpd->next_page != page->index) {
2438 		/*
2439 		 * Nope, we can't. So, we map non-allocated blocks
2440 		 * and start IO on them
2441 		 */
2442 		if (mpd->next_page != mpd->first_page) {
2443 			mpage_da_map_and_submit(mpd);
2444 			/*
2445 			 * skip the rest of the pages in the page_vec
2446 			 */
2447 			redirty_page_for_writepage(wbc, page);
2448 			unlock_page(page);
2449 			return MPAGE_DA_EXTENT_TAIL;
2450 		}
2451 
2452 		/*
2453 		 * Start next extent of pages ...
2454 		 */
2455 		mpd->first_page = page->index;
2456 
2457 		/*
2458 		 * ... and blocks
2459 		 */
2460 		mpd->b_size = 0;
2461 		mpd->b_state = 0;
2462 		mpd->b_blocknr = 0;
2463 	}
2464 
2465 	mpd->next_page = page->index + 1;
2466 	logical = (sector_t) page->index <<
2467 		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
2468 
2469 	if (!page_has_buffers(page)) {
2470 		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2471 				       (1 << BH_Dirty) | (1 << BH_Uptodate));
2472 		if (mpd->io_done)
2473 			return MPAGE_DA_EXTENT_TAIL;
2474 	} else {
2475 		/*
2476 		 * Page with regular buffer heads, just add all dirty ones
2477 		 */
2478 		head = page_buffers(page);
2479 		bh = head;
2480 		do {
2481 			BUG_ON(buffer_locked(bh));
2482 			/*
2483 			 * We need to try to allocate
2484 			 * unmapped blocks in the same page.
2485 			 * Otherwise we won't make progress
2486 			 * with the page in ext4_writepage
2487 			 */
2488 			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2489 				mpage_add_bh_to_extent(mpd, logical,
2490 						       bh->b_size,
2491 						       bh->b_state);
2492 				if (mpd->io_done)
2493 					return MPAGE_DA_EXTENT_TAIL;
2494 			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2495 				/*
2496 				 * mapped dirty buffer. We need to update
2497 				 * the b_state because we look at
2498 				 * b_state in mpage_da_map_blocks. We don't
2499 				 * update b_size because if we find an
2500 				 * unmapped buffer_head later we need to
2501 				 * use the b_state flag of that buffer_head.
2502 				 */
2503 				if (mpd->b_size == 0)
2504 					mpd->b_state = bh->b_state & BH_FLAGS;
2505 			}
2506 			logical++;
2507 		} while ((bh = bh->b_this_page) != head);
2508 	}
2509 
2510 	return 0;
2511 }
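
/*
 * Example of the logical-block math above (illustrative, 4K pages and
 * 1K blocks): PAGE_CACHE_SHIFT - i_blkbits == 12 - 10 == 2, so page
 * index 5 starts at logical block 5 << 2 == 20 and its four
 * buffer_heads cover logical blocks 20..23.
 */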
2512 
2513 /*
2514  * This is a special get_blocks_t callback which is used by
2515  * ext4_da_write_begin().  It will either return mapped block or
2516  * reserve space for a single block.
2517  *
2518  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2519  * We also have b_blocknr = -1 and b_bdev initialized properly
2520  *
2521  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2522  * We also have b_blocknr set to the physical block backing the unwritten
2523  * extent, and b_bdev initialized properly.
2524  */
2525 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2526 				  struct buffer_head *bh, int create)
2527 {
2528 	struct ext4_map_blocks map;
2529 	int ret = 0;
2530 	sector_t invalid_block = ~((sector_t) 0xffff);
2531 
2532 	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2533 		invalid_block = ~0;
2534 
2535 	BUG_ON(create == 0);
2536 	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2537 
2538 	map.m_lblk = iblock;
2539 	map.m_len = 1;
2540 
2541 	/*
2542 	 * First, we need to know whether the block is already allocated;
2543 	 * preallocated blocks are unmapped but should be treated
2544 	 * the same as allocated blocks.
2545 	 */
2546 	ret = ext4_map_blocks(NULL, inode, &map, 0);
2547 	if (ret < 0)
2548 		return ret;
2549 	if (ret == 0) {
2550 		if (buffer_delay(bh))
2551 			return 0; /* Not sure this could or should happen */
2552 		/*
2553 		 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 		 */
2555 		ret = ext4_da_reserve_space(inode, iblock);
2556 		if (ret)
2557 			/* not enough space to reserve */
2558 			return ret;
2559 
2560 		map_bh(bh, inode->i_sb, invalid_block);
2561 		set_buffer_new(bh);
2562 		set_buffer_delay(bh);
2563 		return 0;
2564 	}
2565 
2566 	map_bh(bh, inode->i_sb, map.m_pblk);
2567 	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2568 
2569 	if (buffer_unwritten(bh)) {
2570 		/* A delayed write to unwritten bh should be marked
2571 		 * new and mapped.  Mapped ensures that we don't do
2572 		 * get_block multiple times when we write to the same
2573 		 * offset and new ensures that we do proper zero out
2574 		 * for partial write.
2575 		 */
2576 		set_buffer_new(bh);
2577 		set_buffer_mapped(bh);
2578 	}
2579 	return 0;
2580 }
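
/*
 * Sketch of the two outcomes above (illustrative): a write into a hole
 * gets ext4_map_blocks() == 0, so the buffer is mapped to the sentinel
 * invalid_block and marked new + delay, deferring allocation to
 * writeback time; a write into a preallocated (unwritten) extent gets a
 * real physical block back and the buffer ends up new + mapped, so
 * __block_write_begin() zeroes partial blocks correctly.
 */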
2581 
2582 /*
2583  * This function is used as a standard get_block_t callback function
2584  * when there is no desire to allocate any blocks.  It is used as a
2585  * callback function for block_write_begin() and block_write_full_page().
2586  * These functions should only try to map a single block at a time.
2587  *
2588  * Since this function doesn't do block allocations even if the caller
2589  * requests it by passing in create=1, it is critically important that
2590  * any caller checks to make sure that any buffer heads returned by
2591  * this function are either all already mapped or marked for
2592  * delayed allocation before calling block_write_full_page().  Otherwise,
2593  * b_blocknr could be left uninitialized, and the page write functions will
2594  * be taken by surprise.
2595  */
2596 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2597 				   struct buffer_head *bh_result, int create)
2598 {
2599 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2600 	return _ext4_get_block(inode, iblock, bh_result, 0);
2601 }
2602 
2603 static int bget_one(handle_t *handle, struct buffer_head *bh)
2604 {
2605 	get_bh(bh);
2606 	return 0;
2607 }
2608 
2609 static int bput_one(handle_t *handle, struct buffer_head *bh)
2610 {
2611 	put_bh(bh);
2612 	return 0;
2613 }
2614 
2615 static int __ext4_journalled_writepage(struct page *page,
2616 				       unsigned int len)
2617 {
2618 	struct address_space *mapping = page->mapping;
2619 	struct inode *inode = mapping->host;
2620 	struct buffer_head *page_bufs;
2621 	handle_t *handle = NULL;
2622 	int ret = 0;
2623 	int err;
2624 
2625 	ClearPageChecked(page);
2626 	page_bufs = page_buffers(page);
2627 	BUG_ON(!page_bufs);
2628 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2629 	/* As soon as we unlock the page, it can go away, but we have
2630 	 * references to buffers so we are safe */
2631 	unlock_page(page);
2632 
2633 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2634 	if (IS_ERR(handle)) {
2635 		ret = PTR_ERR(handle);
2636 		goto out;
2637 	}
2638 
2639 	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2640 				do_journal_get_write_access);
2641 
2642 	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2643 				write_end_fn);
2644 	if (ret == 0)
2645 		ret = err;
2646 	err = ext4_journal_stop(handle);
2647 	if (!ret)
2648 		ret = err;
2649 
2650 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2651 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2652 out:
2653 	return ret;
2654 }
2655 
2656 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2657 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2658 
2659 /*
2660  * Note that we don't need to start a transaction unless we're journaling data
2661  * because we should have holes filled from ext4_page_mkwrite(). We even don't
2662  * need to file the inode to the transaction's list in ordered mode because if
2663  * we are writing back data added by write(), the inode is already there and if
2664  * we are writing back data modified via mmap(), no one guarantees in which
2665  * transaction the data will hit the disk. In case we are journaling data, we
2666  * cannot start transaction directly because transaction start ranks above page
2667  * lock so we have to do some magic.
2668  *
2669  * This function can get called via...
2670  *   - ext4_da_writepages after taking page lock (have journal handle)
2671  *   - journal_submit_inode_data_buffers (no journal handle)
2672  *   - shrink_page_list via pdflush (no journal handle)
2673  *   - grab_page_cache when doing write_begin (have journal handle)
2674  *
2675  * We don't do any block allocation in this function. If we have page with
2676  * We don't do any block allocation in this function. If we have a page with
2677  * multiple blocks we need to write those buffer_heads that are mapped. This
2678  * is important for mmap-based writes. So if, with blocksize 1K, we do
2679  * a = mmap(f, 0, 4096);
2680  * a[0] = 'a';
2681  * truncate(f, 4096);
2682  * then the first buffer_head in the page is mapped via the page_mkwrite
2683  * callback, but the other buffer_heads are unmapped yet dirty (dirtied via
2684  * do_wp_page). So writepage should write the first block. If we modify
2685  * the mmap area beyond 1024 we will again get a page fault and the
2686  * page_mkwrite callback will do the block allocation and mark the
2687  * buffer_heads mapped.
2688  *
2689  * We redirty the page if it has any buffer_heads that are either delayed
2690  * or unwritten.
2691  *
2692  * We can get recursively called as shown below.
2693  *
2694  *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2695  *		ext4_writepage()
2696  *
2697  * But since we don't do any block allocation we should not deadlock.
2698  * The page also has the dirty flag cleared so we don't get a recursive page lock.
2699  */
2700 static int ext4_writepage(struct page *page,
2701 			  struct writeback_control *wbc)
2702 {
2703 	int ret = 0, commit_write = 0;
2704 	loff_t size;
2705 	unsigned int len;
2706 	struct buffer_head *page_bufs = NULL;
2707 	struct inode *inode = page->mapping->host;
2708 
2709 	trace_ext4_writepage(inode, page);
2710 	size = i_size_read(inode);
2711 	if (page->index == size >> PAGE_CACHE_SHIFT)
2712 		len = size & ~PAGE_CACHE_MASK;
2713 	else
2714 		len = PAGE_CACHE_SIZE;
2715 
2716 	/*
2717 	 * If the page does not have buffers (for whatever reason),
2718 	 * try to create them using __block_write_begin.  If this
2719 	 * fails, redirty the page and move on.
2720 	 */
2721 	if (!page_has_buffers(page)) {
2722 		if (__block_write_begin(page, 0, len,
2723 					noalloc_get_block_write)) {
2724 		redirty_page:
2725 			redirty_page_for_writepage(wbc, page);
2726 			unlock_page(page);
2727 			return 0;
2728 		}
2729 		commit_write = 1;
2730 	}
2731 	page_bufs = page_buffers(page);
2732 	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2733 			      ext4_bh_delay_or_unwritten)) {
2734 		/*
2735 		 * We don't want to do block allocation, so redirty
2736 		 * the page and return.  We may reach here when we do
2737 		 * a journal commit via journal_submit_inode_data_buffers.
2738 		 * We can also reach here via shrink_page_list
2739 		 */
2740 		goto redirty_page;
2741 	}
2742 	if (commit_write)
2743 		/* now mark the buffer_heads as dirty and uptodate */
2744 		block_commit_write(page, 0, len);
2745 
2746 	if (PageChecked(page) && ext4_should_journal_data(inode))
2747 		/*
2748 		 * It's mmapped pagecache.  Add buffers and journal it.  There
2749 		 * doesn't seem much point in redirtying the page here.
2750 		 */
2751 		return __ext4_journalled_writepage(page, len);
2752 
2753 	if (buffer_uninit(page_bufs)) {
2754 		ext4_set_bh_endio(page_bufs, inode);
2755 		ret = block_write_full_page_endio(page, noalloc_get_block_write,
2756 					    wbc, ext4_end_io_buffer_write);
2757 	} else
2758 		ret = block_write_full_page(page, noalloc_get_block_write,
2759 					    wbc);
2760 
2761 	return ret;
2762 }
2763 
2764 /*
2765  * This is called via ext4_da_writepages() to
2766  * calculate the total number of credits to reserve to fit
2767  * a single extent allocation into a single transaction;
2768  * ext4_da_writepages() will loop calling this before
2769  * the block allocation.
2770  */
2771 
2772 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2773 {
2774 	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2775 
2776 	/*
2777 	 * With the non-extent format, the journal credits needed to
2778 	 * insert nrblocks contiguous blocks depend on the
2779 	 * number of contiguous blocks. So we limit the
2780 	 * number of contiguous blocks to a sane value.
2781 	 */
2782 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2783 	    (max_blocks > EXT4_MAX_TRANS_DATA))
2784 		max_blocks = EXT4_MAX_TRANS_DATA;
2785 
2786 	return ext4_chunk_trans_blocks(inode, max_blocks);
2787 }
2788 
2789 /*
2790  * write_cache_pages_da - walk the list of dirty pages of the given
2791  * address space and call the callback function (which usually writes
2792  * the pages).
2793  *
2794  * This is a forked version of write_cache_pages().  Differences:
2795  *	Range cyclic is ignored.
2796  *	no_nrwrite_index_update is always presumed true
2797  */
2798 static int write_cache_pages_da(struct address_space *mapping,
2799 				struct writeback_control *wbc,
2800 				struct mpage_da_data *mpd,
2801 				pgoff_t *done_index)
2802 {
2803 	int ret = 0;
2804 	int done = 0;
2805 	struct pagevec pvec;
2806 	unsigned nr_pages;
2807 	pgoff_t index;
2808 	pgoff_t end;		/* Inclusive */
2809 	long nr_to_write = wbc->nr_to_write;
2810 	int tag;
2811 
2812 	pagevec_init(&pvec, 0);
2813 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
2814 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
2815 
2816 	if (wbc->sync_mode == WB_SYNC_ALL)
2817 		tag = PAGECACHE_TAG_TOWRITE;
2818 	else
2819 		tag = PAGECACHE_TAG_DIRTY;
2820 
2821 	*done_index = index;
2822 	while (!done && (index <= end)) {
2823 		int i;
2824 
2825 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2826 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2827 		if (nr_pages == 0)
2828 			break;
2829 
2830 		for (i = 0; i < nr_pages; i++) {
2831 			struct page *page = pvec.pages[i];
2832 
2833 			/*
2834 			 * At this point, the page may be truncated or
2835 			 * invalidated (changing page->mapping to NULL), or
2836 			 * even swizzled back from swapper_space to tmpfs file
2837 			 * mapping. However, page->index will not change
2838 			 * because we have a reference on the page.
2839 			 */
2840 			if (page->index > end) {
2841 				done = 1;
2842 				break;
2843 			}
2844 
2845 			*done_index = page->index + 1;
2846 
2847 			lock_page(page);
2848 
2849 			/*
2850 			 * Page truncated or invalidated. We can freely skip it
2851 			 * then, even for data integrity operations: the page
2852 			 * has disappeared concurrently, so there could be no
2853 			 * real expectation of this data integrity operation
2854 			 * even if there is now a new, dirty page at the same
2855 			 * pagecache address.
2856 			 */
2857 			if (unlikely(page->mapping != mapping)) {
2858 continue_unlock:
2859 				unlock_page(page);
2860 				continue;
2861 			}
2862 
2863 			if (!PageDirty(page)) {
2864 				/* someone wrote it for us */
2865 				goto continue_unlock;
2866 			}
2867 
2868 			if (PageWriteback(page)) {
2869 				if (wbc->sync_mode != WB_SYNC_NONE)
2870 					wait_on_page_writeback(page);
2871 				else
2872 					goto continue_unlock;
2873 			}
2874 
2875 			BUG_ON(PageWriteback(page));
2876 			if (!clear_page_dirty_for_io(page))
2877 				goto continue_unlock;
2878 
2879 			ret = __mpage_da_writepage(page, wbc, mpd);
2880 			if (unlikely(ret)) {
2881 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
2882 					unlock_page(page);
2883 					ret = 0;
2884 				} else {
2885 					done = 1;
2886 					break;
2887 				}
2888 			}
2889 
2890 			if (nr_to_write > 0) {
2891 				nr_to_write--;
2892 				if (nr_to_write == 0 &&
2893 				    wbc->sync_mode == WB_SYNC_NONE) {
2894 					/*
2895 					 * We stop writing back only if we are
2896 					 * not doing integrity sync. In case of
2897 					 * integrity sync we have to keep going
2898 					 * because someone may be concurrently
2899 					 * dirtying pages, and we might have
2900 					 * synced a lot of newly appeared dirty
2901 					 * pages, but have not synced all of the
2902 					 * old dirty pages.
2903 					 */
2904 					done = 1;
2905 					break;
2906 				}
2907 			}
2908 		}
2909 		pagevec_release(&pvec);
2910 		cond_resched();
2911 	}
2912 	return ret;
2913 }
2914 
2915 
2916 static int ext4_da_writepages(struct address_space *mapping,
2917 			      struct writeback_control *wbc)
2918 {
2919 	pgoff_t	index;
2920 	int range_whole = 0;
2921 	handle_t *handle = NULL;
2922 	struct mpage_da_data mpd;
2923 	struct inode *inode = mapping->host;
2924 	int pages_written = 0;
2925 	long pages_skipped;
2926 	unsigned int max_pages;
2927 	int range_cyclic, cycled = 1, io_done = 0;
2928 	int needed_blocks, ret = 0;
2929 	long desired_nr_to_write, nr_to_writebump = 0;
2930 	loff_t range_start = wbc->range_start;
2931 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2932 	pgoff_t done_index = 0;
2933 	pgoff_t end;
2934 
2935 	trace_ext4_da_writepages(inode, wbc);
2936 
2937 	/*
2938 	 * No pages to write? This is mainly a kludge to avoid starting
2939 	 * a transaction for special inodes like the journal inode on last iput()
2940 	 * because that could violate lock ordering on umount
2941 	 */
2942 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2943 		return 0;
2944 
2945 	/*
2946 	 * If the filesystem has aborted, it is read-only, so return
2947 	 * right away instead of dumping stack traces later on that
2948 	 * will obscure the real source of the problem.  We test
2949 	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2950 	 * the latter could be true if the filesystem is mounted
2951 	 * read-only, and in that case, ext4_da_writepages should
2952 	 * *never* be called, so if that ever happens, we would want
2953 	 * the stack trace.
2954 	 */
2955 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2956 		return -EROFS;
2957 
2958 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2959 		range_whole = 1;
2960 
2961 	range_cyclic = wbc->range_cyclic;
2962 	if (wbc->range_cyclic) {
2963 		index = mapping->writeback_index;
2964 		if (index)
2965 			cycled = 0;
2966 		wbc->range_start = index << PAGE_CACHE_SHIFT;
2967 		wbc->range_end  = LLONG_MAX;
2968 		wbc->range_cyclic = 0;
2969 		end = -1;
2970 	} else {
2971 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2972 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2973 	}
2974 
2975 	/*
2976 	 * This works around two forms of stupidity.  The first is in
2977 	 * the writeback code, which caps the maximum number of pages
2978 	 * written to be 1024 pages.  This is wrong on multiple
2979 	 * levels; different architectures have a different page size,
2980 	 * which changes the maximum amount of data which gets
2981 	 * written.  Secondly, 4 megabytes is way too small.  XFS
2982 	 * forces this value to be 16 megabytes by multiplying
2983 	 * nr_to_write parameter by four, and then relies on its
2984 	 * allocator to allocate larger extents to make them
2985 	 * contiguous.  Unfortunately this brings us to the second
2986 	 * stupidity, which is that ext4's mballoc code only allocates
2987 	 * at most 2048 blocks.  So we force contiguous writes up to
2988 	 * the number of dirty blocks in the inode, or
2989 	 * sbi->max_writeback_mb_bump whichever is smaller.
2990 	 */
2991 	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2992 	if (!range_cyclic && range_whole) {
2993 		if (wbc->nr_to_write == LONG_MAX)
2994 			desired_nr_to_write = wbc->nr_to_write;
2995 		else
2996 			desired_nr_to_write = wbc->nr_to_write * 8;
2997 	} else
2998 		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2999 							   max_pages);
3000 	if (desired_nr_to_write > max_pages)
3001 		desired_nr_to_write = max_pages;
3002 
3003 	if (wbc->nr_to_write < desired_nr_to_write) {
3004 		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
3005 		wbc->nr_to_write = desired_nr_to_write;
3006 	}
3007 
3008 	mpd.wbc = wbc;
3009 	mpd.inode = mapping->host;
3010 
3011 	pages_skipped = wbc->pages_skipped;
3012 
3013 retry:
3014 	if (wbc->sync_mode == WB_SYNC_ALL)
3015 		tag_pages_for_writeback(mapping, index, end);
3016 
3017 	while (!ret && wbc->nr_to_write > 0) {
3018 
3019 		/*
3020 		 * We insert one extent at a time, so we need the
3021 		 * credits for a single extent allocation.
3022 		 * Journalled mode is currently not supported
3023 		 * by delalloc.
3024 		 */
3025 		BUG_ON(ext4_should_journal_data(inode));
3026 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
3027 
3028 		/* start a new transaction*/
3029 		handle = ext4_journal_start(inode, needed_blocks);
3030 		if (IS_ERR(handle)) {
3031 			ret = PTR_ERR(handle);
3032 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
3033 			       "%ld pages, ino %lu; err %d", __func__,
3034 				wbc->nr_to_write, inode->i_ino, ret);
3035 			goto out_writepages;
3036 		}
3037 
3038 		/*
3039 		 * Now call __mpage_da_writepage to find the next
3040 		 * contiguous region of logical blocks that need
3041 		 * blocks to be allocated by ext4.  We don't actually
3042 		 * submit the blocks for I/O here, even though
3043 		 * write_cache_pages thinks it will, and will set the
3044 		 * pages as clean for write before calling
3045 		 * __mpage_da_writepage().
3046 		 */
3047 		mpd.b_size = 0;
3048 		mpd.b_state = 0;
3049 		mpd.b_blocknr = 0;
3050 		mpd.first_page = 0;
3051 		mpd.next_page = 0;
3052 		mpd.io_done = 0;
3053 		mpd.pages_written = 0;
3054 		mpd.retval = 0;
3055 		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3056 		/*
3057 		 * If we have a contiguous extent of pages and we
3058 		 * haven't done the I/O yet, map the blocks and submit
3059 		 * them for I/O.
3060 		 */
3061 		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3062 			mpage_da_map_and_submit(&mpd);
3063 			ret = MPAGE_DA_EXTENT_TAIL;
3064 		}
3065 		trace_ext4_da_write_pages(inode, &mpd);
3066 		wbc->nr_to_write -= mpd.pages_written;
3067 
3068 		ext4_journal_stop(handle);
3069 
3070 		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
3071 			/* commit the transaction which would
3072 			 * free blocks released in the transaction
3073 			 * and try again
3074 			 */
3075 			jbd2_journal_force_commit_nested(sbi->s_journal);
3076 			wbc->pages_skipped = pages_skipped;
3077 			ret = 0;
3078 		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
3079 			/*
3080 			 * Got one extent; now try with the
3081 			 * rest of the pages.
3082 			 */
3083 			pages_written += mpd.pages_written;
3084 			wbc->pages_skipped = pages_skipped;
3085 			ret = 0;
3086 			io_done = 1;
3087 		} else if (wbc->nr_to_write)
3088 			/*
3089 			 * There is no more writeout needed
3090 			 * or we requested a non-blocking writeout
3091 			 * and found the device congested.
3092 			 */
3093 			break;
3094 	}
3095 	if (!io_done && !cycled) {
3096 		cycled = 1;
3097 		index = 0;
3098 		wbc->range_start = index << PAGE_CACHE_SHIFT;
3099 		wbc->range_end  = mapping->writeback_index - 1;
3100 		goto retry;
3101 	}
3102 	if (pages_skipped != wbc->pages_skipped)
3103 		ext4_msg(inode->i_sb, KERN_CRIT,
3104 			 "This should not happen leaving %s "
3105 			 "with nr_to_write = %ld ret = %d",
3106 			 __func__, wbc->nr_to_write, ret);
3107 
3108 	/* Update index */
3109 	wbc->range_cyclic = range_cyclic;
3110 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3111 		/*
3112 		 * set the writeback_index so that range_cyclic
3113 		 * mode will write it back later
3114 		 */
3115 		mapping->writeback_index = done_index;
3116 
3117 out_writepages:
3118 	wbc->nr_to_write -= nr_to_writebump;
3119 	wbc->range_start = range_start;
3120 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3121 	return ret;
3122 }
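
/*
 * Worked example of the nr_to_write bump above (illustrative, assuming
 * the default s_max_writeback_mb_bump of 128 and 4K pages): max_pages =
 * 128 << (20 - 12) == 32768 pages, i.e. 128MB; a whole-file, non-cyclic
 * writeback with nr_to_write = 1024 is bumped 8x to 8192 pages, still
 * under the cap, and nr_to_writebump = 7168 is subtracted back out at
 * out_writepages.
 */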
3123 
3124 #define FALL_BACK_TO_NONDELALLOC 1
3125 static int ext4_nonda_switch(struct super_block *sb)
3126 {
3127 	s64 free_blocks, dirty_blocks;
3128 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3129 
3130 	/*
3131 	 * Switch to non-delalloc mode if we are running low
3132 	 * on free blocks. The free-block accounting via percpu
3133 	 * counters can get slightly wrong, with up to percpu_counter_batch
3134 	 * getting accumulated on each CPU without updating the global counters.
3135 	 * Delalloc needs accurate free-block accounting, so switch
3136 	 * to non-delalloc when we are near the error range.
3137 	 */
3138 	free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
3139 	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
3140 	if (2 * free_blocks < 3 * dirty_blocks ||
3141 		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3142 		/*
3143 		 * The free block count is less than 150% of the dirty blocks,
3144 		 * or free blocks are less than the watermark.
3145 		 */
3146 		return 1;
3147 	}
3148 	/*
3149 	 * Even if we don't switch but are nearing capacity,
3150 	 * start pushing delalloc when 1/2 of free blocks are dirty.
3151 	 */
3152 	if (free_blocks < 2 * dirty_blocks)
3153 		writeback_inodes_sb_if_idle(sb);
3154 
3155 	return 0;
3156 }
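
/*
 * Worked example (illustrative): with dirty_blocks = 1000, a
 * free_blocks value of 1200 trips 2*1200 < 3*1000 (free below 150% of
 * dirty), so we fall back to nondelalloc; with free_blocks = 3000 we
 * stay in delalloc (assuming the EXT4_FREEBLOCKS_WATERMARK test also
 * passes) and, since 3000 < 2*1000 is false, we don't kick background
 * writeback either.
 */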
3157 
3158 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3159 			       loff_t pos, unsigned len, unsigned flags,
3160 			       struct page **pagep, void **fsdata)
3161 {
3162 	int ret, retries = 0;
3163 	struct page *page;
3164 	pgoff_t index;
3165 	struct inode *inode = mapping->host;
3166 	handle_t *handle;
3167 
3168 	index = pos >> PAGE_CACHE_SHIFT;
3169 
3170 	if (ext4_nonda_switch(inode->i_sb)) {
3171 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
3172 		return ext4_write_begin(file, mapping, pos,
3173 					len, flags, pagep, fsdata);
3174 	}
3175 	*fsdata = (void *)0;
3176 	trace_ext4_da_write_begin(inode, pos, len, flags);
3177 retry:
3178 	/*
3179 	 * With delayed allocation, we don't log the i_disksize update
3180 	 * if there is delayed block allocation. But we still need
3181 	 * to journal the i_disksize update if we write to the end
3182 	 * of a file whose last block is already mapped.
3183 	 */
3184 	handle = ext4_journal_start(inode, 1);
3185 	if (IS_ERR(handle)) {
3186 		ret = PTR_ERR(handle);
3187 		goto out;
3188 	}
3189 	/* We cannot recurse into the filesystem as the transaction is already
3190 	 * started */
3191 	flags |= AOP_FLAG_NOFS;
3192 
3193 	page = grab_cache_page_write_begin(mapping, index, flags);
3194 	if (!page) {
3195 		ext4_journal_stop(handle);
3196 		ret = -ENOMEM;
3197 		goto out;
3198 	}
3199 	*pagep = page;
3200 
3201 	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3202 	if (ret < 0) {
3203 		unlock_page(page);
3204 		ext4_journal_stop(handle);
3205 		page_cache_release(page);
3206 		/*
3207 		 * block_write_begin may have instantiated a few blocks
3208 		 * outside i_size.  Trim these off again. Don't need
3209 		 * i_size_read because we hold i_mutex.
3210 		 */
3211 		if (pos + len > inode->i_size)
3212 			ext4_truncate_failed_write(inode);
3213 	}
3214 
3215 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3216 		goto retry;
3217 out:
3218 	return ret;
3219 }
3220 
3221 /*
3222  * Check if we should update i_disksize
3223  * when a write to the end of the file does not require block allocation
3224  */
3225 static int ext4_da_should_update_i_disksize(struct page *page,
3226 					    unsigned long offset)
3227 {
3228 	struct buffer_head *bh;
3229 	struct inode *inode = page->mapping->host;
3230 	unsigned int idx;
3231 	int i;
3232 
3233 	bh = page_buffers(page);
3234 	idx = offset >> inode->i_blkbits;
3235 
3236 	for (i = 0; i < idx; i++)
3237 		bh = bh->b_this_page;
3238 
3239 	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
3240 		return 0;
3241 	return 1;
3242 }
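
/*
 * Example of the buffer lookup above (illustrative, 1K blocks on a 4K
 * page): for an end offset of 2500, idx = 2500 >> 10 == 2, so we walk
 * to the third buffer_head in the page; i_disksize is updated only if
 * that buffer is mapped and neither delayed nor unwritten.
 */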
3243 
3244 static int ext4_da_write_end(struct file *file,
3245 			     struct address_space *mapping,
3246 			     loff_t pos, unsigned len, unsigned copied,
3247 			     struct page *page, void *fsdata)
3248 {
3249 	struct inode *inode = mapping->host;
3250 	int ret = 0, ret2;
3251 	handle_t *handle = ext4_journal_current_handle();
3252 	loff_t new_i_size;
3253 	unsigned long start, end;
3254 	int write_mode = (int)(unsigned long)fsdata;
3255 
3256 	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
3257 		if (ext4_should_order_data(inode)) {
3258 			return ext4_ordered_write_end(file, mapping, pos,
3259 					len, copied, page, fsdata);
3260 		} else if (ext4_should_writeback_data(inode)) {
3261 			return ext4_writeback_write_end(file, mapping, pos,
3262 					len, copied, page, fsdata);
3263 		} else {
3264 			BUG();
3265 		}
3266 	}
3267 
3268 	trace_ext4_da_write_end(inode, pos, len, copied);
3269 	start = pos & (PAGE_CACHE_SIZE - 1);
3270 	end = start + copied - 1;
3271 
3272 	/*
3273 	 * generic_write_end() will run mark_inode_dirty() if i_size
3274 	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
3275 	 * into that.
3276 	 */
3277 
3278 	new_i_size = pos + copied;
3279 	if (new_i_size > EXT4_I(inode)->i_disksize) {
3280 		if (ext4_da_should_update_i_disksize(page, end)) {
3281 			down_write(&EXT4_I(inode)->i_data_sem);
3282 			if (new_i_size > EXT4_I(inode)->i_disksize) {
3283 				/*
3284 				 * Updating i_disksize when extending file
3285 				 * without needing block allocation
3286 				 */
3287 				if (ext4_should_order_data(inode))
3288 					ret = ext4_jbd2_file_inode(handle,
3289 								   inode);
3290 
3291 				EXT4_I(inode)->i_disksize = new_i_size;
3292 			}
3293 			up_write(&EXT4_I(inode)->i_data_sem);
3294 			/* We need to mark the inode dirty even if
3295 			 * new_i_size is less than inode->i_size
3296 			 * but greater than i_disksize (hint: delalloc).
3297 			 */
3298 			ext4_mark_inode_dirty(handle, inode);
3299 		}
3300 	}
3301 	ret2 = generic_write_end(file, mapping, pos, len, copied,
3302 							page, fsdata);
3303 	copied = ret2;
3304 	if (ret2 < 0)
3305 		ret = ret2;
3306 	ret2 = ext4_journal_stop(handle);
3307 	if (!ret)
3308 		ret = ret2;
3309 
3310 	return ret ? ret : copied;
3311 }
3312 
3313 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3314 {
3315 	/*
3316 	 * Drop reserved blocks
3317 	 */
3318 	BUG_ON(!PageLocked(page));
3319 	if (!page_has_buffers(page))
3320 		goto out;
3321 
3322 	ext4_da_page_release_reservation(page, offset);
3323 
3324 out:
3325 	ext4_invalidatepage(page, offset);
3326 
3327 	return;
3328 }
3329 
3330 /*
3331  * Force all delayed allocation blocks to be allocated for a given inode.
3332  */
3333 int ext4_alloc_da_blocks(struct inode *inode)
3334 {
3335 	trace_ext4_alloc_da_blocks(inode);
3336 
3337 	if (!EXT4_I(inode)->i_reserved_data_blocks &&
3338 	    !EXT4_I(inode)->i_reserved_meta_blocks)
3339 		return 0;
3340 
3341 	/*
3342 	 * We do something simple for now.  The filemap_flush() will
3343 	 * also start triggering a write of the data blocks, which is
3344 	 * not strictly speaking necessary (and for users of
3345 	 * laptop_mode, not even desirable).  However, to do otherwise
3346 	 * would require replicating code paths in:
3347 	 *
3348 	 * ext4_da_writepages() ->
3349 	 *    write_cache_pages() ---> (via passed in callback function)
3350 	 *        __mpage_da_writepage() -->
3351 	 *           mpage_add_bh_to_extent()
3352 	 *           mpage_da_map_blocks()
3353 	 *
3354 	 * The problem is that write_cache_pages(), located in
3355 	 * mm/page-writeback.c, marks pages clean in preparation for
3356 	 * doing I/O, which is not desirable if we're not planning on
3357 	 * doing I/O at all.
3358 	 *
3359 	 * We could call write_cache_pages(), and then redirty all of
3360 	 * the pages by calling redirty_page_for_writeback() but that
3361 	 * would be ugly in the extreme.  So instead we would need to
3362 	 * replicate parts of the code in the above functions,
3363 	 * simplifying them because we wouldn't actually intend to
3364 	 * write out the pages, but rather only collect contiguous
3365 	 * logical block extents, call the multi-block allocator, and
3366 	 * then update the buffer heads with the block allocations.
3367 	 *
3368 	 * For now, though, we'll cheat by calling filemap_flush(),
3369 	 * which will map the blocks, and start the I/O, but not
3370 	 * actually wait for the I/O to complete.
3371 	 */
3372 	return filemap_flush(inode->i_mapping);
3373 }
3374 
3375 /*
3376  * bmap() is special.  It gets used by applications such as lilo and by
3377  * the swapper to find the on-disk block of a specific piece of data.
3378  *
3379  * Naturally, this is dangerous if the block concerned is still in the
3380  * journal.  If somebody makes a swapfile on an ext4 data-journaling
3381  * filesystem and enables swap, then they may get a nasty shock when the
3382  * data getting swapped to that swapfile suddenly gets overwritten by
3383  * the original zeros written out previously to the journal and
3384  * awaiting writeback in the kernel's buffer cache.
3385  *
3386  * So, if we see any bmap calls here on a modified, data-journaled file,
3387  * take extra steps to flush any blocks which might be in the cache.
3388  */
3389 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3390 {
3391 	struct inode *inode = mapping->host;
3392 	journal_t *journal;
3393 	int err;
3394 
3395 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3396 			test_opt(inode->i_sb, DELALLOC)) {
3397 		/*
3398 		 * With delalloc we want to sync the file
3399 		 * so that we can make sure we allocate
3400 		 * blocks for file
3401 		 */
3402 		filemap_write_and_wait(mapping);
3403 	}
3404 
3405 	if (EXT4_JOURNAL(inode) &&
3406 	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3407 		/*
3408 		 * This is a REALLY heavyweight approach, but the use of
3409 		 * bmap on dirty files is expected to be extremely rare:
3410 		 * only if we run lilo or swapon on a freshly made file
3411 		 * do we expect this to happen.
3412 		 *
3413 		 * (bmap requires CAP_SYS_RAWIO so this does not
3414 		 * represent an unprivileged user DOS attack --- we'd be
3415 		 * in trouble if mortal users could trigger this path at
3416 		 * will.)
3417 		 *
3418 		 * NB. EXT4_STATE_JDATA is not set on files other than
3419 		 * regular files.  If somebody wants to bmap a directory
3420 		 * or symlink and gets confused because the buffer
3421 		 * hasn't yet been flushed to disk, they deserve
3422 		 * everything they get.
3423 		 */
3424 
3425 		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3426 		journal = EXT4_JOURNAL(inode);
3427 		jbd2_journal_lock_updates(journal);
3428 		err = jbd2_journal_flush(journal);
3429 		jbd2_journal_unlock_updates(journal);
3430 
3431 		if (err)
3432 			return 0;
3433 	}
3434 
3435 	return generic_block_bmap(mapping, block, ext4_get_block);
3436 }
3437 
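/*
 * Illustrative sketch (editorial, not from the original source):
 * ext4_bmap() above is what ultimately services the FIBMAP ioctl, so
 * a hypothetical userspace caller would look like
 *
 *	int blk = 0;			(logical block in, physical block out)
 *	ioctl(fd, FIBMAP, &blk);
 *
 * which is why tools such as lilo need CAP_SYS_RAWIO to use it.
 */
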
3438 static int ext4_readpage(struct file *file, struct page *page)
3439 {
3440 	return mpage_readpage(page, ext4_get_block);
3441 }
3442 
3443 static int
3444 ext4_readpages(struct file *file, struct address_space *mapping,
3445 		struct list_head *pages, unsigned nr_pages)
3446 {
3447 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3448 }
3449 
3450 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3451 {
3452 	struct buffer_head *head, *bh;
3453 	unsigned int curr_off = 0;
3454 
3455 	if (!page_has_buffers(page))
3456 		return;
3457 	head = bh = page_buffers(page);
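	/*
	 * Walk the page's circular buffer list; for each buffer at or
	 * past @offset that still has its uninit flag set and an io_end
	 * attached, free the io_end and detach the end_io callback.
	 */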
3458 	do {
3459 		if (offset <= curr_off && test_clear_buffer_uninit(bh)
3460 					&& bh->b_private) {
3461 			ext4_free_io_end(bh->b_private);
3462 			bh->b_private = NULL;
3463 			bh->b_end_io = NULL;
3464 		}
3465 		curr_off = curr_off + bh->b_size;
3466 		bh = bh->b_this_page;
3467 	} while (bh != head);
3468 }
3469 
3470 static void ext4_invalidatepage(struct page *page, unsigned long offset)
3471 {
3472 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3473 
3474 	/*
3475 	 * free any io_end structure allocated for buffers to be discarded
3476 	 */
3477 	if (ext4_should_dioread_nolock(page->mapping->host))
3478 		ext4_invalidatepage_free_endio(page, offset);
3479 	/*
3480 	 * If it's a full truncate we just forget about the pending dirtying
3481 	 */
3482 	if (offset == 0)
3483 		ClearPageChecked(page);
3484 
3485 	if (journal)
3486 		jbd2_journal_invalidatepage(journal, page, offset);
3487 	else
3488 		block_invalidatepage(page, offset);
3489 }
3490 
3491 static int ext4_releasepage(struct page *page, gfp_t wait)
3492 {
3493 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3494 
3495 	WARN_ON(PageChecked(page));
3496 	if (!page_has_buffers(page))
3497 		return 0;
3498 	if (journal)
3499 		return jbd2_journal_try_to_free_buffers(journal, page, wait);
3500 	else
3501 		return try_to_free_buffers(page);
3502 }
3503 
3504 /*
3505  * O_DIRECT for ext3 (or indirect map) based files
3506  *
3507  * If the O_DIRECT write will extend the file then add this inode to the
3508  * orphan list.  So recovery will truncate it back to the original size
3509  * if the machine crashes during the write.
3510  *
3511  * If the O_DIRECT write is instantiating holes inside i_size and the machine
3512  * crashes then stale disk data _may_ be exposed inside the file. But the
3513  * current VFS code falls back to the buffered path in that case, so we are safe.
3514  */
3515 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3516 			      const struct iovec *iov, loff_t offset,
3517 			      unsigned long nr_segs)
3518 {
3519 	struct file *file = iocb->ki_filp;
3520 	struct inode *inode = file->f_mapping->host;
3521 	struct ext4_inode_info *ei = EXT4_I(inode);
3522 	handle_t *handle;
3523 	ssize_t ret;
3524 	int orphan = 0;
3525 	size_t count = iov_length(iov, nr_segs);
3526 	int retries = 0;
3527 
3528 	if (rw == WRITE) {
3529 		loff_t final_size = offset + count;
3530 
3531 		if (final_size > inode->i_size) {
3532 			/* Credits for sb + inode write */
3533 			handle = ext4_journal_start(inode, 2);
3534 			if (IS_ERR(handle)) {
3535 				ret = PTR_ERR(handle);
3536 				goto out;
3537 			}
3538 			ret = ext4_orphan_add(handle, inode);
3539 			if (ret) {
3540 				ext4_journal_stop(handle);
3541 				goto out;
3542 			}
3543 			orphan = 1;
3544 			ei->i_disksize = inode->i_size;
3545 			ext4_journal_stop(handle);
3546 		}
3547 	}
3548 
3549 retry:
3550 	if (rw == READ && ext4_should_dioread_nolock(inode))
3551 		ret = __blockdev_direct_IO(rw, iocb, inode,
3552 				 inode->i_sb->s_bdev, iov,
3553 				 offset, nr_segs,
3554 				 ext4_get_block, NULL, NULL, 0);
3555 	else {
3556 		ret = blockdev_direct_IO(rw, iocb, inode,
3557 				 inode->i_sb->s_bdev, iov,
3558 				 offset, nr_segs,
3559 				 ext4_get_block, NULL);
3560 
3561 		if (unlikely((rw & WRITE) && ret < 0)) {
3562 			loff_t isize = i_size_read(inode);
3563 			loff_t end = offset + iov_length(iov, nr_segs);
3564 
3565 			if (end > isize)
3566 				vmtruncate(inode, isize);
3567 		}
3568 	}
3569 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3570 		goto retry;
3571 
3572 	if (orphan) {
3573 		int err;
3574 
3575 		/* Credits for sb + inode write */
3576 		handle = ext4_journal_start(inode, 2);
3577 		if (IS_ERR(handle)) {
3578 			/* This is really bad luck. We've written the data
3579 			 * but cannot extend i_size. Bail out and pretend
3580 			 * the write failed... */
3581 			ret = PTR_ERR(handle);
3582 			if (inode->i_nlink)
3583 				ext4_orphan_del(NULL, inode);
3584 
3585 			goto out;
3586 		}
3587 		if (inode->i_nlink)
3588 			ext4_orphan_del(handle, inode);
3589 		if (ret > 0) {
3590 			loff_t end = offset + ret;
3591 			if (end > inode->i_size) {
3592 				ei->i_disksize = end;
3593 				i_size_write(inode, end);
3594 				/*
3595 				 * We're going to return a positive `ret'
3596 				 * here due to non-zero-length I/O, so there's
3597 				 * no way of reporting error returns from
3598 				 * ext4_mark_inode_dirty() to userspace.  So
3599 				 * ignore it.
3600 				 */
3601 				ext4_mark_inode_dirty(handle, inode);
3602 			}
3603 		}
3604 		err = ext4_journal_stop(handle);
3605 		if (ret == 0)
3606 			ret = err;
3607 	}
3608 out:
3609 	return ret;
3610 }
3611 
3612 /*
3613  * ext4_get_block used when preparing for a DIO write or buffer write.
3614  * We allocate an uninitialized extent if blocks haven't been allocated.
3615  * The extent will be converted to initialized after the IO is complete.
3616  */
3617 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3618 		   struct buffer_head *bh_result, int create)
3619 {
3620 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3621 		   inode->i_ino, create);
3622 	return _ext4_get_block(inode, iblock, bh_result,
3623 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
3624 }
3625 
3626 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3627 			    ssize_t size, void *private, int ret,
3628 			    bool is_async)
3629 {
3630 	ext4_io_end_t *io_end = iocb->private;
3631 	struct workqueue_struct *wq;
3632 	unsigned long flags;
3633 	struct ext4_inode_info *ei;
3634 
3635 	/* if not async direct IO or DIO with a 0-byte write, just return */
3636 	if (!io_end || !size)
3637 		goto out;
3638 
3639 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
3640 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3641 		  iocb->private, io_end->inode->i_ino, iocb, offset,
3642 		  size);
3643 
3644 	/* if not aio dio with unwritten extents, just free io and return */
3645 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3646 		ext4_free_io_end(io_end);
3647 		iocb->private = NULL;
3648 out:
3649 		if (is_async)
3650 			aio_complete(iocb, ret, 0);
3651 		return;
3652 	}
3653 
3654 	io_end->offset = offset;
3655 	io_end->size = size;
3656 	if (is_async) {
3657 		io_end->iocb = iocb;
3658 		io_end->result = ret;
3659 	}
3660 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3661 
3662 	/* Add the io_end to the per-inode completed aio dio list */
3663 	ei = EXT4_I(io_end->inode);
3664 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3665 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
3666 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3667 
3668 	/* queue the work to convert unwritten extents to written */
3669 	queue_work(wq, &io_end->work);
3670 	iocb->private = NULL;
3671 }
3672 
3673 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3674 {
3675 	ext4_io_end_t *io_end = bh->b_private;
3676 	struct workqueue_struct *wq;
3677 	struct inode *inode;
3678 	unsigned long flags;
3679 
3680 	if (!test_clear_buffer_uninit(bh) || !io_end)
3681 		goto out;
3682 
3683 	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3684 		printk(KERN_ERR "sb umounted, discard end_io request for inode %lu\n",
3685 			io_end->inode->i_ino);
3686 		ext4_free_io_end(io_end);
3687 		goto out;
3688 	}
3689 
3690 	io_end->flag = EXT4_IO_END_UNWRITTEN;
3691 	inode = io_end->inode;
3692 
3693 	/* Add the io_end to the per-inode completed io list */
3694 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3695 	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3696 	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3697 
3698 	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3699 	/* queue the work to convert unwritten extents to written */
3700 	queue_work(wq, &io_end->work);
3701 out:
3702 	bh->b_private = NULL;
3703 	bh->b_end_io = NULL;
3704 	clear_buffer_uninit(bh);
3705 	end_buffer_async_write(bh, uptodate);
3706 }
3707 
3708 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3709 {
3710 	ext4_io_end_t *io_end;
3711 	struct page *page = bh->b_page;
3712 	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3713 	size_t size = bh->b_size;
3714 
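	/*
	 * Editorial note: rather than fail the write on a transient
	 * GFP_ATOMIC allocation failure, the loop below yields the CPU
	 * and retries until an io_end can be allocated.
	 */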
3715 retry:
3716 	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3717 	if (!io_end) {
3718 		if (printk_ratelimit())
3719 			printk(KERN_WARNING "%s: allocation failed\n", __func__);
3720 		schedule();
3721 		goto retry;
3722 	}
3723 	io_end->offset = offset;
3724 	io_end->size = size;
3725 	/*
3726 	 * We need to hold a reference to the page to make sure it
3727 	 * doesn't get evicted before ext4_end_io_work() has a chance
3728 	 * to convert the extent from unwritten to written.
3729 	 */
3730 	io_end->page = page;
3731 	get_page(io_end->page);
3732 
3733 	bh->b_private = io_end;
3734 	bh->b_end_io = ext4_end_io_buffer_write;
3735 	return 0;
3736 }
3737 
3738 /*
3739  * For ext4 extent files, ext4 will do direct-io writes to holes,
3740  * preallocated extents, and writes that extend the file, with no need to
3741  * fall back to buffered IO.
3742  *
3743  * For holes, we fallocate those blocks and mark them as uninitialized.
3744  * If those blocks were preallocated, we make sure they are split, but
3745  * still keep the range to write as uninitialized.
3746  *
3747  * The unwritten extents will be converted to written when DIO is completed.
3748  * For async direct IO, since the IO may still be pending when we return, we
3749  * set up an end_io callback function, which will do the conversion
3750  * when the async direct IO is completed.
3751  *
3752  * If the O_DIRECT write will extend the file then add this inode to the
3753  * orphan list.  So recovery will truncate it back to the original size
3754  * if the machine crashes during the write.
3755  *
3756  */
3757 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3758 			      const struct iovec *iov, loff_t offset,
3759 			      unsigned long nr_segs)
3760 {
3761 	struct file *file = iocb->ki_filp;
3762 	struct inode *inode = file->f_mapping->host;
3763 	ssize_t ret;
3764 	size_t count = iov_length(iov, nr_segs);
3765 
3766 	loff_t final_size = offset + count;
3767 	if (rw == WRITE && final_size <= inode->i_size) {
3768 		/*
3769 		 * We can write directly to holes and to fallocated extents.
3770 		 *
3771 		 * Allocated blocks to fill the hole are marked as uninitialized
3772 		 * to prevent parallel buffered reads from exposing the stale data
3773 		 * before DIO completes the data IO.
3774 		 *
3775 		 * As to previously fallocated extents, ext4 get_block
3776 		 * will just simply mark the buffer mapped but still
3777 		 * keep the extents uninitialized.
3778 		 *
3779 		 * For the non-AIO case, we will convert those unwritten extents
3780 		 * to written after returning from blockdev_direct_IO.
3781 		 *
3782 		 * For async DIO, the conversion needs to be deferred until
3783 		 * the IO is completed. The ext4 end_io callback function
3784 		 * will be called to take care of the conversion work.
3785 		 * Here for the async case, we allocate an io_end structure to
3786 		 * hook to the iocb.
3787 		 */
3788 		iocb->private = NULL;
3789 		EXT4_I(inode)->cur_aio_dio = NULL;
3790 		if (!is_sync_kiocb(iocb)) {
3791 			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3792 			if (!iocb->private)
3793 				return -ENOMEM;
3794 			/*
3795 			 * We save the io structure for the current async
3796 			 * direct IO, so that later ext4_map_blocks()
3797 			 * can flag the io structure if there are
3798 			 * unwritten extents that need to be converted
3799 			 * when the IO is completed.
3800 			 */
3801 			EXT4_I(inode)->cur_aio_dio = iocb->private;
3802 		}
3803 
3804 		ret = blockdev_direct_IO(rw, iocb, inode,
3805 					 inode->i_sb->s_bdev, iov,
3806 					 offset, nr_segs,
3807 					 ext4_get_block_write,
3808 					 ext4_end_io_dio);
3809 		if (iocb->private)
3810 			EXT4_I(inode)->cur_aio_dio = NULL;
3811 		/*
3812 		 * The io_end structure takes a reference to the inode;
3813 		 * that structure needs to be destroyed and the
3814 		 * reference to the inode needs to be dropped when IO is
3815 		 * complete, even for a 0-byte write or a failed one.
3816 		 *
3817 		 * In the successful AIO DIO case, the io_end structure will be
3818 		 * destroyed and the reference to the inode will be dropped
3819 		 * after the end_io callback function is called.
3820 		 *
3821 		 * In the 0-byte write or error case, since
3822 		 * VFS direct IO won't invoke the end_io callback function,
3823 		 * we need to free the end_io structure here.
3824 		 */
3825 		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3826 			ext4_free_io_end(iocb->private);
3827 			iocb->private = NULL;
3828 		} else if (ret > 0 && ext4_test_inode_state(inode,
3829 						EXT4_STATE_DIO_UNWRITTEN)) {
3830 			int err;
3831 			/*
3832 			 * For the non-AIO case, since the IO is already
3833 			 * completed, we can do the conversion right here.
3834 			 */
3835 			err = ext4_convert_unwritten_extents(inode,
3836 							     offset, ret);
3837 			if (err < 0)
3838 				ret = err;
3839 			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3840 		}
3841 		return ret;
3842 	}
3843 
3844 	/* for the case of a write past the end of the file, we fall back to the old way */
3845 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3846 }
3847 
3848 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3849 			      const struct iovec *iov, loff_t offset,
3850 			      unsigned long nr_segs)
3851 {
3852 	struct file *file = iocb->ki_filp;
3853 	struct inode *inode = file->f_mapping->host;
3854 
3855 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3856 		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3857 
3858 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3859 }
3860 
3861 /*
3862  * Pages can be marked dirty completely asynchronously from ext4's journalling
3863  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
3864  * much here because ->set_page_dirty is called under VFS locks.  The page is
3865  * not necessarily locked.
3866  *
3867  * We cannot just dirty the page and leave attached buffers clean, because the
3868  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
3869  * or jbddirty because all the journalling code will explode.
3870  *
3871  * So what we do is to mark the page "pending dirty" and next time writepage
3872  * is called, propagate that into the buffers appropriately.
3873  */
3874 static int ext4_journalled_set_page_dirty(struct page *page)
3875 {
3876 	SetPageChecked(page);
3877 	return __set_page_dirty_nobuffers(page);
3878 }
3879 
3880 static const struct address_space_operations ext4_ordered_aops = {
3881 	.readpage		= ext4_readpage,
3882 	.readpages		= ext4_readpages,
3883 	.writepage		= ext4_writepage,
3884 	.sync_page		= block_sync_page,
3885 	.write_begin		= ext4_write_begin,
3886 	.write_end		= ext4_ordered_write_end,
3887 	.bmap			= ext4_bmap,
3888 	.invalidatepage		= ext4_invalidatepage,
3889 	.releasepage		= ext4_releasepage,
3890 	.direct_IO		= ext4_direct_IO,
3891 	.migratepage		= buffer_migrate_page,
3892 	.is_partially_uptodate  = block_is_partially_uptodate,
3893 	.error_remove_page	= generic_error_remove_page,
3894 };
3895 
3896 static const struct address_space_operations ext4_writeback_aops = {
3897 	.readpage		= ext4_readpage,
3898 	.readpages		= ext4_readpages,
3899 	.writepage		= ext4_writepage,
3900 	.sync_page		= block_sync_page,
3901 	.write_begin		= ext4_write_begin,
3902 	.write_end		= ext4_writeback_write_end,
3903 	.bmap			= ext4_bmap,
3904 	.invalidatepage		= ext4_invalidatepage,
3905 	.releasepage		= ext4_releasepage,
3906 	.direct_IO		= ext4_direct_IO,
3907 	.migratepage		= buffer_migrate_page,
3908 	.is_partially_uptodate  = block_is_partially_uptodate,
3909 	.error_remove_page	= generic_error_remove_page,
3910 };
3911 
3912 static const struct address_space_operations ext4_journalled_aops = {
3913 	.readpage		= ext4_readpage,
3914 	.readpages		= ext4_readpages,
3915 	.writepage		= ext4_writepage,
3916 	.sync_page		= block_sync_page,
3917 	.write_begin		= ext4_write_begin,
3918 	.write_end		= ext4_journalled_write_end,
3919 	.set_page_dirty		= ext4_journalled_set_page_dirty,
3920 	.bmap			= ext4_bmap,
3921 	.invalidatepage		= ext4_invalidatepage,
3922 	.releasepage		= ext4_releasepage,
3923 	.is_partially_uptodate  = block_is_partially_uptodate,
3924 	.error_remove_page	= generic_error_remove_page,
3925 };
3926 
3927 static const struct address_space_operations ext4_da_aops = {
3928 	.readpage		= ext4_readpage,
3929 	.readpages		= ext4_readpages,
3930 	.writepage		= ext4_writepage,
3931 	.writepages		= ext4_da_writepages,
3932 	.sync_page		= block_sync_page,
3933 	.write_begin		= ext4_da_write_begin,
3934 	.write_end		= ext4_da_write_end,
3935 	.bmap			= ext4_bmap,
3936 	.invalidatepage		= ext4_da_invalidatepage,
3937 	.releasepage		= ext4_releasepage,
3938 	.direct_IO		= ext4_direct_IO,
3939 	.migratepage		= buffer_migrate_page,
3940 	.is_partially_uptodate  = block_is_partially_uptodate,
3941 	.error_remove_page	= generic_error_remove_page,
3942 };
3943 
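/*
 * Editorial summary of the selection below: mounts with delalloc
 * enabled get ext4_da_aops whether the inode is in ordered or
 * writeback mode; otherwise ordered mode gets ext4_ordered_aops and
 * writeback mode gets ext4_writeback_aops.  data=journal inodes get
 * ext4_journalled_aops, which provides neither delayed allocation nor
 * direct I/O.
 */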
3944 void ext4_set_aops(struct inode *inode)
3945 {
3946 	if (ext4_should_order_data(inode) &&
3947 		test_opt(inode->i_sb, DELALLOC))
3948 		inode->i_mapping->a_ops = &ext4_da_aops;
3949 	else if (ext4_should_order_data(inode))
3950 		inode->i_mapping->a_ops = &ext4_ordered_aops;
3951 	else if (ext4_should_writeback_data(inode) &&
3952 		 test_opt(inode->i_sb, DELALLOC))
3953 		inode->i_mapping->a_ops = &ext4_da_aops;
3954 	else if (ext4_should_writeback_data(inode))
3955 		inode->i_mapping->a_ops = &ext4_writeback_aops;
3956 	else
3957 		inode->i_mapping->a_ops = &ext4_journalled_aops;
3958 }
3959 
3960 /*
3961  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3962  * up to the end of the block which corresponds to `from'.
3963  * This is required during truncate. We need to physically zero the tail end
3964  * of that block so it doesn't yield old data if the file is later grown.
3965  */
3966 int ext4_block_truncate_page(handle_t *handle,
3967 		struct address_space *mapping, loff_t from)
3968 {
3969 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3970 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
3971 	unsigned blocksize, length, pos;
3972 	ext4_lblk_t iblock;
3973 	struct inode *inode = mapping->host;
3974 	struct buffer_head *bh;
3975 	struct page *page;
3976 	int err = 0;
3977 
3978 	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3979 				   mapping_gfp_mask(mapping) & ~__GFP_FS);
3980 	if (!page)
3981 		return -EINVAL;
3982 
3983 	blocksize = inode->i_sb->s_blocksize;
3984 	length = blocksize - (offset & (blocksize - 1));
3985 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
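	/*
	 * Worked example (editorial): with 4KiB pages and blocks,
	 * from = 5000 gives offset = 904 and length = 4096 - 904 = 3192,
	 * i.e. we zero bytes 904..4095 of the page -- everything from
	 * `from' to the end of its block.
	 */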
3986 
3987 	if (!page_has_buffers(page))
3988 		create_empty_buffers(page, blocksize, 0);
3989 
3990 	/* Find the buffer that contains "offset" */
3991 	bh = page_buffers(page);
3992 	pos = blocksize;
3993 	while (offset >= pos) {
3994 		bh = bh->b_this_page;
3995 		iblock++;
3996 		pos += blocksize;
3997 	}
3998 
3999 	err = 0;
4000 	if (buffer_freed(bh)) {
4001 		BUFFER_TRACE(bh, "freed: skip");
4002 		goto unlock;
4003 	}
4004 
4005 	if (!buffer_mapped(bh)) {
4006 		BUFFER_TRACE(bh, "unmapped");
4007 		ext4_get_block(inode, iblock, bh, 0);
4008 		/* unmapped? It's a hole - nothing to do */
4009 		if (!buffer_mapped(bh)) {
4010 			BUFFER_TRACE(bh, "still unmapped");
4011 			goto unlock;
4012 		}
4013 	}
4014 
4015 	/* Ok, it's mapped. Make sure it's up-to-date */
4016 	if (PageUptodate(page))
4017 		set_buffer_uptodate(bh);
4018 
4019 	if (!buffer_uptodate(bh)) {
4020 		err = -EIO;
4021 		ll_rw_block(READ, 1, &bh);
4022 		wait_on_buffer(bh);
4023 		/* Uhhuh. Read error. Complain and punt. */
4024 		if (!buffer_uptodate(bh))
4025 			goto unlock;
4026 	}
4027 
4028 	if (ext4_should_journal_data(inode)) {
4029 		BUFFER_TRACE(bh, "get write access");
4030 		err = ext4_journal_get_write_access(handle, bh);
4031 		if (err)
4032 			goto unlock;
4033 	}
4034 
4035 	zero_user(page, offset, length);
4036 
4037 	BUFFER_TRACE(bh, "zeroed end of block");
4038 
4039 	err = 0;
4040 	if (ext4_should_journal_data(inode)) {
4041 		err = ext4_handle_dirty_metadata(handle, inode, bh);
4042 	} else {
4043 		if (ext4_should_order_data(inode))
4044 			err = ext4_jbd2_file_inode(handle, inode);
4045 		mark_buffer_dirty(bh);
4046 	}
4047 
4048 unlock:
4049 	unlock_page(page);
4050 	page_cache_release(page);
4051 	return err;
4052 }
4053 
4054 /*
4055  * Probably it should be a library function... search for first non-zero word
4056  * or memcmp with zero_page, whatever is better for particular architecture.
4057  * Linus?
4058  */
4059 static inline int all_zeroes(__le32 *p, __le32 *q)
4060 {
4061 	while (p < q)
4062 		if (*p++)
4063 			return 0;
4064 	return 1;
4065 }
4066 
4067 /**
4068  *	ext4_find_shared - find the indirect blocks for partial truncation.
4069  *	@inode:	  inode in question
4070  *	@depth:	  depth of the affected branch
4071  *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
4072  *	@chain:	  place to store the pointers to partial indirect blocks
4073  *	@top:	  place to the (detached) top of branch
4074  *
4075  *	This is a helper function used by ext4_truncate().
4076  *
4077  *	When we do truncate() we may have to clean the ends of several
4078  *	indirect blocks but leave the blocks themselves alive. Block is
4079  *	partially truncated if some data below the new i_size is refered
4080  *	from it (and it is on the path to the first completely truncated
4081  *	data block, indeed).  We have to free the top of that path along
4082  *	with everything to the right of the path. Since no allocation
4083  *	past the truncation point is possible until ext4_truncate()
4084  *	finishes, we may safely do the latter, but top of branch may
4085  *	require special attention - pageout below the truncation point
4086  *	might try to populate it.
4087  *
4088  *	We atomically detach the top of branch from the tree, store the
4089  *	block number of its root in *@top, pointers to buffer_heads of
4090  *	partially truncated blocks - in @chain[].bh and pointers to
4091  *	their last elements that should not be removed - in
4092  *	@chain[].p. Return value is the pointer to last filled element
4093  *	of @chain.
4094  *
4095  *	The work left to caller to do the actual freeing of subtrees:
4096  *		a) free the subtree starting from *@top
4097  *		b) free the subtrees whose roots are stored in
4098  *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
4099  *		c) free the subtrees growing from the inode past the @chain[0].
4100  *			(no partially truncated stuff there).  */
4101 
4102 static Indirect *ext4_find_shared(struct inode *inode, int depth,
4103 				  ext4_lblk_t offsets[4], Indirect chain[4],
4104 				  __le32 *top)
4105 {
4106 	Indirect *partial, *p;
4107 	int k, err;
4108 
4109 	*top = 0;
4110 	/* Make k index the deepest non-null offset + 1 */
4111 	for (k = depth; k > 1 && !offsets[k-1]; k--)
4112 		;
4113 	partial = ext4_get_branch(inode, k, offsets, chain, &err);
4114 	/* Writer: pointers */
4115 	if (!partial)
4116 		partial = chain + k-1;
4117 	/*
4118 	 * If the branch acquired continuation since we've looked at it -
4119 	 * fine, it should all survive and (new) top doesn't belong to us.
4120 	 */
4121 	if (!partial->key && *partial->p)
4122 		/* Writer: end */
4123 		goto no_top;
4124 	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4125 		;
4126 	/*
4127 	 * OK, we've found the last block that must survive. The rest of our
4128 	 * branch should be detached before unlocking. However, if that rest
4129 	 * of branch is all ours and does not grow immediately from the inode
4130 	 * it's easier to cheat and just decrement partial->p.
4131 	 */
4132 	if (p == chain + k - 1 && p > chain) {
4133 		p->p--;
4134 	} else {
4135 		*top = *p->p;
4136 		/* Nope, don't do this in ext4.  Must leave the tree intact */
4137 #if 0
4138 		*p->p = 0;
4139 #endif
4140 	}
4141 	/* Writer: end */
4142 
4143 	while (partial > p) {
4144 		brelse(partial->bh);
4145 		partial--;
4146 	}
4147 no_top:
4148 	return partial;
4149 }
4150 
4151 /*
4152  * Zero a number of block pointers in either an inode or an indirect block.
4153  * If we restart the transaction we must again get write access to the
4154  * indirect block for further modification.
4155  *
4156  * We release `count' blocks on disk, but (last - first) may be greater
4157  * than `count' because there can be holes in there.
4158  */
4159 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4160 			     struct buffer_head *bh,
4161 			     ext4_fsblk_t block_to_free,
4162 			     unsigned long count, __le32 *first,
4163 			     __le32 *last)
4164 {
4165 	__le32 *p;
4166 	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4167 
4168 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4169 		flags |= EXT4_FREE_BLOCKS_METADATA;
4170 
4171 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4172 				   count)) {
4173 		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4174 				 "blocks %llu len %lu",
4175 				 (unsigned long long) block_to_free, count);
4176 		return 1;
4177 	}
4178 
4179 	if (try_to_extend_transaction(handle, inode)) {
4180 		if (bh) {
4181 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4182 			ext4_handle_dirty_metadata(handle, inode, bh);
4183 		}
4184 		ext4_mark_inode_dirty(handle, inode);
4185 		ext4_truncate_restart_trans(handle, inode,
4186 					    blocks_for_truncate(inode));
4187 		if (bh) {
4188 			BUFFER_TRACE(bh, "retaking write access");
4189 			ext4_journal_get_write_access(handle, bh);
4190 		}
4191 	}
4192 
4193 	for (p = first; p < last; p++)
4194 		*p = 0;
4195 
4196 	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4197 	return 0;
4198 }
4199 
4200 /**
4201  * ext4_free_data - free a list of data blocks
4202  * @handle:	handle for this transaction
4203  * @inode:	inode we are dealing with
4204  * @this_bh:	indirect buffer_head which contains *@first and *@last
4205  * @first:	array of block numbers
4206  * @last:	points immediately past the end of array
4207  *
4208  * We are freeing all blocks referenced from that array (numbers are stored as
4209  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4210  *
4211  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
4212  * blocks are contiguous then releasing them at one time will only affect one
4213  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4214  * actually use a lot of journal space.
4215  *
4216  * @this_bh will be %NULL if @first and @last point into the inode's direct
4217  * block pointers.
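 *
 * Example (editorial, not in the original source): if *@first..*@last
 * holds the pointer values {100, 101, 102, 0, 103, 200}, the loop
 * below coalesces the first five entries into a single run of four
 * blocks starting at 100 (the zero is a hole and is skipped, and it
 * does not break the run, since 103 == 100 + 3); {200} is then freed
 * on its own.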
4218  */
4219 static void ext4_free_data(handle_t *handle, struct inode *inode,
4220 			   struct buffer_head *this_bh,
4221 			   __le32 *first, __le32 *last)
4222 {
4223 	ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
4224 	unsigned long count = 0;	    /* Number of blocks in the run */
4225 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
4226 					       corresponding to
4227 					       block_to_free */
4228 	ext4_fsblk_t nr;		    /* Current block # */
4229 	__le32 *p;			    /* Pointer into inode/ind
4230 					       for current block */
4231 	int err;
4232 
4233 	if (this_bh) {				/* For indirect block */
4234 		BUFFER_TRACE(this_bh, "get_write_access");
4235 		err = ext4_journal_get_write_access(handle, this_bh);
4236 		/* Important: if we can't update the indirect pointers
4237 		 * to the blocks, we can't free them. */
4238 		if (err)
4239 			return;
4240 	}
4241 
4242 	for (p = first; p < last; p++) {
4243 		nr = le32_to_cpu(*p);
4244 		if (nr) {
4245 			/* accumulate blocks to free if they're contiguous */
4246 			if (count == 0) {
4247 				block_to_free = nr;
4248 				block_to_free_p = p;
4249 				count = 1;
4250 			} else if (nr == block_to_free + count) {
4251 				count++;
4252 			} else {
4253 				if (ext4_clear_blocks(handle, inode, this_bh,
4254 						      block_to_free, count,
4255 						      block_to_free_p, p))
4256 					break;
4257 				block_to_free = nr;
4258 				block_to_free_p = p;
4259 				count = 1;
4260 			}
4261 		}
4262 	}
4263 
4264 	if (count > 0)
4265 		ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4266 				  count, block_to_free_p, p);
4267 
4268 	if (this_bh) {
4269 		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4270 
4271 		/*
4272 		 * The buffer head should have an attached journal head at this
4273 		 * point. However, if the data is corrupted and an indirect
4274 		 * block pointed to itself, it would have been detached when
4275 		 * the block was cleared. Check for this instead of OOPSing.
4276 		 */
4277 		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4278 			ext4_handle_dirty_metadata(handle, inode, this_bh);
4279 		else
4280 			EXT4_ERROR_INODE(inode,
4281 					 "circular indirect block detected at "
4282 					 "block %llu",
4283 				(unsigned long long) this_bh->b_blocknr);
4284 	}
4285 }
4286 
4287 /**
4288  *	ext4_free_branches - free an array of branches
4289  *	@handle: JBD handle for this transaction
4290  *	@inode:	inode we are dealing with
4291  *	@parent_bh: the buffer_head which contains *@first and *@last
4292  *	@first:	array of block numbers
4293  *	@last:	pointer immediately past the end of array
4294  *	@depth:	depth of the branches to free
4295  *
4296  *	We are freeing all blocks referenced from these branches (numbers are
4297  *	stored as little-endian 32-bit) and updating @inode->i_blocks
4298  *	appropriately.
4299  */
4300 static void ext4_free_branches(handle_t *handle, struct inode *inode,
4301 			       struct buffer_head *parent_bh,
4302 			       __le32 *first, __le32 *last, int depth)
4303 {
4304 	ext4_fsblk_t nr;
4305 	__le32 *p;
4306 
4307 	if (ext4_handle_is_aborted(handle))
4308 		return;
4309 
4310 	if (depth--) {
4311 		struct buffer_head *bh;
4312 		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4313 		p = last;
4314 		while (--p >= first) {
4315 			nr = le32_to_cpu(*p);
4316 			if (!nr)
4317 				continue;		/* A hole */
4318 
4319 			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4320 						   nr, 1)) {
4321 				EXT4_ERROR_INODE(inode,
4322 						 "invalid indirect mapped "
4323 						 "block %lu (level %d)",
4324 						 (unsigned long) nr, depth);
4325 				break;
4326 			}
4327 
4328 			/* Go read the buffer for the next level down */
4329 			bh = sb_bread(inode->i_sb, nr);
4330 
4331 			/*
4332 			 * A read failure? Report error and clear slot
4333 			 * (should be rare).
4334 			 */
4335 			if (!bh) {
4336 				EXT4_ERROR_INODE_BLOCK(inode, nr,
4337 						       "Read failure");
4338 				continue;
4339 			}
4340 
4341 			/* This zaps the entire block.  Bottom up. */
4342 			BUFFER_TRACE(bh, "free child branches");
4343 			ext4_free_branches(handle, inode, bh,
4344 					(__le32 *) bh->b_data,
4345 					(__le32 *) bh->b_data + addr_per_block,
4346 					depth);
4347 
4348 			/*
4349 			 * Everything below this pointer has been
4350 			 * released.  Now let this top-of-subtree go.
4351 			 *
4352 			 * We want the freeing of this indirect block to be
4353 			 * atomic in the journal with the updating of the
4354 			 * bitmap block which owns it.  So make some room in
4355 			 * the journal.
4356 			 *
4357 			 * We zero the parent pointer *after* freeing its
4358 			 * pointee in the bitmaps, so if extend_transaction()
4359 			 * for some reason fails to put the bitmap changes and
4360 			 * the release into the same transaction, recovery
4361 			 * will merely complain about releasing a free block,
4362 			 * rather than leaking blocks.
4363 			 */
4364 			if (ext4_handle_is_aborted(handle))
4365 				return;
4366 			if (try_to_extend_transaction(handle, inode)) {
4367 				ext4_mark_inode_dirty(handle, inode);
4368 				ext4_truncate_restart_trans(handle, inode,
4369 					    blocks_for_truncate(inode));
4370 			}
4371 
4372 			/*
4373 			 * The forget flag here is critical because if
4374 			 * we are journaling (and not doing data
4375 			 * journaling), we have to make sure a revoke
4376 			 * record is written to prevent the journal
4377 			 * replay from overwriting the (former)
4378 			 * indirect block if it gets reallocated as a
4379 			 * data block.  This must happen in the same
4380 			 * transaction where the data blocks are
4381 			 * actually freed.
4382 			 */
4383 			ext4_free_blocks(handle, inode, 0, nr, 1,
4384 					 EXT4_FREE_BLOCKS_METADATA|
4385 					 EXT4_FREE_BLOCKS_FORGET);
4386 
4387 			if (parent_bh) {
4388 				/*
4389 				 * The block which we have just freed is
4390 				 * pointed to by an indirect block: journal it
4391 				 */
4392 				BUFFER_TRACE(parent_bh, "get_write_access");
4393 				if (!ext4_journal_get_write_access(handle,
4394 								   parent_bh)){
4395 					*p = 0;
4396 					BUFFER_TRACE(parent_bh,
4397 					"call ext4_handle_dirty_metadata");
4398 					ext4_handle_dirty_metadata(handle,
4399 								   inode,
4400 								   parent_bh);
4401 				}
4402 			}
4403 		}
4404 	} else {
4405 		/* We have reached the bottom of the tree. */
4406 		BUFFER_TRACE(parent_bh, "free data blocks");
4407 		ext4_free_data(handle, inode, parent_bh, first, last);
4408 	}
4409 }
4410 
4411 int ext4_can_truncate(struct inode *inode)
4412 {
4413 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4414 		return 0;
4415 	if (S_ISREG(inode->i_mode))
4416 		return 1;
4417 	if (S_ISDIR(inode->i_mode))
4418 		return 1;
4419 	if (S_ISLNK(inode->i_mode))
4420 		return !ext4_inode_is_fast_symlink(inode);
4421 	return 0;
4422 }
4423 
4424 /*
4425  * ext4_truncate()
4426  *
4427  * We block out ext4_get_block() block instantiations across the entire
4428  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4429  * simultaneously on behalf of the same inode.
4430  *
4431  * As we work through the truncate and commit bits of it to the journal there
4432  * is one core, guiding principle: the file's tree must always be consistent on
4433  * disk.  We must be able to restart the truncate after a crash.
4434  *
4435  * The file's tree may be transiently inconsistent in memory (although it
4436  * probably isn't), but whenever we close off and commit a journal transaction,
4437  * the contents of (the filesystem + the journal) must be consistent and
4438  * restartable.  It's pretty simple, really: bottom up, right to left (although
4439  * left-to-right works OK too).
4440  *
4441  * Note that at recovery time, journal replay occurs *before* the restart of
4442  * truncate against the orphan inode list.
4443  *
4444  * The committed inode has the new, desired i_size (which is the same as
4445  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
4446  * that this inode's truncate did not complete and it will again call
4447  * ext4_truncate() to have another go.  So there will be instantiated blocks
4448  * to the right of the truncation point in a crashed ext4 filesystem.  But
4449  * that's fine - as long as they are linked from the inode, the post-crash
4450  * ext4_truncate() run will find them and release them.
4451  */
4452 void ext4_truncate(struct inode *inode)
4453 {
4454 	handle_t *handle;
4455 	struct ext4_inode_info *ei = EXT4_I(inode);
4456 	__le32 *i_data = ei->i_data;
4457 	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4458 	struct address_space *mapping = inode->i_mapping;
4459 	ext4_lblk_t offsets[4];
4460 	Indirect chain[4];
4461 	Indirect *partial;
4462 	__le32 nr = 0;
4463 	int n;
4464 	ext4_lblk_t last_block;
4465 	unsigned blocksize = inode->i_sb->s_blocksize;
4466 
4467 	if (!ext4_can_truncate(inode))
4468 		return;
4469 
4470 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4471 
4472 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4473 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4474 
4475 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4476 		ext4_ext_truncate(inode);
4477 		return;
4478 	}
4479 
4480 	handle = start_transaction(inode);
4481 	if (IS_ERR(handle))
4482 		return;		/* AKPM: return what? */
4483 
4484 	last_block = (inode->i_size + blocksize-1)
4485 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4486 
4487 	if (inode->i_size & (blocksize - 1))
4488 		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4489 			goto out_stop;
4490 
4491 	n = ext4_block_to_path(inode, last_block, offsets, NULL);
4492 	if (n == 0)
4493 		goto out_stop;	/* error */
4494 
4495 	/*
4496 	 * OK.  This truncate is going to happen.  We add the inode to the
4497 	 * orphan list, so that if this truncate spans multiple transactions,
4498 	 * and we crash, we will resume the truncate when the filesystem
4499 	 * recovers.  It also marks the inode dirty, to catch the new size.
4500 	 *
4501 	 * Implication: the file must always be in a sane, consistent
4502 	 * truncatable state while each transaction commits.
4503 	 */
4504 	if (ext4_orphan_add(handle, inode))
4505 		goto out_stop;
4506 
4507 	/*
4508 	 * From here we block out all ext4_get_block() callers who want to
4509 	 * modify the block allocation tree.
4510 	 */
4511 	down_write(&ei->i_data_sem);
4512 
4513 	ext4_discard_preallocations(inode);
4514 
4515 	/*
4516 	 * The orphan list entry will now protect us from any crash which
4517 	 * occurs before the truncate completes, so it is now safe to propagate
4518 	 * the new, shorter inode size (held for now in i_size) into the
4519 	 * on-disk inode. We do this via i_disksize, which is the value which
4520 	 * ext4 *really* writes onto the disk inode.
4521 	 */
4522 	ei->i_disksize = inode->i_size;
4523 
4524 	if (n == 1) {		/* direct blocks */
4525 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4526 			       i_data + EXT4_NDIR_BLOCKS);
4527 		goto do_indirects;
4528 	}
4529 
4530 	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4531 	/* Kill the top of shared branch (not detached) */
4532 	if (nr) {
4533 		if (partial == chain) {
4534 			/* Shared branch grows from the inode */
4535 			ext4_free_branches(handle, inode, NULL,
4536 					   &nr, &nr+1, (chain+n-1) - partial);
4537 			*partial->p = 0;
4538 			/*
4539 			 * We mark the inode dirty prior to restart,
4540 			 * and prior to stop.  No need for it here.
4541 			 */
4542 		} else {
4543 			/* Shared branch grows from an indirect block */
4544 			BUFFER_TRACE(partial->bh, "get_write_access");
4545 			ext4_free_branches(handle, inode, partial->bh,
4546 					partial->p,
4547 					partial->p+1, (chain+n-1) - partial);
4548 		}
4549 	}
4550 	/* Clear the ends of indirect blocks on the shared branch */
4551 	while (partial > chain) {
4552 		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4553 				   (__le32*)partial->bh->b_data+addr_per_block,
4554 				   (chain+n-1) - partial);
4555 		BUFFER_TRACE(partial->bh, "call brelse");
4556 		brelse(partial->bh);
4557 		partial--;
4558 	}
4559 do_indirects:
4560 	/* Kill the remaining (whole) subtrees */
4561 	switch (offsets[0]) {
4562 	default:
4563 		nr = i_data[EXT4_IND_BLOCK];
4564 		if (nr) {
4565 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4566 			i_data[EXT4_IND_BLOCK] = 0;
4567 		}
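		/* fall through */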
4568 	case EXT4_IND_BLOCK:
4569 		nr = i_data[EXT4_DIND_BLOCK];
4570 		if (nr) {
4571 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4572 			i_data[EXT4_DIND_BLOCK] = 0;
4573 		}
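		/* fall through */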
4574 	case EXT4_DIND_BLOCK:
4575 		nr = i_data[EXT4_TIND_BLOCK];
4576 		if (nr) {
4577 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4578 			i_data[EXT4_TIND_BLOCK] = 0;
4579 		}
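		/* fall through */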
4580 	case EXT4_TIND_BLOCK:
4581 		;
4582 	}
4583 
4584 	up_write(&ei->i_data_sem);
4585 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4586 	ext4_mark_inode_dirty(handle, inode);
4587 
4588 	/*
4589 	 * In a multi-transaction truncate, we only make the final transaction
4590 	 * synchronous
4591 	 */
4592 	if (IS_SYNC(inode))
4593 		ext4_handle_sync(handle);
4594 out_stop:
4595 	/*
4596 	 * If this was a simple ftruncate(), and the file will remain alive
4597 	 * then we need to clear up the orphan record which we created above.
4598 	 * However, if this was a real unlink then we were called by
4599 	 * ext4_delete_inode(), and we allow that function to clean up the
4600 	 * orphan info for us.
4601 	 */
4602 	if (inode->i_nlink)
4603 		ext4_orphan_del(handle, inode);
4604 
4605 	ext4_journal_stop(handle);
4606 }
4607 
4608 /*
4609  * ext4_get_inode_loc returns with an extra refcount against the inode's
4610  * underlying buffer_head on success. If 'in_mem' is true, we have all
4611  * data in memory that is needed to recreate the on-disk version of this
4612  * inode.
4613  */
4614 static int __ext4_get_inode_loc(struct inode *inode,
4615 				struct ext4_iloc *iloc, int in_mem)
4616 {
4617 	struct ext4_group_desc	*gdp;
4618 	struct buffer_head	*bh;
4619 	struct super_block	*sb = inode->i_sb;
4620 	ext4_fsblk_t		block;
4621 	int			inodes_per_block, inode_offset;
4622 
4623 	iloc->bh = NULL;
4624 	if (!ext4_valid_inum(sb, inode->i_ino))
4625 		return -EIO;
4626 
4627 	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4628 	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4629 	if (!gdp)
4630 		return -EIO;
4631 
4632 	/*
4633 	 * Figure out the offset within the block group inode table
4634 	 */
4635 	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
4636 	inode_offset = ((inode->i_ino - 1) %
4637 			EXT4_INODES_PER_GROUP(sb));
4638 	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4639 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4640 
4641 	bh = sb_getblk(sb, block);
4642 	if (!bh) {
4643 		EXT4_ERROR_INODE_BLOCK(inode, block,
4644 				       "unable to read itable block");
4645 		return -EIO;
4646 	}
4647 	if (!buffer_uptodate(bh)) {
4648 		lock_buffer(bh);
4649 
4650 		/*
4651 		 * If the buffer has the write error flag, we have failed
4652 		 * to write out another inode in the same block.  In this
4653 		 * case, we don't have to read the block because we may
4654 		 * read the old inode data successfully.
4655 		 */
4656 		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4657 			set_buffer_uptodate(bh);
4658 
4659 		if (buffer_uptodate(bh)) {
4660 			/* someone brought it uptodate while we waited */
4661 			unlock_buffer(bh);
4662 			goto has_buffer;
4663 		}
4664 
4665 		/*
4666 		 * If we have all information of the inode in memory and this
4667 		 * is the only valid inode in the block, we need not read the
4668 		 * block.
4669 		 */
4670 		if (in_mem) {
4671 			struct buffer_head *bitmap_bh;
4672 			int i, start;
4673 
4674 			start = inode_offset & ~(inodes_per_block - 1);
4675 
4676 			/* Is the inode bitmap in cache? */
4677 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4678 			if (!bitmap_bh)
4679 				goto make_io;
4680 
4681 			/*
4682 			 * If the inode bitmap isn't in cache then the
4683 			 * optimisation may end up performing two reads instead
4684 			 * of one, so skip it.
4685 			 */
4686 			if (!buffer_uptodate(bitmap_bh)) {
4687 				brelse(bitmap_bh);
4688 				goto make_io;
4689 			}
4690 			for (i = start; i < start + inodes_per_block; i++) {
4691 				if (i == inode_offset)
4692 					continue;
4693 				if (ext4_test_bit(i, bitmap_bh->b_data))
4694 					break;
4695 			}
4696 			brelse(bitmap_bh);
4697 			if (i == start + inodes_per_block) {
4698 				/* all other inodes are free, so skip I/O */
4699 				memset(bh->b_data, 0, bh->b_size);
4700 				set_buffer_uptodate(bh);
4701 				unlock_buffer(bh);
4702 				goto has_buffer;
4703 			}
4704 		}
4705 
4706 make_io:
4707 		/*
4708 		 * If we need to do any I/O, try to pre-readahead extra
4709 		 * blocks from the inode table.
4710 		 */
4711 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
4712 			ext4_fsblk_t b, end, table;
4713 			unsigned num;
4714 
4715 			table = ext4_inode_table(sb, gdp);
4716 			/* s_inode_readahead_blks is always a power of 2 */
4717 			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4718 			if (table > b)
4719 				b = table;
4720 			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4721 			num = EXT4_INODES_PER_GROUP(sb);
4722 			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4723 				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4724 				num -= ext4_itable_unused_count(sb, gdp);
4725 			table += num / inodes_per_block;
4726 			if (end > table)
4727 				end = table;
4728 			while (b <= end)
4729 				sb_breadahead(sb, b++);
4730 		}
4731 
4732 		/*
4733 		 * There are other valid inodes in the buffer, this inode
4734 		 * has in-inode xattrs, or we don't have this inode in memory.
4735 		 * Read the block from disk.
4736 		 */
4737 		get_bh(bh);
4738 		bh->b_end_io = end_buffer_read_sync;
4739 		submit_bh(READ_META, bh);
4740 		wait_on_buffer(bh);
4741 		if (!buffer_uptodate(bh)) {
4742 			EXT4_ERROR_INODE_BLOCK(inode, block,
4743 					       "unable to read itable block");
4744 			brelse(bh);
4745 			return -EIO;
4746 		}
4747 	}
4748 has_buffer:
4749 	iloc->bh = bh;
4750 	return 0;
4751 }
4752 
4753 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4754 {
4755 	/* We have all inode data except xattrs in memory here. */
4756 	return __ext4_get_inode_loc(inode, iloc,
4757 		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4758 }
4759 
4760 void ext4_set_inode_flags(struct inode *inode)
4761 {
4762 	unsigned int flags = EXT4_I(inode)->i_flags;
4763 
4764 	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4765 	if (flags & EXT4_SYNC_FL)
4766 		inode->i_flags |= S_SYNC;
4767 	if (flags & EXT4_APPEND_FL)
4768 		inode->i_flags |= S_APPEND;
4769 	if (flags & EXT4_IMMUTABLE_FL)
4770 		inode->i_flags |= S_IMMUTABLE;
4771 	if (flags & EXT4_NOATIME_FL)
4772 		inode->i_flags |= S_NOATIME;
4773 	if (flags & EXT4_DIRSYNC_FL)
4774 		inode->i_flags |= S_DIRSYNC;
4775 }
4776 
4777 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4778 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4779 {
4780 	unsigned int vfs_fl;
4781 	unsigned long old_fl, new_fl;
4782 
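	/*
	 * Update i_flags locklessly: recompute the new flag word from a
	 * snapshot of the VFS flags, and retry with cmpxchg() if i_flags
	 * changed under us in the meantime.
	 */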
4783 	do {
4784 		vfs_fl = ei->vfs_inode.i_flags;
4785 		old_fl = ei->i_flags;
4786 		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4787 				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4788 				EXT4_DIRSYNC_FL);
4789 		if (vfs_fl & S_SYNC)
4790 			new_fl |= EXT4_SYNC_FL;
4791 		if (vfs_fl & S_APPEND)
4792 			new_fl |= EXT4_APPEND_FL;
4793 		if (vfs_fl & S_IMMUTABLE)
4794 			new_fl |= EXT4_IMMUTABLE_FL;
4795 		if (vfs_fl & S_NOATIME)
4796 			new_fl |= EXT4_NOATIME_FL;
4797 		if (vfs_fl & S_DIRSYNC)
4798 			new_fl |= EXT4_DIRSYNC_FL;
4799 	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4800 }
4801 
4802 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4803 				  struct ext4_inode_info *ei)
4804 {
4805 	blkcnt_t i_blocks ;
4806 	struct inode *inode = &(ei->vfs_inode);
4807 	struct super_block *sb = inode->i_sb;
4808 
4809 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4810 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4811 		/* we are using combined 48 bit field */
4812 		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4813 					le32_to_cpu(raw_inode->i_blocks_lo);
4814 		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4815 			/* i_blocks is in units of the file system block size */
4816 			return i_blocks  << (inode->i_blkbits - 9);
4817 		} else {
4818 			return i_blocks;
4819 		}
4820 	} else {
4821 		return le32_to_cpu(raw_inode->i_blocks_lo);
4822 	}
4823 }
4824 
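/*
 * Worked example (editorial): with HUGE_FILE enabled on the filesystem,
 * an on-disk pair i_blocks_high = 0x0001, i_blocks_lo = 0x23456789
 * decodes to i_blocks = 0x123456789 (in 512-byte units); if the inode
 * additionally carries the HUGE_FILE flag, that count is in filesystem
 * blocks and is shifted left by (i_blkbits - 9) to convert it back to
 * 512-byte units.
 */
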
4825 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4826 {
4827 	struct ext4_iloc iloc;
4828 	struct ext4_inode *raw_inode;
4829 	struct ext4_inode_info *ei;
4830 	struct inode *inode;
4831 	journal_t *journal = EXT4_SB(sb)->s_journal;
4832 	long ret;
4833 	int block;
4834 
4835 	inode = iget_locked(sb, ino);
4836 	if (!inode)
4837 		return ERR_PTR(-ENOMEM);
4838 	if (!(inode->i_state & I_NEW))
4839 		return inode;
4840 
4841 	ei = EXT4_I(inode);
4842 	iloc.bh = NULL;
4843 
4844 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
4845 	if (ret < 0)
4846 		goto bad_inode;
4847 	raw_inode = ext4_raw_inode(&iloc);
4848 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4849 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4850 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4851 	if (!(test_opt(inode->i_sb, NO_UID32))) {
4852 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4853 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4854 	}
4855 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4856 
4857 	ei->i_state_flags = 0;
4858 	ei->i_dir_start_lookup = 0;
4859 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4860 	/* We now have enough fields to check if the inode was active or not.
4861 	 * This is needed because nfsd might try to access dead inodes;
4862 	 * the test is the same one that e2fsck uses.
4863 	 * NeilBrown 1999oct15
4864 	 */
4865 	if (inode->i_nlink == 0) {
4866 		if (inode->i_mode == 0 ||
4867 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4868 			/* this inode is deleted */
4869 			ret = -ESTALE;
4870 			goto bad_inode;
4871 		}
4872 		/* The only unlinked inodes we let through here have
4873 		 * valid i_mode and are being read by the orphan
4874 		 * recovery code: that's fine, we're about to complete
4875 		 * the process of deleting those. */
4876 	}
4877 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4878 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4879 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4880 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4881 		ei->i_file_acl |=
4882 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4883 	inode->i_size = ext4_isize(raw_inode);
4884 	ei->i_disksize = inode->i_size;
4885 #ifdef CONFIG_QUOTA
4886 	ei->i_reserved_quota = 0;
4887 #endif
4888 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4889 	ei->i_block_group = iloc.block_group;
4890 	ei->i_last_alloc_group = ~0;
4891 	/*
4892 	 * NOTE! The in-memory inode i_data array is in little-endian order
4893 	 * even on big-endian machines: we do NOT byteswap the block numbers!
4894 	 */
4895 	for (block = 0; block < EXT4_N_BLOCKS; block++)
4896 		ei->i_data[block] = raw_inode->i_block[block];
4897 	INIT_LIST_HEAD(&ei->i_orphan);
4898 
4899 	/*
4900 	 * Set transaction id's of transactions that have to be committed
4901 	 * to finish f[data]sync. We set them to currently running transaction
4902 	 * as we cannot be sure that the inode or some of its metadata isn't
4903 	 * part of the transaction - the inode could have been reclaimed and
4904 	 * now it is reread from disk.
4905 	 */
4906 	if (journal) {
4907 		transaction_t *transaction;
4908 		tid_t tid;
4909 
4910 		read_lock(&journal->j_state_lock);
4911 		if (journal->j_running_transaction)
4912 			transaction = journal->j_running_transaction;
4913 		else
4914 			transaction = journal->j_committing_transaction;
4915 		if (transaction)
4916 			tid = transaction->t_tid;
4917 		else
4918 			tid = journal->j_commit_sequence;
4919 		read_unlock(&journal->j_state_lock);
4920 		ei->i_sync_tid = tid;
4921 		ei->i_datasync_tid = tid;
4922 	}
4923 
4924 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4925 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4926 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4927 		    EXT4_INODE_SIZE(inode->i_sb)) {
4928 			ret = -EIO;
4929 			goto bad_inode;
4930 		}
4931 		if (ei->i_extra_isize == 0) {
4932 			/* The extra space is currently unused. Use it. */
4933 			ei->i_extra_isize = sizeof(struct ext4_inode) -
4934 					    EXT4_GOOD_OLD_INODE_SIZE;
4935 		} else {
4936 			__le32 *magic = (void *)raw_inode +
4937 					EXT4_GOOD_OLD_INODE_SIZE +
4938 					ei->i_extra_isize;
4939 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4940 				ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4941 		}
4942 	} else
4943 		ei->i_extra_isize = 0;
4944 
4945 	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4946 	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4947 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4948 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4949 
4950 	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4951 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4952 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4953 			inode->i_version |=
4954 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4955 	}
4956 
4957 	ret = 0;
4958 	if (ei->i_file_acl &&
4959 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4960 		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
4961 				 ei->i_file_acl);
4962 		ret = -EIO;
4963 		goto bad_inode;
4964 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4965 		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4966 		    (S_ISLNK(inode->i_mode) &&
4967 		     !ext4_inode_is_fast_symlink(inode)))
4968 			/* Validate extent which is part of inode */
4969 			/* Validate the extents which are part of the inode */
4970 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4971 		   (S_ISLNK(inode->i_mode) &&
4972 		    !ext4_inode_is_fast_symlink(inode))) {
4973 		/* Validate block references which are part of inode */
4974 		/* Validate the block references which are part of the inode */
4975 	}
4976 	if (ret)
4977 		goto bad_inode;
4978 
4979 	if (S_ISREG(inode->i_mode)) {
4980 		inode->i_op = &ext4_file_inode_operations;
4981 		inode->i_fop = &ext4_file_operations;
4982 		ext4_set_aops(inode);
4983 	} else if (S_ISDIR(inode->i_mode)) {
4984 		inode->i_op = &ext4_dir_inode_operations;
4985 		inode->i_fop = &ext4_dir_operations;
4986 	} else if (S_ISLNK(inode->i_mode)) {
4987 		if (ext4_inode_is_fast_symlink(inode)) {
4988 			inode->i_op = &ext4_fast_symlink_inode_operations;
4989 			nd_terminate_link(ei->i_data, inode->i_size,
4990 				sizeof(ei->i_data) - 1);
4991 		} else {
4992 			inode->i_op = &ext4_symlink_inode_operations;
4993 			ext4_set_aops(inode);
4994 		}
4995 	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4996 	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4997 		inode->i_op = &ext4_special_inode_operations;
4998 		if (raw_inode->i_block[0])
4999 			init_special_inode(inode, inode->i_mode,
5000 			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5001 		else
5002 			init_special_inode(inode, inode->i_mode,
5003 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5004 	} else {
5005 		ret = -EIO;
5006 		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5007 		goto bad_inode;
5008 	}
5009 	brelse(iloc.bh);
5010 	ext4_set_inode_flags(inode);
5011 	unlock_new_inode(inode);
5012 	return inode;
5013 
5014 bad_inode:
5015 	brelse(iloc.bh);
5016 	iget_failed(inode);
5017 	return ERR_PTR(ret);
5018 }
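
/*
 * A minimal usage sketch for ext4_iget() above (illustrative only,
 * hence the #if 0; the helper name is hypothetical).  Callers must
 * test the result with IS_ERR(), since the function never returns
 * NULL, and drop the reference with iput() when done.
 */
#if 0
static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = ext4_iget(sb, ino);

	if (IS_ERR(inode))
		return inode;	/* propagate ERR_PTR(-EIO) etc. */
	/* ... use the fully initialised inode ... */
	iput(inode);
	return NULL;
}
#endif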
5019 
5020 static int ext4_inode_blocks_set(handle_t *handle,
5021 				struct ext4_inode *raw_inode,
5022 				struct ext4_inode_info *ei)
5023 {
5024 	struct inode *inode = &(ei->vfs_inode);
5025 	u64 i_blocks = inode->i_blocks;
5026 	struct super_block *sb = inode->i_sb;
5027 
5028 	if (i_blocks <= ~0U) {
5029 		/*
5030 		 * i_blocks can be represented in a 32 bit variable
5031 		 * as a multiple of 512 bytes
5032 		 */
5033 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5034 		raw_inode->i_blocks_high = 0;
5035 		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5036 		return 0;
5037 	}
5038 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5039 		return -EFBIG;
5040 
5041 	if (i_blocks <= 0xffffffffffffULL) {
5042 		/*
5043 		 * i_blocks can be represented in a 48 bit variable
5044 		 * as a multiple of 512 bytes
5045 		 */
5046 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5047 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5048 		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5049 	} else {
5050 		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5051 		/* i_blocks is stored in units of the file system block size */
5052 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
5053 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5054 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5055 	}
5056 	return 0;
5057 }
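
/*
 * A worked example of the encoding above (a sketch, assuming 4KB
 * blocks): a file occupying 3TB has i_blocks = 3TB / 512 =
 * 0x180000000 sectors, which overflows 32 bits but fits in 48
 * (i_blocks_lo = 0x80000000, i_blocks_high = 0x1), so the HUGE_FILE
 * feature is required while the per-inode flag stays clear.  Only
 * when 48 bits of 512-byte units would overflow is i_blocks kept in
 * filesystem-block units with the inode flag set.  The hypothetical
 * helper below mirrors how ext4_inode_blocks() reverses this:
 */
#if 0
static u64 example_decode_blocks(__le32 lo, __le16 hi, int huge_flag,
				 unsigned int blkbits)
{
	u64 blocks = ((u64)le16_to_cpu(hi) << 32) | le32_to_cpu(lo);

	if (huge_flag)			/* stored in fs blocks, so... */
		blocks <<= blkbits - 9;	/* ...convert to 512-byte units */
	return blocks;
}
#endif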
5058 
5059 /*
5060  * Post the struct inode info into an on-disk inode location in the
5061  * buffer-cache.  This gobbles the caller's reference to the
5062  * buffer_head in the inode location struct.
5063  *
5064  * The caller must have write access to iloc->bh.
5065  */
5066 static int ext4_do_update_inode(handle_t *handle,
5067 				struct inode *inode,
5068 				struct ext4_iloc *iloc)
5069 {
5070 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5071 	struct ext4_inode_info *ei = EXT4_I(inode);
5072 	struct buffer_head *bh = iloc->bh;
5073 	int err = 0, rc, block;
5074 
5075 	/* For fields not tracked in the in-memory inode,
5076 	 * initialise them to zero for new inodes. */
5077 	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5078 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5079 
5080 	ext4_get_inode_flags(ei);
5081 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5082 	if (!(test_opt(inode->i_sb, NO_UID32))) {
5083 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5084 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5085 /*
5086  * Fix up interoperability with old kernels. Otherwise, old inodes get
5087  * re-used with the upper 16 bits of the uid/gid intact
5088  */
5089 		if (!ei->i_dtime) {
5090 			raw_inode->i_uid_high =
5091 				cpu_to_le16(high_16_bits(inode->i_uid));
5092 			raw_inode->i_gid_high =
5093 				cpu_to_le16(high_16_bits(inode->i_gid));
5094 		} else {
5095 			raw_inode->i_uid_high = 0;
5096 			raw_inode->i_gid_high = 0;
5097 		}
5098 	} else {
5099 		raw_inode->i_uid_low =
5100 			cpu_to_le16(fs_high2lowuid(inode->i_uid));
5101 		raw_inode->i_gid_low =
5102 			cpu_to_le16(fs_high2lowgid(inode->i_gid));
5103 		raw_inode->i_uid_high = 0;
5104 		raw_inode->i_gid_high = 0;
5105 	}
5106 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5107 
5108 	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5109 	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5110 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5111 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5112 
5113 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
5114 		goto out_brelse;
5115 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5116 	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
5117 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5118 	    cpu_to_le32(EXT4_OS_HURD))
5119 		raw_inode->i_file_acl_high =
5120 			cpu_to_le16(ei->i_file_acl >> 32);
5121 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5122 	ext4_isize_set(raw_inode, ei->i_disksize);
5123 	if (ei->i_disksize > 0x7fffffffULL) {
5124 		struct super_block *sb = inode->i_sb;
5125 		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5126 				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5127 				EXT4_SB(sb)->s_es->s_rev_level ==
5128 				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5129 			/* If this is the first large file
5130 			 * created, add a flag to the superblock.
5131 			 */
5132 			err = ext4_journal_get_write_access(handle,
5133 					EXT4_SB(sb)->s_sbh);
5134 			if (err)
5135 				goto out_brelse;
5136 			ext4_update_dynamic_rev(sb);
5137 			EXT4_SET_RO_COMPAT_FEATURE(sb,
5138 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5139 			sb->s_dirt = 1;
5140 			ext4_handle_sync(handle);
5141 			err = ext4_handle_dirty_metadata(handle, NULL,
5142 					EXT4_SB(sb)->s_sbh);
5143 		}
5144 	}
5145 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5146 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5147 		if (old_valid_dev(inode->i_rdev)) {
5148 			raw_inode->i_block[0] =
5149 				cpu_to_le32(old_encode_dev(inode->i_rdev));
5150 			raw_inode->i_block[1] = 0;
5151 		} else {
5152 			raw_inode->i_block[0] = 0;
5153 			raw_inode->i_block[1] =
5154 				cpu_to_le32(new_encode_dev(inode->i_rdev));
5155 			raw_inode->i_block[2] = 0;
5156 		}
5157 	} else
5158 		for (block = 0; block < EXT4_N_BLOCKS; block++)
5159 			raw_inode->i_block[block] = ei->i_data[block];
5160 
5161 	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5162 	if (ei->i_extra_isize) {
5163 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5164 			raw_inode->i_version_hi =
5165 			cpu_to_le32(inode->i_version >> 32);
5166 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5167 	}
5168 
5169 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5170 	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5171 	if (!err)
5172 		err = rc;
5173 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5174 
5175 	ext4_update_inode_fsync_trans(handle, inode, 0);
5176 out_brelse:
5177 	brelse(bh);
5178 	ext4_std_error(inode->i_sb, err);
5179 	return err;
5180 }
5181 
5182 /*
5183  * ext4_write_inode()
5184  *
5185  * We are called from a few places:
5186  *
5187  * - Within generic_file_write() for O_SYNC files.
5188  *   Here, there will be no transaction running. We wait for any running
5189  *   transaction to commit.
5190  *
5191  * - Within sys_sync(), kupdate and such.
5192  *   We wait on commit, if told to.
5193  *
5194  * - Within prune_icache() (PF_MEMALLOC == true)
5195  *   Here we simply return.  We can't afford to block kswapd on the
5196  *   journal commit.
5197  *
5198  * In all cases it is actually safe for us to return without doing anything,
5199  * because the inode has been copied into a raw inode buffer in
5200  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
5201  * knfsd.
5202  *
5203  * Note that we are absolutely dependent upon all inode dirtiers doing the
5204  * right thing: they *must* call mark_inode_dirty() after dirtying info in
5205  * which we are interested.
5206  *
5207  * It would be a bug for them to not do this.  The code:
5208  *
5209  *	mark_inode_dirty(inode)
5210  *	stuff();
5211  *	inode->i_size = expr;
5212  *
5213  * is in error because a kswapd-driven write_inode() could occur while
5214  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
5215  * will no longer be on the superblock's dirty inode list.
5216  */
5217 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5218 {
5219 	int err;
5220 
5221 	if (current->flags & PF_MEMALLOC)
5222 		return 0;
5223 
5224 	if (EXT4_SB(inode->i_sb)->s_journal) {
5225 		if (ext4_journal_current_handle()) {
5226 			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5227 			dump_stack();
5228 			return -EIO;
5229 		}
5230 
5231 		if (wbc->sync_mode != WB_SYNC_ALL)
5232 			return 0;
5233 
5234 		err = ext4_force_commit(inode->i_sb);
5235 	} else {
5236 		struct ext4_iloc iloc;
5237 
5238 		err = __ext4_get_inode_loc(inode, &iloc, 0);
5239 		if (err)
5240 			return err;
5241 		if (wbc->sync_mode == WB_SYNC_ALL)
5242 			sync_dirty_buffer(iloc.bh);
5243 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5244 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5245 					 "IO error syncing inode");
5246 			err = -EIO;
5247 		}
5248 		brelse(iloc.bh);
5249 	}
5250 	return err;
5251 }
5252 
5253 /*
5254  * ext4_setattr()
5255  *
5256  * Called from notify_change.
5257  *
5258  * We want to trap VFS attempts to truncate the file as soon as
5259  * possible.  In particular, we want to make sure that when the VFS
5260  * shrinks i_size, we put the inode on the orphan list and modify
5261  * i_disksize immediately, so that during the subsequent flushing of
5262  * dirty pages and freeing of disk blocks, we can guarantee that any
5263  * commit will leave the blocks being flushed in an unused state on
5264  * disk.  (On recovery, the inode will get truncated and the blocks will
5265  * be freed, so we have a strong guarantee that no future commit will
5266  * leave these blocks visible to the user.)
5267  *
5268  * Another thing we have to ensure is that if we are in ordered mode
5269  * and the inode is still attached to the committing transaction, we
5270  * must start writeout of all the dirty pages which are being truncated.
5271  * This way we are sure that all the data written in the previous
5272  * transaction are already on disk (truncate waits for pages under
5273  * writeback).
5274  *
5275  * Called with inode->i_mutex down.
5276  */
5277 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5278 {
5279 	struct inode *inode = dentry->d_inode;
5280 	int error, rc = 0;
5281 	int orphan = 0;
5282 	const unsigned int ia_valid = attr->ia_valid;
5283 
5284 	error = inode_change_ok(inode, attr);
5285 	if (error)
5286 		return error;
5287 
5288 	if (is_quota_modification(inode, attr))
5289 		dquot_initialize(inode);
5290 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5291 		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5292 		handle_t *handle;
5293 
5294 		/* (user+group)*(old+new) structure, inode write (sb,
5295 		 * inode block, ? - but truncate inode update has it) */
5296 		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5297 					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5298 		if (IS_ERR(handle)) {
5299 			error = PTR_ERR(handle);
5300 			goto err_out;
5301 		}
5302 		error = dquot_transfer(inode, attr);
5303 		if (error) {
5304 			ext4_journal_stop(handle);
5305 			return error;
5306 		}
5307 		/* Update corresponding info in inode so that everything is in
5308 		 * one transaction */
5309 		if (attr->ia_valid & ATTR_UID)
5310 			inode->i_uid = attr->ia_uid;
5311 		if (attr->ia_valid & ATTR_GID)
5312 			inode->i_gid = attr->ia_gid;
5313 		error = ext4_mark_inode_dirty(handle, inode);
5314 		ext4_journal_stop(handle);
5315 	}
5316 
5317 	if (attr->ia_valid & ATTR_SIZE) {
5318 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5319 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5320 
5321 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
5322 				return -EFBIG;
5323 		}
5324 	}
5325 
5326 	if (S_ISREG(inode->i_mode) &&
5327 	    attr->ia_valid & ATTR_SIZE &&
5328 	    (attr->ia_size < inode->i_size ||
5329 	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5330 		handle_t *handle;
5331 
5332 		handle = ext4_journal_start(inode, 3);
5333 		if (IS_ERR(handle)) {
5334 			error = PTR_ERR(handle);
5335 			goto err_out;
5336 		}
5337 		if (ext4_handle_valid(handle)) {
5338 			error = ext4_orphan_add(handle, inode);
5339 			orphan = 1;
5340 		}
5341 		EXT4_I(inode)->i_disksize = attr->ia_size;
5342 		rc = ext4_mark_inode_dirty(handle, inode);
5343 		if (!error)
5344 			error = rc;
5345 		ext4_journal_stop(handle);
5346 
5347 		if (ext4_should_order_data(inode)) {
5348 			error = ext4_begin_ordered_truncate(inode,
5349 							    attr->ia_size);
5350 			if (error) {
5351 				/* Do as much error cleanup as possible */
5352 				handle = ext4_journal_start(inode, 3);
5353 				if (IS_ERR(handle)) {
5354 					ext4_orphan_del(NULL, inode);
5355 					goto err_out;
5356 				}
5357 				ext4_orphan_del(handle, inode);
5358 				orphan = 0;
5359 				ext4_journal_stop(handle);
5360 				goto err_out;
5361 			}
5362 		}
5363 		/* ext4_truncate will clear the flag */
5364 		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5365 			ext4_truncate(inode);
5366 	}
5367 
5368 	if ((attr->ia_valid & ATTR_SIZE) &&
5369 	    attr->ia_size != i_size_read(inode))
5370 		rc = vmtruncate(inode, attr->ia_size);
5371 
5372 	if (!rc) {
5373 		setattr_copy(inode, attr);
5374 		mark_inode_dirty(inode);
5375 	}
5376 
5377 	/*
5378 	 * If the call to ext4_truncate failed to get a transaction handle at
5379 	 * all, we need to clean up the in-core orphan list manually.
5380 	 */
5381 	if (orphan && inode->i_nlink)
5382 		ext4_orphan_del(NULL, inode);
5383 
5384 	if (!rc && (ia_valid & ATTR_MODE))
5385 		rc = ext4_acl_chmod(inode);
5386 
5387 err_out:
5388 	ext4_std_error(inode->i_sb, error);
5389 	if (!error)
5390 		error = rc;
5391 	return error;
5392 }
5393 
5394 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5395 		 struct kstat *stat)
5396 {
5397 	struct inode *inode;
5398 	unsigned long delalloc_blocks;
5399 
5400 	inode = dentry->d_inode;
5401 	generic_fillattr(inode, stat);
5402 
5403 	/*
5404 	 * We can't update i_blocks if the block allocation is delayed,
5405 	 * otherwise in the case of a system crash before the real block
5406 	 * allocation is done, we will have i_blocks inconsistent with
5407 	 * the on-disk file blocks.
5408 	 * We always keep i_blocks updated together with the real
5409 	 * allocation. But so as not to confuse the user, stat
5410 	 * will return the block count including the delayed allocation
5411 	 * blocks for this file.
5412 	 */
5413 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5414 	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5415 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5416 
5417 	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5418 	return 0;
5419 }
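
/*
 * A worked example of the conversion above (assuming a 4KB block
 * size): with 10 delayed-allocation blocks pending,
 * (10 << 12) >> 9 == 80, so stat reports 80 additional 512-byte
 * sectors even though no physical blocks have been assigned yet.
 */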
5420 
5421 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5422 				      int chunk)
5423 {
5424 	int indirects;
5425 
5426 	/* if nrblocks are contiguous */
5427 	if (chunk) {
5428 		/*
5429 		 * With N contiguous data blocks, we need at most
5430 		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks,
5431 		 * 2 dindirect blocks, and
5432 		 * 1 tindirect block
5433 		 */
5434 		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
5435 		return indirects + 3;
5436 	}
5437 	/*
5438 	 * If the nrblocks are not contiguous, then in the worst case each
5439 	 * block touches an indirect block, and each indirect block touches
5440 	 * a double indirect block, plus a triple indirect block.
5441 	 */
5442 	indirects = nrblocks * 2 + 1;
5443 	return indirects;
5444 }
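
/*
 * Worked examples for the estimates above (a sketch, assuming 4KB
 * blocks, so EXT4_ADDR_PER_BLOCK == 1024): a contiguous chunk of
 * 2048 blocks needs at most 2048/1024 + 3 = 5 index blocks, while 4
 * discontiguous blocks get the pessimistic 4 * 2 + 1 = 9.
 */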
5445 
5446 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5447 {
5448 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5449 		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5450 	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5451 }
5452 
5453 /*
5454  * Account for index blocks, block group bitmaps and block group
5455  * descriptor blocks if both data blocks and index blocks are modified.
5456  * In the worst case, the index blocks spread over different block groups.
5457  *
5458  * If the data blocks are discontiguous, they may spread over
5459  * different block groups too. Even if they are contiguous, with flexbg
5460  * they could still cross a block group boundary.
5461  *
5462  * Also account for superblock, inode, quota and xattr blocks.
5463  */
5464 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5465 {
5466 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5467 	int gdpblocks;
5468 	int idxblocks;
5469 	int ret = 0;
5470 
5471 	/*
5472 	 * How many index blocks do we need to touch to modify nrblocks?
5473 	 * The "chunk" flag indicates whether the nrblocks are
5474 	 * physically contiguous on disk.
5475 	 *
5476 	 * Direct IO and fallocate call get_block to allocate a single
5477 	 * extent at a time, so they can set the "chunk" flag.
5478 	 */
5479 	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5480 
5481 	ret = idxblocks;
5482 
5483 	/*
5484 	 * Now let's see how many group bitmaps and group descriptors need
5485 	 * Now let's see how many group bitmaps and group descriptors need
5486 	 * to be accounted for
5487 	groups = idxblocks;
5488 	if (chunk)
5489 		groups += 1;
5490 	else
5491 		groups += nrblocks;
5492 
5493 	gdpblocks = groups;
5494 	if (groups > ngroups)
5495 		groups = ngroups;
5496 	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5497 		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5498 
5499 	/* bitmaps and block group descriptor blocks */
5500 	ret += groups + gdpblocks;
5501 
5502 	/* Blocks for super block, inode, quota and xattr blocks */
5503 	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5504 
5505 	return ret;
5506 }
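
/*
 * A worked example (a sketch: indirect-mapped inode, 4KB blocks,
 * chunk == 1, nrblocks == 1, on a filesystem with at least four
 * block groups and group descriptor blocks): idxblocks =
 * 1/1024 + 3 = 3, groups = 3 + 1 = 4, gdpblocks = 4, giving
 * 3 + 4 + 4 + EXT4_META_TRANS_BLOCKS(inode->i_sb) credits.
 */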
5507 
5508 /*
5509  * Calculate the total number of credits to reserve to fit
5510  * the modification of a single page into a single transaction,
5511  * which may include multiple chunks of block allocations.
5512  *
5513  * This could be called via ext4_write_begin()
5514  *
5515  * We need to consider the worst case, when
5516  * there is one new block per extent.
5517  */
5518 int ext4_writepage_trans_blocks(struct inode *inode)
5519 {
5520 	int bpp = ext4_journal_blocks_per_page(inode);
5521 	int ret;
5522 
5523 	ret = ext4_meta_trans_blocks(inode, bpp, 0);
5524 
5525 	/* Account for data blocks for journalled mode */
5526 	if (ext4_should_journal_data(inode))
5527 		ret += bpp;
5528 	return ret;
5529 }
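
/*
 * For the common 4KB page / 4KB block configuration (with a journal
 * present), bpp == 1, so the reservation above works out to
 * ext4_meta_trans_blocks(inode, 1, 0), plus one further credit in
 * data=journal mode for the data block itself.
 */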
5530 
5531 /*
5532  * Calculate the journal credits for a chunk of data modification.
5533  *
5534  * This is called from DIO, fallocate or any other caller of
5535  * ext4_map_blocks() that maps/allocates a chunk of contiguous disk blocks.
5536  *
5537  * Journal buffers for data blocks are not included here, as DIO
5538  * and fallocate do not need to journal data buffers.
5539  */
5540 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5541 {
5542 	return ext4_meta_trans_blocks(inode, nrblocks, 1);
5543 }
5544 
5545 /*
5546  * The caller must have previously called ext4_reserve_inode_write().
5547  * Given this, we know that the caller already has write access to iloc->bh.
5548  */
5549 int ext4_mark_iloc_dirty(handle_t *handle,
5550 			 struct inode *inode, struct ext4_iloc *iloc)
5551 {
5552 	int err = 0;
5553 
5554 	if (test_opt(inode->i_sb, I_VERSION))
5555 		inode_inc_iversion(inode);
5556 
5557 	/* ext4_do_update_inode() consumes one bh->b_count */
5558 	get_bh(iloc->bh);
5559 
5560 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5561 	err = ext4_do_update_inode(handle, inode, iloc);
5562 	put_bh(iloc->bh);
5563 	return err;
5564 }
5565 
5566 /*
5567  * On success, we end up with an outstanding reference count against
5568  * iloc->bh.  This _must_ be cleaned up later.
5569  */
5570 
5571 int
5572 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5573 			 struct ext4_iloc *iloc)
5574 {
5575 	int err;
5576 
5577 	err = ext4_get_inode_loc(inode, iloc);
5578 	if (!err) {
5579 		BUFFER_TRACE(iloc->bh, "get_write_access");
5580 		err = ext4_journal_get_write_access(handle, iloc->bh);
5581 		if (err) {
5582 			brelse(iloc->bh);
5583 			iloc->bh = NULL;
5584 		}
5585 	}
5586 	ext4_std_error(inode->i_sb, err);
5587 	return err;
5588 }
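
/*
 * The canonical pattern for the two helpers above, as used by
 * ext4_mark_inode_dirty() below (a minimal sketch with a hypothetical
 * helper name; real callers also choose an appropriate credit count
 * when starting the handle):
 */
#if 0
static int example_update(handle_t *handle, struct inode *inode)
{
	struct ext4_iloc iloc;
	int err;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		return err;
	/* ... modify in-core inode fields under the handle ... */
	return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
#endif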
5589 
5590 /*
5591  * Expand an inode by new_extra_isize bytes.
5592  * Returns 0 on success or negative error number on failure.
5593  */
5594 static int ext4_expand_extra_isize(struct inode *inode,
5595 				   unsigned int new_extra_isize,
5596 				   struct ext4_iloc iloc,
5597 				   handle_t *handle)
5598 {
5599 	struct ext4_inode *raw_inode;
5600 	struct ext4_xattr_ibody_header *header;
5601 
5602 	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5603 		return 0;
5604 
5605 	raw_inode = ext4_raw_inode(&iloc);
5606 
5607 	header = IHDR(inode, raw_inode);
5608 
5609 	/* No extended attributes present */
5610 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5611 	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5612 		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5613 			new_extra_isize);
5614 		EXT4_I(inode)->i_extra_isize = new_extra_isize;
5615 		return 0;
5616 	}
5617 
5618 	/* try to expand with EAs present */
5619 	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5620 					  raw_inode, handle);
5621 }
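
/*
 * A sketch of the in-inode layout being grown above:
 *
 *	+--------------------------------+  offset 0
 *	| core ext4_inode fields         |
 *	+--------------------------------+  EXT4_GOOD_OLD_INODE_SIZE (128)
 *	| i_extra_isize extended fields  |  <- region enlarged here
 *	+--------------------------------+
 *	| in-inode xattrs (h_magic, ...) |  <- shifted, if present, by
 *	+--------------------------------+     ext4_expand_extra_isize_ea()
 *	                                end of EXT4_INODE_SIZE(sb)
 */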
5622 
5623 /*
5624  * What we do here is to mark the in-core inode as clean with respect to inode
5625  * dirtiness (it may still be data-dirty).
5626  * This means that the in-core inode may be reaped by prune_icache
5627  * without having to perform any I/O.  This is a very good thing,
5628  * because *any* task may call prune_icache - even ones which
5629  * have a transaction open against a different journal.
5630  *
5631  * Is this cheating?  Not really.  Sure, we haven't written the
5632  * inode out, but prune_icache isn't a user-visible syncing function.
5633  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5634  * we start and wait on commits.
5635  *
5636  * Is this efficient/effective?  Well, we're being nice to the system
5637  * by cleaning up our inodes proactively so they can be reaped
5638  * without I/O.  But we are potentially leaving up to five seconds'
5639  * worth of inodes floating about which prune_icache wants us to
5640  * write out.  One way to fix that would be to get prune_icache()
5641  * to do a write_super() to free up some memory.  It has the desired
5642  * effect.
5643  */
5644 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5645 {
5646 	struct ext4_iloc iloc;
5647 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5648 	static unsigned int mnt_count;
5649 	int err, ret;
5650 
5651 	might_sleep();
5652 	err = ext4_reserve_inode_write(handle, inode, &iloc);
5653 	if (ext4_handle_valid(handle) &&
5654 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5655 	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5656 		/*
5657 		 * We need extra buffer credits since we may write into EA block
5658 		 * with this same handle. If journal_extend fails, then it will
5659 		 * only result in a minor loss of functionality for that inode.
5660 		 * If this is felt to be critical, then e2fsck should be run to
5661 		 * force a large enough s_min_extra_isize.
5662 		 */
5663 		if ((jbd2_journal_extend(handle,
5664 			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5665 			ret = ext4_expand_extra_isize(inode,
5666 						      sbi->s_want_extra_isize,
5667 						      iloc, handle);
5668 			if (ret) {
5669 				ext4_set_inode_state(inode,
5670 						     EXT4_STATE_NO_EXPAND);
5671 				if (mnt_count !=
5672 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
5673 					ext4_warning(inode->i_sb,
5674 					"Unable to expand inode %lu. Delete"
5675 					" some EAs or run e2fsck.",
5676 					inode->i_ino);
5677 					mnt_count =
5678 					  le16_to_cpu(sbi->s_es->s_mnt_count);
5679 				}
5680 			}
5681 		}
5682 	}
5683 	if (!err)
5684 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5685 	return err;
5686 }
5687 
5688 /*
5689  * ext4_dirty_inode() is called from __mark_inode_dirty()
5690  *
5691  * We're really interested in the case where a file is being extended.
5692  * i_size has been changed by generic_commit_write() and we thus need
5693  * to include the updated inode in the current transaction.
5694  *
5695  * Also, dquot_alloc_block() will always dirty the inode when blocks
5696  * are allocated to the file.
5697  *
5698  * If the inode is marked synchronous, we don't honour that here - doing
5699  * so would cause a commit on atime updates, which we don't bother doing.
5700  * We handle synchronous inodes at the highest possible level.
5701  */
5702 void ext4_dirty_inode(struct inode *inode)
5703 {
5704 	handle_t *handle;
5705 
5706 	handle = ext4_journal_start(inode, 2);
5707 	if (IS_ERR(handle))
5708 		goto out;
5709 
5710 	ext4_mark_inode_dirty(handle, inode);
5711 
5712 	ext4_journal_stop(handle);
5713 out:
5714 	return;
5715 }
5716 
5717 #if 0
5718 /*
5719  * Bind an inode's backing buffer_head into this transaction, to prevent
5720  * it from being flushed to disk early.  Unlike
5721  * ext4_reserve_inode_write, this leaves behind no bh reference and
5722  * returns no iloc structure, so the caller needs to repeat the iloc
5723  * lookup to mark the inode dirty later.
5724  */
5725 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5726 {
5727 	struct ext4_iloc iloc;
5728 
5729 	int err = 0;
5730 	if (handle) {
5731 		err = ext4_get_inode_loc(inode, &iloc);
5732 		if (!err) {
5733 			BUFFER_TRACE(iloc.bh, "get_write_access");
5734 			err = jbd2_journal_get_write_access(handle, iloc.bh);
5735 			if (!err)
5736 				err = ext4_handle_dirty_metadata(handle,
5737 								 NULL,
5738 								 iloc.bh);
5739 			brelse(iloc.bh);
5740 		}
5741 	}
5742 	ext4_std_error(inode->i_sb, err);
5743 	return err;
5744 }
5745 #endif
5746 
5747 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5748 {
5749 	journal_t *journal;
5750 	handle_t *handle;
5751 	int err;
5752 
5753 	/*
5754 	 * We have to be very careful here: changing a data block's
5755 	 * journaling status dynamically is dangerous.  If we write a
5756 	 * data block to the journal, change the status and then delete
5757 	 * that block, we risk forgetting to revoke the old log record
5758 	 * from the journal and so a subsequent replay can corrupt data.
5759 	 * So, first we make sure that the journal is empty and that
5760 	 * nobody is changing anything.
5761 	 */
5762 
5763 	journal = EXT4_JOURNAL(inode);
5764 	if (!journal)
5765 		return 0;
5766 	if (is_journal_aborted(journal))
5767 		return -EROFS;
5768 
5769 	jbd2_journal_lock_updates(journal);
5770 	jbd2_journal_flush(journal);
5771 
5772 	/*
5773 	 * OK, there are no updates running now, and all cached data is
5774 	 * synced to disk.  We are now in a completely consistent state
5775 	 * which doesn't have anything in the journal, and we know that
5776 	 * no filesystem updates are running, so it is safe to modify
5777 	 * the inode's in-core data-journaling state flag now.
5778 	 */
5779 
5780 	if (val)
5781 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5782 	else
5783 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5784 	ext4_set_aops(inode);
5785 
5786 	jbd2_journal_unlock_updates(journal);
5787 
5788 	/* Finally we can mark the inode as dirty. */
5789 
5790 	handle = ext4_journal_start(inode, 1);
5791 	if (IS_ERR(handle))
5792 		return PTR_ERR(handle);
5793 
5794 	err = ext4_mark_inode_dirty(handle, inode);
5795 	ext4_handle_sync(handle);
5796 	ext4_journal_stop(handle);
5797 	ext4_std_error(inode->i_sb, err);
5798 
5799 	return err;
5800 }
5801 
5802 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5803 {
5804 	return !buffer_mapped(bh);
5805 }
5806 
5807 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5808 {
5809 	struct page *page = vmf->page;
5810 	loff_t size;
5811 	unsigned long len;
5812 	int ret = -EINVAL;
5813 	void *fsdata;
5814 	struct file *file = vma->vm_file;
5815 	struct inode *inode = file->f_path.dentry->d_inode;
5816 	struct address_space *mapping = inode->i_mapping;
5817 
5818 	/*
5819 	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
5820 	 * get i_mutex because we are already holding mmap_sem.
5821 	 */
5822 	down_read(&inode->i_alloc_sem);
5823 	size = i_size_read(inode);
5824 	if (page->mapping != mapping || size <= page_offset(page)
5825 	    || !PageUptodate(page)) {
5826 		/* page got truncated from under us? */
5827 		goto out_unlock;
5828 	}
5829 	ret = 0;
5830 	if (PageMappedToDisk(page))
5831 		goto out_unlock;
5832 
5833 	if (page->index == size >> PAGE_CACHE_SHIFT)
5834 		len = size & ~PAGE_CACHE_MASK;
5835 	else
5836 		len = PAGE_CACHE_SIZE;
5837 
5838 	lock_page(page);
5839 	/*
5840 	 * Return if we have all the buffers mapped. This avoids
5841 	 * the need to call write_begin/write_end, which does a
5842 	 * journal_start/journal_stop that can block and take a
5843 	 * long time.
5844 	 */
5845 	if (page_has_buffers(page)) {
5846 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5847 					ext4_bh_unmapped)) {
5848 			unlock_page(page);
5849 			goto out_unlock;
5850 		}
5851 	}
5852 	unlock_page(page);
5853 	/*
5854 	 * OK, we need to fill the hole... Do write_begin/write_end
5855 	 * to do block allocation/reservation. We are not holding
5856 	 * inode->i_mutex here; that allows parallel write_begin and
5857 	 * write_end calls. lock_page prevents this from happening
5858 	 * on the same page, though.
5859 	 */
5860 	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
5861 			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
5862 	if (ret < 0)
5863 		goto out_unlock;
5864 	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
5865 			len, len, page, fsdata);
5866 	if (ret < 0)
5867 		goto out_unlock;
5868 	ret = 0;
5869 out_unlock:
5870 	if (ret)
5871 		ret = VM_FAULT_SIGBUS;
5872 	up_read(&inode->i_alloc_sem);
5873 	return ret;
5874 }
5875