xref: /linux/fs/ext4/namei.c (revision 4413e16d9d21673bb5048a2e542f1aaa00015c2e)
1 /*
2  *  linux/fs/ext4/namei.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/namei.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Big-endian to little-endian byte-swapping/bitmaps by
16  *        David S. Miller (davem@caip.rutgers.edu), 1995
17  *  Directory entry file type support and forward compatibility hooks
18  *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19  *  Hash Tree Directory indexing (c)
20  *	Daniel Phillips, 2001
21  *  Hash Tree Directory indexing porting
22  *	Christopher Li, 2002
23  *  Hash Tree Directory indexing cleanup
24  *	Theodore Ts'o, 2002
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/pagemap.h>
29 #include <linux/jbd2.h>
30 #include <linux/time.h>
31 #include <linux/fcntl.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/quotaops.h>
35 #include <linux/buffer_head.h>
36 #include <linux/bio.h>
37 #include "ext4.h"
38 #include "ext4_jbd2.h"
39 
40 #include "xattr.h"
41 #include "acl.h"
42 
43 #include <trace/events/ext4.h>
/*
 * Define how far ahead to read directories while searching them.
 *
 * The read-ahead window is expressed as NAMEI_RA_CHUNKS groups of
 * NAMEI_RA_BLOCKS entries; NAMEI_RA_SIZE is their product and
 * NAMEI_RA_INDEX(c, b) flattens a (chunk, block) pair into a single index.
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
51 
52 static struct buffer_head *ext4_append(handle_t *handle,
53 					struct inode *inode,
54 					ext4_lblk_t *block, int *err)
55 {
56 	struct buffer_head *bh;
57 
58 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
59 
60 	bh = ext4_bread(handle, inode, *block, 1, err);
61 	if (bh) {
62 		inode->i_size += inode->i_sb->s_blocksize;
63 		EXT4_I(inode)->i_disksize = inode->i_size;
64 		*err = ext4_journal_get_write_access(handle, bh);
65 		if (*err) {
66 			brelse(bh);
67 			bh = NULL;
68 		}
69 	}
70 	return bh;
71 }
72 
/* Provide assert() in terms of J_ASSERT() unless something already did. */
#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

/*
 * dxtrace(command) expands to its argument when DX_DEBUG is defined and to
 * nothing otherwise, so tracing statements vanish from normal builds.
 */
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif
82 
83 struct fake_dirent
84 {
85 	__le32 inode;
86 	__le16 rec_len;
87 	u8 name_len;
88 	u8 file_type;
89 };
90 
91 struct dx_countlimit
92 {
93 	__le16 limit;
94 	__le16 count;
95 };
96 
97 struct dx_entry
98 {
99 	__le32 hash;
100 	__le32 block;
101 };
102 
103 /*
104  * dx_root_info is laid out so that if it should somehow get overlaid by a
105  * dirent the two low bits of the hash version will be zero.  Therefore, the
106  * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
107  */
108 
109 struct dx_root
110 {
111 	struct fake_dirent dot;
112 	char dot_name[4];
113 	struct fake_dirent dotdot;
114 	char dotdot_name[4];
115 	struct dx_root_info
116 	{
117 		__le32 reserved_zero;
118 		u8 hash_version;
119 		u8 info_length; /* 8 */
120 		u8 indirect_levels;
121 		u8 unused_flags;
122 	}
123 	info;
124 	struct dx_entry	entries[0];
125 };
126 
127 struct dx_node
128 {
129 	struct fake_dirent fake;
130 	struct dx_entry	entries[0];
131 };
132 
133 
/* One level of an index lookup path, as filled in by dx_probe(). */
struct dx_frame {
	struct buffer_head *bh;		/* buffer holding this index block */
	struct dx_entry *entries;	/* first entry slot in that block */
	struct dx_entry *at;		/* entry selected at this level */
};
140 
141 struct dx_map_entry
142 {
143 	u32 hash;
144 	u16 offs;
145 	u16 size;
146 };
147 
148 /*
149  * This goes at the end of each htree block.
150  */
151 struct dx_tail {
152 	u32 dt_reserved;
153 	__le32 dt_checksum;	/* crc32c(uuid+inum+dirblock) */
154 };
155 
/*
 * Forward declarations for the htree ("dx") helpers, so that they can be
 * referenced before their definitions.
 */
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(const struct qstr *d_name,
				 struct inode *dir,
				 struct dx_hash_info *hinfo,
				 struct dx_frame *frame,
				 int *err);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
		struct dx_map_entry *offsets, int count, unsigned blocksize);
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
					u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
				 struct dx_frame *frame,
				 struct dx_frame *frames,
				 __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
		const struct qstr *d_name,
		struct ext4_dir_entry_2 **res_dir,
		int *err);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
			     struct inode *inode);
190 
/* checksumming functions */

/*
 * Address of the struct ext4_dir_entry_tail occupying the last
 * sizeof(struct ext4_dir_entry_tail) bytes of a block that starts at
 * 'block' and is 'blocksize' bytes long.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
					((blocksize) - \
					 sizeof(struct ext4_dir_entry_tail))))
196 
197 static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
198 				   unsigned int blocksize)
199 {
200 	memset(t, 0, sizeof(struct ext4_dir_entry_tail));
201 	t->det_rec_len = ext4_rec_len_to_disk(
202 			sizeof(struct ext4_dir_entry_tail), blocksize);
203 	t->det_reserved_ft = EXT4_FT_DIR_CSUM;
204 }
205 
/*
 * Walk through a dirent block to find a checksum "dirent" at the tail.
 *
 * @de points at the first entry of a directory block belonging to @inode.
 * Returns the tail structure if the last sizeof(struct ext4_dir_entry_tail)
 * bytes of the block look like a checksum tail, NULL otherwise.
 */
static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
						   struct ext4_dir_entry *de)
{
	struct ext4_dir_entry_tail *t;

#ifdef PARANOID
	/*
	 * Paranoid variant: follow the rec_len chain and insist that it lands
	 * exactly on the position where the tail must start.
	 */
	struct ext4_dir_entry *d, *top;

	d = de;
	top = (struct ext4_dir_entry *)(((void *)de) +
		(EXT4_BLOCK_SIZE(inode->i_sb) -
		sizeof(struct ext4_dir_entry_tail)));
	/* A zero rec_len stops the walk so it cannot loop forever. */
	while (d < top && d->rec_len)
		d = (struct ext4_dir_entry *)(((void *)d) +
		    le16_to_cpu(d->rec_len));

	if (d != top)
		return NULL;

	t = (struct ext4_dir_entry_tail *)d;
#else
	/* Default variant: the tail is assumed to sit at a fixed offset. */
	t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
#endif

	/*
	 * Accept the candidate only if it carries the signature written by
	 * initialize_dirent_tail(): zeroed reserved fields, a record length
	 * equal to the tail size, and the EXT4_FT_DIR_CSUM marker.
	 */
	if (t->det_reserved_zero1 ||
	    le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
	    t->det_reserved_zero2 ||
	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
		return NULL;

	return t;
}
239 
240 static __le32 ext4_dirent_csum(struct inode *inode,
241 			       struct ext4_dir_entry *dirent, int size)
242 {
243 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
244 	struct ext4_inode_info *ei = EXT4_I(inode);
245 	__u32 csum;
246 
247 	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
248 	return cpu_to_le32(csum);
249 }
250 
251 int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
252 {
253 	struct ext4_dir_entry_tail *t;
254 
255 	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
256 					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
257 		return 1;
258 
259 	t = get_dirent_tail(inode, dirent);
260 	if (!t) {
261 		EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
262 				 "leaf for checksum.  Please run e2fsck -D.");
263 		return 0;
264 	}
265 
266 	if (t->det_checksum != ext4_dirent_csum(inode, dirent,
267 						(void *)t - (void *)dirent))
268 		return 0;
269 
270 	return 1;
271 }
272 
273 static void ext4_dirent_csum_set(struct inode *inode,
274 				 struct ext4_dir_entry *dirent)
275 {
276 	struct ext4_dir_entry_tail *t;
277 
278 	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
279 					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
280 		return;
281 
282 	t = get_dirent_tail(inode, dirent);
283 	if (!t) {
284 		EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
285 				 "leaf for checksum.  Please run e2fsck -D.");
286 		return;
287 	}
288 
289 	t->det_checksum = ext4_dirent_csum(inode, dirent,
290 					   (void *)t - (void *)dirent);
291 }
292 
293 static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
294 						struct inode *inode,
295 						struct buffer_head *bh)
296 {
297 	ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
298 	return ext4_handle_dirty_metadata(handle, inode, bh);
299 }
300 
301 static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
302 					       struct ext4_dir_entry *dirent,
303 					       int *offset)
304 {
305 	struct ext4_dir_entry *dp;
306 	struct dx_root_info *root;
307 	int count_offset;
308 
309 	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
310 		count_offset = 8;
311 	else if (le16_to_cpu(dirent->rec_len) == 12) {
312 		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
313 		if (le16_to_cpu(dp->rec_len) !=
314 		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
315 			return NULL;
316 		root = (struct dx_root_info *)(((void *)dp + 12));
317 		if (root->reserved_zero ||
318 		    root->info_length != sizeof(struct dx_root_info))
319 			return NULL;
320 		count_offset = 32;
321 	} else
322 		return NULL;
323 
324 	if (offset)
325 		*offset = count_offset;
326 	return (struct dx_countlimit *)(((void *)dirent) + count_offset);
327 }
328 
329 static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
330 			   int count_offset, int count, struct dx_tail *t)
331 {
332 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333 	struct ext4_inode_info *ei = EXT4_I(inode);
334 	__u32 csum, old_csum;
335 	int size;
336 
337 	size = count_offset + (count * sizeof(struct dx_entry));
338 	old_csum = t->dt_checksum;
339 	t->dt_checksum = 0;
340 	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
341 	csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
342 	t->dt_checksum = old_csum;
343 
344 	return cpu_to_le32(csum);
345 }
346 
347 static int ext4_dx_csum_verify(struct inode *inode,
348 			       struct ext4_dir_entry *dirent)
349 {
350 	struct dx_countlimit *c;
351 	struct dx_tail *t;
352 	int count_offset, limit, count;
353 
354 	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
355 					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
356 		return 1;
357 
358 	c = get_dx_countlimit(inode, dirent, &count_offset);
359 	if (!c) {
360 		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
361 		return 1;
362 	}
363 	limit = le16_to_cpu(c->limit);
364 	count = le16_to_cpu(c->count);
365 	if (count_offset + (limit * sizeof(struct dx_entry)) >
366 	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
367 		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
368 				 "tree checksum found.  Run e2fsck -D.");
369 		return 1;
370 	}
371 	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
372 
373 	if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
374 					    count, t))
375 		return 0;
376 	return 1;
377 }
378 
379 static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
380 {
381 	struct dx_countlimit *c;
382 	struct dx_tail *t;
383 	int count_offset, limit, count;
384 
385 	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
386 					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
387 		return;
388 
389 	c = get_dx_countlimit(inode, dirent, &count_offset);
390 	if (!c) {
391 		EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
392 		return;
393 	}
394 	limit = le16_to_cpu(c->limit);
395 	count = le16_to_cpu(c->count);
396 	if (count_offset + (limit * sizeof(struct dx_entry)) >
397 	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
398 		EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
399 				 "tree checksum.  Run e2fsck -D.");
400 		return;
401 	}
402 	t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
403 
404 	t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
405 }
406 
407 static inline int ext4_handle_dirty_dx_node(handle_t *handle,
408 					    struct inode *inode,
409 					    struct buffer_head *bh)
410 {
411 	ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
412 	return ext4_handle_dirty_metadata(handle, inode, bh);
413 }
414 
415 /*
416  * p is at least 6 bytes before the end of page
417  */
418 static inline struct ext4_dir_entry_2 *
419 ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
420 {
421 	return (struct ext4_dir_entry_2 *)((char *)p +
422 		ext4_rec_len_from_disk(p->rec_len, blocksize));
423 }
424 
425 /*
426  * Future: use high four bits of block for coalesce-on-delete flags
427  * Mask them off for now.
428  */
429 
430 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
431 {
432 	return le32_to_cpu(entry->block) & 0x00ffffff;
433 }
434 
435 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
436 {
437 	entry->block = cpu_to_le32(value);
438 }
439 
440 static inline unsigned dx_get_hash(struct dx_entry *entry)
441 {
442 	return le32_to_cpu(entry->hash);
443 }
444 
445 static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
446 {
447 	entry->hash = cpu_to_le32(value);
448 }
449 
450 static inline unsigned dx_get_count(struct dx_entry *entries)
451 {
452 	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
453 }
454 
455 static inline unsigned dx_get_limit(struct dx_entry *entries)
456 {
457 	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
458 }
459 
460 static inline void dx_set_count(struct dx_entry *entries, unsigned value)
461 {
462 	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
463 }
464 
465 static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
466 {
467 	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
468 }
469 
470 static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
471 {
472 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
473 		EXT4_DIR_REC_LEN(2) - infosize;
474 
475 	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
476 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
477 		entry_space -= sizeof(struct dx_tail);
478 	return entry_space / sizeof(struct dx_entry);
479 }
480 
481 static inline unsigned dx_node_limit(struct inode *dir)
482 {
483 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
484 
485 	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
486 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
487 		entry_space -= sizeof(struct dx_tail);
488 	return entry_space / sizeof(struct dx_entry);
489 }
490 
491 /*
492  * Debug
493  */
494 #ifdef DX_DEBUG
495 static void dx_show_index(char * label, struct dx_entry *entries)
496 {
497 	int i, n = dx_get_count (entries);
498 	printk(KERN_DEBUG "%s index ", label);
499 	for (i = 0; i < n; i++) {
500 		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
501 				0, (unsigned long)dx_get_block(entries + i));
502 	}
503 	printk("\n");
504 }
505 
/* Totals gathered by the dx_show_leaf()/dx_show_entries() debug helpers. */
struct stats {
	unsigned names;		/* directory entries with a non-zero inode */
	unsigned space;		/* sum of EXT4_DIR_REC_LEN(name_len) over them */
	unsigned bcount;	/* number of leaf blocks visited */
};
512 
/*
 * Debug helper: walk the @size bytes of directory entries starting at @de
 * and count the entries in use.
 *
 * When @show_names is non-zero each name is printed together with its hash
 * and its byte offset within the block.  Returns the number of in-use
 * entries, the space they need according to EXT4_DIR_REC_LEN(), and a block
 * count of 1.
 */
static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
				 int size, int show_names)
{
	unsigned names = 0, space = 0;
	char *base = (char *) de;
	/* Hash into a private copy so the caller's hinfo is not modified. */
	struct dx_hash_info h = *hinfo;

	printk("names: ");
	while ((char *) de < base + size)
	{
		/* Entries with inode == 0 are not counted. */
		if (de->inode)
		{
			if (show_names)
			{
				int len = de->name_len;
				char *name = de->name;
				/* Print exactly name_len characters, one at a time. */
				while (len--) printk("%c", *name++);
				ext4fs_dirhash(de->name, de->name_len, &h);
				printk(":%x.%u ", h.hash,
				       (unsigned) ((char *) de - base));
			}
			space += EXT4_DIR_REC_LEN(de->name_len);
			names++;
		}
		de = ext4_next_entry(de, size);
	}
	printk("(%i)\n", names);
	return (struct stats) { names, space, 1 };
}
542 
/*
 * Debug helper: print the index entries in @entries and recurse into the
 * blocks they reference.
 *
 * @levels is the number of index levels still below this one: when it is
 * non-zero each referenced block is treated as a dx_node and walked
 * recursively, otherwise it is treated as a leaf and passed to
 * dx_show_leaf().  Blocks that cannot be read are skipped.  Returns the
 * totals accumulated over everything that was visited.
 */
struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
			     struct dx_entry *entries, int levels)
{
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
	unsigned bcount = 0;
	struct buffer_head *bh;
	int err;
	printk("%i indexed blocks...\n", count);
	for (i = 0; i < count; i++, entries++)
	{
		ext4_lblk_t block = dx_get_block(entries);
		/* The first entry is shown with hash 0, not its stored hash. */
		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
		/* Width of the hash range up to the next entry (or to the top). */
		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
		struct stats stats;
		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
		stats = levels?
		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
		names += stats.names;
		space += stats.space;
		bcount += stats.bcount;
		brelse(bh);
	}
	/* Guarded so the per-block averages never divide by zero. */
	if (bcount)
		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
		       levels ? "" : "   ", names, space/bcount,
		       (space/bcount)*100/blocksize);
	return (struct stats) { names, space, bcount};
}
574 #endif /* DX_DEBUG */
575 
576 /*
577  * Probe for a directory leaf block to search.
578  *
579  * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
580  * error in the directory index, and the caller should fall back to
581  * searching the directory normally.  The callers of dx_probe **MUST**
582  * check for this error code, and make sure it never gets reflected
583  * back to userspace.
584  */
585 static struct dx_frame *
586 dx_probe(const struct qstr *d_name, struct inode *dir,
587 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
588 {
589 	unsigned count, indirect;
590 	struct dx_entry *at, *entries, *p, *q, *m;
591 	struct dx_root *root;
592 	struct buffer_head *bh;
593 	struct dx_frame *frame = frame_in;
594 	u32 hash;
595 
596 	frame->bh = NULL;
597 	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
598 		goto fail;
599 	root = (struct dx_root *) bh->b_data;
600 	if (root->info.hash_version != DX_HASH_TEA &&
601 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
602 	    root->info.hash_version != DX_HASH_LEGACY) {
603 		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
604 			     root->info.hash_version);
605 		brelse(bh);
606 		*err = ERR_BAD_DX_DIR;
607 		goto fail;
608 	}
609 	hinfo->hash_version = root->info.hash_version;
610 	if (hinfo->hash_version <= DX_HASH_TEA)
611 		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
612 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
613 	if (d_name)
614 		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
615 	hash = hinfo->hash;
616 
617 	if (root->info.unused_flags & 1) {
618 		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
619 			     root->info.unused_flags);
620 		brelse(bh);
621 		*err = ERR_BAD_DX_DIR;
622 		goto fail;
623 	}
624 
625 	if ((indirect = root->info.indirect_levels) > 1) {
626 		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
627 			     root->info.indirect_levels);
628 		brelse(bh);
629 		*err = ERR_BAD_DX_DIR;
630 		goto fail;
631 	}
632 
633 	if (!buffer_verified(bh) &&
634 	    !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
635 		ext4_warning(dir->i_sb, "Root failed checksum");
636 		brelse(bh);
637 		*err = ERR_BAD_DX_DIR;
638 		goto fail;
639 	}
640 	set_buffer_verified(bh);
641 
642 	entries = (struct dx_entry *) (((char *)&root->info) +
643 				       root->info.info_length);
644 
645 	if (dx_get_limit(entries) != dx_root_limit(dir,
646 						   root->info.info_length)) {
647 		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
648 		brelse(bh);
649 		*err = ERR_BAD_DX_DIR;
650 		goto fail;
651 	}
652 
653 	dxtrace(printk("Look up %x", hash));
654 	while (1)
655 	{
656 		count = dx_get_count(entries);
657 		if (!count || count > dx_get_limit(entries)) {
658 			ext4_warning(dir->i_sb,
659 				     "dx entry: no count or count > limit");
660 			brelse(bh);
661 			*err = ERR_BAD_DX_DIR;
662 			goto fail2;
663 		}
664 
665 		p = entries + 1;
666 		q = entries + count - 1;
667 		while (p <= q)
668 		{
669 			m = p + (q - p)/2;
670 			dxtrace(printk("."));
671 			if (dx_get_hash(m) > hash)
672 				q = m - 1;
673 			else
674 				p = m + 1;
675 		}
676 
677 		if (0) // linear search cross check
678 		{
679 			unsigned n = count - 1;
680 			at = entries;
681 			while (n--)
682 			{
683 				dxtrace(printk(","));
684 				if (dx_get_hash(++at) > hash)
685 				{
686 					at--;
687 					break;
688 				}
689 			}
690 			assert (at == p - 1);
691 		}
692 
693 		at = p - 1;
694 		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
695 		frame->bh = bh;
696 		frame->entries = entries;
697 		frame->at = at;
698 		if (!indirect--) return frame;
699 		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
700 			goto fail2;
701 		at = entries = ((struct dx_node *) bh->b_data)->entries;
702 
703 		if (!buffer_verified(bh) &&
704 		    !ext4_dx_csum_verify(dir,
705 					 (struct ext4_dir_entry *)bh->b_data)) {
706 			ext4_warning(dir->i_sb, "Node failed checksum");
707 			brelse(bh);
708 			*err = ERR_BAD_DX_DIR;
709 			goto fail;
710 		}
711 		set_buffer_verified(bh);
712 
713 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
714 			ext4_warning(dir->i_sb,
715 				     "dx entry: limit != node limit");
716 			brelse(bh);
717 			*err = ERR_BAD_DX_DIR;
718 			goto fail2;
719 		}
720 		frame++;
721 		frame->bh = NULL;
722 	}
723 fail2:
724 	while (frame >= frame_in) {
725 		brelse(frame->bh);
726 		frame--;
727 	}
728 fail:
729 	if (*err == ERR_BAD_DX_DIR)
730 		ext4_warning(dir->i_sb,
731 			     "Corrupt dir inode %lu, running e2fsck is "
732 			     "recommended.", dir->i_ino);
733 	return NULL;
734 }
735 
736 static void dx_release (struct dx_frame *frames)
737 {
738 	if (frames[0].bh == NULL)
739 		return;
740 
741 	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
742 		brelse(frames[1].bh);
743 	brelse(frames[0].bh);
744 }
745 
746 /*
747  * This function increments the frame pointer to search the next leaf
748  * block, and reads in the necessary intervening nodes if the search
749  * should be necessary.  Whether or not the search is necessary is
750  * controlled by the hash parameter.  If the hash value is even, then
751  * the search is only continued if the next block starts with that
752  * hash value.  This is used if we are searching for a specific file.
753  *
754  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
755  *
756  * This function returns 1 if the caller should continue to search,
757  * or 0 if it should not.  If there is an error reading one of the
758  * index blocks, it will a negative error code.
759  *
760  * If start_hash is non-null, it will be filled in with the starting
761  * hash of the next page.
762  */
763 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
764 				 struct dx_frame *frame,
765 				 struct dx_frame *frames,
766 				 __u32 *start_hash)
767 {
768 	struct dx_frame *p;
769 	struct buffer_head *bh;
770 	int err, num_frames = 0;
771 	__u32 bhash;
772 
773 	p = frame;
774 	/*
775 	 * Find the next leaf page by incrementing the frame pointer.
776 	 * If we run out of entries in the interior node, loop around and
777 	 * increment pointer in the parent node.  When we break out of
778 	 * this loop, num_frames indicates the number of interior
779 	 * nodes need to be read.
780 	 */
781 	while (1) {
782 		if (++(p->at) < p->entries + dx_get_count(p->entries))
783 			break;
784 		if (p == frames)
785 			return 0;
786 		num_frames++;
787 		p--;
788 	}
789 
790 	/*
791 	 * If the hash is 1, then continue only if the next page has a
792 	 * continuation hash of any value.  This is used for readdir
793 	 * handling.  Otherwise, check to see if the hash matches the
794 	 * desired contiuation hash.  If it doesn't, return since
795 	 * there's no point to read in the successive index pages.
796 	 */
797 	bhash = dx_get_hash(p->at);
798 	if (start_hash)
799 		*start_hash = bhash;
800 	if ((hash & 1) == 0) {
801 		if ((bhash & ~1) != hash)
802 			return 0;
803 	}
804 	/*
805 	 * If the hash is HASH_NB_ALWAYS, we always go to the next
806 	 * block so no check is necessary
807 	 */
808 	while (num_frames--) {
809 		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
810 				      0, &err)))
811 			return err; /* Failure */
812 
813 		if (!buffer_verified(bh) &&
814 		    !ext4_dx_csum_verify(dir,
815 					 (struct ext4_dir_entry *)bh->b_data)) {
816 			ext4_warning(dir->i_sb, "Node failed checksum");
817 			return -EIO;
818 		}
819 		set_buffer_verified(bh);
820 
821 		p++;
822 		brelse(p->bh);
823 		p->bh = bh;
824 		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
825 	}
826 	return 1;
827 }
828 
829 
830 /*
831  * This function fills a red-black tree with information from a
832  * directory block.  It returns the number directory entries loaded
833  * into the tree.  If there is an error it is returned in err.
834  */
835 static int htree_dirblock_to_tree(struct file *dir_file,
836 				  struct inode *dir, ext4_lblk_t block,
837 				  struct dx_hash_info *hinfo,
838 				  __u32 start_hash, __u32 start_minor_hash)
839 {
840 	struct buffer_head *bh;
841 	struct ext4_dir_entry_2 *de, *top;
842 	int err, count = 0;
843 
844 	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
845 							(unsigned long)block));
846 	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
847 		return err;
848 
849 	if (!buffer_verified(bh) &&
850 	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
851 		return -EIO;
852 	set_buffer_verified(bh);
853 
854 	de = (struct ext4_dir_entry_2 *) bh->b_data;
855 	top = (struct ext4_dir_entry_2 *) ((char *) de +
856 					   dir->i_sb->s_blocksize -
857 					   EXT4_DIR_REC_LEN(0));
858 	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
859 		if (ext4_check_dir_entry(dir, NULL, de, bh,
860 				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
861 					 + ((char *)de - bh->b_data))) {
862 			/* On error, skip the f_pos to the next block. */
863 			dir_file->f_pos = (dir_file->f_pos |
864 					(dir->i_sb->s_blocksize - 1)) + 1;
865 			brelse(bh);
866 			return count;
867 		}
868 		ext4fs_dirhash(de->name, de->name_len, hinfo);
869 		if ((hinfo->hash < start_hash) ||
870 		    ((hinfo->hash == start_hash) &&
871 		     (hinfo->minor_hash < start_minor_hash)))
872 			continue;
873 		if (de->inode == 0)
874 			continue;
875 		if ((err = ext4_htree_store_dirent(dir_file,
876 				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
877 			brelse(bh);
878 			return err;
879 		}
880 		count++;
881 	}
882 	brelse(bh);
883 	return count;
884 }
885 
886 
887 /*
888  * This function fills a red-black tree with information from a
889  * directory.  We start scanning the directory in hash order, starting
890  * at start_hash and start_minor_hash.
891  *
892  * This function returns the number of entries inserted into the tree,
893  * or a negative error code.
894  */
895 int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
896 			 __u32 start_minor_hash, __u32 *next_hash)
897 {
898 	struct dx_hash_info hinfo;
899 	struct ext4_dir_entry_2 *de;
900 	struct dx_frame frames[2], *frame;
901 	struct inode *dir;
902 	ext4_lblk_t block;
903 	int count = 0;
904 	int ret, err;
905 	__u32 hashval;
906 
907 	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
908 		       start_hash, start_minor_hash));
909 	dir = dir_file->f_path.dentry->d_inode;
910 	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
911 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
912 		if (hinfo.hash_version <= DX_HASH_TEA)
913 			hinfo.hash_version +=
914 				EXT4_SB(dir->i_sb)->s_hash_unsigned;
915 		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
916 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
917 					       start_hash, start_minor_hash);
918 		*next_hash = ~0;
919 		return count;
920 	}
921 	hinfo.hash = start_hash;
922 	hinfo.minor_hash = 0;
923 	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
924 	if (!frame)
925 		return err;
926 
927 	/* Add '.' and '..' from the htree header */
928 	if (!start_hash && !start_minor_hash) {
929 		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
930 		if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
931 			goto errout;
932 		count++;
933 	}
934 	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
935 		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
936 		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
937 		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
938 			goto errout;
939 		count++;
940 	}
941 
942 	while (1) {
943 		block = dx_get_block(frame->at);
944 		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
945 					     start_hash, start_minor_hash);
946 		if (ret < 0) {
947 			err = ret;
948 			goto errout;
949 		}
950 		count += ret;
951 		hashval = ~0;
952 		ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
953 					    frame, frames, &hashval);
954 		*next_hash = hashval;
955 		if (ret < 0) {
956 			err = ret;
957 			goto errout;
958 		}
959 		/*
960 		 * Stop if:  (a) there are no more entries, or
961 		 * (b) we have inserted at least one entry and the
962 		 * next hash value is not a continuation
963 		 */
964 		if ((ret == 0) ||
965 		    (count && ((hashval & 1) == 0)))
966 			break;
967 	}
968 	dx_release(frames);
969 	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
970 		       "next hash: %x\n", count, *next_hash));
971 	return count;
972 errout:
973 	dx_release(frames);
974 	return (err);
975 }
976 
977 
978 /*
979  * Directory block splitting, compacting
980  */
981 
982 /*
983  * Create map of hash values, offsets, and sizes, stored at end of block.
984  * Returns number of entries mapped.
985  */
986 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
987 		       struct dx_hash_info *hinfo,
988 		       struct dx_map_entry *map_tail)
989 {
990 	int count = 0;
991 	char *base = (char *) de;
992 	struct dx_hash_info h = *hinfo;
993 
994 	while ((char *) de < base + blocksize) {
995 		if (de->name_len && de->inode) {
996 			ext4fs_dirhash(de->name, de->name_len, &h);
997 			map_tail--;
998 			map_tail->hash = h.hash;
999 			map_tail->offs = ((char *) de - base)>>2;
1000 			map_tail->size = le16_to_cpu(de->rec_len);
1001 			count++;
1002 			cond_resched();
1003 		}
1004 		/* XXX: do we need to check rec_len == 0 case? -Chris */
1005 		de = ext4_next_entry(de, blocksize);
1006 	}
1007 	return count;
1008 }
1009 
/*
 * Sort map by hash value
 *
 * Sorts the @count entries of @map in place into ascending hash order:
 * comb-sort passes with a shrinking gap first, then a bubble sort that
 * runs until a full pass makes no swap.
 */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
	struct dx_map_entry *p, *q, *top = map + count - 1;
	int more;
	/* Combsort until bubble sort doesn't suck */
	/* From here on 'count' is reused as the comb gap. */
	while (count > 2) {
		count = count*10/13;
		/*
		 * count is unsigned, so "count - 9 < 2" holds only for 9 and
		 * 10 (smaller values wrap around to something huge).
		 */
		if (count - 9 < 2) /* 9, 10 -> 11 */
			count = 11;
		/* One pass from the top down, comparing elements 'count' apart. */
		for (p = top, q = p - count; q >= map; p--, q--)
			if (p->hash < q->hash)
				swap(*p, *q);
	}
	/* Garden variety bubble sort */
	do {
		more = 0;
		q = top;
		/* Compare each adjacent pair (q[0], q[1]) from the top down. */
		while (q-- > map) {
			if (q[1].hash >= q[0].hash)
				continue;
			swap(*(q+1), *q);
			more = 1;
		}
	} while(more);
}
1036 
1037 static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1038 {
1039 	struct dx_entry *entries = frame->entries;
1040 	struct dx_entry *old = frame->at, *new = old + 1;
1041 	int count = dx_get_count(entries);
1042 
1043 	assert(count < dx_get_limit(entries));
1044 	assert(old < entries + count);
1045 	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
1046 	dx_set_hash(new, hash);
1047 	dx_set_block(new, block);
1048 	dx_set_count(entries, count + 1);
1049 }
1050 
1051 static void ext4_update_dx_flag(struct inode *inode)
1052 {
1053 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1054 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
1055 		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1056 }
1057 
1058 /*
1059  * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
1060  *
1061  * `len <= EXT4_NAME_LEN' is guaranteed by caller.
1062  * `de != NULL' is guaranteed by caller.
1063  */
1064 static inline int ext4_match (int len, const char * const name,
1065 			      struct ext4_dir_entry_2 * de)
1066 {
1067 	if (len != de->name_len)
1068 		return 0;
1069 	if (!de->inode)
1070 		return 0;
1071 	return !memcmp(name, de->name, len);
1072 }
1073 
1074 /*
1075  * Returns 0 if not found, -1 on failure, and 1 on success
1076  */
1077 static inline int search_dirblock(struct buffer_head *bh,
1078 				  struct inode *dir,
1079 				  const struct qstr *d_name,
1080 				  unsigned int offset,
1081 				  struct ext4_dir_entry_2 ** res_dir)
1082 {
1083 	struct ext4_dir_entry_2 * de;
1084 	char * dlimit;
1085 	int de_len;
1086 	const char *name = d_name->name;
1087 	int namelen = d_name->len;
1088 
1089 	de = (struct ext4_dir_entry_2 *) bh->b_data;
1090 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
1091 	while ((char *) de < dlimit) {
1092 		/* this code is executed quadratically often */
1093 		/* do minimal checking `by hand' */
1094 
1095 		if ((char *) de + namelen <= dlimit &&
1096 		    ext4_match (namelen, name, de)) {
1097 			/* found a match - just to be sure, do a full check */
1098 			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1099 				return -1;
1100 			*res_dir = de;
1101 			return 1;
1102 		}
1103 		/* prevent looping on a bad block */
1104 		de_len = ext4_rec_len_from_disk(de->rec_len,
1105 						dir->i_sb->s_blocksize);
1106 		if (de_len <= 0)
1107 			return -1;
1108 		offset += de_len;
1109 		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
1110 	}
1111 	return 0;
1112 }
1113 
1114 
/*
 *	ext4_find_entry()
 *
 * Finds an entry in the specified directory with the wanted name. It
 * returns the cache buffer in which the entry was found, and the entry
 * itself (as a parameter - res_dir). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
 * NULL is returned (with *res_dir == NULL) when the name is longer than
 * EXT4_NAME_LEN, when no block contains the name, or when
 * search_dirblock() reports a bad block.
 *
 * For an indexed directory ext4_dx_find_entry() is tried first; the linear
 * scan below is only used as a fallback when that lookup fails with
 * ERR_BAD_DX_DIR.
 *
 * The returned buffer_head has ->b_count elevated.  The caller is expected
 * to brelse() it when appropriate.
 */
static struct buffer_head * ext4_find_entry (struct inode *dir,
					const struct qstr *d_name,
					struct ext4_dir_entry_2 ** res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	ext4_lblk_t start, block, b;
	const u8 *name = d_name->name;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;		/* Blocks queued for reading so far */
	ext4_lblk_t  nblocks;
	int i, err;
	int namelen;

	*res_dir = NULL;
	sb = dir->i_sb;
	namelen = d_name->len;
	if (namelen > EXT4_NAME_LEN)
		return NULL;
	if ((namelen <= 2) && (name[0] == '.') &&
	    (name[1] == '.' || name[1] == '\0')) {
		/*
		 * "." or ".." will only be in the first block
		 * NFS may look up ".."; "." should be handled by the VFS
		 */
		block = start = 0;
		nblocks = 1;
		goto restart;
	}
	if (is_dx(dir)) {
		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
		/*
		 * On success, or if the error was file not found,
		 * return.  Otherwise, fall back to doing a search the
		 * old fashioned way.
		 */
		if (bh || (err != ERR_BAD_DX_DIR))
			return bh;
		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
			       "falling back\n"));
	}
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	/* Begin at the block where the previous successful lookup ended */
	start = EXT4_I(dir)->i_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;
restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;
				/*
				 * NOTE(review): err from ext4_getblk() is not
				 * looked at here; a NULL bh is simply skipped
				 * by the consumer below.
				 */
				bh = ext4_getblk(NULL, dir, b++, 0, &err);
				bh_use[ra_max] = bh;
				if (bh)
					ll_rw_block(READ | REQ_META | REQ_PRIO,
						    1, &bh);
			}
		}
		/* Take the next queued buffer; NULL means nothing to search */
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			/* read error, skip block & hope for the best */
			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
					 (unsigned long) block);
			brelse(bh);
			goto next;
		}
		/* A block failing checksum verification is skipped as well */
		if (!buffer_verified(bh) &&
		    !ext4_dirent_csum_verify(dir,
				(struct ext4_dir_entry *)bh->b_data)) {
			EXT4_ERROR_INODE(dir, "checksumming directory "
					 "block %lu", (unsigned long)block);
			brelse(bh);
			goto next;
		}
		set_buffer_verified(bh);
		i = search_dirblock(bh, dir, d_name,
			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
		if (i == 1) {
			/* Found: remember the block for the next lookup */
			EXT4_I(dir)->i_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			/* A negative result aborts the whole search */
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		/* Advance, wrapping to block 0 at the end of the directory */
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);
	return ret;
}
1255 
1256 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
1257 		       struct ext4_dir_entry_2 **res_dir, int *err)
1258 {
1259 	struct super_block * sb = dir->i_sb;
1260 	struct dx_hash_info	hinfo;
1261 	struct dx_frame frames[2], *frame;
1262 	struct buffer_head *bh;
1263 	ext4_lblk_t block;
1264 	int retval;
1265 
1266 	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1267 		return NULL;
1268 	do {
1269 		block = dx_get_block(frame->at);
1270 		if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
1271 			goto errout;
1272 
1273 		if (!buffer_verified(bh) &&
1274 		    !ext4_dirent_csum_verify(dir,
1275 				(struct ext4_dir_entry *)bh->b_data)) {
1276 			EXT4_ERROR_INODE(dir, "checksumming directory "
1277 					 "block %lu", (unsigned long)block);
1278 			brelse(bh);
1279 			*err = -EIO;
1280 			goto errout;
1281 		}
1282 		set_buffer_verified(bh);
1283 		retval = search_dirblock(bh, dir, d_name,
1284 					 block << EXT4_BLOCK_SIZE_BITS(sb),
1285 					 res_dir);
1286 		if (retval == 1) { 	/* Success! */
1287 			dx_release(frames);
1288 			return bh;
1289 		}
1290 		brelse(bh);
1291 		if (retval == -1) {
1292 			*err = ERR_BAD_DX_DIR;
1293 			goto errout;
1294 		}
1295 
1296 		/* Check to see if we should continue to search */
1297 		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1298 					       frames, NULL);
1299 		if (retval < 0) {
1300 			ext4_warning(sb,
1301 			     "error reading index page in directory #%lu",
1302 			     dir->i_ino);
1303 			*err = retval;
1304 			goto errout;
1305 		}
1306 	} while (retval == 1);
1307 
1308 	*err = -ENOENT;
1309 errout:
1310 	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1311 	dx_release (frames);
1312 	return NULL;
1313 }
1314 
1315 static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
1316 {
1317 	struct inode *inode;
1318 	struct ext4_dir_entry_2 *de;
1319 	struct buffer_head *bh;
1320 
1321 	if (dentry->d_name.len > EXT4_NAME_LEN)
1322 		return ERR_PTR(-ENAMETOOLONG);
1323 
1324 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
1325 	inode = NULL;
1326 	if (bh) {
1327 		__u32 ino = le32_to_cpu(de->inode);
1328 		brelse(bh);
1329 		if (!ext4_valid_inum(dir->i_sb, ino)) {
1330 			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1331 			return ERR_PTR(-EIO);
1332 		}
1333 		if (unlikely(ino == dir->i_ino)) {
1334 			EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
1335 					 dentry->d_name.len,
1336 					 dentry->d_name.name);
1337 			return ERR_PTR(-EIO);
1338 		}
1339 		inode = ext4_iget(dir->i_sb, ino);
1340 		if (inode == ERR_PTR(-ESTALE)) {
1341 			EXT4_ERROR_INODE(dir,
1342 					 "deleted inode referenced: %u",
1343 					 ino);
1344 			return ERR_PTR(-EIO);
1345 		}
1346 	}
1347 	return d_splice_alias(inode, dentry);
1348 }
1349 
1350 
1351 struct dentry *ext4_get_parent(struct dentry *child)
1352 {
1353 	__u32 ino;
1354 	static const struct qstr dotdot = QSTR_INIT("..", 2);
1355 	struct ext4_dir_entry_2 * de;
1356 	struct buffer_head *bh;
1357 
1358 	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1359 	if (!bh)
1360 		return ERR_PTR(-ENOENT);
1361 	ino = le32_to_cpu(de->inode);
1362 	brelse(bh);
1363 
1364 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1365 		EXT4_ERROR_INODE(child->d_inode,
1366 				 "bad parent inode number: %u", ino);
1367 		return ERR_PTR(-EIO);
1368 	}
1369 
1370 	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
1371 }
1372 
1373 #define S_SHIFT 12
1374 static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1375 	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
1376 	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
1377 	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
1378 	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
1379 	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
1380 	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
1381 	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
1382 };
1383 
1384 static inline void ext4_set_de_type(struct super_block *sb,
1385 				struct ext4_dir_entry_2 *de,
1386 				umode_t mode) {
1387 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1388 		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1389 }
1390 
1391 /*
1392  * Move count entries from end of map between two memory locations.
1393  * Returns pointer to last entry moved.
1394  */
1395 static struct ext4_dir_entry_2 *
1396 dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1397 		unsigned blocksize)
1398 {
1399 	unsigned rec_len = 0;
1400 
1401 	while (count--) {
1402 		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1403 						(from + (map->offs<<2));
1404 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
1405 		memcpy (to, de, rec_len);
1406 		((struct ext4_dir_entry_2 *) to)->rec_len =
1407 				ext4_rec_len_to_disk(rec_len, blocksize);
1408 		de->inode = 0;
1409 		map++;
1410 		to += rec_len;
1411 	}
1412 	return (struct ext4_dir_entry_2 *) (to - rec_len);
1413 }
1414 
1415 /*
1416  * Compact each dir entry in the range to the minimal rec_len.
1417  * Returns pointer to last entry in range.
1418  */
1419 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1420 {
1421 	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1422 	unsigned rec_len = 0;
1423 
1424 	prev = to = de;
1425 	while ((char*)de < base + blocksize) {
1426 		next = ext4_next_entry(de, blocksize);
1427 		if (de->inode && de->name_len) {
1428 			rec_len = EXT4_DIR_REC_LEN(de->name_len);
1429 			if (de > to)
1430 				memmove(to, de, rec_len);
1431 			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1432 			prev = to;
1433 			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1434 		}
1435 		de = next;
1436 	}
1437 	return prev;
1438 }
1439 
1440 /*
1441  * Split a full leaf block to make room for a new dir entry.
1442  * Allocate a new block, and move entries so that they are approx. equally full.
1443  * Returns pointer to de in block into which the new entry will be inserted.
1444  */
1445 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1446 			struct buffer_head **bh,struct dx_frame *frame,
1447 			struct dx_hash_info *hinfo, int *error)
1448 {
1449 	unsigned blocksize = dir->i_sb->s_blocksize;
1450 	unsigned count, continued;
1451 	struct buffer_head *bh2;
1452 	ext4_lblk_t newblock;
1453 	u32 hash2;
1454 	struct dx_map_entry *map;
1455 	char *data1 = (*bh)->b_data, *data2;
1456 	unsigned split, move, size;
1457 	struct ext4_dir_entry_2 *de = NULL, *de2;
1458 	struct ext4_dir_entry_tail *t;
1459 	int	csum_size = 0;
1460 	int	err = 0, i;
1461 
1462 	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
1463 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1464 		csum_size = sizeof(struct ext4_dir_entry_tail);
1465 
1466 	bh2 = ext4_append (handle, dir, &newblock, &err);
1467 	if (!(bh2)) {
1468 		brelse(*bh);
1469 		*bh = NULL;
1470 		goto errout;
1471 	}
1472 
1473 	BUFFER_TRACE(*bh, "get_write_access");
1474 	err = ext4_journal_get_write_access(handle, *bh);
1475 	if (err)
1476 		goto journal_error;
1477 
1478 	BUFFER_TRACE(frame->bh, "get_write_access");
1479 	err = ext4_journal_get_write_access(handle, frame->bh);
1480 	if (err)
1481 		goto journal_error;
1482 
1483 	data2 = bh2->b_data;
1484 
1485 	/* create map in the end of data2 block */
1486 	map = (struct dx_map_entry *) (data2 + blocksize);
1487 	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1488 			     blocksize, hinfo, map);
1489 	map -= count;
1490 	dx_sort_map(map, count);
1491 	/* Split the existing block in the middle, size-wise */
1492 	size = 0;
1493 	move = 0;
1494 	for (i = count-1; i >= 0; i--) {
1495 		/* is more than half of this entry in 2nd half of the block? */
1496 		if (size + map[i].size/2 > blocksize/2)
1497 			break;
1498 		size += map[i].size;
1499 		move++;
1500 	}
1501 	/* map index at which we will split */
1502 	split = count - move;
1503 	hash2 = map[split].hash;
1504 	continued = hash2 == map[split - 1].hash;
1505 	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
1506 			(unsigned long)dx_get_block(frame->at),
1507 					hash2, split, count-split));
1508 
1509 	/* Fancy dance to stay within two buffers */
1510 	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1511 	de = dx_pack_dirents(data1, blocksize);
1512 	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1513 					   (char *) de,
1514 					   blocksize);
1515 	de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
1516 					    (char *) de2,
1517 					    blocksize);
1518 	if (csum_size) {
1519 		t = EXT4_DIRENT_TAIL(data2, blocksize);
1520 		initialize_dirent_tail(t, blocksize);
1521 
1522 		t = EXT4_DIRENT_TAIL(data1, blocksize);
1523 		initialize_dirent_tail(t, blocksize);
1524 	}
1525 
1526 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1527 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1528 
1529 	/* Which block gets the new entry? */
1530 	if (hinfo->hash >= hash2)
1531 	{
1532 		swap(*bh, bh2);
1533 		de = de2;
1534 	}
1535 	dx_insert_block(frame, hash2 + continued, newblock);
1536 	err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
1537 	if (err)
1538 		goto journal_error;
1539 	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1540 	if (err)
1541 		goto journal_error;
1542 	brelse(bh2);
1543 	dxtrace(dx_show_index("frame", frame->entries));
1544 	return de;
1545 
1546 journal_error:
1547 	brelse(*bh);
1548 	brelse(bh2);
1549 	*bh = NULL;
1550 	ext4_std_error(dir->i_sb, err);
1551 errout:
1552 	*error = err;
1553 	return NULL;
1554 }
1555 
1556 /*
1557  * Add a new entry into a directory (leaf) block.  If de is non-NULL,
1558  * it points to a directory entry which is guaranteed to be large
1559  * enough for new directory entry.  If de is NULL, then
1560  * add_dirent_to_buf will attempt search the directory block for
1561  * space.  It will return -ENOSPC if no space is available, and -EIO
1562  * and -EEXIST if directory entry already exists.
1563  */
1564 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1565 			     struct inode *inode, struct ext4_dir_entry_2 *de,
1566 			     struct buffer_head *bh)
1567 {
1568 	struct inode	*dir = dentry->d_parent->d_inode;
1569 	const char	*name = dentry->d_name.name;
1570 	int		namelen = dentry->d_name.len;
1571 	unsigned int	offset = 0;
1572 	unsigned int	blocksize = dir->i_sb->s_blocksize;
1573 	unsigned short	reclen;
1574 	int		nlen, rlen, err;
1575 	char		*top;
1576 	int		csum_size = 0;
1577 
1578 	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1579 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1580 		csum_size = sizeof(struct ext4_dir_entry_tail);
1581 
1582 	reclen = EXT4_DIR_REC_LEN(namelen);
1583 	if (!de) {
1584 		de = (struct ext4_dir_entry_2 *)bh->b_data;
1585 		top = bh->b_data + (blocksize - csum_size) - reclen;
1586 		while ((char *) de <= top) {
1587 			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1588 				return -EIO;
1589 			if (ext4_match(namelen, name, de))
1590 				return -EEXIST;
1591 			nlen = EXT4_DIR_REC_LEN(de->name_len);
1592 			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1593 			if ((de->inode? rlen - nlen: rlen) >= reclen)
1594 				break;
1595 			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1596 			offset += rlen;
1597 		}
1598 		if ((char *) de > top)
1599 			return -ENOSPC;
1600 	}
1601 	BUFFER_TRACE(bh, "get_write_access");
1602 	err = ext4_journal_get_write_access(handle, bh);
1603 	if (err) {
1604 		ext4_std_error(dir->i_sb, err);
1605 		return err;
1606 	}
1607 
1608 	/* By now the buffer is marked for journaling */
1609 	nlen = EXT4_DIR_REC_LEN(de->name_len);
1610 	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1611 	if (de->inode) {
1612 		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1613 		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1614 		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1615 		de = de1;
1616 	}
1617 	de->file_type = EXT4_FT_UNKNOWN;
1618 	de->inode = cpu_to_le32(inode->i_ino);
1619 	ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1620 	de->name_len = namelen;
1621 	memcpy(de->name, name, namelen);
1622 	/*
1623 	 * XXX shouldn't update any times until successful
1624 	 * completion of syscall, but too many callers depend
1625 	 * on this.
1626 	 *
1627 	 * XXX similarly, too many callers depend on
1628 	 * ext4_new_inode() setting the times, but error
1629 	 * recovery deletes the inode, so the worst that can
1630 	 * happen is that the times are slightly out of date
1631 	 * and/or different from the directory change time.
1632 	 */
1633 	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1634 	ext4_update_dx_flag(dir);
1635 	dir->i_version++;
1636 	ext4_mark_inode_dirty(handle, dir);
1637 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1638 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
1639 	if (err)
1640 		ext4_std_error(dir->i_sb, err);
1641 	return 0;
1642 }
1643 
/*
 * This converts a one block unindexed directory to a 3 block indexed
 * directory, and adds the dentry to the indexed directory.
 *
 * @bh is the directory's existing block.  Its entries after ".." are copied
 * to a newly appended block, @bh is rewritten as the index root, and the new
 * leaf is then split by do_split() before the entry for @dentry/@inode is
 * added.
 *
 * @bh is consumed: it is released on the early error paths and otherwise
 * handed to the frame that dx_release() is called on.  Returns 0 or the
 * error from whichever step failed.
 */
static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
			    struct inode *inode, struct buffer_head *bh)
{
	struct inode	*dir = dentry->d_parent->d_inode;
	const char	*name = dentry->d_name.name;
	int		namelen = dentry->d_name.len;
	struct buffer_head *bh2;
	struct dx_root	*root;
	struct dx_frame	frames[2], *frame;
	struct dx_entry *entries;
	struct ext4_dir_entry_2	*de, *de2;
	struct ext4_dir_entry_tail *t;
	char		*data1, *top;
	unsigned	len;
	int		retval;
	unsigned	blocksize;
	struct dx_hash_info hinfo;
	ext4_lblk_t  block;
	struct fake_dirent *fde;
	int		csum_size = 0;

	/* Leave room for the checksum tail when metadata_csum is enabled */
	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	blocksize =  dir->i_sb->s_blocksize;
	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
	retval = ext4_journal_get_write_access(handle, bh);
	if (retval) {
		ext4_std_error(dir->i_sb, retval);
		brelse(bh);
		return retval;
	}
	root = (struct dx_root *) bh->b_data;

	/* The 0th block becomes the root, move the dirents out */
	fde = &root->dotdot;
	/* de = first entry following "..", located through ".."'s rec_len */
	de = (struct ext4_dir_entry_2 *)((char *)fde +
		ext4_rec_len_from_disk(fde->rec_len, blocksize));
	if ((char *) de >= (((char *) root) + blocksize)) {
		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
		brelse(bh);
		return -EIO;
	}
	/* Number of bytes from de up to the end of the dirent area */
	len = ((char *) root) + (blocksize - csum_size) - (char *) de;

	/* Allocate new block for the 0th block's dirents */
	bh2 = ext4_append(handle, dir, &block, &retval);
	if (!(bh2)) {
		brelse(bh);
		return retval;
	}
	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
	data1 = bh2->b_data;

	memcpy (data1, de, len);
	de = (struct ext4_dir_entry_2 *) data1;
	top = data1 + len;
	/* Walk to the last copied entry ... */
	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
		de = de2;
	/* ... and stretch it to the end of the new block's dirent area */
	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
					   (char *) de,
					   blocksize);

	if (csum_size) {
		t = EXT4_DIRENT_TAIL(data1, blocksize);
		initialize_dirent_tail(t, blocksize);
	}

	/* Initialize the root; the dot dirents already exist */
	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
					   blocksize);
	memset (&root->info, 0, sizeof(root->info));
	root->info.info_length = sizeof(root->info);
	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
	entries = root->entries;
	/*
	 * The single index entry points at logical block 1.
	 * NOTE(review): presumably that is the block just appended, given the
	 * directory had one block - 'block' itself is not used here.
	 */
	dx_set_block(entries, 1);
	dx_set_count(entries, 1);
	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));

	/* Initialize as for dx_probe */
	hinfo.hash_version = root->info.hash_version;
	if (hinfo.hash_version <= DX_HASH_TEA)
		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
	ext4fs_dirhash(name, namelen, &hinfo);
	frame = frames;
	frame->entries = entries;
	frame->at = entries;
	frame->bh = bh;
	/* From here on bh refers to the new leaf; the root lives in frame->bh */
	bh = bh2;

	/* NOTE(review): the return values of these two calls are ignored */
	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	ext4_handle_dirty_dirent_node(handle, dir, bh);

	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
	if (!de) {
		/*
		 * Even if the block split failed, we have to properly write
		 * out all the changes we did so far. Otherwise we can end up
		 * with corrupted filesystem.
		 */
		ext4_mark_inode_dirty(handle, dir);
		dx_release(frames);
		return retval;
	}
	dx_release(frames);

	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
	brelse(bh);
	return retval;
}
1761 
1762 /*
1763  *	ext4_add_entry()
1764  *
1765  * adds a file entry to the specified directory, using the same
1766  * semantics as ext4_find_entry(). It returns NULL if it failed.
1767  *
1768  * NOTE!! The inode part of 'de' is left at 0 - which means you
1769  * may not sleep between calling this and putting something into
1770  * the entry, as someone else might have used it while you slept.
1771  */
1772 static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1773 			  struct inode *inode)
1774 {
1775 	struct inode *dir = dentry->d_parent->d_inode;
1776 	struct buffer_head *bh;
1777 	struct ext4_dir_entry_2 *de;
1778 	struct ext4_dir_entry_tail *t;
1779 	struct super_block *sb;
1780 	int	retval;
1781 	int	dx_fallback=0;
1782 	unsigned blocksize;
1783 	ext4_lblk_t block, blocks;
1784 	int	csum_size = 0;
1785 
1786 	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1787 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1788 		csum_size = sizeof(struct ext4_dir_entry_tail);
1789 
1790 	sb = dir->i_sb;
1791 	blocksize = sb->s_blocksize;
1792 	if (!dentry->d_name.len)
1793 		return -EINVAL;
1794 	if (is_dx(dir)) {
1795 		retval = ext4_dx_add_entry(handle, dentry, inode);
1796 		if (!retval || (retval != ERR_BAD_DX_DIR))
1797 			return retval;
1798 		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1799 		dx_fallback++;
1800 		ext4_mark_inode_dirty(handle, dir);
1801 	}
1802 	blocks = dir->i_size >> sb->s_blocksize_bits;
1803 	for (block = 0; block < blocks; block++) {
1804 		bh = ext4_bread(handle, dir, block, 0, &retval);
1805 		if(!bh)
1806 			return retval;
1807 		if (!buffer_verified(bh) &&
1808 		    !ext4_dirent_csum_verify(dir,
1809 				(struct ext4_dir_entry *)bh->b_data))
1810 			return -EIO;
1811 		set_buffer_verified(bh);
1812 		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1813 		if (retval != -ENOSPC) {
1814 			brelse(bh);
1815 			return retval;
1816 		}
1817 
1818 		if (blocks == 1 && !dx_fallback &&
1819 		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1820 			return make_indexed_dir(handle, dentry, inode, bh);
1821 		brelse(bh);
1822 	}
1823 	bh = ext4_append(handle, dir, &block, &retval);
1824 	if (!bh)
1825 		return retval;
1826 	de = (struct ext4_dir_entry_2 *) bh->b_data;
1827 	de->inode = 0;
1828 	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
1829 
1830 	if (csum_size) {
1831 		t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
1832 		initialize_dirent_tail(t, blocksize);
1833 	}
1834 
1835 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1836 	brelse(bh);
1837 	if (retval == 0)
1838 		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1839 	return retval;
1840 }
1841 
1842 /*
1843  * Returns 0 for success, or a negative error value
1844  */
1845 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1846 			     struct inode *inode)
1847 {
1848 	struct dx_frame frames[2], *frame;
1849 	struct dx_entry *entries, *at;
1850 	struct dx_hash_info hinfo;
1851 	struct buffer_head *bh;
1852 	struct inode *dir = dentry->d_parent->d_inode;
1853 	struct super_block *sb = dir->i_sb;
1854 	struct ext4_dir_entry_2 *de;
1855 	int err;
1856 
1857 	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1858 	if (!frame)
1859 		return err;
1860 	entries = frame->entries;
1861 	at = frame->at;
1862 
1863 	if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1864 		goto cleanup;
1865 
1866 	if (!buffer_verified(bh) &&
1867 	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
1868 		goto journal_error;
1869 	set_buffer_verified(bh);
1870 
1871 	BUFFER_TRACE(bh, "get_write_access");
1872 	err = ext4_journal_get_write_access(handle, bh);
1873 	if (err)
1874 		goto journal_error;
1875 
1876 	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1877 	if (err != -ENOSPC)
1878 		goto cleanup;
1879 
1880 	/* Block full, should compress but for now just split */
1881 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1882 		       dx_get_count(entries), dx_get_limit(entries)));
1883 	/* Need to split index? */
1884 	if (dx_get_count(entries) == dx_get_limit(entries)) {
1885 		ext4_lblk_t newblock;
1886 		unsigned icount = dx_get_count(entries);
1887 		int levels = frame - frames;
1888 		struct dx_entry *entries2;
1889 		struct dx_node *node2;
1890 		struct buffer_head *bh2;
1891 
1892 		if (levels && (dx_get_count(frames->entries) ==
1893 			       dx_get_limit(frames->entries))) {
1894 			ext4_warning(sb, "Directory index full!");
1895 			err = -ENOSPC;
1896 			goto cleanup;
1897 		}
1898 		bh2 = ext4_append (handle, dir, &newblock, &err);
1899 		if (!(bh2))
1900 			goto cleanup;
1901 		node2 = (struct dx_node *)(bh2->b_data);
1902 		entries2 = node2->entries;
1903 		memset(&node2->fake, 0, sizeof(struct fake_dirent));
1904 		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1905 							   sb->s_blocksize);
1906 		BUFFER_TRACE(frame->bh, "get_write_access");
1907 		err = ext4_journal_get_write_access(handle, frame->bh);
1908 		if (err)
1909 			goto journal_error;
1910 		if (levels) {
1911 			unsigned icount1 = icount/2, icount2 = icount - icount1;
1912 			unsigned hash2 = dx_get_hash(entries + icount1);
1913 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1914 				       icount1, icount2));
1915 
1916 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1917 			err = ext4_journal_get_write_access(handle,
1918 							     frames[0].bh);
1919 			if (err)
1920 				goto journal_error;
1921 
1922 			memcpy((char *) entries2, (char *) (entries + icount1),
1923 			       icount2 * sizeof(struct dx_entry));
1924 			dx_set_count(entries, icount1);
1925 			dx_set_count(entries2, icount2);
1926 			dx_set_limit(entries2, dx_node_limit(dir));
1927 
1928 			/* Which index block gets the new entry? */
1929 			if (at - entries >= icount1) {
1930 				frame->at = at = at - entries - icount1 + entries2;
1931 				frame->entries = entries = entries2;
1932 				swap(frame->bh, bh2);
1933 			}
1934 			dx_insert_block(frames + 0, hash2, newblock);
1935 			dxtrace(dx_show_index("node", frames[1].entries));
1936 			dxtrace(dx_show_index("node",
1937 			       ((struct dx_node *) bh2->b_data)->entries));
1938 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
1939 			if (err)
1940 				goto journal_error;
1941 			brelse (bh2);
1942 		} else {
1943 			dxtrace(printk(KERN_DEBUG
1944 				       "Creating second level index...\n"));
1945 			memcpy((char *) entries2, (char *) entries,
1946 			       icount * sizeof(struct dx_entry));
1947 			dx_set_limit(entries2, dx_node_limit(dir));
1948 
1949 			/* Set up root */
1950 			dx_set_count(entries, 1);
1951 			dx_set_block(entries + 0, newblock);
1952 			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1953 
1954 			/* Add new access path frame */
1955 			frame = frames + 1;
1956 			frame->at = at = at - entries + entries2;
1957 			frame->entries = entries = entries2;
1958 			frame->bh = bh2;
1959 			err = ext4_journal_get_write_access(handle,
1960 							     frame->bh);
1961 			if (err)
1962 				goto journal_error;
1963 		}
1964 		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
1965 		if (err) {
1966 			ext4_std_error(inode->i_sb, err);
1967 			goto cleanup;
1968 		}
1969 	}
1970 	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1971 	if (!de)
1972 		goto cleanup;
1973 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1974 	goto cleanup;
1975 
1976 journal_error:
1977 	ext4_std_error(dir->i_sb, err);
1978 cleanup:
1979 	if (bh)
1980 		brelse(bh);
1981 	dx_release(frames);
1982 	return err;
1983 }
1984 
1985 /*
1986  * ext4_delete_entry deletes a directory entry by merging it with the
1987  * previous entry
1988  */
1989 static int ext4_delete_entry(handle_t *handle,
1990 			     struct inode *dir,
1991 			     struct ext4_dir_entry_2 *de_del,
1992 			     struct buffer_head *bh)
1993 {
1994 	struct ext4_dir_entry_2 *de, *pde;
1995 	unsigned int blocksize = dir->i_sb->s_blocksize;
1996 	int csum_size = 0;
1997 	int i, err;
1998 
1999 	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2000 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2001 		csum_size = sizeof(struct ext4_dir_entry_tail);
2002 
2003 	i = 0;
2004 	pde = NULL;
2005 	de = (struct ext4_dir_entry_2 *) bh->b_data;
2006 	while (i < bh->b_size - csum_size) {
2007 		if (ext4_check_dir_entry(dir, NULL, de, bh, i))
2008 			return -EIO;
2009 		if (de == de_del)  {
2010 			BUFFER_TRACE(bh, "get_write_access");
2011 			err = ext4_journal_get_write_access(handle, bh);
2012 			if (unlikely(err)) {
2013 				ext4_std_error(dir->i_sb, err);
2014 				return err;
2015 			}
2016 			if (pde)
2017 				pde->rec_len = ext4_rec_len_to_disk(
2018 					ext4_rec_len_from_disk(pde->rec_len,
2019 							       blocksize) +
2020 					ext4_rec_len_from_disk(de->rec_len,
2021 							       blocksize),
2022 					blocksize);
2023 			else
2024 				de->inode = 0;
2025 			dir->i_version++;
2026 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2027 			err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2028 			if (unlikely(err)) {
2029 				ext4_std_error(dir->i_sb, err);
2030 				return err;
2031 			}
2032 			return 0;
2033 		}
2034 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
2035 		pde = de;
2036 		de = ext4_next_entry(de, blocksize);
2037 	}
2038 	return -ENOENT;
2039 }
2040 
2041 /*
2042  * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
2043  * since this indicates that nlinks count was previously 1.
2044  */
2045 static void ext4_inc_count(handle_t *handle, struct inode *inode)
2046 {
2047 	inc_nlink(inode);
2048 	if (is_dx(inode) && inode->i_nlink > 1) {
2049 		/* limit is 16-bit i_links_count */
2050 		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
2051 			set_nlink(inode, 1);
2052 			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
2053 					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
2054 		}
2055 	}
2056 }
2057 
2058 /*
2059  * If a directory had nlink == 1, then we should let it be 1. This indicates
2060  * directory has >EXT4_LINK_MAX subdirs.
2061  */
2062 static void ext4_dec_count(handle_t *handle, struct inode *inode)
2063 {
2064 	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
2065 		drop_nlink(inode);
2066 }
2067 
2068 
2069 static int ext4_add_nondir(handle_t *handle,
2070 		struct dentry *dentry, struct inode *inode)
2071 {
2072 	int err = ext4_add_entry(handle, dentry, inode);
2073 	if (!err) {
2074 		ext4_mark_inode_dirty(handle, inode);
2075 		unlock_new_inode(inode);
2076 		d_instantiate(dentry, inode);
2077 		return 0;
2078 	}
2079 	drop_nlink(inode);
2080 	unlock_new_inode(inode);
2081 	iput(inode);
2082 	return err;
2083 }
2084 
2085 /*
2086  * By the time this is called, we already have created
2087  * the directory cache entry for the new file, but it
2088  * is so far negative - it has no inode.
2089  *
2090  * If the create succeeds, we fill in the inode information
2091  * with d_instantiate().
2092  */
2093 static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2094 		       bool excl)
2095 {
2096 	handle_t *handle;
2097 	struct inode *inode;
2098 	int err, retries = 0;
2099 
2100 	dquot_initialize(dir);
2101 
2102 retry:
2103 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2104 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2105 					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2106 	if (IS_ERR(handle))
2107 		return PTR_ERR(handle);
2108 
2109 	if (IS_DIRSYNC(dir))
2110 		ext4_handle_sync(handle);
2111 
2112 	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
2113 	err = PTR_ERR(inode);
2114 	if (!IS_ERR(inode)) {
2115 		inode->i_op = &ext4_file_inode_operations;
2116 		inode->i_fop = &ext4_file_operations;
2117 		ext4_set_aops(inode);
2118 		err = ext4_add_nondir(handle, dentry, inode);
2119 	}
2120 	ext4_journal_stop(handle);
2121 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2122 		goto retry;
2123 	return err;
2124 }
2125 
2126 static int ext4_mknod(struct inode *dir, struct dentry *dentry,
2127 		      umode_t mode, dev_t rdev)
2128 {
2129 	handle_t *handle;
2130 	struct inode *inode;
2131 	int err, retries = 0;
2132 
2133 	if (!new_valid_dev(rdev))
2134 		return -EINVAL;
2135 
2136 	dquot_initialize(dir);
2137 
2138 retry:
2139 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2140 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2141 					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2142 	if (IS_ERR(handle))
2143 		return PTR_ERR(handle);
2144 
2145 	if (IS_DIRSYNC(dir))
2146 		ext4_handle_sync(handle);
2147 
2148 	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
2149 	err = PTR_ERR(inode);
2150 	if (!IS_ERR(inode)) {
2151 		init_special_inode(inode, inode->i_mode, rdev);
2152 #ifdef CONFIG_EXT4_FS_XATTR
2153 		inode->i_op = &ext4_special_inode_operations;
2154 #endif
2155 		err = ext4_add_nondir(handle, dentry, inode);
2156 	}
2157 	ext4_journal_stop(handle);
2158 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2159 		goto retry;
2160 	return err;
2161 }
2162 
2163 static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2164 {
2165 	handle_t *handle;
2166 	struct inode *inode;
2167 	struct buffer_head *dir_block = NULL;
2168 	struct ext4_dir_entry_2 *de;
2169 	struct ext4_dir_entry_tail *t;
2170 	unsigned int blocksize = dir->i_sb->s_blocksize;
2171 	int csum_size = 0;
2172 	int err, retries = 0;
2173 
2174 	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2175 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2176 		csum_size = sizeof(struct ext4_dir_entry_tail);
2177 
2178 	if (EXT4_DIR_LINK_MAX(dir))
2179 		return -EMLINK;
2180 
2181 	dquot_initialize(dir);
2182 
2183 retry:
2184 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2185 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2186 					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2187 	if (IS_ERR(handle))
2188 		return PTR_ERR(handle);
2189 
2190 	if (IS_DIRSYNC(dir))
2191 		ext4_handle_sync(handle);
2192 
2193 	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
2194 			       &dentry->d_name, 0, NULL);
2195 	err = PTR_ERR(inode);
2196 	if (IS_ERR(inode))
2197 		goto out_stop;
2198 
2199 	inode->i_op = &ext4_dir_inode_operations;
2200 	inode->i_fop = &ext4_dir_operations;
2201 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
2202 	dir_block = ext4_bread(handle, inode, 0, 1, &err);
2203 	if (!dir_block)
2204 		goto out_clear_inode;
2205 	BUFFER_TRACE(dir_block, "get_write_access");
2206 	err = ext4_journal_get_write_access(handle, dir_block);
2207 	if (err)
2208 		goto out_clear_inode;
2209 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
2210 	de->inode = cpu_to_le32(inode->i_ino);
2211 	de->name_len = 1;
2212 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2213 					   blocksize);
2214 	strcpy(de->name, ".");
2215 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2216 	de = ext4_next_entry(de, blocksize);
2217 	de->inode = cpu_to_le32(dir->i_ino);
2218 	de->rec_len = ext4_rec_len_to_disk(blocksize -
2219 					   (csum_size + EXT4_DIR_REC_LEN(1)),
2220 					   blocksize);
2221 	de->name_len = 2;
2222 	strcpy(de->name, "..");
2223 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2224 	set_nlink(inode, 2);
2225 
2226 	if (csum_size) {
2227 		t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2228 		initialize_dirent_tail(t, blocksize);
2229 	}
2230 
2231 	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2232 	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2233 	if (err)
2234 		goto out_clear_inode;
2235 	set_buffer_verified(dir_block);
2236 	err = ext4_mark_inode_dirty(handle, inode);
2237 	if (!err)
2238 		err = ext4_add_entry(handle, dentry, inode);
2239 	if (err) {
2240 out_clear_inode:
2241 		clear_nlink(inode);
2242 		unlock_new_inode(inode);
2243 		ext4_mark_inode_dirty(handle, inode);
2244 		iput(inode);
2245 		goto out_stop;
2246 	}
2247 	ext4_inc_count(handle, dir);
2248 	ext4_update_dx_flag(dir);
2249 	err = ext4_mark_inode_dirty(handle, dir);
2250 	if (err)
2251 		goto out_clear_inode;
2252 	unlock_new_inode(inode);
2253 	d_instantiate(dentry, inode);
2254 out_stop:
2255 	brelse(dir_block);
2256 	ext4_journal_stop(handle);
2257 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2258 		goto retry;
2259 	return err;
2260 }
2261 
2262 /*
2263  * routine to check that the specified directory is empty (for rmdir)
2264  */
2265 static int empty_dir(struct inode *inode)
2266 {
2267 	unsigned int offset;
2268 	struct buffer_head *bh;
2269 	struct ext4_dir_entry_2 *de, *de1;
2270 	struct super_block *sb;
2271 	int err = 0;
2272 
2273 	sb = inode->i_sb;
2274 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
2275 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
2276 		if (err)
2277 			EXT4_ERROR_INODE(inode,
2278 				"error %d reading directory lblock 0", err);
2279 		else
2280 			ext4_warning(inode->i_sb,
2281 				     "bad directory (dir #%lu) - no data block",
2282 				     inode->i_ino);
2283 		return 1;
2284 	}
2285 	if (!buffer_verified(bh) &&
2286 	    !ext4_dirent_csum_verify(inode,
2287 			(struct ext4_dir_entry *)bh->b_data)) {
2288 		EXT4_ERROR_INODE(inode, "checksum error reading directory "
2289 				 "lblock 0");
2290 		return -EIO;
2291 	}
2292 	set_buffer_verified(bh);
2293 	de = (struct ext4_dir_entry_2 *) bh->b_data;
2294 	de1 = ext4_next_entry(de, sb->s_blocksize);
2295 	if (le32_to_cpu(de->inode) != inode->i_ino ||
2296 			!le32_to_cpu(de1->inode) ||
2297 			strcmp(".", de->name) ||
2298 			strcmp("..", de1->name)) {
2299 		ext4_warning(inode->i_sb,
2300 			     "bad directory (dir #%lu) - no `.' or `..'",
2301 			     inode->i_ino);
2302 		brelse(bh);
2303 		return 1;
2304 	}
2305 	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
2306 		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
2307 	de = ext4_next_entry(de1, sb->s_blocksize);
2308 	while (offset < inode->i_size) {
2309 		if (!bh ||
2310 		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
2311 			unsigned int lblock;
2312 			err = 0;
2313 			brelse(bh);
2314 			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
2315 			bh = ext4_bread(NULL, inode, lblock, 0, &err);
2316 			if (!bh) {
2317 				if (err)
2318 					EXT4_ERROR_INODE(inode,
2319 						"error %d reading directory "
2320 						"lblock %u", err, lblock);
2321 				offset += sb->s_blocksize;
2322 				continue;
2323 			}
2324 			if (!buffer_verified(bh) &&
2325 			    !ext4_dirent_csum_verify(inode,
2326 					(struct ext4_dir_entry *)bh->b_data)) {
2327 				EXT4_ERROR_INODE(inode, "checksum error "
2328 						 "reading directory lblock 0");
2329 				return -EIO;
2330 			}
2331 			set_buffer_verified(bh);
2332 			de = (struct ext4_dir_entry_2 *) bh->b_data;
2333 		}
2334 		if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
2335 			de = (struct ext4_dir_entry_2 *)(bh->b_data +
2336 							 sb->s_blocksize);
2337 			offset = (offset | (sb->s_blocksize - 1)) + 1;
2338 			continue;
2339 		}
2340 		if (le32_to_cpu(de->inode)) {
2341 			brelse(bh);
2342 			return 0;
2343 		}
2344 		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
2345 		de = ext4_next_entry(de, sb->s_blocksize);
2346 	}
2347 	brelse(bh);
2348 	return 1;
2349 }
2350 
2351 /* ext4_orphan_add() links an unlinked or truncated inode into a list of
2352  * such inodes, starting at the superblock, in case we crash before the
2353  * file is closed/deleted, or in case the inode truncate spans multiple
2354  * transactions and the last transaction is not recovered after a crash.
2355  *
2356  * At filesystem recovery time, we walk this list deleting unlinked
2357  * inodes and truncating linked inodes in ext4_orphan_cleanup().
2358  */
2359 int ext4_orphan_add(handle_t *handle, struct inode *inode)
2360 {
2361 	struct super_block *sb = inode->i_sb;
2362 	struct ext4_iloc iloc;
2363 	int err = 0, rc;
2364 
2365 	if (!ext4_handle_valid(handle))
2366 		return 0;
2367 
2368 	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
2369 	if (!list_empty(&EXT4_I(inode)->i_orphan))
2370 		goto out_unlock;
2371 
2372 	/*
2373 	 * Orphan handling is only valid for files with data blocks
2374 	 * being truncated, or files being unlinked. Note that we either
2375 	 * hold i_mutex, or the inode can not be referenced from outside,
2376 	 * so i_nlink should not be bumped due to race
2377 	 */
2378 	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2379 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
2380 
2381 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
2382 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
2383 	if (err)
2384 		goto out_unlock;
2385 
2386 	err = ext4_reserve_inode_write(handle, inode, &iloc);
2387 	if (err)
2388 		goto out_unlock;
2389 	/*
2390 	 * Due to previous errors inode may be already a part of on-disk
2391 	 * orphan list. If so skip on-disk list modification.
2392 	 */
2393 	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2394 		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2395 			goto mem_insert;
2396 
2397 	/* Insert this inode at the head of the on-disk orphan list... */
2398 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2399 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2400 	err = ext4_handle_dirty_super(handle, sb);
2401 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2402 	if (!err)
2403 		err = rc;
2404 
2405 	/* Only add to the head of the in-memory list if all the
2406 	 * previous operations succeeded.  If the orphan_add is going to
2407 	 * fail (possibly taking the journal offline), we can't risk
2408 	 * leaving the inode on the orphan list: stray orphan-list
2409 	 * entries can cause panics at unmount time.
2410 	 *
2411 	 * This is safe: on error we're going to ignore the orphan list
2412 	 * anyway on the next recovery. */
2413 mem_insert:
2414 	if (!err)
2415 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2416 
2417 	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
2418 	jbd_debug(4, "orphan inode %lu will point to %d\n",
2419 			inode->i_ino, NEXT_ORPHAN(inode));
2420 out_unlock:
2421 	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
2422 	ext4_std_error(inode->i_sb, err);
2423 	return err;
2424 }
2425 
2426 /*
2427  * ext4_orphan_del() removes an unlinked or truncated inode from the list
2428  * of such inodes stored on disk, because it is finally being cleaned up.
2429  */
2430 int ext4_orphan_del(handle_t *handle, struct inode *inode)
2431 {
2432 	struct list_head *prev;
2433 	struct ext4_inode_info *ei = EXT4_I(inode);
2434 	struct ext4_sb_info *sbi;
2435 	__u32 ino_next;
2436 	struct ext4_iloc iloc;
2437 	int err = 0;
2438 
2439 	/* ext4_handle_valid() assumes a valid handle_t pointer */
2440 	if (handle && !ext4_handle_valid(handle))
2441 		return 0;
2442 
2443 	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2444 	if (list_empty(&ei->i_orphan))
2445 		goto out;
2446 
2447 	ino_next = NEXT_ORPHAN(inode);
2448 	prev = ei->i_orphan.prev;
2449 	sbi = EXT4_SB(inode->i_sb);
2450 
2451 	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
2452 
2453 	list_del_init(&ei->i_orphan);
2454 
2455 	/* If we're on an error path, we may not have a valid
2456 	 * transaction handle with which to update the orphan list on
2457 	 * disk, but we still need to remove the inode from the linked
2458 	 * list in memory. */
2459 	if (sbi->s_journal && !handle)
2460 		goto out;
2461 
2462 	err = ext4_reserve_inode_write(handle, inode, &iloc);
2463 	if (err)
2464 		goto out_err;
2465 
2466 	if (prev == &sbi->s_orphan) {
2467 		jbd_debug(4, "superblock will point to %u\n", ino_next);
2468 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2469 		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2470 		if (err)
2471 			goto out_brelse;
2472 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2473 		err = ext4_handle_dirty_super(handle, inode->i_sb);
2474 	} else {
2475 		struct ext4_iloc iloc2;
2476 		struct inode *i_prev =
2477 			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2478 
2479 		jbd_debug(4, "orphan inode %lu will point to %u\n",
2480 			  i_prev->i_ino, ino_next);
2481 		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2482 		if (err)
2483 			goto out_brelse;
2484 		NEXT_ORPHAN(i_prev) = ino_next;
2485 		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
2486 	}
2487 	if (err)
2488 		goto out_brelse;
2489 	NEXT_ORPHAN(inode) = 0;
2490 	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
2491 
2492 out_err:
2493 	ext4_std_error(inode->i_sb, err);
2494 out:
2495 	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2496 	return err;
2497 
2498 out_brelse:
2499 	brelse(iloc.bh);
2500 	goto out_err;
2501 }
2502 
2503 static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2504 {
2505 	int retval;
2506 	struct inode *inode;
2507 	struct buffer_head *bh;
2508 	struct ext4_dir_entry_2 *de;
2509 	handle_t *handle;
2510 
2511 	/* Initialize quotas before so that eventual writes go in
2512 	 * separate transaction */
2513 	dquot_initialize(dir);
2514 	dquot_initialize(dentry->d_inode);
2515 
2516 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2517 	if (IS_ERR(handle))
2518 		return PTR_ERR(handle);
2519 
2520 	retval = -ENOENT;
2521 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
2522 	if (!bh)
2523 		goto end_rmdir;
2524 
2525 	if (IS_DIRSYNC(dir))
2526 		ext4_handle_sync(handle);
2527 
2528 	inode = dentry->d_inode;
2529 
2530 	retval = -EIO;
2531 	if (le32_to_cpu(de->inode) != inode->i_ino)
2532 		goto end_rmdir;
2533 
2534 	retval = -ENOTEMPTY;
2535 	if (!empty_dir(inode))
2536 		goto end_rmdir;
2537 
2538 	retval = ext4_delete_entry(handle, dir, de, bh);
2539 	if (retval)
2540 		goto end_rmdir;
2541 	if (!EXT4_DIR_LINK_EMPTY(inode))
2542 		ext4_warning(inode->i_sb,
2543 			     "empty directory has too many links (%d)",
2544 			     inode->i_nlink);
2545 	inode->i_version++;
2546 	clear_nlink(inode);
2547 	/* There's no need to set i_disksize: the fact that i_nlink is
2548 	 * zero will ensure that the right thing happens during any
2549 	 * recovery. */
2550 	inode->i_size = 0;
2551 	ext4_orphan_add(handle, inode);
2552 	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
2553 	ext4_mark_inode_dirty(handle, inode);
2554 	ext4_dec_count(handle, dir);
2555 	ext4_update_dx_flag(dir);
2556 	ext4_mark_inode_dirty(handle, dir);
2557 
2558 end_rmdir:
2559 	ext4_journal_stop(handle);
2560 	brelse(bh);
2561 	return retval;
2562 }
2563 
2564 static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2565 {
2566 	int retval;
2567 	struct inode *inode;
2568 	struct buffer_head *bh;
2569 	struct ext4_dir_entry_2 *de;
2570 	handle_t *handle;
2571 
2572 	trace_ext4_unlink_enter(dir, dentry);
2573 	/* Initialize quotas before so that eventual writes go
2574 	 * in separate transaction */
2575 	dquot_initialize(dir);
2576 	dquot_initialize(dentry->d_inode);
2577 
2578 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2579 	if (IS_ERR(handle))
2580 		return PTR_ERR(handle);
2581 
2582 	if (IS_DIRSYNC(dir))
2583 		ext4_handle_sync(handle);
2584 
2585 	retval = -ENOENT;
2586 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
2587 	if (!bh)
2588 		goto end_unlink;
2589 
2590 	inode = dentry->d_inode;
2591 
2592 	retval = -EIO;
2593 	if (le32_to_cpu(de->inode) != inode->i_ino)
2594 		goto end_unlink;
2595 
2596 	if (!inode->i_nlink) {
2597 		ext4_warning(inode->i_sb,
2598 			     "Deleting nonexistent file (%lu), %d",
2599 			     inode->i_ino, inode->i_nlink);
2600 		set_nlink(inode, 1);
2601 	}
2602 	retval = ext4_delete_entry(handle, dir, de, bh);
2603 	if (retval)
2604 		goto end_unlink;
2605 	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
2606 	ext4_update_dx_flag(dir);
2607 	ext4_mark_inode_dirty(handle, dir);
2608 	drop_nlink(inode);
2609 	if (!inode->i_nlink)
2610 		ext4_orphan_add(handle, inode);
2611 	inode->i_ctime = ext4_current_time(inode);
2612 	ext4_mark_inode_dirty(handle, inode);
2613 	retval = 0;
2614 
2615 end_unlink:
2616 	ext4_journal_stop(handle);
2617 	brelse(bh);
2618 	trace_ext4_unlink_exit(dentry, retval);
2619 	return retval;
2620 }
2621 
2622 static int ext4_symlink(struct inode *dir,
2623 			struct dentry *dentry, const char *symname)
2624 {
2625 	handle_t *handle;
2626 	struct inode *inode;
2627 	int l, err, retries = 0;
2628 	int credits;
2629 
2630 	l = strlen(symname)+1;
2631 	if (l > dir->i_sb->s_blocksize)
2632 		return -ENAMETOOLONG;
2633 
2634 	dquot_initialize(dir);
2635 
2636 	if (l > EXT4_N_BLOCKS * 4) {
2637 		/*
2638 		 * For non-fast symlinks, we just allocate inode and put it on
2639 		 * orphan list in the first transaction => we need bitmap,
2640 		 * group descriptor, sb, inode block, quota blocks, and
2641 		 * possibly selinux xattr blocks.
2642 		 */
2643 		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2644 			  EXT4_XATTR_TRANS_BLOCKS;
2645 	} else {
2646 		/*
2647 		 * Fast symlink. We have to add entry to directory
2648 		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2649 		 * allocate new inode (bitmap, group descriptor, inode block,
2650 		 * quota blocks, sb is already counted in previous macros).
2651 		 */
2652 		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2653 			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2654 			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2655 	}
2656 retry:
2657 	handle = ext4_journal_start(dir, credits);
2658 	if (IS_ERR(handle))
2659 		return PTR_ERR(handle);
2660 
2661 	if (IS_DIRSYNC(dir))
2662 		ext4_handle_sync(handle);
2663 
2664 	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
2665 			       &dentry->d_name, 0, NULL);
2666 	err = PTR_ERR(inode);
2667 	if (IS_ERR(inode))
2668 		goto out_stop;
2669 
2670 	if (l > EXT4_N_BLOCKS * 4) {
2671 		inode->i_op = &ext4_symlink_inode_operations;
2672 		ext4_set_aops(inode);
2673 		/*
2674 		 * We cannot call page_symlink() with transaction started
2675 		 * because it calls into ext4_write_begin() which can wait
2676 		 * for transaction commit if we are running out of space
2677 		 * and thus we deadlock. So we have to stop transaction now
2678 		 * and restart it when symlink contents is written.
2679 		 *
2680 		 * To keep fs consistent in case of crash, we have to put inode
2681 		 * to orphan list in the mean time.
2682 		 */
2683 		drop_nlink(inode);
2684 		err = ext4_orphan_add(handle, inode);
2685 		ext4_journal_stop(handle);
2686 		if (err)
2687 			goto err_drop_inode;
2688 		err = __page_symlink(inode, symname, l, 1);
2689 		if (err)
2690 			goto err_drop_inode;
2691 		/*
2692 		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2693 		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2694 		 */
2695 		handle = ext4_journal_start(dir,
2696 				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2697 				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2698 		if (IS_ERR(handle)) {
2699 			err = PTR_ERR(handle);
2700 			goto err_drop_inode;
2701 		}
2702 		set_nlink(inode, 1);
2703 		err = ext4_orphan_del(handle, inode);
2704 		if (err) {
2705 			ext4_journal_stop(handle);
2706 			clear_nlink(inode);
2707 			goto err_drop_inode;
2708 		}
2709 	} else {
2710 		/* clear the extent format for fast symlink */
2711 		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2712 		inode->i_op = &ext4_fast_symlink_inode_operations;
2713 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2714 		inode->i_size = l-1;
2715 	}
2716 	EXT4_I(inode)->i_disksize = inode->i_size;
2717 	err = ext4_add_nondir(handle, dentry, inode);
2718 out_stop:
2719 	ext4_journal_stop(handle);
2720 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2721 		goto retry;
2722 	return err;
2723 err_drop_inode:
2724 	unlock_new_inode(inode);
2725 	iput(inode);
2726 	return err;
2727 }
2728 
2729 static int ext4_link(struct dentry *old_dentry,
2730 		     struct inode *dir, struct dentry *dentry)
2731 {
2732 	handle_t *handle;
2733 	struct inode *inode = old_dentry->d_inode;
2734 	int err, retries = 0;
2735 
2736 	if (inode->i_nlink >= EXT4_LINK_MAX)
2737 		return -EMLINK;
2738 
2739 	dquot_initialize(dir);
2740 
2741 retry:
2742 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2743 					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
2744 	if (IS_ERR(handle))
2745 		return PTR_ERR(handle);
2746 
2747 	if (IS_DIRSYNC(dir))
2748 		ext4_handle_sync(handle);
2749 
2750 	inode->i_ctime = ext4_current_time(inode);
2751 	ext4_inc_count(handle, inode);
2752 	ihold(inode);
2753 
2754 	err = ext4_add_entry(handle, dentry, inode);
2755 	if (!err) {
2756 		ext4_mark_inode_dirty(handle, inode);
2757 		d_instantiate(dentry, inode);
2758 	} else {
2759 		drop_nlink(inode);
2760 		iput(inode);
2761 	}
2762 	ext4_journal_stop(handle);
2763 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2764 		goto retry;
2765 	return err;
2766 }
2767 
/*
 * PARENT_INO() names the on-disk inode field of the second entry in the
 * directory block at @buffer (block size @size).  ext4_rename() uses it
 * both to read and to assign the ".." parent pointer of a directory.
 */
#define PARENT_INO(buffer, size)					\
	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), (size))->inode)
2770 
2771 /*
2772  * Anybody can rename anything with this: the permission checks are left to the
2773  * higher-level routines.
2774  */
2775 static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2776 		       struct inode *new_dir, struct dentry *new_dentry)
2777 {
2778 	handle_t *handle;
2779 	struct inode *old_inode, *new_inode;
2780 	struct buffer_head *old_bh, *new_bh, *dir_bh;
2781 	struct ext4_dir_entry_2 *old_de, *new_de;
2782 	int retval, force_da_alloc = 0;
2783 
2784 	dquot_initialize(old_dir);
2785 	dquot_initialize(new_dir);
2786 
2787 	old_bh = new_bh = dir_bh = NULL;
2788 
2789 	/* Initialize quotas before so that eventual writes go
2790 	 * in separate transaction */
2791 	if (new_dentry->d_inode)
2792 		dquot_initialize(new_dentry->d_inode);
2793 	handle = ext4_journal_start(old_dir, 2 *
2794 					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2795 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
2796 	if (IS_ERR(handle))
2797 		return PTR_ERR(handle);
2798 
2799 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2800 		ext4_handle_sync(handle);
2801 
2802 	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2803 	/*
2804 	 *  Check for inode number is _not_ due to possible IO errors.
2805 	 *  We might rmdir the source, keep it as pwd of some process
2806 	 *  and merrily kill the link to whatever was created under the
2807 	 *  same name. Goodbye sticky bit ;-<
2808 	 */
2809 	old_inode = old_dentry->d_inode;
2810 	retval = -ENOENT;
2811 	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2812 		goto end_rename;
2813 
2814 	new_inode = new_dentry->d_inode;
2815 	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2816 	if (new_bh) {
2817 		if (!new_inode) {
2818 			brelse(new_bh);
2819 			new_bh = NULL;
2820 		}
2821 	}
2822 	if (S_ISDIR(old_inode->i_mode)) {
2823 		if (new_inode) {
2824 			retval = -ENOTEMPTY;
2825 			if (!empty_dir(new_inode))
2826 				goto end_rename;
2827 		}
2828 		retval = -EIO;
2829 		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2830 		if (!dir_bh)
2831 			goto end_rename;
2832 		if (!buffer_verified(dir_bh) &&
2833 		    !ext4_dirent_csum_verify(old_inode,
2834 				(struct ext4_dir_entry *)dir_bh->b_data))
2835 			goto end_rename;
2836 		set_buffer_verified(dir_bh);
2837 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2838 				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2839 			goto end_rename;
2840 		retval = -EMLINK;
2841 		if (!new_inode && new_dir != old_dir &&
2842 		    EXT4_DIR_LINK_MAX(new_dir))
2843 			goto end_rename;
2844 		BUFFER_TRACE(dir_bh, "get_write_access");
2845 		retval = ext4_journal_get_write_access(handle, dir_bh);
2846 		if (retval)
2847 			goto end_rename;
2848 	}
2849 	if (!new_bh) {
2850 		retval = ext4_add_entry(handle, new_dentry, old_inode);
2851 		if (retval)
2852 			goto end_rename;
2853 	} else {
2854 		BUFFER_TRACE(new_bh, "get write access");
2855 		retval = ext4_journal_get_write_access(handle, new_bh);
2856 		if (retval)
2857 			goto end_rename;
2858 		new_de->inode = cpu_to_le32(old_inode->i_ino);
2859 		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2860 					      EXT4_FEATURE_INCOMPAT_FILETYPE))
2861 			new_de->file_type = old_de->file_type;
2862 		new_dir->i_version++;
2863 		new_dir->i_ctime = new_dir->i_mtime =
2864 					ext4_current_time(new_dir);
2865 		ext4_mark_inode_dirty(handle, new_dir);
2866 		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2867 		retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
2868 		if (unlikely(retval)) {
2869 			ext4_std_error(new_dir->i_sb, retval);
2870 			goto end_rename;
2871 		}
2872 		brelse(new_bh);
2873 		new_bh = NULL;
2874 	}
2875 
2876 	/*
2877 	 * Like most other Unix systems, set the ctime for inodes on a
2878 	 * rename.
2879 	 */
2880 	old_inode->i_ctime = ext4_current_time(old_inode);
2881 	ext4_mark_inode_dirty(handle, old_inode);
2882 
2883 	/*
2884 	 * ok, that's it
2885 	 */
2886 	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2887 	    old_de->name_len != old_dentry->d_name.len ||
2888 	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2889 	    (retval = ext4_delete_entry(handle, old_dir,
2890 					old_de, old_bh)) == -ENOENT) {
2891 		/* old_de could have moved from under us during htree split, so
2892 		 * make sure that we are deleting the right entry.  We might
2893 		 * also be pointing to a stale entry in the unused part of
2894 		 * old_bh so just checking inum and the name isn't enough. */
2895 		struct buffer_head *old_bh2;
2896 		struct ext4_dir_entry_2 *old_de2;
2897 
2898 		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2899 		if (old_bh2) {
2900 			retval = ext4_delete_entry(handle, old_dir,
2901 						   old_de2, old_bh2);
2902 			brelse(old_bh2);
2903 		}
2904 	}
2905 	if (retval) {
2906 		ext4_warning(old_dir->i_sb,
2907 				"Deleting old file (%lu), %d, error=%d",
2908 				old_dir->i_ino, old_dir->i_nlink, retval);
2909 	}
2910 
2911 	if (new_inode) {
2912 		ext4_dec_count(handle, new_inode);
2913 		new_inode->i_ctime = ext4_current_time(new_inode);
2914 	}
2915 	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2916 	ext4_update_dx_flag(old_dir);
2917 	if (dir_bh) {
2918 		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2919 						cpu_to_le32(new_dir->i_ino);
2920 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2921 		if (is_dx(old_inode)) {
2922 			retval = ext4_handle_dirty_dx_node(handle,
2923 							   old_inode,
2924 							   dir_bh);
2925 		} else {
2926 			retval = ext4_handle_dirty_dirent_node(handle,
2927 							       old_inode,
2928 							       dir_bh);
2929 		}
2930 		if (retval) {
2931 			ext4_std_error(old_dir->i_sb, retval);
2932 			goto end_rename;
2933 		}
2934 		ext4_dec_count(handle, old_dir);
2935 		if (new_inode) {
2936 			/* checked empty_dir above, can't have another parent,
2937 			 * ext4_dec_count() won't work for many-linked dirs */
2938 			clear_nlink(new_inode);
2939 		} else {
2940 			ext4_inc_count(handle, new_dir);
2941 			ext4_update_dx_flag(new_dir);
2942 			ext4_mark_inode_dirty(handle, new_dir);
2943 		}
2944 	}
2945 	ext4_mark_inode_dirty(handle, old_dir);
2946 	if (new_inode) {
2947 		ext4_mark_inode_dirty(handle, new_inode);
2948 		if (!new_inode->i_nlink)
2949 			ext4_orphan_add(handle, new_inode);
2950 		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2951 			force_da_alloc = 1;
2952 	}
2953 	retval = 0;
2954 
2955 end_rename:
2956 	brelse(dir_bh);
2957 	brelse(old_bh);
2958 	brelse(new_bh);
2959 	ext4_journal_stop(handle);
2960 	if (retval == 0 && force_da_alloc)
2961 		ext4_alloc_da_blocks(old_inode);
2962 	return retval;
2963 }
2964 
2965 /*
2966  * directories can handle most operations...
2967  */
2968 const struct inode_operations ext4_dir_inode_operations = {
2969 	.create		= ext4_create,
2970 	.lookup		= ext4_lookup,
2971 	.link		= ext4_link,
2972 	.unlink		= ext4_unlink,
2973 	.symlink	= ext4_symlink,
2974 	.mkdir		= ext4_mkdir,
2975 	.rmdir		= ext4_rmdir,
2976 	.mknod		= ext4_mknod,
2977 	.rename		= ext4_rename,
2978 	.setattr	= ext4_setattr,
2979 #ifdef CONFIG_EXT4_FS_XATTR
2980 	.setxattr	= generic_setxattr,
2981 	.getxattr	= generic_getxattr,
2982 	.listxattr	= ext4_listxattr,
2983 	.removexattr	= generic_removexattr,
2984 #endif
2985 	.get_acl	= ext4_get_acl,
2986 	.fiemap         = ext4_fiemap,
2987 };
2988 
2989 const struct inode_operations ext4_special_inode_operations = {
2990 	.setattr	= ext4_setattr,
2991 #ifdef CONFIG_EXT4_FS_XATTR
2992 	.setxattr	= generic_setxattr,
2993 	.getxattr	= generic_getxattr,
2994 	.listxattr	= ext4_listxattr,
2995 	.removexattr	= generic_removexattr,
2996 #endif
2997 	.get_acl	= ext4_get_acl,
2998 };
2999