xref: /titanic_50/usr/src/uts/common/fs/zfs/zfs_znode.c (revision 2e83744e07e0937d9ade0801c0a4d8316ac3071e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Portions Copyright 2007 Jeremy Teo */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #ifdef _KERNEL
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/mntent.h>
38 #include <sys/mkdev.h>
39 #include <sys/u8_textprep.h>
40 #include <sys/dsl_dataset.h>
41 #include <sys/vfs.h>
42 #include <sys/vfs_opreg.h>
43 #include <sys/vnode.h>
44 #include <sys/file.h>
45 #include <sys/kmem.h>
46 #include <sys/errno.h>
47 #include <sys/unistd.h>
48 #include <sys/mode.h>
49 #include <sys/atomic.h>
50 #include <vm/pvn.h>
51 #include "fs/fs_subr.h"
52 #include <sys/zfs_dir.h>
53 #include <sys/zfs_acl.h>
54 #include <sys/zfs_ioctl.h>
55 #include <sys/zfs_rlock.h>
56 #include <sys/zfs_fuid.h>
57 #include <sys/fs/zfs.h>
58 #include <sys/kidmap.h>
59 #endif /* _KERNEL */
60 
61 #include <sys/dmu.h>
62 #include <sys/refcount.h>
63 #include <sys/stat.h>
64 #include <sys/zap.h>
65 #include <sys/zfs_znode.h>
66 
67 #include "zfs_prop.h"
68 
/*
 * Statistic gathering is enabled by defining ZNODE_STATS.  Unless it has
 * been defined some other way, it is only switched on for DEBUG builds.
 * When it is off, ZNODE_STAT_ADD() expands to nothing.
 */
#if defined(DEBUG)
#define	ZNODE_STATS
#endif	/* DEBUG */

#if defined(ZNODE_STATS)
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */
82 
/*
 * A pointer is treated as valid only when both of its two low bits are
 * clear.  POINTER_INVALIDATE() sets the low bit of the pointer stored at
 * *pp, so that POINTER_IS_VALID() fails for it from then on.
 */
#define	POINTER_IS_VALID(p)	(((uintptr_t)(p) & 0x3) == 0)
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
85 
86 /*
87  * Functions needed for userland (ie: libzpool) are not put under
88  * #ifdef_KERNEL; the rest of the functions have dependencies
89  * (such as VFS logic) that will not compile easily in userland.
90  */
91 #ifdef _KERNEL
92 static kmem_cache_t *znode_cache = NULL;
93 
94 /*ARGSUSED*/
95 static void
96 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
97 {
98 	/*
99 	 * We should never drop all dbuf refs without first clearing
100 	 * the eviction callback.
101 	 */
102 	panic("evicting znode %p\n", user_ptr);
103 }
104 
105 /*ARGSUSED*/
106 static int
107 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
108 {
109 	znode_t *zp = buf;
110 
111 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
112 
113 	zp->z_vnode = vn_alloc(kmflags);
114 	if (zp->z_vnode == NULL) {
115 		return (-1);
116 	}
117 	ZTOV(zp)->v_data = zp;
118 
119 	list_link_init(&zp->z_link_node);
120 
121 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
122 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
123 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
124 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
125 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
126 
127 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
128 	avl_create(&zp->z_range_avl, zfs_range_compare,
129 	    sizeof (rl_t), offsetof(rl_t, r_node));
130 
131 	zp->z_dbuf = NULL;
132 	zp->z_dirlocks = NULL;
133 	return (0);
134 }
135 
136 /*ARGSUSED*/
137 static void
138 zfs_znode_cache_destructor(void *buf, void *arg)
139 {
140 	znode_t *zp = buf;
141 
142 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
143 	ASSERT(ZTOV(zp)->v_data == zp);
144 	vn_free(ZTOV(zp));
145 	ASSERT(!list_link_active(&zp->z_link_node));
146 	mutex_destroy(&zp->z_lock);
147 	rw_destroy(&zp->z_map_lock);
148 	rw_destroy(&zp->z_parent_lock);
149 	rw_destroy(&zp->z_name_lock);
150 	mutex_destroy(&zp->z_acl_lock);
151 	avl_destroy(&zp->z_range_avl);
152 	mutex_destroy(&zp->z_range_lock);
153 
154 	ASSERT(zp->z_dbuf == NULL);
155 	ASSERT(zp->z_dirlocks == NULL);
156 }
157 
#if defined(ZNODE_STATS)
/*
 * Counters bumped by zfs_znode_move() through ZNODE_STAT_ADD().  The first
 * five record why a move was refused; the last three count the callback
 * responses (KMEM_CBRC_YES, KMEM_CBRC_LATER, KMEM_CBRC_DONT_KNOW).
 */
static struct {
	/* vfs pointer was already invalid on entry */
	uint64_t zms_zfsvfs_invalid;
	/* zfs_enter() on the file system failed */
	uint64_t zms_zfsvfs_unmounted;
	/* vfs pointer changed before z_znodes_lock was taken */
	uint64_t zms_zfsvfs_recheck_invalid;
	/* could not take the vnode's v_lock without blocking */
	uint64_t zms_vnode_locked;
	/* vnode referenced by something other than just the DNLC */
	uint64_t zms_znode_in_use;
	/* callback responses */
	uint64_t zms_yes;
	uint64_t zms_later;
	uint64_t zms_dont_know;
} znode_move_stats;
#endif	/* ZNODE_STATS */
170 
171 static void
172 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
173 {
174 	vnode_t *vp;
175 
176 	/* Copy fields. */
177 	nzp->z_zfsvfs = ozp->z_zfsvfs;
178 
179 	/* Swap vnodes. */
180 	vp = nzp->z_vnode;
181 	nzp->z_vnode = ozp->z_vnode;
182 	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
183 	ZTOV(ozp)->v_data = ozp;
184 	ZTOV(nzp)->v_data = nzp;
185 
186 	nzp->z_id = ozp->z_id;
187 	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
188 	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
189 	nzp->z_unlinked = ozp->z_unlinked;
190 	nzp->z_atime_dirty = ozp->z_atime_dirty;
191 	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
192 	nzp->z_blksz = ozp->z_blksz;
193 	nzp->z_seq = ozp->z_seq;
194 	nzp->z_mapcnt = ozp->z_mapcnt;
195 	nzp->z_last_itx = ozp->z_last_itx;
196 	nzp->z_gen = ozp->z_gen;
197 	nzp->z_sync_cnt = ozp->z_sync_cnt;
198 	nzp->z_phys = ozp->z_phys;
199 	nzp->z_dbuf = ozp->z_dbuf;
200 
201 	/* Update back pointers. */
202 	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
203 	    znode_evict_error);
204 
205 	/*
206 	 * Invalidate the original znode by clearing fields that provide a
207 	 * pointer back to the znode. Set the low bit of the vfs pointer to
208 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
209 	 * subsequent callback.
210 	 */
211 	ozp->z_dbuf = NULL;
212 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
213 }
214 
215 /*
216  * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
217  * returns a non-zero error code.
218  */
219 static int
220 zfs_enter(zfsvfs_t *zfsvfs)
221 {
222 	ZFS_ENTER(zfsvfs);
223 	return (0);
224 }
225 
226 /*ARGSUSED*/
227 static kmem_cbrc_t
228 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
229 {
230 	znode_t *ozp = buf, *nzp = newbuf;
231 	zfsvfs_t *zfsvfs;
232 	vnode_t *vp;
233 
234 	/*
235 	 * The znode is on the file system's list of known znodes if the vfs
236 	 * pointer is valid. We set the low bit of the vfs pointer when freeing
237 	 * the znode to invalidate it, and the memory patterns written by kmem
238 	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
239 	 * created znode sets the vfs pointer last of all to indicate that the
240 	 * znode is known and in a valid state to be moved by this function.
241 	 */
242 	zfsvfs = ozp->z_zfsvfs;
243 	if (!POINTER_IS_VALID(zfsvfs)) {
244 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
245 		ZNODE_STAT_ADD(znode_move_stats.zms_dont_know);
246 		return (KMEM_CBRC_DONT_KNOW);
247 	}
248 
249 	/*
250 	 * Ensure that the filesystem is not unmounted during the move.
251 	 */
252 	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
253 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
254 		ZNODE_STAT_ADD(znode_move_stats.zms_dont_know);
255 		return (KMEM_CBRC_DONT_KNOW);
256 	}
257 
258 	mutex_enter(&zfsvfs->z_znodes_lock);
259 	/*
260 	 * Recheck the vfs pointer in case the znode was removed just before
261 	 * acquiring the lock.
262 	 */
263 	if (zfsvfs != ozp->z_zfsvfs) {
264 		mutex_exit(&zfsvfs->z_znodes_lock);
265 		ZFS_EXIT(zfsvfs);
266 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
267 		ZNODE_STAT_ADD(znode_move_stats.zms_dont_know);
268 		return (KMEM_CBRC_DONT_KNOW);
269 	}
270 
271 	/*
272 	 * At this point we know that as long as we hold z_znodes_lock, the
273 	 * znode cannot be freed and fields within the znode can be safely
274 	 * accessed.
275 	 */
276 	vp = ZTOV(ozp);
277 	if (mutex_tryenter(&vp->v_lock) == 0) {
278 		mutex_exit(&zfsvfs->z_znodes_lock);
279 		ZFS_EXIT(zfsvfs);
280 		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
281 		ZNODE_STAT_ADD(znode_move_stats.zms_later);
282 		return (KMEM_CBRC_LATER);
283 	}
284 	/* Only move znodes that are referenced _only_ by the DNLC. */
285 	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
286 		mutex_exit(&vp->v_lock);
287 		mutex_exit(&zfsvfs->z_znodes_lock);
288 		ZFS_EXIT(zfsvfs);
289 		ZNODE_STAT_ADD(znode_move_stats.zms_znode_in_use);
290 		ZNODE_STAT_ADD(znode_move_stats.zms_later);
291 		return (KMEM_CBRC_LATER);
292 	}
293 
294 	/*
295 	 * The znode is known and in a valid state to move. We're holding the
296 	 * locks needed to execute the critical section.
297 	 */
298 	zfs_znode_move_impl(ozp, nzp);
299 	mutex_exit(&vp->v_lock);
300 
301 	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
302 	mutex_exit(&zfsvfs->z_znodes_lock);
303 	ZFS_EXIT(zfsvfs);
304 
305 	ZNODE_STAT_ADD(znode_move_stats.zms_yes);
306 	return (KMEM_CBRC_YES);
307 }
308 
309 void
310 zfs_znode_init(void)
311 {
312 	/*
313 	 * Initialize zcache
314 	 */
315 	ASSERT(znode_cache == NULL);
316 	znode_cache = kmem_cache_create("zfs_znode_cache",
317 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
318 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
319 	kmem_cache_set_move(znode_cache, zfs_znode_move);
320 }
321 
322 void
323 zfs_znode_fini(void)
324 {
325 	/*
326 	 * Cleanup vfs & vnode ops
327 	 */
328 	zfs_remove_op_tables();
329 
330 	/*
331 	 * Cleanup zcache
332 	 */
333 	if (znode_cache)
334 		kmem_cache_destroy(znode_cache);
335 	znode_cache = NULL;
336 }
337 
/*
 * Vnode operation tables.  They are built by zfs_create_op_tables(),
 * released by zfs_remove_op_tables(), and attached to vnodes according to
 * vnode type in zfs_znode_alloc().
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* files, devices, fifos, sockets */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
343 
344 void
345 zfs_remove_op_tables()
346 {
347 	/*
348 	 * Remove vfs ops
349 	 */
350 	ASSERT(zfsfstype);
351 	(void) vfs_freevfsops_by_type(zfsfstype);
352 	zfsfstype = 0;
353 
354 	/*
355 	 * Remove vnode ops
356 	 */
357 	if (zfs_dvnodeops)
358 		vn_freevnodeops(zfs_dvnodeops);
359 	if (zfs_fvnodeops)
360 		vn_freevnodeops(zfs_fvnodeops);
361 	if (zfs_symvnodeops)
362 		vn_freevnodeops(zfs_symvnodeops);
363 	if (zfs_xdvnodeops)
364 		vn_freevnodeops(zfs_xdvnodeops);
365 	if (zfs_evnodeops)
366 		vn_freevnodeops(zfs_evnodeops);
367 
368 	zfs_dvnodeops = NULL;
369 	zfs_fvnodeops = NULL;
370 	zfs_symvnodeops = NULL;
371 	zfs_xdvnodeops = NULL;
372 	zfs_evnodeops = NULL;
373 }
374 
375 extern const fs_operation_def_t zfs_dvnodeops_template[];
376 extern const fs_operation_def_t zfs_fvnodeops_template[];
377 extern const fs_operation_def_t zfs_xdvnodeops_template[];
378 extern const fs_operation_def_t zfs_symvnodeops_template[];
379 extern const fs_operation_def_t zfs_evnodeops_template[];
380 
381 int
382 zfs_create_op_tables()
383 {
384 	int error;
385 
386 	/*
387 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
388 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
389 	 * In this case we just return as the ops vectors are already set up.
390 	 */
391 	if (zfs_dvnodeops)
392 		return (0);
393 
394 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
395 	    &zfs_dvnodeops);
396 	if (error)
397 		return (error);
398 
399 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
400 	    &zfs_fvnodeops);
401 	if (error)
402 		return (error);
403 
404 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
405 	    &zfs_symvnodeops);
406 	if (error)
407 		return (error);
408 
409 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
410 	    &zfs_xdvnodeops);
411 	if (error)
412 		return (error);
413 
414 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
415 	    &zfs_evnodeops);
416 
417 	return (error);
418 }
419 
420 /*
421  * zfs_init_fs - Initialize the zfsvfs struct and the file system
422  *	incore "master" object.  Verify version compatibility.
423  */
424 int
425 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
426 {
427 	extern int zfsfstype;
428 
429 	objset_t	*os = zfsvfs->z_os;
430 	int		i, error;
431 	uint64_t fsid_guid;
432 	uint64_t zval;
433 
434 	*zpp = NULL;
435 
436 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
437 	if (error) {
438 		return (error);
439 	} else if (zfsvfs->z_version > ZPL_VERSION) {
440 		(void) printf("Mismatched versions:  File system "
441 		    "is version %llu on-disk format, which is "
442 		    "incompatible with this software version %lld!",
443 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
444 		return (ENOTSUP);
445 	}
446 
447 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
448 		return (error);
449 	zfsvfs->z_norm = (int)zval;
450 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
451 		return (error);
452 	zfsvfs->z_utf8 = (zval != 0);
453 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
454 		return (error);
455 	zfsvfs->z_case = (uint_t)zval;
456 	/*
457 	 * Fold case on file systems that are always or sometimes case
458 	 * insensitive.
459 	 */
460 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
461 	    zfsvfs->z_case == ZFS_CASE_MIXED)
462 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
463 
464 	/*
465 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
466 	 * separates our fsid from any other filesystem types, and a
467 	 * 56-bit objset unique ID.  The objset unique ID is unique to
468 	 * all objsets open on this system, provided by unique_create().
469 	 * The 8-bit fs type must be put in the low bits of fsid[1]
470 	 * because that's where other Solaris filesystems put it.
471 	 */
472 	fsid_guid = dmu_objset_fsid_guid(os);
473 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
474 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
475 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
476 	    zfsfstype & 0xFF;
477 
478 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
479 	    &zfsvfs->z_root);
480 	if (error)
481 		return (error);
482 	ASSERT(zfsvfs->z_root != 0);
483 
484 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
485 	    &zfsvfs->z_unlinkedobj);
486 	if (error)
487 		return (error);
488 
489 	/*
490 	 * Initialize zget mutex's
491 	 */
492 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
493 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
494 
495 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
496 	if (error) {
497 		/*
498 		 * On error, we destroy the mutexes here since it's not
499 		 * possible for the caller to determine if the mutexes were
500 		 * initialized properly.
501 		 */
502 		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
503 			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
504 		return (error);
505 	}
506 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
507 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
508 	    &zfsvfs->z_fuid_obj);
509 	if (error == ENOENT)
510 		error = 0;
511 
512 	return (0);
513 }
514 
/*
 * Values describing the 64-bit dev_t layout.  They are needed in both
 * 32-bit and 64-bit environments, so supply any that are not already
 * defined.
 */
#if !defined(NBITSMINOR64)
#define	NBITSMINOR64	32
#endif
#if !defined(MAXMAJ64)
#define	MAXMAJ64	0xffffffffUL
#endif
#if !defined(MAXMIN64)
#define	MAXMIN64	0xffffffffUL
#endif
528 
/*
 * ZFS-private replacement for expldev().
 *
 * The standard expldev() takes a dev32_t in LP64 and expands it to a long
 * dev_t, which is not what is wanted here: ZFS needs to take a dev32_t in
 * ILP32 and expand it to the long (64-bit) layout.  In LP64 the value is
 * returned unchanged.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t maj = ((major_t)dev >> NBITSMINOR32) & MAXMAJ32;
	minor_t min = (minor_t)dev & MAXMIN32;

	return (((uint64_t)maj << NBITSMINOR64) | min);
#endif
}
548 
/*
 * ZFS-private replacement for cmpldev().
 *
 * The standard cmpldev() takes a long dev_t and compresses it to a dev32_t
 * in LP64; ZFS needs that compaction in ILP32 instead.  In ILP32, NODEV32
 * is returned when the major or minor number does not fit the 32-bit
 * layout.  In LP64 the value is returned unchanged.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t maj = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
	minor_t min = (minor_t)dev & MAXMIN64;

	if (maj > MAXMAJ32 || min > MAXMIN32)
		return (NODEV32);

	return (((dev32_t)maj << NBITSMINOR32) | min);
#endif
}
571 
572 static void
573 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
574 {
575 	znode_t		*nzp;
576 
577 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
578 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
579 
580 	mutex_enter(&zp->z_lock);
581 
582 	ASSERT(zp->z_dbuf == NULL);
583 	zp->z_dbuf = db;
584 	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
585 
586 	/*
587 	 * there should be no
588 	 * concurrent zgets on this object.
589 	 */
590 	if (nzp != NULL)
591 		panic("existing znode %p for dbuf %p", nzp, db);
592 
593 	/*
594 	 * Slap on VROOT if we are the root znode
595 	 */
596 	if (zp->z_id == zfsvfs->z_root)
597 		ZTOV(zp)->v_flag |= VROOT;
598 
599 	mutex_exit(&zp->z_lock);
600 	vn_exists(ZTOV(zp));
601 }
602 
603 void
604 zfs_znode_dmu_fini(znode_t *zp)
605 {
606 	dmu_buf_t *db = zp->z_dbuf;
607 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
608 	    zp->z_unlinked ||
609 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
610 	ASSERT(zp->z_dbuf != NULL);
611 	zp->z_dbuf = NULL;
612 	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
613 	dmu_buf_rele(db, NULL);
614 }
615 
616 /*
617  * Construct a new znode/vnode and intialize.
618  *
619  * This does not do a call to dmu_set_user() that is
620  * up to the caller to do, in case you don't want to
621  * return the znode
622  */
623 static znode_t *
624 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
625 {
626 	znode_t	*zp;
627 	vnode_t *vp;
628 
629 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
630 
631 	ASSERT(zp->z_dirlocks == NULL);
632 	ASSERT(zp->z_dbuf == NULL);
633 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
634 
635 	/*
636 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
637 	 * the zfs_znode_move() callback.
638 	 */
639 	zp->z_phys = NULL;
640 	zp->z_unlinked = 0;
641 	zp->z_atime_dirty = 0;
642 	zp->z_mapcnt = 0;
643 	zp->z_last_itx = 0;
644 	zp->z_id = db->db_object;
645 	zp->z_blksz = blksz;
646 	zp->z_seq = 0x7A4653;
647 	zp->z_sync_cnt = 0;
648 
649 	vp = ZTOV(zp);
650 	vn_reinit(vp);
651 
652 	zfs_znode_dmu_init(zfsvfs, zp, db);
653 
654 	zp->z_gen = zp->z_phys->zp_gen;
655 
656 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
657 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
658 
659 	switch (vp->v_type) {
660 	case VDIR:
661 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
662 			vn_setops(vp, zfs_xdvnodeops);
663 			vp->v_flag |= V_XATTRDIR;
664 		} else {
665 			vn_setops(vp, zfs_dvnodeops);
666 		}
667 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
668 		break;
669 	case VBLK:
670 	case VCHR:
671 		vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
672 		/*FALLTHROUGH*/
673 	case VFIFO:
674 	case VSOCK:
675 	case VDOOR:
676 		vn_setops(vp, zfs_fvnodeops);
677 		break;
678 	case VREG:
679 		vp->v_flag |= VMODSORT;
680 		vn_setops(vp, zfs_fvnodeops);
681 		break;
682 	case VLNK:
683 		vn_setops(vp, zfs_symvnodeops);
684 		break;
685 	default:
686 		vn_setops(vp, zfs_evnodeops);
687 		break;
688 	}
689 
690 	mutex_enter(&zfsvfs->z_znodes_lock);
691 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
692 	membar_producer();
693 	/*
694 	 * Everything else must be valid before assigning z_zfsvfs makes the
695 	 * znode eligible for zfs_znode_move().
696 	 */
697 	zp->z_zfsvfs = zfsvfs;
698 	mutex_exit(&zfsvfs->z_znodes_lock);
699 
700 	VFS_HOLD(zfsvfs->z_vfs);
701 	return (zp);
702 }
703 
704 /*
705  * Create a new DMU object to hold a zfs znode.
706  *
707  *	IN:	dzp	- parent directory for new znode
708  *		vap	- file attributes for new znode
709  *		tx	- dmu transaction id for zap operations
710  *		cr	- credentials of caller
711  *		flag	- flags:
712  *			  IS_ROOT_NODE	- new object will be root
713  *			  IS_XATTR	- new object is an attribute
714  *			  IS_REPLAY	- intent log replay
715  *		bonuslen - length of bonus buffer
716  *		setaclp  - File/Dir initial ACL
717  *		fuidp	 - Tracks fuid allocation.
718  *
719  *	OUT:	zpp	- allocated znode
720  *
721  */
722 void
723 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
724     uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
725     zfs_fuid_info_t **fuidp)
726 {
727 	dmu_buf_t	*db;
728 	znode_phys_t	*pzp;
729 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
730 	timestruc_t	now;
731 	uint64_t	gen, obj;
732 	int		err;
733 
734 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
735 
736 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
737 		obj = vap->va_nodeid;
738 		flag |= IS_REPLAY;
739 		now = vap->va_ctime;		/* see zfs_replay_create() */
740 		gen = vap->va_nblocks;		/* ditto */
741 	} else {
742 		obj = 0;
743 		gethrestime(&now);
744 		gen = dmu_tx_get_txg(tx);
745 	}
746 
747 	/*
748 	 * Create a new DMU object.
749 	 */
750 	/*
751 	 * There's currently no mechanism for pre-reading the blocks that will
752 	 * be to needed allocate a new object, so we accept the small chance
753 	 * that there will be an i/o error and we will fail one of the
754 	 * assertions below.
755 	 */
756 	if (vap->va_type == VDIR) {
757 		if (flag & IS_REPLAY) {
758 			err = zap_create_claim_norm(zfsvfs->z_os, obj,
759 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
760 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
761 			ASSERT3U(err, ==, 0);
762 		} else {
763 			obj = zap_create_norm(zfsvfs->z_os,
764 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
765 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
766 		}
767 	} else {
768 		if (flag & IS_REPLAY) {
769 			err = dmu_object_claim(zfsvfs->z_os, obj,
770 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
771 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
772 			ASSERT3U(err, ==, 0);
773 		} else {
774 			obj = dmu_object_alloc(zfsvfs->z_os,
775 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
776 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
777 		}
778 	}
779 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
780 	dmu_buf_will_dirty(db, tx);
781 
782 	/*
783 	 * Initialize the znode physical data to zero.
784 	 */
785 	ASSERT(db->db_size >= sizeof (znode_phys_t));
786 	bzero(db->db_data, db->db_size);
787 	pzp = db->db_data;
788 
789 	/*
790 	 * If this is the root, fix up the half-initialized parent pointer
791 	 * to reference the just-allocated physical data area.
792 	 */
793 	if (flag & IS_ROOT_NODE) {
794 		dzp->z_dbuf = db;
795 		dzp->z_phys = pzp;
796 		dzp->z_id = obj;
797 	}
798 
799 	/*
800 	 * If parent is an xattr, so am I.
801 	 */
802 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
803 		flag |= IS_XATTR;
804 
805 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
806 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
807 	}
808 
809 	if (zfsvfs->z_use_fuids)
810 		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
811 
812 	if (vap->va_type == VDIR) {
813 		pzp->zp_size = 2;		/* contents ("." and "..") */
814 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
815 	}
816 
817 	pzp->zp_parent = dzp->z_id;
818 	if (flag & IS_XATTR)
819 		pzp->zp_flags |= ZFS_XATTR;
820 
821 	pzp->zp_gen = gen;
822 
823 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
824 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
825 
826 	if (vap->va_mask & AT_ATIME) {
827 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
828 	} else {
829 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
830 	}
831 
832 	if (vap->va_mask & AT_MTIME) {
833 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
834 	} else {
835 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
836 	}
837 
838 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
839 	if (!(flag & IS_ROOT_NODE)) {
840 		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
841 		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
842 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
843 	} else {
844 		/*
845 		 * If we are creating the root node, the "parent" we
846 		 * passed in is the znode for the root.
847 		 */
848 		*zpp = dzp;
849 	}
850 	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
851 }
852 
853 void
854 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
855 {
856 	xoptattr_t *xoap;
857 
858 	xoap = xva_getxoptattr(xvap);
859 	ASSERT(xoap);
860 
861 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
862 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
863 		XVA_SET_RTN(xvap, XAT_CREATETIME);
864 	}
865 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
866 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
867 		XVA_SET_RTN(xvap, XAT_READONLY);
868 	}
869 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
870 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
871 		XVA_SET_RTN(xvap, XAT_HIDDEN);
872 	}
873 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
874 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
875 		XVA_SET_RTN(xvap, XAT_SYSTEM);
876 	}
877 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
878 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
879 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
880 	}
881 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
882 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
883 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
884 	}
885 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
886 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
887 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
888 	}
889 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
890 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
891 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
892 	}
893 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
894 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
895 		XVA_SET_RTN(xvap, XAT_NODUMP);
896 	}
897 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
898 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
899 		XVA_SET_RTN(xvap, XAT_OPAQUE);
900 	}
901 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
902 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
903 		    xoap->xoa_av_quarantined);
904 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
905 	}
906 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
907 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
908 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
909 	}
910 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
911 		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
912 		    sizeof (xoap->xoa_av_scanstamp));
913 		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
914 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
915 	}
916 }
917 
918 int
919 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
920 {
921 	dmu_object_info_t doi;
922 	dmu_buf_t	*db;
923 	znode_t		*zp;
924 	int err;
925 
926 	*zpp = NULL;
927 
928 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
929 
930 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
931 	if (err) {
932 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
933 		return (err);
934 	}
935 
936 	dmu_object_info_from_db(db, &doi);
937 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
938 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
939 		dmu_buf_rele(db, NULL);
940 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
941 		return (EINVAL);
942 	}
943 
944 	zp = dmu_buf_get_user(db);
945 	if (zp != NULL) {
946 		mutex_enter(&zp->z_lock);
947 
948 		/*
949 		 * Since we do immediate eviction of the z_dbuf, we
950 		 * should never find a dbuf with a znode that doesn't
951 		 * know about the dbuf.
952 		 */
953 		ASSERT3P(zp->z_dbuf, ==, db);
954 		ASSERT3U(zp->z_id, ==, obj_num);
955 		if (zp->z_unlinked) {
956 			err = ENOENT;
957 		} else {
958 			VN_HOLD(ZTOV(zp));
959 			*zpp = zp;
960 			err = 0;
961 		}
962 		dmu_buf_rele(db, NULL);
963 		mutex_exit(&zp->z_lock);
964 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
965 		return (err);
966 	}
967 
968 	/*
969 	 * Not found create new znode/vnode
970 	 */
971 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
972 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
973 	*zpp = zp;
974 	return (0);
975 }
976 
977 int
978 zfs_rezget(znode_t *zp)
979 {
980 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
981 	dmu_object_info_t doi;
982 	dmu_buf_t *db;
983 	uint64_t obj_num = zp->z_id;
984 	int err;
985 
986 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
987 
988 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
989 	if (err) {
990 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
991 		return (err);
992 	}
993 
994 	dmu_object_info_from_db(db, &doi);
995 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
996 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
997 		dmu_buf_rele(db, NULL);
998 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
999 		return (EINVAL);
1000 	}
1001 
1002 	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
1003 		dmu_buf_rele(db, NULL);
1004 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1005 		return (EIO);
1006 	}
1007 
1008 	zfs_znode_dmu_init(zfsvfs, zp, db);
1009 	zp->z_unlinked = (zp->z_phys->zp_links == 0);
1010 	zp->z_blksz = doi.doi_data_block_size;
1011 
1012 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1013 
1014 	return (0);
1015 }
1016 
1017 void
1018 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1019 {
1020 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1021 	objset_t *os = zfsvfs->z_os;
1022 	uint64_t obj = zp->z_id;
1023 	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
1024 
1025 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1026 	if (acl_obj)
1027 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1028 	VERIFY(0 == dmu_object_free(os, obj, tx));
1029 	zfs_znode_dmu_fini(zp);
1030 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1031 	zfs_znode_free(zp);
1032 }
1033 
1034 void
1035 zfs_zinactive(znode_t *zp)
1036 {
1037 	vnode_t	*vp = ZTOV(zp);
1038 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1039 	uint64_t z_id = zp->z_id;
1040 
1041 	ASSERT(zp->z_dbuf && zp->z_phys);
1042 
1043 	/*
1044 	 * Don't allow a zfs_zget() while were trying to release this znode
1045 	 */
1046 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1047 
1048 	mutex_enter(&zp->z_lock);
1049 	mutex_enter(&vp->v_lock);
1050 	vp->v_count--;
1051 	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
1052 		/*
1053 		 * If the hold count is greater than zero, somebody has
1054 		 * obtained a new reference on this znode while we were
1055 		 * processing it here, so we are done.  If we still have
1056 		 * mapped pages then we are also done, since we don't
1057 		 * want to inactivate the znode until the pages get pushed.
1058 		 *
1059 		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
1060 		 * this seems like it would leave the znode hanging with
1061 		 * no chance to go inactive...
1062 		 */
1063 		mutex_exit(&vp->v_lock);
1064 		mutex_exit(&zp->z_lock);
1065 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1066 		return;
1067 	}
1068 	mutex_exit(&vp->v_lock);
1069 
1070 	/*
1071 	 * If this was the last reference to a file with no links,
1072 	 * remove the file from the file system.
1073 	 */
1074 	if (zp->z_unlinked) {
1075 		mutex_exit(&zp->z_lock);
1076 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1077 		zfs_rmnode(zp);
1078 		return;
1079 	}
1080 	mutex_exit(&zp->z_lock);
1081 	zfs_znode_dmu_fini(zp);
1082 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1083 	zfs_znode_free(zp);
1084 }
1085 
1086 void
1087 zfs_znode_free(znode_t *zp)
1088 {
1089 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1090 
1091 	vn_invalid(ZTOV(zp));
1092 
1093 	ASSERT(ZTOV(zp)->v_count == 0);
1094 
1095 	mutex_enter(&zfsvfs->z_znodes_lock);
1096 	POINTER_INVALIDATE(&zp->z_zfsvfs);
1097 	list_remove(&zfsvfs->z_all_znodes, zp);
1098 	mutex_exit(&zfsvfs->z_znodes_lock);
1099 
1100 	kmem_cache_free(znode_cache, zp);
1101 
1102 	VFS_RELE(zfsvfs->z_vfs);
1103 }
1104 
1105 void
1106 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1107 {
1108 	timestruc_t	now;
1109 
1110 	ASSERT(MUTEX_HELD(&zp->z_lock));
1111 
1112 	gethrestime(&now);
1113 
1114 	if (tx) {
1115 		dmu_buf_will_dirty(zp->z_dbuf, tx);
1116 		zp->z_atime_dirty = 0;
1117 		zp->z_seq++;
1118 	} else {
1119 		zp->z_atime_dirty = 1;
1120 	}
1121 
1122 	if (flag & AT_ATIME)
1123 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1124 
1125 	if (flag & AT_MTIME) {
1126 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1127 		if (zp->z_zfsvfs->z_use_fuids)
1128 			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1129 	}
1130 
1131 	if (flag & AT_CTIME) {
1132 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1133 		if (zp->z_zfsvfs->z_use_fuids)
1134 			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1135 	}
1136 }
1137 
1138 /*
1139  * Update the requested znode timestamps with the current time.
1140  * If we are in a transaction, then go ahead and mark the znode
1141  * dirty in the transaction so the timestamps will go to disk.
1142  * Otherwise, we will get pushed next time the znode is updated
1143  * in a transaction, or when this znode eventually goes inactive.
1144  *
1145  * Why is this OK?
1146  *  1 - Only the ACCESS time is ever updated outside of a transaction.
1147  *  2 - Multiple consecutive updates will be collapsed into a single
1148  *	znode update by the transaction grouping semantics of the DMU.
1149  */
1150 void
1151 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1152 {
1153 	mutex_enter(&zp->z_lock);
1154 	zfs_time_stamper_locked(zp, flag, tx);
1155 	mutex_exit(&zp->z_lock);
1156 }
1157 
1158 /*
1159  * Grow the block size for a file.
1160  *
1161  *	IN:	zp	- znode of file to free data in.
1162  *		size	- requested block size
1163  *		tx	- open transaction.
1164  *
1165  * NOTE: this function assumes that the znode is write locked.
1166  */
1167 void
1168 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1169 {
1170 	int		error;
1171 	u_longlong_t	dummy;
1172 
1173 	if (size <= zp->z_blksz)
1174 		return;
1175 	/*
1176 	 * If the file size is already greater than the current blocksize,
1177 	 * we will not grow.  If there is more than one block in a file,
1178 	 * the blocksize cannot change.
1179 	 */
1180 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1181 		return;
1182 
1183 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1184 	    size, 0, tx);
1185 	if (error == ENOTSUP)
1186 		return;
1187 	ASSERT3U(error, ==, 0);
1188 
1189 	/* What blocksize did we actually get? */
1190 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1191 }
1192 
1193 /*
1194  * This is a dummy interface used when pvn_vplist_dirty() should *not*
1195  * be calling back into the fs for a putpage().  E.g.: when truncating
1196  * a file, the pages being "thrown away* don't need to be written out.
1197  */
1198 /* ARGSUSED */
1199 static int
1200 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1201     int flags, cred_t *cr)
1202 {
1203 	ASSERT(0);
1204 	return (0);
1205 }
1206 
1207 /*
1208  * Increase the file length
1209  *
1210  *	IN:	zp	- znode of file to free data in.
1211  *		end	- new end-of-file
1212  *
1213  * 	RETURN:	0 if success
1214  *		error code if failure
1215  */
1216 static int
1217 zfs_extend(znode_t *zp, uint64_t end)
1218 {
1219 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1220 	dmu_tx_t *tx;
1221 	rl_t *rl;
1222 	uint64_t newblksz;
1223 	int error;
1224 
1225 	/*
1226 	 * We will change zp_size, lock the whole file.
1227 	 */
1228 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1229 
1230 	/*
1231 	 * Nothing to do if file already at desired length.
1232 	 */
1233 	if (end <= zp->z_phys->zp_size) {
1234 		zfs_range_unlock(rl);
1235 		return (0);
1236 	}
1237 top:
1238 	tx = dmu_tx_create(zfsvfs->z_os);
1239 	dmu_tx_hold_bonus(tx, zp->z_id);
1240 	if (end > zp->z_blksz &&
1241 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1242 		/*
1243 		 * We are growing the file past the current block size.
1244 		 */
1245 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1246 			ASSERT(!ISP2(zp->z_blksz));
1247 			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1248 		} else {
1249 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1250 		}
1251 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1252 	} else {
1253 		newblksz = 0;
1254 	}
1255 
1256 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1257 	if (error) {
1258 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1259 			dmu_tx_wait(tx);
1260 			dmu_tx_abort(tx);
1261 			goto top;
1262 		}
1263 		dmu_tx_abort(tx);
1264 		zfs_range_unlock(rl);
1265 		return (error);
1266 	}
1267 	dmu_buf_will_dirty(zp->z_dbuf, tx);
1268 
1269 	if (newblksz)
1270 		zfs_grow_blocksize(zp, newblksz, tx);
1271 
1272 	zp->z_phys->zp_size = end;
1273 
1274 	zfs_range_unlock(rl);
1275 
1276 	dmu_tx_commit(tx);
1277 
1278 	return (0);
1279 }
1280 
1281 /*
1282  * Free space in a file.
1283  *
1284  *	IN:	zp	- znode of file to free data in.
1285  *		off	- start of section to free.
1286  *		len	- length of section to free.
1287  *
1288  * 	RETURN:	0 if success
1289  *		error code if failure
1290  */
1291 static int
1292 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1293 {
1294 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1295 	rl_t *rl;
1296 	int error;
1297 
1298 	/*
1299 	 * Lock the range being freed.
1300 	 */
1301 	rl = zfs_range_lock(zp, off, len, RL_WRITER);
1302 
1303 	/*
1304 	 * Nothing to do if file already at desired length.
1305 	 */
1306 	if (off >= zp->z_phys->zp_size) {
1307 		zfs_range_unlock(rl);
1308 		return (0);
1309 	}
1310 
1311 	if (off + len > zp->z_phys->zp_size)
1312 		len = zp->z_phys->zp_size - off;
1313 
1314 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1315 
1316 	zfs_range_unlock(rl);
1317 
1318 	return (error);
1319 }
1320 
1321 /*
1322  * Truncate a file
1323  *
1324  *	IN:	zp	- znode of file to free data in.
1325  *		end	- new end-of-file.
1326  *
1327  * 	RETURN:	0 if success
1328  *		error code if failure
1329  */
1330 static int
1331 zfs_trunc(znode_t *zp, uint64_t end)
1332 {
1333 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1334 	vnode_t *vp = ZTOV(zp);
1335 	dmu_tx_t *tx;
1336 	rl_t *rl;
1337 	int error;
1338 
1339 	/*
1340 	 * We will change zp_size, lock the whole file.
1341 	 */
1342 	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1343 
1344 	/*
1345 	 * Nothing to do if file already at desired length.
1346 	 */
1347 	if (end >= zp->z_phys->zp_size) {
1348 		zfs_range_unlock(rl);
1349 		return (0);
1350 	}
1351 
1352 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
1353 	if (error) {
1354 		zfs_range_unlock(rl);
1355 		return (error);
1356 	}
1357 top:
1358 	tx = dmu_tx_create(zfsvfs->z_os);
1359 	dmu_tx_hold_bonus(tx, zp->z_id);
1360 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1361 	if (error) {
1362 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1363 			dmu_tx_wait(tx);
1364 			dmu_tx_abort(tx);
1365 			goto top;
1366 		}
1367 		dmu_tx_abort(tx);
1368 		zfs_range_unlock(rl);
1369 		return (error);
1370 	}
1371 	dmu_buf_will_dirty(zp->z_dbuf, tx);
1372 
1373 	zp->z_phys->zp_size = end;
1374 
1375 	dmu_tx_commit(tx);
1376 
1377 	zfs_range_unlock(rl);
1378 
1379 	/*
1380 	 * Clear any mapped pages in the truncated region.  This has to
1381 	 * happen outside of the transaction to avoid the possibility of
1382 	 * a deadlock with someone trying to push a page that we are
1383 	 * about to invalidate.
1384 	 */
1385 	rw_enter(&zp->z_map_lock, RW_WRITER);
1386 	if (vn_has_cached_data(vp)) {
1387 		page_t *pp;
1388 		uint64_t start = end & PAGEMASK;
1389 		int poff = end & PAGEOFFSET;
1390 
1391 		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1392 			/*
1393 			 * We need to zero a partial page.
1394 			 */
1395 			pagezero(pp, poff, PAGESIZE - poff);
1396 			start += PAGESIZE;
1397 			page_unlock(pp);
1398 		}
1399 		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1400 		    B_INVAL | B_TRUNC, NULL);
1401 		ASSERT(error == 0);
1402 	}
1403 	rw_exit(&zp->z_map_lock);
1404 
1405 	return (0);
1406 }
1407 
1408 /*
1409  * Free space in a file
1410  *
1411  *	IN:	zp	- znode of file to free data in.
1412  *		off	- start of range
1413  *		len	- end of range (0 => EOF)
1414  *		flag	- current file open mode flags.
1415  *		log	- TRUE if this action should be logged
1416  *
1417  * 	RETURN:	0 if success
1418  *		error code if failure
1419  */
1420 int
1421 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1422 {
1423 	vnode_t *vp = ZTOV(zp);
1424 	dmu_tx_t *tx;
1425 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1426 	zilog_t *zilog = zfsvfs->z_log;
1427 	int error;
1428 
1429 	if (off > zp->z_phys->zp_size) {
1430 		error =  zfs_extend(zp, off+len);
1431 		if (error == 0 && log)
1432 			goto log;
1433 		else
1434 			return (error);
1435 	}
1436 
1437 	/*
1438 	 * Check for any locks in the region to be freed.
1439 	 */
1440 	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
1441 		uint64_t length = (len ? len : zp->z_phys->zp_size - off);
1442 		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1443 			return (error);
1444 	}
1445 
1446 	if (len == 0) {
1447 		error = zfs_trunc(zp, off);
1448 	} else {
1449 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1450 		    off + len > zp->z_phys->zp_size)
1451 			error = zfs_extend(zp, off+len);
1452 	}
1453 	if (error || !log)
1454 		return (error);
1455 log:
1456 	tx = dmu_tx_create(zfsvfs->z_os);
1457 	dmu_tx_hold_bonus(tx, zp->z_id);
1458 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1459 	if (error) {
1460 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1461 			dmu_tx_wait(tx);
1462 			dmu_tx_abort(tx);
1463 			goto log;
1464 		}
1465 		dmu_tx_abort(tx);
1466 		return (error);
1467 	}
1468 
1469 	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1470 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1471 
1472 	dmu_tx_commit(tx);
1473 	return (0);
1474 }
1475 
1476 void
1477 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1478 {
1479 	zfsvfs_t	zfsvfs;
1480 	uint64_t	moid, doid, version;
1481 	uint64_t	sense = ZFS_CASE_SENSITIVE;
1482 	uint64_t	norm = 0;
1483 	nvpair_t	*elem;
1484 	int		error;
1485 	znode_t		*rootzp = NULL;
1486 	vnode_t		*vp;
1487 	vattr_t		vattr;
1488 	znode_t		*zp;
1489 
1490 	/*
1491 	 * First attempt to create master node.
1492 	 */
1493 	/*
1494 	 * In an empty objset, there are no blocks to read and thus
1495 	 * there can be no i/o errors (which we assert below).
1496 	 */
1497 	moid = MASTER_NODE_OBJ;
1498 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1499 	    DMU_OT_NONE, 0, tx);
1500 	ASSERT(error == 0);
1501 
1502 	/*
1503 	 * Set starting attributes.
1504 	 */
1505 	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
1506 		version = ZPL_VERSION;
1507 	else
1508 		version = ZPL_VERSION_FUID - 1;
1509 	error = zap_update(os, moid, ZPL_VERSION_STR,
1510 	    8, 1, &version, tx);
1511 	elem = NULL;
1512 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1513 		/* For the moment we expect all zpl props to be uint64_ts */
1514 		uint64_t val;
1515 		char *name;
1516 
1517 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1518 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
1519 		name = nvpair_name(elem);
1520 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1521 			version = val;
1522 			error = zap_update(os, moid, ZPL_VERSION_STR,
1523 			    8, 1, &version, tx);
1524 		} else {
1525 			error = zap_update(os, moid, name, 8, 1, &val, tx);
1526 		}
1527 		ASSERT(error == 0);
1528 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1529 			norm = val;
1530 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1531 			sense = val;
1532 	}
1533 	ASSERT(version != 0);
1534 
1535 	/*
1536 	 * Create a delete queue.
1537 	 */
1538 	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1539 
1540 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
1541 	ASSERT(error == 0);
1542 
1543 	/*
1544 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
1545 	 * to allow zfs_mknode to work.
1546 	 */
1547 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1548 	vattr.va_type = VDIR;
1549 	vattr.va_mode = S_IFDIR|0755;
1550 	vattr.va_uid = crgetuid(cr);
1551 	vattr.va_gid = crgetgid(cr);
1552 
1553 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1554 	rootzp->z_unlinked = 0;
1555 	rootzp->z_atime_dirty = 0;
1556 
1557 	vp = ZTOV(rootzp);
1558 	vn_reinit(vp);
1559 	vp->v_type = VDIR;
1560 
1561 	bzero(&zfsvfs, sizeof (zfsvfs_t));
1562 
1563 	zfsvfs.z_os = os;
1564 	zfsvfs.z_assign = TXG_NOWAIT;
1565 	zfsvfs.z_parent = &zfsvfs;
1566 	zfsvfs.z_version = version;
1567 	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1568 	zfsvfs.z_norm = norm;
1569 	/*
1570 	 * Fold case on file systems that are always or sometimes case
1571 	 * insensitive.
1572 	 */
1573 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1574 		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1575 
1576 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1577 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1578 	    offsetof(znode_t, z_link_node));
1579 
1580 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1581 	rootzp->z_zfsvfs = &zfsvfs;
1582 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
1583 	ASSERT3P(zp, ==, rootzp);
1584 	ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
1585 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1586 	ASSERT(error == 0);
1587 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1588 
1589 	ZTOV(rootzp)->v_count = 0;
1590 	dmu_buf_rele(rootzp->z_dbuf, NULL);
1591 	rootzp->z_dbuf = NULL;
1592 	kmem_cache_free(znode_cache, rootzp);
1593 }
1594 
1595 #endif /* _KERNEL */
1596 /*
1597  * Given an object number, return its parent object number and whether
1598  * or not the object is an extended attribute directory.
1599  */
1600 static int
1601 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1602 {
1603 	dmu_buf_t *db;
1604 	dmu_object_info_t doi;
1605 	znode_phys_t *zp;
1606 	int error;
1607 
1608 	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1609 		return (error);
1610 
1611 	dmu_object_info_from_db(db, &doi);
1612 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1613 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1614 		dmu_buf_rele(db, FTAG);
1615 		return (EINVAL);
1616 	}
1617 
1618 	zp = db->db_data;
1619 	*pobjp = zp->zp_parent;
1620 	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1621 	    S_ISDIR(zp->zp_mode);
1622 	dmu_buf_rele(db, FTAG);
1623 
1624 	return (0);
1625 }
1626 
1627 int
1628 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1629 {
1630 	char *path = buf + len - 1;
1631 	int error;
1632 
1633 	*path = '\0';
1634 
1635 	for (;;) {
1636 		uint64_t pobj;
1637 		char component[MAXNAMELEN + 2];
1638 		size_t complen;
1639 		int is_xattrdir;
1640 
1641 		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1642 		    &is_xattrdir)) != 0)
1643 			break;
1644 
1645 		if (pobj == obj) {
1646 			if (path[0] != '/')
1647 				*--path = '/';
1648 			break;
1649 		}
1650 
1651 		component[0] = '/';
1652 		if (is_xattrdir) {
1653 			(void) sprintf(component + 1, "<xattrdir>");
1654 		} else {
1655 			error = zap_value_search(osp, pobj, obj,
1656 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
1657 			if (error != 0)
1658 				break;
1659 		}
1660 
1661 		complen = strlen(component);
1662 		path -= complen;
1663 		ASSERT(path >= buf);
1664 		bcopy(component, path, complen);
1665 		obj = pobj;
1666 	}
1667 
1668 	if (error == 0)
1669 		(void) memmove(buf, path, buf + len - path);
1670 	return (error);
1671 }
1672