xref: /titanic_44/usr/src/uts/common/fs/zfs/zfs_znode.c (revision fa94a07fd0519b8abfd871ad8fe60e6bebe1e2bb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Portions Copyright 2007 Jeremy Teo */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #ifdef _KERNEL
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/mntent.h>
38 #include <sys/mkdev.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/kmem.h>
44 #include <sys/errno.h>
45 #include <sys/unistd.h>
46 #include <sys/mode.h>
47 #include <sys/atomic.h>
48 #include <vm/pvn.h>
49 #include "fs/fs_subr.h"
50 #include <sys/zfs_dir.h>
51 #include <sys/zfs_acl.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/zfs_rlock.h>
54 #include <sys/zfs_fuid.h>
55 #include <sys/zfs_i18n.h>
56 #include <sys/fs/zfs.h>
57 #include <sys/kidmap.h>
58 #endif /* _KERNEL */
59 
60 #include <sys/dmu.h>
61 #include <sys/refcount.h>
62 #include <sys/stat.h>
63 #include <sys/zap.h>
64 #include <sys/zfs_znode.h>
65 
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
71 #ifdef _KERNEL
72 struct kmem_cache *znode_cache = NULL;
73 
74 /*ARGSUSED*/
75 static void
76 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
77 {
78 	znode_t *zp = user_ptr;
79 	vnode_t *vp = ZTOV(zp);
80 
81 	mutex_enter(&zp->z_lock);
82 	zp->z_dbuf = NULL;
83 	if (vp->v_count == 0) {
84 		mutex_exit(&zp->z_lock);
85 		vn_invalid(vp);
86 		zfs_znode_free(zp);
87 	} else {
88 		mutex_exit(&zp->z_lock);
89 	}
90 }
91 
92 /*ARGSUSED*/
93 static int
94 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
95 {
96 	znode_t *zp = buf;
97 
98 	zp->z_vnode = vn_alloc(KM_SLEEP);
99 	zp->z_vnode->v_data = (caddr_t)zp;
100 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
101 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
102 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
103 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
104 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
105 
106 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
107 	avl_create(&zp->z_range_avl, zfs_range_compare,
108 	    sizeof (rl_t), offsetof(rl_t, r_node));
109 
110 	zp->z_dbuf = NULL;
111 	zp->z_dirlocks = 0;
112 	return (0);
113 }
114 
115 /*ARGSUSED*/
116 static void
117 zfs_znode_cache_destructor(void *buf, void *cdarg)
118 {
119 	znode_t *zp = buf;
120 
121 	ASSERT(zp->z_dirlocks == 0);
122 	mutex_destroy(&zp->z_lock);
123 	rw_destroy(&zp->z_map_lock);
124 	rw_destroy(&zp->z_parent_lock);
125 	rw_destroy(&zp->z_name_lock);
126 	mutex_destroy(&zp->z_acl_lock);
127 	avl_destroy(&zp->z_range_avl);
128 	mutex_destroy(&zp->z_range_lock);
129 
130 	ASSERT(zp->z_dbuf == NULL);
131 	ASSERT(ZTOV(zp)->v_count == 0);
132 	vn_free(ZTOV(zp));
133 }
134 
135 void
136 zfs_znode_init(void)
137 {
138 	/*
139 	 * Initialize zcache
140 	 */
141 	ASSERT(znode_cache == NULL);
142 	znode_cache = kmem_cache_create("zfs_znode_cache",
143 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
144 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
145 }
146 
147 void
148 zfs_znode_fini(void)
149 {
150 	/*
151 	 * Cleanup vfs & vnode ops
152 	 */
153 	zfs_remove_op_tables();
154 
155 	/*
156 	 * Cleanup zcache
157 	 */
158 	if (znode_cache)
159 		kmem_cache_destroy(znode_cache);
160 	znode_cache = NULL;
161 }
162 
163 struct vnodeops *zfs_dvnodeops;
164 struct vnodeops *zfs_fvnodeops;
165 struct vnodeops *zfs_symvnodeops;
166 struct vnodeops *zfs_xdvnodeops;
167 struct vnodeops *zfs_evnodeops;
168 
169 void
170 zfs_remove_op_tables()
171 {
172 	/*
173 	 * Remove vfs ops
174 	 */
175 	ASSERT(zfsfstype);
176 	(void) vfs_freevfsops_by_type(zfsfstype);
177 	zfsfstype = 0;
178 
179 	/*
180 	 * Remove vnode ops
181 	 */
182 	if (zfs_dvnodeops)
183 		vn_freevnodeops(zfs_dvnodeops);
184 	if (zfs_fvnodeops)
185 		vn_freevnodeops(zfs_fvnodeops);
186 	if (zfs_symvnodeops)
187 		vn_freevnodeops(zfs_symvnodeops);
188 	if (zfs_xdvnodeops)
189 		vn_freevnodeops(zfs_xdvnodeops);
190 	if (zfs_evnodeops)
191 		vn_freevnodeops(zfs_evnodeops);
192 
193 	zfs_dvnodeops = NULL;
194 	zfs_fvnodeops = NULL;
195 	zfs_symvnodeops = NULL;
196 	zfs_xdvnodeops = NULL;
197 	zfs_evnodeops = NULL;
198 }
199 
200 extern const fs_operation_def_t zfs_dvnodeops_template[];
201 extern const fs_operation_def_t zfs_fvnodeops_template[];
202 extern const fs_operation_def_t zfs_xdvnodeops_template[];
203 extern const fs_operation_def_t zfs_symvnodeops_template[];
204 extern const fs_operation_def_t zfs_evnodeops_template[];
205 
206 int
207 zfs_create_op_tables()
208 {
209 	int error;
210 
211 	/*
212 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
213 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
214 	 * In this case we just return as the ops vectors are already set up.
215 	 */
216 	if (zfs_dvnodeops)
217 		return (0);
218 
219 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
220 	    &zfs_dvnodeops);
221 	if (error)
222 		return (error);
223 
224 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
225 	    &zfs_fvnodeops);
226 	if (error)
227 		return (error);
228 
229 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
230 	    &zfs_symvnodeops);
231 	if (error)
232 		return (error);
233 
234 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
235 	    &zfs_xdvnodeops);
236 	if (error)
237 		return (error);
238 
239 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
240 	    &zfs_evnodeops);
241 
242 	return (error);
243 }
244 
245 /*
246  * zfs_init_fs - Initialize the zfsvfs struct and the file system
247  *	incore "master" object.  Verify version compatibility.
248  */
249 int
250 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
251 {
252 	extern int zfsfstype;
253 
254 	objset_t	*os = zfsvfs->z_os;
255 	int		i, error;
256 	dmu_object_info_t doi;
257 	uint64_t fsid_guid;
258 
259 	*zpp = NULL;
260 
261 	/*
262 	 * XXX - hack to auto-create the pool root filesystem at
263 	 * the first attempted mount.
264 	 */
265 	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
266 		dmu_tx_t *tx = dmu_tx_create(os);
267 		uint64_t zpl_version;
268 
269 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
270 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
271 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
272 		error = dmu_tx_assign(tx, TXG_WAIT);
273 		ASSERT3U(error, ==, 0);
274 		if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
275 			zpl_version = ZPL_VERSION;
276 		else
277 			zpl_version = ZPL_VERSION_FUID - 1;
278 		zfs_create_fs(os, cr, zpl_version, 0, tx);
279 		dmu_tx_commit(tx);
280 	}
281 
282 	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
283 	    &zfsvfs->z_version);
284 	if (error) {
285 		return (error);
286 	} else if (zfsvfs->z_version > ZPL_VERSION) {
287 		(void) printf("Mismatched versions:  File system "
288 		    "is version %lld on-disk format, which is "
289 		    "incompatible with this software version %lld!",
290 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
291 		return (ENOTSUP);
292 	}
293 
294 	/*
295 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
296 	 * separates our fsid from any other filesystem types, and a
297 	 * 56-bit objset unique ID.  The objset unique ID is unique to
298 	 * all objsets open on this system, provided by unique_create().
299 	 * The 8-bit fs type must be put in the low bits of fsid[1]
300 	 * because that's where other Solaris filesystems put it.
301 	 */
302 	fsid_guid = dmu_objset_fsid_guid(os);
303 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
304 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
305 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
306 	    zfsfstype & 0xFF;
307 
308 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
309 	    &zfsvfs->z_root);
310 	if (error)
311 		return (error);
312 	ASSERT(zfsvfs->z_root != 0);
313 
314 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
315 	    &zfsvfs->z_unlinkedobj);
316 	if (error)
317 		return (error);
318 
319 	/*
320 	 * Initialize zget mutex's
321 	 */
322 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
323 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
324 
325 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
326 	if (error) {
327 		/*
328 		 * On error, we destroy the mutexes here since it's not
329 		 * possible for the caller to determine if the mutexes were
330 		 * initialized properly.
331 		 */
332 		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
333 			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
334 		return (error);
335 	}
336 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
337 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
338 	    &zfsvfs->z_fuid_obj);
339 	if (error == ENOENT)
340 		error = 0;
341 
342 	return (0);
343 }
344 
/*
 * Values describing the 64-bit device number layout; they are defined
 * here when missing so that both 64 and 32 bit environments have them.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif

/*
 * Expand a native dev_t into the 64-bit device number layout, for ZFS
 * private use.
 *
 * The standard expldev() can't be used since it doesn't do what we
 * want: it takes a dev32_t in LP64 and expands it to a long dev_t,
 * whereas we need the expansion to happen in ILP32.  In LP64 the value
 * is returned unchanged.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t dev_major = ((major_t)dev >> NBITSMINOR32) & MAXMAJ32;

	return (((uint64_t)dev_major << NBITSMINOR64) |
	    ((minor_t)dev & MAXMIN32));
#endif
}
378 
/*
 * Compress a 64-bit device number into a native dev_t, for ZFS private
 * use.
 *
 * The standard cmpldev() can't be used since it takes a long dev_t and
 * compresses it to dev32_t in LP64, whereas we need the compaction to
 * happen in ILP32.  There, NODEV32 is returned when the major or minor
 * number does not fit in 32-bit form.  In LP64 the value is returned
 * unchanged.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifdef _LP64
	return (dev);
#else
	minor_t dev_minor = (minor_t)dev & MAXMIN64;
	major_t dev_major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	if (dev_major > MAXMAJ32 || dev_minor > MAXMIN32)
		return (NODEV32);

	return (((dev32_t)dev_major << NBITSMINOR32) | dev_minor);
#endif
}
401 
402 static void
403 zfs_znode_dmu_init(znode_t *zp, dmu_buf_t *db)
404 {
405 	znode_t		*nzp;
406 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
407 
408 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp)));
409 
410 	mutex_enter(&zp->z_lock);
411 
412 	ASSERT(zp->z_dbuf == NULL);
413 	zp->z_dbuf = db;
414 	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
415 
416 	/*
417 	 * there should be no
418 	 * concurrent zgets on this object.
419 	 */
420 	if (nzp != NULL)
421 		panic("existing znode %p for dbuf %p", nzp, db);
422 
423 	/*
424 	 * Slap on VROOT if we are the root znode
425 	 */
426 	if (zp->z_id == zfsvfs->z_root)
427 		ZTOV(zp)->v_flag |= VROOT;
428 
429 	mutex_exit(&zp->z_lock);
430 	vn_exists(ZTOV(zp));
431 }
432 
433 static void
434 zfs_znode_dmu_fini(znode_t *zp)
435 {
436 	dmu_buf_t *db = zp->z_dbuf;
437 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp)));
438 	ASSERT(zp->z_dbuf != NULL);
439 	zp->z_dbuf = NULL;
440 	dmu_buf_rele(db, NULL);
441 }
442 
443 /*
444  * Construct a new znode/vnode and intialize.
445  *
446  * This does not do a call to dmu_set_user() that is
447  * up to the caller to do, in case you don't want to
448  * return the znode
449  */
450 static znode_t *
451 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
452 {
453 	znode_t	*zp;
454 	vnode_t *vp;
455 
456 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
457 
458 	ASSERT(zp->z_dirlocks == NULL);
459 	ASSERT(zp->z_dbuf == NULL);
460 
461 	zp->z_phys = NULL;
462 	zp->z_zfsvfs = zfsvfs;
463 	zp->z_unlinked = 0;
464 	zp->z_atime_dirty = 0;
465 	zp->z_mapcnt = 0;
466 	zp->z_last_itx = 0;
467 	zp->z_id = db->db_object;
468 	zp->z_blksz = blksz;
469 	zp->z_seq = 0x7A4653;
470 	zp->z_sync_cnt = 0;
471 
472 	vp = ZTOV(zp);
473 	vn_reinit(vp);
474 
475 	zfs_znode_dmu_init(zp, db);
476 
477 	zp->z_gen = zp->z_phys->zp_gen;
478 
479 	mutex_enter(&zfsvfs->z_znodes_lock);
480 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
481 	mutex_exit(&zfsvfs->z_znodes_lock);
482 
483 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
484 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
485 
486 	switch (vp->v_type) {
487 	case VDIR:
488 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
489 			vn_setops(vp, zfs_xdvnodeops);
490 			vp->v_flag |= V_XATTRDIR;
491 		} else {
492 			vn_setops(vp, zfs_dvnodeops);
493 		}
494 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
495 		break;
496 	case VBLK:
497 	case VCHR:
498 		vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
499 		/*FALLTHROUGH*/
500 	case VFIFO:
501 	case VSOCK:
502 	case VDOOR:
503 		vn_setops(vp, zfs_fvnodeops);
504 		break;
505 	case VREG:
506 		vp->v_flag |= VMODSORT;
507 		vn_setops(vp, zfs_fvnodeops);
508 		break;
509 	case VLNK:
510 		vn_setops(vp, zfs_symvnodeops);
511 		break;
512 	default:
513 		vn_setops(vp, zfs_evnodeops);
514 		break;
515 	}
516 
517 	/* it can be NULL from zfs_create_fs */
518 	if (zfsvfs->z_vfs)
519 		VFS_HOLD(zfsvfs->z_vfs);
520 	return (zp);
521 }
522 
523 /*
524  * Create a new DMU object to hold a zfs znode.
525  *
526  *	IN:	dzp	- parent directory for new znode
527  *		vap	- file attributes for new znode
528  *		tx	- dmu transaction id for zap operations
529  *		cr	- credentials of caller
530  *		flag	- flags:
531  *			  IS_ROOT_NODE	- new object will be root
532  *			  IS_XATTR	- new object is an attribute
533  *			  IS_REPLAY	- intent log replay
534  *		bonuslen - length of bonus buffer
535  *		setaclp  - File/Dir initial ACL
536  *		fuidp	 - Tracks fuid allocation.
537  *
538  *	OUT:	zpp	- allocated znode
539  *
540  */
541 void
542 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
543     uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
544     zfs_fuid_info_t **fuidp)
545 {
546 	dmu_buf_t	*db;
547 	znode_phys_t	*pzp;
548 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
549 	timestruc_t	now;
550 	uint64_t	gen, obj;
551 	int		err;
552 
553 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
554 
555 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
556 		obj = vap->va_nodeid;
557 		flag |= IS_REPLAY;
558 		now = vap->va_ctime;		/* see zfs_replay_create() */
559 		gen = vap->va_nblocks;		/* ditto */
560 	} else {
561 		obj = 0;
562 		gethrestime(&now);
563 		gen = dmu_tx_get_txg(tx);
564 	}
565 
566 	/*
567 	 * Create a new DMU object.
568 	 */
569 	/*
570 	 * There's currently no mechanism for pre-reading the blocks that will
571 	 * be to needed allocate a new object, so we accept the small chance
572 	 * that there will be an i/o error and we will fail one of the
573 	 * assertions below.
574 	 */
575 	if (vap->va_type == VDIR) {
576 		if (flag & IS_REPLAY) {
577 			err = zap_create_claim_norm(zfsvfs->z_os, obj,
578 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
579 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
580 			ASSERT3U(err, ==, 0);
581 		} else {
582 			obj = zap_create_norm(zfsvfs->z_os,
583 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
584 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
585 		}
586 	} else {
587 		if (flag & IS_REPLAY) {
588 			err = dmu_object_claim(zfsvfs->z_os, obj,
589 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
590 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
591 			ASSERT3U(err, ==, 0);
592 		} else {
593 			obj = dmu_object_alloc(zfsvfs->z_os,
594 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
595 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
596 		}
597 	}
598 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
599 	dmu_buf_will_dirty(db, tx);
600 
601 	/*
602 	 * Initialize the znode physical data to zero.
603 	 */
604 	ASSERT(db->db_size >= sizeof (znode_phys_t));
605 	bzero(db->db_data, db->db_size);
606 	pzp = db->db_data;
607 
608 	/*
609 	 * If this is the root, fix up the half-initialized parent pointer
610 	 * to reference the just-allocated physical data area.
611 	 */
612 	if (flag & IS_ROOT_NODE) {
613 		dzp->z_phys = pzp;
614 		dzp->z_id = obj;
615 	}
616 
617 	/*
618 	 * If parent is an xattr, so am I.
619 	 */
620 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
621 		flag |= IS_XATTR;
622 
623 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
624 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
625 	}
626 
627 	if (zfsvfs->z_use_fuids)
628 		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
629 
630 	if (vap->va_type == VDIR) {
631 		pzp->zp_size = 2;		/* contents ("." and "..") */
632 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
633 	}
634 
635 	pzp->zp_parent = dzp->z_id;
636 	if (flag & IS_XATTR)
637 		pzp->zp_flags |= ZFS_XATTR;
638 
639 	pzp->zp_gen = gen;
640 
641 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
642 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
643 
644 	if (vap->va_mask & AT_ATIME) {
645 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
646 	} else {
647 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
648 	}
649 
650 	if (vap->va_mask & AT_MTIME) {
651 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
652 	} else {
653 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
654 	}
655 
656 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
657 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj)
658 	*zpp = zfs_znode_alloc(zfsvfs, db, 0);
659 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
660 	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
661 }
662 
663 void
664 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
665 {
666 	xoptattr_t *xoap;
667 
668 	xoap = xva_getxoptattr(xvap);
669 	ASSERT(xoap);
670 
671 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
672 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
673 		XVA_SET_RTN(xvap, XAT_CREATETIME);
674 	}
675 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
676 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
677 		XVA_SET_RTN(xvap, XAT_READONLY);
678 	}
679 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
680 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
681 		XVA_SET_RTN(xvap, XAT_HIDDEN);
682 	}
683 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
684 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
685 		XVA_SET_RTN(xvap, XAT_SYSTEM);
686 	}
687 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
688 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
689 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
690 	}
691 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
692 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
693 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
694 	}
695 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
696 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
697 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
698 	}
699 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
700 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
701 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
702 	}
703 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
704 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
705 		XVA_SET_RTN(xvap, XAT_NODUMP);
706 	}
707 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
708 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
709 		XVA_SET_RTN(xvap, XAT_OPAQUE);
710 	}
711 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
712 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
713 		    xoap->xoa_av_quarantined);
714 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
715 	}
716 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
717 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
718 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
719 	}
720 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
721 		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
722 		    sizeof (xoap->xoa_av_scanstamp));
723 		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
724 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
725 	}
726 }
727 
728 int
729 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
730 {
731 	dmu_object_info_t doi;
732 	dmu_buf_t	*db;
733 	znode_t		*zp;
734 	int err;
735 
736 	*zpp = NULL;
737 
738 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
739 
740 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
741 	if (err) {
742 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
743 		return (err);
744 	}
745 
746 	dmu_object_info_from_db(db, &doi);
747 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
748 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
749 		dmu_buf_rele(db, NULL);
750 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
751 		return (EINVAL);
752 	}
753 
754 	zp = dmu_buf_get_user(db);
755 	if (zp != NULL) {
756 		mutex_enter(&zp->z_lock);
757 
758 		/*
759 		 * Since we do immediate eviction of the z_dbuf, we
760 		 * should never find a dbuf with a znode that doesn't
761 		 * know about the dbuf.
762 		 */
763 		ASSERT3P(zp->z_dbuf, ==, db);
764 		ASSERT3U(zp->z_id, ==, obj_num);
765 		if (zp->z_unlinked) {
766 			err = ENOENT;
767 		} else {
768 			VN_HOLD(ZTOV(zp));
769 			*zpp = zp;
770 			err = 0;
771 		}
772 		dmu_buf_rele(db, NULL);
773 		mutex_exit(&zp->z_lock);
774 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
775 		return (err);
776 	}
777 
778 	/*
779 	 * Not found create new znode/vnode
780 	 */
781 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
782 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
783 	*zpp = zp;
784 	return (0);
785 }
786 
787 int
788 zfs_rezget(znode_t *zp)
789 {
790 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
791 	dmu_object_info_t doi;
792 	dmu_buf_t *db;
793 	uint64_t obj_num = zp->z_id;
794 	int err;
795 
796 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
797 
798 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
799 	if (err) {
800 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
801 		return (err);
802 	}
803 
804 	dmu_object_info_from_db(db, &doi);
805 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
806 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
807 		dmu_buf_rele(db, NULL);
808 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
809 		return (EINVAL);
810 	}
811 
812 	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
813 		dmu_buf_rele(db, NULL);
814 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
815 		return (EIO);
816 	}
817 
818 	zfs_znode_dmu_init(zp, db);
819 	zp->z_unlinked = (zp->z_phys->zp_links == 0);
820 
821 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
822 
823 	return (0);
824 }
825 
826 void
827 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
828 {
829 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
830 	uint64_t obj = zp->z_id;
831 
832 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
833 	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
834 		VERIFY(0 == dmu_object_free(zfsvfs->z_os,
835 		    zp->z_phys->zp_acl.z_acl_extern_obj, tx));
836 	}
837 	VERIFY(0 == dmu_object_free(zfsvfs->z_os, obj, tx));
838 	zfs_znode_dmu_fini(zp);
839 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
840 }
841 
842 void
843 zfs_zinactive(znode_t *zp)
844 {
845 	vnode_t	*vp = ZTOV(zp);
846 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
847 	uint64_t z_id = zp->z_id;
848 
849 	ASSERT(zp->z_dbuf && zp->z_phys);
850 
851 	/*
852 	 * Don't allow a zfs_zget() while were trying to release this znode
853 	 */
854 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
855 
856 	mutex_enter(&zp->z_lock);
857 	mutex_enter(&vp->v_lock);
858 	vp->v_count--;
859 	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
860 		/*
861 		 * If the hold count is greater than zero, somebody has
862 		 * obtained a new reference on this znode while we were
863 		 * processing it here, so we are done.  If we still have
864 		 * mapped pages then we are also done, since we don't
865 		 * want to inactivate the znode until the pages get pushed.
866 		 *
867 		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
868 		 * this seems like it would leave the znode hanging with
869 		 * no chance to go inactive...
870 		 */
871 		mutex_exit(&vp->v_lock);
872 		mutex_exit(&zp->z_lock);
873 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
874 		return;
875 	}
876 	mutex_exit(&vp->v_lock);
877 
878 	/*
879 	 * If this was the last reference to a file with no links,
880 	 * remove the file from the file system.
881 	 */
882 	if (zp->z_unlinked) {
883 		mutex_exit(&zp->z_lock);
884 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
885 		zfs_rmnode(zp);
886 		VFS_RELE(zfsvfs->z_vfs);
887 		return;
888 	}
889 	mutex_exit(&zp->z_lock);
890 	zfs_znode_dmu_fini(zp);
891 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
892 	/* it can be NULL from zfs_create_fs */
893 	if (zfsvfs->z_vfs)
894 		VFS_RELE(zfsvfs->z_vfs);
895 }
896 
897 void
898 zfs_znode_free(znode_t *zp)
899 {
900 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
901 
902 	mutex_enter(&zfsvfs->z_znodes_lock);
903 	list_remove(&zfsvfs->z_all_znodes, zp);
904 	mutex_exit(&zfsvfs->z_znodes_lock);
905 
906 	kmem_cache_free(znode_cache, zp);
907 }
908 
909 void
910 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
911 {
912 	timestruc_t	now;
913 
914 	ASSERT(MUTEX_HELD(&zp->z_lock));
915 
916 	gethrestime(&now);
917 
918 	if (tx) {
919 		dmu_buf_will_dirty(zp->z_dbuf, tx);
920 		zp->z_atime_dirty = 0;
921 		zp->z_seq++;
922 	} else {
923 		zp->z_atime_dirty = 1;
924 	}
925 
926 	if (flag & AT_ATIME)
927 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
928 
929 	if (flag & AT_MTIME) {
930 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
931 		if (zp->z_zfsvfs->z_use_fuids)
932 			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
933 	}
934 
935 	if (flag & AT_CTIME) {
936 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
937 		if (zp->z_zfsvfs->z_use_fuids)
938 			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
939 	}
940 }
941 
942 /*
943  * Update the requested znode timestamps with the current time.
944  * If we are in a transaction, then go ahead and mark the znode
945  * dirty in the transaction so the timestamps will go to disk.
946  * Otherwise, we will get pushed next time the znode is updated
947  * in a transaction, or when this znode eventually goes inactive.
948  *
949  * Why is this OK?
950  *  1 - Only the ACCESS time is ever updated outside of a transaction.
951  *  2 - Multiple consecutive updates will be collapsed into a single
952  *	znode update by the transaction grouping semantics of the DMU.
953  */
954 void
955 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
956 {
957 	mutex_enter(&zp->z_lock);
958 	zfs_time_stamper_locked(zp, flag, tx);
959 	mutex_exit(&zp->z_lock);
960 }
961 
962 /*
963  * Grow the block size for a file.
964  *
965  *	IN:	zp	- znode of file to free data in.
966  *		size	- requested block size
967  *		tx	- open transaction.
968  *
969  * NOTE: this function assumes that the znode is write locked.
970  */
971 void
972 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
973 {
974 	int		error;
975 	u_longlong_t	dummy;
976 
977 	if (size <= zp->z_blksz)
978 		return;
979 	/*
980 	 * If the file size is already greater than the current blocksize,
981 	 * we will not grow.  If there is more than one block in a file,
982 	 * the blocksize cannot change.
983 	 */
984 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
985 		return;
986 
987 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
988 	    size, 0, tx);
989 	if (error == ENOTSUP)
990 		return;
991 	ASSERT3U(error, ==, 0);
992 
993 	/* What blocksize did we actually get? */
994 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
995 }
996 
997 /*
998  * This is a dummy interface used when pvn_vplist_dirty() should *not*
999  * be calling back into the fs for a putpage().  E.g.: when truncating
1000  * a file, the pages being "thrown away* don't need to be written out.
1001  */
1002 /* ARGSUSED */
1003 static int
1004 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1005     int flags, cred_t *cr)
1006 {
1007 	ASSERT(0);
1008 	return (0);
1009 }
1010 
1011 /*
1012  * Free space in a file.
1013  *
1014  *	IN:	zp	- znode of file to free data in.
1015  *		off	- start of section to free.
1016  *		len	- length of section to free (0 => to EOF).
1017  *		flag	- current file open mode flags.
1018  *
1019  * 	RETURN:	0 if success
1020  *		error code if failure
1021  */
1022 int
1023 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1024 {
1025 	vnode_t *vp = ZTOV(zp);
1026 	dmu_tx_t *tx;
1027 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1028 	zilog_t *zilog = zfsvfs->z_log;
1029 	rl_t *rl;
1030 	uint64_t end = off + len;
1031 	uint64_t size, new_blksz;
1032 	uint64_t pflags = zp->z_phys->zp_flags;
1033 	int error;
1034 
1035 	if ((pflags & (ZFS_IMMUTABLE|ZFS_READONLY)) ||
1036 	    off < zp->z_phys->zp_size && (pflags & ZFS_APPENDONLY))
1037 		return (EPERM);
1038 
1039 	if (ZTOV(zp)->v_type == VFIFO)
1040 		return (0);
1041 
1042 	/*
1043 	 * If we will change zp_size then lock the whole file,
1044 	 * otherwise just lock the range being freed.
1045 	 */
1046 	if (len == 0 || off + len > zp->z_phys->zp_size) {
1047 		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1048 	} else {
1049 		rl = zfs_range_lock(zp, off, len, RL_WRITER);
1050 		/* recheck, in case zp_size changed */
1051 		if (off + len > zp->z_phys->zp_size) {
1052 			/* lost race: file size changed, lock whole file */
1053 			zfs_range_unlock(rl);
1054 			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1055 		}
1056 	}
1057 
1058 	/*
1059 	 * Nothing to do if file already at desired length.
1060 	 */
1061 	size = zp->z_phys->zp_size;
1062 	if (len == 0 && size == off && off != 0) {
1063 		zfs_range_unlock(rl);
1064 		return (0);
1065 	}
1066 
1067 	/*
1068 	 * Check for any locks in the region to be freed.
1069 	 */
1070 	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
1071 		uint64_t start = off;
1072 		uint64_t extent = len;
1073 
1074 		if (off > size) {
1075 			start = size;
1076 			extent += off - size;
1077 		} else if (len == 0) {
1078 			extent = size - off;
1079 		}
1080 		if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
1081 			zfs_range_unlock(rl);
1082 			return (error);
1083 		}
1084 	}
1085 
1086 	tx = dmu_tx_create(zfsvfs->z_os);
1087 	dmu_tx_hold_bonus(tx, zp->z_id);
1088 	new_blksz = 0;
1089 	if (end > size &&
1090 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1091 		/*
1092 		 * We are growing the file past the current block size.
1093 		 */
1094 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1095 			ASSERT(!ISP2(zp->z_blksz));
1096 			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
1097 		} else {
1098 			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1099 		}
1100 		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
1101 	} else if (off < size) {
1102 		/*
1103 		 * If len == 0, we are truncating the file.
1104 		 */
1105 		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
1106 	}
1107 
1108 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1109 	if (error) {
1110 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
1111 			dmu_tx_wait(tx);
1112 		dmu_tx_abort(tx);
1113 		zfs_range_unlock(rl);
1114 		return (error);
1115 	}
1116 
1117 	if (new_blksz)
1118 		zfs_grow_blocksize(zp, new_blksz, tx);
1119 
1120 	if (end > size || len == 0)
1121 		zp->z_phys->zp_size = end;
1122 
1123 	if (off < size) {
1124 		objset_t *os = zfsvfs->z_os;
1125 		uint64_t rlen = len;
1126 
1127 		if (len == 0)
1128 			rlen = -1;
1129 		else if (end > size)
1130 			rlen = size - off;
1131 		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
1132 	}
1133 
1134 	if (log) {
1135 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1136 		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1137 	}
1138 
1139 	zfs_range_unlock(rl);
1140 
1141 	dmu_tx_commit(tx);
1142 
1143 	/*
1144 	 * Clear any mapped pages in the truncated region.  This has to
1145 	 * happen outside of the transaction to avoid the possibility of
1146 	 * a deadlock with someone trying to push a page that we are
1147 	 * about to invalidate.
1148 	 */
1149 	rw_enter(&zp->z_map_lock, RW_WRITER);
1150 	if (off < size && vn_has_cached_data(vp)) {
1151 		page_t *pp;
1152 		uint64_t start = off & PAGEMASK;
1153 		int poff = off & PAGEOFFSET;
1154 
1155 		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1156 			/*
1157 			 * We need to zero a partial page.
1158 			 */
1159 			pagezero(pp, poff, PAGESIZE - poff);
1160 			start += PAGESIZE;
1161 			page_unlock(pp);
1162 		}
1163 		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1164 		    B_INVAL | B_TRUNC, NULL);
1165 		ASSERT(error == 0);
1166 	}
1167 	rw_exit(&zp->z_map_lock);
1168 
1169 	return (0);
1170 }
1171 
1172 void
1173 zfs_create_fs(objset_t *os, cred_t *cr, uint64_t version,
1174     int norm, dmu_tx_t *tx)
1175 {
1176 	zfsvfs_t	zfsvfs;
1177 	uint64_t	moid, doid;
1178 	int		error;
1179 	znode_t		*rootzp = NULL;
1180 	vnode_t		*vp;
1181 	vattr_t		vattr;
1182 	znode_t		*zp;
1183 
1184 	/*
1185 	 * First attempt to create master node.
1186 	 */
1187 	/*
1188 	 * In an empty objset, there are no blocks to read and thus
1189 	 * there can be no i/o errors (which we assert below).
1190 	 */
1191 	moid = MASTER_NODE_OBJ;
1192 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1193 	    DMU_OT_NONE, 0, tx);
1194 	ASSERT(error == 0);
1195 
1196 	/*
1197 	 * Set starting attributes.
1198 	 */
1199 
1200 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1201 	ASSERT(error == 0);
1202 
1203 	/*
1204 	 * Create a delete queue.
1205 	 */
1206 	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1207 
1208 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
1209 	ASSERT(error == 0);
1210 
1211 	/*
1212 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
1213 	 * to allow zfs_mknode to work.
1214 	 */
1215 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1216 	vattr.va_type = VDIR;
1217 	vattr.va_mode = S_IFDIR|0755;
1218 	vattr.va_uid = crgetuid(cr);
1219 	vattr.va_gid = crgetgid(cr);
1220 
1221 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1222 	rootzp->z_zfsvfs = &zfsvfs;
1223 	rootzp->z_unlinked = 0;
1224 	rootzp->z_atime_dirty = 0;
1225 
1226 	vp = ZTOV(rootzp);
1227 	vn_reinit(vp);
1228 	vp->v_type = VDIR;
1229 
1230 	bzero(&zfsvfs, sizeof (zfsvfs_t));
1231 
1232 	zfsvfs.z_os = os;
1233 	zfsvfs.z_assign = TXG_NOWAIT;
1234 	zfsvfs.z_parent = &zfsvfs;
1235 	zfsvfs.z_version = version;
1236 	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1237 	zfsvfs.z_norm = norm;
1238 
1239 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1240 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1241 	    offsetof(znode_t, z_link_node));
1242 
1243 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE,
1244 	    &zp, 0, NULL, NULL);
1245 	VN_RELE(ZTOV(zp));
1246 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1247 	ASSERT(error == 0);
1248 
1249 	ZTOV(rootzp)->v_count = 0;
1250 	kmem_cache_free(znode_cache, rootzp);
1251 }
1252 
1253 #endif /* _KERNEL */
1254 /*
1255  * Given an object number, return its parent object number and whether
1256  * or not the object is an extended attribute directory.
1257  */
1258 static int
1259 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1260 {
1261 	dmu_buf_t *db;
1262 	dmu_object_info_t doi;
1263 	znode_phys_t *zp;
1264 	int error;
1265 
1266 	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1267 		return (error);
1268 
1269 	dmu_object_info_from_db(db, &doi);
1270 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1271 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1272 		dmu_buf_rele(db, FTAG);
1273 		return (EINVAL);
1274 	}
1275 
1276 	zp = db->db_data;
1277 	*pobjp = zp->zp_parent;
1278 	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1279 	    S_ISDIR(zp->zp_mode);
1280 	dmu_buf_rele(db, FTAG);
1281 
1282 	return (0);
1283 }
1284 
1285 int
1286 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1287 {
1288 	char *path = buf + len - 1;
1289 	int error;
1290 
1291 	*path = '\0';
1292 
1293 	for (;;) {
1294 		uint64_t pobj;
1295 		char component[MAXNAMELEN + 2];
1296 		size_t complen;
1297 		int is_xattrdir;
1298 
1299 		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1300 		    &is_xattrdir)) != 0)
1301 			break;
1302 
1303 		if (pobj == obj) {
1304 			if (path[0] != '/')
1305 				*--path = '/';
1306 			break;
1307 		}
1308 
1309 		component[0] = '/';
1310 		if (is_xattrdir) {
1311 			(void) sprintf(component + 1, "<xattrdir>");
1312 		} else {
1313 			error = zap_value_search(osp, pobj, obj,
1314 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
1315 			if (error != 0)
1316 				break;
1317 		}
1318 
1319 		complen = strlen(component);
1320 		path -= complen;
1321 		ASSERT(path >= buf);
1322 		bcopy(component, path, complen);
1323 		obj = pobj;
1324 	}
1325 
1326 	if (error == 0)
1327 		(void) memmove(buf, path, buf + len - path);
1328 	return (error);
1329 }
1330