xref: /titanic_50/usr/src/uts/common/fs/zfs/zfs_znode.c (revision 711890bc9379ceea66272dc8d4981812224ea86e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/mkdev.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/kmem.h>
41 #include <sys/cmn_err.h>
42 #include <sys/errno.h>
43 #include <sys/unistd.h>
44 #include <sys/mode.h>
45 #include <sys/atomic.h>
46 #include <vm/pvn.h>
47 #include "fs/fs_subr.h"
48 #include <sys/zfs_dir.h>
49 #include <sys/zfs_acl.h>
50 #include <sys/zfs_ioctl.h>
51 #include <sys/zfs_rlock.h>
52 #include <sys/fs/zfs.h>
53 #endif /* _KERNEL */
54 
55 #include <sys/dmu.h>
56 #include <sys/refcount.h>
57 #include <sys/stat.h>
58 #include <sys/zap.h>
59 #include <sys/zfs_znode.h>
60 
/*
 * Functions needed by userland (i.e. libzpool) are not placed under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
66 #ifdef _KERNEL
/*
 * Object cache from which znode_t structures are allocated.  It is
 * created by zfs_znode_init() and destroyed by zfs_znode_fini().
 */
struct kmem_cache *znode_cache = NULL;
68 
69 /*ARGSUSED*/
70 static void
71 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
72 {
73 	znode_t *zp = user_ptr;
74 	vnode_t *vp = ZTOV(zp);
75 
76 	mutex_enter(&zp->z_lock);
77 	if (vp->v_count == 0) {
78 		mutex_exit(&zp->z_lock);
79 		vn_invalid(vp);
80 		zfs_znode_free(zp);
81 	} else {
82 		/* signal force unmount that this znode can be freed */
83 		zp->z_dbuf = NULL;
84 		mutex_exit(&zp->z_lock);
85 	}
86 }
87 
88 /*ARGSUSED*/
89 static int
90 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
91 {
92 	znode_t *zp = buf;
93 
94 	zp->z_vnode = vn_alloc(KM_SLEEP);
95 	zp->z_vnode->v_data = (caddr_t)zp;
96 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
97 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
98 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
99 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
100 
101 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
102 	avl_create(&zp->z_range_avl, zfs_range_compare,
103 	    sizeof (rl_t), offsetof(rl_t, r_node));
104 
105 	zp->z_dbuf_held = 0;
106 	zp->z_dirlocks = 0;
107 	return (0);
108 }
109 
110 /*ARGSUSED*/
111 static void
112 zfs_znode_cache_destructor(void *buf, void *cdarg)
113 {
114 	znode_t *zp = buf;
115 
116 	ASSERT(zp->z_dirlocks == 0);
117 	mutex_destroy(&zp->z_lock);
118 	rw_destroy(&zp->z_map_lock);
119 	rw_destroy(&zp->z_parent_lock);
120 	mutex_destroy(&zp->z_acl_lock);
121 	avl_destroy(&zp->z_range_avl);
122 
123 	ASSERT(zp->z_dbuf_held == 0);
124 	ASSERT(ZTOV(zp)->v_count == 0);
125 	vn_free(ZTOV(zp));
126 }
127 
128 void
129 zfs_znode_init(void)
130 {
131 	/*
132 	 * Initialize zcache
133 	 */
134 	ASSERT(znode_cache == NULL);
135 	znode_cache = kmem_cache_create("zfs_znode_cache",
136 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
137 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
138 }
139 
140 void
141 zfs_znode_fini(void)
142 {
143 	/*
144 	 * Cleanup vfs & vnode ops
145 	 */
146 	zfs_remove_op_tables();
147 
148 	/*
149 	 * Cleanup zcache
150 	 */
151 	if (znode_cache)
152 		kmem_cache_destroy(znode_cache);
153 	znode_cache = NULL;
154 }
155 
/*
 * Vnode operation vectors.  They are built by zfs_create_op_tables()
 * and freed by zfs_remove_op_tables(); zfs_znode_alloc() installs one
 * of them on each vnode according to its type.
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* files, devices, fifos, etc. */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
161 
162 void
163 zfs_remove_op_tables()
164 {
165 	/*
166 	 * Remove vfs ops
167 	 */
168 	ASSERT(zfsfstype);
169 	(void) vfs_freevfsops_by_type(zfsfstype);
170 	zfsfstype = 0;
171 
172 	/*
173 	 * Remove vnode ops
174 	 */
175 	if (zfs_dvnodeops)
176 		vn_freevnodeops(zfs_dvnodeops);
177 	if (zfs_fvnodeops)
178 		vn_freevnodeops(zfs_fvnodeops);
179 	if (zfs_symvnodeops)
180 		vn_freevnodeops(zfs_symvnodeops);
181 	if (zfs_xdvnodeops)
182 		vn_freevnodeops(zfs_xdvnodeops);
183 	if (zfs_evnodeops)
184 		vn_freevnodeops(zfs_evnodeops);
185 
186 	zfs_dvnodeops = NULL;
187 	zfs_fvnodeops = NULL;
188 	zfs_symvnodeops = NULL;
189 	zfs_xdvnodeops = NULL;
190 	zfs_evnodeops = NULL;
191 }
192 
/*
 * Operation templates, defined outside this file, from which
 * zfs_create_op_tables() builds the vnode operation vectors.
 */
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
198 
199 int
200 zfs_create_op_tables()
201 {
202 	int error;
203 
204 	/*
205 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
206 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
207 	 * In this case we just return as the ops vectors are already set up.
208 	 */
209 	if (zfs_dvnodeops)
210 		return (0);
211 
212 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
213 	    &zfs_dvnodeops);
214 	if (error)
215 		return (error);
216 
217 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
218 	    &zfs_fvnodeops);
219 	if (error)
220 		return (error);
221 
222 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
223 	    &zfs_symvnodeops);
224 	if (error)
225 		return (error);
226 
227 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
228 	    &zfs_xdvnodeops);
229 	if (error)
230 		return (error);
231 
232 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
233 	    &zfs_evnodeops);
234 
235 	return (error);
236 }
237 
238 /*
239  * zfs_init_fs - Initialize the zfsvfs struct and the file system
240  *	incore "master" object.  Verify version compatibility.
241  */
242 int
243 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
244 {
245 	extern int zfsfstype;
246 
247 	objset_t	*os = zfsvfs->z_os;
248 	uint64_t	zoid;
249 	uint64_t	version = ZPL_VERSION;
250 	int		i, error;
251 	dmu_object_info_t doi;
252 	uint64_t fsid_guid;
253 
254 	*zpp = NULL;
255 
256 	/*
257 	 * XXX - hack to auto-create the pool root filesystem at
258 	 * the first attempted mount.
259 	 */
260 	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
261 		dmu_tx_t *tx = dmu_tx_create(os);
262 
263 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
264 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
265 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
266 		error = dmu_tx_assign(tx, TXG_WAIT);
267 		ASSERT3U(error, ==, 0);
268 		zfs_create_fs(os, cr, tx);
269 		dmu_tx_commit(tx);
270 	}
271 
272 	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
273 	    &version);
274 	if (error) {
275 		return (error);
276 	} else if (version != ZPL_VERSION) {
277 		(void) printf("Mismatched versions:  File system "
278 		    "is version %lld on-disk format, which is "
279 		    "incompatible with this software version %lld!",
280 		    (u_longlong_t)version, ZPL_VERSION);
281 		return (ENOTSUP);
282 	}
283 
284 	/*
285 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
286 	 * separates our fsid from any other filesystem types, and a
287 	 * 56-bit objset unique ID.  The objset unique ID is unique to
288 	 * all objsets open on this system, provided by unique_create().
289 	 * The 8-bit fs type must be put in the low bits of fsid[1]
290 	 * because that's where other Solaris filesystems put it.
291 	 */
292 	fsid_guid = dmu_objset_fsid_guid(os);
293 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
294 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
295 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
296 	    zfsfstype & 0xFF;
297 
298 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid);
299 	if (error)
300 		return (error);
301 	ASSERT(zoid != 0);
302 	zfsvfs->z_root = zoid;
303 
304 	/*
305 	 * Create the per mount vop tables.
306 	 */
307 
308 	/*
309 	 * Initialize zget mutex's
310 	 */
311 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
312 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
313 
314 	error = zfs_zget(zfsvfs, zoid, zpp);
315 	if (error)
316 		return (error);
317 	ASSERT3U((*zpp)->z_id, ==, zoid);
318 
319 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid);
320 	if (error)
321 		return (error);
322 
323 	zfsvfs->z_dqueue = zoid;
324 
325 	/*
326 	 * Initialize delete head structure
327 	 * Thread(s) will be started/stopped via
328 	 * readonly_changed_cb() depending
329 	 * on whether this is rw/ro mount.
330 	 */
331 	list_create(&zfsvfs->z_delete_head.z_znodes,
332 	    sizeof (znode_t), offsetof(znode_t, z_list_node));
333 	/* Mutex never destroyed. */
334 	mutex_init(&zfsvfs->z_delete_head.z_mutex, NULL, MUTEX_DEFAULT, NULL);
335 
336 	return (0);
337 }
338 
/*
 * Define the 64-bit dev_t layout values (width of the minor number and
 * the major/minor masks) whenever they are not already defined, so
 * that zfs_expldev() and zfs_cmpldev() have them available in both
 * 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif
352 
353 /*
354  * Create special expldev for ZFS private use.
355  * Can't use standard expldev since it doesn't do
356  * what we want.  The standard expldev() takes a
357  * dev32_t in LP64 and expands it to a long dev_t.
358  * We need an interface that takes a dev32_t in ILP32
359  * and expands it to a long dev_t.
360  */
361 static uint64_t
362 zfs_expldev(dev_t dev)
363 {
364 #ifndef _LP64
365 	major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32;
366 	return (((uint64_t)major << NBITSMINOR64) |
367 	    ((minor_t)dev & MAXMIN32));
368 #else
369 	return (dev);
370 #endif
371 }
372 
373 /*
374  * Special cmpldev for ZFS private use.
375  * Can't use standard cmpldev since it takes
376  * a long dev_t and compresses it to dev32_t in
377  * LP64.  We need to do a compaction of a long dev_t
378  * to a dev32_t in ILP32.
379  */
380 dev_t
381 zfs_cmpldev(uint64_t dev)
382 {
383 #ifndef _LP64
384 	minor_t minor = (minor_t)dev & MAXMIN64;
385 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
386 
387 	if (major > MAXMAJ32 || minor > MAXMIN32)
388 		return (NODEV32);
389 
390 	return (((dev32_t)major << NBITSMINOR32) | minor);
391 #else
392 	return (dev);
393 #endif
394 }
395 
396 /*
397  * Construct a new znode/vnode and intialize.
398  *
399  * This does not do a call to dmu_set_user() that is
400  * up to the caller to do, in case you don't want to
401  * return the znode
402  */
403 static znode_t *
404 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
405 {
406 	znode_t	*zp;
407 	vnode_t *vp;
408 
409 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
410 
411 	ASSERT(zp->z_dirlocks == NULL);
412 
413 	zp->z_phys = db->db_data;
414 	zp->z_zfsvfs = zfsvfs;
415 	zp->z_reap = 0;
416 	zp->z_atime_dirty = 0;
417 	zp->z_dbuf_held = 0;
418 	zp->z_mapcnt = 0;
419 	zp->z_last_itx = 0;
420 	zp->z_dbuf = db;
421 	zp->z_id = obj_num;
422 	zp->z_blksz = blksz;
423 	zp->z_seq = 0x7A4653;
424 	zp->z_sync_cnt = 0;
425 
426 	mutex_enter(&zfsvfs->z_znodes_lock);
427 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
428 	mutex_exit(&zfsvfs->z_znodes_lock);
429 
430 	vp = ZTOV(zp);
431 	vn_reinit(vp);
432 
433 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
434 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
435 
436 	switch (vp->v_type) {
437 	case VDIR:
438 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
439 			vn_setops(vp, zfs_xdvnodeops);
440 			vp->v_flag |= V_XATTRDIR;
441 		} else
442 			vn_setops(vp, zfs_dvnodeops);
443 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
444 		break;
445 	case VBLK:
446 	case VCHR:
447 		vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
448 		/*FALLTHROUGH*/
449 	case VFIFO:
450 	case VSOCK:
451 	case VDOOR:
452 		vn_setops(vp, zfs_fvnodeops);
453 		break;
454 	case VREG:
455 		vp->v_flag |= VMODSORT;
456 		vn_setops(vp, zfs_fvnodeops);
457 		break;
458 	case VLNK:
459 		vn_setops(vp, zfs_symvnodeops);
460 		break;
461 	default:
462 		vn_setops(vp, zfs_evnodeops);
463 		break;
464 	}
465 
466 	return (zp);
467 }
468 
469 static void
470 zfs_znode_dmu_init(znode_t *zp)
471 {
472 	znode_t		*nzp;
473 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
474 	dmu_buf_t	*db = zp->z_dbuf;
475 
476 	mutex_enter(&zp->z_lock);
477 
478 	nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
479 
480 	/*
481 	 * there should be no
482 	 * concurrent zgets on this object.
483 	 */
484 	ASSERT3P(nzp, ==, NULL);
485 
486 	/*
487 	 * Slap on VROOT if we are the root znode
488 	 */
489 	if (zp->z_id == zfsvfs->z_root) {
490 		ZTOV(zp)->v_flag |= VROOT;
491 	}
492 
493 	ASSERT(zp->z_dbuf_held == 0);
494 	zp->z_dbuf_held = 1;
495 	VFS_HOLD(zfsvfs->z_vfs);
496 	mutex_exit(&zp->z_lock);
497 	vn_exists(ZTOV(zp));
498 }
499 
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *			  IS_REPLAY	- intent log replay
 *		bonuslen - bytes of bonus buffer requested in addition
 *			  to sizeof (znode_phys_t)
 *
 *	OUT:	oid	- ID of created object
 *		zpp	- if non-NULL, receives the new znode; if NULL,
 *			  the in-core znode is freed before returning
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
	uint_t flag, znode_t **zpp, int bonuslen)
{
	dmu_buf_t	*dbp;
	znode_phys_t	*pzp;
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/*
	 * During replay the object number, creation time and generation
	 * are taken from the vattr; otherwise they are generated here.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
		*oid = vap->va_nodeid;
		flag |= IS_REPLAY;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		*oid = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		/* Directories are ZAP objects */
		if (flag & IS_REPLAY) {
			err = zap_create_claim(zfsvfs->z_os, *oid,
			    DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			*oid = zap_create(zfsvfs->z_os,
			    DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		/* Everything else is a plain file contents object */
		if (flag & IS_REPLAY) {
			err = dmu_object_claim(zfsvfs->z_os, *oid,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			*oid = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
	dmu_buf_will_dirty(dbp, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
	bzero(dbp->db_data, dbp->db_size);
	pzp = dbp->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_phys = pzp;
		dzp->z_id = *oid;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	/* Device nodes store their device number in expanded 64-bit form */
	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
	}

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	/* Caller-supplied access/modify times override the current time */
	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}

	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);

	zfs_perm_init(zp, dzp, flag, vap, tx, cr);

	if (zpp) {
		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);

		/* Register the znode with its dbuf under the object mutex */
		mutex_enter(hash_mtx);
		zfs_znode_dmu_init(zp);
		mutex_exit(hash_mtx);

		*zpp = zp;
	} else {
		/*
		 * The caller does not want the znode: zero the vnode hold
		 * count, release the bonus buffer hold and free the
		 * in-core znode.
		 */
		ZTOV(zp)->v_count = 0;
		dmu_buf_rele(dbp, NULL);
		zfs_znode_free(zp);
	}
}
646 
647 int
648 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
649 {
650 	dmu_object_info_t doi;
651 	dmu_buf_t	*db;
652 	znode_t		*zp;
653 	int err;
654 
655 	*zpp = NULL;
656 
657 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
658 
659 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
660 	if (err) {
661 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
662 		return (err);
663 	}
664 
665 	dmu_object_info_from_db(db, &doi);
666 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
667 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
668 		dmu_buf_rele(db, NULL);
669 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
670 		return (EINVAL);
671 	}
672 
673 	ASSERT(db->db_object == obj_num);
674 	ASSERT(db->db_offset == -1);
675 	ASSERT(db->db_data != NULL);
676 
677 	zp = dmu_buf_get_user(db);
678 
679 	if (zp != NULL) {
680 		mutex_enter(&zp->z_lock);
681 
682 		ASSERT3U(zp->z_id, ==, obj_num);
683 		if (zp->z_reap) {
684 			dmu_buf_rele(db, NULL);
685 			mutex_exit(&zp->z_lock);
686 			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
687 			return (ENOENT);
688 		} else if (zp->z_dbuf_held) {
689 			dmu_buf_rele(db, NULL);
690 		} else {
691 			zp->z_dbuf_held = 1;
692 			VFS_HOLD(zfsvfs->z_vfs);
693 		}
694 
695 
696 		VN_HOLD(ZTOV(zp));
697 		mutex_exit(&zp->z_lock);
698 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
699 		*zpp = zp;
700 		return (0);
701 	}
702 
703 	/*
704 	 * Not found create new znode/vnode
705 	 */
706 	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
707 	ASSERT3U(zp->z_id, ==, obj_num);
708 	zfs_znode_dmu_init(zp);
709 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
710 	*zpp = zp;
711 	return (0);
712 }
713 
714 void
715 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
716 {
717 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
718 	int error;
719 
720 	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
721 	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
722 		error = dmu_object_free(zfsvfs->z_os,
723 		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
724 		ASSERT3U(error, ==, 0);
725 	}
726 	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
727 	ASSERT3U(error, ==, 0);
728 	zp->z_dbuf_held = 0;
729 	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
730 	dmu_buf_rele(zp->z_dbuf, NULL);
731 }
732 
/*
 * Drop one hold on a znode's vnode and, if that was the last hold and
 * the vnode has no cached data, inactivate the znode.
 *
 * A znode marked z_reap is removed from the file system: extended
 * attribute files go straight to zfs_rmnode(), everything else is
 * appended to the file system's delete list and the delete head's
 * condition variable is broadcast.  A znode that is not being reaped
 * simply gives up its bonus buffer hold.  Both of these paths also
 * release a hold on the VFS.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_dbuf_held && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while were trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	/* Lock order: object hold, then z_lock, then v_lock */
	mutex_enter(&zp->z_lock);
	mutex_enter(&vp->v_lock);
	vp->v_count--;
	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
		/*
		 * If the hold count is greater than zero, somebody has
		 * obtained a new reference on this znode while we were
		 * processing it here, so we are done.  If we still have
		 * mapped pages then we are also done, since we don't
		 * want to inactivate the znode until the pages get pushed.
		 *
		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
		 * this seems like it would leave the znode hanging with
		 * no chance to go inactive...
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_reap) {
		/* All locks are dropped before the znode is handed off */
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		/* XATTR files are not put on the delete queue */
		if (zp->z_phys->zp_flags & ZFS_XATTR) {
			zfs_rmnode(zp);
		} else {
			mutex_enter(&zfsvfs->z_delete_head.z_mutex);
			list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
			zfsvfs->z_delete_head.z_znode_count++;
			cv_broadcast(&zfsvfs->z_delete_head.z_cv);
			mutex_exit(&zfsvfs->z_delete_head.z_mutex);
		}
		VFS_RELE(zfsvfs->z_vfs);
		return;
	}
	ASSERT(zp->z_phys);
	ASSERT(zp->z_dbuf_held);

	/* Not being reaped: just give up the bonus buffer hold */
	zp->z_dbuf_held = 0;
	mutex_exit(&zp->z_lock);
	dmu_buf_rele(zp->z_dbuf, NULL);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	VFS_RELE(zfsvfs->z_vfs);
}
798 
799 void
800 zfs_znode_free(znode_t *zp)
801 {
802 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
803 
804 	mutex_enter(&zfsvfs->z_znodes_lock);
805 	list_remove(&zfsvfs->z_all_znodes, zp);
806 	mutex_exit(&zfsvfs->z_znodes_lock);
807 
808 	kmem_cache_free(znode_cache, zp);
809 }
810 
811 void
812 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
813 {
814 	timestruc_t	now;
815 
816 	ASSERT(MUTEX_HELD(&zp->z_lock));
817 
818 	gethrestime(&now);
819 
820 	if (tx) {
821 		dmu_buf_will_dirty(zp->z_dbuf, tx);
822 		zp->z_atime_dirty = 0;
823 		zp->z_seq++;
824 	} else {
825 		zp->z_atime_dirty = 1;
826 	}
827 
828 	if (flag & AT_ATIME)
829 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
830 
831 	if (flag & AT_MTIME)
832 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
833 
834 	if (flag & AT_CTIME)
835 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
836 }
837 
838 /*
839  * Update the requested znode timestamps with the current time.
840  * If we are in a transaction, then go ahead and mark the znode
841  * dirty in the transaction so the timestamps will go to disk.
842  * Otherwise, we will get pushed next time the znode is updated
843  * in a transaction, or when this znode eventually goes inactive.
844  *
845  * Why is this OK?
846  *  1 - Only the ACCESS time is ever updated outside of a transaction.
847  *  2 - Multiple consecutive updates will be collapsed into a single
848  *	znode update by the transaction grouping semantics of the DMU.
849  */
850 void
851 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
852 {
853 	mutex_enter(&zp->z_lock);
854 	zfs_time_stamper_locked(zp, flag, tx);
855 	mutex_exit(&zp->z_lock);
856 }
857 
858 /*
859  * Grow the block size for a file.
860  *
861  *	IN:	zp	- znode of file to free data in.
862  *		size	- requested block size
863  *		tx	- open transaction.
864  *
865  * NOTE: this function assumes that the znode is write locked.
866  */
867 void
868 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
869 {
870 	int		error;
871 	u_longlong_t	dummy;
872 
873 	if (size <= zp->z_blksz)
874 		return;
875 	/*
876 	 * If the file size is already greater than the current blocksize,
877 	 * we will not grow.  If there is more than one block in a file,
878 	 * the blocksize cannot change.
879 	 */
880 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
881 		return;
882 
883 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
884 	    size, 0, tx);
885 	if (error == ENOTSUP)
886 		return;
887 	ASSERT3U(error, ==, 0);
888 
889 	/* What blocksize did we actually get? */
890 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
891 }
892 
893 /*
894  * This is a dummy interface used when pvn_vplist_dirty() should *not*
895  * be calling back into the fs for a putpage().  E.g.: when truncating
896  * a file, the pages being "thrown away* don't need to be written out.
897  */
898 /* ARGSUSED */
899 static int
900 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
901     int flags, cred_t *cr)
902 {
903 	ASSERT(0);
904 	return (0);
905 }
906 
/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free (0 => to EOF).
 *		flag	- current file open mode flags.
 *		log	- if true, update the file's timestamps and
 *			  log the operation as a TX_TRUNCATE record.
 *
 * 	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	rl_t *rl;
	uint64_t end = off + len;
	uint64_t size, new_blksz;
	int error;

	/* Nothing to free in a FIFO */
	if (ZTOV(zp)->v_type == VFIFO)
		return (0);

	/*
	 * If we will change zp_size then lock the whole file,
	 * otherwise just lock the range being freed.
	 */
	if (len == 0 || off + len > zp->z_phys->zp_size) {
		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
	} else {
		rl = zfs_range_lock(zp, off, len, RL_WRITER);
		/* recheck, in case zp_size changed */
		if (off + len > zp->z_phys->zp_size) {
			/* lost race: file size changed, lock whole file */
			zfs_range_unlock(rl);
			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
		}
	}

	/*
	 * Nothing to do if file already at desired length.
	 */
	size = zp->z_phys->zp_size;
	if (len == 0 && size == off) {
		zfs_range_unlock(rl);
		return (0);
	}

	/*
	 * Check for any locks in the region to be freed.
	 */
	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
		uint64_t start = off;
		uint64_t extent = len;

		/*
		 * When extending, check from the current end of file;
		 * when truncating, check from off to the end of file.
		 */
		if (off > size) {
			start = size;
			extent += off - size;
		} else if (len == 0) {
			extent = size - off;
		}
		if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
			zfs_range_unlock(rl);
			return (error);
		}
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	new_blksz = 0;
	if (end > size &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
	} else if (off < size) {
		/*
		 * If len == 0, we are truncating the file.
		 */
		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
	}

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/*
		 * The transaction is aborted and the error is returned
		 * in every case; for ERESTART under TXG_NOWAIT we first
		 * wait on the transaction.
		 */
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
			dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}

	if (new_blksz)
		zfs_grow_blocksize(zp, new_blksz, tx);

	/* Extending or truncating changes the file size */
	if (end > size || len == 0)
		zp->z_phys->zp_size = end;

	if (off < size) {
		objset_t *os = zfsvfs->z_os;
		uint64_t rlen = len;

		/*
		 * Free to the end of the object when truncating, and
		 * never past the old end of file otherwise.
		 */
		if (len == 0)
			rlen = -1;
		else if (end > size)
			rlen = size - off;
		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
	}

	if (log) {
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
	}

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	/*
	 * Clear any mapped pages in the truncated region.  This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */
	rw_enter(&zp->z_map_lock, RW_WRITER);
	if (off < size && vn_has_cached_data(vp)) {
		page_t *pp;
		uint64_t start = off & PAGEMASK;
		int poff = off & PAGEOFFSET;

		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
			/*
			 * We need to zero a partial page.
			 */
			pagezero(pp, poff, PAGESIZE - poff);
			start += PAGESIZE;
			page_unlock(pp);
		}
		/* Invalidate the cached pages from start onward */
		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
		    B_INVAL | B_TRUNC, NULL);
		ASSERT(error == 0);
	}
	rw_exit(&zp->z_map_lock);

	return (0);
}
1062 
1063 void
1064 zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
1065 {
1066 	zfsvfs_t	zfsvfs;
1067 	uint64_t	moid, doid, roid = 0;
1068 	uint64_t	version = ZPL_VERSION;
1069 	int		error;
1070 	znode_t		*rootzp = NULL;
1071 	vnode_t		*vp;
1072 	vattr_t		vattr;
1073 
1074 	/*
1075 	 * First attempt to create master node.
1076 	 */
1077 	/*
1078 	 * In an empty objset, there are no blocks to read and thus
1079 	 * there can be no i/o errors (which we assert below).
1080 	 */
1081 	moid = MASTER_NODE_OBJ;
1082 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1083 	    DMU_OT_NONE, 0, tx);
1084 	ASSERT(error == 0);
1085 
1086 	/*
1087 	 * Set starting attributes.
1088 	 */
1089 
1090 	error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
1091 	ASSERT(error == 0);
1092 
1093 	/*
1094 	 * Create a delete queue.
1095 	 */
1096 	doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);
1097 
1098 	error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
1099 	ASSERT(error == 0);
1100 
1101 	/*
1102 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
1103 	 * to allow zfs_mknode to work.
1104 	 */
1105 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1106 	vattr.va_type = VDIR;
1107 	vattr.va_mode = S_IFDIR|0755;
1108 	vattr.va_uid = 0;
1109 	vattr.va_gid = 3;
1110 
1111 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1112 	rootzp->z_zfsvfs = &zfsvfs;
1113 	rootzp->z_reap = 0;
1114 	rootzp->z_atime_dirty = 0;
1115 	rootzp->z_dbuf_held = 0;
1116 
1117 	vp = ZTOV(rootzp);
1118 	vn_reinit(vp);
1119 	vp->v_type = VDIR;
1120 
1121 	bzero(&zfsvfs, sizeof (zfsvfs_t));
1122 
1123 	zfsvfs.z_os = os;
1124 	zfsvfs.z_assign = TXG_NOWAIT;
1125 	zfsvfs.z_parent = &zfsvfs;
1126 
1127 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1128 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1129 	    offsetof(znode_t, z_link_node));
1130 
1131 	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
1132 	ASSERT3U(rootzp->z_id, ==, roid);
1133 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
1134 	ASSERT(error == 0);
1135 
1136 	ZTOV(rootzp)->v_count = 0;
1137 	kmem_cache_free(znode_cache, rootzp);
1138 }
1139 #endif /* _KERNEL */
1140 
1141 /*
1142  * Given an object number, return its parent object number and whether
1143  * or not the object is an extended attribute directory.
1144  */
1145 static int
1146 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1147 {
1148 	dmu_buf_t *db;
1149 	dmu_object_info_t doi;
1150 	znode_phys_t *zp;
1151 	int error;
1152 
1153 	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1154 		return (error);
1155 
1156 	dmu_object_info_from_db(db, &doi);
1157 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1158 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1159 		dmu_buf_rele(db, FTAG);
1160 		return (EINVAL);
1161 	}
1162 
1163 	zp = db->db_data;
1164 	*pobjp = zp->zp_parent;
1165 	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1166 	    S_ISDIR(zp->zp_mode);
1167 	dmu_buf_rele(db, FTAG);
1168 
1169 	return (0);
1170 }
1171 
1172 int
1173 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1174 {
1175 	char *path = buf + len - 1;
1176 	int error;
1177 
1178 	*path = '\0';
1179 
1180 	for (;;) {
1181 		uint64_t pobj;
1182 		char component[MAXNAMELEN + 2];
1183 		size_t complen;
1184 		int is_xattrdir;
1185 
1186 		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1187 		    &is_xattrdir)) != 0)
1188 			break;
1189 
1190 		if (pobj == obj) {
1191 			if (path[0] != '/')
1192 				*--path = '/';
1193 			break;
1194 		}
1195 
1196 		component[0] = '/';
1197 		if (is_xattrdir) {
1198 			(void) sprintf(component + 1, "<xattrdir>");
1199 		} else {
1200 			error = zap_value_search(osp, pobj, obj, component + 1);
1201 			if (error != 0)
1202 				break;
1203 		}
1204 
1205 		complen = strlen(component);
1206 		path -= complen;
1207 		ASSERT(path >= buf);
1208 		bcopy(component, path, complen);
1209 		obj = pobj;
1210 	}
1211 
1212 	if (error == 0)
1213 		(void) memmove(buf, path, buf + len - path);
1214 	return (error);
1215 }
1216