xref: /illumos-gate/usr/src/uts/common/fs/zfs/zfs_znode.c (revision 622200ad88c6c6382403a01985a94e22484baac6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/time.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/resource.h>
34 #include <sys/mntent.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/file.h>
38 #include <sys/kmem.h>
39 #include <sys/cmn_err.h>
40 #include <sys/errno.h>
41 #include <sys/unistd.h>
42 #include <sys/stat.h>
43 #include <sys/mode.h>
44 #include <sys/atomic.h>
45 #include <vm/pvn.h>
46 #include "fs/fs_subr.h"
47 #include <sys/zfs_dir.h>
48 #include <sys/zfs_acl.h>
49 #include <sys/zfs_ioctl.h>
50 #include <sys/zfs_znode.h>
51 #include <sys/zap.h>
52 #include <sys/dmu.h>
53 #include <sys/fs/zfs.h>
54 
55 struct kmem_cache *znode_cache = NULL;
56 
57 /*ARGSUSED*/
58 static void
59 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
60 {
61 	znode_t *zp = user_ptr;
62 	vnode_t *vp = ZTOV(zp);
63 
64 	mutex_enter(&zp->z_lock);
65 	if (vp->v_count == 0) {
66 		mutex_exit(&zp->z_lock);
67 		vn_invalid(vp);
68 		zfs_znode_free(zp);
69 	} else {
70 		/* signal force unmount that this znode can be freed */
71 		zp->z_dbuf = NULL;
72 		mutex_exit(&zp->z_lock);
73 	}
74 }
75 
76 /*ARGSUSED*/
77 static int
78 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
79 {
80 	znode_t *zp = buf;
81 
82 	zp->z_vnode = vn_alloc(KM_SLEEP);
83 	zp->z_vnode->v_data = (caddr_t)zp;
84 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
85 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
86 	rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
87 	rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
88 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
89 	zp->z_dbuf_held = 0;
90 	zp->z_dirlocks = 0;
91 	return (0);
92 }
93 
94 /*ARGSUSED*/
95 static void
96 zfs_znode_cache_destructor(void *buf, void *cdarg)
97 {
98 	znode_t *zp = buf;
99 
100 	ASSERT(zp->z_dirlocks == 0);
101 	mutex_destroy(&zp->z_lock);
102 	rw_destroy(&zp->z_map_lock);
103 	rw_destroy(&zp->z_grow_lock);
104 	rw_destroy(&zp->z_append_lock);
105 	mutex_destroy(&zp->z_acl_lock);
106 
107 	ASSERT(zp->z_dbuf_held == 0);
108 	ASSERT(ZTOV(zp)->v_count == 0);
109 	vn_free(ZTOV(zp));
110 }
111 
112 void
113 zfs_znode_init(void)
114 {
115 	/*
116 	 * Initialize zcache
117 	 */
118 	ASSERT(znode_cache == NULL);
119 	znode_cache = kmem_cache_create("zfs_znode_cache",
120 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
121 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
122 }
123 
124 void
125 zfs_znode_fini(void)
126 {
127 	/*
128 	 * Cleanup vfs & vnode ops
129 	 */
130 	zfs_remove_op_tables();
131 
132 	/*
133 	 * Cleanup zcache
134 	 */
135 	if (znode_cache)
136 		kmem_cache_destroy(znode_cache);
137 	znode_cache = NULL;
138 }
139 
/*
 * Vnode operation vectors.  They are built by zfs_create_op_tables(),
 * released by zfs_remove_op_tables(), and assigned to vnodes by type
 * in zfs_znode_alloc().
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* regular and special files */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
145 
146 void
147 zfs_remove_op_tables()
148 {
149 	/*
150 	 * Remove vfs ops
151 	 */
152 	ASSERT(zfsfstype);
153 	(void) vfs_freevfsops_by_type(zfsfstype);
154 	zfsfstype = 0;
155 
156 	/*
157 	 * Remove vnode ops
158 	 */
159 	if (zfs_dvnodeops)
160 		vn_freevnodeops(zfs_dvnodeops);
161 	if (zfs_fvnodeops)
162 		vn_freevnodeops(zfs_fvnodeops);
163 	if (zfs_symvnodeops)
164 		vn_freevnodeops(zfs_symvnodeops);
165 	if (zfs_xdvnodeops)
166 		vn_freevnodeops(zfs_xdvnodeops);
167 	if (zfs_evnodeops)
168 		vn_freevnodeops(zfs_evnodeops);
169 
170 	zfs_dvnodeops = NULL;
171 	zfs_fvnodeops = NULL;
172 	zfs_symvnodeops = NULL;
173 	zfs_xdvnodeops = NULL;
174 	zfs_evnodeops = NULL;
175 }
176 
177 extern const fs_operation_def_t zfs_dvnodeops_template[];
178 extern const fs_operation_def_t zfs_fvnodeops_template[];
179 extern const fs_operation_def_t zfs_xdvnodeops_template[];
180 extern const fs_operation_def_t zfs_symvnodeops_template[];
181 extern const fs_operation_def_t zfs_evnodeops_template[];
182 
183 int
184 zfs_create_op_tables()
185 {
186 	int error;
187 
188 	/*
189 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
190 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
191 	 * In this case we just return as the ops vectors are already set up.
192 	 */
193 	if (zfs_dvnodeops)
194 		return (0);
195 
196 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
197 	    &zfs_dvnodeops);
198 	if (error)
199 		return (error);
200 
201 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
202 	    &zfs_fvnodeops);
203 	if (error)
204 		return (error);
205 
206 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
207 	    &zfs_symvnodeops);
208 	if (error)
209 		return (error);
210 
211 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
212 	    &zfs_xdvnodeops);
213 	if (error)
214 		return (error);
215 
216 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
217 	    &zfs_evnodeops);
218 
219 	return (error);
220 }
221 
222 /*
223  * zfs_init_fs - Initialize the zfsvfs struct and the file system
224  *	incore "master" object.  Verify version compatibility.
225  */
226 int
227 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
228 {
229 	extern int zfsfstype;
230 
231 	objset_t	*os = zfsvfs->z_os;
232 	uint64_t	zoid;
233 	uint64_t	version = ZFS_VERSION;
234 	int		i, error;
235 	dmu_object_info_t doi;
236 	dmu_objset_stats_t *stats;
237 
238 	*zpp = NULL;
239 
240 	/*
241 	 * XXX - hack to auto-create the pool root filesystem at
242 	 * the first attempted mount.
243 	 */
244 	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
245 		dmu_tx_t *tx = dmu_tx_create(os);
246 
247 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
248 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
249 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
250 		error = dmu_tx_assign(tx, TXG_WAIT);
251 		ASSERT3U(error, ==, 0);
252 		zfs_create_fs(os, cr, tx);
253 		dmu_tx_commit(tx);
254 	}
255 
256 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1,
257 	    &version);
258 	if (error) {
259 		return (error);
260 	} else if (version != ZFS_VERSION) {
261 		(void) printf("Mismatched versions:  File system "
262 		    "is version %lld on-disk format, which is "
263 		    "incompatible with this software version %lld!",
264 		    (u_longlong_t)version, ZFS_VERSION);
265 		return (ENOTSUP);
266 	}
267 
268 	/*
269 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
270 	 * separates our fsid from any other filesystem types, and a
271 	 * 56-bit objset unique ID.  The objset unique ID is unique to
272 	 * all objsets open on this system, provided by unique_create().
273 	 * The 8-bit fs type must be put in the low bits of fsid[1]
274 	 * because that's where other Solaris filesystems put it.
275 	 */
276 	stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
277 	dmu_objset_stats(os, stats);
278 	ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
279 	zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
280 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
281 	    zfsfstype & 0xFF;
282 	kmem_free(stats, sizeof (dmu_objset_stats_t));
283 	stats = NULL;
284 
285 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid);
286 	if (error)
287 		return (error);
288 	ASSERT(zoid != 0);
289 	zfsvfs->z_root = zoid;
290 
291 	/*
292 	 * Create the per mount vop tables.
293 	 */
294 
295 	/*
296 	 * Initialize zget mutex's
297 	 */
298 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
299 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
300 
301 	error = zfs_zget(zfsvfs, zoid, zpp);
302 	if (error)
303 		return (error);
304 	ASSERT3U((*zpp)->z_id, ==, zoid);
305 
306 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid);
307 	if (error)
308 		return (error);
309 
310 	zfsvfs->z_dqueue = zoid;
311 
312 	/*
313 	 * Initialize delete head structure
314 	 * Thread(s) will be started/stopped via
315 	 * readonly_changed_cb() depending
316 	 * on whether this is rw/ro mount.
317 	 */
318 	list_create(&zfsvfs->z_delete_head.z_znodes,
319 	    sizeof (znode_t), offsetof(znode_t, z_list_node));
320 
321 	return (0);
322 }
323 
324 /*
325  * Construct a new znode/vnode and intialize.
326  *
327  * This does not do a call to dmu_set_user() that is
328  * up to the caller to do, in case you don't want to
329  * return the znode
330  */
331 static znode_t *
332 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
333 {
334 	znode_t	*zp;
335 	vnode_t *vp;
336 
337 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
338 
339 	ASSERT(zp->z_dirlocks == NULL);
340 
341 	zp->z_phys = db->db_data;
342 	zp->z_zfsvfs = zfsvfs;
343 	zp->z_active = 1;
344 	zp->z_reap = 0;
345 	zp->z_atime_dirty = 0;
346 	zp->z_dbuf_held = 0;
347 	zp->z_mapcnt = 0;
348 	zp->z_last_itx = 0;
349 	zp->z_dbuf = db;
350 	zp->z_id = obj_num;
351 	zp->z_blksz = blksz;
352 	zp->z_seq = 0x7A4653;
353 
354 	mutex_enter(&zfsvfs->z_znodes_lock);
355 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
356 	mutex_exit(&zfsvfs->z_znodes_lock);
357 
358 	vp = ZTOV(zp);
359 	vn_reinit(vp);
360 
361 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
362 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
363 
364 	switch (vp->v_type) {
365 	case VDIR:
366 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
367 			vn_setops(vp, zfs_xdvnodeops);
368 			vp->v_flag |= V_XATTRDIR;
369 		} else
370 			vn_setops(vp, zfs_dvnodeops);
371 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
372 		break;
373 	case VBLK:
374 	case VCHR:
375 		vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
376 		/*FALLTHROUGH*/
377 	case VFIFO:
378 	case VSOCK:
379 	case VDOOR:
380 		vn_setops(vp, zfs_fvnodeops);
381 		break;
382 	case VREG:
383 		vp->v_flag |= VMODSORT;
384 		vn_setops(vp, zfs_fvnodeops);
385 		break;
386 	case VLNK:
387 		vn_setops(vp, zfs_symvnodeops);
388 		break;
389 	default:
390 		vn_setops(vp, zfs_evnodeops);
391 		break;
392 	}
393 
394 	return (zp);
395 }
396 
397 static void
398 zfs_znode_dmu_init(znode_t *zp)
399 {
400 	znode_t		*nzp;
401 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
402 	dmu_buf_t	*db = zp->z_dbuf;
403 
404 	mutex_enter(&zp->z_lock);
405 
406 	nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
407 
408 	/*
409 	 * there should be no
410 	 * concurrent zgets on this object.
411 	 */
412 	ASSERT3P(nzp, ==, NULL);
413 
414 	/*
415 	 * Slap on VROOT if we are the root znode
416 	 */
417 	if (zp->z_id == zfsvfs->z_root) {
418 		ZTOV(zp)->v_flag |= VROOT;
419 	}
420 
421 	ASSERT(zp->z_dbuf_held == 0);
422 	zp->z_dbuf_held = 1;
423 	VFS_HOLD(zfsvfs->z_vfs);
424 	mutex_exit(&zp->z_lock);
425 	vn_exists(ZTOV(zp));
426 }
427 
428 /*
429  * Create a new DMU object to hold a zfs znode.
430  *
431  *	IN:	dzp	- parent directory for new znode
432  *		vap	- file attributes for new znode
433  *		tx	- dmu transaction id for zap operations
434  *		cr	- credentials of caller
435  *		flag	- flags:
436  *			  IS_ROOT_NODE	- new object will be root
437  *			  IS_XATTR	- new object is an attribute
438  *			  IS_REPLAY	- intent log replay
439  *
440  *	OUT:	oid	- ID of created object
441  *
442  */
443 void
444 zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
445 	uint_t flag, znode_t **zpp, int bonuslen)
446 {
447 	dmu_buf_t	*dbp;
448 	znode_phys_t	*pzp;
449 	znode_t		*zp;
450 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
451 	timestruc_t	now;
452 	uint64_t	gen;
453 	int		err;
454 
455 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
456 
457 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
458 		*oid = vap->va_nodeid;
459 		flag |= IS_REPLAY;
460 		now = vap->va_ctime;		/* see zfs_replay_create() */
461 		gen = vap->va_nblocks;		/* ditto */
462 	} else {
463 		*oid = 0;
464 		gethrestime(&now);
465 		gen = dmu_tx_get_txg(tx);
466 	}
467 
468 	/*
469 	 * Create a new DMU object.
470 	 */
471 	/*
472 	 * There's currently no mechanism for pre-reading the blocks that will
473 	 * be to needed allocate a new object, so we accept the small chance
474 	 * that there will be an i/o error and we will fail one of the
475 	 * assertions below.
476 	 */
477 	if (vap->va_type == VDIR) {
478 		if (flag & IS_REPLAY) {
479 			err = zap_create_claim(zfsvfs->z_os, *oid,
480 			    DMU_OT_DIRECTORY_CONTENTS,
481 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
482 			ASSERT3U(err, ==, 0);
483 		} else {
484 			*oid = zap_create(zfsvfs->z_os,
485 			    DMU_OT_DIRECTORY_CONTENTS,
486 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
487 		}
488 	} else {
489 		if (flag & IS_REPLAY) {
490 			err = dmu_object_claim(zfsvfs->z_os, *oid,
491 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
492 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
493 			ASSERT3U(err, ==, 0);
494 		} else {
495 			*oid = dmu_object_alloc(zfsvfs->z_os,
496 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
497 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
498 		}
499 	}
500 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
501 	dmu_buf_will_dirty(dbp, tx);
502 
503 	/*
504 	 * Initialize the znode physical data to zero.
505 	 */
506 	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
507 	bzero(dbp->db_data, dbp->db_size);
508 	pzp = dbp->db_data;
509 
510 	/*
511 	 * If this is the root, fix up the half-initialized parent pointer
512 	 * to reference the just-allocated physical data area.
513 	 */
514 	if (flag & IS_ROOT_NODE) {
515 		dzp->z_phys = pzp;
516 		dzp->z_id = *oid;
517 	}
518 
519 	/*
520 	 * If parent is an xattr, so am I.
521 	 */
522 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
523 		flag |= IS_XATTR;
524 
525 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
526 		pzp->zp_rdev = vap->va_rdev;
527 	}
528 
529 	if (vap->va_type == VDIR) {
530 		pzp->zp_size = 2;		/* contents ("." and "..") */
531 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
532 	}
533 
534 	pzp->zp_parent = dzp->z_id;
535 	if (flag & IS_XATTR)
536 		pzp->zp_flags |= ZFS_XATTR;
537 
538 	pzp->zp_gen = gen;
539 
540 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
541 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
542 
543 	if (vap->va_mask & AT_ATIME) {
544 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
545 	} else {
546 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
547 	}
548 
549 	if (vap->va_mask & AT_MTIME) {
550 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
551 	} else {
552 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
553 	}
554 
555 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
556 	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
557 
558 	zfs_perm_init(zp, dzp, flag, vap, tx, cr);
559 
560 	if (zpp) {
561 		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
562 
563 		mutex_enter(hash_mtx);
564 		zfs_znode_dmu_init(zp);
565 		mutex_exit(hash_mtx);
566 
567 		*zpp = zp;
568 	} else {
569 		ZTOV(zp)->v_count = 0;
570 		dmu_buf_rele(dbp, NULL);
571 		zfs_znode_free(zp);
572 	}
573 }
574 
575 int
576 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
577 {
578 	dmu_object_info_t doi;
579 	dmu_buf_t	*db;
580 	znode_t		*zp;
581 	int err;
582 
583 	*zpp = NULL;
584 
585 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
586 
587 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
588 	if (err) {
589 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
590 		return (err);
591 	}
592 
593 	dmu_object_info_from_db(db, &doi);
594 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
595 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
596 		dmu_buf_rele(db, NULL);
597 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
598 		return (EINVAL);
599 	}
600 
601 	ASSERT(db->db_object == obj_num);
602 	ASSERT(db->db_offset == -1);
603 	ASSERT(db->db_data != NULL);
604 
605 	zp = dmu_buf_get_user(db);
606 
607 	if (zp != NULL) {
608 		mutex_enter(&zp->z_lock);
609 
610 		ASSERT3U(zp->z_id, ==, obj_num);
611 		if (zp->z_reap) {
612 			dmu_buf_rele(db, NULL);
613 			mutex_exit(&zp->z_lock);
614 			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
615 			return (ENOENT);
616 		} else if (zp->z_dbuf_held) {
617 			dmu_buf_rele(db, NULL);
618 		} else {
619 			zp->z_dbuf_held = 1;
620 			VFS_HOLD(zfsvfs->z_vfs);
621 		}
622 
623 		if (zp->z_active == 0)
624 			zp->z_active = 1;
625 
626 		VN_HOLD(ZTOV(zp));
627 		mutex_exit(&zp->z_lock);
628 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
629 		*zpp = zp;
630 		return (0);
631 	}
632 
633 	/*
634 	 * Not found create new znode/vnode
635 	 */
636 	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
637 	ASSERT3U(zp->z_id, ==, obj_num);
638 	zfs_znode_dmu_init(zp);
639 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
640 	*zpp = zp;
641 	return (0);
642 }
643 
644 void
645 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
646 {
647 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
648 	int error;
649 
650 	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
651 	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
652 		error = dmu_object_free(zfsvfs->z_os,
653 		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
654 		ASSERT3U(error, ==, 0);
655 	}
656 	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
657 	ASSERT3U(error, ==, 0);
658 	zp->z_dbuf_held = 0;
659 	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
660 	dmu_buf_rele(zp->z_dbuf, NULL);
661 }
662 
663 void
664 zfs_zinactive(znode_t *zp)
665 {
666 	vnode_t	*vp = ZTOV(zp);
667 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
668 	uint64_t z_id = zp->z_id;
669 
670 	ASSERT(zp->z_dbuf_held && zp->z_phys);
671 
672 	/*
673 	 * Don't allow a zfs_zget() while were trying to release this znode
674 	 */
675 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
676 
677 	mutex_enter(&zp->z_lock);
678 	mutex_enter(&vp->v_lock);
679 	vp->v_count--;
680 	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
681 		/*
682 		 * If the hold count is greater than zero, somebody has
683 		 * obtained a new reference on this znode while we were
684 		 * processing it here, so we are done.  If we still have
685 		 * mapped pages then we are also done, since we don't
686 		 * want to inactivate the znode until the pages get pushed.
687 		 *
688 		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
689 		 * this seems like it would leave the znode hanging with
690 		 * no chance to go inactive...
691 		 */
692 		mutex_exit(&vp->v_lock);
693 		mutex_exit(&zp->z_lock);
694 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
695 		return;
696 	}
697 	mutex_exit(&vp->v_lock);
698 	zp->z_active = 0;
699 
700 	/*
701 	 * If this was the last reference to a file with no links,
702 	 * remove the file from the file system.
703 	 */
704 	if (zp->z_reap) {
705 		mutex_exit(&zp->z_lock);
706 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
707 		/* XATTR files are not put on the delete queue */
708 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
709 			zfs_rmnode(zp);
710 		} else {
711 			mutex_enter(&zfsvfs->z_delete_head.z_mutex);
712 			list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
713 			zfsvfs->z_delete_head.z_znode_count++;
714 			cv_broadcast(&zfsvfs->z_delete_head.z_cv);
715 			mutex_exit(&zfsvfs->z_delete_head.z_mutex);
716 		}
717 		VFS_RELE(zfsvfs->z_vfs);
718 		return;
719 	}
720 	ASSERT(zp->z_phys);
721 	ASSERT(zp->z_dbuf_held);
722 
723 	zp->z_dbuf_held = 0;
724 	mutex_exit(&zp->z_lock);
725 	dmu_buf_rele(zp->z_dbuf, NULL);
726 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
727 	VFS_RELE(zfsvfs->z_vfs);
728 }
729 
730 void
731 zfs_znode_free(znode_t *zp)
732 {
733 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
734 
735 	mutex_enter(&zfsvfs->z_znodes_lock);
736 	list_remove(&zfsvfs->z_all_znodes, zp);
737 	mutex_exit(&zfsvfs->z_znodes_lock);
738 
739 	kmem_cache_free(znode_cache, zp);
740 }
741 
742 void
743 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
744 {
745 	timestruc_t	now;
746 
747 	ASSERT(MUTEX_HELD(&zp->z_lock));
748 
749 	gethrestime(&now);
750 
751 	if (tx) {
752 		dmu_buf_will_dirty(zp->z_dbuf, tx);
753 		zp->z_atime_dirty = 0;
754 		zp->z_seq++;
755 	} else {
756 		zp->z_atime_dirty = 1;
757 	}
758 
759 	if (flag & AT_ATIME)
760 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
761 
762 	if (flag & AT_MTIME)
763 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
764 
765 	if (flag & AT_CTIME)
766 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
767 }
768 
769 /*
770  * Update the requested znode timestamps with the current time.
771  * If we are in a transaction, then go ahead and mark the znode
772  * dirty in the transaction so the timestamps will go to disk.
773  * Otherwise, we will get pushed next time the znode is updated
774  * in a transaction, or when this znode eventually goes inactive.
775  *
776  * Why is this OK?
777  *  1 - Only the ACCESS time is ever updated outside of a transaction.
778  *  2 - Multiple consecutive updates will be collapsed into a single
779  *	znode update by the transaction grouping semantics of the DMU.
780  */
781 void
782 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
783 {
784 	mutex_enter(&zp->z_lock);
785 	zfs_time_stamper_locked(zp, flag, tx);
786 	mutex_exit(&zp->z_lock);
787 }
788 
789 /*
790  * Grow the block size for a file.  This may involve migrating data
791  * from the bonus buffer into a data block (when we grow beyond the
792  * bonus buffer data area).
793  *
794  *	IN:	zp	- znode of file to free data in.
795  *		size	- requested block size
796  *		tx	- open transaction.
797  *
798  * 	RETURN:	0 if success
799  *		error code if failure
800  *
801  * NOTE: this function assumes that the znode is write locked.
802  */
803 int
804 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
805 {
806 	int		error;
807 	u_longlong_t	dummy;
808 
809 	ASSERT(rw_write_held(&zp->z_grow_lock));
810 
811 	if (size <= zp->z_blksz)
812 		return (0);
813 	/*
814 	 * If the file size is already greater than the current blocksize,
815 	 * we will not grow.  If there is more than one block in a file,
816 	 * the blocksize cannot change.
817 	 */
818 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
819 		return (0);
820 
821 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
822 	    size, 0, tx);
823 	if (error == ENOTSUP)
824 		return (0);
825 	ASSERT3U(error, ==, 0);
826 
827 	/* What blocksize did we actually get? */
828 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
829 
830 	return (0);
831 }
832 
833 /*
834  * This is a dummy interface used when pvn_vplist_dirty() should *not*
835  * be calling back into the fs for a putpage().  E.g.: when truncating
836  * a file, the pages being "thrown away* don't need to be written out.
837  */
838 /* ARGSUSED */
839 static int
840 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
841     int flags, cred_t *cr)
842 {
843 	ASSERT(0);
844 	return (0);
845 }
846 
847 /*
848  * Free space in a file.  Currently, this function only
849  * supports freeing space at the end of the file.
850  *
851  *	IN:	zp	- znode of file to free data in.
852  *		from	- start of section to free.
853  *		len	- length of section to free (0 => to EOF).
854  *		flag	- current file open mode flags.
855  *		tx	- open transaction.
856  *
857  * 	RETURN:	0 if success
858  *		error code if failure
859  */
860 int
861 zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
862 	cred_t *cr)
863 {
864 	vnode_t *vp = ZTOV(zp);
865 	uint64_t size = zp->z_phys->zp_size;
866 	uint64_t end = from + len;
867 	int have_grow_lock, error;
868 
869 	if (ZTOV(zp)->v_type == VFIFO)
870 		return (0);
871 
872 	have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);
873 
874 	/*
875 	 * Nothing to do if file already at desired length.
876 	 */
877 	if (len == 0 && size == from) {
878 		return (0);
879 	}
880 
881 	/*
882 	 * Check for any locks in the region to be freed.
883 	 */
884 	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
885 		uint64_t	start;
886 
887 		if (size > from)
888 			start = from;
889 		else
890 			start = size;
891 		if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
892 			return (error);
893 	}
894 
895 	if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
896 	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
897 		uint64_t new_blksz;
898 		/*
899 		 * We are growing the file past the current block size.
900 		 */
901 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
902 			ASSERT(!ISP2(zp->z_blksz));
903 			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
904 		} else {
905 			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
906 		}
907 		error = zfs_grow_blocksize(zp, new_blksz, tx);
908 		ASSERT(error == 0);
909 	}
910 	if (end > size || len == 0)
911 		zp->z_phys->zp_size = end;
912 	if (from > size)
913 		return (0);
914 
915 	if (have_grow_lock)
916 		rw_downgrade(&zp->z_grow_lock);
917 	/*
918 	 * Clear any mapped pages in the truncated region.
919 	 */
920 	rw_enter(&zp->z_map_lock, RW_WRITER);
921 	if (vn_has_cached_data(vp)) {
922 		page_t *pp;
923 		uint64_t start = from & PAGEMASK;
924 		int off = from & PAGEOFFSET;
925 
926 		if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
927 			/*
928 			 * We need to zero a partial page.
929 			 */
930 			pagezero(pp, off, PAGESIZE - off);
931 			start += PAGESIZE;
932 			page_unlock(pp);
933 		}
934 		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
935 		    B_INVAL | B_TRUNC, cr);
936 		ASSERT(error == 0);
937 	}
938 	rw_exit(&zp->z_map_lock);
939 
940 	if (!have_grow_lock)
941 		rw_enter(&zp->z_grow_lock, RW_READER);
942 
943 	if (len == 0)
944 		len = -1;
945 	else if (end > size)
946 		len = size - from;
947 	VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os,
948 	    zp->z_id, from, len, tx));
949 
950 	if (!have_grow_lock)
951 		rw_exit(&zp->z_grow_lock);
952 
953 	return (0);
954 }
955 
956 void
957 zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
958 {
959 	zfsvfs_t	zfsvfs;
960 	uint64_t	moid, doid, roid = 0;
961 	uint64_t	version = ZFS_VERSION;
962 	int		error;
963 	znode_t		*rootzp = NULL;
964 	vnode_t		*vp;
965 	vattr_t		vattr;
966 
967 	/*
968 	 * First attempt to create master node.
969 	 */
970 	/*
971 	 * In an empty objset, there are no blocks to read and thus
972 	 * there can be no i/o errors (which we assert below).
973 	 */
974 	moid = MASTER_NODE_OBJ;
975 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
976 	    DMU_OT_NONE, 0, tx);
977 	ASSERT(error == 0);
978 
979 	/*
980 	 * Set starting attributes.
981 	 */
982 
983 	error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
984 	ASSERT(error == 0);
985 
986 	/*
987 	 * Create a delete queue.
988 	 */
989 	doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);
990 
991 	error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
992 	ASSERT(error == 0);
993 
994 	/*
995 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
996 	 * to allow zfs_mknode to work.
997 	 */
998 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
999 	vattr.va_type = VDIR;
1000 	vattr.va_mode = S_IFDIR|0755;
1001 	vattr.va_uid = 0;
1002 	vattr.va_gid = 3;
1003 
1004 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1005 	rootzp->z_zfsvfs = &zfsvfs;
1006 	rootzp->z_active = 1;
1007 	rootzp->z_reap = 0;
1008 	rootzp->z_atime_dirty = 0;
1009 	rootzp->z_dbuf_held = 0;
1010 
1011 	vp = ZTOV(rootzp);
1012 	vn_reinit(vp);
1013 	vp->v_type = VDIR;
1014 
1015 	bzero(&zfsvfs, sizeof (zfsvfs_t));
1016 
1017 	zfsvfs.z_os = os;
1018 	zfsvfs.z_assign = TXG_NOWAIT;
1019 	zfsvfs.z_parent = &zfsvfs;
1020 
1021 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1022 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1023 	    offsetof(znode_t, z_link_node));
1024 
1025 	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
1026 	ASSERT3U(rootzp->z_id, ==, roid);
1027 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
1028 	ASSERT(error == 0);
1029 
1030 	ZTOV(rootzp)->v_count = 0;
1031 	kmem_cache_free(znode_cache, rootzp);
1032 }
1033