xref: /illumos-gate/usr/src/uts/common/fs/zfs/zfs_znode.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24  * Copyright (c) 2014 Integros [integros.com]
25  */
26 
27 /* Portions Copyright 2007 Jeremy Teo */
28 
29 #ifdef _KERNEL
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/mntent.h>
37 #include <sys/mkdev.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/vnode.h>
43 #include <sys/file.h>
44 #include <sys/kmem.h>
45 #include <sys/errno.h>
46 #include <sys/unistd.h>
47 #include <sys/mode.h>
48 #include <sys/atomic.h>
49 #include <vm/pvn.h>
50 #include "fs/fs_subr.h"
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_acl.h>
53 #include <sys/zfs_ioctl.h>
54 #include <sys/zfs_rlock.h>
55 #include <sys/zfs_fuid.h>
56 #include <sys/dnode.h>
57 #include <sys/fs/zfs.h>
58 #include <sys/kidmap.h>
59 #endif /* _KERNEL */
60 
61 #include <sys/dmu.h>
62 #include <sys/dmu_objset.h>
63 #include <sys/dmu_tx.h>
64 #include <sys/refcount.h>
65 #include <sys/stat.h>
66 #include <sys/zap.h>
67 #include <sys/zfs_znode.h>
68 #include <sys/sa.h>
69 #include <sys/zfs_sa.h>
70 #include <sys/zfs_stat.h>
71 
72 #include "zfs_prop.h"
73 #include "zfs_comutil.h"
74 
/*
 * ZNODE_STATS enables the gathering of znode statistics.  It is switched on
 * automatically for DEBUG builds and may also be defined by hand.
 */
#if defined(DEBUG)
#define	ZNODE_STATS
#endif	/* DEBUG */

/*
 * ZNODE_STAT_ADD(stat) increments a counter when statistics are enabled and
 * expands to nothing otherwise.
 */
#if !defined(ZNODE_STATS)
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#else
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#endif	/* !ZNODE_STATS */
88 
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
94 #ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.  zfs_znode_move() takes this lock
 * as writer while it re-validates the znode's z_zfsvfs pointer.
 */
krwlock_t zfsvfs_lock;

/*
 * kmem cache from which every znode_t in this file is allocated.  Created in
 * zfs_znode_init() and destroyed in zfs_znode_fini().
 */
static kmem_cache_t *znode_cache = NULL;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
int zfs_unlink_suspend_progress = 0;
108 
109 /*ARGSUSED*/
110 static void
111 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
112 {
113 	/*
114 	 * We should never drop all dbuf refs without first clearing
115 	 * the eviction callback.
116 	 */
117 	panic("evicting znode %p\n", user_ptr);
118 }
119 
120 /*
121  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
122  * z_rangelock. It will modify the offset and length of the lock to reflect
123  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
124  * called with the rangelock_t's rl_lock held, which avoids races.
125  */
126 static void
127 zfs_rangelock_cb(locked_range_t *new, void *arg)
128 {
129 	znode_t *zp = arg;
130 
131 	/*
132 	 * If in append mode, convert to writer and lock starting at the
133 	 * current end of file.
134 	 */
135 	if (new->lr_type == RL_APPEND) {
136 		new->lr_offset = zp->z_size;
137 		new->lr_type = RL_WRITER;
138 	}
139 
140 	/*
141 	 * If we need to grow the block size then lock the whole file range.
142 	 */
143 	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
144 	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
145 	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
146 		new->lr_offset = 0;
147 		new->lr_length = UINT64_MAX;
148 	}
149 }
150 
151 /*ARGSUSED*/
152 static int
153 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
154 {
155 	znode_t *zp = buf;
156 
157 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
158 
159 	zp->z_vnode = vn_alloc(kmflags);
160 	if (zp->z_vnode == NULL) {
161 		return (-1);
162 	}
163 	ZTOV(zp)->v_data = zp;
164 
165 	list_link_init(&zp->z_link_node);
166 
167 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
168 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
169 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
170 	rw_init(&zp->z_acl_lock, NULL, RW_DEFAULT, NULL);
171 
172 	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
173 
174 	zp->z_dirlocks = NULL;
175 	zp->z_acl_cached = NULL;
176 	zp->z_moved = 0;
177 	return (0);
178 }
179 
180 /*ARGSUSED*/
181 static void
182 zfs_znode_cache_destructor(void *buf, void *arg)
183 {
184 	znode_t *zp = buf;
185 
186 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
187 	ASSERT(ZTOV(zp)->v_data == zp);
188 	vn_free(ZTOV(zp));
189 	ASSERT(!list_link_active(&zp->z_link_node));
190 	mutex_destroy(&zp->z_lock);
191 	rw_destroy(&zp->z_parent_lock);
192 	rw_destroy(&zp->z_name_lock);
193 	rw_destroy(&zp->z_acl_lock);
194 	rangelock_fini(&zp->z_rangelock);
195 
196 	ASSERT(zp->z_dirlocks == NULL);
197 	ASSERT(zp->z_acl_cached == NULL);
198 }
199 
#ifdef	ZNODE_STATS
/*
 * Counters recording why zfs_znode_move() declined to move a znode.  Each
 * field is bumped through ZNODE_STAT_ADD() on the matching bail-out path.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs pointer was not valid */
	uint64_t zms_zfsvfs_recheck1;	/* changed under zfsvfs_lock */
	uint64_t zms_zfsvfs_unmounted;	/* file system was unmounted */
	uint64_t zms_zfsvfs_recheck2;	/* changed under z_znodes_lock */
	uint64_t zms_obj_held;		/* object hold could not be taken */
	uint64_t zms_vnode_locked;	/* v_lock could not be taken */
	uint64_t zms_not_only_dnlc;	/* not referenced solely by DNLC */
} znode_move_stats;
#endif	/* ZNODE_STATS */
211 
212 static void
213 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
214 {
215 	vnode_t *vp;
216 
217 	/* Copy fields. */
218 	nzp->z_zfsvfs = ozp->z_zfsvfs;
219 
220 	/* Swap vnodes. */
221 	vp = nzp->z_vnode;
222 	nzp->z_vnode = ozp->z_vnode;
223 	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
224 	ZTOV(ozp)->v_data = ozp;
225 	ZTOV(nzp)->v_data = nzp;
226 
227 	nzp->z_id = ozp->z_id;
228 	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
229 	nzp->z_unlinked = ozp->z_unlinked;
230 	nzp->z_atime_dirty = ozp->z_atime_dirty;
231 	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
232 	nzp->z_blksz = ozp->z_blksz;
233 	nzp->z_seq = ozp->z_seq;
234 	nzp->z_mapcnt = ozp->z_mapcnt;
235 	nzp->z_gen = ozp->z_gen;
236 	nzp->z_sync_cnt = ozp->z_sync_cnt;
237 	nzp->z_is_sa = ozp->z_is_sa;
238 	nzp->z_sa_hdl = ozp->z_sa_hdl;
239 	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
240 	nzp->z_links = ozp->z_links;
241 	nzp->z_size = ozp->z_size;
242 	nzp->z_pflags = ozp->z_pflags;
243 	nzp->z_uid = ozp->z_uid;
244 	nzp->z_gid = ozp->z_gid;
245 	nzp->z_mode = ozp->z_mode;
246 
247 	/*
248 	 * Since this is just an idle znode and kmem is already dealing with
249 	 * memory pressure, release any cached ACL.
250 	 */
251 	if (ozp->z_acl_cached) {
252 		zfs_acl_free(ozp->z_acl_cached);
253 		ozp->z_acl_cached = NULL;
254 	}
255 
256 	sa_set_userp(nzp->z_sa_hdl, nzp);
257 
258 	/*
259 	 * Invalidate the original znode by clearing fields that provide a
260 	 * pointer back to the znode. Set the low bit of the vfs pointer to
261 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
262 	 * subsequent callback.
263 	 */
264 	ozp->z_sa_hdl = NULL;
265 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
266 
267 	/*
268 	 * Mark the znode.
269 	 */
270 	nzp->z_moved = 1;
271 	ozp->z_moved = (uint8_t)-1;
272 }
273 
274 /*ARGSUSED*/
275 static kmem_cbrc_t
276 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
277 {
278 	znode_t *ozp = buf, *nzp = newbuf;
279 	zfsvfs_t *zfsvfs;
280 	vnode_t *vp;
281 
282 	/*
283 	 * The znode is on the file system's list of known znodes if the vfs
284 	 * pointer is valid. We set the low bit of the vfs pointer when freeing
285 	 * the znode to invalidate it, and the memory patterns written by kmem
286 	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
287 	 * created znode sets the vfs pointer last of all to indicate that the
288 	 * znode is known and in a valid state to be moved by this function.
289 	 */
290 	zfsvfs = ozp->z_zfsvfs;
291 	if (!POINTER_IS_VALID(zfsvfs)) {
292 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
293 		return (KMEM_CBRC_DONT_KNOW);
294 	}
295 
296 	/*
297 	 * Close a small window in which it's possible that the filesystem could
298 	 * be unmounted and freed, and zfsvfs, though valid in the previous
299 	 * statement, could point to unrelated memory by the time we try to
300 	 * prevent the filesystem from being unmounted.
301 	 */
302 	rw_enter(&zfsvfs_lock, RW_WRITER);
303 	if (zfsvfs != ozp->z_zfsvfs) {
304 		rw_exit(&zfsvfs_lock);
305 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
306 		return (KMEM_CBRC_DONT_KNOW);
307 	}
308 
309 	/*
310 	 * If the znode is still valid, then so is the file system. We know that
311 	 * no valid file system can be freed while we hold zfsvfs_lock, so we
312 	 * can safely ensure that the filesystem is not and will not be
313 	 * unmounted. The next statement is equivalent to ZFS_ENTER().
314 	 */
315 	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
316 	if (zfsvfs->z_unmounted) {
317 		ZFS_EXIT(zfsvfs);
318 		rw_exit(&zfsvfs_lock);
319 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
320 		return (KMEM_CBRC_DONT_KNOW);
321 	}
322 	rw_exit(&zfsvfs_lock);
323 
324 	mutex_enter(&zfsvfs->z_znodes_lock);
325 	/*
326 	 * Recheck the vfs pointer in case the znode was removed just before
327 	 * acquiring the lock.
328 	 */
329 	if (zfsvfs != ozp->z_zfsvfs) {
330 		mutex_exit(&zfsvfs->z_znodes_lock);
331 		ZFS_EXIT(zfsvfs);
332 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
333 		return (KMEM_CBRC_DONT_KNOW);
334 	}
335 
336 	/*
337 	 * At this point we know that as long as we hold z_znodes_lock, the
338 	 * znode cannot be freed and fields within the znode can be safely
339 	 * accessed. Now, prevent a race with zfs_zget().
340 	 */
341 	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
342 		mutex_exit(&zfsvfs->z_znodes_lock);
343 		ZFS_EXIT(zfsvfs);
344 		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
345 		return (KMEM_CBRC_LATER);
346 	}
347 
348 	vp = ZTOV(ozp);
349 	if (mutex_tryenter(&vp->v_lock) == 0) {
350 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
351 		mutex_exit(&zfsvfs->z_znodes_lock);
352 		ZFS_EXIT(zfsvfs);
353 		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
354 		return (KMEM_CBRC_LATER);
355 	}
356 
357 	/* Only move znodes that are referenced _only_ by the DNLC. */
358 	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
359 		mutex_exit(&vp->v_lock);
360 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
361 		mutex_exit(&zfsvfs->z_znodes_lock);
362 		ZFS_EXIT(zfsvfs);
363 		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
364 		return (KMEM_CBRC_LATER);
365 	}
366 
367 	/*
368 	 * The znode is known and in a valid state to move. We're holding the
369 	 * locks needed to execute the critical section.
370 	 */
371 	zfs_znode_move_impl(ozp, nzp);
372 	mutex_exit(&vp->v_lock);
373 	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
374 
375 	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
376 	mutex_exit(&zfsvfs->z_znodes_lock);
377 	ZFS_EXIT(zfsvfs);
378 
379 	return (KMEM_CBRC_YES);
380 }
381 
382 void
383 zfs_znode_init(void)
384 {
385 	/*
386 	 * Initialize zcache
387 	 */
388 	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
389 	ASSERT(znode_cache == NULL);
390 	znode_cache = kmem_cache_create("zfs_znode_cache",
391 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
392 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
393 	kmem_cache_set_move(znode_cache, zfs_znode_move);
394 }
395 
396 void
397 zfs_znode_fini(void)
398 {
399 	/*
400 	 * Cleanup vfs & vnode ops
401 	 */
402 	zfs_remove_op_tables();
403 
404 	/*
405 	 * Cleanup zcache
406 	 */
407 	if (znode_cache)
408 		kmem_cache_destroy(znode_cache);
409 	znode_cache = NULL;
410 	rw_destroy(&zfsvfs_lock);
411 }
412 
/*
 * vnode operations vectors.  They are built from their templates by
 * zfs_create_op_tables(), released by zfs_remove_op_tables(), and assigned
 * to vnodes according to type in zfs_znode_alloc().
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* files, devices, fifos, sockets */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
struct vnodeops *zfs_sharevnodeops;	/* files under the shares directory */
419 
420 void
421 zfs_remove_op_tables()
422 {
423 	/*
424 	 * Remove vfs ops
425 	 */
426 	ASSERT(zfsfstype);
427 	(void) vfs_freevfsops_by_type(zfsfstype);
428 	zfsfstype = 0;
429 
430 	/*
431 	 * Remove vnode ops
432 	 */
433 	if (zfs_dvnodeops)
434 		vn_freevnodeops(zfs_dvnodeops);
435 	if (zfs_fvnodeops)
436 		vn_freevnodeops(zfs_fvnodeops);
437 	if (zfs_symvnodeops)
438 		vn_freevnodeops(zfs_symvnodeops);
439 	if (zfs_xdvnodeops)
440 		vn_freevnodeops(zfs_xdvnodeops);
441 	if (zfs_evnodeops)
442 		vn_freevnodeops(zfs_evnodeops);
443 	if (zfs_sharevnodeops)
444 		vn_freevnodeops(zfs_sharevnodeops);
445 
446 	zfs_dvnodeops = NULL;
447 	zfs_fvnodeops = NULL;
448 	zfs_symvnodeops = NULL;
449 	zfs_xdvnodeops = NULL;
450 	zfs_evnodeops = NULL;
451 	zfs_sharevnodeops = NULL;
452 }
453 
454 extern const fs_operation_def_t zfs_dvnodeops_template[];
455 extern const fs_operation_def_t zfs_fvnodeops_template[];
456 extern const fs_operation_def_t zfs_xdvnodeops_template[];
457 extern const fs_operation_def_t zfs_symvnodeops_template[];
458 extern const fs_operation_def_t zfs_evnodeops_template[];
459 extern const fs_operation_def_t zfs_sharevnodeops_template[];
460 
461 int
462 zfs_create_op_tables()
463 {
464 	int error;
465 
466 	/*
467 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
468 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
469 	 * In this case we just return as the ops vectors are already set up.
470 	 */
471 	if (zfs_dvnodeops)
472 		return (0);
473 
474 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
475 	    &zfs_dvnodeops);
476 	if (error)
477 		return (error);
478 
479 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
480 	    &zfs_fvnodeops);
481 	if (error)
482 		return (error);
483 
484 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
485 	    &zfs_symvnodeops);
486 	if (error)
487 		return (error);
488 
489 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
490 	    &zfs_xdvnodeops);
491 	if (error)
492 		return (error);
493 
494 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
495 	    &zfs_evnodeops);
496 	if (error)
497 		return (error);
498 
499 	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
500 	    &zfs_sharevnodeops);
501 
502 	return (error);
503 }
504 
505 int
506 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
507 {
508 	zfs_acl_ids_t acl_ids;
509 	vattr_t vattr;
510 	znode_t *sharezp;
511 	vnode_t *vp;
512 	znode_t *zp;
513 	int error;
514 
515 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
516 	vattr.va_type = VDIR;
517 	vattr.va_mode = S_IFDIR|0555;
518 	vattr.va_uid = crgetuid(kcred);
519 	vattr.va_gid = crgetgid(kcred);
520 
521 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
522 	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
523 	sharezp->z_moved = 0;
524 	sharezp->z_unlinked = 0;
525 	sharezp->z_atime_dirty = 0;
526 	sharezp->z_zfsvfs = zfsvfs;
527 	sharezp->z_is_sa = zfsvfs->z_use_sa;
528 	sharezp->z_pflags = 0;
529 
530 	vp = ZTOV(sharezp);
531 	vn_reinit(vp);
532 	vp->v_type = VDIR;
533 
534 	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
535 	    kcred, NULL, &acl_ids));
536 	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
537 	ASSERT3P(zp, ==, sharezp);
538 	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
539 	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
540 	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
541 	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
542 	zfsvfs->z_shares_dir = sharezp->z_id;
543 
544 	zfs_acl_ids_free(&acl_ids);
545 	ZTOV(sharezp)->v_count = 0;
546 	sa_handle_destroy(sharezp->z_sa_hdl);
547 	kmem_cache_free(znode_cache, sharezp);
548 
549 	return (error);
550 }
551 
/*
 * A few device-number constants that must be available in both 32-bit and
 * 64-bit environments.  Each is defined here only if nothing has provided
 * it already.
 */
#if !defined(NBITSMINOR64)
#define	NBITSMINOR64	32
#endif
#if !defined(MAXMAJ64)
#define	MAXMAJ64	0xffffffffUL
#endif
#if !defined(MAXMIN64)
#define	MAXMIN64	0xffffffffUL
#endif
565 
566 /*
567  * Create special expldev for ZFS private use.
568  * Can't use standard expldev since it doesn't do
569  * what we want.  The standard expldev() takes a
570  * dev32_t in LP64 and expands it to a long dev_t.
571  * We need an interface that takes a dev32_t in ILP32
572  * and expands it to a long dev_t.
573  */
574 static uint64_t
575 zfs_expldev(dev_t dev)
576 {
577 #ifndef _LP64
578 	major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32;
579 	return (((uint64_t)major << NBITSMINOR64) |
580 	    ((minor_t)dev & MAXMIN32));
581 #else
582 	return (dev);
583 #endif
584 }
585 
586 /*
587  * Special cmpldev for ZFS private use.
588  * Can't use standard cmpldev since it takes
589  * a long dev_t and compresses it to dev32_t in
590  * LP64.  We need to do a compaction of a long dev_t
591  * to a dev32_t in ILP32.
592  */
593 dev_t
594 zfs_cmpldev(uint64_t dev)
595 {
596 #ifndef _LP64
597 	minor_t minor = (minor_t)dev & MAXMIN64;
598 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
599 
600 	if (major > MAXMAJ32 || minor > MAXMIN32)
601 		return (NODEV32);
602 
603 	return (((dev32_t)major << NBITSMINOR32) | minor);
604 #else
605 	return (dev);
606 #endif
607 }
608 
609 static void
610 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
611     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
612 {
613 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
614 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
615 
616 	mutex_enter(&zp->z_lock);
617 
618 	ASSERT(zp->z_sa_hdl == NULL);
619 	ASSERT(zp->z_acl_cached == NULL);
620 	if (sa_hdl == NULL) {
621 		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
622 		    SA_HDL_SHARED, &zp->z_sa_hdl));
623 	} else {
624 		zp->z_sa_hdl = sa_hdl;
625 		sa_set_userp(sa_hdl, zp);
626 	}
627 
628 	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
629 
630 	/*
631 	 * Slap on VROOT if we are the root znode
632 	 */
633 	if (zp->z_id == zfsvfs->z_root)
634 		ZTOV(zp)->v_flag |= VROOT;
635 
636 	mutex_exit(&zp->z_lock);
637 	vn_exists(ZTOV(zp));
638 }
639 
640 void
641 zfs_znode_dmu_fini(znode_t *zp)
642 {
643 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
644 	    zp->z_unlinked ||
645 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
646 
647 	sa_handle_destroy(zp->z_sa_hdl);
648 	zp->z_sa_hdl = NULL;
649 }
650 
651 /*
652  * Construct a new znode/vnode and intialize.
653  *
654  * This does not do a call to dmu_set_user() that is
655  * up to the caller to do, in case you don't want to
656  * return the znode
657  */
658 static znode_t *
659 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
660     dmu_object_type_t obj_type, sa_handle_t *hdl)
661 {
662 	znode_t	*zp;
663 	vnode_t *vp;
664 	uint64_t mode;
665 	uint64_t parent;
666 	uint64_t projid = ZFS_DEFAULT_PROJID;
667 	sa_bulk_attr_t bulk[11];
668 	int count = 0;
669 
670 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
671 
672 	ASSERT(zp->z_dirlocks == NULL);
673 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
674 	zp->z_moved = 0;
675 
676 	/*
677 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
678 	 * the zfs_znode_move() callback.
679 	 */
680 	zp->z_sa_hdl = NULL;
681 	zp->z_unlinked = 0;
682 	zp->z_atime_dirty = 0;
683 	zp->z_mapcnt = 0;
684 	zp->z_id = db->db_object;
685 	zp->z_blksz = blksz;
686 	zp->z_seq = 0x7A4653;
687 	zp->z_sync_cnt = 0;
688 
689 	vp = ZTOV(zp);
690 	vn_reinit(vp);
691 
692 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
693 
694 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
695 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
696 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
697 	    &zp->z_size, 8);
698 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
699 	    &zp->z_links, 8);
700 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
701 	    &zp->z_pflags, 8);
702 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
703 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
704 	    &zp->z_atime, 16);
705 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
706 	    &zp->z_uid, 8);
707 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
708 	    &zp->z_gid, 8);
709 
710 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
711 	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
712 	    (zp->z_pflags & ZFS_PROJID) &&
713 	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
714 		if (hdl == NULL)
715 			sa_handle_destroy(zp->z_sa_hdl);
716 		kmem_cache_free(znode_cache, zp);
717 		return (NULL);
718 	}
719 
720 	zp->z_projid = projid;
721 	zp->z_mode = mode;
722 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
723 
724 	vp->v_type = IFTOVT((mode_t)mode);
725 
726 	switch (vp->v_type) {
727 	case VDIR:
728 		if (zp->z_pflags & ZFS_XATTR) {
729 			vn_setops(vp, zfs_xdvnodeops);
730 			vp->v_flag |= V_XATTRDIR;
731 		} else {
732 			vn_setops(vp, zfs_dvnodeops);
733 		}
734 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
735 		break;
736 	case VBLK:
737 	case VCHR:
738 		{
739 			uint64_t rdev;
740 			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
741 			    &rdev, sizeof (rdev)) == 0);
742 
743 			vp->v_rdev = zfs_cmpldev(rdev);
744 		}
745 		/*FALLTHROUGH*/
746 	case VFIFO:
747 	case VSOCK:
748 	case VDOOR:
749 		vn_setops(vp, zfs_fvnodeops);
750 		break;
751 	case VREG:
752 		vp->v_flag |= VMODSORT;
753 		if (parent == zfsvfs->z_shares_dir) {
754 			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
755 			vn_setops(vp, zfs_sharevnodeops);
756 		} else {
757 			vn_setops(vp, zfs_fvnodeops);
758 		}
759 		break;
760 	case VLNK:
761 		vn_setops(vp, zfs_symvnodeops);
762 		break;
763 	default:
764 		vn_setops(vp, zfs_evnodeops);
765 		break;
766 	}
767 
768 	mutex_enter(&zfsvfs->z_znodes_lock);
769 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
770 	membar_producer();
771 	/*
772 	 * Everything else must be valid before assigning z_zfsvfs makes the
773 	 * znode eligible for zfs_znode_move().
774 	 */
775 	zp->z_zfsvfs = zfsvfs;
776 	mutex_exit(&zfsvfs->z_znodes_lock);
777 
778 	VFS_HOLD(zfsvfs->z_vfs);
779 	return (zp);
780 }
781 
/*
 * Placeholder values that zfs_mknode() supplies for the XATTR, PAD and
 * ZNODE_ACL attributes when it lays out a DMU_OT_ZNODE object.  They are
 * never assigned in the code visible here, so they keep their static
 * zero initialization.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
785 /*
786  * Create a new DMU object to hold a zfs znode.
787  *
788  *	IN:	dzp	- parent directory for new znode
789  *		vap	- file attributes for new znode
790  *		tx	- dmu transaction id for zap operations
791  *		cr	- credentials of caller
792  *		flag	- flags:
793  *			  IS_ROOT_NODE	- new object will be root
794  *			  IS_XATTR	- new object is an attribute
795  *		bonuslen - length of bonus buffer
796  *		setaclp  - File/Dir initial ACL
797  *		fuidp	 - Tracks fuid allocation.
798  *
799  *	OUT:	zpp	- allocated znode
800  *
801  */
802 void
803 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
804     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
805 {
806 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
807 	uint64_t	mode, size, links, parent, pflags;
808 	uint64_t	dzp_pflags = 0;
809 	uint64_t	projid = ZFS_DEFAULT_PROJID;
810 	uint64_t	rdev = 0;
811 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
812 	dmu_buf_t	*db;
813 	timestruc_t	now;
814 	uint64_t	gen, obj;
815 	int		bonuslen;
816 	int		dnodesize;
817 	sa_handle_t	*sa_hdl;
818 	dmu_object_type_t obj_type;
819 	sa_bulk_attr_t	*sa_attrs;
820 	int		cnt = 0;
821 	zfs_acl_locator_cb_t locate = { 0 };
822 
823 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
824 
825 	if (zfsvfs->z_replay) {
826 		obj = vap->va_nodeid;
827 		now = vap->va_ctime;		/* see zfs_replay_create() */
828 		gen = vap->va_nblocks;		/* ditto */
829 		dnodesize = vap->va_fsid;	/* ditto */
830 	} else {
831 		obj = 0;
832 		gethrestime(&now);
833 		gen = dmu_tx_get_txg(tx);
834 		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
835 	}
836 
837 	if (dnodesize == 0)
838 		dnodesize = DNODE_MIN_SIZE;
839 
840 	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
841 	bonuslen = (obj_type == DMU_OT_SA) ?
842 	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
843 
844 	/*
845 	 * Create a new DMU object.
846 	 */
847 	/*
848 	 * There's currently no mechanism for pre-reading the blocks that will
849 	 * be needed to allocate a new object, so we accept the small chance
850 	 * that there will be an i/o error and we will fail one of the
851 	 * assertions below.
852 	 */
853 	if (vap->va_type == VDIR) {
854 		if (zfsvfs->z_replay) {
855 			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
856 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
857 			    obj_type, bonuslen, dnodesize, tx));
858 		} else {
859 			obj = zap_create_norm_dnsize(zfsvfs->z_os,
860 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
861 			    obj_type, bonuslen, dnodesize, tx);
862 		}
863 	} else {
864 		if (zfsvfs->z_replay) {
865 			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
866 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
867 			    obj_type, bonuslen, dnodesize, tx));
868 		} else {
869 			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
870 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
871 			    obj_type, bonuslen, dnodesize, tx);
872 		}
873 	}
874 
875 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
876 	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
877 
878 	/*
879 	 * If this is the root, fix up the half-initialized parent pointer
880 	 * to reference the just-allocated physical data area.
881 	 */
882 	if (flag & IS_ROOT_NODE) {
883 		dzp->z_id = obj;
884 	}
885 
886 	/*
887 	 * If parent is an xattr, so am I.
888 	 */
889 	if (dzp->z_pflags & ZFS_XATTR) {
890 		flag |= IS_XATTR;
891 	}
892 
893 	if (zfsvfs->z_use_fuids)
894 		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
895 	else
896 		pflags = 0;
897 
898 	if (vap->va_type == VDIR) {
899 		size = 2;		/* contents ("." and "..") */
900 		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
901 	} else {
902 		size = links = 0;
903 	}
904 
905 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
906 		rdev = zfs_expldev(vap->va_rdev);
907 	}
908 
909 	parent = dzp->z_id;
910 	mode = acl_ids->z_mode;
911 	if (flag & IS_XATTR)
912 		pflags |= ZFS_XATTR;
913 
914 	if (vap->va_type == VREG || vap->va_type == VDIR) {
915 		/*
916 		 * With ZFS_PROJID flag, we can easily know whether there is
917 		 * project ID stored on disk or not. See zfs_space_delta_cb().
918 		 */
919 		if (obj_type != DMU_OT_ZNODE &&
920 		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
921 			pflags |= ZFS_PROJID;
922 
923 		/*
924 		 * Inherit project ID from parent if required.
925 		 */
926 		projid = zfs_inherit_projid(dzp);
927 		if (dzp->z_pflags & ZFS_PROJINHERIT)
928 			pflags |= ZFS_PROJINHERIT;
929 	}
930 
931 	/*
932 	 * No execs denied will be deterimed when zfs_mode_compute() is called.
933 	 */
934 	pflags |= acl_ids->z_aclp->z_hints &
935 	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
936 	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
937 
938 	ZFS_TIME_ENCODE(&now, crtime);
939 	ZFS_TIME_ENCODE(&now, ctime);
940 
941 	if (vap->va_mask & AT_ATIME) {
942 		ZFS_TIME_ENCODE(&vap->va_atime, atime);
943 	} else {
944 		ZFS_TIME_ENCODE(&now, atime);
945 	}
946 
947 	if (vap->va_mask & AT_MTIME) {
948 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
949 	} else {
950 		ZFS_TIME_ENCODE(&now, mtime);
951 	}
952 
953 	/* Now add in all of the "SA" attributes */
954 	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
955 	    &sa_hdl));
956 
957 	/*
958 	 * Setup the array of attributes to be replaced/set on the new file
959 	 *
960 	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
961 	 * in the old znode_phys_t format.  Don't change this ordering
962 	 */
963 	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
964 
965 	if (obj_type == DMU_OT_ZNODE) {
966 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
967 		    NULL, &atime, 16);
968 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
969 		    NULL, &mtime, 16);
970 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
971 		    NULL, &ctime, 16);
972 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
973 		    NULL, &crtime, 16);
974 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
975 		    NULL, &gen, 8);
976 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
977 		    NULL, &mode, 8);
978 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
979 		    NULL, &size, 8);
980 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
981 		    NULL, &parent, 8);
982 	} else {
983 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
984 		    NULL, &mode, 8);
985 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
986 		    NULL, &size, 8);
987 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
988 		    NULL, &gen, 8);
989 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
990 		    NULL, &acl_ids->z_fuid, 8);
991 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
992 		    NULL, &acl_ids->z_fgid, 8);
993 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
994 		    NULL, &parent, 8);
995 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
996 		    NULL, &pflags, 8);
997 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
998 		    NULL, &atime, 16);
999 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
1000 		    NULL, &mtime, 16);
1001 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
1002 		    NULL, &ctime, 16);
1003 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
1004 		    NULL, &crtime, 16);
1005 	}
1006 
1007 	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
1008 
1009 	if (obj_type == DMU_OT_ZNODE) {
1010 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
1011 		    &empty_xattr, 8);
1012 	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
1013 	    pflags & ZFS_PROJID) {
1014 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
1015 		    NULL, &projid, 8);
1016 	}
1017 	if (obj_type == DMU_OT_ZNODE ||
1018 	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
1019 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
1020 		    NULL, &rdev, 8);
1021 
1022 	}
1023 	if (obj_type == DMU_OT_ZNODE) {
1024 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
1025 		    NULL, &pflags, 8);
1026 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
1027 		    &acl_ids->z_fuid, 8);
1028 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
1029 		    &acl_ids->z_fgid, 8);
1030 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
1031 		    sizeof (uint64_t) * 4);
1032 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
1033 		    &acl_phys, sizeof (zfs_acl_phys_t));
1034 	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
1035 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
1036 		    &acl_ids->z_aclp->z_acl_count, 8);
1037 		locate.cb_aclp = acl_ids->z_aclp;
1038 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
1039 		    zfs_acl_data_locator, &locate,
1040 		    acl_ids->z_aclp->z_acl_bytes);
1041 		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
1042 		    acl_ids->z_fuid, acl_ids->z_fgid);
1043 	}
1044 
1045 	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
1046 
1047 	if (!(flag & IS_ROOT_NODE)) {
1048 		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
1049 		ASSERT(*zpp != NULL);
1050 	} else {
1051 		/*
1052 		 * If we are creating the root node, the "parent" we
1053 		 * passed in is the znode for the root.
1054 		 */
1055 		*zpp = dzp;
1056 
1057 		(*zpp)->z_sa_hdl = sa_hdl;
1058 	}
1059 
1060 	(*zpp)->z_pflags = pflags;
1061 	(*zpp)->z_mode = mode;
1062 	(*zpp)->z_dnodesize = dnodesize;
1063 	(*zpp)->z_projid = projid;
1064 
1065 	if (vap->va_mask & AT_XVATTR)
1066 		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
1067 
1068 	if (obj_type == DMU_OT_ZNODE ||
1069 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
1070 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
1071 	}
1072 	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
1073 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1074 }
1075 
1076 /*
1077  * Update in-core attributes.  It is assumed the caller will be doing an
1078  * sa_bulk_update to push the changes out.
1079  */
1080 void
1081 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1082 {
1083 	xoptattr_t *xoap;
1084 
1085 	xoap = xva_getxoptattr(xvap);
1086 	ASSERT(xoap);
1087 
1088 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1089 		uint64_t times[2];
1090 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1091 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
1092 		    &times, sizeof (times), tx);
1093 		XVA_SET_RTN(xvap, XAT_CREATETIME);
1094 	}
1095 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1096 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1097 		    zp->z_pflags, tx);
1098 		XVA_SET_RTN(xvap, XAT_READONLY);
1099 	}
1100 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1101 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1102 		    zp->z_pflags, tx);
1103 		XVA_SET_RTN(xvap, XAT_HIDDEN);
1104 	}
1105 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1106 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1107 		    zp->z_pflags, tx);
1108 		XVA_SET_RTN(xvap, XAT_SYSTEM);
1109 	}
1110 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1111 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1112 		    zp->z_pflags, tx);
1113 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1114 	}
1115 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1116 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1117 		    zp->z_pflags, tx);
1118 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1119 	}
1120 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1121 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1122 		    zp->z_pflags, tx);
1123 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1124 	}
1125 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1126 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1127 		    zp->z_pflags, tx);
1128 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1129 	}
1130 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1131 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1132 		    zp->z_pflags, tx);
1133 		XVA_SET_RTN(xvap, XAT_NODUMP);
1134 	}
1135 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1136 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1137 		    zp->z_pflags, tx);
1138 		XVA_SET_RTN(xvap, XAT_OPAQUE);
1139 	}
1140 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1141 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1142 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1143 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1144 	}
1145 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1146 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1147 		    zp->z_pflags, tx);
1148 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1149 	}
1150 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1151 		zfs_sa_set_scanstamp(zp, xvap, tx);
1152 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1153 	}
1154 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1155 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1156 		    zp->z_pflags, tx);
1157 		XVA_SET_RTN(xvap, XAT_REPARSE);
1158 	}
1159 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1160 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1161 		    zp->z_pflags, tx);
1162 		XVA_SET_RTN(xvap, XAT_OFFLINE);
1163 	}
1164 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1165 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1166 		    zp->z_pflags, tx);
1167 		XVA_SET_RTN(xvap, XAT_SPARSE);
1168 	}
1169 	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1170 		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1171 		    zp->z_pflags, tx);
1172 		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1173 	}
1174 }
1175 
1176 int
1177 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1178 {
1179 	dmu_object_info_t doi;
1180 	dmu_buf_t	*db;
1181 	znode_t		*zp;
1182 	int err;
1183 	sa_handle_t	*hdl;
1184 
1185 	*zpp = NULL;
1186 
1187 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1188 
1189 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1190 	if (err) {
1191 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1192 		return (err);
1193 	}
1194 
1195 	dmu_object_info_from_db(db, &doi);
1196 	if (doi.doi_bonus_type != DMU_OT_SA &&
1197 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1198 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1199 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1200 		sa_buf_rele(db, NULL);
1201 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1202 		return (SET_ERROR(EINVAL));
1203 	}
1204 
1205 	hdl = dmu_buf_get_user(db);
1206 	if (hdl != NULL) {
1207 		zp  = sa_get_userdata(hdl);
1208 
1209 
1210 		/*
1211 		 * Since "SA" does immediate eviction we
1212 		 * should never find a sa handle that doesn't
1213 		 * know about the znode.
1214 		 */
1215 
1216 		ASSERT3P(zp, !=, NULL);
1217 
1218 		mutex_enter(&zp->z_lock);
1219 		ASSERT3U(zp->z_id, ==, obj_num);
1220 		if (zp->z_unlinked) {
1221 			err = SET_ERROR(ENOENT);
1222 		} else {
1223 			VN_HOLD(ZTOV(zp));
1224 			*zpp = zp;
1225 			err = 0;
1226 		}
1227 		mutex_exit(&zp->z_lock);
1228 		sa_buf_rele(db, NULL);
1229 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1230 		return (err);
1231 	}
1232 
1233 	/*
1234 	 * Not found create new znode/vnode
1235 	 * but only if file exists.
1236 	 *
1237 	 * There is a small window where zfs_vget() could
1238 	 * find this object while a file create is still in
1239 	 * progress.  This is checked for in zfs_znode_alloc()
1240 	 *
1241 	 * if zfs_znode_alloc() fails it will drop the hold on the
1242 	 * bonus buffer.
1243 	 */
1244 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1245 	    doi.doi_bonus_type, NULL);
1246 	if (zp == NULL) {
1247 		err = SET_ERROR(ENOENT);
1248 	} else {
1249 		if (zp->z_links == 0)
1250 			zp->z_unlinked = B_TRUE;
1251 		*zpp = zp;
1252 	}
1253 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1254 	return (err);
1255 }
1256 
1257 int
1258 zfs_rezget(znode_t *zp)
1259 {
1260 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1261 	dmu_object_info_t doi;
1262 	dmu_buf_t *db;
1263 	uint64_t obj_num = zp->z_id;
1264 	uint64_t mode;
1265 	sa_bulk_attr_t bulk[10];
1266 	int err;
1267 	int count = 0;
1268 	uint64_t gen;
1269 	uint64_t projid = ZFS_DEFAULT_PROJID;
1270 
1271 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1272 
1273 	rw_enter(&zp->z_acl_lock, RW_WRITER);
1274 	if (zp->z_acl_cached) {
1275 		zfs_acl_free(zp->z_acl_cached);
1276 		zp->z_acl_cached = NULL;
1277 	}
1278 	rw_exit(&zp->z_acl_lock);
1279 
1280 	ASSERT(zp->z_sa_hdl == NULL);
1281 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1282 	if (err) {
1283 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1284 		return (err);
1285 	}
1286 
1287 	dmu_object_info_from_db(db, &doi);
1288 	if (doi.doi_bonus_type != DMU_OT_SA &&
1289 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1290 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1291 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1292 		sa_buf_rele(db, NULL);
1293 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1294 		return (SET_ERROR(EINVAL));
1295 	}
1296 
1297 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1298 
1299 	/* reload cached values */
1300 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1301 	    &gen, sizeof (gen));
1302 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1303 	    &zp->z_size, sizeof (zp->z_size));
1304 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1305 	    &zp->z_links, sizeof (zp->z_links));
1306 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1307 	    &zp->z_pflags, sizeof (zp->z_pflags));
1308 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1309 	    &zp->z_atime, sizeof (zp->z_atime));
1310 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1311 	    &zp->z_uid, sizeof (zp->z_uid));
1312 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1313 	    &zp->z_gid, sizeof (zp->z_gid));
1314 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1315 	    &mode, sizeof (mode));
1316 
1317 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1318 		zfs_znode_dmu_fini(zp);
1319 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1320 		return (SET_ERROR(EIO));
1321 	}
1322 
1323 	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1324 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1325 		    &projid, 8);
1326 		if (err != 0 && err != ENOENT) {
1327 			zfs_znode_dmu_fini(zp);
1328 			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1329 			return (SET_ERROR(err));
1330 		}
1331 	}
1332 
1333 	zp->z_projid = projid;
1334 	zp->z_mode = mode;
1335 
1336 	if (gen != zp->z_gen) {
1337 		zfs_znode_dmu_fini(zp);
1338 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1339 		return (SET_ERROR(EIO));
1340 	}
1341 
1342 	zp->z_blksz = doi.doi_data_block_size;
1343 
1344 	/*
1345 	 * If the file has zero links, then it has been unlinked on the send
1346 	 * side and it must be in the received unlinked set.
1347 	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1348 	 * stale data and to prevent automatical removal of the file in
1349 	 * zfs_zinactive().  The file will be removed either when it is removed
1350 	 * on the send side and the next incremental stream is received or
1351 	 * when the unlinked set gets processed.
1352 	 */
1353 	zp->z_unlinked = (zp->z_links == 0);
1354 	if (zp->z_unlinked)
1355 		zfs_znode_dmu_fini(zp);
1356 
1357 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1358 
1359 	return (0);
1360 }
1361 
1362 void
1363 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1364 {
1365 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1366 	objset_t *os = zfsvfs->z_os;
1367 	uint64_t obj = zp->z_id;
1368 	uint64_t acl_obj = zfs_external_acl(zp);
1369 
1370 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1371 	if (acl_obj) {
1372 		VERIFY(!zp->z_is_sa);
1373 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1374 	}
1375 	VERIFY(0 == dmu_object_free(os, obj, tx));
1376 	zfs_znode_dmu_fini(zp);
1377 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1378 	zfs_znode_free(zp);
1379 }
1380 
1381 void
1382 zfs_zinactive(znode_t *zp)
1383 {
1384 	vnode_t	*vp = ZTOV(zp);
1385 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1386 	uint64_t z_id = zp->z_id;
1387 
1388 	ASSERT(zp->z_sa_hdl);
1389 
1390 	/*
1391 	 * Don't allow a zfs_zget() while were trying to release this znode
1392 	 */
1393 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1394 
1395 	mutex_enter(&zp->z_lock);
1396 	mutex_enter(&vp->v_lock);
1397 	VN_RELE_LOCKED(vp);
1398 	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
1399 		/*
1400 		 * If the hold count is greater than zero, somebody has
1401 		 * obtained a new reference on this znode while we were
1402 		 * processing it here, so we are done.  If we still have
1403 		 * mapped pages then we are also done, since we don't
1404 		 * want to inactivate the znode until the pages get pushed.
1405 		 *
1406 		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
1407 		 * this seems like it would leave the znode hanging with
1408 		 * no chance to go inactive...
1409 		 */
1410 		mutex_exit(&vp->v_lock);
1411 		mutex_exit(&zp->z_lock);
1412 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1413 		return;
1414 	}
1415 	mutex_exit(&vp->v_lock);
1416 
1417 	/*
1418 	 * If this was the last reference to a file with no links, remove
1419 	 * the file from the file system unless the file system is mounted
1420 	 * read-only.  That can happen, for example, if the file system was
1421 	 * originally read-write, the file was opened, then unlinked and
1422 	 * the file system was made read-only before the file was finally
1423 	 * closed.  The file will remain in the unlinked set.
1424 	 */
1425 	if (zp->z_unlinked) {
1426 		ASSERT(!zfsvfs->z_issnap);
1427 		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0 &&
1428 		    !zfs_unlink_suspend_progress) {
1429 			mutex_exit(&zp->z_lock);
1430 			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1431 			zfs_rmnode(zp);
1432 			return;
1433 		}
1434 	}
1435 
1436 	mutex_exit(&zp->z_lock);
1437 	zfs_znode_dmu_fini(zp);
1438 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1439 	zfs_znode_free(zp);
1440 }
1441 
1442 void
1443 zfs_znode_free(znode_t *zp)
1444 {
1445 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1446 
1447 	vn_invalid(ZTOV(zp));
1448 
1449 	ASSERT(ZTOV(zp)->v_count == 0);
1450 
1451 	mutex_enter(&zfsvfs->z_znodes_lock);
1452 	POINTER_INVALIDATE(&zp->z_zfsvfs);
1453 	list_remove(&zfsvfs->z_all_znodes, zp);
1454 	mutex_exit(&zfsvfs->z_znodes_lock);
1455 
1456 	if (zp->z_acl_cached) {
1457 		zfs_acl_free(zp->z_acl_cached);
1458 		zp->z_acl_cached = NULL;
1459 	}
1460 
1461 	kmem_cache_free(znode_cache, zp);
1462 
1463 	VFS_RELE(zfsvfs->z_vfs);
1464 }
1465 
1466 void
1467 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1468     uint64_t ctime[2], boolean_t have_tx)
1469 {
1470 	timestruc_t	now;
1471 
1472 	gethrestime(&now);
1473 
1474 	if (have_tx) {	/* will sa_bulk_update happen really soon? */
1475 		zp->z_atime_dirty = 0;
1476 		zp->z_seq++;
1477 	} else {
1478 		zp->z_atime_dirty = 1;
1479 	}
1480 
1481 	if (flag & AT_ATIME) {
1482 		ZFS_TIME_ENCODE(&now, zp->z_atime);
1483 	}
1484 
1485 	if (flag & AT_MTIME) {
1486 		ZFS_TIME_ENCODE(&now, mtime);
1487 		if (zp->z_zfsvfs->z_use_fuids) {
1488 			zp->z_pflags |= (ZFS_ARCHIVE |
1489 			    ZFS_AV_MODIFIED);
1490 		}
1491 	}
1492 
1493 	if (flag & AT_CTIME) {
1494 		ZFS_TIME_ENCODE(&now, ctime);
1495 		if (zp->z_zfsvfs->z_use_fuids)
1496 			zp->z_pflags |= ZFS_ARCHIVE;
1497 	}
1498 }
1499 
1500 /*
1501  * Grow the block size for a file.
1502  *
1503  *	IN:	zp	- znode of file to free data in.
1504  *		size	- requested block size
1505  *		tx	- open transaction.
1506  *
1507  * NOTE: this function assumes that the znode is write locked.
1508  */
1509 void
1510 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1511 {
1512 	int		error;
1513 	u_longlong_t	dummy;
1514 
1515 	if (size <= zp->z_blksz)
1516 		return;
1517 	/*
1518 	 * If the file size is already greater than the current blocksize,
1519 	 * we will not grow.  If there is more than one block in a file,
1520 	 * the blocksize cannot change.
1521 	 */
1522 	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1523 		return;
1524 
1525 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1526 	    size, 0, tx);
1527 
1528 	if (error == ENOTSUP)
1529 		return;
1530 	ASSERT0(error);
1531 
1532 	/* What blocksize did we actually get? */
1533 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1534 }
1535 
1536 /*
1537  * This is a dummy interface used when pvn_vplist_dirty() should *not*
1538  * be calling back into the fs for a putpage().  E.g.: when truncating
1539  * a file, the pages being "thrown away* don't need to be written out.
1540  */
1541 /* ARGSUSED */
1542 static int
1543 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1544     int flags, cred_t *cr)
1545 {
1546 	ASSERT(0);
1547 	return (0);
1548 }
1549 
1550 /*
1551  * Increase the file length
1552  *
1553  *	IN:	zp	- znode of file to free data in.
1554  *		end	- new end-of-file
1555  *
1556  *	RETURN:	0 on success, error code on failure
1557  */
1558 static int
1559 zfs_extend(znode_t *zp, uint64_t end)
1560 {
1561 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1562 	dmu_tx_t *tx;
1563 	locked_range_t *lr;
1564 	uint64_t newblksz;
1565 	int error;
1566 
1567 	/*
1568 	 * We will change zp_size, lock the whole file.
1569 	 */
1570 	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1571 
1572 	/*
1573 	 * Nothing to do if file already at desired length.
1574 	 */
1575 	if (end <= zp->z_size) {
1576 		rangelock_exit(lr);
1577 		return (0);
1578 	}
1579 	tx = dmu_tx_create(zfsvfs->z_os);
1580 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1581 	zfs_sa_upgrade_txholds(tx, zp);
1582 	if (end > zp->z_blksz &&
1583 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1584 		/*
1585 		 * We are growing the file past the current block size.
1586 		 */
1587 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1588 			/*
1589 			 * File's blocksize is already larger than the
1590 			 * "recordsize" property.  Only let it grow to
1591 			 * the next power of 2.
1592 			 */
1593 			ASSERT(!ISP2(zp->z_blksz));
1594 			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1595 		} else {
1596 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1597 		}
1598 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1599 	} else {
1600 		newblksz = 0;
1601 	}
1602 
1603 	error = dmu_tx_assign(tx, TXG_WAIT);
1604 	if (error) {
1605 		dmu_tx_abort(tx);
1606 		rangelock_exit(lr);
1607 		return (error);
1608 	}
1609 
1610 	if (newblksz)
1611 		zfs_grow_blocksize(zp, newblksz, tx);
1612 
1613 	zp->z_size = end;
1614 
1615 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
1616 	    &zp->z_size, sizeof (zp->z_size), tx));
1617 
1618 	rangelock_exit(lr);
1619 
1620 	dmu_tx_commit(tx);
1621 
1622 	return (0);
1623 }
1624 
1625 /*
1626  * Free space in a file.
1627  *
1628  *	IN:	zp	- znode of file to free data in.
1629  *		off	- start of section to free.
1630  *		len	- length of section to free.
1631  *
1632  *	RETURN:	0 on success, error code on failure
1633  */
1634 static int
1635 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1636 {
1637 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1638 	locked_range_t *lr;
1639 	int error;
1640 
1641 	/*
1642 	 * Lock the range being freed.
1643 	 */
1644 	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1645 
1646 	/*
1647 	 * Nothing to do if file already at desired length.
1648 	 */
1649 	if (off >= zp->z_size) {
1650 		rangelock_exit(lr);
1651 		return (0);
1652 	}
1653 
1654 	if (off + len > zp->z_size)
1655 		len = zp->z_size - off;
1656 
1657 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1658 
1659 	rangelock_exit(lr);
1660 
1661 	return (error);
1662 }
1663 
1664 /*
1665  * Truncate a file
1666  *
1667  *	IN:	zp	- znode of file to free data in.
1668  *		end	- new end-of-file.
1669  *
1670  *	RETURN:	0 on success, error code on failure
1671  */
1672 static int
1673 zfs_trunc(znode_t *zp, uint64_t end)
1674 {
1675 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1676 	vnode_t *vp = ZTOV(zp);
1677 	dmu_tx_t *tx;
1678 	locked_range_t *lr;
1679 	int error;
1680 	sa_bulk_attr_t bulk[2];
1681 	int count = 0;
1682 
1683 	/*
1684 	 * We will change zp_size, lock the whole file.
1685 	 */
1686 	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1687 
1688 	/*
1689 	 * Nothing to do if file already at desired length.
1690 	 */
1691 	if (end >= zp->z_size) {
1692 		rangelock_exit(lr);
1693 		return (0);
1694 	}
1695 
1696 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1697 	    DMU_OBJECT_END);
1698 	if (error) {
1699 		rangelock_exit(lr);
1700 		return (error);
1701 	}
1702 	tx = dmu_tx_create(zfsvfs->z_os);
1703 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1704 	zfs_sa_upgrade_txholds(tx, zp);
1705 	dmu_tx_mark_netfree(tx);
1706 	error = dmu_tx_assign(tx, TXG_WAIT);
1707 	if (error) {
1708 		dmu_tx_abort(tx);
1709 		rangelock_exit(lr);
1710 		return (error);
1711 	}
1712 
1713 	zp->z_size = end;
1714 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1715 	    NULL, &zp->z_size, sizeof (zp->z_size));
1716 
1717 	if (end == 0) {
1718 		zp->z_pflags &= ~ZFS_SPARSE;
1719 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1720 		    NULL, &zp->z_pflags, 8);
1721 	}
1722 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1723 
1724 	dmu_tx_commit(tx);
1725 
1726 	/*
1727 	 * Clear any mapped pages in the truncated region.  This has to
1728 	 * happen outside of the transaction to avoid the possibility of
1729 	 * a deadlock with someone trying to push a page that we are
1730 	 * about to invalidate.
1731 	 */
1732 	if (vn_has_cached_data(vp)) {
1733 		page_t *pp;
1734 		uint64_t start = end & PAGEMASK;
1735 		int poff = end & PAGEOFFSET;
1736 
1737 		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1738 			/*
1739 			 * We need to zero a partial page.
1740 			 */
1741 			pagezero(pp, poff, PAGESIZE - poff);
1742 			start += PAGESIZE;
1743 			page_unlock(pp);
1744 		}
1745 		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1746 		    B_INVAL | B_TRUNC, NULL);
1747 		ASSERT(error == 0);
1748 	}
1749 
1750 	rangelock_exit(lr);
1751 
1752 	return (0);
1753 }
1754 
1755 /*
1756  * Free space in a file
1757  *
1758  *	IN:	zp	- znode of file to free data in.
1759  *		off	- start of range
1760  *		len	- end of range (0 => EOF)
1761  *		flag	- current file open mode flags.
1762  *		log	- TRUE if this action should be logged
1763  *
1764  *	RETURN:	0 on success, error code on failure
1765  */
1766 int
1767 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1768 {
1769 	vnode_t *vp = ZTOV(zp);
1770 	dmu_tx_t *tx;
1771 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1772 	zilog_t *zilog = zfsvfs->z_log;
1773 	uint64_t mode;
1774 	uint64_t mtime[2], ctime[2];
1775 	sa_bulk_attr_t bulk[3];
1776 	int count = 0;
1777 	int error;
1778 
1779 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1780 	    sizeof (mode))) != 0)
1781 		return (error);
1782 
1783 	if (off > zp->z_size) {
1784 		error =  zfs_extend(zp, off+len);
1785 		if (error == 0 && log)
1786 			goto log;
1787 		else
1788 			return (error);
1789 	}
1790 
1791 	/*
1792 	 * Check for any locks in the region to be freed.
1793 	 */
1794 
1795 	if (MANDLOCK(vp, (mode_t)mode)) {
1796 		uint64_t length = (len ? len : zp->z_size - off);
1797 		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1798 			return (error);
1799 	}
1800 
1801 	if (len == 0) {
1802 		error = zfs_trunc(zp, off);
1803 	} else {
1804 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1805 		    off + len > zp->z_size)
1806 			error = zfs_extend(zp, off+len);
1807 	}
1808 	if (error || !log)
1809 		return (error);
1810 log:
1811 	tx = dmu_tx_create(zfsvfs->z_os);
1812 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1813 	zfs_sa_upgrade_txholds(tx, zp);
1814 	error = dmu_tx_assign(tx, TXG_WAIT);
1815 	if (error) {
1816 		dmu_tx_abort(tx);
1817 		return (error);
1818 	}
1819 
1820 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1821 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1822 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1823 	    NULL, &zp->z_pflags, 8);
1824 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
1825 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1826 	ASSERT(error == 0);
1827 
1828 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1829 
1830 	dmu_tx_commit(tx);
1831 	return (0);
1832 }
1833 
1834 void
1835 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1836 {
1837 	uint64_t	moid, obj, sa_obj, version;
1838 	uint64_t	sense = ZFS_CASE_SENSITIVE;
1839 	uint64_t	norm = 0;
1840 	nvpair_t	*elem;
1841 	int		error;
1842 	int		i;
1843 	znode_t		*rootzp = NULL;
1844 	zfsvfs_t	*zfsvfs;
1845 	vnode_t		*vp;
1846 	vattr_t		vattr;
1847 	znode_t		*zp;
1848 	zfs_acl_ids_t	acl_ids;
1849 
1850 	/*
1851 	 * First attempt to create master node.
1852 	 */
1853 	/*
1854 	 * In an empty objset, there are no blocks to read and thus
1855 	 * there can be no i/o errors (which we assert below).
1856 	 */
1857 	moid = MASTER_NODE_OBJ;
1858 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1859 	    DMU_OT_NONE, 0, tx);
1860 	ASSERT(error == 0);
1861 
1862 	/*
1863 	 * Set starting attributes.
1864 	 */
1865 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1866 	elem = NULL;
1867 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1868 		/* For the moment we expect all zpl props to be uint64_ts */
1869 		uint64_t val;
1870 		char *name;
1871 
1872 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1873 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
1874 		name = nvpair_name(elem);
1875 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1876 			if (val < version)
1877 				version = val;
1878 		} else {
1879 			error = zap_update(os, moid, name, 8, 1, &val, tx);
1880 		}
1881 		ASSERT(error == 0);
1882 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1883 			norm = val;
1884 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1885 			sense = val;
1886 	}
1887 	ASSERT(version != 0);
1888 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1889 
1890 	/*
1891 	 * Create zap object used for SA attribute registration
1892 	 */
1893 
1894 	if (version >= ZPL_VERSION_SA) {
1895 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1896 		    DMU_OT_NONE, 0, tx);
1897 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1898 		ASSERT(error == 0);
1899 	} else {
1900 		sa_obj = 0;
1901 	}
1902 	/*
1903 	 * Create a delete queue.
1904 	 */
1905 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1906 
1907 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1908 	ASSERT(error == 0);
1909 
1910 	/*
1911 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
1912 	 * to allow zfs_mknode to work.
1913 	 */
1914 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1915 	vattr.va_type = VDIR;
1916 	vattr.va_mode = S_IFDIR|0755;
1917 	vattr.va_uid = crgetuid(cr);
1918 	vattr.va_gid = crgetgid(cr);
1919 
1920 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1921 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1922 	rootzp->z_moved = 0;
1923 	rootzp->z_unlinked = 0;
1924 	rootzp->z_atime_dirty = 0;
1925 	rootzp->z_is_sa = USE_SA(version, os);
1926 	rootzp->z_pflags = 0;
1927 
1928 	vp = ZTOV(rootzp);
1929 	vn_reinit(vp);
1930 	vp->v_type = VDIR;
1931 
1932 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1933 	zfsvfs->z_os = os;
1934 	zfsvfs->z_parent = zfsvfs;
1935 	zfsvfs->z_version = version;
1936 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1937 	zfsvfs->z_use_sa = USE_SA(version, os);
1938 	zfsvfs->z_norm = norm;
1939 
1940 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1941 	    &zfsvfs->z_attr_table);
1942 
1943 	ASSERT(error == 0);
1944 
1945 	/*
1946 	 * Fold case on file systems that are always or sometimes case
1947 	 * insensitive.
1948 	 */
1949 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1950 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1951 
1952 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1953 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1954 	    offsetof(znode_t, z_link_node));
1955 
1956 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1957 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1958 
1959 	rootzp->z_zfsvfs = zfsvfs;
1960 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1961 	    cr, NULL, &acl_ids));
1962 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1963 	ASSERT3P(zp, ==, rootzp);
1964 	ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
1965 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1966 	ASSERT(error == 0);
1967 	zfs_acl_ids_free(&acl_ids);
1968 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1969 
1970 	ZTOV(rootzp)->v_count = 0;
1971 	sa_handle_destroy(rootzp->z_sa_hdl);
1972 	kmem_cache_free(znode_cache, rootzp);
1973 
1974 	/*
1975 	 * Create shares directory
1976 	 */
1977 
1978 	error = zfs_create_share_dir(zfsvfs, tx);
1979 
1980 	ASSERT(error == 0);
1981 
1982 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1983 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1984 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1985 }
1986 
1987 #endif /* _KERNEL */
1988 
1989 static int
1990 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1991 {
1992 	uint64_t sa_obj = 0;
1993 	int error;
1994 
1995 	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1996 	if (error != 0 && error != ENOENT)
1997 		return (error);
1998 
1999 	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2000 	return (error);
2001 }
2002 
2003 static int
2004 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
2005     dmu_buf_t **db, void *tag)
2006 {
2007 	dmu_object_info_t doi;
2008 	int error;
2009 
2010 	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
2011 		return (error);
2012 
2013 	dmu_object_info_from_db(*db, &doi);
2014 	if ((doi.doi_bonus_type != DMU_OT_SA &&
2015 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
2016 	    doi.doi_bonus_type == DMU_OT_ZNODE &&
2017 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
2018 		sa_buf_rele(*db, tag);
2019 		return (SET_ERROR(ENOTSUP));
2020 	}
2021 
2022 	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2023 	if (error != 0) {
2024 		sa_buf_rele(*db, tag);
2025 		return (error);
2026 	}
2027 
2028 	return (0);
2029 }
2030 
2031 void
2032 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
2033 {
2034 	sa_handle_destroy(hdl);
2035 	sa_buf_rele(db, tag);
2036 }
2037 
2038 /*
2039  * Given an object number, return its parent object number and whether
2040  * or not the object is an extended attribute directory.
2041  */
2042 static int
2043 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2044     uint64_t *pobjp, int *is_xattrdir)
2045 {
2046 	uint64_t parent;
2047 	uint64_t pflags;
2048 	uint64_t mode;
2049 	uint64_t parent_mode;
2050 	sa_bulk_attr_t bulk[3];
2051 	sa_handle_t *sa_hdl;
2052 	dmu_buf_t *sa_db;
2053 	int count = 0;
2054 	int error;
2055 
2056 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2057 	    &parent, sizeof (parent));
2058 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2059 	    &pflags, sizeof (pflags));
2060 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2061 	    &mode, sizeof (mode));
2062 
2063 	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2064 		return (error);
2065 
2066 	/*
2067 	 * When a link is removed its parent pointer is not changed and will
2068 	 * be invalid.  There are two cases where a link is removed but the
2069 	 * file stays around, when it goes to the delete queue and when there
2070 	 * are additional links.
2071 	 */
2072 	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2073 	if (error != 0)
2074 		return (error);
2075 
2076 	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2077 	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2078 	if (error != 0)
2079 		return (error);
2080 
2081 	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2082 
2083 	/*
2084 	 * Extended attributes can be applied to files, directories, etc.
2085 	 * Otherwise the parent must be a directory.
2086 	 */
2087 	if (!*is_xattrdir && !S_ISDIR(parent_mode))
2088 		return (SET_ERROR(EINVAL));
2089 
2090 	*pobjp = parent;
2091 
2092 	return (0);
2093 }
2094 
2095 /*
2096  * Given an object number, return some zpl level statistics
2097  */
2098 static int
2099 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2100     zfs_stat_t *sb)
2101 {
2102 	sa_bulk_attr_t bulk[4];
2103 	int count = 0;
2104 
2105 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2106 	    &sb->zs_mode, sizeof (sb->zs_mode));
2107 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2108 	    &sb->zs_gen, sizeof (sb->zs_gen));
2109 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2110 	    &sb->zs_links, sizeof (sb->zs_links));
2111 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2112 	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2113 
2114 	return (sa_bulk_lookup(hdl, bulk, count));
2115 }
2116 
2117 static int
2118 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2119     sa_attr_type_t *sa_table, char *buf, int len)
2120 {
2121 	sa_handle_t *sa_hdl;
2122 	sa_handle_t *prevhdl = NULL;
2123 	dmu_buf_t *prevdb = NULL;
2124 	dmu_buf_t *sa_db = NULL;
2125 	char *path = buf + len - 1;
2126 	int error;
2127 
2128 	*path = '\0';
2129 	sa_hdl = hdl;
2130 
2131 	uint64_t deleteq_obj;
2132 	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2133 	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2134 	error = zap_lookup_int(osp, deleteq_obj, obj);
2135 	if (error == 0) {
2136 		return (ESTALE);
2137 	} else if (error != ENOENT) {
2138 		return (error);
2139 	}
2140 	error = 0;
2141 
2142 	for (;;) {
2143 		uint64_t pobj;
2144 		char component[MAXNAMELEN + 2];
2145 		size_t complen;
2146 		int is_xattrdir;
2147 
2148 		if (prevdb)
2149 			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2150 
2151 		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2152 		    &is_xattrdir)) != 0)
2153 			break;
2154 
2155 		if (pobj == obj) {
2156 			if (path[0] != '/')
2157 				*--path = '/';
2158 			break;
2159 		}
2160 
2161 		component[0] = '/';
2162 		if (is_xattrdir) {
2163 			(void) sprintf(component + 1, "<xattrdir>");
2164 		} else {
2165 			error = zap_value_search(osp, pobj, obj,
2166 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2167 			if (error != 0)
2168 				break;
2169 		}
2170 
2171 		complen = strlen(component);
2172 		path -= complen;
2173 		ASSERT(path >= buf);
2174 		bcopy(component, path, complen);
2175 		obj = pobj;
2176 
2177 		if (sa_hdl != hdl) {
2178 			prevhdl = sa_hdl;
2179 			prevdb = sa_db;
2180 		}
2181 		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2182 		if (error != 0) {
2183 			sa_hdl = prevhdl;
2184 			sa_db = prevdb;
2185 			break;
2186 		}
2187 	}
2188 
2189 	if (sa_hdl != NULL && sa_hdl != hdl) {
2190 		ASSERT(sa_db != NULL);
2191 		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2192 	}
2193 
2194 	if (error == 0)
2195 		(void) memmove(buf, path, buf + len - path);
2196 
2197 	return (error);
2198 }
2199 
2200 int
2201 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2202 {
2203 	sa_attr_type_t *sa_table;
2204 	sa_handle_t *hdl;
2205 	dmu_buf_t *db;
2206 	int error;
2207 
2208 	error = zfs_sa_setup(osp, &sa_table);
2209 	if (error != 0)
2210 		return (error);
2211 
2212 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2213 	if (error != 0)
2214 		return (error);
2215 
2216 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2217 
2218 	zfs_release_sa_handle(hdl, db, FTAG);
2219 	return (error);
2220 }
2221 
2222 int
2223 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2224     char *buf, int len)
2225 {
2226 	char *path = buf + len - 1;
2227 	sa_attr_type_t *sa_table;
2228 	sa_handle_t *hdl;
2229 	dmu_buf_t *db;
2230 	int error;
2231 
2232 	*path = '\0';
2233 
2234 	error = zfs_sa_setup(osp, &sa_table);
2235 	if (error != 0)
2236 		return (error);
2237 
2238 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2239 	if (error != 0)
2240 		return (error);
2241 
2242 	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2243 	if (error != 0) {
2244 		zfs_release_sa_handle(hdl, db, FTAG);
2245 		return (error);
2246 	}
2247 
2248 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2249 
2250 	zfs_release_sa_handle(hdl, db, FTAG);
2251 	return (error);
2252 }
2253