xref: /illumos-gate/usr/src/uts/common/fs/zfs/zfs_acl.c (revision 201ceb75ab95f9bf1f42ea1dc9ab363b43ba47cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013 by Delphix. All rights reserved.
24  * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
25  * Copyright 2019-2023 RackTop Systems, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/time.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/resource.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/sid.h>
37 #include <sys/file.h>
38 #include <sys/stat.h>
39 #include <sys/kmem.h>
40 #include <sys/cmn_err.h>
41 #include <sys/errno.h>
42 #include <sys/unistd.h>
43 #include <sys/sdt.h>
44 #include <sys/fs/zfs.h>
45 #include <sys/mode.h>
46 #include <sys/policy.h>
47 #include <sys/zfs_znode.h>
48 #include <sys/zfs_fuid.h>
49 #include <sys/zfs_acl.h>
50 #include <sys/zfs_dir.h>
51 #include <sys/zfs_vfsops.h>
52 #include <sys/dmu.h>
53 #include <sys/dnode.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include "fs/fs_subr.h"
57 #include <acl/acl_common.h>
58 
/* Shorthand for the two basic access-control ACE types. */
#define	ALLOW	ACE_ACCESS_ALLOWED_ACE_TYPE
#define	DENY	ACE_ACCESS_DENIED_ACE_TYPE
/*
 * Bounds of the ACE type range accepted by the default case of
 * zfs_acl_valid_ace_type().
 */
#define	MAX_ACE_TYPE	ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
#define	MIN_ACE_TYPE	ALLOW

/*
 * Value of (flags & ACE_TYPE_FLAGS) that this file compares against to
 * recognize the owning-group entry.
 */
#define	OWNING_GROUP		(ACE_GROUP|ACE_IDENTIFIER_GROUP)
/*
 * Access-mask bit groups.  NOTE(review): their consumers are not in the
 * part of the file reviewed here; presumably they are used when building
 * and adjusting ACLs further down -- confirm before changing any of them.
 */
#define	EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
#define	EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
#define	OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)

#define	ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
    ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
    ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
    ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)

/* WRITE_MASK is the union of the data-write and attribute-write groups. */
#define	WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
#define	WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
    ACE_DELETE|ACE_DELETE_CHILD)
#define	WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)

/* Note: OGE_CLEAR and OKAY_MASK_BITS currently expand to the same bit set. */
#define	OGE_CLEAR	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)

#define	OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)

/* Every inheritance-related ACE flag. */
#define	ALL_INHERIT	(ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
    ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)

#define	RESTRICTED_CLEAR	(ACE_WRITE_ACL|ACE_WRITE_OWNER)

/* ACL-wide flag groups; ZFS_ACL_WIDE_FLAGS is a superset of the V4 group. */
#define	V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
    ZFS_ACL_PROTECTED)

#define	ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
    ZFS_ACL_OBJ_ACE)

/* Execute bits for user, group and other; tested by zfs_mode_compute(). */
#define	ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
100 
/*
 * Caller-owned iteration state for zfs_acl_next_ace().  The caller fills in
 * zai_aclp; zfs_acl_next_ace() sets the remaining fields when it is called
 * with a NULL start pointer and advances them on every call.
 */
typedef struct zfs_acl_iter {
	zfs_acl_t	*zai_aclp;	/* The ACL we're iterating */
	zfs_acl_node_t	*zai_curr_node;	/* current node iterator is in */
	void		*zai_next_ace;	/* pointer to next ACE */
	int		zai_ace_idx;	/* ace iterator positioned on */
} zfs_acl_iter_t;
107 
108 static uint16_t
109 zfs_ace_v0_get_type(void *acep)
110 {
111 	return (((zfs_oldace_t *)acep)->z_type);
112 }
113 
114 static uint16_t
115 zfs_ace_v0_get_flags(void *acep)
116 {
117 	return (((zfs_oldace_t *)acep)->z_flags);
118 }
119 
120 static uint32_t
121 zfs_ace_v0_get_mask(void *acep)
122 {
123 	return (((zfs_oldace_t *)acep)->z_access_mask);
124 }
125 
126 static uint64_t
127 zfs_ace_v0_get_who(void *acep)
128 {
129 	return (((zfs_oldace_t *)acep)->z_fuid);
130 }
131 
132 static void
133 zfs_ace_v0_set_type(void *acep, uint16_t type)
134 {
135 	((zfs_oldace_t *)acep)->z_type = type;
136 }
137 
138 static void
139 zfs_ace_v0_set_flags(void *acep, uint16_t flags)
140 {
141 	((zfs_oldace_t *)acep)->z_flags = flags;
142 }
143 
144 static void
145 zfs_ace_v0_set_mask(void *acep, uint32_t mask)
146 {
147 	((zfs_oldace_t *)acep)->z_access_mask = mask;
148 }
149 
150 static void
151 zfs_ace_v0_set_who(void *acep, uint64_t who)
152 {
153 	((zfs_oldace_t *)acep)->z_fuid = who;
154 }
155 
156 /*ARGSUSED*/
157 static size_t
158 zfs_ace_v0_size(void *acep)
159 {
160 	return (sizeof (zfs_oldace_t));
161 }
162 
163 static size_t
164 zfs_ace_v0_abstract_size(void)
165 {
166 	return (sizeof (zfs_oldace_t));
167 }
168 
169 static int
170 zfs_ace_v0_mask_off(void)
171 {
172 	return (offsetof(zfs_oldace_t, z_access_mask));
173 }
174 
/*
 * Old-format entries carry no extra payload: *datap is set to NULL and the
 * reported payload length is 0.
 */
/*ARGSUSED*/
static int
zfs_ace_v0_data(void *acep, void **datap)
{
	int payload_len = 0;

	*datap = NULL;
	return (payload_len);
}
182 
/*
 * Operations vector for old-format (zfs_oldace_t) ACLs.  zfs_acl_alloc()
 * copies it into z_ops for any ACL version other than ZFS_ACL_VERSION_FUID.
 * The initializer is positional, so the order below must match the member
 * order of acl_ops_t.
 */
static acl_ops_t zfs_acl_v0_ops = {
	zfs_ace_v0_get_mask,		/* read access mask */
	zfs_ace_v0_set_mask,		/* write access mask */
	zfs_ace_v0_get_flags,		/* read flags */
	zfs_ace_v0_set_flags,		/* write flags */
	zfs_ace_v0_get_type,		/* read ACE type */
	zfs_ace_v0_set_type,		/* write ACE type */
	zfs_ace_v0_get_who,		/* read who (z_fuid) */
	zfs_ace_v0_set_who,		/* write who (z_fuid) */
	zfs_ace_v0_size,		/* size of a given entry */
	zfs_ace_v0_abstract_size,	/* size of an abstract entry */
	zfs_ace_v0_mask_off,		/* offset of the access mask */
	zfs_ace_v0_data			/* extra payload, always none */
};
197 
198 static uint16_t
199 zfs_ace_fuid_get_type(void *acep)
200 {
201 	return (((zfs_ace_hdr_t *)acep)->z_type);
202 }
203 
204 static uint16_t
205 zfs_ace_fuid_get_flags(void *acep)
206 {
207 	return (((zfs_ace_hdr_t *)acep)->z_flags);
208 }
209 
210 static uint32_t
211 zfs_ace_fuid_get_mask(void *acep)
212 {
213 	return (((zfs_ace_hdr_t *)acep)->z_access_mask);
214 }
215 
216 static uint64_t
217 zfs_ace_fuid_get_who(void *args)
218 {
219 	uint16_t entry_type;
220 	zfs_ace_t *acep = args;
221 
222 	entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
223 
224 	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
225 	    entry_type == ACE_EVERYONE)
226 		return (-1);
227 	return (((zfs_ace_t *)acep)->z_fuid);
228 }
229 
230 static void
231 zfs_ace_fuid_set_type(void *acep, uint16_t type)
232 {
233 	((zfs_ace_hdr_t *)acep)->z_type = type;
234 }
235 
236 static void
237 zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
238 {
239 	((zfs_ace_hdr_t *)acep)->z_flags = flags;
240 }
241 
242 static void
243 zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
244 {
245 	((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
246 }
247 
248 static void
249 zfs_ace_fuid_set_who(void *arg, uint64_t who)
250 {
251 	zfs_ace_t *acep = arg;
252 
253 	uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
254 
255 	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
256 	    entry_type == ACE_EVERYONE)
257 		return;
258 	acep->z_fuid = who;
259 }
260 
261 static size_t
262 zfs_ace_fuid_size(void *acep)
263 {
264 	zfs_ace_hdr_t *zacep = acep;
265 	uint16_t entry_type;
266 
267 	switch (zacep->z_type) {
268 	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
269 	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
270 	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
271 	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
272 		return (sizeof (zfs_object_ace_t));
273 	case ALLOW:
274 	case DENY:
275 		entry_type =
276 		    (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
277 		if (entry_type == ACE_OWNER ||
278 		    entry_type == OWNING_GROUP ||
279 		    entry_type == ACE_EVERYONE)
280 			return (sizeof (zfs_ace_hdr_t));
281 		/*FALLTHROUGH*/
282 	default:
283 		return (sizeof (zfs_ace_t));
284 	}
285 }
286 
287 static size_t
288 zfs_ace_fuid_abstract_size(void)
289 {
290 	return (sizeof (zfs_ace_hdr_t));
291 }
292 
293 static int
294 zfs_ace_fuid_mask_off(void)
295 {
296 	return (offsetof(zfs_ace_hdr_t, z_access_mask));
297 }
298 
299 static int
300 zfs_ace_fuid_data(void *acep, void **datap)
301 {
302 	zfs_ace_t *zacep = acep;
303 	zfs_object_ace_t *zobjp;
304 
305 	switch (zacep->z_hdr.z_type) {
306 	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
307 	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
308 	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
309 	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
310 		zobjp = acep;
311 		*datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
312 		return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
313 	default:
314 		*datap = NULL;
315 		return (0);
316 	}
317 }
318 
/*
 * Operations vector for FUID-format ACLs.  zfs_acl_alloc() copies it into
 * z_ops for ZFS_ACL_VERSION_FUID ACLs and zfs_acl_xform() switches an ACL to
 * it during conversion.  The initializer is positional, so the order below
 * must match the member order of acl_ops_t.
 */
static acl_ops_t zfs_acl_fuid_ops = {
	zfs_ace_fuid_get_mask,		/* read access mask */
	zfs_ace_fuid_set_mask,		/* write access mask */
	zfs_ace_fuid_get_flags,		/* read flags */
	zfs_ace_fuid_set_flags,		/* write flags */
	zfs_ace_fuid_get_type,		/* read ACE type */
	zfs_ace_fuid_set_type,		/* write ACE type */
	zfs_ace_fuid_get_who,		/* read who (FUID) */
	zfs_ace_fuid_set_who,		/* write who (FUID) */
	zfs_ace_fuid_size,		/* size of a given entry */
	zfs_ace_fuid_abstract_size,	/* size of an abstract entry */
	zfs_ace_fuid_mask_off,		/* offset of the access mask */
	zfs_ace_fuid_data		/* extra payload of object ACEs */
};
333 
/*
 * The following three functions are provided for compatibility with
 * older ZPL versions in order to determine if the file used to have
 * an external ACL and what version of ACL previously existed on the
 * file.  Would really be nice to not need this, sigh.
 */
340 uint64_t
341 zfs_external_acl(znode_t *zp)
342 {
343 	zfs_acl_phys_t acl_phys;
344 	int error;
345 
346 	if (zp->z_is_sa)
347 		return (0);
348 
349 	/*
350 	 * Need to deal with a potential
351 	 * race where zfs_sa_upgrade could cause
352 	 * z_isa_sa to change.
353 	 *
354 	 * If the lookup fails then the state of z_is_sa should have
355 	 * changed.
356 	 */
357 
358 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
359 	    &acl_phys, sizeof (acl_phys))) == 0)
360 		return (acl_phys.z_acl_extern_obj);
361 	else {
362 		/*
363 		 * after upgrade the SA_ZPL_ZNODE_ACL should have been
364 		 * removed
365 		 */
366 		VERIFY(zp->z_is_sa && error == ENOENT);
367 		return (0);
368 	}
369 }
370 
371 /*
372  * Determine size of ACL in bytes
373  *
374  * This is more complicated than it should be since we have to deal
375  * with old external ACLs.
376  */
377 static int
378 zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
379     zfs_acl_phys_t *aclphys)
380 {
381 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
382 	uint64_t acl_count;
383 	int size;
384 	int error;
385 
386 	ASSERT(RW_ISWRITER(&zp->z_acl_lock));
387 	if (zp->z_is_sa) {
388 		if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
389 		    &size)) != 0)
390 			return (error);
391 		*aclsize = size;
392 		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
393 		    &acl_count, sizeof (acl_count))) != 0)
394 			return (error);
395 		*aclcount = acl_count;
396 	} else {
397 		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
398 		    aclphys, sizeof (*aclphys))) != 0)
399 			return (error);
400 
401 		if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
402 			*aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
403 			*aclcount = aclphys->z_acl_size;
404 		} else {
405 			*aclsize = aclphys->z_acl_size;
406 			*aclcount = aclphys->z_acl_count;
407 		}
408 	}
409 	return (0);
410 }
411 
412 int
413 zfs_znode_acl_version(znode_t *zp)
414 {
415 	zfs_acl_phys_t acl_phys;
416 
417 	if (zp->z_is_sa)
418 		return (ZFS_ACL_VERSION_FUID);
419 	else {
420 		int error;
421 
422 		/*
423 		 * Need to deal with a potential
424 		 * race where zfs_sa_upgrade could cause
425 		 * z_isa_sa to change.
426 		 *
427 		 * If the lookup fails then the state of z_is_sa should have
428 		 * changed.
429 		 */
430 		if ((error = sa_lookup(zp->z_sa_hdl,
431 		    SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
432 		    &acl_phys, sizeof (acl_phys))) == 0)
433 			return (acl_phys.z_acl_version);
434 		else {
435 			/*
436 			 * After upgrade SA_ZPL_ZNODE_ACL should have
437 			 * been removed.
438 			 */
439 			VERIFY(zp->z_is_sa && error == ENOENT);
440 			return (ZFS_ACL_VERSION_FUID);
441 		}
442 	}
443 }
444 
445 static int
446 zfs_acl_version(int version)
447 {
448 	if (version < ZPL_VERSION_FUID)
449 		return (ZFS_ACL_VERSION_INITIAL);
450 	else
451 		return (ZFS_ACL_VERSION_FUID);
452 }
453 
454 static int
455 zfs_acl_version_zp(znode_t *zp)
456 {
457 	return (zfs_acl_version(zp->z_zfsvfs->z_version));
458 }
459 
460 zfs_acl_t *
461 zfs_acl_alloc(int vers)
462 {
463 	zfs_acl_t *aclp;
464 
465 	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
466 	list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
467 	    offsetof(zfs_acl_node_t, z_next));
468 	aclp->z_version = vers;
469 	if (vers == ZFS_ACL_VERSION_FUID)
470 		aclp->z_ops = zfs_acl_fuid_ops;
471 	else
472 		aclp->z_ops = zfs_acl_v0_ops;
473 	return (aclp);
474 }
475 
476 zfs_acl_node_t *
477 zfs_acl_node_alloc(size_t bytes)
478 {
479 	zfs_acl_node_t *aclnode;
480 
481 	aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
482 	if (bytes) {
483 		aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
484 		aclnode->z_allocdata = aclnode->z_acldata;
485 		aclnode->z_allocsize = bytes;
486 		aclnode->z_size = bytes;
487 	}
488 
489 	return (aclnode);
490 }
491 
492 static void
493 zfs_acl_node_free(zfs_acl_node_t *aclnode)
494 {
495 	if (aclnode->z_allocsize)
496 		kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
497 	kmem_free(aclnode, sizeof (zfs_acl_node_t));
498 }
499 
500 static void
501 zfs_acl_release_nodes(zfs_acl_t *aclp)
502 {
503 	zfs_acl_node_t *aclnode;
504 
505 	while (aclnode = list_head(&aclp->z_acl)) {
506 		list_remove(&aclp->z_acl, aclnode);
507 		zfs_acl_node_free(aclnode);
508 	}
509 	aclp->z_acl_count = 0;
510 	aclp->z_acl_bytes = 0;
511 }
512 
513 void
514 zfs_acl_free(zfs_acl_t *aclp)
515 {
516 	zfs_acl_release_nodes(aclp);
517 	list_destroy(&aclp->z_acl);
518 	kmem_free(aclp, sizeof (zfs_acl_t));
519 }
520 
521 static boolean_t
522 zfs_acl_valid_ace_type(uint_t type, uint_t flags)
523 {
524 	uint16_t entry_type;
525 
526 	switch (type) {
527 	case ALLOW:
528 	case DENY:
529 	case ACE_SYSTEM_AUDIT_ACE_TYPE:
530 	case ACE_SYSTEM_ALARM_ACE_TYPE:
531 		entry_type = flags & ACE_TYPE_FLAGS;
532 		return (entry_type == ACE_OWNER ||
533 		    entry_type == OWNING_GROUP ||
534 		    entry_type == ACE_EVERYONE || entry_type == 0 ||
535 		    entry_type == ACE_IDENTIFIER_GROUP);
536 	default:
537 		if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
538 			return (B_TRUE);
539 	}
540 	return (B_FALSE);
541 }
542 
543 static boolean_t
544 zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
545 {
546 	/*
547 	 * first check type of entry
548 	 */
549 
550 	if (!zfs_acl_valid_ace_type(type, iflags))
551 		return (B_FALSE);
552 
553 	switch (type) {
554 	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
555 	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
556 	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
557 	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
558 		if (aclp->z_version < ZFS_ACL_VERSION_FUID)
559 			return (B_FALSE);
560 		aclp->z_hints |= ZFS_ACL_OBJ_ACE;
561 	}
562 
563 	/*
564 	 * next check inheritance level flags
565 	 */
566 
567 	if (obj_type == VDIR &&
568 	    (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
569 		aclp->z_hints |= ZFS_INHERIT_ACE;
570 
571 	if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
572 		if ((iflags & (ACE_FILE_INHERIT_ACE|
573 		    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
574 			return (B_FALSE);
575 		}
576 	}
577 
578 	return (B_TRUE);
579 }
580 
/*
 * Return the next ACE of an ACL, or NULL when there are no more.
 *
 * We want this iterator to be usable by multiple threads in parallel, so
 * all position state lives in the caller-supplied iterator (zaip) rather
 * than in the ACL.
 *
 *	zaip		iterator state; zai_aclp must already point at the ACL
 *	start		NULL to (re)start from the first ACE; any non-NULL
 *			value continues from the position stored in zaip
 *			(the pointer value itself is not otherwise used)
 *	who, access_mask, iflags, type
 *			filled in from the returned ACE via the ACL's ops
 *			vector; untouched when NULL is returned
 */
static void *
zfs_acl_next_ace(zfs_acl_iter_t *zaip, void *start, uint64_t *who,
    uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
{
	zfs_acl_t *aclp = zaip->zai_aclp;
	zfs_acl_node_t *aclnode;

	ASSERT(aclp);

	/* Fresh walk: position the iterator on the first ACE of node one. */
	if (start == NULL) {
		aclnode = list_head(&aclp->z_acl);
		if (aclnode == NULL)
			return (NULL);

		zaip->zai_curr_node = aclnode;
		zaip->zai_next_ace = aclnode->z_acldata;
		zaip->zai_ace_idx = 0;
	}

	aclnode = zaip->zai_curr_node;

	if (aclnode == NULL)
		return (NULL);

	/*
	 * Current node exhausted: move to the following node.  The
	 * next_node label is also the target of the bounds-check failure
	 * below, which abandons the rest of the current node.
	 */
	if (zaip->zai_ace_idx >= aclnode->z_ace_count) {
	next_node:
		aclnode = list_next(&aclp->z_acl, aclnode);
		if (aclnode == NULL)
			return (NULL);
		zaip->zai_curr_node = aclnode;
		zaip->zai_ace_idx = 0;
		zaip->zai_next_ace = aclnode->z_acldata;
	}

	/*
	 * Only one node advance is attempted per call: if the node we just
	 * moved to holds no ACEs, this test fails and NULL is returned.
	 */
	if (zaip->zai_ace_idx < aclnode->z_ace_count) {
		void *acep = zaip->zai_next_ace;
		size_t ace_size;

		/*
		 * Make sure we don't overstep our bounds
		 *
		 * NOTE(review): ace_size() is called on the ACE before
		 * this check, so the size routine reads the entry before
		 * we know that it lies inside the node's buffer.
		 */
		ace_size = aclp->z_ops.ace_size(acep);

		if (((caddr_t)acep + ace_size) >
		    ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
			DTRACE_PROBE3(acl__size__error, zfs_acl_t *, aclp,
			    zfs_acl_node_t *, aclnode, zfs_ace_hdr_t *, acep);
			goto next_node;
		}

		*iflags = aclp->z_ops.ace_flags_get(acep);
		*type = aclp->z_ops.ace_type_get(acep);
		*access_mask = aclp->z_ops.ace_mask_get(acep);
		*who = aclp->z_ops.ace_who_get(acep);

		/* Remember where the following ACE starts. */
		zaip->zai_next_ace = (caddr_t)acep + ace_size;
		zaip->zai_ace_idx++;

		return ((void *)acep);
	}
	return (NULL);
}
647 
648 /*ARGSUSED*/
649 static uint64_t
650 zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
651     uint16_t *flags, uint16_t *type, uint32_t *mask)
652 {
653 	zfs_acl_iter_t *zaip = datap;
654 	zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
655 	uint64_t who;
656 
657 	acep = zfs_acl_next_ace(zaip, acep, &who, mask,
658 	    flags, type);
659 	return ((uint64_t)(uintptr_t)acep);
660 }
661 
662 /*
663  * Copy ACE to internal ZFS format.
664  * While processing the ACL each ACE will be validated for correctness.
665  * ACE FUIDs will be created later.
666  */
667 int
668 zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
669     void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
670     zfs_fuid_info_t **fuidp, cred_t *cr)
671 {
672 	int i;
673 	uint16_t entry_type;
674 	zfs_ace_t *aceptr = z_acl;
675 	ace_t *acep = datap;
676 	zfs_object_ace_t *zobjacep;
677 	ace_object_t *aceobjp;
678 
679 	for (i = 0; i != aclcnt; i++) {
680 		aceptr->z_hdr.z_access_mask = acep->a_access_mask;
681 		aceptr->z_hdr.z_flags = acep->a_flags;
682 		aceptr->z_hdr.z_type = acep->a_type;
683 		entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
684 		if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
685 		    entry_type != ACE_EVERYONE) {
686 			aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
687 			    cr, (entry_type == 0) ?
688 			    ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
689 		}
690 
691 		/*
692 		 * Make sure ACE is valid
693 		 */
694 		if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
695 		    aceptr->z_hdr.z_flags) != B_TRUE)
696 			return (SET_ERROR(EINVAL));
697 
698 		switch (acep->a_type) {
699 		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
700 		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
701 		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
702 		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
703 			zobjacep = (zfs_object_ace_t *)aceptr;
704 			aceobjp = (ace_object_t *)acep;
705 
706 			bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
707 			    sizeof (aceobjp->a_obj_type));
708 			bcopy(aceobjp->a_inherit_obj_type,
709 			    zobjacep->z_inherit_type,
710 			    sizeof (aceobjp->a_inherit_obj_type));
711 			acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
712 			break;
713 		default:
714 			acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
715 		}
716 
717 		aceptr = (zfs_ace_t *)((caddr_t)aceptr +
718 		    aclp->z_ops.ace_size(aceptr));
719 	}
720 
721 	*size = (caddr_t)aceptr - (caddr_t)z_acl;
722 
723 	return (0);
724 }
725 
726 /*
727  * Copy ZFS ACEs to fixed size ace_t layout
728  */
729 static void
730 zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
731     void *datap, int filter)
732 {
733 	uint64_t who;
734 	uint32_t access_mask;
735 	uint16_t iflags, type;
736 	zfs_ace_hdr_t *zacep = NULL;
737 	ace_t *acep = datap;
738 	ace_object_t *objacep;
739 	zfs_object_ace_t *zobjacep;
740 	size_t ace_size;
741 	uint16_t entry_type;
742 	zfs_acl_iter_t	zai = {0};
743 
744 	zai.zai_aclp = aclp;
745 	while (zacep = zfs_acl_next_ace(&zai, zacep,
746 	    &who, &access_mask, &iflags, &type)) {
747 
748 		switch (type) {
749 		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
750 		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
751 		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
752 		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
753 			if (filter) {
754 				continue;
755 			}
756 			zobjacep = (zfs_object_ace_t *)zacep;
757 			objacep = (ace_object_t *)acep;
758 			bcopy(zobjacep->z_object_type,
759 			    objacep->a_obj_type,
760 			    sizeof (zobjacep->z_object_type));
761 			bcopy(zobjacep->z_inherit_type,
762 			    objacep->a_inherit_obj_type,
763 			    sizeof (zobjacep->z_inherit_type));
764 			ace_size = sizeof (ace_object_t);
765 			break;
766 		default:
767 			ace_size = sizeof (ace_t);
768 			break;
769 		}
770 
771 		entry_type = (iflags & ACE_TYPE_FLAGS);
772 		if ((entry_type != ACE_OWNER &&
773 		    entry_type != OWNING_GROUP &&
774 		    entry_type != ACE_EVERYONE)) {
775 			acep->a_who = zfs_fuid_map_id(zfsvfs, who,
776 			    cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
777 			    ZFS_ACE_GROUP : ZFS_ACE_USER);
778 		} else {
779 			acep->a_who = (uid_t)(int64_t)who;
780 		}
781 		acep->a_access_mask = access_mask;
782 		acep->a_flags = iflags;
783 		acep->a_type = type;
784 		acep = (ace_t *)((caddr_t)acep + ace_size);
785 	}
786 }
787 
788 static int
789 zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
790     zfs_oldace_t *z_acl, int aclcnt, size_t *size)
791 {
792 	int i;
793 	zfs_oldace_t *aceptr = z_acl;
794 
795 	for (i = 0; i != aclcnt; i++, aceptr++) {
796 		aceptr->z_access_mask = acep[i].a_access_mask;
797 		aceptr->z_type = acep[i].a_type;
798 		aceptr->z_flags = acep[i].a_flags;
799 		aceptr->z_fuid = acep[i].a_who;
800 		/*
801 		 * Make sure ACE is valid
802 		 */
803 		if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
804 		    aceptr->z_flags) != B_TRUE)
805 			return (SET_ERROR(EINVAL));
806 	}
807 	*size = (caddr_t)aceptr - (caddr_t)z_acl;
808 	return (0);
809 }
810 
811 /*
812  * convert old ACL format to new
813  */
814 void
815 zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
816 {
817 	zfs_oldace_t *oldaclp;
818 	int i;
819 	uint16_t type, iflags;
820 	uint32_t access_mask;
821 	uint64_t who;
822 	void *cookie = NULL;
823 	zfs_acl_node_t *newaclnode;
824 	zfs_acl_iter_t	zai = {0};
825 
826 	ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
827 	/*
828 	 * First create the ACE in a contiguous piece of memory
829 	 * for zfs_copy_ace_2_fuid().
830 	 *
831 	 * We only convert an ACL once, so this won't happen
832 	 * everytime.
833 	 */
834 	oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
835 	    KM_SLEEP);
836 	i = 0;
837 
838 	zai.zai_aclp = aclp;
839 	while (cookie = zfs_acl_next_ace(&zai, cookie, &who,
840 	    &access_mask, &iflags, &type)) {
841 		oldaclp[i].z_flags = iflags;
842 		oldaclp[i].z_type = type;
843 		oldaclp[i].z_fuid = who;
844 		oldaclp[i++].z_access_mask = access_mask;
845 	}
846 
847 	newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
848 	    sizeof (zfs_object_ace_t));
849 	aclp->z_ops = zfs_acl_fuid_ops;
850 	VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
851 	    oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
852 	    &newaclnode->z_size, NULL, cr) == 0);
853 	newaclnode->z_ace_count = aclp->z_acl_count;
854 	aclp->z_version = ZFS_ACL_VERSION;
855 	kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
856 
857 	/*
858 	 * Release all previous ACL nodes
859 	 */
860 
861 	zfs_acl_release_nodes(aclp);
862 
863 	list_insert_head(&aclp->z_acl, newaclnode);
864 
865 	aclp->z_acl_bytes = newaclnode->z_size;
866 	aclp->z_acl_count = newaclnode->z_ace_count;
867 
868 }
869 
870 /*
871  * Convert unix access mask to v4 access mask
872  */
873 static uint32_t
874 zfs_unix_to_v4(uint32_t access_mask)
875 {
876 	uint32_t new_mask = 0;
877 
878 	if (access_mask & S_IXOTH)
879 		new_mask |= ACE_EXECUTE;
880 	if (access_mask & S_IWOTH)
881 		new_mask |= ACE_WRITE_DATA;
882 	if (access_mask & S_IROTH)
883 		new_mask |= ACE_READ_DATA;
884 	return (new_mask);
885 }
886 
887 static void
888 zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
889     uint16_t access_type, uint64_t fuid, uint16_t entry_type)
890 {
891 	uint16_t type = entry_type & ACE_TYPE_FLAGS;
892 
893 	aclp->z_ops.ace_mask_set(acep, access_mask);
894 	aclp->z_ops.ace_type_set(acep, access_type);
895 	aclp->z_ops.ace_flags_set(acep, entry_type);
896 	if ((type != ACE_OWNER && type != OWNING_GROUP &&
897 	    type != ACE_EVERYONE))
898 		aclp->z_ops.ace_who_set(acep, fuid);
899 }
900 
901 /*
902  * Determine mode of file based on ACL.
903  */
904 uint64_t
905 zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
906     uint64_t *pflags, uint64_t fuid, uint64_t fgid)
907 {
908 	int		entry_type;
909 	mode_t		mode;
910 	mode_t		seen = 0;
911 	zfs_ace_hdr_t	*acep = NULL;
912 	uint64_t	who;
913 	uint16_t	iflags, type;
914 	uint32_t	access_mask;
915 	boolean_t	an_exec_denied = B_FALSE;
916 	zfs_acl_iter_t	zai = {0};
917 
918 	mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
919 
920 	zai.zai_aclp = aclp;
921 	while (acep = zfs_acl_next_ace(&zai, acep, &who,
922 	    &access_mask, &iflags, &type)) {
923 
924 		if (!zfs_acl_valid_ace_type(type, iflags))
925 			continue;
926 
927 		entry_type = (iflags & ACE_TYPE_FLAGS);
928 
929 		/*
930 		 * Skip over any inherit_only ACEs
931 		 */
932 		if (iflags & ACE_INHERIT_ONLY_ACE)
933 			continue;
934 
935 		if (entry_type == ACE_OWNER || (entry_type == 0 &&
936 		    who == fuid)) {
937 			if ((access_mask & ACE_READ_DATA) &&
938 			    (!(seen & S_IRUSR))) {
939 				seen |= S_IRUSR;
940 				if (type == ALLOW) {
941 					mode |= S_IRUSR;
942 				}
943 			}
944 			if ((access_mask & ACE_WRITE_DATA) &&
945 			    (!(seen & S_IWUSR))) {
946 				seen |= S_IWUSR;
947 				if (type == ALLOW) {
948 					mode |= S_IWUSR;
949 				}
950 			}
951 			if ((access_mask & ACE_EXECUTE) &&
952 			    (!(seen & S_IXUSR))) {
953 				seen |= S_IXUSR;
954 				if (type == ALLOW) {
955 					mode |= S_IXUSR;
956 				}
957 			}
958 		} else if (entry_type == OWNING_GROUP ||
959 		    (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
960 			if ((access_mask & ACE_READ_DATA) &&
961 			    (!(seen & S_IRGRP))) {
962 				seen |= S_IRGRP;
963 				if (type == ALLOW) {
964 					mode |= S_IRGRP;
965 				}
966 			}
967 			if ((access_mask & ACE_WRITE_DATA) &&
968 			    (!(seen & S_IWGRP))) {
969 				seen |= S_IWGRP;
970 				if (type == ALLOW) {
971 					mode |= S_IWGRP;
972 				}
973 			}
974 			if ((access_mask & ACE_EXECUTE) &&
975 			    (!(seen & S_IXGRP))) {
976 				seen |= S_IXGRP;
977 				if (type == ALLOW) {
978 					mode |= S_IXGRP;
979 				}
980 			}
981 		} else if (entry_type == ACE_EVERYONE) {
982 			if ((access_mask & ACE_READ_DATA)) {
983 				if (!(seen & S_IRUSR)) {
984 					seen |= S_IRUSR;
985 					if (type == ALLOW) {
986 						mode |= S_IRUSR;
987 					}
988 				}
989 				if (!(seen & S_IRGRP)) {
990 					seen |= S_IRGRP;
991 					if (type == ALLOW) {
992 						mode |= S_IRGRP;
993 					}
994 				}
995 				if (!(seen & S_IROTH)) {
996 					seen |= S_IROTH;
997 					if (type == ALLOW) {
998 						mode |= S_IROTH;
999 					}
1000 				}
1001 			}
1002 			if ((access_mask & ACE_WRITE_DATA)) {
1003 				if (!(seen & S_IWUSR)) {
1004 					seen |= S_IWUSR;
1005 					if (type == ALLOW) {
1006 						mode |= S_IWUSR;
1007 					}
1008 				}
1009 				if (!(seen & S_IWGRP)) {
1010 					seen |= S_IWGRP;
1011 					if (type == ALLOW) {
1012 						mode |= S_IWGRP;
1013 					}
1014 				}
1015 				if (!(seen & S_IWOTH)) {
1016 					seen |= S_IWOTH;
1017 					if (type == ALLOW) {
1018 						mode |= S_IWOTH;
1019 					}
1020 				}
1021 			}
1022 			if ((access_mask & ACE_EXECUTE)) {
1023 				if (!(seen & S_IXUSR)) {
1024 					seen |= S_IXUSR;
1025 					if (type == ALLOW) {
1026 						mode |= S_IXUSR;
1027 					}
1028 				}
1029 				if (!(seen & S_IXGRP)) {
1030 					seen |= S_IXGRP;
1031 					if (type == ALLOW) {
1032 						mode |= S_IXGRP;
1033 					}
1034 				}
1035 				if (!(seen & S_IXOTH)) {
1036 					seen |= S_IXOTH;
1037 					if (type == ALLOW) {
1038 						mode |= S_IXOTH;
1039 					}
1040 				}
1041 			}
1042 		} else {
1043 			/*
1044 			 * Only care if this IDENTIFIER_GROUP or
1045 			 * USER ACE denies execute access to someone,
1046 			 * mode is not affected
1047 			 */
1048 			if ((access_mask & ACE_EXECUTE) && type == DENY)
1049 				an_exec_denied = B_TRUE;
1050 		}
1051 	}
1052 
1053 	/*
1054 	 * Failure to allow is effectively a deny, so execute permission
1055 	 * is denied if it was never mentioned or if we explicitly
1056 	 * weren't allowed it.
1057 	 */
1058 	if (!an_exec_denied &&
1059 	    ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
1060 	    (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
1061 		an_exec_denied = B_TRUE;
1062 
1063 	if (an_exec_denied)
1064 		*pflags &= ~ZFS_NO_EXECS_DENIED;
1065 	else
1066 		*pflags |= ZFS_NO_EXECS_DENIED;
1067 
1068 	return (mode);
1069 }
1070 
1071 /*
1072  * Read an external acl object.  If the intent is to modify, always
1073  * create a new acl and leave any cached acl in place.
1074  */
1075 int
1076 zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
1077     boolean_t will_modify)
1078 {
1079 	zfs_acl_t	*aclp;
1080 	int		aclsize;
1081 	int		acl_count;
1082 	zfs_acl_node_t	*aclnode;
1083 	zfs_acl_phys_t	znode_acl;
1084 	int		version;
1085 	int		error;
1086 	boolean_t	drop_lock = B_FALSE;
1087 
1088 	ASSERT(RW_ISWRITER(&zp->z_acl_lock));
1089 
1090 	if (zp->z_acl_cached && !will_modify) {
1091 		*aclpp = zp->z_acl_cached;
1092 		return (0);
1093 	}
1094 
1095 	/*
1096 	 * close race where znode could be upgrade while trying to
1097 	 * read the znode attributes.
1098 	 *
1099 	 * But this could only happen if the file isn't already an SA
1100 	 * znode
1101 	 */
1102 	if (!zp->z_is_sa && !have_lock) {
1103 		mutex_enter(&zp->z_lock);
1104 		drop_lock = B_TRUE;
1105 	}
1106 	version = zfs_znode_acl_version(zp);
1107 
1108 	if ((error = zfs_acl_znode_info(zp, &aclsize,
1109 	    &acl_count, &znode_acl)) != 0) {
1110 		goto done;
1111 	}
1112 
1113 	aclp = zfs_acl_alloc(version);
1114 
1115 	aclp->z_acl_count = acl_count;
1116 	aclp->z_acl_bytes = aclsize;
1117 
1118 	aclnode = zfs_acl_node_alloc(aclsize);
1119 	aclnode->z_ace_count = aclp->z_acl_count;
1120 	aclnode->z_size = aclsize;
1121 
1122 	if (!zp->z_is_sa) {
1123 		if (znode_acl.z_acl_extern_obj) {
1124 			error = dmu_read(zp->z_zfsvfs->z_os,
1125 			    znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
1126 			    aclnode->z_acldata, DMU_READ_PREFETCH);
1127 		} else {
1128 			bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
1129 			    aclnode->z_size);
1130 		}
1131 	} else {
1132 		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
1133 		    aclnode->z_acldata, aclnode->z_size);
1134 	}
1135 
1136 	if (error != 0) {
1137 		zfs_acl_free(aclp);
1138 		zfs_acl_node_free(aclnode);
1139 		/* convert checksum errors into IO errors */
1140 		if (error == ECKSUM)
1141 			error = SET_ERROR(EIO);
1142 		goto done;
1143 	}
1144 
1145 	list_insert_head(&aclp->z_acl, aclnode);
1146 
1147 	*aclpp = aclp;
1148 	if (!will_modify)
1149 		zp->z_acl_cached = aclp;
1150 done:
1151 	if (drop_lock)
1152 		mutex_exit(&zp->z_lock);
1153 	return (error);
1154 }
1155 
1156 /*ARGSUSED*/
1157 void
1158 zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
1159     boolean_t start, void *userdata)
1160 {
1161 	zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
1162 
1163 	if (start) {
1164 		cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
1165 	} else {
1166 		cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
1167 		    cb->cb_acl_node);
1168 	}
1169 	*dataptr = cb->cb_acl_node->z_acldata;
1170 	*length = cb->cb_acl_node->z_size;
1171 }
1172 
1173 int
1174 zfs_acl_chown_setattr(znode_t *zp)
1175 {
1176 	int error;
1177 	zfs_acl_t *aclp;
1178 
1179 	ASSERT(MUTEX_HELD(&zp->z_lock));
1180 	ASSERT(RW_ISWRITER(&zp->z_acl_lock));
1181 
1182 	if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
1183 		zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
1184 		    &zp->z_pflags, zp->z_uid, zp->z_gid);
1185 	return (error);
1186 }
1187 
1188 /*
1189  * common code for setting ACLs.
1190  *
1191  * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
1192  * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
1193  * already checked the acl and knows whether to inherit.
1194  */
1195 int
1196 zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
1197 {
1198 	int			error;
1199 	zfsvfs_t		*zfsvfs = zp->z_zfsvfs;
1200 	dmu_object_type_t	otype;
1201 	zfs_acl_locator_cb_t	locate = { 0 };
1202 	uint64_t		mode;
1203 	sa_bulk_attr_t		bulk[5];
1204 	uint64_t		ctime[2];
1205 	int			count = 0;
1206 	zfs_acl_phys_t		acl_phys;
1207 	zfs_acl_iter_t		zai = {0};
1208 
1209 	/*
1210 	 * One might be tempted to add here:
1211 	 * ASSERT(RW_ISWRITER(&zp->z_acl_lock));
1212 	 * and most callers have that lock, but zfs_mknode
1213 	 * calls without the lock when making a new node,
1214 	 * which is OK because it's not visible yet.
1215 	 */
1216 
1217 	mode = zp->z_mode;
1218 
1219 	mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
1220 	    zp->z_uid, zp->z_gid);
1221 
1222 	zp->z_mode = mode;
1223 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1224 	    &mode, sizeof (mode));
1225 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1226 	    &zp->z_pflags, sizeof (zp->z_pflags));
1227 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1228 	    &ctime, sizeof (ctime));
1229 
1230 	if (zp->z_acl_cached) {
1231 		zfs_acl_free(zp->z_acl_cached);
1232 		zp->z_acl_cached = NULL;
1233 	}
1234 
1235 	/*
1236 	 * Upgrade needed?
1237 	 */
1238 	if (!zfsvfs->z_use_fuids) {
1239 		otype = DMU_OT_OLDACL;
1240 	} else {
1241 		if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
1242 		    (zfsvfs->z_version >= ZPL_VERSION_FUID))
1243 			zfs_acl_xform(zp, aclp, cr);
1244 		ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
1245 		otype = DMU_OT_ACL;
1246 	}
1247 
1248 	/*
1249 	 * Arrgh, we have to handle old on disk format
1250 	 * as well as newer (preferred) SA format.
1251 	 */
1252 
1253 	if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
1254 		locate.cb_aclp = aclp;
1255 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
1256 		    zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
1257 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
1258 		    NULL, &aclp->z_acl_count, sizeof (uint64_t));
1259 	} else { /* Painful legacy way */
1260 		zfs_acl_node_t *aclnode;
1261 		uint64_t off = 0;
1262 		uint64_t aoid;
1263 
1264 		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
1265 		    &acl_phys, sizeof (acl_phys))) != 0)
1266 			return (error);
1267 
1268 		aoid = acl_phys.z_acl_extern_obj;
1269 
1270 		if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1271 			/*
1272 			 * If ACL was previously external and we are now
1273 			 * converting to new ACL format then release old
1274 			 * ACL object and create a new one.
1275 			 */
1276 			if (aoid &&
1277 			    aclp->z_version != acl_phys.z_acl_version) {
1278 				error = dmu_object_free(zfsvfs->z_os, aoid, tx);
1279 				if (error)
1280 					return (error);
1281 				aoid = 0;
1282 			}
1283 			if (aoid == 0) {
1284 				aoid = dmu_object_alloc(zfsvfs->z_os,
1285 				    otype, aclp->z_acl_bytes,
1286 				    otype == DMU_OT_ACL ?
1287 				    DMU_OT_SYSACL : DMU_OT_NONE,
1288 				    otype == DMU_OT_ACL ?
1289 				    DN_OLD_MAX_BONUSLEN : 0, tx);
1290 			} else {
1291 				(void) dmu_object_set_blocksize(zfsvfs->z_os,
1292 				    aoid, aclp->z_acl_bytes, 0, tx);
1293 			}
1294 			acl_phys.z_acl_extern_obj = aoid;
1295 			for (aclnode = list_head(&aclp->z_acl); aclnode;
1296 			    aclnode = list_next(&aclp->z_acl, aclnode)) {
1297 				if (aclnode->z_ace_count == 0)
1298 					continue;
1299 				dmu_write(zfsvfs->z_os, aoid, off,
1300 				    aclnode->z_size, aclnode->z_acldata, tx);
1301 				off += aclnode->z_size;
1302 			}
1303 		} else {
1304 			void *start = acl_phys.z_ace_data;
1305 			/*
1306 			 * Migrating back embedded?
1307 			 */
1308 			if (acl_phys.z_acl_extern_obj) {
1309 				error = dmu_object_free(zfsvfs->z_os,
1310 				    acl_phys.z_acl_extern_obj, tx);
1311 				if (error)
1312 					return (error);
1313 				acl_phys.z_acl_extern_obj = 0;
1314 			}
1315 
1316 			for (aclnode = list_head(&aclp->z_acl); aclnode;
1317 			    aclnode = list_next(&aclp->z_acl, aclnode)) {
1318 				if (aclnode->z_ace_count == 0)
1319 					continue;
1320 				bcopy(aclnode->z_acldata, start,
1321 				    aclnode->z_size);
1322 				start = (caddr_t)start + aclnode->z_size;
1323 			}
1324 		}
1325 		/*
1326 		 * If Old version then swap count/bytes to match old
1327 		 * layout of znode_acl_phys_t.
1328 		 */
1329 		if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
1330 			acl_phys.z_acl_size = aclp->z_acl_count;
1331 			acl_phys.z_acl_count = aclp->z_acl_bytes;
1332 		} else {
1333 			acl_phys.z_acl_size = aclp->z_acl_bytes;
1334 			acl_phys.z_acl_count = aclp->z_acl_count;
1335 		}
1336 		acl_phys.z_acl_version = aclp->z_version;
1337 
1338 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
1339 		    &acl_phys, sizeof (acl_phys));
1340 	}
1341 
1342 	/*
1343 	 * Replace ACL wide bits, but first clear them.
1344 	 */
1345 	zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
1346 
1347 	zp->z_pflags |= aclp->z_hints;
1348 
1349 	zai.zai_aclp = aclp;
1350 	if (ace_trivial_common(&zai, 0, zfs_ace_walk) == 0)
1351 		zp->z_pflags |= ZFS_ACL_TRIVIAL;
1352 
1353 	zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
1354 	return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
1355 }
1356 
1357 static void
1358 zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
1359     zfs_acl_t *aclp)
1360 {
1361 	void		*acep = NULL;
1362 	uint64_t	who;
1363 	int		new_count, new_bytes;
1364 	int		ace_size;
1365 	int		entry_type;
1366 	uint16_t	iflags, type;
1367 	uint32_t	access_mask;
1368 	zfs_acl_node_t	*newnode;
1369 	size_t		abstract_size = aclp->z_ops.ace_abstract_size();
1370 	void		*zacep;
1371 	boolean_t	isdir;
1372 	trivial_acl_t	masks;
1373 	zfs_acl_iter_t	zai = {0};
1374 
1375 	new_count = new_bytes = 0;
1376 
1377 	isdir = (vtype == VDIR);
1378 
1379 	acl_trivial_access_masks((mode_t)mode, isdir, &masks);
1380 
1381 	newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
1382 
1383 	zacep = newnode->z_acldata;
1384 	if (masks.allow0) {
1385 		zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
1386 		zacep = (void *)((uintptr_t)zacep + abstract_size);
1387 		new_count++;
1388 		new_bytes += abstract_size;
1389 	}
1390 	if (masks.deny1) {
1391 		zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
1392 		zacep = (void *)((uintptr_t)zacep + abstract_size);
1393 		new_count++;
1394 		new_bytes += abstract_size;
1395 	}
1396 	if (masks.deny2) {
1397 		zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
1398 		zacep = (void *)((uintptr_t)zacep + abstract_size);
1399 		new_count++;
1400 		new_bytes += abstract_size;
1401 	}
1402 
1403 	zai.zai_aclp = aclp;
1404 	while (acep = zfs_acl_next_ace(&zai, acep, &who, &access_mask,
1405 	    &iflags, &type)) {
1406 		entry_type = (iflags & ACE_TYPE_FLAGS);
1407 		/*
1408 		 * ACEs used to represent the file mode may be divided
1409 		 * into an equivalent pair of inherit-only and regular
1410 		 * ACEs, if they are inheritable.
1411 		 * Skip regular ACEs, which are replaced by the new mode.
1412 		 */
1413 		if (split && (entry_type == ACE_OWNER ||
1414 		    entry_type == OWNING_GROUP ||
1415 		    entry_type == ACE_EVERYONE)) {
1416 			if (!isdir || !(iflags &
1417 			    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
1418 				continue;
1419 			/*
1420 			 * We preserve owner@, group@, or @everyone
1421 			 * permissions, if they are inheritable, by
1422 			 * copying them to inherit_only ACEs. This
1423 			 * prevents inheritable permissions from being
1424 			 * altered along with the file mode.
1425 			 */
1426 			iflags |= ACE_INHERIT_ONLY_ACE;
1427 		}
1428 
1429 		/*
1430 		 * If this ACL has any inheritable ACEs, mark that in
1431 		 * the hints (which are later masked into the pflags)
1432 		 * so create knows to do inheritance.
1433 		 */
1434 		if (isdir && (iflags &
1435 		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
1436 			aclp->z_hints |= ZFS_INHERIT_ACE;
1437 
1438 		if ((type != ALLOW && type != DENY) ||
1439 		    (iflags & ACE_INHERIT_ONLY_ACE)) {
1440 			switch (type) {
1441 			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
1442 			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
1443 			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
1444 			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
1445 				aclp->z_hints |= ZFS_ACL_OBJ_ACE;
1446 				break;
1447 			}
1448 		} else {
1449 			/*
1450 			 * Limit permissions granted by ACEs to be no greater
1451 			 * than permissions of the requested group mode.
1452 			 * Applies when the "aclmode" property is set to
1453 			 * "groupmask".
1454 			 */
1455 			if ((type == ALLOW) && trim)
1456 				access_mask &= masks.group;
1457 		}
1458 		zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
1459 		ace_size = aclp->z_ops.ace_size(acep);
1460 		zacep = (void *)((uintptr_t)zacep + ace_size);
1461 		new_count++;
1462 		new_bytes += ace_size;
1463 	}
1464 	zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
1465 	zacep = (void *)((uintptr_t)zacep + abstract_size);
1466 	zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
1467 	zacep = (void *)((uintptr_t)zacep + abstract_size);
1468 	zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
1469 
1470 	new_count += 3;
1471 	new_bytes += abstract_size * 3;
1472 	zfs_acl_release_nodes(aclp);
1473 	aclp->z_acl_count = new_count;
1474 	aclp->z_acl_bytes = new_bytes;
1475 	newnode->z_ace_count = new_count;
1476 	newnode->z_size = new_bytes;
1477 	list_insert_tail(&aclp->z_acl, newnode);
1478 }
1479 
1480 int
1481 zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
1482 {
1483 	int error = 0;
1484 
1485 	rw_enter(&zp->z_acl_lock, RW_WRITER);
1486 	mutex_enter(&zp->z_lock);
1487 	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
1488 		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
1489 	else
1490 		error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
1491 
1492 	if (error == 0) {
1493 		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
1494 		zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
1495 		    (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
1496 	}
1497 	mutex_exit(&zp->z_lock);
1498 	rw_exit(&zp->z_acl_lock);
1499 
1500 	return (error);
1501 }
1502 
1503 /*
1504  * Should ACE be inherited?
1505  */
1506 static int
1507 zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
1508 {
1509 	int	iflags = (acep_flags & 0xf);
1510 
1511 	if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
1512 		return (1);
1513 	else if (iflags & ACE_FILE_INHERIT_ACE)
1514 		return (!((vtype == VDIR) &&
1515 		    (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
1516 	return (0);
1517 }
1518 
1519 /*
1520  * inherit inheritable ACEs from parent
1521  */
1522 static zfs_acl_t *
1523 zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
1524     uint64_t mode, boolean_t *need_chmod)
1525 {
1526 	void		*pacep = NULL;
1527 	void		*acep;
1528 	zfs_acl_node_t  *aclnode;
1529 	zfs_acl_t	*aclp = NULL;
1530 	uint64_t	who;
1531 	uint32_t	access_mask;
1532 	uint16_t	iflags, newflags, type;
1533 	size_t		ace_size;
1534 	void		*data1, *data2;
1535 	size_t		data1sz, data2sz;
1536 	uint_t		aclinherit;
1537 	boolean_t	isdir = (vtype == VDIR);
1538 	boolean_t	isreg = (vtype == VREG);
1539 	zfs_acl_iter_t	zai = {0};
1540 
1541 	*need_chmod = B_TRUE;
1542 
1543 	aclp = zfs_acl_alloc(paclp->z_version);
1544 	aclinherit = zfsvfs->z_acl_inherit;
1545 	if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
1546 		return (aclp);
1547 
1548 	zai.zai_aclp = paclp;
1549 	while (pacep = zfs_acl_next_ace(&zai, pacep, &who,
1550 	    &access_mask, &iflags, &type)) {
1551 
1552 		/*
1553 		 * don't inherit bogus ACEs
1554 		 */
1555 		if (!zfs_acl_valid_ace_type(type, iflags))
1556 			continue;
1557 
1558 		/*
1559 		 * Check if ACE is inheritable by this vnode
1560 		 */
1561 		if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
1562 		    !zfs_ace_can_use(vtype, iflags))
1563 			continue;
1564 
1565 		/*
1566 		 * If owner@, group@, or everyone@ inheritable
1567 		 * then zfs_acl_chmod() isn't needed.
1568 		 */
1569 		if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
1570 		    aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
1571 		    ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
1572 		    ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
1573 		    (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
1574 			*need_chmod = B_FALSE;
1575 
1576 		/*
1577 		 * Strip inherited execute permission from file if
1578 		 * not in mode
1579 		 */
1580 		if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
1581 		    !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
1582 			access_mask &= ~ACE_EXECUTE;
1583 		}
1584 
1585 		/*
1586 		 * Strip write_acl and write_owner from permissions
1587 		 * when inheriting an ACE
1588 		 */
1589 		if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
1590 			access_mask &= ~RESTRICTED_CLEAR;
1591 		}
1592 
1593 		ace_size = aclp->z_ops.ace_size(pacep);
1594 		aclnode = zfs_acl_node_alloc(ace_size);
1595 		list_insert_tail(&aclp->z_acl, aclnode);
1596 		acep = aclnode->z_acldata;
1597 
1598 		zfs_set_ace(aclp, acep, access_mask, type,
1599 		    who, iflags|ACE_INHERITED_ACE);
1600 
1601 		/*
1602 		 * Copy special opaque data if any
1603 		 */
1604 		if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) {
1605 			VERIFY((data2sz = aclp->z_ops.ace_data(acep,
1606 			    &data2)) == data1sz);
1607 			bcopy(data1, data2, data2sz);
1608 		}
1609 
1610 		aclp->z_acl_count++;
1611 		aclnode->z_ace_count++;
1612 		aclp->z_acl_bytes += aclnode->z_size;
1613 		newflags = aclp->z_ops.ace_flags_get(acep);
1614 
1615 		/*
1616 		 * If ACE is not to be inherited further, or if the vnode is
1617 		 * not a directory, remove all inheritance flags
1618 		 */
1619 		if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
1620 			newflags &= ~ALL_INHERIT;
1621 			aclp->z_ops.ace_flags_set(acep,
1622 			    newflags|ACE_INHERITED_ACE);
1623 			continue;
1624 		}
1625 
1626 		/*
1627 		 * This directory has an inheritable ACE
1628 		 */
1629 		aclp->z_hints |= ZFS_INHERIT_ACE;
1630 
1631 		/*
1632 		 * If only FILE_INHERIT is set then turn on
1633 		 * inherit_only
1634 		 */
1635 		if ((iflags & (ACE_FILE_INHERIT_ACE |
1636 		    ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
1637 			newflags |= ACE_INHERIT_ONLY_ACE;
1638 			aclp->z_ops.ace_flags_set(acep,
1639 			    newflags|ACE_INHERITED_ACE);
1640 		} else {
1641 			newflags &= ~ACE_INHERIT_ONLY_ACE;
1642 			aclp->z_ops.ace_flags_set(acep,
1643 			    newflags|ACE_INHERITED_ACE);
1644 		}
1645 	}
1646 
1647 	return (aclp);
1648 }
1649 
1650 /*
1651  * Create file system object initial permissions
1652  * including inheritable ACEs.
1653  * Also, create FUIDs for owner and group.
1654  */
1655 int
1656 zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
1657     vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
1658 {
1659 	int		error;
1660 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1661 	zfs_acl_t	*paclp;
1662 	gid_t		gid;
1663 	boolean_t	need_chmod = B_TRUE;
1664 	boolean_t	trim = B_FALSE;
1665 	boolean_t	inherited = B_FALSE;
1666 
1667 	bzero(acl_ids, sizeof (zfs_acl_ids_t));
1668 	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
1669 
1670 	if (vsecp)
1671 		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
1672 		    &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
1673 			return (error);
1674 	/*
1675 	 * Determine uid and gid.
1676 	 */
1677 	if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
1678 	    ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
1679 		acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
1680 		    (uint64_t)vap->va_uid, cr,
1681 		    ZFS_OWNER, &acl_ids->z_fuidp);
1682 		acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
1683 		    (uint64_t)vap->va_gid, cr,
1684 		    ZFS_GROUP, &acl_ids->z_fuidp);
1685 		gid = vap->va_gid;
1686 	} else {
1687 		acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
1688 		    cr, &acl_ids->z_fuidp);
1689 		acl_ids->z_fgid = 0;
1690 		if (vap->va_mask & AT_GID)  {
1691 			acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
1692 			    (uint64_t)vap->va_gid,
1693 			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
1694 			gid = vap->va_gid;
1695 			if (acl_ids->z_fgid != dzp->z_gid &&
1696 			    !groupmember(vap->va_gid, cr) &&
1697 			    secpolicy_vnode_create_gid(cr) != 0)
1698 				acl_ids->z_fgid = 0;
1699 		}
1700 		if (acl_ids->z_fgid == 0) {
1701 			if (dzp->z_mode & S_ISGID) {
1702 				char		*domain;
1703 				uint32_t	rid;
1704 
1705 				acl_ids->z_fgid = dzp->z_gid;
1706 				gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
1707 				    cr, ZFS_GROUP);
1708 
1709 				if (zfsvfs->z_use_fuids &&
1710 				    IS_EPHEMERAL(acl_ids->z_fgid)) {
1711 					domain = zfs_fuid_idx_domain(
1712 					    &zfsvfs->z_fuid_idx,
1713 					    FUID_INDEX(acl_ids->z_fgid));
1714 					rid = FUID_RID(acl_ids->z_fgid);
1715 					zfs_fuid_node_add(&acl_ids->z_fuidp,
1716 					    domain, rid,
1717 					    FUID_INDEX(acl_ids->z_fgid),
1718 					    acl_ids->z_fgid, ZFS_GROUP);
1719 				}
1720 			} else {
1721 				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
1722 				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
1723 				gid = crgetgid(cr);
1724 			}
1725 		}
1726 	}
1727 
1728 	/*
1729 	 * If we're creating a directory, and the parent directory has the
1730 	 * set-GID bit set, set in on the new directory.
1731 	 * Otherwise, if the user is neither privileged nor a member of the
1732 	 * file's new group, clear the file's set-GID bit.
1733 	 */
1734 
1735 	if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
1736 	    (vap->va_type == VDIR)) {
1737 		acl_ids->z_mode |= S_ISGID;
1738 	} else {
1739 		if ((acl_ids->z_mode & S_ISGID) &&
1740 		    secpolicy_vnode_setids_setgids(cr, gid) != 0)
1741 			acl_ids->z_mode &= ~S_ISGID;
1742 	}
1743 
1744 	if (acl_ids->z_aclp == NULL) {
1745 		rw_enter(&dzp->z_acl_lock, RW_WRITER);
1746 		mutex_enter(&dzp->z_lock);
1747 		if (!(flag & IS_ROOT_NODE) &&
1748 		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
1749 		    !(dzp->z_pflags & ZFS_XATTR)) {
1750 			VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
1751 			    &paclp, B_FALSE));
1752 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
1753 			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
1754 			inherited = B_TRUE;
1755 		} else {
1756 			acl_ids->z_aclp =
1757 			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
1758 			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
1759 		}
1760 		mutex_exit(&dzp->z_lock);
1761 		rw_exit(&dzp->z_acl_lock);
1762 
1763 		if (need_chmod) {
1764 			if (vap->va_type == VDIR)
1765 				acl_ids->z_aclp->z_hints |=
1766 				    ZFS_ACL_AUTO_INHERIT;
1767 
1768 			if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
1769 			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
1770 			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
1771 				trim = B_TRUE;
1772 			zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
1773 			    trim, acl_ids->z_aclp);
1774 		}
1775 	}
1776 
1777 	if (inherited || vsecp) {
1778 		zfs_acl_iter_t	zai = {0};
1779 
1780 		acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
1781 		    acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
1782 		    acl_ids->z_fuid, acl_ids->z_fgid);
1783 
1784 		zai.zai_aclp = acl_ids->z_aclp;
1785 		if (ace_trivial_common(&zai, 0, zfs_ace_walk) == 0)
1786 			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
1787 	}
1788 
1789 	return (0);
1790 }
1791 
1792 /*
1793  * Free ACL and fuid_infop, but not the acl_ids structure
1794  */
1795 void
1796 zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
1797 {
1798 	if (acl_ids->z_aclp)
1799 		zfs_acl_free(acl_ids->z_aclp);
1800 	if (acl_ids->z_fuidp)
1801 		zfs_fuid_info_free(acl_ids->z_fuidp);
1802 	acl_ids->z_aclp = NULL;
1803 	acl_ids->z_fuidp = NULL;
1804 }
1805 
1806 boolean_t
1807 zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
1808 {
1809 	return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
1810 	    zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
1811 	    (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
1812 	    zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
1813 }
1814 
1815 /*
1816  * Retrieve a file's ACL
1817  */
1818 int
1819 zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
1820 {
1821 	zfs_acl_t	*aclp;
1822 	ulong_t		mask;
1823 	int		error;
1824 	int		count = 0;
1825 	int		largeace = 0;
1826 
1827 	mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
1828 	    VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
1829 
1830 	if (mask == 0)
1831 		return (SET_ERROR(ENOSYS));
1832 
1833 	if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
1834 		return (error);
1835 
1836 	/*
1837 	 * This may be frequently called, so when possible
1838 	 * try to avoid entering z_acl_lock as writer.
1839 	 */
1840 	rw_enter(&zp->z_acl_lock, RW_READER);
1841 	aclp = zp->z_acl_cached;
1842 	if (aclp == NULL) {
1843 		/*
1844 		 * OK, need WRITER to load the ACL.  Note:
1845 		 * zfs_acl_node_read() re-checks zp->z_acl_cached
1846 		 * and just returns if it's set.
1847 		 */
1848 		rw_exit(&zp->z_acl_lock);
1849 		rw_enter(&zp->z_acl_lock, RW_WRITER);
1850 
1851 		error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
1852 		if (error != 0) {
1853 			rw_exit(&zp->z_acl_lock);
1854 			return (error);
1855 		}
1856 		rw_downgrade(&zp->z_acl_lock);	/* now RW_READER */
1857 		ASSERT(zp->z_acl_cached);
1858 	}
1859 
1860 	/*
1861 	 * Scan ACL to determine number of ACEs
1862 	 */
1863 	if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
1864 		void *zacep = NULL;
1865 		uint64_t who;
1866 		uint32_t access_mask;
1867 		uint16_t type, iflags;
1868 		zfs_acl_iter_t	zai = {0};
1869 
1870 		zai.zai_aclp = aclp;
1871 		while (zacep = zfs_acl_next_ace(&zai, zacep,
1872 		    &who, &access_mask, &iflags, &type)) {
1873 			switch (type) {
1874 			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
1875 			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
1876 			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
1877 			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
1878 				largeace++;
1879 				continue;
1880 			default:
1881 				count++;
1882 			}
1883 		}
1884 		vsecp->vsa_aclcnt = count;
1885 	} else
1886 		count = (int)aclp->z_acl_count;
1887 
1888 	if (mask & VSA_ACECNT) {
1889 		vsecp->vsa_aclcnt = count;
1890 	}
1891 
1892 	if (mask & VSA_ACE) {
1893 		size_t aclsz;
1894 
1895 		aclsz = count * sizeof (ace_t) +
1896 		    sizeof (ace_object_t) * largeace;
1897 
1898 		vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
1899 		vsecp->vsa_aclentsz = aclsz;
1900 
1901 		if (aclp->z_version == ZFS_ACL_VERSION_FUID)
1902 			zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
1903 			    vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
1904 		else {
1905 			zfs_acl_node_t *aclnode;
1906 			void *start = vsecp->vsa_aclentp;
1907 
1908 			for (aclnode = list_head(&aclp->z_acl); aclnode;
1909 			    aclnode = list_next(&aclp->z_acl, aclnode)) {
1910 				bcopy(aclnode->z_acldata, start,
1911 				    aclnode->z_size);
1912 				start = (caddr_t)start + aclnode->z_size;
1913 			}
1914 			ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
1915 			    aclp->z_acl_bytes);
1916 		}
1917 	}
1918 	if (mask & VSA_ACE_ACLFLAGS) {
1919 		vsecp->vsa_aclflags = 0;
1920 		if (zp->z_pflags & ZFS_ACL_DEFAULTED)
1921 			vsecp->vsa_aclflags |= ACL_DEFAULTED;
1922 		if (zp->z_pflags & ZFS_ACL_PROTECTED)
1923 			vsecp->vsa_aclflags |= ACL_PROTECTED;
1924 		if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
1925 			vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
1926 	}
1927 
1928 	rw_exit(&zp->z_acl_lock);
1929 
1930 	return (0);
1931 }
1932 
1933 int
1934 zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
1935     vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
1936 {
1937 	zfs_acl_t *aclp;
1938 	zfs_acl_node_t *aclnode;
1939 	int aclcnt = vsecp->vsa_aclcnt;
1940 	int error;
1941 
1942 	if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
1943 		return (SET_ERROR(EINVAL));
1944 
1945 	aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
1946 
1947 	aclp->z_hints = 0;
1948 	aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
1949 	if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
1950 		if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
1951 		    (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
1952 		    aclcnt, &aclnode->z_size)) != 0) {
1953 			zfs_acl_free(aclp);
1954 			zfs_acl_node_free(aclnode);
1955 			return (error);
1956 		}
1957 	} else {
1958 		if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
1959 		    vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
1960 		    &aclnode->z_size, fuidp, cr)) != 0) {
1961 			zfs_acl_free(aclp);
1962 			zfs_acl_node_free(aclnode);
1963 			return (error);
1964 		}
1965 	}
1966 	aclp->z_acl_bytes = aclnode->z_size;
1967 	aclnode->z_ace_count = aclcnt;
1968 	aclp->z_acl_count = aclcnt;
1969 	list_insert_head(&aclp->z_acl, aclnode);
1970 
1971 	/*
1972 	 * If flags are being set then add them to z_hints
1973 	 */
1974 	if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
1975 		if (vsecp->vsa_aclflags & ACL_PROTECTED)
1976 			aclp->z_hints |= ZFS_ACL_PROTECTED;
1977 		if (vsecp->vsa_aclflags & ACL_DEFAULTED)
1978 			aclp->z_hints |= ZFS_ACL_DEFAULTED;
1979 		if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
1980 			aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
1981 	}
1982 
1983 	*zaclp = aclp;
1984 
1985 	return (0);
1986 }
1987 
1988 /*
1989  * Set a file's ACL
1990  */
1991 int
1992 zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
1993 {
1994 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1995 	zilog_t		*zilog = zfsvfs->z_log;
1996 	ulong_t		mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
1997 	dmu_tx_t	*tx;
1998 	int		error;
1999 	zfs_acl_t	*aclp;
2000 	zfs_fuid_info_t	*fuidp = NULL;
2001 	boolean_t	fuid_dirtied;
2002 	uint64_t	acl_obj;
2003 
2004 	if (mask == 0)
2005 		return (SET_ERROR(ENOSYS));
2006 
2007 	if (zp->z_pflags & ZFS_IMMUTABLE)
2008 		return (SET_ERROR(EPERM));
2009 
2010 	if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
2011 		return (error);
2012 
2013 	error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
2014 	    &aclp);
2015 	if (error)
2016 		return (error);
2017 
2018 	/*
2019 	 * If ACL wide flags aren't being set then preserve any
2020 	 * existing flags.
2021 	 */
2022 	if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
2023 		aclp->z_hints |=
2024 		    (zp->z_pflags & V4_ACL_WIDE_FLAGS);
2025 	}
2026 top:
2027 	rw_enter(&zp->z_acl_lock, RW_WRITER);
2028 	mutex_enter(&zp->z_lock);
2029 
2030 	tx = dmu_tx_create(zfsvfs->z_os);
2031 
2032 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2033 
2034 	fuid_dirtied = zfsvfs->z_fuid_dirty;
2035 	if (fuid_dirtied)
2036 		zfs_fuid_txhold(zfsvfs, tx);
2037 
2038 	/*
2039 	 * If old version and ACL won't fit in bonus and we aren't
2040 	 * upgrading then take out necessary DMU holds
2041 	 */
2042 
2043 	if ((acl_obj = zfs_external_acl(zp)) != 0) {
2044 		if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2045 		    zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
2046 			dmu_tx_hold_free(tx, acl_obj, 0,
2047 			    DMU_OBJECT_END);
2048 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2049 			    aclp->z_acl_bytes);
2050 		} else {
2051 			dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
2052 		}
2053 	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2054 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
2055 	}
2056 
2057 	zfs_sa_upgrade_txholds(tx, zp);
2058 	error = dmu_tx_assign(tx, TXG_NOWAIT);
2059 	if (error) {
2060 		rw_exit(&zp->z_acl_lock);
2061 		mutex_exit(&zp->z_lock);
2062 
2063 		if (error == ERESTART) {
2064 			dmu_tx_wait(tx);
2065 			dmu_tx_abort(tx);
2066 			goto top;
2067 		}
2068 		dmu_tx_abort(tx);
2069 		zfs_acl_free(aclp);
2070 		return (error);
2071 	}
2072 
2073 	error = zfs_aclset_common(zp, aclp, cr, tx);
2074 	ASSERT(error == 0);
2075 	ASSERT(zp->z_acl_cached == NULL);
2076 	zp->z_acl_cached = aclp;
2077 
2078 	if (fuid_dirtied)
2079 		zfs_fuid_sync(zfsvfs, tx);
2080 
2081 	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
2082 
2083 	if (fuidp)
2084 		zfs_fuid_info_free(fuidp);
2085 	dmu_tx_commit(tx);
2086 done:
2087 	mutex_exit(&zp->z_lock);
2088 	rw_exit(&zp->z_acl_lock);
2089 
2090 	return (error);
2091 }
2092 
2093 /*
2094  * Check accesses of interest (AoI) against attributes of the dataset
2095  * such as read-only.  Returns zero if no AoI conflict with dataset
2096  * attributes, otherwise an appropriate errno is returned.
2097  */
2098 static int
2099 zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
2100 {
2101 	if ((v4_mode & WRITE_MASK) &&
2102 	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
2103 	    (!IS_DEVVP(ZTOV(zp)) ||
2104 	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
2105 		return (SET_ERROR(EROFS));
2106 	}
2107 
2108 	/*
2109 	 * Intentionally allow ZFS_READONLY through here.
2110 	 * See zfs_zaccess_common().
2111 	 */
2112 	if ((v4_mode & WRITE_MASK_DATA) &&
2113 	    (zp->z_pflags & ZFS_IMMUTABLE)) {
2114 		return (SET_ERROR(EPERM));
2115 	}
2116 
2117 	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
2118 	    (zp->z_pflags & ZFS_NOUNLINK)) {
2119 		return (SET_ERROR(EPERM));
2120 	}
2121 
2122 	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
2123 	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
2124 		return (SET_ERROR(EACCES));
2125 	}
2126 
2127 	return (0);
2128 }
2129 
2130 /*
2131  * The primary usage of this function is to loop through all of the
2132  * ACEs in the znode, determining what accesses of interest (AoI) to
2133  * the caller are allowed or denied.  The AoI are expressed as bits in
2134  * the working_mode parameter.  As each ACE is processed, bits covered
2135  * by that ACE are removed from the working_mode.  This removal
2136  * facilitates two things.  The first is that when the working mode is
2137  * empty (= 0), we know we've looked at all the AoI. The second is
2138  * that the ACE interpretation rules don't allow a later ACE to undo
2139  * something granted or denied by an earlier ACE.  Removing the
2140  * discovered access or denial enforces this rule.  At the end of
2141  * processing the ACEs, all AoI that were found to be denied are
2142  * placed into the working_mode, giving the caller a mask of denied
2143  * accesses.  Returns:
2144  *	0		if all AoI granted
2145  *	EACCES		if the denied mask is non-zero
2146  *	other error	if abnormal failure (e.g., IO error)
2147  *
2148  * A secondary usage of the function is to determine if any of the
2149  * AoI are granted.  If an ACE grants any access in
2150  * the working_mode, we immediately short circuit out of the function.
2151  * This mode is chosen by setting anyaccess to B_TRUE.  The
2152  * working_mode is not a denied access mask upon exit if the function
2153  * is used in this manner.
2154  */
2155 static int
2156 zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
2157     boolean_t anyaccess, cred_t *cr)
2158 {
2159 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2160 	zfs_acl_t	*aclp;
2161 	int		error;
2162 	uint64_t	who;		/* FUID from the ACE */
2163 	uint16_t	type, iflags;
2164 	uint16_t	entry_type;
2165 	uint32_t	access_mask;
2166 	uint32_t	deny_mask = 0;
2167 	zfs_ace_hdr_t	*acep = NULL;
2168 	boolean_t	checkit;	/* ACE ID matches */
2169 	zfs_acl_iter_t	zai = {0};
2170 
2171 	/*
2172 	 * This can be a HOT code path, so when possible
2173 	 * try to avoid entering z_acl_lock as writer.
2174 	 */
2175 	rw_enter(&zp->z_acl_lock, RW_READER);
2176 	aclp = zp->z_acl_cached;
2177 	if (aclp == NULL) {
2178 		/*
2179 		 * OK, need WRITER to load the ACL.  Note:
2180 		 * zfs_acl_node_read() re-checks zp->z_acl_cached
2181 		 * and just returns if it's set.
2182 		 */
2183 		rw_exit(&zp->z_acl_lock);
2184 		rw_enter(&zp->z_acl_lock, RW_WRITER);
2185 
2186 		error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
2187 		if (error != 0) {
2188 			rw_exit(&zp->z_acl_lock);
2189 			return (error);
2190 		}
2191 		rw_downgrade(&zp->z_acl_lock);	/* now RW_READER */
2192 		ASSERT(zp->z_acl_cached);
2193 	}
2194 
2195 	zai.zai_aclp = aclp;
2196 	while (acep = zfs_acl_next_ace(&zai, acep, &who, &access_mask,
2197 	    &iflags, &type)) {
2198 		uint32_t mask_matched;
2199 
2200 		if (!zfs_acl_valid_ace_type(type, iflags))
2201 			continue;
2202 
2203 		if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
2204 			continue;
2205 
2206 		/* Skip ACE if it does not affect any AoI */
2207 		mask_matched = (access_mask & *working_mode);
2208 		if (!mask_matched)
2209 			continue;
2210 
2211 		entry_type = (iflags & ACE_TYPE_FLAGS);
2212 
2213 		checkit = B_FALSE;
2214 
2215 		switch (entry_type) {
2216 		case ACE_OWNER:
2217 			who = zp->z_uid;
2218 			/*FALLTHROUGH*/
2219 		case 0:	/* USER Entry */
2220 			checkit = zfs_user_in_cred(zfsvfs, who, cr);
2221 			break;
2222 		case OWNING_GROUP:
2223 			who = zp->z_gid;
2224 			/*FALLTHROUGH*/
2225 		case ACE_IDENTIFIER_GROUP:
2226 			checkit = zfs_groupmember(zfsvfs, who, cr);
2227 			break;
2228 		case ACE_EVERYONE:
2229 			checkit = B_TRUE;
2230 			break;
2231 
2232 		default:
2233 			/*
2234 			 * The zfs_acl_valid_ace_type check above
2235 			 * should make this case impossible.
2236 			 */
2237 			rw_exit(&zp->z_acl_lock);
2238 			return (SET_ERROR(EIO));
2239 		}
2240 
2241 		if (checkit) {
2242 			if (type == DENY) {
2243 				DTRACE_PROBE3(zfs__ace__denies,
2244 				    znode_t *, zp,
2245 				    zfs_ace_hdr_t *, acep,
2246 				    uint32_t, mask_matched);
2247 				deny_mask |= mask_matched;
2248 			} else {
2249 				DTRACE_PROBE3(zfs__ace__allows,
2250 				    znode_t *, zp,
2251 				    zfs_ace_hdr_t *, acep,
2252 				    uint32_t, mask_matched);
2253 				if (anyaccess) {
2254 					rw_exit(&zp->z_acl_lock);
2255 					return (0);
2256 				}
2257 			}
2258 			*working_mode &= ~mask_matched;
2259 		}
2260 
2261 		/* Are we done? */
2262 		if (*working_mode == 0)
2263 			break;
2264 	}
2265 
2266 	rw_exit(&zp->z_acl_lock);
2267 
2268 	/* Put the found 'denies' back on the working mode */
2269 	if (deny_mask) {
2270 		*working_mode |= deny_mask;
2271 		return (SET_ERROR(EACCES));
2272 	} else if (*working_mode) {
2273 		return (-1);
2274 	}
2275 
2276 	return (0);
2277 }
2278 
2279 /*
2280  * Return true if any access whatsoever granted, we don't actually
2281  * care what access is granted.
2282  */
2283 boolean_t
2284 zfs_has_access(znode_t *zp, cred_t *cr)
2285 {
2286 	uint32_t have = ACE_ALL_PERMS;
2287 
2288 	if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
2289 		uid_t owner;
2290 
2291 		owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
2292 		return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
2293 	}
2294 	return (B_TRUE);
2295 }
2296 
2297 static int
2298 zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
2299     boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
2300 {
2301 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2302 	int err;
2303 
2304 	*working_mode = v4_mode;
2305 	*check_privs = B_TRUE;
2306 
2307 	/*
2308 	 * Short circuit empty requests
2309 	 */
2310 	if (v4_mode == 0 || zfsvfs->z_replay) {
2311 		*working_mode = 0;
2312 		return (0);
2313 	}
2314 
2315 	if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
2316 		*check_privs = B_FALSE;
2317 		return (err);
2318 	}
2319 
2320 	/*
2321 	 * The caller requested that the ACL check be skipped.  This
2322 	 * would only happen if the caller checked VOP_ACCESS() with a
2323 	 * 32 bit ACE mask and already had the appropriate permissions.
2324 	 */
2325 	if (skipaclchk) {
2326 		*working_mode = 0;
2327 		return (0);
2328 	}
2329 
2330 	/*
2331 	 * Note: ZFS_READONLY represents the "DOS R/O" attribute.
2332 	 * When that flag is set, we should behave as if write access
2333 	 * were not granted by anything in the ACL.  In particular:
2334 	 * We _must_ allow writes after opening the file r/w, then
2335 	 * setting the DOS R/O attribute, and writing some more.
2336 	 * (Similar to how you can write after fchmod(fd, 0444).)
2337 	 *
2338 	 * Therefore ZFS_READONLY is ignored in the dataset check
2339 	 * above, and checked here as if part of the ACL check.
2340 	 * Also note: DOS R/O is ignored for directories.
2341 	 */
2342 	if ((v4_mode & WRITE_MASK_DATA) &&
2343 	    (ZTOV(zp)->v_type != VDIR) &&
2344 	    (zp->z_pflags & ZFS_READONLY)) {
2345 		return (SET_ERROR(EPERM));
2346 	}
2347 
2348 	return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
2349 }
2350 
2351 static int
2352 zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
2353     cred_t *cr)
2354 {
2355 	if (*working_mode != ACE_WRITE_DATA)
2356 		return (SET_ERROR(EACCES));
2357 
2358 	return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
2359 	    check_privs, B_FALSE, cr));
2360 }
2361 
2362 int
2363 zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr, boolean_t skipaclchk)
2364 {
2365 	boolean_t owner = B_FALSE;
2366 	boolean_t groupmbr = B_FALSE;
2367 	boolean_t is_attr;
2368 	uid_t uid = crgetuid(cr);
2369 	int error;
2370 
2371 	if (zdp->z_pflags & ZFS_AV_QUARANTINED)
2372 		return (SET_ERROR(EACCES));
2373 
2374 	is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
2375 	    (ZTOV(zdp)->v_type == VDIR));
2376 	if (is_attr)
2377 		goto slow;
2378 
2379 
2380 	rw_enter(&zdp->z_acl_lock, RW_READER);
2381 
2382 	if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
2383 		rw_exit(&zdp->z_acl_lock);
2384 		return (0);
2385 	}
2386 
2387 	if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
2388 		rw_exit(&zdp->z_acl_lock);
2389 		goto slow;
2390 	}
2391 
2392 	if (uid == zdp->z_uid) {
2393 		owner = B_TRUE;
2394 		if (zdp->z_mode & S_IXUSR) {
2395 			rw_exit(&zdp->z_acl_lock);
2396 			return (0);
2397 		} else {
2398 			rw_exit(&zdp->z_acl_lock);
2399 			goto slow;
2400 		}
2401 	}
2402 	if (groupmember(zdp->z_gid, cr)) {
2403 		groupmbr = B_TRUE;
2404 		if (zdp->z_mode & S_IXGRP) {
2405 			rw_exit(&zdp->z_acl_lock);
2406 			return (0);
2407 		} else {
2408 			rw_exit(&zdp->z_acl_lock);
2409 			goto slow;
2410 		}
2411 	}
2412 	if (!owner && !groupmbr) {
2413 		if (zdp->z_mode & S_IXOTH) {
2414 			rw_exit(&zdp->z_acl_lock);
2415 			return (0);
2416 		}
2417 	}
2418 
2419 	rw_exit(&zdp->z_acl_lock);
2420 
2421 slow:
2422 	DTRACE_PROBE(zfs__fastpath__execute__access__miss);
2423 	ZFS_ENTER(zdp->z_zfsvfs);
2424 	error = zfs_zaccess(zdp, ACE_EXECUTE, 0, skipaclchk, cr);
2425 	ZFS_EXIT(zdp->z_zfsvfs);
2426 	return (error);
2427 }
2428 
2429 /*
2430  * Determine whether Access should be granted/denied.
2431  *
2432  * The least priv subsystem is always consulted as a basic privilege
2433  * can define any form of access.
2434  */
2435 int
2436 zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
2437 {
2438 	uint32_t	working_mode;
2439 	int		error;
2440 	int		is_attr;
2441 	boolean_t	check_privs;
2442 	znode_t		*xzp;
2443 	znode_t		*check_zp = zp;
2444 	mode_t		needed_bits;
2445 	uid_t		owner;
2446 
2447 	is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
2448 
2449 	/*
2450 	 * If attribute then validate against base file
2451 	 */
2452 	if (is_attr) {
2453 		uint64_t	parent;
2454 
2455 		if ((error = sa_lookup(zp->z_sa_hdl,
2456 		    SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
2457 		    sizeof (parent))) != 0)
2458 			return (error);
2459 
2460 		if ((error = zfs_zget(zp->z_zfsvfs,
2461 		    parent, &xzp)) != 0)	{
2462 			return (error);
2463 		}
2464 
2465 		check_zp = xzp;
2466 
2467 		/*
2468 		 * fixup mode to map to xattr perms
2469 		 */
2470 
2471 		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
2472 			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
2473 			mode |= ACE_WRITE_NAMED_ATTRS;
2474 		}
2475 
2476 		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
2477 			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
2478 			mode |= ACE_READ_NAMED_ATTRS;
2479 		}
2480 	}
2481 
2482 	owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
2483 	/*
2484 	 * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
2485 	 * in needed_bits.  Map the bits mapped by working_mode (currently
2486 	 * missing) in missing_bits.
2487 	 * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
2488 	 * needed_bits.
2489 	 */
2490 	needed_bits = 0;
2491 
2492 	working_mode = mode;
2493 
2494 	if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
2495 	    owner == crgetuid(cr))
2496 		working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
2497 
2498 	if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
2499 	    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
2500 		needed_bits |= VREAD;
2501 	if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
2502 	    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
2503 		needed_bits |= VWRITE;
2504 	if (working_mode & ACE_EXECUTE)
2505 		needed_bits |= VEXEC;
2506 
2507 	if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
2508 	    &check_privs, skipaclchk, cr)) == 0) {
2509 		if (is_attr)
2510 			VN_RELE(ZTOV(xzp));
2511 		return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
2512 		    needed_bits, needed_bits));
2513 	}
2514 
2515 	if (error && !check_privs) {
2516 		if (is_attr)
2517 			VN_RELE(ZTOV(xzp));
2518 		return (error);
2519 	}
2520 
2521 	if (error && (flags & V_APPEND)) {
2522 		error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
2523 	}
2524 
2525 	if (error && check_privs) {
2526 		mode_t		checkmode = 0;
2527 
2528 		/*
2529 		 * First check for implicit owner permission on
2530 		 * read_acl/read_attributes
2531 		 */
2532 
2533 		error = 0;
2534 		ASSERT(working_mode != 0);
2535 
2536 		if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
2537 		    owner == crgetuid(cr)))
2538 			working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
2539 
2540 		if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
2541 		    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
2542 			checkmode |= VREAD;
2543 		if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
2544 		    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
2545 			checkmode |= VWRITE;
2546 		if (working_mode & ACE_EXECUTE)
2547 			checkmode |= VEXEC;
2548 
2549 		error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
2550 		    needed_bits & ~checkmode, needed_bits);
2551 
2552 		if (error == 0 && (working_mode & ACE_WRITE_OWNER))
2553 			error = secpolicy_vnode_chown(cr, owner);
2554 		if (error == 0 && (working_mode & ACE_WRITE_ACL))
2555 			error = secpolicy_vnode_setdac3(cr, owner,
2556 			    zp->z_zfsvfs->z_acl_implicit);
2557 
2558 		if (error == 0 && (working_mode &
2559 		    (ACE_DELETE|ACE_DELETE_CHILD)))
2560 			error = secpolicy_vnode_remove(cr);
2561 
2562 		if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
2563 			error = secpolicy_vnode_chown(cr, owner);
2564 		}
2565 		if (error == 0) {
2566 			/*
2567 			 * See if any bits other than those already checked
2568 			 * for are still present.  If so then return EACCES
2569 			 */
2570 			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
2571 				error = SET_ERROR(EACCES);
2572 			}
2573 		}
2574 	} else if (error == 0) {
2575 		error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
2576 		    needed_bits, needed_bits);
2577 	}
2578 
2579 
2580 	if (is_attr)
2581 		VN_RELE(ZTOV(xzp));
2582 
2583 	return (error);
2584 }
2585 
2586 /*
2587  * Translate traditional unix VREAD/VWRITE/VEXEC mode into
2588  * native ACL format and call zfs_zaccess()
2589  */
2590 int
2591 zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
2592 {
2593 	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
2594 }
2595 
2596 /*
2597  * Access function for secpolicy_vnode_setattr
2598  */
2599 int
2600 zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
2601 {
2602 	int v4_mode = zfs_unix_to_v4(mode >> 6);
2603 
2604 	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
2605 }
2606 
2607 /* See zfs_zaccess_delete() */
2608 int zfs_write_implies_delete_child = 1;
2609 
2610 /*
2611  * Determine whether delete access should be granted.
2612  *
2613  * The following chart outlines how we handle delete permissions which is
2614  * how recent versions of windows (Windows 2008) handles it.  The efficiency
2615  * comes from not having to check the parent ACL where the object itself grants
2616  * delete:
2617  *
2618  *      -------------------------------------------------------
2619  *      |   Parent Dir  |      Target Object Permissions      |
2620  *      |  permissions  |                                     |
2621  *      -------------------------------------------------------
2622  *      |               | ACL Allows | ACL Denies| Delete     |
2623  *      |               |  Delete    |  Delete   | unspecified|
2624  *      -------------------------------------------------------
2625  *      | ACL Allows    | Permit     | Deny *    | Permit     |
2626  *      | DELETE_CHILD  |            |           |            |
2627  *      -------------------------------------------------------
2628  *      | ACL Denies    | Permit     | Deny      | Deny       |
2629  *      | DELETE_CHILD  |            |           |            |
2630  *      -------------------------------------------------------
2631  *      | ACL specifies |            |           |            |
2632  *      | only allow    | Permit     | Deny *    | Permit     |
2633  *      | write and     |            |           |            |
2634  *      | execute       |            |           |            |
2635  *      -------------------------------------------------------
2636  *      | ACL denies    |            |           |            |
2637  *      | write and     | Permit     | Deny      | Deny       |
2638  *      | execute       |            |           |            |
2639  *      -------------------------------------------------------
2640  *         ^
2641  *         |
2642  *         Re. execute permission on the directory:  if that's missing,
2643  *	   the vnode lookup of the target will fail before we get here.
2644  *
2645  * Re [*] in the table above:  NFSv4 would normally Permit delete for
2646  * these two cells of the matrix.
2647  * See acl.h for notes on which ACE_... flags should be checked for which
2648  * operations.  Specifically, the NFSv4 committee recommendation is in
2649  * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs
2650  * should take precedence ahead of ALLOW ACEs.
2651  *
2652  * This implementation always consults the target object's ACL first.
2653  * If a DENY ACE is present on the target object that specifies ACE_DELETE,
2654  * delete access is denied.  If an ALLOW ACE with ACE_DELETE is present on
2655  * the target object, access is allowed.  If and only if no entries with
2656  * ACE_DELETE are present in the object's ACL, check the container's ACL
2657  * for entries with ACE_DELETE_CHILD.
2658  *
2659  * A summary of the logic implemented from the table above is as follows:
2660  *
2661  * First check for DENY ACEs that apply.
2662  * If either target or container has a deny, EACCES.
2663  *
2664  * Delete access can then be summarized as follows:
2665  * 1: The object to be deleted grants ACE_DELETE, or
2666  * 2: The containing directory grants ACE_DELETE_CHILD.
2667  * In a Windows system, that would be the end of the story.
2668  * In this system, (2) has some complications...
2669  * 2a: "sticky" bit on a directory adds restrictions, and
2670  * 2b: existing ACEs from previous versions of ZFS may
2671  * not carry ACE_DELETE_CHILD where they should, so we
2672  * also allow delete when ACE_WRITE_DATA is granted.
2673  *
2674  * Note: 2b is technically a work-around for a prior bug,
2675  * which hopefully can go away some day.  For those who
2676  * no longer need the work around, and for testing, this
2677  * work-around is made conditional via the tunable:
2678  * zfs_write_implies_delete_child
2679  */
2680 int
2681 zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
2682 {
2683 	uint32_t wanted_dirperms;
2684 	uint32_t dzp_working_mode = 0;
2685 	uint32_t zp_working_mode = 0;
2686 	int dzp_error, zp_error;
2687 	boolean_t dzpcheck_privs;
2688 	boolean_t zpcheck_privs;
2689 
2690 	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
2691 		return (SET_ERROR(EPERM));
2692 
2693 	/*
2694 	 * Case 1:
2695 	 * If target object grants ACE_DELETE then we are done.  This is
2696 	 * indicated by a return value of 0.  For this case we don't worry
2697 	 * about the sticky bit because sticky only applies to the parent
2698 	 * directory and this is the child access result.
2699 	 *
2700 	 * If we encounter a DENY ACE here, we're also done (EACCES).
2701 	 * Note that if we hit a DENY ACE here (on the target) it should
2702 	 * take precedence over a DENY ACE on the container, so that when
2703 	 * we have more complete auditing support we will be able to
2704 	 * report an access failure against the specific target.
2705 	 * (This is part of why we're checking the target first.)
2706 	 */
2707 	zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
2708 	    &zpcheck_privs, B_FALSE, cr);
2709 	if (zp_error == EACCES) {
2710 		/* We hit a DENY ACE. */
2711 		if (!zpcheck_privs)
2712 			return (SET_ERROR(zp_error));
2713 		return (secpolicy_vnode_remove(cr));
2714 
2715 	}
2716 	if (zp_error == 0)
2717 		return (0);
2718 
2719 	/*
2720 	 * Case 2:
2721 	 * If the containing directory grants ACE_DELETE_CHILD,
2722 	 * or we're in backward compatibility mode and the
2723 	 * containing directory has ACE_WRITE_DATA, allow.
2724 	 * Case 2b is handled with wanted_dirperms.
2725 	 */
2726 	wanted_dirperms = ACE_DELETE_CHILD;
2727 	if (zfs_write_implies_delete_child)
2728 		wanted_dirperms |= ACE_WRITE_DATA;
2729 	dzp_error = zfs_zaccess_common(dzp, wanted_dirperms,
2730 	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
2731 	if (dzp_error == EACCES) {
2732 		/* We hit a DENY ACE. */
2733 		if (!dzpcheck_privs)
2734 			return (SET_ERROR(dzp_error));
2735 		return (secpolicy_vnode_remove(cr));
2736 	}
2737 
2738 	/*
2739 	 * Cases 2a, 2b (continued)
2740 	 *
2741 	 * Note: dzp_working_mode now contains any permissions
2742 	 * that were NOT granted.  Therefore, if any of the
2743 	 * wanted_dirperms WERE granted, we will have:
2744 	 *   dzp_working_mode != wanted_dirperms
2745 	 * We're really asking if ANY of those permissions
2746 	 * were granted, and if so, grant delete access.
2747 	 */
2748 	if (dzp_working_mode != wanted_dirperms)
2749 		dzp_error = 0;
2750 
2751 	/*
2752 	 * dzp_error is 0 if the container granted us permissions to "modify".
2753 	 * If we do not have permission via one or more ACEs, our current
2754 	 * privileges may still permit us to modify the container.
2755 	 *
2756 	 * dzpcheck_privs is false when i.e. the FS is read-only.
2757 	 * Otherwise, do privilege checks for the container.
2758 	 */
2759 	if (dzp_error != 0 && dzpcheck_privs) {
2760 		uid_t owner;
2761 
2762 		/*
2763 		 * The secpolicy call needs the requested access and
2764 		 * the current access mode of the container, but it
2765 		 * only knows about Unix-style modes (VEXEC, VWRITE),
2766 		 * so this must condense the fine-grained ACE bits into
2767 		 * Unix modes.
2768 		 *
2769 		 * The VEXEC flag is easy, because we know that has
2770 		 * always been checked before we get here (during the
2771 		 * lookup of the target vnode).  The container has not
2772 		 * granted us permissions to "modify", so we do not set
2773 		 * the VWRITE flag in the current access mode.
2774 		 */
2775 		owner = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr,
2776 		    ZFS_OWNER);
2777 		dzp_error = secpolicy_vnode_access2(cr, ZTOV(dzp),
2778 		    owner, VEXEC, VWRITE|VEXEC);
2779 	}
2780 	if (dzp_error != 0) {
2781 		/*
2782 		 * Note: We may have dzp_error = -1 here (from
2783 		 * zfs_zacess_common).  Don't return that.
2784 		 */
2785 		return (SET_ERROR(EACCES));
2786 	}
2787 
2788 	/*
2789 	 * At this point, we know that the directory permissions allow
2790 	 * us to modify, but we still need to check for the additional
2791 	 * restrictions that apply when the "sticky bit" is set.
2792 	 *
2793 	 * Yes, zfs_sticky_remove_access() also checks this bit, but
2794 	 * checking it here and skipping the call below is nice when
2795 	 * you're watching all of this with dtrace.
2796 	 */
2797 	if ((dzp->z_mode & S_ISVTX) == 0)
2798 		return (0);
2799 
2800 	/*
2801 	 * zfs_sticky_remove_access will succeed if:
2802 	 * 1. The sticky bit is absent.
2803 	 * 2. We pass the sticky bit restrictions.
2804 	 * 3. We have privileges that always allow file removal.
2805 	 */
2806 	return (zfs_sticky_remove_access(dzp, zp, cr));
2807 }
2808 
2809 int
2810 zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
2811     znode_t *tzp, cred_t *cr)
2812 {
2813 	int add_perm;
2814 	int error;
2815 
2816 	if (szp->z_pflags & ZFS_AV_QUARANTINED)
2817 		return (SET_ERROR(EACCES));
2818 
2819 	add_perm = (ZTOV(szp)->v_type == VDIR) ?
2820 	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
2821 
2822 	/*
2823 	 * Rename permissions are combination of delete permission +
2824 	 * add file/subdir permission.
2825 	 */
2826 
2827 	/*
2828 	 * first make sure we do the delete portion.
2829 	 *
2830 	 * If that succeeds then check for add_file/add_subdir permissions
2831 	 */
2832 
2833 	if (error = zfs_zaccess_delete(sdzp, szp, cr))
2834 		return (error);
2835 
2836 	/*
2837 	 * If we have a tzp, see if we can delete it?
2838 	 */
2839 	if (tzp) {
2840 		if (error = zfs_zaccess_delete(tdzp, tzp, cr))
2841 			return (error);
2842 	}
2843 
2844 	/*
2845 	 * Now check for add permissions
2846 	 */
2847 	error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
2848 
2849 	return (error);
2850 }
2851