xref: /illumos-gate/usr/src/lib/brand/solaris10/s10_brand/common/s10_brand.c (revision 4a344fefc11f3e4396164d49b41397979db382c2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <errno.h>
27 #include <fcntl.h>
28 #include <dirent.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <strings.h>
33 #include <unistd.h>
34 #include <thread.h>
35 #include <sys/auxv.h>
36 #include <sys/brand.h>
37 #include <sys/inttypes.h>
38 #include <sys/lwp.h>
39 #include <sys/syscall.h>
40 #include <sys/systm.h>
41 #include <sys/utsname.h>
42 #include <sys/sysconfig.h>
43 #include <sys/systeminfo.h>
44 #include <sys/zone.h>
45 #include <sys/stat.h>
46 #include <sys/mntent.h>
47 #include <sys/ctfs.h>
48 #include <sys/priv.h>
49 #include <sys/acctctl.h>
50 #include <libgen.h>
51 #include <bsm/audit.h>
52 #include <sys/crypto/ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/zfs_ioctl.h>
55 #include <sys/ucontext.h>
56 #include <sys/mntio.h>
57 #include <sys/mnttab.h>
58 #include <sys/attr.h>
59 #include <atomic.h>
60 #include <sys/acl.h>
61 
62 #include <s10_brand.h>
63 #include <brand_misc.h>
64 #include <s10_misc.h>
65 #include <s10_signal.h>
66 
67 /*
68  * See usr/src/lib/brand/shared/brand/common/brand_util.c for general
69  * emulation notes.
70  */
71 
/* Zone ID; not referenced in the part of the file that precedes the ACL code. */
static zoneid_t zoneid;
/* NOTE(review): presumably set when a global zone is being emulated -- confirm. */
static boolean_t emul_global_zone = B_FALSE;
/* Bitmap of emulated features; tested by S10_FEATURE_IS_PRESENT(). */
static s10_emul_bitmap_t emul_bitmap;
/* PID of the zone's init process; filled in by get_initpid_info(). */
pid_t zone_init_pid;
76 
77 /*
78  * S10_FEATURE_IS_PRESENT is a macro that helps facilitate conditional
79  * emulation.  For each constant N defined in the s10_emulated_features
80  * enumeration in usr/src/uts/common/brand/solaris10/s10_brand.h,
81  * S10_FEATURE_IS_PRESENT(N) is true iff the feature/backport represented by N
82  * is present in the Solaris 10 image hosted within the zone.  In other words,
83  * S10_FEATURE_IS_PRESENT(N) is true iff the file /usr/lib/brand/solaris10/M,
84  * where M is the enum value of N, was present in the zone when the zone booted.
85  *
86  *
87  * *** Sample Usage
88  *
89  * Suppose that you need to backport a fix to Solaris 10 and there is
90  * emulation in place for the fix.  Suppose further that the emulation won't be
91  * needed if the fix is backported (i.e., if the fix is present in the hosted
92  * Solaris 10 environment, then the brand won't need the emulation).  Then if
93  * you add a constant named "S10_FEATURE_X" to the end of the
94  * s10_emulated_features enumeration that represents the backported fix and
95  * S10_FEATURE_X evaluates to four, then you should create a file named
96  * /usr/lib/brand/solaris10/4 as part of your backport.  Additionally, you
97  * should retain the aforementioned emulation but modify it so that it's
98  * performed only when S10_FEATURE_IS_PRESENT(S10_FEATURE_X) is false.  Thus the
99  * emulation function should look something like the following:
100  *
101  *	static int
102  *	my_emul_function(sysret_t *rv, ...)
103  *	{
104  *		if (S10_FEATURE_IS_PRESENT(S10_FEATURE_X)) {
105  *			// Don't emulate
106  *			return (__systemcall(rv, ...));
107  *		} else {
108  *			// Emulate whatever needs to be emulated when the
109  *			// backport isn't present in the Solaris 10 image.
110  *		}
111  *	}
112  */
/*
 * Feature constant N is stored as bit (N & 0x7) of byte emul_bitmap[N >> 3].
 * The argument is expanded twice, so it must be free of side effects.
 */
#define	S10_FEATURE_IS_PRESENT(s10_emulated_features_constant)	\
	((emul_bitmap[(s10_emulated_features_constant) >> 3] &	\
	(1 << ((s10_emulated_features_constant) & 0x7))) != 0)
116 
/*
 * Forward declaration of the system call table; no size or initializer is
 * given at this point in the file.
 */
brand_sysent_table_t brand_sysent_table[];

/*
 * Release and version strings.
 * NOTE(review): presumably what the brand reports to Solaris 10 processes
 * that query system identity -- confirm against the consumers of these macros.
 */
#define	S10_UTS_RELEASE	"5.10"
#define	S10_UTS_VERSION	"Generic_Virtual"
121 
122 /*
123  * Figures out the PID of init for the zone.  Also returns a boolean
124  * indicating whether this process currently has that pid: if so,
125  * then at this moment, we are init.
126  */
127 static boolean_t
128 get_initpid_info(void)
129 {
130 	pid_t pid;
131 	sysret_t rval;
132 	int err;
133 
134 	/*
135 	 * Determine the current process PID and the PID of the zone's init.
136 	 * We use care not to call getpid() here, because we're not supposed
137 	 * to call getpid() until after the program is fully linked-- the
138 	 * first call to getpid() is a signal from the linker to debuggers
139 	 * that linking has been completed.
140 	 */
141 	if ((err = __systemcall(&rval, SYS_brand,
142 	    B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) {
143 		brand_abort(err, "Failed to get init's pid");
144 	}
145 
146 	/*
147 	 * Note that we need to be cautious with the pid we get back--
148 	 * it should not be stashed and used in place of getpid(), since
149 	 * we might fork(2).  So we keep zone_init_pid and toss the pid
150 	 * we otherwise got.
151 	 */
152 	if (pid == zone_init_pid)
153 		return (B_TRUE);
154 
155 	return (B_FALSE);
156 }
157 
158 /* Free the thread-local storage provided by mntfs_get_mntentbuf(). */
159 static void
160 mntfs_free_mntentbuf(void *arg)
161 {
162 	struct mntentbuf *embufp = arg;
163 
164 	if (embufp == NULL)
165 		return;
166 	if (embufp->mbuf_emp)
167 		free(embufp->mbuf_emp);
168 	if (embufp->mbuf_buf)
169 		free(embufp->mbuf_buf);
170 	bzero(embufp, sizeof (struct mntentbuf));
171 	free(embufp);
172 }
173 
174 /* Provide the thread-local storage required by mntfs_ioctl(). */
175 static struct mntentbuf *
176 mntfs_get_mntentbuf(size_t size)
177 {
178 	static mutex_t keylock;
179 	static thread_key_t key;
180 	static int once_per_keyname = 0;
181 	void *tsd = NULL;
182 	struct mntentbuf *embufp;
183 
184 	/* Create the key. */
185 	if (!once_per_keyname) {
186 		(void) mutex_lock(&keylock);
187 		if (!once_per_keyname) {
188 			if (thr_keycreate(&key, mntfs_free_mntentbuf)) {
189 				(void) mutex_unlock(&keylock);
190 				return (NULL);
191 			} else {
192 				once_per_keyname++;
193 			}
194 		}
195 		(void) mutex_unlock(&keylock);
196 	}
197 
198 	/*
199 	 * The thread-specific datum for this key is the address of a struct
200 	 * mntentbuf. If this is the first time here then we allocate the struct
201 	 * and its contents, and associate its address with the thread; if there
202 	 * are any problems then we abort.
203 	 */
204 	if (thr_getspecific(key, &tsd))
205 		return (NULL);
206 	if (tsd == NULL) {
207 		if (!(embufp = calloc(1, sizeof (struct mntentbuf))) ||
208 		    !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) ||
209 		    thr_setspecific(key, embufp)) {
210 			mntfs_free_mntentbuf(embufp);
211 			return (NULL);
212 		}
213 	} else {
214 		embufp = tsd;
215 	}
216 
217 	/* Return the buffer, resizing it if necessary. */
218 	if (size > embufp->mbuf_bufsize) {
219 		if (embufp->mbuf_buf)
220 			free(embufp->mbuf_buf);
221 		if ((embufp->mbuf_buf = malloc(size)) == NULL) {
222 			embufp->mbuf_bufsize = 0;
223 			return (NULL);
224 		} else {
225 			embufp->mbuf_bufsize = size;
226 		}
227 	}
228 	return (embufp);
229 }
230 
231 /*
232  * The MNTIOC_GETMNTENT command in this release differs from that in early
233  * versions of Solaris 10.
234  *
235  * Previously, the command would copy a pointer to a struct extmnttab to an
236  * address provided as an argument. The pointer would be somewhere within a
237  * mapping already present within the user's address space. In addition, the
238  * text to which the struct's members pointed would also be within a
239  * pre-existing mapping. Now, the user is required to allocate memory for both
240  * the struct and the text buffer, and to pass the address of each within a
241  * struct mntentbuf. In order to conceal these details from a Solaris 10 client
242  * we allocate some thread-local storage in which to create the necessary data
243  * structures; this is static, thread-safe memory that will be cleaned up
244  * without the caller's intervention.
245  *
246  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should
247  * not work for older clients.
248  */
249 int
250 mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
251 {
252 	int err;
253 	struct stat statbuf;
254 	struct mntentbuf *embufp;
255 	static size_t bufsize = MNT_LINE_MAX;
256 
257 	/* Do not emulate mntfs commands from up-to-date clients. */
258 	if (S10_FEATURE_IS_PRESENT(S10_FEATURE_ALTERED_MNTFS_IOCTL))
259 		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
260 
261 	/* Do not emulate mntfs commands directed at other file systems. */
262 	if ((err = __systemcall(rval, SYS_fstatat + 1024,
263 	    fdes, NULL, &statbuf, 0)) != 0)
264 		return (err);
265 	if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0)
266 		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
267 
268 	if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY)
269 		return (EINVAL);
270 
271 	if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
272 		return (ENOMEM);
273 
274 	/*
275 	 * MNTIOC_GETEXTMNTENT advances the file pointer once it has
276 	 * successfully copied out the result to the address provided. We
277 	 * therefore need to check the user-supplied address now since the
278 	 * one we'll be providing is guaranteed to work.
279 	 */
280 	if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
281 		return (EFAULT);
282 
283 	/*
284 	 * Keep retrying for as long as we fail for want of a large enough
285 	 * buffer.
286 	 */
287 	for (;;) {
288 		if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
289 		    MNTIOC_GETEXTMNTENT, embufp)) != 0)
290 			return (err);
291 
292 		if (rval->sys_rval1 == MNTFS_TOOLONG) {
293 			/* The buffer wasn't large enough. */
294 			(void) atomic_swap_ulong((unsigned long *)&bufsize,
295 			    2 * embufp->mbuf_bufsize);
296 			if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
297 				return (ENOMEM);
298 		} else {
299 			break;
300 		}
301 	}
302 
303 	if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
304 		return (EFAULT);
305 
306 	return (0);
307 }
308 
309 /*
310  * Assign the structure member value from the s (source) structure to the
311  * d (dest) structure.
312  */
/*
 * val is pasted after a '.' on both sides, so it may be a nested member path
 * such as fl_list.fl_digest (see crypto_ioctl()).
 */
#define	struct_assign(d, s, val)	(((d).val) = ((s).val))
314 
315 /*
316  * The CRYPTO_GET_FUNCTION_LIST parameter structure crypto_function_list_t
317  * changed between S10 and Nevada, so we have to emulate the old S10
318  * crypto_function_list_t structure when interposing on the ioctl syscall.
319  */
/*
 * Solaris 10 layout of the per-provider function list.  crypto_ioctl() fills
 * every member from the member of the same name in the native structure, so
 * member order and types here define what the S10 caller sees.
 */
typedef struct s10_crypto_function_list {
	/* fl_digest_* members */
	boolean_t fl_digest_init;
	boolean_t fl_digest;
	boolean_t fl_digest_update;
	boolean_t fl_digest_key;
	boolean_t fl_digest_final;

	/* fl_encrypt_* members */
	boolean_t fl_encrypt_init;
	boolean_t fl_encrypt;
	boolean_t fl_encrypt_update;
	boolean_t fl_encrypt_final;

	/* fl_decrypt_* members */
	boolean_t fl_decrypt_init;
	boolean_t fl_decrypt;
	boolean_t fl_decrypt_update;
	boolean_t fl_decrypt_final;

	/* fl_mac_* members */
	boolean_t fl_mac_init;
	boolean_t fl_mac;
	boolean_t fl_mac_update;
	boolean_t fl_mac_final;

	/* fl_sign_* members */
	boolean_t fl_sign_init;
	boolean_t fl_sign;
	boolean_t fl_sign_update;
	boolean_t fl_sign_final;
	boolean_t fl_sign_recover_init;
	boolean_t fl_sign_recover;

	/* fl_verify_* members */
	boolean_t fl_verify_init;
	boolean_t fl_verify;
	boolean_t fl_verify_update;
	boolean_t fl_verify_final;
	boolean_t fl_verify_recover_init;
	boolean_t fl_verify_recover;

	/* combined-operation members */
	boolean_t fl_digest_encrypt_update;
	boolean_t fl_decrypt_digest_update;
	boolean_t fl_sign_encrypt_update;
	boolean_t fl_decrypt_verify_update;

	/* random-number members */
	boolean_t fl_seed_random;
	boolean_t fl_generate_random;

	/* fl_session_* members */
	boolean_t fl_session_open;
	boolean_t fl_session_close;
	boolean_t fl_session_login;
	boolean_t fl_session_logout;

	/* fl_object_* members */
	boolean_t fl_object_create;
	boolean_t fl_object_copy;
	boolean_t fl_object_destroy;
	boolean_t fl_object_get_size;
	boolean_t fl_object_get_attribute_value;
	boolean_t fl_object_set_attribute_value;
	boolean_t fl_object_find_init;
	boolean_t fl_object_find;
	boolean_t fl_object_find_final;

	/* fl_key_* members */
	boolean_t fl_key_generate;
	boolean_t fl_key_generate_pair;
	boolean_t fl_key_wrap;
	boolean_t fl_key_unwrap;
	boolean_t fl_key_derive;

	/* token and PIN members */
	boolean_t fl_init_token;
	boolean_t fl_init_pin;
	boolean_t fl_set_pin;

	/*
	 * prov_* members.
	 * NOTE(review): presumably the provider's hash size limits -- confirm
	 * against the native crypto_function_list_t definition.
	 */
	boolean_t prov_is_hash_limited;
	uint32_t prov_hash_threshold;
	uint32_t prov_hash_limit;
} s10_crypto_function_list_t;
393 
/*
 * Solaris 10 layout of the CRYPTO_GET_FUNCTION_LIST ioctl argument, as read
 * from and written back to the caller by crypto_ioctl().
 */
typedef struct s10_crypto_get_function_list {
	/* Copied back from the native result. */
	uint_t				fl_return_value;
	/* The only member forwarded to the native ioctl; also copied back. */
	crypto_provider_id_t		fl_provider_id;
	/* Filled member by member from the native function list. */
	s10_crypto_function_list_t	fl_list;
} s10_crypto_get_function_list_t;
399 
400 /*
401  * The structure returned by the CRYPTO_GET_FUNCTION_LIST ioctl on /dev/crypto
402  * increased in size due to:
403  *	6482533 Threshold for HW offload via PKCS11 interface
404  * between S10 and Nevada.  This is a relatively simple process of filling
405  * in the S10 structure fields with the Nevada data.
406  *
407  * We stat the device to make sure that the ioctl is meant for /dev/crypto.
408  *
409  */
410 static int
411 crypto_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
412 {
413 	int				err;
414 	s10_crypto_get_function_list_t	s10_param;
415 	crypto_get_function_list_t	native_param;
416 	static dev_t			crypto_dev = (dev_t)-1;
417 	struct stat			sbuf;
418 
419 	if (crypto_dev == (dev_t)-1) {
420 		if ((err = __systemcall(rval, SYS_fstatat + 1024,
421 		    AT_FDCWD, "/dev/crypto", &sbuf, 0)) != 0)
422 			goto nonemuioctl;
423 		crypto_dev = major(sbuf.st_rdev);
424 	}
425 	if ((err = __systemcall(rval, SYS_fstatat + 1024,
426 	    fdes, NULL, &sbuf, 0)) != 0)
427 		return (err);
428 	/* Each open fd of /dev/crypto gets a new minor device. */
429 	if (major(sbuf.st_rdev) != crypto_dev)
430 		goto nonemuioctl;
431 
432 	if (brand_uucopy((const void *)arg, &s10_param, sizeof (s10_param))
433 	    != 0)
434 		return (EFAULT);
435 	struct_assign(native_param, s10_param, fl_provider_id);
436 	if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd,
437 	    &native_param)) != 0)
438 		return (err);
439 
440 	struct_assign(s10_param, native_param, fl_return_value);
441 	struct_assign(s10_param, native_param, fl_provider_id);
442 
443 	struct_assign(s10_param, native_param, fl_list.fl_digest_init);
444 	struct_assign(s10_param, native_param, fl_list.fl_digest);
445 	struct_assign(s10_param, native_param, fl_list.fl_digest_update);
446 	struct_assign(s10_param, native_param, fl_list.fl_digest_key);
447 	struct_assign(s10_param, native_param, fl_list.fl_digest_final);
448 
449 	struct_assign(s10_param, native_param, fl_list.fl_encrypt_init);
450 	struct_assign(s10_param, native_param, fl_list.fl_encrypt);
451 	struct_assign(s10_param, native_param, fl_list.fl_encrypt_update);
452 	struct_assign(s10_param, native_param, fl_list.fl_encrypt_final);
453 
454 	struct_assign(s10_param, native_param, fl_list.fl_decrypt_init);
455 	struct_assign(s10_param, native_param, fl_list.fl_decrypt);
456 	struct_assign(s10_param, native_param, fl_list.fl_decrypt_update);
457 	struct_assign(s10_param, native_param, fl_list.fl_decrypt_final);
458 
459 	struct_assign(s10_param, native_param, fl_list.fl_mac_init);
460 	struct_assign(s10_param, native_param, fl_list.fl_mac);
461 	struct_assign(s10_param, native_param, fl_list.fl_mac_update);
462 	struct_assign(s10_param, native_param, fl_list.fl_mac_final);
463 
464 	struct_assign(s10_param, native_param, fl_list.fl_sign_init);
465 	struct_assign(s10_param, native_param, fl_list.fl_sign);
466 	struct_assign(s10_param, native_param, fl_list.fl_sign_update);
467 	struct_assign(s10_param, native_param, fl_list.fl_sign_final);
468 	struct_assign(s10_param, native_param, fl_list.fl_sign_recover_init);
469 	struct_assign(s10_param, native_param, fl_list.fl_sign_recover);
470 
471 	struct_assign(s10_param, native_param, fl_list.fl_verify_init);
472 	struct_assign(s10_param, native_param, fl_list.fl_verify);
473 	struct_assign(s10_param, native_param, fl_list.fl_verify_update);
474 	struct_assign(s10_param, native_param, fl_list.fl_verify_final);
475 	struct_assign(s10_param, native_param, fl_list.fl_verify_recover_init);
476 	struct_assign(s10_param, native_param, fl_list.fl_verify_recover);
477 
478 	struct_assign(s10_param, native_param,
479 	    fl_list.fl_digest_encrypt_update);
480 	struct_assign(s10_param, native_param,
481 	    fl_list.fl_decrypt_digest_update);
482 	struct_assign(s10_param, native_param, fl_list.fl_sign_encrypt_update);
483 	struct_assign(s10_param, native_param,
484 	    fl_list.fl_decrypt_verify_update);
485 
486 	struct_assign(s10_param, native_param, fl_list.fl_seed_random);
487 	struct_assign(s10_param, native_param, fl_list.fl_generate_random);
488 
489 	struct_assign(s10_param, native_param, fl_list.fl_session_open);
490 	struct_assign(s10_param, native_param, fl_list.fl_session_close);
491 	struct_assign(s10_param, native_param, fl_list.fl_session_login);
492 	struct_assign(s10_param, native_param, fl_list.fl_session_logout);
493 
494 	struct_assign(s10_param, native_param, fl_list.fl_object_create);
495 	struct_assign(s10_param, native_param, fl_list.fl_object_copy);
496 	struct_assign(s10_param, native_param, fl_list.fl_object_destroy);
497 	struct_assign(s10_param, native_param, fl_list.fl_object_get_size);
498 	struct_assign(s10_param, native_param,
499 	    fl_list.fl_object_get_attribute_value);
500 	struct_assign(s10_param, native_param,
501 	    fl_list.fl_object_set_attribute_value);
502 	struct_assign(s10_param, native_param, fl_list.fl_object_find_init);
503 	struct_assign(s10_param, native_param, fl_list.fl_object_find);
504 	struct_assign(s10_param, native_param, fl_list.fl_object_find_final);
505 
506 	struct_assign(s10_param, native_param, fl_list.fl_key_generate);
507 	struct_assign(s10_param, native_param, fl_list.fl_key_generate_pair);
508 	struct_assign(s10_param, native_param, fl_list.fl_key_wrap);
509 	struct_assign(s10_param, native_param, fl_list.fl_key_unwrap);
510 	struct_assign(s10_param, native_param, fl_list.fl_key_derive);
511 
512 	struct_assign(s10_param, native_param, fl_list.fl_init_token);
513 	struct_assign(s10_param, native_param, fl_list.fl_init_pin);
514 	struct_assign(s10_param, native_param, fl_list.fl_set_pin);
515 
516 	struct_assign(s10_param, native_param, fl_list.prov_is_hash_limited);
517 	struct_assign(s10_param, native_param, fl_list.prov_hash_threshold);
518 	struct_assign(s10_param, native_param, fl_list.prov_hash_limit);
519 
520 	return (brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param)));
521 
522 nonemuioctl:
523 	return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
524 }
525 
526 /*
527  * The process contract CT_TGET and CT_TSET parameter structure ct_param_t
528  * changed between S10 and Nevada, so we have to emulate the old S10
529  * ct_param_t structure when interposing on the ioctl syscall.
530  */
/* Solaris 10 layout of the CT_TGET/CT_TSET argument, as used by ctfs_ioctl(). */
typedef struct s10_ct_param {
	/* Copied into the native parameter's ctpm_id. */
	uint32_t ctpm_id;
	/* Not read by ctfs_ioctl(). */
	uint32_t ctpm_pad;
	/* The native parameter's ctpm_value points at this member. */
	uint64_t ctpm_value;
} s10_ct_param_t;
536 
537 /*
538  * We have to emulate process contract ioctls for init(1M) because the
539  * ioctl parameter structure changed between S10 and Nevada.  This is
540  * a relatively simple process of filling Nevada structure fields,
541  * shuffling values, and initiating a native system call.
542  *
543  * For now, we'll assume that all consumers of CT_TGET and CT_TSET will
544  * need emulation.  We'll issue a stat to make sure that the ioctl
545  * is meant for the contract file system.
546  *
547  */
548 static int
549 ctfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
550 {
551 	int err;
552 	s10_ct_param_t s10param;
553 	ct_param_t param;
554 	struct stat statbuf;
555 
556 	if ((err = __systemcall(rval, SYS_fstatat + 1024,
557 	    fdes, NULL, &statbuf, 0)) != 0)
558 		return (err);
559 	if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0)
560 		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
561 
562 	if (brand_uucopy((const void *)arg, &s10param, sizeof (s10param)) != 0)
563 		return (EFAULT);
564 	param.ctpm_id = s10param.ctpm_id;
565 	param.ctpm_size = sizeof (uint64_t);
566 	param.ctpm_value = &s10param.ctpm_value;
567 	if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &param))
568 	    != 0)
569 		return (err);
570 
571 	if (cmd == CT_TGET)
572 		return (brand_uucopy(&s10param, (void *)arg,
573 		    sizeof (s10param)));
574 
575 	return (0);
576 }
577 
578 /*
579  * ZFS ioctls have changed in each Solaris 10 (S10) release as well as in
580  * Solaris Next.  The brand wraps ZFS commands so that the native commands
581  * are used, but we want to be sure no command sneaks in that uses ZFS
582  * without our knowledge.  We'll abort the process if we see a ZFS ioctl.
583  */
584 static int
585 zfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
586 {
587 	dev_t		zfs_dev;
588 	struct stat	sbuf;
589 
590 	/*
591 	 * See if the ioctl is targeting the ZFS device, /dev/zfs.
592 	 * If it isn't, then s10_ioctl() mistook the ioctl for a ZFS ioctl.
593 	 * In that case, we don't want to abort, so we pass it along to the
594 	 * kernel.
595 	 */
596 	if (__systemcall(rval, SYS_fstatat + 1024, AT_FDCWD, ZFS_DEV, &sbuf, 0)
597 	    != 0)
598 		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
599 	zfs_dev = major(sbuf.st_rdev);
600 
601 	if (__systemcall(rval, SYS_fstatat + 1024, fdes, NULL, &sbuf, 0) != 0 ||
602 	    major(sbuf.st_rdev) != zfs_dev)
603 		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
604 
605 	brand_abort(0, "ZFS ioctl!");
606 	/*NOTREACHED*/
607 	return (0);
608 }
609 
610 int
611 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
612 {
613 	switch (cmd) {
614 	case CRYPTO_GET_FUNCTION_LIST:
615 		return (crypto_ioctl(rval, fdes, cmd, arg));
616 	case CT_TGET:
617 		/*FALLTHRU*/
618 	case CT_TSET:
619 		return (ctfs_ioctl(rval, fdes, cmd, arg));
620 	case MNTIOC_GETMNTENT:
621 		/*FALLTHRU*/
622 	case MNTIOC_GETEXTMNTENT:
623 		/*FALLTHRU*/
624 	case MNTIOC_GETMNTANY:
625 		return (mntfs_ioctl(rval, fdes, cmd, arg));
626 	}
627 
628 	if ((cmd & 0xff00) == ZFS_IOC)
629 		return (zfs_ioctl(rval, fdes, cmd, arg));
630 
631 	return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
632 }
633 
634 /*
635  * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when
636  * applied to files opened with O_APPEND.  The offset argument is ignored and
637  * the buffer is appended to the target file in S10, whereas the current file
638  * position is ignored in Nevada (i.e., pwrite() acts as though the target file
639  * wasn't opened with O_APPEND).  This is a result of the fix for CR 6655660
640  * (pwrite() must ignore the O_APPEND/FAPPEND flag).
641  *
642  * We emulate the old S10 pwrite() behavior by checking whether the target file
643  * was opened with O_APPEND.  If it was, then invoke the write() system call
644  * instead of pwrite(); otherwise, invoke the pwrite() system call as usual.
645  */
646 static int
647 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes,
648     off_t offset)
649 {
650 	int err;
651 
652 	if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0)
653 		return (err);
654 	if (rval->sys_rval1 & O_APPEND)
655 		return (__systemcall(rval, SYS_write + 1024, fd, bufferp,
656 		    num_bytes));
657 	return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes,
658 	    offset));
659 }
660 
#if !defined(_LP64)
/*
 * Large-file pwrite() for 32-bit processes.  The 64-bit offset arrives as two
 * 32-bit halves, which are forwarded unchanged.  The O_APPEND handling is the
 * same as in s10_pwrite(); see the comment above that function.
 */
static int
s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes,
    uint32_t offset_1, uint32_t offset_2)
{
	int error;

	error = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL);
	if (error != 0)
		return (error);

	/* Not opened for append: the native large-file pwrite() will do. */
	if ((rval->sys_rval1 & O_APPEND) == 0)
		return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp,
		    num_bytes, offset_1, offset_2));

	/* Opened for append: emulate S10 by appending with write(). */
	return (__systemcall(rval, SYS_write + 1024, fd, bufferp, num_bytes));
}
#endif	/* !_LP64 */
682 
683 /*
684  * These are convenience macros that s10_getdents_common() uses.  Both treat
685  * their arguments, which should be character pointers, as dirent pointers or
686  * dirent64 pointers and yield their d_name and d_reclen fields.  These
687  * macros shouldn't be used outside of s10_getdents_common().
688  */
/*
 * Both macros expand to expressions that use variables named name_offset and
 * reclen_offset, which must be in scope wherever the macros are used.
 * dirent_reclen() reads an unsigned short at the computed address.
 */
#define	dirent_name(charptr)	((charptr) + name_offset)
#define	dirent_reclen(charptr)	\
	(*(unsigned short *)(uintptr_t)((charptr) + reclen_offset))
692 
693 /*
694  * This function contains code that is common to both s10_getdents() and
695  * s10_getdents64().  See the comment above s10_getdents() for details.
696  *
697  * rval, fd, buf, and nbyte should be passed unmodified from s10_getdents()
698  * and s10_getdents64().  getdents_syscall_id should be either SYS_getdents
 * or SYS_getdents64.  name_offset should be the byte offset of
700  * the d_name field in the dirent structures passed to the kernel via the
701  * syscall represented by getdents_syscall_id.  reclen_offset should be
702  * the byte offset of the d_reclen field in the aforementioned dirent
703  * structures.
704  */
705 static int
706 s10_getdents_common(sysret_t *rval, int fd, char *buf, size_t nbyte,
707     int getdents_syscall_id, size_t name_offset, size_t reclen_offset)
708 {
709 	int err;
710 	size_t buf_size;
711 	char *local_buf;
712 	char *buf_current;
713 
714 	/*
715 	 * Use a special brand operation, B_S10_ISFDXATTRDIR, to determine
716 	 * whether the specified file descriptor refers to an extended file
717 	 * attribute directory.  If it doesn't, then SYS_getdents won't
718 	 * reveal extended file attributes, in which case we can simply
719 	 * hand the syscall to the native kernel.
720 	 */
721 	if ((err = __systemcall(rval, SYS_brand + 1024, B_S10_ISFDXATTRDIR,
722 	    fd)) != 0)
723 		return (err);
724 	if (rval->sys_rval1 == 0)
725 		return (__systemcall(rval, getdents_syscall_id + 1024, fd, buf,
726 		    nbyte));
727 
728 	/*
729 	 * The file descriptor refers to an extended file attributes directory.
730 	 * We need to create a dirent buffer that's as large as buf into which
731 	 * the native SYS_getdents will store the special extended file
732 	 * attribute directory's entries.  We can't dereference buf because
733 	 * it might be an invalid pointer!
734 	 */
735 	if (nbyte > MAXGETDENTS_SIZE)
736 		nbyte = MAXGETDENTS_SIZE;
737 	local_buf = (char *)malloc(nbyte);
738 	if (local_buf == NULL) {
739 		/*
740 		 * getdents(2) doesn't return an error code indicating a memory
741 		 * allocation error and it doesn't make sense to return any of
742 		 * its documented error codes for a malloc(3C) failure.  We'll
743 		 * use ENOMEM even though getdents(2) doesn't use it because it
744 		 * best describes the failure.
745 		 */
746 		(void) B_TRUSS_POINT_3(rval, getdents_syscall_id, ENOMEM, fd,
747 		    buf, nbyte);
748 		rval->sys_rval1 = -1;
749 		rval->sys_rval2 = 0;
750 		return (EIO);
751 	}
752 
753 	/*
754 	 * Issue a native SYS_getdents syscall but use our local dirent buffer
755 	 * instead of buf.  This will allow us to examine the returned dirent
756 	 * structures immediately and copy them to buf later.  That way the
757 	 * calling process won't be able to see the dirent structures until
758 	 * we finish examining them.
759 	 */
760 	if ((err = __systemcall(rval, getdents_syscall_id + 1024, fd, local_buf,
761 	    nbyte)) != 0) {
762 		free(local_buf);
763 		return (err);
764 	}
765 	buf_size = rval->sys_rval1;
766 	if (buf_size == 0) {
767 		free(local_buf);
768 		return (0);
769 	}
770 
771 	/*
772 	 * Look for SUNWattr_ro (VIEW_READONLY) and SUNWattr_rw
773 	 * (VIEW_READWRITE) in the directory entries and remove them
774 	 * from the dirent buffer.
775 	 */
776 	for (buf_current = local_buf;
777 	    (size_t)(buf_current - local_buf) < buf_size; /* cstyle */) {
778 		if (strcmp(dirent_name(buf_current), VIEW_READONLY) != 0 &&
779 		    strcmp(dirent_name(buf_current), VIEW_READWRITE) != 0) {
780 			/*
781 			 * The dirent refers to an attribute that should
782 			 * be visible to Solaris 10 processes.  Keep it
783 			 * and examine the next entry in the buffer.
784 			 */
785 			buf_current += dirent_reclen(buf_current);
786 		} else {
787 			/*
788 			 * We found either SUNWattr_ro (VIEW_READONLY)
789 			 * or SUNWattr_rw (VIEW_READWRITE).  Remove it
790 			 * from the dirent buffer by decrementing
791 			 * buf_size by the size of the entry and
792 			 * overwriting the entry with the remaining
793 			 * entries.
794 			 */
795 			buf_size -= dirent_reclen(buf_current);
796 			(void) memmove(buf_current, buf_current +
797 			    dirent_reclen(buf_current), buf_size -
798 			    (size_t)(buf_current - local_buf));
799 		}
800 	}
801 
802 	/*
803 	 * Copy local_buf into buf so that the calling process can see
804 	 * the results.
805 	 */
806 	if ((err = brand_uucopy(local_buf, buf, buf_size)) != 0) {
807 		free(local_buf);
808 		rval->sys_rval1 = -1;
809 		rval->sys_rval2 = 0;
810 		return (err);
811 	}
812 	rval->sys_rval1 = buf_size;
813 	free(local_buf);
814 	return (0);
815 }
816 
817 /*
818  * Solaris Next added two special extended file attributes, SUNWattr_ro and
819  * SUNWattr_rw, which are called "extended system attributes".  They have
820  * special semantics (e.g., a process cannot unlink SUNWattr_ro) and should
821  * not appear in solaris10-branded zones because no Solaris 10 applications,
822  * including system commands such as tar(1), are coded to correctly handle these
823  * special attributes.
824  *
825  * This emulation function solves the aforementioned problem by emulating
826  * the getdents(2) syscall and filtering both system attributes out of resulting
827  * directory entry lists.  The emulation function only filters results when
828  * the given file descriptor refers to an extended file attribute directory.
829  * Filtering getdents(2) results is expensive because it requires dynamic
830  * memory allocation; however, the performance cost is tolerable because
831  * we don't expect Solaris 10 processes to frequently examine extended file
832  * attribute directories.
833  *
834  * The brand's emulation library needs two getdents(2) emulation functions
835  * because getdents(2) comes in two flavors: non-largefile-aware getdents(2)
836  * and largefile-aware getdents64(2).  s10_getdents() handles the non-largefile-
837  * aware case for 32-bit processes and all getdents(2) syscalls for 64-bit
838  * processes (64-bit processes use largefile-aware interfaces by default).
839  * See s10_getdents64() below for the largefile-aware getdents64(2) emulation
840  * function for 32-bit processes.
841  */
842 static int
843 s10_getdents(sysret_t *rval, int fd, struct dirent *buf, size_t nbyte)
844 {
845 	return (s10_getdents_common(rval, fd, (char *)buf, nbyte, SYS_getdents,
846 	    offsetof(struct dirent, d_name),
847 	    offsetof(struct dirent, d_reclen)));
848 }
849 
#ifndef	_LP64
/*
 * Largefile-aware getdents64(2) emulation for 32-bit processes.  It exists
 * for exactly the same reason as s10_getdents() (see the comment above that
 * function); the only difference is that the entry layout handed to
 * s10_getdents_common() is that of struct dirent64.
 */
static int
s10_getdents64(sysret_t *rval, int fd, struct dirent64 *buf, size_t nbyte)
{
	const size_t name_off = offsetof(struct dirent64, d_name);
	const size_t reclen_off = offsetof(struct dirent64, d_reclen);

	return (s10_getdents_common(rval, fd, (char *)buf, nbyte,
	    SYS_getdents64, name_off, reclen_off));
}
#endif	/* !_LP64 */
864 
865 #define	S10_TRIVIAL_ACL_CNT	6
866 #define	NATIVE_TRIVIAL_ACL_CNT	3
867 
868 /*
869  * Check if the ACL qualifies as a trivial ACL based on the native
870  * interpretation.
871  */
872 static boolean_t
873 has_trivial_native_acl(int cmd, int cnt, const char *fname, int fd)
874 {
875 	int i, err;
876 	sysret_t rval;
877 	ace_t buf[NATIVE_TRIVIAL_ACL_CNT];
878 
879 	if (fname != NULL)
880 		err = __systemcall(&rval, SYS_pathconf + 1024, fname,
881 		    _PC_ACL_ENABLED);
882 	else
883 		err = __systemcall(&rval, SYS_fpathconf + 1024, fd,
884 		    _PC_ACL_ENABLED);
885 	if (err != 0 || rval.sys_rval1 != _ACL_ACE_ENABLED)
886 		return (B_FALSE);
887 
888 	/*
889 	 * If we just got the ACL cnt, we don't need to get it again, its
890 	 * passed in as the cnt arg.
891 	 */
892 	if (cmd != ACE_GETACLCNT) {
893 		if (fname != NULL) {
894 			if (__systemcall(&rval, SYS_acl + 1024, fname,
895 			    ACE_GETACLCNT, 0, NULL) != 0)
896 				return (B_FALSE);
897 		} else {
898 			if (__systemcall(&rval, SYS_facl + 1024, fd,
899 			    ACE_GETACLCNT, 0, NULL) != 0)
900 				return (B_FALSE);
901 		}
902 		cnt = rval.sys_rval1;
903 	}
904 
905 	if (cnt != NATIVE_TRIVIAL_ACL_CNT)
906 		return (B_FALSE);
907 
908 	if (fname != NULL) {
909 		if (__systemcall(&rval, SYS_acl + 1024, fname, ACE_GETACL, cnt,
910 		    buf) != 0)
911 			return (B_FALSE);
912 	} else {
913 		if (__systemcall(&rval, SYS_facl + 1024, fd, ACE_GETACL, cnt,
914 		    buf) != 0)
915 			return (B_FALSE);
916 	}
917 
918 	/*
919 	 * The following is based on the logic from the native OS
920 	 * ace_trivial_common() to determine if the native ACL is trivial.
921 	 */
922 	for (i = 0; i < cnt; i++) {
923 		switch (buf[i].a_flags & ACE_TYPE_FLAGS) {
924 		case ACE_OWNER:
925 		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
926 		case ACE_EVERYONE:
927 			break;
928 		default:
929 			return (B_FALSE);
930 		}
931 
932 		if (buf[i].a_flags & (ACE_FILE_INHERIT_ACE|
933 		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
934 		    ACE_INHERIT_ONLY_ACE))
935 			return (B_FALSE);
936 
937 		/*
938 		 * Special check for some special bits
939 		 *
940 		 * Don't allow anybody to deny reading basic
941 		 * attributes or a files ACL.
942 		 */
943 		if (buf[i].a_access_mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
944 		    buf[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE)
945 			return (B_FALSE);
946 
947 		/*
948 		 * Delete permissions are never set by default
949 		 */
950 		if (buf[i].a_access_mask & (ACE_DELETE|ACE_DELETE_CHILD))
951 			return (B_FALSE);
952 		/*
953 		 * only allow owner@ to have
954 		 * write_acl/write_owner/write_attributes/write_xattr/
955 		 */
956 		if (buf[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
957 		    (!(buf[i].a_flags & ACE_OWNER) && (buf[i].a_access_mask &
958 		    (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
959 		    ACE_WRITE_NAMED_ATTRS))))
960 			return (B_FALSE);
961 
962 	}
963 
964 	return (B_TRUE);
965 }
966 
967 /*
968  * The following logic is based on the S10 adjust_ace_pair_common() code.
969  */
970 static void
971 s10_adjust_ace_mask(void *pair, size_t access_off, size_t pairsize, mode_t mode)
972 {
973 	char *datap = (char *)pair;
974 	uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
975 	uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
976 	    access_off);
977 
978 	if (mode & S_IROTH)
979 		*amask1 |= ACE_READ_DATA;
980 	else
981 		*amask0 |= ACE_READ_DATA;
982 	if (mode & S_IWOTH)
983 		*amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
984 	else
985 		*amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
986 	if (mode & S_IXOTH)
987 		*amask1 |= ACE_EXECUTE;
988 	else
989 		*amask0 |= ACE_EXECUTE;
990 }
991 
992 /*
993  * Construct a trivial S10 style ACL.
994  */
995 static int
996 make_trivial_s10_acl(const char *fname, int fd, ace_t *bp)
997 {
998 	int err;
999 	sysret_t rval;
1000 	struct stat64 buf;
1001 	ace_t trivial_s10_acl[] = {
1002 		{(uint_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
1003 		{(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1004 		    ACE_WRITE_NAMED_ATTRS, ACE_OWNER,
1005 		    ACE_ACCESS_ALLOWED_ACE_TYPE},
1006 		{(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1007 		    ACE_ACCESS_DENIED_ACE_TYPE},
1008 		{(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1009 		    ACE_ACCESS_ALLOWED_ACE_TYPE},
1010 		{(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1011 		    ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE,
1012 		    ACE_ACCESS_DENIED_ACE_TYPE},
1013 		{(uint_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|
1014 		    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE, ACE_EVERYONE,
1015 		    ACE_ACCESS_ALLOWED_ACE_TYPE}
1016 	};
1017 
1018 	if (fname != NULL) {
1019 		if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, AT_FDCWD,
1020 		    fname, &buf, 0)) != 0)
1021 			return (err);
1022 	} else {
1023 		if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, fd,
1024 		    NULL, &buf, 0)) != 0)
1025 			return (err);
1026 	}
1027 
1028 	s10_adjust_ace_mask(&trivial_s10_acl[0], offsetof(ace_t, a_access_mask),
1029 	    sizeof (ace_t), (buf.st_mode & 0700) >> 6);
1030 	s10_adjust_ace_mask(&trivial_s10_acl[2], offsetof(ace_t, a_access_mask),
1031 	    sizeof (ace_t), (buf.st_mode & 0070) >> 3);
1032 	s10_adjust_ace_mask(&trivial_s10_acl[4], offsetof(ace_t, a_access_mask),
1033 	    sizeof (ace_t), buf.st_mode & 0007);
1034 
1035 	if (brand_uucopy(&trivial_s10_acl, bp, sizeof (trivial_s10_acl)) != 0)
1036 		return (EFAULT);
1037 
1038 	return (0);
1039 }
1040 
1041 /*
1042  * The definition of a trivial ace-style ACL (used by ZFS and NFSv4) has been
1043  * simplified since S10.  Instead of 6 entries on a trivial S10 ACE ACL we now
1044  * have 3 streamlined entries.  The new, simpler trivial style confuses S10
1045  * commands such as 'ls -v' or 'cp -p' which don't see the expected S10 trivial
1046  * ACL entries and thus assume that there is a complex ACL on the file.
1047  *
1048  * See: PSARC/2010/029 Improved ACL interoperability
1049  *
1050  * Note that the trival ACL detection code is implemented in acl_trival() in
1051  * lib/libsec/common/aclutils.c.  It always uses the acl() syscall (not the
1052  * facl syscall) to determine if an ACL is trivial.  However, we emulate both
1053  * acl() and facl() so that the two provide consistent results.
1054  *
1055  * We don't currently try to emulate setting of ACLs since the primary
1056  * consumer of this feature is SMB or NFSv4 servers, neither of which are
1057  * supported in solaris10-branded zones.  If ACLs are used they must be set on
1058  * files using the native OS interpretation.
1059  */
1060 int
1061 s10_acl(sysret_t *rval, const char *fname, int cmd, int nentries, void *aclbufp)
1062 {
1063 	int res;
1064 
1065 	res = __systemcall(rval, SYS_acl + 1024, fname, cmd, nentries, aclbufp);
1066 
1067 	switch (cmd) {
1068 	case ACE_GETACLCNT:
1069 		if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1070 		    rval->sys_rval1, fname, 0)) {
1071 			rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1072 		}
1073 		break;
1074 	case ACE_GETACL:
1075 		if (res == 0 &&
1076 		    has_trivial_native_acl(ACE_GETACL, 0, fname, 0) &&
1077 		    nentries >= S10_TRIVIAL_ACL_CNT) {
1078 			res = make_trivial_s10_acl(fname, 0, aclbufp);
1079 			rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1080 		}
1081 		break;
1082 	}
1083 
1084 	return (res);
1085 }
1086 
1087 int
1088 s10_facl(sysret_t *rval, int fdes, int cmd, int nentries, void *aclbufp)
1089 {
1090 	int res;
1091 
1092 	res = __systemcall(rval, SYS_facl + 1024, fdes, cmd, nentries, aclbufp);
1093 
1094 	switch (cmd) {
1095 	case ACE_GETACLCNT:
1096 		if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1097 		    rval->sys_rval1, NULL, fdes)) {
1098 			rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1099 		}
1100 		break;
1101 	case ACE_GETACL:
1102 		if (res == 0 &&
1103 		    has_trivial_native_acl(ACE_GETACL, 0, NULL, fdes) &&
1104 		    nentries >= S10_TRIVIAL_ACL_CNT) {
1105 			res = make_trivial_s10_acl(NULL, fdes, aclbufp);
1106 			rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1107 		}
1108 		break;
1109 	}
1110 
1111 	return (res);
1112 }
1113 
1114 #define	S10_AC_PROC		(0x1 << 28)
1115 #define	S10_AC_TASK		(0x2 << 28)
1116 #define	S10_AC_FLOW		(0x4 << 28)
1117 #define	S10_AC_MODE(x)		((x) & 0xf0000000)
1118 #define	S10_AC_OPTION(x)	((x) & 0x0fffffff)
1119 
1120 /*
1121  * The mode shift, mode mask and option mask for acctctl have changed.  The
1122  * mode is currently the top full byte and the option is the lower 3 full bytes.
1123  */
1124 int
1125 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz)
1126 {
1127 	int mode = S10_AC_MODE(cmd);
1128 	int option = S10_AC_OPTION(cmd);
1129 
1130 	switch (mode) {
1131 	case S10_AC_PROC:
1132 		mode = AC_PROC;
1133 		break;
1134 	case S10_AC_TASK:
1135 		mode = AC_TASK;
1136 		break;
1137 	case S10_AC_FLOW:
1138 		mode = AC_FLOW;
1139 		break;
1140 	default:
1141 		return (B_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf,
1142 		    bufsz));
1143 	}
1144 
1145 	return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf,
1146 	    bufsz));
1147 }
1148 
1149 /*
1150  * The Audit Policy parameters have changed due to:
1151  *    6466722 audituser and AUDIT_USER are defined, unused, undocumented and
1152  *            should be removed.
1153  *
1154  * In S10 we had the following flag:
1155  *	#define AUDIT_USER 0x0040
1156  * which doesn't exist in Solaris Next where the subsequent flags are shifted
1157  * down.  For example, in S10 we had:
1158  *	#define AUDIT_GROUP     0x0080
1159  * but on Solaris Next we have:
1160  *	#define AUDIT_GROUP     0x0040
1161  * AUDIT_GROUP has the value AUDIT_USER had in S10 and all of the subsequent
1162  * bits are also shifted one place.
1163  *
1164  * When we're getting or setting the Audit Policy parameters we need to
1165  * shift the outgoing or incoming bits into their proper positions.  Since
1166  * S10_AUDIT_USER was always unused, we always clear that bit on A_GETPOLICY.
1167  *
1168  * The command we care about, BSM_AUDITCTL, passes the most parameters (3),
1169  * so declare this function to take up to 4 args and just pass them on.
1170  * The number of parameters for s10_auditsys needs to be equal to the BSM_*
1171  * subcommand that has the most parameters, since we want to pass all
1172  * parameters through, regardless of which subcommands we interpose on.
1173  *
1174  * Note that the auditsys system call uses the SYSENT_AP macro wrapper instead
1175  * of the more common SYSENT_CI macro.  This means the return value is a
1176  * SE_64RVAL so the syscall table uses RV_64RVAL.
1177  */
1178 
1179 #define	S10_AUDIT_HMASK	0xffffffc0
1180 #define	S10_AUDIT_LMASK	0x3f
1181 #define	S10_AUC_NOSPACE	0x3
1182 
1183 int
1184 s10_auditsys(sysret_t *rval, int bsmcmd, intptr_t a0, intptr_t a1, intptr_t a2)
1185 {
1186 	int	    err;
1187 	uint32_t    m;
1188 
1189 	if (bsmcmd != BSM_AUDITCTL)
1190 		return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1,
1191 		    a2));
1192 
1193 	if ((int)a0 == A_GETPOLICY) {
1194 		if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1195 		    &m, a2)) != 0)
1196 			return (err);
1197 		m = ((m & S10_AUDIT_HMASK) << 1) | (m & S10_AUDIT_LMASK);
1198 		if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1199 			return (EFAULT);
1200 		return (0);
1201 
1202 	} else if ((int)a0 == A_SETPOLICY) {
1203 		if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1204 			return (EFAULT);
1205 		m = ((m >> 1) & S10_AUDIT_HMASK) | (m & S10_AUDIT_LMASK);
1206 		return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1207 		    a2));
1208 	} else if ((int)a0 == A_GETCOND) {
1209 		if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1210 		    &m, a2)) != 0)
1211 			return (err);
1212 		if (m == AUC_NOSPACE)
1213 			m = S10_AUC_NOSPACE;
1214 		if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1215 			return (EFAULT);
1216 		return (0);
1217 	} else if ((int)a0 == A_SETCOND) {
1218 		if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1219 			return (EFAULT);
1220 		if (m == S10_AUC_NOSPACE)
1221 			m = AUC_NOSPACE;
1222 		return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1223 		    a2));
1224 	}
1225 
1226 	return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, a2));
1227 }
1228 
1229 /*
1230  * Determine whether the executable passed to SYS_exec or SYS_execve is a
1231  * native executable.  The s10_npreload.so invokes the B_S10_NATIVE brand
1232  * operation which patches up the processes exec info to eliminate any trace
1233  * of the wrapper.  That will make pgrep and other commands that examine
1234  * process' executable names and command-line parameters work properly.
1235  */
1236 static int
1237 s10_exec_native(sysret_t *rval, const char *fname, const char **argp,
1238     const char **envp)
1239 {
1240 	const char *filename = fname;
1241 	char path[64];
1242 	int err;
1243 
1244 	/* Get a copy of the executable we're trying to run */
1245 	path[0] = '\0';
1246 	(void) brand_uucopystr(filename, path, sizeof (path));
1247 
1248 	/* Check if we're trying to run a native binary */
1249 	if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native",
1250 	    sizeof (path)) != 0)
1251 		return (0);
1252 
1253 	/* Skip the first element in the argv array */
1254 	argp++;
1255 
1256 	/*
1257 	 * The the path of the dynamic linker is the second parameter
1258 	 * of s10_native_exec().
1259 	 */
1260 	if (brand_uucopy(argp, &filename, sizeof (char *)) != 0)
1261 		return (EFAULT);
1262 
1263 	/* If an exec call succeeds, it never returns */
1264 	err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename,
1265 	    argp, envp, NULL, NULL, NULL);
1266 	brand_assert(err != 0);
1267 	return (err);
1268 }
1269 
1270 /*
1271  * Interpose on the SYS_exec syscall to detect native wrappers.
1272  */
1273 int
1274 s10_exec(sysret_t *rval, const char *fname, const char **argp)
1275 {
1276 	int err;
1277 
1278 	if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0)
1279 		return (err);
1280 
1281 	/* If an exec call succeeds, it never returns */
1282 	err = __systemcall(rval, SYS_execve + 1024, fname, argp, NULL);
1283 	brand_assert(err != 0);
1284 	return (err);
1285 }
1286 
1287 /*
1288  * Interpose on the SYS_execve syscall to detect native wrappers.
1289  */
1290 int
1291 s10_execve(sysret_t *rval, const char *fname, const char **argp,
1292     const char **envp)
1293 {
1294 	int err;
1295 
1296 	if ((err = s10_exec_native(rval, fname, argp, envp)) != 0)
1297 		return (err);
1298 
1299 	/* If an exec call succeeds, it never returns */
1300 	err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp);
1301 	brand_assert(err != 0);
1302 	return (err);
1303 }
1304 
1305 /*
1306  * S10's issetugid() syscall is now a subcode to privsys().
1307  */
1308 static int
1309 s10_issetugid(sysret_t *rval)
1310 {
1311 	return (__systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID,
1312 	    0, 0, 0, 0, 0));
1313 }
1314 
1315 static long
1316 s10_uname(sysret_t *rv, uintptr_t p1)
1317 {
1318 	struct utsname un, *unp = (struct utsname *)p1;
1319 	int rev, err;
1320 
1321 	if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0)
1322 		return (err);
1323 
1324 	rev = atoi(&un.release[2]);
1325 	brand_assert(rev >= 11);
1326 	bzero(un.release, _SYS_NMLN);
1327 	(void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN);
1328 	bzero(un.version, _SYS_NMLN);
1329 	(void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN);
1330 
1331 	/* copy out the modified uname info */
1332 	return (brand_uucopy(&un, unp, sizeof (un)));
1333 }
1334 
1335 int
1336 s10_sysconfig(sysret_t *rv, int which)
1337 {
1338 	long value;
1339 
1340 	/*
1341 	 * We must interpose on the sysconfig(2) requests
1342 	 * that deal with the realtime signal number range.
1343 	 * All others get passed to the native sysconfig(2).
1344 	 */
1345 	switch (which) {
1346 	case _CONFIG_RTSIG_MAX:
1347 		value = S10_SIGRTMAX - S10_SIGRTMIN + 1;
1348 		break;
1349 	case _CONFIG_SIGRT_MIN:
1350 		value = S10_SIGRTMIN;
1351 		break;
1352 	case _CONFIG_SIGRT_MAX:
1353 		value = S10_SIGRTMAX;
1354 		break;
1355 	default:
1356 		return (__systemcall(rv, SYS_sysconfig + 1024, which));
1357 	}
1358 
1359 	(void) B_TRUSS_POINT_1(rv, SYS_sysconfig, 0, which);
1360 	rv->sys_rval1 = value;
1361 	rv->sys_rval2 = 0;
1362 
1363 	return (0);
1364 }
1365 
1366 int
1367 s10_sysinfo(sysret_t *rv, int command, char *buf, long count)
1368 {
1369 	char *value;
1370 	int len;
1371 
1372 	/*
1373 	 * We must interpose on the sysinfo(2) commands SI_RELEASE and
1374 	 * SI_VERSION; all others get passed to the native sysinfo(2)
1375 	 * command.
1376 	 */
1377 	switch (command) {
1378 		case SI_RELEASE:
1379 			value = S10_UTS_RELEASE;
1380 			break;
1381 
1382 		case SI_VERSION:
1383 			value = S10_UTS_VERSION;
1384 			break;
1385 
1386 		default:
1387 			/*
1388 			 * The default action is to pass the command to the
1389 			 * native sysinfo(2) syscall.
1390 			 */
1391 			return (__systemcall(rv, SYS_systeminfo + 1024,
1392 			    command, buf, count));
1393 	}
1394 
1395 	len = strlen(value) + 1;
1396 	if (count > 0) {
1397 		if (brand_uucopystr(value, buf, count) != 0)
1398 			return (EFAULT);
1399 
1400 		/*
1401 		 * Assure NULL termination of buf as brand_uucopystr() doesn't.
1402 		 */
1403 		if (len > count && brand_uucopy("\0", buf + (count - 1), 1)
1404 		    != 0)
1405 			return (EFAULT);
1406 	}
1407 
1408 	/*
1409 	 * On success, sysinfo(2) returns the size of buffer required to hold
1410 	 * the complete value plus its terminating NULL byte.
1411 	 */
1412 	(void) B_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count);
1413 	rv->sys_rval1 = len;
1414 	rv->sys_rval2 = 0;
1415 	return (0);
1416 }
1417 
1418 #if defined(__x86)
1419 #if defined(__amd64)
1420 /*
1421  * 64-bit x86 LWPs created by SYS_lwp_create start here if they need to set
1422  * their %fs registers to the legacy Solaris 10 selector value.
1423  *
1424  * This function does three things:
1425  *
1426  *	1.  Trap to the kernel so that it can set %fs to the legacy Solaris 10
1427  *	    selector value.
1428  *	2.  Read the LWP's true entry point (the entry point supplied by libc
1429  *	    when SYS_lwp_create was invoked) from %r14.
1430  *	3.  Eliminate this function's stack frame and pass control to the LWP's
1431  *	    true entry point.
1432  *
1433  * See the comment above s10_lwp_create_correct_fs() (see below) for the reason
1434  * why this function exists.
1435  */
1436 /*ARGSUSED*/
1437 static void
1438 s10_lwp_create_entry_point(void *ulwp_structp)
1439 {
1440 	sysret_t rval;
1441 
1442 	/*
1443 	 * The new LWP's %fs register is initially zero, but libc won't
1444 	 * function correctly when %fs is zero.  Change the LWP's %fs register
1445 	 * via SYS_brand.
1446 	 */
1447 	(void) __systemcall(&rval, SYS_brand + 1024, B_S10_FSREGCORRECTION);
1448 
1449 	/*
1450 	 * Jump to the true entry point, which is stored in %r14.
1451 	 * Remove our stack frame before jumping so that
1452 	 * s10_lwp_create_entry_point() won't be seen in stack traces.
1453 	 *
1454 	 * NOTE: s10_lwp_create_entry_point() pushes %r12 onto its stack frame
1455 	 * so that it can use it as a temporary register.  We don't restore %r12
1456 	 * in this assembly block because we don't care about its value (and
1457 	 * neither does _lwp_start()).  Besides, the System V ABI AMD64
1458 	 * Actirecture Processor Supplement doesn't specify that %r12 should
1459 	 * have a special value when LWPs start, so we can ignore its value when
1460 	 * we jump to the true entry point.  Furthermore, %r12 is a callee-saved
1461 	 * register, so the true entry point should push %r12 onto its stack
1462 	 * before using the register.  We ignore %r14 after we read it for
1463 	 * similar reasons.
1464 	 *
1465 	 * NOTE: The compiler will generate a function epilogue for this
1466 	 * function despite the fact that the LWP will never execute it.
1467 	 * We could hand-code this entire function in assembly to eliminate
1468 	 * the epilogue, but the epilogue is only three or four instructions,
1469 	 * so we wouldn't save much space.  Besides, why would we want
1470 	 * to create yet another ugly, hard-to-maintain assembly function when
1471 	 * we could write most of it in C?
1472 	 */
1473 	__asm__ __volatile__(
1474 	    "movq %0, %%rdi\n\t"	/* pass ulwp_structp as arg1 */
1475 	    "movq %%rbp, %%rsp\n\t"	/* eliminate the stack frame */
1476 	    "popq %%rbp\n\t"
1477 	    "jmp *%%r14\n\t"		/* jump to the true entry point */
1478 	    : : "r" (ulwp_structp));
1479 	/*NOTREACHED*/
1480 }
1481 
1482 /*
1483  * The S10 libc expects that %fs will be nonzero for new 64-bit x86 LWPs but the
1484  * Nevada kernel clears %fs for such LWPs.  Unforunately, new LWPs do not issue
1485  * SYS_lwp_private (see s10_lwp_private() below) after they are created, so
1486  * we must ensure that new LWPs invoke a brand operation that sets %fs to a
1487  * nonzero value immediately after their creation.
1488  *
1489  * The easiest way to do this is to make new LWPs start at a special function,
1490  * s10_lwp_create_entry_point() (see its definition above), that invokes the
1491  * brand operation that corrects %fs.  We'll store the entry points of new LWPs
1492  * in their %r14 registers so that s10_lwp_create_entry_point() can find and
1493  * call them after invoking the special brand operation.  %r14 is a callee-saved
1494  * register; therefore, any functions invoked by s10_lwp_create_entry_point()
1495  * and all functions dealing with signals (e.g., sigacthandler()) will preserve
1496  * %r14 for s10_lwp_create_entry_point().
1497  *
1498  * The Nevada kernel can safely work with nonzero %fs values because the kernel
1499  * configures per-thread %fs segment descriptors so that the legacy %fs selector
1500  * value will still work.  See the comment in lwp_load() regarding %fs and
1501  * %fsbase in 64-bit x86 processes.
1502  *
1503  * This emulation exists thanks to CRs 6467491 and 6501650.
1504  */
1505 static int
1506 s10_lwp_create_correct_fs(sysret_t *rval, ucontext_t *ucp, int flags,
1507     id_t *new_lwp)
1508 {
1509 	ucontext_t s10_uc;
1510 
1511 	/*
1512 	 * Copy the supplied ucontext_t structure to the local stack
1513 	 * frame and store the new LWP's entry point (the value of %rip
1514 	 * stored in the ucontext_t) in the new LWP's %r14 register.
1515 	 * Then make s10_lwp_create_entry_point() the new LWP's entry
1516 	 * point.
1517 	 */
1518 	if (brand_uucopy(ucp, &s10_uc, sizeof (s10_uc)) != 0)
1519 		return (EFAULT);
1520 
1521 	s10_uc.uc_mcontext.gregs[REG_R14] = s10_uc.uc_mcontext.gregs[REG_RIP];
1522 	s10_uc.uc_mcontext.gregs[REG_RIP] = (greg_t)s10_lwp_create_entry_point;
1523 
1524 	/*  fix up the signal mask */
1525 	if (s10_uc.uc_flags & UC_SIGMASK)
1526 		(void) s10sigset_to_native(&s10_uc.uc_sigmask,
1527 		    &s10_uc.uc_sigmask);
1528 
1529 	/*
1530 	 * Issue SYS_lwp_create to create the new LWP.  We pass the
1531 	 * modified ucontext_t to make sure that the new LWP starts at
1532 	 * s10_lwp_create_entry_point().
1533 	 */
1534 	return (__systemcall(rval, SYS_lwp_create + 1024, &s10_uc,
1535 	    flags, new_lwp));
1536 }
1537 #endif	/* __amd64 */
1538 
1539 /*
1540  * SYS_lwp_private is issued by libc_init() to set %fsbase in 64-bit x86
1541  * processes.  The Nevada kernel sets %fs to zero but the S10 libc expects
1542  * %fs to be nonzero.  We'll pass the issued system call to the kernel untouched
1543  * and invoke a brand operation to set %fs to the legacy S10 selector value.
1544  *
1545  * This emulation exists thanks to CRs 6467491 and 6501650.
1546  */
1547 static int
1548 s10_lwp_private(sysret_t *rval, int cmd, int which, uintptr_t base)
1549 {
1550 #if defined(__amd64)
1551 	int err;
1552 
1553 	/*
1554 	 * The current LWP's %fs register should be zero.  Determine whether the
1555 	 * Solaris 10 libc with which we're working functions correctly when %fs
1556 	 * is zero by calling thr_main() after issuing the SYS_lwp_private
1557 	 * syscall.  If thr_main() barfs (returns -1), then change the LWP's %fs
1558 	 * register via SYS_brand and patch brand_sysent_table so that issuing
1559 	 * SYS_lwp_create executes s10_lwp_create_correct_fs() rather than the
1560 	 * default s10_lwp_create().  s10_lwp_create_correct_fs() will
1561 	 * guarantee that new LWPs will have correct %fs values.
1562 	 */
1563 	if ((err = __systemcall(rval, SYS_lwp_private + 1024, cmd, which,
1564 	    base)) != 0)
1565 		return (err);
1566 	if (thr_main() == -1) {
1567 		/*
1568 		 * SYS_lwp_private is only issued by libc_init(), which is
1569 		 * executed when libc is first loaded by ld.so.1.  Thus we
1570 		 * are guaranteed to be single-threaded at this point.  Even
1571 		 * if we were multithreaded at this point, writing a 64-bit
1572 		 * value to the st_callc field of a brand_sysent_table
1573 		 * entry is guaranteed to be atomic on 64-bit x86 chips
1574 		 * as long as the field is not split across cache lines
1575 		 * (It shouldn't be.).  See chapter 8, section 1.1 of
1576 		 * "The Intel 64 and IA32 Architectures Software Developer's
1577 		 * Manual," Volume 3A for more details.
1578 		 */
1579 		brand_sysent_table[SYS_lwp_create].st_callc =
1580 		    (sysent_cb_t)s10_lwp_create_correct_fs;
1581 		return (__systemcall(rval, SYS_brand + 1024,
1582 		    B_S10_FSREGCORRECTION));
1583 	}
1584 	return (0);
1585 #else	/* !__amd64 */
1586 	return (__systemcall(rval, SYS_lwp_private + 1024, cmd, which, base));
1587 #endif	/* !__amd64 */
1588 }
1589 #endif	/* __x86 */
1590 
1591 /*
1592  * The Opensolaris versions of lwp_mutex_timedlock() and lwp_mutex_trylock()
1593  * add an extra argument to the interfaces, a uintptr_t value for the mutex's
1594  * mutex_owner field.  The Solaris 10 libc assigns the mutex_owner field at
1595  * user-level, so we just make the extra argument be zero in both syscalls.
1596  */
1597 
1598 static int
1599 s10_lwp_mutex_timedlock(sysret_t *rval, lwp_mutex_t *lp, timespec_t *tsp)
1600 {
1601 	return (__systemcall(rval, SYS_lwp_mutex_timedlock + 1024, lp, tsp, 0));
1602 }
1603 
1604 static int
1605 s10_lwp_mutex_trylock(sysret_t *rval, lwp_mutex_t *lp)
1606 {
1607 	return (__systemcall(rval, SYS_lwp_mutex_trylock + 1024, lp, 0));
1608 }
1609 
1610 /*
1611  * If the emul_global_zone flag is set then emulate some aspects of the
1612  * zone system call.  In particular, emulate the global zone ID on the
1613  * ZONE_LOOKUP subcommand and emulate some of the global zone attributes
1614  * on the ZONE_GETATTR subcommand.  If the flag is not set or we're performing
1615  * some other operation, simply pass the calls through.
1616  */
1617 int
1618 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3,
1619     void *arg4)
1620 {
1621 	char		*aval;
1622 	int		len;
1623 	zoneid_t	zid;
1624 	int		attr;
1625 	char		*buf;
1626 	size_t		bufsize;
1627 
1628 	/*
1629 	 * We only emulate the zone syscall for a subset of specific commands,
1630 	 * otherwise we just pass the call through.
1631 	 */
1632 	if (!emul_global_zone)
1633 		return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2,
1634 		    arg3, arg4));
1635 
1636 	switch (cmd) {
1637 	case ZONE_LOOKUP:
1638 		(void) B_TRUSS_POINT_1(rval, SYS_zone, 0, cmd);
1639 		rval->sys_rval1 = GLOBAL_ZONEID;
1640 		rval->sys_rval2 = 0;
1641 		return (0);
1642 
1643 	case ZONE_GETATTR:
1644 		zid = (zoneid_t)(uintptr_t)arg1;
1645 		attr = (int)(uintptr_t)arg2;
1646 		buf = (char *)arg3;
1647 		bufsize = (size_t)arg4;
1648 
1649 		/*
1650 		 * If the request is for the global zone then we're emulating
1651 		 * that, otherwise pass this thru.
1652 		 */
1653 		if (zid != GLOBAL_ZONEID)
1654 			goto passthru;
1655 
1656 		switch (attr) {
1657 		case ZONE_ATTR_NAME:
1658 			aval = GLOBAL_ZONENAME;
1659 			break;
1660 
1661 		case ZONE_ATTR_BRAND:
1662 			aval = NATIVE_BRAND_NAME;
1663 			break;
1664 		default:
1665 			/*
1666 			 * We only emulate a subset of the attrs, use the
1667 			 * real zone id to pass thru the rest.
1668 			 */
1669 			arg1 = (void *)(uintptr_t)zoneid;
1670 			goto passthru;
1671 		}
1672 
1673 		(void) B_TRUSS_POINT_5(rval, SYS_zone, 0, cmd, zid, attr,
1674 		    buf, bufsize);
1675 
1676 		len = strlen(aval) + 1;
1677 		if (len > bufsize)
1678 			return (ENAMETOOLONG);
1679 
1680 		if (buf != NULL) {
1681 			if (len == 1) {
1682 				if (brand_uucopy("\0", buf, 1) != 0)
1683 					return (EFAULT);
1684 			} else {
1685 				if (brand_uucopystr(aval, buf, len) != 0)
1686 					return (EFAULT);
1687 
1688 				/*
1689 				 * Assure NULL termination of "buf" as
1690 				 * brand_uucopystr() does NOT.
1691 				 */
1692 				if (brand_uucopy("\0", buf + (len - 1), 1) != 0)
1693 					return (EFAULT);
1694 			}
1695 		}
1696 
1697 		rval->sys_rval1 = len;
1698 		rval->sys_rval2 = 0;
1699 		return (0);
1700 
1701 	default:
1702 		break;
1703 	}
1704 
1705 passthru:
1706 	return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3,
1707 	    arg4));
1708 }
1709 
1710 /*ARGSUSED*/
1711 int
1712 brand_init(int argc, char *argv[], char *envp[])
1713 {
1714 	sysret_t		rval;
1715 	ulong_t			ldentry;
1716 	int			err;
1717 	char			*bname;
1718 
1719 	brand_pre_init();
1720 
1721 	/*
1722 	 * Cache the pid of the zone's init process and determine if
1723 	 * we're init(1m) for the zone.  Remember: we might be init
1724 	 * now, but as soon as we fork(2) we won't be.
1725 	 */
1726 	(void) get_initpid_info();
1727 
1728 	/* get the current zoneid */
1729 	err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL);
1730 	brand_assert(err == 0);
1731 	zoneid = (zoneid_t)rval.sys_rval1;
1732 
1733 	/* Get the zone's emulation bitmap. */
1734 	if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid,
1735 	    S10_EMUL_BITMAP, emul_bitmap, sizeof (emul_bitmap))) != 0) {
1736 		brand_abort(err, "The zone's patch level is unsupported");
1737 		/*NOTREACHED*/
1738 	}
1739 
1740 	bname = basename(argv[0]);
1741 
1742 	/*
1743 	 * In general we want the S10 commands that are zone-aware to continue
1744 	 * to behave as they normally do within a zone.  Since these commands
1745 	 * are zone-aware, they should continue to "do the right thing".
1746 	 * However, some zone-aware commands aren't going to work the way
1747 	 * we expect them to inside the branded zone.  In particular, the pkg
1748 	 * and patch commands will not properly manage all pkgs/patches
1749 	 * unless the commands think they are running in the global zone.  For
1750 	 * these commands we want to emulate the global zone.
1751 	 *
1752 	 * We don't do any emulation for pkgcond since it is typically used
1753 	 * in pkg/patch postinstall scripts and we want those scripts to do
1754 	 * the right thing inside a zone.
1755 	 *
1756 	 * One issue is the handling of hollow pkgs.  Since the pkgs are
1757 	 * hollow, they won't use pkgcond in their postinstall scripts.  These
1758 	 * pkgs typically are installing drivers so we handle that by
1759 	 * replacing add_drv and rem_drv in the s10_boot script.
1760 	 */
1761 	if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 ||
1762 	    strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0)
1763 		emul_global_zone = B_TRUE;
1764 
1765 	ldentry = brand_post_init(S10_VERSION, argc, argv, envp);
1766 
1767 	brand_runexe(argv, ldentry);
1768 	/*NOTREACHED*/
1769 	brand_abort(0, "brand_runexe() returned");
1770 	return (-1);
1771 }
1772 
1773 /*
1774  * This table must have at least NSYSCALL entries in it.
1775  *
1776  * The second argument of each EMULATE() entry in brand_sysent_table
1777  * is the syscall's argument count OR'ed with flags that describe the
1778  * syscall return value encoding.  See the block comments at the
1779  * top of this file for more information about the syscall return
1780  * value flags and when they should be used.
1781  */
1782 brand_sysent_table_t brand_sysent_table[] = {	/* positional; slot index is in each trailing comment */
1783 #if defined(__sparc) && !defined(__sparcv9)	/* brand_indir fills slot 0 only in this configuration */
1784 	EMULATE(brand_indir, 9 | RV_64RVAL),	/*   0 */
1785 #else	/* !(__sparc && !__sparcv9) */
1786 	NOSYS,					/*   0 */
1787 #endif	/* __sparc && !__sparcv9 */
1788 	NOSYS,					/*   1 */
1789 	EMULATE(s10_forkall, 0 | RV_32RVAL2),	/*   2 */
1790 	NOSYS,					/*   3 */
1791 	NOSYS,					/*   4 */
1792 	EMULATE(s10_open, 3 | RV_DEFAULT),	/*   5 */
1793 	NOSYS,					/*   6 */
1794 	EMULATE(s10_wait, 0 | RV_32RVAL2),	/*   7 */
1795 	EMULATE(s10_creat, 2 | RV_DEFAULT),	/*   8 */
1796 	NOSYS,					/*   9 */
1797 	EMULATE(s10_unlink, 1 | RV_DEFAULT),	/*  10 */
1798 	EMULATE(s10_exec, 2 | RV_DEFAULT),	/*  11 */
1799 	NOSYS,					/*  12 */
1800 	NOSYS,					/*  13 */
1801 	NOSYS,					/*  14 */
1802 	NOSYS,					/*  15 */
1803 	EMULATE(s10_chown, 3 | RV_DEFAULT),	/*  16 */
1804 	NOSYS,					/*  17 */
1805 	EMULATE(s10_stat, 2 | RV_DEFAULT),	/*  18 */
1806 	NOSYS,					/*  19 */
1807 	NOSYS,					/*  20 */
1808 	NOSYS,					/*  21 */
1809 	EMULATE(s10_umount, 1 | RV_DEFAULT),	/*  22 */
1810 	NOSYS,					/*  23 */
1811 	NOSYS,					/*  24 */
1812 	NOSYS,					/*  25 */
1813 	NOSYS,					/*  26 */
1814 	NOSYS,					/*  27 */
1815 	EMULATE(s10_fstat, 2 | RV_DEFAULT),	/*  28 */
1816 	NOSYS,					/*  29 */
1817 	EMULATE(s10_utime, 2 | RV_DEFAULT),	/*  30 */
1818 	NOSYS,					/*  31 */
1819 	NOSYS,					/*  32 */
1820 	EMULATE(s10_access, 2 | RV_DEFAULT),	/*  33 */
1821 	NOSYS,					/*  34 */
1822 	NOSYS,					/*  35 */
1823 	NOSYS,					/*  36 */
1824 	EMULATE(s10_kill, 2 | RV_DEFAULT),	/*  37 */
1825 	NOSYS,					/*  38 */
1826 	NOSYS,					/*  39 */
1827 	NOSYS,					/*  40 */
1828 	EMULATE(s10_dup, 1 | RV_DEFAULT),	/*  41 */
1829 	NOSYS,					/*  42 */
1830 	NOSYS,					/*  43 */
1831 	NOSYS,					/*  44 */
1832 	NOSYS,					/*  45 */
1833 	NOSYS,					/*  46 */
1834 	NOSYS,					/*  47 */
1835 	NOSYS,					/*  48 */
1836 	NOSYS,					/*  49 */
1837 	NOSYS,					/*  50 */
1838 	NOSYS,					/*  51 */
1839 	NOSYS,					/*  52 */
1840 	NOSYS,					/*  53 */
1841 	EMULATE(s10_ioctl, 3 | RV_DEFAULT),	/*  54 */
1842 	NOSYS,					/*  55 */
1843 	NOSYS,					/*  56 */
1844 	NOSYS,					/*  57 */
1845 	NOSYS,					/*  58 */
1846 	EMULATE(s10_execve, 3 | RV_DEFAULT),	/*  59 */
1847 	NOSYS,					/*  60 */
1848 	NOSYS,					/*  61 */
1849 	NOSYS,					/*  62 */
1850 	NOSYS,					/*  63 */
1851 	NOSYS,					/*  64 */
1852 	NOSYS,					/*  65 */
1853 	NOSYS,					/*  66 */
1854 	NOSYS,					/*  67 */
1855 	NOSYS,					/*  68 */
1856 	NOSYS,					/*  69 */
1857 	NOSYS,					/*  70 */
1858 	EMULATE(s10_acctctl, 3 | RV_DEFAULT),	/*  71 */
1859 	NOSYS,					/*  72 */
1860 	NOSYS,					/*  73 */
1861 	NOSYS,					/*  74 */
1862 	EMULATE(s10_issetugid, 0 | RV_DEFAULT),	/*  75 */
1863 	EMULATE(s10_fsat, 6 | RV_DEFAULT),	/*  76 */
1864 	NOSYS,					/*  77 */
1865 	NOSYS,					/*  78 */
1866 	EMULATE(s10_rmdir, 1 | RV_DEFAULT),	/*  79 */
1867 	NOSYS,					/*  80 */
1868 	EMULATE(s10_getdents, 3 | RV_DEFAULT),	/*  81 */
1869 	NOSYS,					/*  82 */
1870 	NOSYS,					/*  83 */
1871 	NOSYS,					/*  84 */
1872 	NOSYS,					/*  85 */
1873 	NOSYS,					/*  86 */
1874 	EMULATE(s10_poll, 3 | RV_DEFAULT),	/*  87 */
1875 	EMULATE(s10_lstat, 2 | RV_DEFAULT),	/*  88 */
1876 	NOSYS,					/*  89 */
1877 	NOSYS,					/*  90 */
1878 	NOSYS,					/*  91 */
1879 	NOSYS,					/*  92 */
1880 	NOSYS,					/*  93 */
1881 	EMULATE(s10_fchown, 3 | RV_DEFAULT),	/*  94 */
1882 	EMULATE(s10_sigprocmask, 3 | RV_DEFAULT), /*  95 */
1883 	EMULATE(s10_sigsuspend, 1 | RV_DEFAULT), /*  96 */
1884 	NOSYS,					/*  97 */
1885 	EMULATE(s10_sigaction, 3 | RV_DEFAULT),	/*  98 */
1886 	EMULATE(s10_sigpending, 2 | RV_DEFAULT), /*  99 */
1887 	NOSYS,					/* 100 */
1888 	NOSYS,					/* 101 */
1889 	NOSYS,					/* 102 */
1890 	NOSYS,					/* 103 */
1891 	NOSYS,					/* 104 */
1892 	NOSYS,					/* 105 */
1893 	NOSYS,					/* 106 */
1894 	EMULATE(s10_waitid, 4 | RV_DEFAULT),	/* 107 */
1895 	EMULATE(s10_sigsendsys, 2 | RV_DEFAULT), /* 108 */
1896 	NOSYS,					/* 109 */
1897 	NOSYS,					/* 110 */
1898 	NOSYS,					/* 111 */
1899 	NOSYS,					/* 112 */
1900 	NOSYS,					/* 113 */
1901 	NOSYS,					/* 114 */
1902 	NOSYS,					/* 115 */
1903 	NOSYS,					/* 116 */
1904 	NOSYS,					/* 117 */
1905 	NOSYS,					/* 118 */
1906 	NOSYS,					/* 119 */
1907 	NOSYS,					/* 120 */
1908 	NOSYS,					/* 121 */
1909 	NOSYS,					/* 122 */
1910 #if defined(__x86)	/* xstat/lxstat/fxstat/xmknod are emulated on x86 only */
1911 	EMULATE(s10_xstat, 3 | RV_DEFAULT),	/* 123 */
1912 	EMULATE(s10_lxstat, 3 | RV_DEFAULT),	/* 124 */
1913 	EMULATE(s10_fxstat, 3 | RV_DEFAULT),	/* 125 */
1914 	EMULATE(s10_xmknod, 4 | RV_DEFAULT),	/* 126 */
1915 #else	/* !__x86 */
1916 	NOSYS,					/* 123 */
1917 	NOSYS,					/* 124 */
1918 	NOSYS,					/* 125 */
1919 	NOSYS,					/* 126 */
1920 #endif	/* __x86 */
1921 	NOSYS,					/* 127 */
1922 	NOSYS,					/* 128 */
1923 	NOSYS,					/* 129 */
1924 	EMULATE(s10_lchown, 3 | RV_DEFAULT),	/* 130 */
1925 	NOSYS,					/* 131 */
1926 	NOSYS,					/* 132 */
1927 	NOSYS,					/* 133 */
1928 	EMULATE(s10_rename, 2 | RV_DEFAULT),	/* 134 */
1929 	EMULATE(s10_uname, 1 | RV_DEFAULT),	/* 135 */
1930 	NOSYS,					/* 136 */
1931 	EMULATE(s10_sysconfig, 1 | RV_DEFAULT),	/* 137 */
1932 	NOSYS,					/* 138 */
1933 	EMULATE(s10_sysinfo, 3 | RV_DEFAULT),	/* 139 */
1934 	NOSYS,					/* 140 */
1935 	NOSYS,					/* 141 */
1936 	NOSYS,					/* 142 */
1937 	EMULATE(s10_fork1, 0 | RV_32RVAL2),	/* 143 */
1938 	EMULATE(s10_sigtimedwait, 3 | RV_DEFAULT), /* 144 */
1939 	NOSYS,					/* 145 */
1940 	NOSYS,					/* 146 */
1941 	EMULATE(s10_lwp_sema_wait, 1 | RV_DEFAULT), /* 147 */
1942 	NOSYS,					/* 148 */
1943 	NOSYS,					/* 149 */
1944 	NOSYS,					/* 150 */
1945 	NOSYS,					/* 151 */
1946 	NOSYS,					/* 152 */
1947 	NOSYS,					/* 153 */
1948 	EMULATE(s10_utimes, 2 | RV_DEFAULT),	/* 154 */
1949 	NOSYS,					/* 155 */
1950 	NOSYS,					/* 156 */
1951 	NOSYS,					/* 157 */
1952 	NOSYS,					/* 158 */
1953 	EMULATE(s10_lwp_create, 3 | RV_DEFAULT), /* 159 */
1954 	NOSYS,					/* 160 */
1955 	NOSYS,					/* 161 */
1956 	NOSYS,					/* 162 */
1957 	EMULATE(s10_lwp_kill, 2 | RV_DEFAULT),	/* 163 */
1958 	NOSYS,					/* 164 */
1959 	EMULATE(s10_lwp_sigmask, 3 | RV_32RVAL2), /* 165 */
1960 #if defined(__x86)	/* lwp_private is emulated on x86 only */
1961 	EMULATE(s10_lwp_private, 3 | RV_DEFAULT), /* 166 */
1962 #else	/* !__x86 */
1963 	NOSYS,					/* 166 */
1964 #endif	/* __x86 */
1965 	NOSYS,					/* 167 */
1966 	NOSYS,					/* 168 */
1967 	EMULATE(s10_lwp_mutex_lock, 1 | RV_DEFAULT), /* 169 */
1968 	NOSYS,					/* 170 */
1969 	NOSYS,					/* 171 */
1970 	NOSYS,					/* 172 */
1971 	NOSYS,					/* 173 */
1972 	EMULATE(s10_pwrite, 4 | RV_DEFAULT),	/* 174 */
1973 	NOSYS,					/* 175 */
1974 	NOSYS,					/* 176 */
1975 	NOSYS,					/* 177 */
1976 	NOSYS,					/* 178 */
1977 	NOSYS,					/* 179 */
1978 	NOSYS,					/* 180 */
1979 	NOSYS,					/* 181 */
1980 	NOSYS,					/* 182 */
1981 	NOSYS,					/* 183 */
1982 	NOSYS,					/* 184 */
1983 	EMULATE(s10_acl, 4 | RV_DEFAULT),	/* 185 */
1984 	EMULATE(s10_auditsys, 4 | RV_64RVAL),	/* 186 */
1985 	NOSYS,					/* 187 */
1986 	NOSYS,					/* 188 */
1987 	NOSYS,					/* 189 */
1988 	EMULATE(s10_sigqueue, 4 | RV_DEFAULT),	/* 190 */
1989 	NOSYS,					/* 191 */
1990 	NOSYS,					/* 192 */
1991 	NOSYS,					/* 193 */
1992 	NOSYS,					/* 194 */
1993 	NOSYS,					/* 195 */
1994 	NOSYS,					/* 196 */
1995 	NOSYS,					/* 197 */
1996 	NOSYS,					/* 198 */
1997 	NOSYS,					/* 199 */
1998 	EMULATE(s10_facl, 4 | RV_DEFAULT),	/* 200 */
1999 	NOSYS,					/* 201 */
2000 	NOSYS,					/* 202 */
2001 	NOSYS,					/* 203 */
2002 	NOSYS,					/* 204 */
2003 	EMULATE(s10_signotify, 3 | RV_DEFAULT),	/* 205 */
2004 	NOSYS,					/* 206 */
2005 	NOSYS,					/* 207 */
2006 	NOSYS,					/* 208 */
2007 	NOSYS,					/* 209 */
2008 	EMULATE(s10_lwp_mutex_timedlock, 2 | RV_DEFAULT), /* 210 */
2009 	NOSYS,					/* 211 */
2010 	NOSYS,					/* 212 */
2011 #if defined(_LP64)	/* getdents64 is emulated only when _LP64 is not defined */
2012 	NOSYS,					/* 213 */
2013 #else	/* !_LP64 */
2014 	EMULATE(s10_getdents64, 3 | RV_DEFAULT), /* 213 */
2015 #endif	/* _LP64 */
2016 	NOSYS,					/* 214 */
2017 #if defined(_LP64)	/* stat64/lstat64/fstat64 are emulated only when _LP64 is not defined */
2018 	NOSYS,					/* 215 */
2019 	NOSYS,					/* 216 */
2020 	NOSYS,					/* 217 */
2021 #else	/* !_LP64 */
2022 	EMULATE(s10_stat64, 2 | RV_DEFAULT),	/* 215 */
2023 	EMULATE(s10_lstat64, 2 | RV_DEFAULT),	/* 216 */
2024 	EMULATE(s10_fstat64, 2 | RV_DEFAULT),	/* 217 */
2025 #endif	/* _LP64 */
2026 	NOSYS,					/* 218 */
2027 	NOSYS,					/* 219 */
2028 	NOSYS,					/* 220 */
2029 	NOSYS,					/* 221 */
2030 	NOSYS,					/* 222 */
2031 #if defined(_LP64)	/* pwrite64/creat64/open64 are emulated only when _LP64 is not defined */
2032 	NOSYS,					/* 223 */
2033 	NOSYS,					/* 224 */
2034 	NOSYS,					/* 225 */
2035 #else	/* !_LP64 */
2036 	EMULATE(s10_pwrite64, 5 | RV_DEFAULT),	/* 223 */
2037 	EMULATE(s10_creat64, 2 | RV_DEFAULT),	/* 224 */
2038 	EMULATE(s10_open64, 3 | RV_DEFAULT),	/* 225 */
2039 #endif	/* _LP64 */
2040 	NOSYS,					/* 226 */
2041 	EMULATE(s10_zone, 5 | RV_DEFAULT),	/* 227 */
2042 	NOSYS,					/* 228 */
2043 	NOSYS,					/* 229 */
2044 	NOSYS,					/* 230 */
2045 	NOSYS,					/* 231 */
2046 	NOSYS,					/* 232 */
2047 	NOSYS,					/* 233 */
2048 	NOSYS,					/* 234 */
2049 	NOSYS,					/* 235 */
2050 	NOSYS,					/* 236 */
2051 	NOSYS,					/* 237 */
2052 	NOSYS,					/* 238 */
2053 	NOSYS,					/* 239 */
2054 	NOSYS,					/* 240 */
2055 	NOSYS,					/* 241 */
2056 	NOSYS,					/* 242 */
2057 	NOSYS,					/* 243 */
2058 	NOSYS,					/* 244 */
2059 	NOSYS,					/* 245 */
2060 	NOSYS,					/* 246 */
2061 	NOSYS,					/* 247 */
2062 	NOSYS,					/* 248 */
2063 	NOSYS,					/* 249 */
2064 	NOSYS,					/* 250 */
2065 	EMULATE(s10_lwp_mutex_trylock, 1 | RV_DEFAULT), /* 251 */
2066 	NOSYS,					/* 252 */
2067 	NOSYS,					/* 253 */
2068 	NOSYS,					/* 254 */
2069 	NOSYS					/* 255 */
2070 };
2071