1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2016 Toomas Soome <tsoome@me.com>
24 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
25 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright 2024 Oxide Computer Company
27 */
28
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <dirent.h>
32 #include <stddef.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <strings.h>
36 #include <unistd.h>
37 #include <thread.h>
38 #include <sys/auxv.h>
39 #include <sys/brand.h>
40 #include <sys/inttypes.h>
41 #include <sys/lwp.h>
42 #include <sys/syscall.h>
43 #include <sys/systm.h>
44 #include <sys/utsname.h>
45 #include <sys/sysconfig.h>
46 #include <sys/systeminfo.h>
47 #include <sys/zone.h>
48 #include <sys/stat.h>
49 #include <sys/mntent.h>
50 #include <sys/ctfs.h>
51 #include <sys/priv.h>
52 #include <sys/acctctl.h>
53 #include <libgen.h>
54 #include <bsm/audit.h>
55 #include <sys/crypto/ioctl.h>
56 #include <sys/fs/zfs.h>
57 #include <sys/zfs_ioctl.h>
58 #include <sys/ucontext.h>
59 #include <sys/mntio.h>
60 #include <sys/mnttab.h>
61 #include <sys/attr.h>
62 #include <sys/lofi.h>
63 #include <atomic.h>
64 #include <sys/acl.h>
65 #include <sys/socket.h>
66 #include <sys/fdsync.h>
67
68 #include <s10_brand.h>
69 #include <brand_misc.h>
70 #include <s10_misc.h>
71 #include <s10_signal.h>
72
73 /*
74 * See usr/src/lib/brand/shared/brand/common/brand_util.c for general
75 * emulation notes.
76 */
77
/* NOTE(review): not referenced in this part of the file -- confirm usage. */
static zoneid_t zoneid;
/* NOTE(review): not referenced in this part of the file -- confirm usage. */
static boolean_t emul_global_zone = B_FALSE;
/* Feature bitmap tested by S10_FEATURE_IS_PRESENT(). */
static s10_emul_bitmap_t emul_bitmap;
/* PID of the zone's init process; filled in by get_initpid_info(). */
pid_t zone_init_pid;
82
83 /*
84 * S10_FEATURE_IS_PRESENT is a macro that helps facilitate conditional
85 * emulation. For each constant N defined in the s10_emulated_features
86 * enumeration in usr/src/uts/common/brand/solaris10/s10_brand.h,
87 * S10_FEATURE_IS_PRESENT(N) is true iff the feature/backport represented by N
88 * is present in the Solaris 10 image hosted within the zone. In other words,
89 * S10_FEATURE_IS_PRESENT(N) is true iff the file /usr/lib/brand/solaris10/M,
90 * where M is the enum value of N, was present in the zone when the zone booted.
91 *
92 *
93 * *** Sample Usage
94 *
95 * Suppose that you need to backport a fix to Solaris 10 and there is
96 * emulation in place for the fix. Suppose further that the emulation won't be
97 * needed if the fix is backported (i.e., if the fix is present in the hosted
98 * Solaris 10 environment, then the brand won't need the emulation). Then if
99 * you add a constant named "S10_FEATURE_X" to the end of the
100 * s10_emulated_features enumeration that represents the backported fix and
101 * S10_FEATURE_X evaluates to four, then you should create a file named
102 * /usr/lib/brand/solaris10/4 as part of your backport. Additionally, you
103 * should retain the aforementioned emulation but modify it so that it's
104 * performed only when S10_FEATURE_IS_PRESENT(S10_FEATURE_X) is false. Thus the
105 * emulation function should look something like the following:
106 *
107 * static int
108 * my_emul_function(sysret_t *rv, ...)
109 * {
110 * if (S10_FEATURE_IS_PRESENT(S10_FEATURE_X)) {
111 * // Don't emulate
112 * return (__systemcall(rv, ...));
113 * } else {
114 * // Emulate whatever needs to be emulated when the
115 * // backport isn't present in the Solaris 10 image.
116 * }
117 * }
118 */
/*
 * The feature constant selects one byte of emul_bitmap (constant >> 3) and
 * one bit within that byte (constant & 0x7); the macro yields nonzero iff
 * that bit is set.
 */
#define	S10_FEATURE_IS_PRESENT(s10_emulated_features_constant)	\
	((emul_bitmap[(s10_emulated_features_constant) >> 3] &	\
	(1 << ((s10_emulated_features_constant) & 0x7))) != 0)
122
/*
 * Forward declaration of the brand's system call table.
 * NOTE(review): the definition is not in this part of the file.
 */
brand_sysent_table_t brand_sysent_table[];

/*
 * NOTE(review): presumably the release and version strings reported to
 * Solaris 10 processes -- confirm against the code that uses them.
 */
#define	S10_UTS_RELEASE	"5.10"
#define	S10_UTS_VERSION	"Generic_Virtual"
127
128 /*
129 * If the ioctl fd's major doesn't match "major", then pass through the
130 * ioctl, since it is not the expected device. major should be a
131 * pointer to a static dev_t initialized to -1, and devname should be
132 * the path of the device.
133 *
134 * Returns 1 if the ioctl was handled (in which case *err contains the
135 * error code), or 0 if it still needs handling.
136 */
137 static int
passthru_otherdev_ioctl(dev_t * majordev,const char * devname,int * err,sysret_t * rval,int fdes,int cmd,intptr_t arg)138 passthru_otherdev_ioctl(dev_t *majordev, const char *devname, int *err,
139 sysret_t *rval, int fdes, int cmd, intptr_t arg)
140 {
141 struct stat sbuf;
142
143 if (*majordev == (dev_t)-1) {
144 if ((*err = __systemcall(rval, SYS_fstatat + 1024,
145 AT_FDCWD, devname, &sbuf, 0) != 0) != 0)
146 goto doioctl;
147
148 *majordev = major(sbuf.st_rdev);
149 }
150
151 if ((*err = __systemcall(rval, SYS_fstatat + 1024, fdes,
152 NULL, &sbuf, 0)) != 0)
153 goto doioctl;
154
155 if (major(sbuf.st_rdev) == *majordev)
156 return (0);
157
158 doioctl:
159 *err = (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
160 return (1);
161 }
162
163 /*
164 * Figures out the PID of init for the zone. Also returns a boolean
165 * indicating whether this process currently has that pid: if so,
166 * then at this moment, we are init.
167 */
168 static boolean_t
get_initpid_info(void)169 get_initpid_info(void)
170 {
171 pid_t pid;
172 sysret_t rval;
173 int err;
174
175 /*
176 * Determine the current process PID and the PID of the zone's init.
177 * We use care not to call getpid() here, because we're not supposed
178 * to call getpid() until after the program is fully linked-- the
179 * first call to getpid() is a signal from the linker to debuggers
180 * that linking has been completed.
181 */
182 if ((err = __systemcall(&rval, SYS_brand,
183 B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) {
184 brand_abort(err, "Failed to get init's pid");
185 }
186
187 /*
188 * Note that we need to be cautious with the pid we get back--
189 * it should not be stashed and used in place of getpid(), since
190 * we might fork(2). So we keep zone_init_pid and toss the pid
191 * we otherwise got.
192 */
193 if (pid == zone_init_pid)
194 return (B_TRUE);
195
196 return (B_FALSE);
197 }
198
199 /* Free the thread-local storage provided by mntfs_get_mntentbuf(). */
200 static void
mntfs_free_mntentbuf(void * arg)201 mntfs_free_mntentbuf(void *arg)
202 {
203 struct mntentbuf *embufp = arg;
204
205 if (embufp == NULL)
206 return;
207 if (embufp->mbuf_emp)
208 free(embufp->mbuf_emp);
209 if (embufp->mbuf_buf)
210 free(embufp->mbuf_buf);
211 bzero(embufp, sizeof (struct mntentbuf));
212 free(embufp);
213 }
214
215 /* Provide the thread-local storage required by mntfs_ioctl(). */
216 static struct mntentbuf *
mntfs_get_mntentbuf(size_t size)217 mntfs_get_mntentbuf(size_t size)
218 {
219 static mutex_t keylock;
220 static thread_key_t key;
221 static int once_per_keyname = 0;
222 void *tsd = NULL;
223 struct mntentbuf *embufp;
224
225 /* Create the key. */
226 if (!once_per_keyname) {
227 (void) mutex_lock(&keylock);
228 if (!once_per_keyname) {
229 if (thr_keycreate(&key, mntfs_free_mntentbuf)) {
230 (void) mutex_unlock(&keylock);
231 return (NULL);
232 } else {
233 once_per_keyname++;
234 }
235 }
236 (void) mutex_unlock(&keylock);
237 }
238
239 /*
240 * The thread-specific datum for this key is the address of a struct
241 * mntentbuf. If this is the first time here then we allocate the struct
242 * and its contents, and associate its address with the thread; if there
243 * are any problems then we abort.
244 */
245 if (thr_getspecific(key, &tsd))
246 return (NULL);
247 if (tsd == NULL) {
248 if (!(embufp = calloc(1, sizeof (struct mntentbuf))) ||
249 !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) ||
250 thr_setspecific(key, embufp)) {
251 mntfs_free_mntentbuf(embufp);
252 return (NULL);
253 }
254 } else {
255 embufp = tsd;
256 }
257
258 /* Return the buffer, resizing it if necessary. */
259 if (size > embufp->mbuf_bufsize) {
260 if (embufp->mbuf_buf)
261 free(embufp->mbuf_buf);
262 if ((embufp->mbuf_buf = malloc(size)) == NULL) {
263 embufp->mbuf_bufsize = 0;
264 return (NULL);
265 } else {
266 embufp->mbuf_bufsize = size;
267 }
268 }
269 return (embufp);
270 }
271
272 /*
273 * The MNTIOC_GETMNTENT command in this release differs from that in early
274 * versions of Solaris 10.
275 *
276 * Previously, the command would copy a pointer to a struct extmnttab to an
277 * address provided as an argument. The pointer would be somewhere within a
278 * mapping already present within the user's address space. In addition, the
279 * text to which the struct's members pointed would also be within a
280 * pre-existing mapping. Now, the user is required to allocate memory for both
281 * the struct and the text buffer, and to pass the address of each within a
282 * struct mntentbuf. In order to conceal these details from a Solaris 10 client
283 * we allocate some thread-local storage in which to create the necessary data
284 * structures; this is static, thread-safe memory that will be cleaned up
285 * without the caller's intervention.
286 *
287 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should
288 * not work for older clients.
289 */
290 int
mntfs_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)291 mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
292 {
293 int err;
294 struct stat statbuf;
295 struct mntentbuf *embufp;
296 static size_t bufsize = MNT_LINE_MAX;
297
298 /* Do not emulate mntfs commands from up-to-date clients. */
299 if (S10_FEATURE_IS_PRESENT(S10_FEATURE_ALTERED_MNTFS_IOCTL))
300 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
301
302 /* Do not emulate mntfs commands directed at other file systems. */
303 if ((err = __systemcall(rval, SYS_fstatat + 1024,
304 fdes, NULL, &statbuf, 0)) != 0)
305 return (err);
306 if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0)
307 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
308
309 if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY)
310 return (EINVAL);
311
312 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
313 return (ENOMEM);
314
315 /*
316 * MNTIOC_GETEXTMNTENT advances the file pointer once it has
317 * successfully copied out the result to the address provided. We
318 * therefore need to check the user-supplied address now since the
319 * one we'll be providing is guaranteed to work.
320 */
321 if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
322 return (EFAULT);
323
324 /*
325 * Keep retrying for as long as we fail for want of a large enough
326 * buffer.
327 */
328 for (;;) {
329 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
330 MNTIOC_GETEXTMNTENT, embufp)) != 0)
331 return (err);
332
333 if (rval->sys_rval1 == MNTFS_TOOLONG) {
334 /* The buffer wasn't large enough. */
335 (void) atomic_swap_ulong((unsigned long *)&bufsize,
336 2 * embufp->mbuf_bufsize);
337 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
338 return (ENOMEM);
339 } else {
340 break;
341 }
342 }
343
344 if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
345 return (EFAULT);
346
347 return (0);
348 }
349
/*
 * Assign the structure member value from the s (source) structure to the
 * d (dest) structure.  val is pasted after a '.' on both sides, so it may
 * itself be a nested member path such as fl_list.fl_digest.
 */
#define	struct_assign(d, s, val)	(((d).val) = ((s).val))
355
/*
 * The CRYPTO_GET_FUNCTION_LIST parameter structure crypto_function_list_t
 * changed between S10 and Nevada, so we have to emulate the old S10
 * crypto_function_list_t structure when interposing on the ioctl syscall.
 *
 * crypto_ioctl() fills in every member below from the identically named
 * member of the native structure.
 * NOTE(review): each fl_* flag presumably reports whether the provider
 * implements the named entry point -- confirm against sys/crypto/ioctl.h.
 */
typedef struct s10_crypto_function_list {
	boolean_t fl_digest_init;
	boolean_t fl_digest;
	boolean_t fl_digest_update;
	boolean_t fl_digest_key;
	boolean_t fl_digest_final;

	boolean_t fl_encrypt_init;
	boolean_t fl_encrypt;
	boolean_t fl_encrypt_update;
	boolean_t fl_encrypt_final;

	boolean_t fl_decrypt_init;
	boolean_t fl_decrypt;
	boolean_t fl_decrypt_update;
	boolean_t fl_decrypt_final;

	boolean_t fl_mac_init;
	boolean_t fl_mac;
	boolean_t fl_mac_update;
	boolean_t fl_mac_final;

	boolean_t fl_sign_init;
	boolean_t fl_sign;
	boolean_t fl_sign_update;
	boolean_t fl_sign_final;
	boolean_t fl_sign_recover_init;
	boolean_t fl_sign_recover;

	boolean_t fl_verify_init;
	boolean_t fl_verify;
	boolean_t fl_verify_update;
	boolean_t fl_verify_final;
	boolean_t fl_verify_recover_init;
	boolean_t fl_verify_recover;

	boolean_t fl_digest_encrypt_update;
	boolean_t fl_decrypt_digest_update;
	boolean_t fl_sign_encrypt_update;
	boolean_t fl_decrypt_verify_update;

	boolean_t fl_seed_random;
	boolean_t fl_generate_random;

	boolean_t fl_session_open;
	boolean_t fl_session_close;
	boolean_t fl_session_login;
	boolean_t fl_session_logout;

	boolean_t fl_object_create;
	boolean_t fl_object_copy;
	boolean_t fl_object_destroy;
	boolean_t fl_object_get_size;
	boolean_t fl_object_get_attribute_value;
	boolean_t fl_object_set_attribute_value;
	boolean_t fl_object_find_init;
	boolean_t fl_object_find;
	boolean_t fl_object_find_final;

	boolean_t fl_key_generate;
	boolean_t fl_key_generate_pair;
	boolean_t fl_key_wrap;
	boolean_t fl_key_unwrap;
	boolean_t fl_key_derive;

	boolean_t fl_init_token;
	boolean_t fl_init_pin;
	boolean_t fl_set_pin;

	boolean_t prov_is_hash_limited;
	uint32_t prov_hash_threshold;
	uint32_t prov_hash_limit;
} s10_crypto_function_list_t;
434
/*
 * S10 layout of the CRYPTO_GET_FUNCTION_LIST ioctl argument, as copied in
 * from and back out to the S10 caller by crypto_ioctl().
 */
typedef struct s10_crypto_get_function_list {
	uint_t fl_return_value;		/* copied back from the native call */
	crypto_provider_id_t fl_provider_id;	/* passed in to the native call */
	s10_crypto_function_list_t fl_list;	/* copied back member by member */
} s10_crypto_get_function_list_t;
440
441 /*
442 * The structure returned by the CRYPTO_GET_FUNCTION_LIST ioctl on /dev/crypto
443 * increased in size due to:
444 * 6482533 Threshold for HW offload via PKCS11 interface
445 * between S10 and Nevada. This is a relatively simple process of filling
446 * in the S10 structure fields with the Nevada data.
447 *
448 * We stat the device to make sure that the ioctl is meant for /dev/crypto.
449 *
450 */
451 static int
crypto_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)452 crypto_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
453 {
454 int err;
455 s10_crypto_get_function_list_t s10_param;
456 crypto_get_function_list_t native_param;
457 static dev_t crypto_dev = (dev_t)-1;
458
459 if (passthru_otherdev_ioctl(&crypto_dev, "/dev/crypto", &err,
460 rval, fdes, cmd, arg) == 1)
461 return (err);
462
463 if (brand_uucopy((const void *)arg, &s10_param, sizeof (s10_param))
464 != 0)
465 return (EFAULT);
466 struct_assign(native_param, s10_param, fl_provider_id);
467 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd,
468 &native_param)) != 0)
469 return (err);
470
471 struct_assign(s10_param, native_param, fl_return_value);
472 struct_assign(s10_param, native_param, fl_provider_id);
473
474 struct_assign(s10_param, native_param, fl_list.fl_digest_init);
475 struct_assign(s10_param, native_param, fl_list.fl_digest);
476 struct_assign(s10_param, native_param, fl_list.fl_digest_update);
477 struct_assign(s10_param, native_param, fl_list.fl_digest_key);
478 struct_assign(s10_param, native_param, fl_list.fl_digest_final);
479
480 struct_assign(s10_param, native_param, fl_list.fl_encrypt_init);
481 struct_assign(s10_param, native_param, fl_list.fl_encrypt);
482 struct_assign(s10_param, native_param, fl_list.fl_encrypt_update);
483 struct_assign(s10_param, native_param, fl_list.fl_encrypt_final);
484
485 struct_assign(s10_param, native_param, fl_list.fl_decrypt_init);
486 struct_assign(s10_param, native_param, fl_list.fl_decrypt);
487 struct_assign(s10_param, native_param, fl_list.fl_decrypt_update);
488 struct_assign(s10_param, native_param, fl_list.fl_decrypt_final);
489
490 struct_assign(s10_param, native_param, fl_list.fl_mac_init);
491 struct_assign(s10_param, native_param, fl_list.fl_mac);
492 struct_assign(s10_param, native_param, fl_list.fl_mac_update);
493 struct_assign(s10_param, native_param, fl_list.fl_mac_final);
494
495 struct_assign(s10_param, native_param, fl_list.fl_sign_init);
496 struct_assign(s10_param, native_param, fl_list.fl_sign);
497 struct_assign(s10_param, native_param, fl_list.fl_sign_update);
498 struct_assign(s10_param, native_param, fl_list.fl_sign_final);
499 struct_assign(s10_param, native_param, fl_list.fl_sign_recover_init);
500 struct_assign(s10_param, native_param, fl_list.fl_sign_recover);
501
502 struct_assign(s10_param, native_param, fl_list.fl_verify_init);
503 struct_assign(s10_param, native_param, fl_list.fl_verify);
504 struct_assign(s10_param, native_param, fl_list.fl_verify_update);
505 struct_assign(s10_param, native_param, fl_list.fl_verify_final);
506 struct_assign(s10_param, native_param, fl_list.fl_verify_recover_init);
507 struct_assign(s10_param, native_param, fl_list.fl_verify_recover);
508
509 struct_assign(s10_param, native_param,
510 fl_list.fl_digest_encrypt_update);
511 struct_assign(s10_param, native_param,
512 fl_list.fl_decrypt_digest_update);
513 struct_assign(s10_param, native_param, fl_list.fl_sign_encrypt_update);
514 struct_assign(s10_param, native_param,
515 fl_list.fl_decrypt_verify_update);
516
517 struct_assign(s10_param, native_param, fl_list.fl_seed_random);
518 struct_assign(s10_param, native_param, fl_list.fl_generate_random);
519
520 struct_assign(s10_param, native_param, fl_list.fl_session_open);
521 struct_assign(s10_param, native_param, fl_list.fl_session_close);
522 struct_assign(s10_param, native_param, fl_list.fl_session_login);
523 struct_assign(s10_param, native_param, fl_list.fl_session_logout);
524
525 struct_assign(s10_param, native_param, fl_list.fl_object_create);
526 struct_assign(s10_param, native_param, fl_list.fl_object_copy);
527 struct_assign(s10_param, native_param, fl_list.fl_object_destroy);
528 struct_assign(s10_param, native_param, fl_list.fl_object_get_size);
529 struct_assign(s10_param, native_param,
530 fl_list.fl_object_get_attribute_value);
531 struct_assign(s10_param, native_param,
532 fl_list.fl_object_set_attribute_value);
533 struct_assign(s10_param, native_param, fl_list.fl_object_find_init);
534 struct_assign(s10_param, native_param, fl_list.fl_object_find);
535 struct_assign(s10_param, native_param, fl_list.fl_object_find_final);
536
537 struct_assign(s10_param, native_param, fl_list.fl_key_generate);
538 struct_assign(s10_param, native_param, fl_list.fl_key_generate_pair);
539 struct_assign(s10_param, native_param, fl_list.fl_key_wrap);
540 struct_assign(s10_param, native_param, fl_list.fl_key_unwrap);
541 struct_assign(s10_param, native_param, fl_list.fl_key_derive);
542
543 struct_assign(s10_param, native_param, fl_list.fl_init_token);
544 struct_assign(s10_param, native_param, fl_list.fl_init_pin);
545 struct_assign(s10_param, native_param, fl_list.fl_set_pin);
546
547 struct_assign(s10_param, native_param, fl_list.prov_is_hash_limited);
548 struct_assign(s10_param, native_param, fl_list.prov_hash_threshold);
549 struct_assign(s10_param, native_param, fl_list.prov_hash_limit);
550
551 return (brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param)));
552 }
553
/*
 * The process contract CT_TGET and CT_TSET parameter structure ct_param_t
 * changed between S10 and Nevada, so we have to emulate the old S10
 * ct_param_t structure when interposing on the ioctl syscall.
 */
typedef struct s10_ct_param {
	uint32_t ctpm_id;	/* copied to the native ctpm_id */
	uint32_t ctpm_pad;	/* not read or written by ctfs_ioctl() */
	uint64_t ctpm_value;	/* native ctpm_value points at this field */
} s10_ct_param_t;
564
565 /*
566 * We have to emulate process contract ioctls for init(8) because the
567 * ioctl parameter structure changed between S10 and Nevada. This is
568 * a relatively simple process of filling Nevada structure fields,
569 * shuffling values, and initiating a native system call.
570 *
571 * For now, we'll assume that all consumers of CT_TGET and CT_TSET will
572 * need emulation. We'll issue a stat to make sure that the ioctl
573 * is meant for the contract file system.
574 *
575 */
576 static int
ctfs_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)577 ctfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
578 {
579 int err;
580 s10_ct_param_t s10param;
581 ct_param_t param;
582 struct stat statbuf;
583
584 if ((err = __systemcall(rval, SYS_fstatat + 1024,
585 fdes, NULL, &statbuf, 0)) != 0)
586 return (err);
587 if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0)
588 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
589
590 if (brand_uucopy((const void *)arg, &s10param, sizeof (s10param)) != 0)
591 return (EFAULT);
592 param.ctpm_id = s10param.ctpm_id;
593 param.ctpm_size = sizeof (uint64_t);
594 param.ctpm_value = &s10param.ctpm_value;
595 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, ¶m))
596 != 0)
597 return (err);
598
599 if (cmd == CT_TGET)
600 return (brand_uucopy(&s10param, (void *)arg,
601 sizeof (s10param)));
602
603 return (0);
604 }
605
606 /*
607 * ZFS ioctls have changed in each Solaris 10 (S10) release as well as in
608 * Solaris Next. The brand wraps ZFS commands so that the native commands
609 * are used, but we want to be sure no command sneaks in that uses ZFS
610 * without our knowledge. We'll abort the process if we see a ZFS ioctl.
611 */
612 static int
zfs_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)613 zfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
614 {
615 static dev_t zfs_dev = (dev_t)-1;
616 int err;
617
618 if (passthru_otherdev_ioctl(&zfs_dev, ZFS_DEV, &err,
619 rval, fdes, cmd, arg) == 1)
620 return (err);
621
622 brand_abort(0, "ZFS ioctl!");
623 /*NOTREACHED*/
624 return (0);
625 }
626
/*
 * S10 layout of the lofi ioctl argument, as copied in and out by
 * lofi_ioctl().  Note that li_filename is one byte longer here than the
 * [MAXPATHLEN] array lofi_ioctl() describes for the native structure.
 */
struct s10_lofi_ioctl {
	uint32_t li_id;
	boolean_t li_force;
	char li_filename[MAXPATHLEN + 1];
};
632
633 static int
lofi_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)634 lofi_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
635 {
636 static dev_t lofi_dev = (dev_t)-1;
637 struct s10_lofi_ioctl s10_param;
638 struct lofi_ioctl native_param;
639 int err;
640
641 if (passthru_otherdev_ioctl(&lofi_dev, "/dev/lofictl", &err,
642 rval, fdes, cmd, arg) == 1)
643 return (err);
644
645 if (brand_uucopy((const void *)arg, &s10_param,
646 sizeof (s10_param)) != 0)
647 return (EFAULT);
648
649 /*
650 * Somewhat weirdly, EIO is what the S10 lofi driver would
651 * return for unrecognised cmds.
652 */
653 if (cmd >= LOFI_CHECK_COMPRESSED)
654 return (EIO);
655
656 bzero(&native_param, sizeof (native_param));
657
658 struct_assign(native_param, s10_param, li_id);
659 struct_assign(native_param, s10_param, li_force);
660
661 /*
662 * Careful here, this has changed from [MAXPATHLEN + 1] to
663 * [MAXPATHLEN].
664 */
665 bcopy(s10_param.li_filename, native_param.li_filename,
666 sizeof (native_param.li_filename));
667 native_param.li_filename[MAXPATHLEN - 1] = '\0';
668
669 err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &native_param);
670
671 struct_assign(s10_param, native_param, li_id);
672 /* li_force is input-only */
673
674 bcopy(native_param.li_filename, s10_param.li_filename,
675 sizeof (native_param.li_filename));
676
677 (void) brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param));
678 return (err);
679 }
680
681 int
s10_ioctl(sysret_t * rval,int fdes,int cmd,intptr_t arg)682 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
683 {
684 switch (cmd) {
685 case CRYPTO_GET_FUNCTION_LIST:
686 return (crypto_ioctl(rval, fdes, cmd, arg));
687 case CT_TGET:
688 /*FALLTHRU*/
689 case CT_TSET:
690 return (ctfs_ioctl(rval, fdes, cmd, arg));
691 case MNTIOC_GETMNTENT:
692 /*FALLTHRU*/
693 case MNTIOC_GETEXTMNTENT:
694 /*FALLTHRU*/
695 case MNTIOC_GETMNTANY:
696 return (mntfs_ioctl(rval, fdes, cmd, arg));
697 }
698
699 switch (cmd & ~0xff) {
700 case ZFS_IOC:
701 return (zfs_ioctl(rval, fdes, cmd, arg));
702
703 case LOFI_IOC_BASE:
704 return (lofi_ioctl(rval, fdes, cmd, arg));
705
706 default:
707 break;
708 }
709
710 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
711 }
712
713 /*
714 * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when
715 * applied to files opened with O_APPEND. The offset argument is ignored and
716 * the buffer is appended to the target file in S10, whereas the current file
717 * position is ignored in Nevada (i.e., pwrite() acts as though the target file
718 * wasn't opened with O_APPEND). This is a result of the fix for CR 6655660
719 * (pwrite() must ignore the O_APPEND/FAPPEND flag).
720 *
721 * We emulate the old S10 pwrite() behavior by checking whether the target file
722 * was opened with O_APPEND. If it was, then invoke the write() system call
723 * instead of pwrite(); otherwise, invoke the pwrite() system call as usual.
724 */
725 static int
s10_pwrite(sysret_t * rval,int fd,const void * bufferp,size_t num_bytes,off_t offset)726 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes,
727 off_t offset)
728 {
729 int err;
730
731 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL, 0, 0)) !=
732 0) {
733 return (err);
734 }
735 if (rval->sys_rval1 & O_APPEND)
736 return (__systemcall(rval, SYS_write + 1024, fd, bufferp,
737 num_bytes));
738 return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes,
739 offset));
740 }
741
#if !defined(_LP64)
/*
 * Large file variant of the pwrite() emulation, for 32-bit processes.  The
 * 64-bit offset arrives as two 32-bit halves, which are handed to the
 * native call unchanged.  See the comment above s10_pwrite() for why the
 * emulation exists.
 */
static int
s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes,
    uint32_t offset_1, uint32_t offset_2)
{
	int error;

	error = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL, 0, 0);
	if (error != 0)
		return (error);

	if ((rval->sys_rval1 & O_APPEND) == 0) {
		return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp,
		    num_bytes, offset_1, offset_2));
	}

	return (__systemcall(rval, SYS_write + 1024, fd, bufferp, num_bytes));
}
#endif	/* !_LP64 */
765
/*
 * These are convenience macros that s10_getdents_common() uses.  Both treat
 * their arguments, which should be character pointers, as dirent pointers or
 * dirent64 pointers and yield their d_name and d_reclen fields.  These
 * macros shouldn't be used outside of s10_getdents_common(): they expand to
 * expressions that reference variables named name_offset and reclen_offset,
 * which must be in scope at the point of use.  dirent_reclen() reads the
 * field as an unsigned short.
 */
#define	dirent_name(charptr)	((charptr) + name_offset)
#define	dirent_reclen(charptr)	\
	(*(unsigned short *)(uintptr_t)((charptr) + reclen_offset))
775
776 /*
777 * This function contains code that is common to both s10_getdents() and
778 * s10_getdents64(). See the comment above s10_getdents() for details.
779 *
780 * rval, fd, buf, and nbyte should be passed unmodified from s10_getdents()
781 * and s10_getdents64(). getdents_syscall_id should be either SYS_getdents
782 * or SYS_getdents64. name_offset should be the the byte offset of
783 * the d_name field in the dirent structures passed to the kernel via the
784 * syscall represented by getdents_syscall_id. reclen_offset should be
785 * the byte offset of the d_reclen field in the aforementioned dirent
786 * structures.
787 */
788 static int
s10_getdents_common(sysret_t * rval,int fd,char * buf,size_t nbyte,int getdents_syscall_id,size_t name_offset,size_t reclen_offset)789 s10_getdents_common(sysret_t *rval, int fd, char *buf, size_t nbyte,
790 int getdents_syscall_id, size_t name_offset, size_t reclen_offset)
791 {
792 int err;
793 size_t buf_size;
794 char *local_buf;
795 char *buf_current;
796
797 /*
798 * Use a special brand operation, B_S10_ISFDXATTRDIR, to determine
799 * whether the specified file descriptor refers to an extended file
800 * attribute directory. If it doesn't, then SYS_getdents won't
801 * reveal extended file attributes, in which case we can simply
802 * hand the syscall to the native kernel.
803 */
804 if ((err = __systemcall(rval, SYS_brand + 1024, B_S10_ISFDXATTRDIR,
805 fd)) != 0)
806 return (err);
807 if (rval->sys_rval1 == 0)
808 return (__systemcall(rval, getdents_syscall_id + 1024, fd, buf,
809 nbyte));
810
811 /*
812 * The file descriptor refers to an extended file attributes directory.
813 * We need to create a dirent buffer that's as large as buf into which
814 * the native SYS_getdents will store the special extended file
815 * attribute directory's entries. We can't dereference buf because
816 * it might be an invalid pointer!
817 */
818 if (nbyte > MAXGETDENTS_SIZE)
819 nbyte = MAXGETDENTS_SIZE;
820 local_buf = (char *)malloc(nbyte);
821 if (local_buf == NULL) {
822 /*
823 * getdents(2) doesn't return an error code indicating a memory
824 * allocation error and it doesn't make sense to return any of
825 * its documented error codes for a malloc(3C) failure. We'll
826 * use ENOMEM even though getdents(2) doesn't use it because it
827 * best describes the failure.
828 */
829 (void) B_TRUSS_POINT_3(rval, getdents_syscall_id, ENOMEM, fd,
830 buf, nbyte);
831 rval->sys_rval1 = -1;
832 rval->sys_rval2 = 0;
833 return (EIO);
834 }
835
836 /*
837 * Issue a native SYS_getdents syscall but use our local dirent buffer
838 * instead of buf. This will allow us to examine the returned dirent
839 * structures immediately and copy them to buf later. That way the
840 * calling process won't be able to see the dirent structures until
841 * we finish examining them.
842 */
843 if ((err = __systemcall(rval, getdents_syscall_id + 1024, fd, local_buf,
844 nbyte)) != 0) {
845 free(local_buf);
846 return (err);
847 }
848 buf_size = rval->sys_rval1;
849 if (buf_size == 0) {
850 free(local_buf);
851 return (0);
852 }
853
854 /*
855 * Look for SUNWattr_ro (VIEW_READONLY) and SUNWattr_rw
856 * (VIEW_READWRITE) in the directory entries and remove them
857 * from the dirent buffer.
858 */
859 for (buf_current = local_buf;
860 (size_t)(buf_current - local_buf) < buf_size; /* cstyle */) {
861 if (strcmp(dirent_name(buf_current), VIEW_READONLY) != 0 &&
862 strcmp(dirent_name(buf_current), VIEW_READWRITE) != 0) {
863 /*
864 * The dirent refers to an attribute that should
865 * be visible to Solaris 10 processes. Keep it
866 * and examine the next entry in the buffer.
867 */
868 buf_current += dirent_reclen(buf_current);
869 } else {
870 /*
871 * We found either SUNWattr_ro (VIEW_READONLY)
872 * or SUNWattr_rw (VIEW_READWRITE). Remove it
873 * from the dirent buffer by decrementing
874 * buf_size by the size of the entry and
875 * overwriting the entry with the remaining
876 * entries.
877 */
878 buf_size -= dirent_reclen(buf_current);
879 (void) memmove(buf_current, buf_current +
880 dirent_reclen(buf_current), buf_size -
881 (size_t)(buf_current - local_buf));
882 }
883 }
884
885 /*
886 * Copy local_buf into buf so that the calling process can see
887 * the results.
888 */
889 if ((err = brand_uucopy(local_buf, buf, buf_size)) != 0) {
890 free(local_buf);
891 rval->sys_rval1 = -1;
892 rval->sys_rval2 = 0;
893 return (err);
894 }
895 rval->sys_rval1 = buf_size;
896 free(local_buf);
897 return (0);
898 }
899
900 /*
901 * Solaris Next added two special extended file attributes, SUNWattr_ro and
902 * SUNWattr_rw, which are called "extended system attributes". They have
903 * special semantics (e.g., a process cannot unlink SUNWattr_ro) and should
904 * not appear in solaris10-branded zones because no Solaris 10 applications,
905 * including system commands such as tar(1), are coded to correctly handle these
906 * special attributes.
907 *
908 * This emulation function solves the aforementioned problem by emulating
909 * the getdents(2) syscall and filtering both system attributes out of resulting
910 * directory entry lists. The emulation function only filters results when
911 * the given file descriptor refers to an extended file attribute directory.
912 * Filtering getdents(2) results is expensive because it requires dynamic
913 * memory allocation; however, the performance cost is tolerable because
914 * we don't expect Solaris 10 processes to frequently examine extended file
915 * attribute directories.
916 *
917 * The brand's emulation library needs two getdents(2) emulation functions
918 * because getdents(2) comes in two flavors: non-largefile-aware getdents(2)
919 * and largefile-aware getdents64(2). s10_getdents() handles the non-largefile-
920 * aware case for 32-bit processes and all getdents(2) syscalls for 64-bit
921 * processes (64-bit processes use largefile-aware interfaces by default).
922 * See s10_getdents64() below for the largefile-aware getdents64(2) emulation
923 * function for 32-bit processes.
924 */
925 static int
s10_getdents(sysret_t * rval,int fd,struct dirent * buf,size_t nbyte)926 s10_getdents(sysret_t *rval, int fd, struct dirent *buf, size_t nbyte)
927 {
928 return (s10_getdents_common(rval, fd, (char *)buf, nbyte, SYS_getdents,
929 offsetof(struct dirent, d_name),
930 offsetof(struct dirent, d_reclen)));
931 }
932
933 #ifndef _LP64
934 /*
935 * This is the largefile-aware version of getdents(2) for 32-bit processes.
936 * This exists for the same reason that s10_getdents() exists. See the comment
937 * above s10_getdents().
938 */
939 static int
s10_getdents64(sysret_t * rval,int fd,struct dirent64 * buf,size_t nbyte)940 s10_getdents64(sysret_t *rval, int fd, struct dirent64 *buf, size_t nbyte)
941 {
942 return (s10_getdents_common(rval, fd, (char *)buf, nbyte,
943 SYS_getdents64, offsetof(struct dirent64, d_name),
944 offsetof(struct dirent64, d_reclen)));
945 }
946 #endif /* !_LP64 */
947
948 #define S10_TRIVIAL_ACL_CNT 6
949 #define NATIVE_TRIVIAL_ACL_CNT 3
950
951 /*
952 * Check if the ACL qualifies as a trivial ACL based on the native
953 * interpretation.
954 */
955 static boolean_t
has_trivial_native_acl(int cmd,int cnt,const char * fname,int fd)956 has_trivial_native_acl(int cmd, int cnt, const char *fname, int fd)
957 {
958 int i, err;
959 sysret_t rval;
960 ace_t buf[NATIVE_TRIVIAL_ACL_CNT];
961
962 if (fname != NULL)
963 err = __systemcall(&rval, SYS_pathconf + 1024, fname,
964 _PC_ACL_ENABLED);
965 else
966 err = __systemcall(&rval, SYS_fpathconf + 1024, fd,
967 _PC_ACL_ENABLED);
968 if (err != 0 || rval.sys_rval1 != _ACL_ACE_ENABLED)
969 return (B_FALSE);
970
971 /*
972 * If we just got the ACL cnt, we don't need to get it again, its
973 * passed in as the cnt arg.
974 */
975 if (cmd != ACE_GETACLCNT) {
976 if (fname != NULL) {
977 if (__systemcall(&rval, SYS_acl + 1024, fname,
978 ACE_GETACLCNT, 0, NULL) != 0)
979 return (B_FALSE);
980 } else {
981 if (__systemcall(&rval, SYS_facl + 1024, fd,
982 ACE_GETACLCNT, 0, NULL) != 0)
983 return (B_FALSE);
984 }
985 cnt = rval.sys_rval1;
986 }
987
988 if (cnt != NATIVE_TRIVIAL_ACL_CNT)
989 return (B_FALSE);
990
991 if (fname != NULL) {
992 if (__systemcall(&rval, SYS_acl + 1024, fname, ACE_GETACL, cnt,
993 buf) != 0)
994 return (B_FALSE);
995 } else {
996 if (__systemcall(&rval, SYS_facl + 1024, fd, ACE_GETACL, cnt,
997 buf) != 0)
998 return (B_FALSE);
999 }
1000
1001 /*
1002 * The following is based on the logic from the native OS
1003 * ace_trivial_common() to determine if the native ACL is trivial.
1004 */
1005 for (i = 0; i < cnt; i++) {
1006 switch (buf[i].a_flags & ACE_TYPE_FLAGS) {
1007 case ACE_OWNER:
1008 case ACE_GROUP|ACE_IDENTIFIER_GROUP:
1009 case ACE_EVERYONE:
1010 break;
1011 default:
1012 return (B_FALSE);
1013 }
1014
1015 if (buf[i].a_flags & (ACE_FILE_INHERIT_ACE|
1016 ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
1017 ACE_INHERIT_ONLY_ACE))
1018 return (B_FALSE);
1019
1020 /*
1021 * Special check for some special bits
1022 *
1023 * Don't allow anybody to deny reading basic
1024 * attributes or a files ACL.
1025 */
1026 if (buf[i].a_access_mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
1027 buf[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE)
1028 return (B_FALSE);
1029
1030 /*
1031 * Delete permissions are never set by default
1032 */
1033 if (buf[i].a_access_mask & (ACE_DELETE|ACE_DELETE_CHILD))
1034 return (B_FALSE);
1035 /*
1036 * only allow owner@ to have
1037 * write_acl/write_owner/write_attributes/write_xattr/
1038 */
1039 if (buf[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
1040 (!(buf[i].a_flags & ACE_OWNER) && (buf[i].a_access_mask &
1041 (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
1042 ACE_WRITE_NAMED_ATTRS))))
1043 return (B_FALSE);
1044
1045 }
1046
1047 return (B_TRUE);
1048 }
1049
1050 /*
1051 * The following logic is based on the S10 adjust_ace_pair_common() code.
1052 */
1053 static void
s10_adjust_ace_mask(void * pair,size_t access_off,size_t pairsize,mode_t mode)1054 s10_adjust_ace_mask(void *pair, size_t access_off, size_t pairsize, mode_t mode)
1055 {
1056 char *datap = (char *)pair;
1057 uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
1058 uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
1059 access_off);
1060
1061 if (mode & S_IROTH)
1062 *amask1 |= ACE_READ_DATA;
1063 else
1064 *amask0 |= ACE_READ_DATA;
1065 if (mode & S_IWOTH)
1066 *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
1067 else
1068 *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
1069 if (mode & S_IXOTH)
1070 *amask1 |= ACE_EXECUTE;
1071 else
1072 *amask0 |= ACE_EXECUTE;
1073 }
1074
1075 /*
1076 * Construct a trivial S10 style ACL.
1077 */
1078 static int
make_trivial_s10_acl(const char * fname,int fd,ace_t * bp)1079 make_trivial_s10_acl(const char *fname, int fd, ace_t *bp)
1080 {
1081 int err;
1082 sysret_t rval;
1083 struct stat64 buf;
1084 ace_t trivial_s10_acl[] = {
1085 {(uint_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
1086 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1087 ACE_WRITE_NAMED_ATTRS, ACE_OWNER,
1088 ACE_ACCESS_ALLOWED_ACE_TYPE},
1089 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1090 ACE_ACCESS_DENIED_ACE_TYPE},
1091 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
1092 ACE_ACCESS_ALLOWED_ACE_TYPE},
1093 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
1094 ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE,
1095 ACE_ACCESS_DENIED_ACE_TYPE},
1096 {(uint_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|
1097 ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE, ACE_EVERYONE,
1098 ACE_ACCESS_ALLOWED_ACE_TYPE}
1099 };
1100
1101 if (fname != NULL) {
1102 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, AT_FDCWD,
1103 fname, &buf, 0)) != 0)
1104 return (err);
1105 } else {
1106 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, fd,
1107 NULL, &buf, 0)) != 0)
1108 return (err);
1109 }
1110
1111 s10_adjust_ace_mask(&trivial_s10_acl[0], offsetof(ace_t, a_access_mask),
1112 sizeof (ace_t), (buf.st_mode & 0700) >> 6);
1113 s10_adjust_ace_mask(&trivial_s10_acl[2], offsetof(ace_t, a_access_mask),
1114 sizeof (ace_t), (buf.st_mode & 0070) >> 3);
1115 s10_adjust_ace_mask(&trivial_s10_acl[4], offsetof(ace_t, a_access_mask),
1116 sizeof (ace_t), buf.st_mode & 0007);
1117
1118 if (brand_uucopy(&trivial_s10_acl, bp, sizeof (trivial_s10_acl)) != 0)
1119 return (EFAULT);
1120
1121 return (0);
1122 }
1123
1124 /*
1125 * The definition of a trivial ace-style ACL (used by ZFS and NFSv4) has been
1126 * simplified since S10. Instead of 6 entries on a trivial S10 ACE ACL we now
1127 * have 3 streamlined entries. The new, simpler trivial style confuses S10
1128 * commands such as 'ls -v' or 'cp -p' which don't see the expected S10 trivial
1129 * ACL entries and thus assume that there is a complex ACL on the file.
1130 *
1131 * See: PSARC/2010/029 Improved ACL interoperability
1132 *
 * Note that the trivial ACL detection code is implemented in acl_trivial() in
1134 * lib/libsec/common/aclutils.c. It always uses the acl() syscall (not the
1135 * facl syscall) to determine if an ACL is trivial. However, we emulate both
1136 * acl() and facl() so that the two provide consistent results.
1137 *
1138 * We don't currently try to emulate setting of ACLs since the primary
1139 * consumer of this feature is SMB or NFSv4 servers, neither of which are
1140 * supported in solaris10-branded zones. If ACLs are used they must be set on
1141 * files using the native OS interpretation.
1142 */
1143 int
s10_acl(sysret_t * rval,const char * fname,int cmd,int nentries,void * aclbufp)1144 s10_acl(sysret_t *rval, const char *fname, int cmd, int nentries, void *aclbufp)
1145 {
1146 int res;
1147
1148 res = __systemcall(rval, SYS_acl + 1024, fname, cmd, nentries, aclbufp);
1149
1150 switch (cmd) {
1151 case ACE_GETACLCNT:
1152 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1153 rval->sys_rval1, fname, 0)) {
1154 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1155 }
1156 break;
1157 case ACE_GETACL:
1158 if (res == 0 &&
1159 has_trivial_native_acl(ACE_GETACL, 0, fname, 0) &&
1160 nentries >= S10_TRIVIAL_ACL_CNT) {
1161 res = make_trivial_s10_acl(fname, 0, aclbufp);
1162 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1163 }
1164 break;
1165 }
1166
1167 return (res);
1168 }
1169
1170 int
s10_facl(sysret_t * rval,int fdes,int cmd,int nentries,void * aclbufp)1171 s10_facl(sysret_t *rval, int fdes, int cmd, int nentries, void *aclbufp)
1172 {
1173 int res;
1174
1175 res = __systemcall(rval, SYS_facl + 1024, fdes, cmd, nentries, aclbufp);
1176
1177 switch (cmd) {
1178 case ACE_GETACLCNT:
1179 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT,
1180 rval->sys_rval1, NULL, fdes)) {
1181 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1182 }
1183 break;
1184 case ACE_GETACL:
1185 if (res == 0 &&
1186 has_trivial_native_acl(ACE_GETACL, 0, NULL, fdes) &&
1187 nentries >= S10_TRIVIAL_ACL_CNT) {
1188 res = make_trivial_s10_acl(NULL, fdes, aclbufp);
1189 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT;
1190 }
1191 break;
1192 }
1193
1194 return (res);
1195 }
1196
1197 #define S10_AC_PROC (0x1 << 28)
1198 #define S10_AC_TASK (0x2 << 28)
1199 #define S10_AC_FLOW (0x4 << 28)
1200 #define S10_AC_MODE(x) ((x) & 0xf0000000)
1201 #define S10_AC_OPTION(x) ((x) & 0x0fffffff)
1202
1203 /*
1204 * The mode shift, mode mask and option mask for acctctl have changed. The
1205 * mode is currently the top full byte and the option is the lower 3 full bytes.
1206 */
1207 int
s10_acctctl(sysret_t * rval,int cmd,void * buf,size_t bufsz)1208 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz)
1209 {
1210 int mode = S10_AC_MODE(cmd);
1211 int option = S10_AC_OPTION(cmd);
1212
1213 switch (mode) {
1214 case S10_AC_PROC:
1215 mode = AC_PROC;
1216 break;
1217 case S10_AC_TASK:
1218 mode = AC_TASK;
1219 break;
1220 case S10_AC_FLOW:
1221 mode = AC_FLOW;
1222 break;
1223 default:
1224 return (B_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf,
1225 bufsz));
1226 }
1227
1228 return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf,
1229 bufsz));
1230 }
1231
1232 /*
1233 * The Audit Policy parameters have changed due to:
1234 * 6466722 audituser and AUDIT_USER are defined, unused, undocumented and
1235 * should be removed.
1236 *
1237 * In S10 we had the following flag:
1238 * #define AUDIT_USER 0x0040
1239 * which doesn't exist in Solaris Next where the subsequent flags are shifted
1240 * down. For example, in S10 we had:
1241 * #define AUDIT_GROUP 0x0080
1242 * but on Solaris Next we have:
1243 * #define AUDIT_GROUP 0x0040
1244 * AUDIT_GROUP has the value AUDIT_USER had in S10 and all of the subsequent
1245 * bits are also shifted one place.
1246 *
1247 * When we're getting or setting the Audit Policy parameters we need to
1248 * shift the outgoing or incoming bits into their proper positions. Since
1249 * S10_AUDIT_USER was always unused, we always clear that bit on A_GETPOLICY.
1250 *
1251 * The command we care about, BSM_AUDITCTL, passes the most parameters (3),
1252 * so declare this function to take up to 4 args and just pass them on.
1253 * The number of parameters for s10_auditsys needs to be equal to the BSM_*
1254 * subcommand that has the most parameters, since we want to pass all
1255 * parameters through, regardless of which subcommands we interpose on.
1256 *
1257 * Note that the auditsys system call uses the SYSENT_AP macro wrapper instead
1258 * of the more common SYSENT_CI macro. This means the return value is a
1259 * SE_64RVAL so the syscall table uses RV_64RVAL.
1260 */
1261
1262 #define S10_AUDIT_HMASK 0xffffffc0
1263 #define S10_AUDIT_LMASK 0x3f
1264 #define S10_AUC_NOSPACE 0x3
1265
1266 int
s10_auditsys(sysret_t * rval,int bsmcmd,intptr_t a0,intptr_t a1,intptr_t a2)1267 s10_auditsys(sysret_t *rval, int bsmcmd, intptr_t a0, intptr_t a1, intptr_t a2)
1268 {
1269 int err;
1270 uint32_t m;
1271
1272 if (bsmcmd != BSM_AUDITCTL)
1273 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1,
1274 a2));
1275
1276 if ((int)a0 == A_GETPOLICY) {
1277 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1278 &m, a2)) != 0)
1279 return (err);
1280 m = ((m & S10_AUDIT_HMASK) << 1) | (m & S10_AUDIT_LMASK);
1281 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1282 return (EFAULT);
1283 return (0);
1284
1285 } else if ((int)a0 == A_SETPOLICY) {
1286 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1287 return (EFAULT);
1288 m = ((m >> 1) & S10_AUDIT_HMASK) | (m & S10_AUDIT_LMASK);
1289 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1290 a2));
1291 } else if ((int)a0 == A_GETCOND) {
1292 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0,
1293 &m, a2)) != 0)
1294 return (err);
1295 if (m == AUC_NOSPACE)
1296 m = S10_AUC_NOSPACE;
1297 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0)
1298 return (EFAULT);
1299 return (0);
1300 } else if ((int)a0 == A_SETCOND) {
1301 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0)
1302 return (EFAULT);
1303 if (m == S10_AUC_NOSPACE)
1304 m = AUC_NOSPACE;
1305 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m,
1306 a2));
1307 }
1308
1309 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, a2));
1310 }
1311
1312 /*
1313 * Determine whether the executable passed to SYS_exec or SYS_execve is a
1314 * native executable. The s10_npreload.so invokes the B_S10_NATIVE brand
1315 * operation which patches up the processes exec info to eliminate any trace
1316 * of the wrapper. That will make pgrep and other commands that examine
1317 * process' executable names and command-line parameters work properly.
1318 */
1319 static int
s10_exec_native(sysret_t * rval,const char * fname,const char ** argp,const char ** envp)1320 s10_exec_native(sysret_t *rval, const char *fname, const char **argp,
1321 const char **envp)
1322 {
1323 const char *filename = fname;
1324 char path[64];
1325 int err;
1326
1327 /* Get a copy of the executable we're trying to run */
1328 path[0] = '\0';
1329 (void) brand_uucopystr(filename, path, sizeof (path));
1330
1331 /* Check if we're trying to run a native binary */
1332 if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native",
1333 sizeof (path)) != 0)
1334 return (0);
1335
1336 /* Skip the first element in the argv array */
1337 argp++;
1338
1339 /*
1340 * The the path of the dynamic linker is the second parameter
1341 * of s10_native_exec().
1342 */
1343 if (brand_uucopy(argp, &filename, sizeof (char *)) != 0)
1344 return (EFAULT);
1345
1346 /* If an exec call succeeds, it never returns */
1347 err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename,
1348 argp, envp, NULL, NULL, NULL);
1349 brand_assert(err != 0);
1350 return (err);
1351 }
1352
1353 /*
1354 * Interpose on the SYS_exec syscall to detect native wrappers.
1355 */
1356 int
s10_exec(sysret_t * rval,const char * fname,const char ** argp)1357 s10_exec(sysret_t *rval, const char *fname, const char **argp)
1358 {
1359 int err;
1360
1361 if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0)
1362 return (err);
1363
1364 /* If an exec call succeeds, it never returns */
1365 err = __systemcall(rval, SYS_execve + 1024, fname, argp, NULL, 0);
1366 brand_assert(err != 0);
1367 return (err);
1368 }
1369
1370 /*
1371 * Interpose on the SYS_execve syscall to detect native wrappers.
1372 */
1373 int
s10_execve(sysret_t * rval,const char * fname,const char ** argp,const char ** envp)1374 s10_execve(sysret_t *rval, const char *fname, const char **argp,
1375 const char **envp)
1376 {
1377 int err;
1378
1379 if ((err = s10_exec_native(rval, fname, argp, envp)) != 0)
1380 return (err);
1381
1382 /* If an exec call succeeds, it never returns */
1383 err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp, 0);
1384 brand_assert(err != 0);
1385 return (err);
1386 }
1387
1388 /*
1389 * fcntl(2) added an additional argument which we need to pass as zero.
1390 */
1391 int
s10_fcntl(sysret_t * rval,int fd,int cmd,intptr_t arg)1392 s10_fcntl(sysret_t *rval, int fd, int cmd, intptr_t arg)
1393 {
1394 return (__systemcall(rval, SYS_fcntl + 1024, fd, cmd, arg, 0));
1395 }
1396
1397 /*
1398 * Interpose on the SYS_fdsync system call. The structure was chagned to use a
1399 * distinct enum rather than passing a combination of the <sys/file.h> flags
1400 * FSYNC and FDSYNC. The prior system call implementation only passed those two
1401 * flags on to the VFS operation. The system call did not check if the 'flag'
1402 * argument was zero or not; however, we know that in illumos it was always
1403 * called with either FSYNC or FDSYNC explicitly. To try and fail open in a
1404 * sense, we translate any call with no explicit level to a normal fsync(3C)
1405 * style operation.
1406 */
1407 static int
s10_fdsync(sysret_t * rval,int fd,int flag)1408 s10_fdsync(sysret_t *rval, int fd, int flag)
1409 {
1410 uint32_t level;
1411
1412 if ((flag & FSYNC) != 0) {
1413 level = FDSYNC_FILE;
1414 } else if ((flag & FDSYNC) != 0) {
1415 level = FDSYNC_DATA;
1416 } else {
1417 level = FDSYNC_FILE;
1418 }
1419
1420 return (__systemcall(rval, SYS_fdsync + 1024, fd, level));
1421 }
1422
1423 /*
1424 * S10's issetugid() syscall is now a subcode to privsys().
1425 */
1426 static int
s10_issetugid(sysret_t * rval)1427 s10_issetugid(sysret_t *rval)
1428 {
1429 return (__systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID,
1430 0, 0, 0, 0, 0));
1431 }
1432
1433 /*
1434 * S10's socket() syscall does not split type and flags
1435 */
1436 static int
s10_so_socket(sysret_t * rval,int domain,int type,int protocol,char * devpath,int version)1437 s10_so_socket(sysret_t *rval, int domain, int type, int protocol,
1438 char *devpath, int version)
1439 {
1440 if ((type & ~SOCK_TYPE_MASK) != 0) {
1441 errno = EINVAL;
1442 return (-1);
1443 }
1444 return (__systemcall(rval, SYS_so_socket + 1024, domain, type,
1445 protocol, devpath, version));
1446 }
1447
1448 /*
1449 * S10's pipe() syscall has a different calling convention
1450 */
1451 static int
s10_pipe(sysret_t * rval)1452 s10_pipe(sysret_t *rval)
1453 {
1454 int fds[2], err;
1455 if ((err = __systemcall(rval, SYS_pipe + 1024, fds, 0)) != 0)
1456 return (err);
1457
1458 rval->sys_rval1 = fds[0];
1459 rval->sys_rval2 = fds[1];
1460 return (0);
1461 }
1462
1463 /*
1464 * S10's accept() syscall takes three arguments
1465 */
1466 static int
s10_accept(sysret_t * rval,int sock,struct sockaddr * addr,uint_t * addrlen,int version)1467 s10_accept(sysret_t *rval, int sock, struct sockaddr *addr, uint_t *addrlen,
1468 int version)
1469 {
1470 return (__systemcall(rval, SYS_accept + 1024, sock, addr, addrlen,
1471 version, 0));
1472 }
1473
1474 static long
s10_uname(sysret_t * rv,uintptr_t p1)1475 s10_uname(sysret_t *rv, uintptr_t p1)
1476 {
1477 struct utsname un, *unp = (struct utsname *)p1;
1478 int rev, err;
1479
1480 if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0)
1481 return (err);
1482
1483 rev = atoi(&un.release[2]);
1484 brand_assert(rev >= 11);
1485 bzero(un.release, _SYS_NMLN);
1486 (void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN);
1487 bzero(un.version, _SYS_NMLN);
1488 (void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN);
1489
1490 /* copy out the modified uname info */
1491 return (brand_uucopy(&un, unp, sizeof (un)));
1492 }
1493
1494 int
s10_sysconfig(sysret_t * rv,int which)1495 s10_sysconfig(sysret_t *rv, int which)
1496 {
1497 long value;
1498
1499 /*
1500 * We must interpose on the sysconfig(2) requests
1501 * that deal with the realtime signal number range.
1502 * All others get passed to the native sysconfig(2).
1503 */
1504 switch (which) {
1505 case _CONFIG_RTSIG_MAX:
1506 value = S10_SIGRTMAX - S10_SIGRTMIN + 1;
1507 break;
1508 case _CONFIG_SIGRT_MIN:
1509 value = S10_SIGRTMIN;
1510 break;
1511 case _CONFIG_SIGRT_MAX:
1512 value = S10_SIGRTMAX;
1513 break;
1514 default:
1515 return (__systemcall(rv, SYS_sysconfig + 1024, which));
1516 }
1517
1518 (void) B_TRUSS_POINT_1(rv, SYS_sysconfig, 0, which);
1519 rv->sys_rval1 = value;
1520 rv->sys_rval2 = 0;
1521
1522 return (0);
1523 }
1524
1525 int
s10_sysinfo(sysret_t * rv,int command,char * buf,long count)1526 s10_sysinfo(sysret_t *rv, int command, char *buf, long count)
1527 {
1528 char *value;
1529 int len;
1530
1531 /*
1532 * We must interpose on the sysinfo(2) commands SI_RELEASE and
1533 * SI_VERSION; all others get passed to the native sysinfo(2)
1534 * command.
1535 */
1536 switch (command) {
1537 case SI_RELEASE:
1538 value = S10_UTS_RELEASE;
1539 break;
1540
1541 case SI_VERSION:
1542 value = S10_UTS_VERSION;
1543 break;
1544
1545 default:
1546 /*
1547 * The default action is to pass the command to the
1548 * native sysinfo(2) syscall.
1549 */
1550 return (__systemcall(rv, SYS_systeminfo + 1024,
1551 command, buf, count));
1552 }
1553
1554 len = strlen(value) + 1;
1555 if (count > 0) {
1556 if (brand_uucopystr(value, buf, count) != 0)
1557 return (EFAULT);
1558
1559 /*
1560 * Assure NULL termination of buf as brand_uucopystr() doesn't.
1561 */
1562 if (len > count && brand_uucopy("\0", buf + (count - 1), 1)
1563 != 0)
1564 return (EFAULT);
1565 }
1566
1567 /*
1568 * On success, sysinfo(2) returns the size of buffer required to hold
1569 * the complete value plus its terminating NULL byte.
1570 */
1571 (void) B_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count);
1572 rv->sys_rval1 = len;
1573 rv->sys_rval2 = 0;
1574 return (0);
1575 }
1576
1577 #if defined(__x86)
1578 #if defined(__amd64)
1579 /*
1580 * 64-bit x86 LWPs created by SYS_lwp_create start here if they need to set
1581 * their %fs registers to the legacy Solaris 10 selector value.
1582 *
1583 * This function does three things:
1584 *
1585 * 1. Trap to the kernel so that it can set %fs to the legacy Solaris 10
1586 * selector value.
1587 * 2. Read the LWP's true entry point (the entry point supplied by libc
1588 * when SYS_lwp_create was invoked) from %r14.
1589 * 3. Eliminate this function's stack frame and pass control to the LWP's
1590 * true entry point.
1591 *
1592 * See the comment above s10_lwp_create_correct_fs() (see below) for the reason
1593 * why this function exists.
1594 */
1595 /*ARGSUSED*/
1596 static void
s10_lwp_create_entry_point(void * ulwp_structp)1597 s10_lwp_create_entry_point(void *ulwp_structp)
1598 {
1599 sysret_t rval;
1600
1601 /*
1602 * The new LWP's %fs register is initially zero, but libc won't
1603 * function correctly when %fs is zero. Change the LWP's %fs register
1604 * via SYS_brand.
1605 */
1606 (void) __systemcall(&rval, SYS_brand + 1024, B_S10_FSREGCORRECTION);
1607
1608 /*
1609 * Jump to the true entry point, which is stored in %r14.
1610 * Remove our stack frame before jumping so that
1611 * s10_lwp_create_entry_point() won't be seen in stack traces.
1612 *
1613 * NOTE: s10_lwp_create_entry_point() pushes %r12 onto its stack frame
1614 * so that it can use it as a temporary register. We don't restore %r12
1615 * in this assembly block because we don't care about its value (and
1616 * neither does _lwp_start()). Besides, the System V ABI AMD64
1617 * Actirecture Processor Supplement doesn't specify that %r12 should
1618 * have a special value when LWPs start, so we can ignore its value when
1619 * we jump to the true entry point. Furthermore, %r12 is a callee-saved
1620 * register, so the true entry point should push %r12 onto its stack
1621 * before using the register. We ignore %r14 after we read it for
1622 * similar reasons.
1623 *
1624 * NOTE: The compiler will generate a function epilogue for this
1625 * function despite the fact that the LWP will never execute it.
1626 * We could hand-code this entire function in assembly to eliminate
1627 * the epilogue, but the epilogue is only three or four instructions,
1628 * so we wouldn't save much space. Besides, why would we want
1629 * to create yet another ugly, hard-to-maintain assembly function when
1630 * we could write most of it in C?
1631 */
1632 __asm__ __volatile__(
1633 "movq %0, %%rdi\n\t" /* pass ulwp_structp as arg1 */
1634 "movq %%rbp, %%rsp\n\t" /* eliminate the stack frame */
1635 "popq %%rbp\n\t"
1636 "jmp *%%r14\n\t" /* jump to the true entry point */
1637 : : "r" (ulwp_structp));
1638 /*NOTREACHED*/
1639 }
1640
1641 /*
1642 * The S10 libc expects that %fs will be nonzero for new 64-bit x86 LWPs but the
 * Nevada kernel clears %fs for such LWPs. Unfortunately, new LWPs do not issue
1644 * SYS_lwp_private (see s10_lwp_private() below) after they are created, so
1645 * we must ensure that new LWPs invoke a brand operation that sets %fs to a
1646 * nonzero value immediately after their creation.
1647 *
1648 * The easiest way to do this is to make new LWPs start at a special function,
1649 * s10_lwp_create_entry_point() (see its definition above), that invokes the
1650 * brand operation that corrects %fs. We'll store the entry points of new LWPs
1651 * in their %r14 registers so that s10_lwp_create_entry_point() can find and
1652 * call them after invoking the special brand operation. %r14 is a callee-saved
1653 * register; therefore, any functions invoked by s10_lwp_create_entry_point()
1654 * and all functions dealing with signals (e.g., sigacthandler()) will preserve
1655 * %r14 for s10_lwp_create_entry_point().
1656 *
1657 * The Nevada kernel can safely work with nonzero %fs values because the kernel
1658 * configures per-thread %fs segment descriptors so that the legacy %fs selector
1659 * value will still work. See the comment in lwp_load() regarding %fs and
1660 * %fsbase in 64-bit x86 processes.
1661 *
1662 * This emulation exists thanks to CRs 6467491 and 6501650.
1663 */
1664 static int
s10_lwp_create_correct_fs(sysret_t * rval,ucontext_t * ucp,int flags,id_t * new_lwp)1665 s10_lwp_create_correct_fs(sysret_t *rval, ucontext_t *ucp, int flags,
1666 id_t *new_lwp)
1667 {
1668 ucontext_t s10_uc;
1669
1670 /*
1671 * Copy the supplied ucontext_t structure to the local stack
1672 * frame and store the new LWP's entry point (the value of %rip
1673 * stored in the ucontext_t) in the new LWP's %r14 register.
1674 * Then make s10_lwp_create_entry_point() the new LWP's entry
1675 * point.
1676 */
1677 if (brand_uucopy(ucp, &s10_uc, sizeof (s10_uc)) != 0)
1678 return (EFAULT);
1679
1680 s10_uc.uc_mcontext.gregs[REG_R14] = s10_uc.uc_mcontext.gregs[REG_RIP];
1681 s10_uc.uc_mcontext.gregs[REG_RIP] = (greg_t)s10_lwp_create_entry_point;
1682
1683 /* fix up the signal mask */
1684 if (s10_uc.uc_flags & UC_SIGMASK)
1685 (void) s10sigset_to_native(&s10_uc.uc_sigmask,
1686 &s10_uc.uc_sigmask);
1687
1688 /*
1689 * Issue SYS_lwp_create to create the new LWP. We pass the
1690 * modified ucontext_t to make sure that the new LWP starts at
1691 * s10_lwp_create_entry_point().
1692 */
1693 return (__systemcall(rval, SYS_lwp_create + 1024, &s10_uc,
1694 flags, new_lwp));
1695 }
1696 #endif /* __amd64 */
1697
1698 /*
1699 * SYS_lwp_private is issued by libc_init() to set %fsbase in 64-bit x86
1700 * processes. The Nevada kernel sets %fs to zero but the S10 libc expects
1701 * %fs to be nonzero. We'll pass the issued system call to the kernel untouched
1702 * and invoke a brand operation to set %fs to the legacy S10 selector value.
1703 *
1704 * This emulation exists thanks to CRs 6467491 and 6501650.
1705 */
1706 static int
s10_lwp_private(sysret_t * rval,int cmd,int which,uintptr_t base)1707 s10_lwp_private(sysret_t *rval, int cmd, int which, uintptr_t base)
1708 {
1709 #if defined(__amd64)
1710 int err;
1711
1712 /*
1713 * The current LWP's %fs register should be zero. Determine whether the
1714 * Solaris 10 libc with which we're working functions correctly when %fs
1715 * is zero by calling thr_main() after issuing the SYS_lwp_private
1716 * syscall. If thr_main() barfs (returns -1), then change the LWP's %fs
1717 * register via SYS_brand and patch brand_sysent_table so that issuing
1718 * SYS_lwp_create executes s10_lwp_create_correct_fs() rather than the
1719 * default s10_lwp_create(). s10_lwp_create_correct_fs() will
1720 * guarantee that new LWPs will have correct %fs values.
1721 */
1722 if ((err = __systemcall(rval, SYS_lwp_private + 1024, cmd, which,
1723 base)) != 0)
1724 return (err);
1725 if (thr_main() == -1) {
1726 /*
1727 * SYS_lwp_private is only issued by libc_init(), which is
1728 * executed when libc is first loaded by ld.so.1. Thus we
1729 * are guaranteed to be single-threaded at this point. Even
1730 * if we were multithreaded at this point, writing a 64-bit
1731 * value to the st_callc field of a brand_sysent_table
1732 * entry is guaranteed to be atomic on 64-bit x86 chips
1733 * as long as the field is not split across cache lines
1734 * (It shouldn't be.). See chapter 8, section 1.1 of
1735 * "The Intel 64 and IA32 Architectures Software Developer's
1736 * Manual," Volume 3A for more details.
1737 */
1738 brand_sysent_table[SYS_lwp_create].st_callc =
1739 (sysent_cb_t)(uintptr_t)s10_lwp_create_correct_fs;
1740 return (__systemcall(rval, SYS_brand + 1024,
1741 B_S10_FSREGCORRECTION));
1742 }
1743 return (0);
1744 #else /* !__amd64 */
1745 return (__systemcall(rval, SYS_lwp_private + 1024, cmd, which, base));
1746 #endif /* !__amd64 */
1747 }
1748 #endif /* __x86 */
1749
1750 /*
 * The OpenSolaris versions of lwp_mutex_timedlock() and lwp_mutex_trylock()
1752 * add an extra argument to the interfaces, a uintptr_t value for the mutex's
1753 * mutex_owner field. The Solaris 10 libc assigns the mutex_owner field at
1754 * user-level, so we just make the extra argument be zero in both syscalls.
1755 */
1756
1757 static int
s10_lwp_mutex_timedlock(sysret_t * rval,lwp_mutex_t * lp,timespec_t * tsp)1758 s10_lwp_mutex_timedlock(sysret_t *rval, lwp_mutex_t *lp, timespec_t *tsp)
1759 {
1760 return (__systemcall(rval, SYS_lwp_mutex_timedlock + 1024, lp, tsp, 0));
1761 }
1762
1763 static int
s10_lwp_mutex_trylock(sysret_t * rval,lwp_mutex_t * lp)1764 s10_lwp_mutex_trylock(sysret_t *rval, lwp_mutex_t *lp)
1765 {
1766 return (__systemcall(rval, SYS_lwp_mutex_trylock + 1024, lp, 0));
1767 }
1768
1769 /*
1770 * If the emul_global_zone flag is set then emulate some aspects of the
1771 * zone system call. In particular, emulate the global zone ID on the
1772 * ZONE_LOOKUP subcommand and emulate some of the global zone attributes
1773 * on the ZONE_GETATTR subcommand. If the flag is not set or we're performing
1774 * some other operation, simply pass the calls through.
1775 */
1776 int
s10_zone(sysret_t * rval,int cmd,void * arg1,void * arg2,void * arg3,void * arg4)1777 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3,
1778 void *arg4)
1779 {
1780 char *aval;
1781 int len;
1782 zoneid_t zid;
1783 int attr;
1784 char *buf;
1785 size_t bufsize;
1786
1787 /*
1788 * We only emulate the zone syscall for a subset of specific commands,
1789 * otherwise we just pass the call through.
1790 */
1791 if (!emul_global_zone)
1792 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2,
1793 arg3, arg4));
1794
1795 switch (cmd) {
1796 case ZONE_LOOKUP:
1797 (void) B_TRUSS_POINT_1(rval, SYS_zone, 0, cmd);
1798 rval->sys_rval1 = GLOBAL_ZONEID;
1799 rval->sys_rval2 = 0;
1800 return (0);
1801
1802 case ZONE_GETATTR:
1803 zid = (zoneid_t)(uintptr_t)arg1;
1804 attr = (int)(uintptr_t)arg2;
1805 buf = (char *)arg3;
1806 bufsize = (size_t)arg4;
1807
1808 /*
1809 * If the request is for the global zone then we're emulating
1810 * that, otherwise pass this thru.
1811 */
1812 if (zid != GLOBAL_ZONEID)
1813 goto passthru;
1814
1815 switch (attr) {
1816 case ZONE_ATTR_NAME:
1817 aval = GLOBAL_ZONENAME;
1818 break;
1819
1820 case ZONE_ATTR_BRAND:
1821 aval = NATIVE_BRAND_NAME;
1822 break;
1823 default:
1824 /*
1825 * We only emulate a subset of the attrs, use the
1826 * real zone id to pass thru the rest.
1827 */
1828 arg1 = (void *)(uintptr_t)zoneid;
1829 goto passthru;
1830 }
1831
1832 (void) B_TRUSS_POINT_5(rval, SYS_zone, 0, cmd, zid, attr,
1833 buf, bufsize);
1834
1835 len = strlen(aval) + 1;
1836 if (len > bufsize)
1837 return (ENAMETOOLONG);
1838
1839 if (buf != NULL) {
1840 if (len == 1) {
1841 if (brand_uucopy("\0", buf, 1) != 0)
1842 return (EFAULT);
1843 } else {
1844 if (brand_uucopystr(aval, buf, len) != 0)
1845 return (EFAULT);
1846
1847 /*
1848 * Assure NULL termination of "buf" as
1849 * brand_uucopystr() does NOT.
1850 */
1851 if (brand_uucopy("\0", buf + (len - 1), 1) != 0)
1852 return (EFAULT);
1853 }
1854 }
1855
1856 rval->sys_rval1 = len;
1857 rval->sys_rval2 = 0;
1858 return (0);
1859
1860 default:
1861 break;
1862 }
1863
1864 passthru:
1865 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3,
1866 arg4));
1867 }
1868
1869 /*ARGSUSED*/
1870 int
brand_init(int argc,char * argv[],char * envp[])1871 brand_init(int argc, char *argv[], char *envp[])
1872 {
1873 sysret_t rval;
1874 ulong_t ldentry;
1875 int err;
1876 char *bname;
1877
1878 brand_pre_init();
1879
1880 /*
1881 * Cache the pid of the zone's init process and determine if
1882 * we're init(8) for the zone. Remember: we might be init
1883 * now, but as soon as we fork(2) we won't be.
1884 */
1885 (void) get_initpid_info();
1886
1887 /* get the current zoneid */
1888 err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL);
1889 brand_assert(err == 0);
1890 zoneid = (zoneid_t)rval.sys_rval1;
1891
1892 /* Get the zone's emulation bitmap. */
1893 if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid,
1894 S10_EMUL_BITMAP, emul_bitmap, sizeof (emul_bitmap))) != 0) {
1895 brand_abort(err, "The zone's patch level is unsupported");
1896 /*NOTREACHED*/
1897 }
1898
1899 bname = basename(argv[0]);
1900
1901 /*
1902 * In general we want the S10 commands that are zone-aware to continue
1903 * to behave as they normally do within a zone. Since these commands
1904 * are zone-aware, they should continue to "do the right thing".
1905 * However, some zone-aware commands aren't going to work the way
1906 * we expect them to inside the branded zone. In particular, the pkg
1907 * and patch commands will not properly manage all pkgs/patches
1908 * unless the commands think they are running in the global zone. For
1909 * these commands we want to emulate the global zone.
1910 *
1911 * We don't do any emulation for pkgcond since it is typically used
1912 * in pkg/patch postinstall scripts and we want those scripts to do
1913 * the right thing inside a zone.
1914 *
1915 * One issue is the handling of hollow pkgs. Since the pkgs are
1916 * hollow, they won't use pkgcond in their postinstall scripts. These
1917 * pkgs typically are installing drivers so we handle that by
1918 * replacing add_drv and rem_drv in the s10_boot script.
1919 */
1920 if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 ||
1921 strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0)
1922 emul_global_zone = B_TRUE;
1923
1924 ldentry = brand_post_init(S10_VERSION, argc, argv, envp);
1925
1926 brand_runexe(argv, ldentry);
1927 /*NOTREACHED*/
1928 brand_abort(0, "brand_runexe() returned");
1929 return (-1);
1930 }
1931
/*
 * System call emulation table, indexed by system call number.  The trailing
 * comment on each entry gives the index of that slot.
 *
 * This table must have at least NSYSCALL entries in it.
 *
 * The second parameter of each entry in the brand_sysent_table
 * contains the number of parameters and flags that describe the
 * syscall return value encoding. See the block comments at the
 * top of this file for more information about the syscall return
 * value flags and when they should be used.
 *
 * NOTE(review): NOSYS slots presumably mark system calls for which this
 * library provides no emulation routine; confirm against the definition
 * of the NOSYS macro.
 *
 * A number of slots depend on the build target (see the #if sections
 * below), so a given index always holds exactly one entry regardless of
 * which branch is compiled.
 *
 * The table is not const: code elsewhere in this file overwrites the
 * st_callc member of the SYS_lwp_create entry at run time.
 */
brand_sysent_table_t brand_sysent_table[] = {
	/* Slot 0 is emulated only on SPARC builds without __sparcv9. */
#if defined(__sparc) && !defined(__sparcv9)
	EMULATE(brand_indir, 9 | RV_64RVAL),	/* 0 */
#else
	NOSYS,					/* 0 */
#endif
	NOSYS,					/* 1 */
	EMULATE(s10_forkall, 0 | RV_32RVAL2),	/* 2 */
	NOSYS,					/* 3 */
	NOSYS,					/* 4 */
	EMULATE(s10_open, 3 | RV_DEFAULT),	/* 5 */
	NOSYS,					/* 6 */
	EMULATE(s10_wait, 0 | RV_32RVAL2),	/* 7 */
	EMULATE(s10_creat, 2 | RV_DEFAULT),	/* 8 */
	EMULATE(s10_link, 2 | RV_DEFAULT),	/* 9 */
	EMULATE(s10_unlink, 1 | RV_DEFAULT),	/* 10 */
	EMULATE(s10_exec, 2 | RV_DEFAULT),	/* 11 */
	NOSYS,					/* 12 */
	NOSYS,					/* 13 */
	EMULATE(s10_mknod, 3 | RV_DEFAULT),	/* 14 */
	EMULATE(s10_chmod, 2 | RV_DEFAULT),	/* 15 */
	EMULATE(s10_chown, 3 | RV_DEFAULT),	/* 16 */
	NOSYS,					/* 17 */
	EMULATE(s10_stat, 2 | RV_DEFAULT),	/* 18 */
	NOSYS,					/* 19 */
	NOSYS,					/* 20 */
	NOSYS,					/* 21 */
	EMULATE(s10_umount, 1 | RV_DEFAULT),	/* 22 */
	NOSYS,					/* 23 */
	NOSYS,					/* 24 */
	NOSYS,					/* 25 */
	NOSYS,					/* 26 */
	NOSYS,					/* 27 */
	EMULATE(s10_fstat, 2 | RV_DEFAULT),	/* 28 */
	NOSYS,					/* 29 */
	EMULATE(s10_utime, 2 | RV_DEFAULT),	/* 30 */
	NOSYS,					/* 31 */
	NOSYS,					/* 32 */
	EMULATE(s10_access, 2 | RV_DEFAULT),	/* 33 */
	NOSYS,					/* 34 */
	NOSYS,					/* 35 */
	NOSYS,					/* 36 */
	EMULATE(s10_kill, 2 | RV_DEFAULT),	/* 37 */
	NOSYS,					/* 38 */
	NOSYS,					/* 39 */
	NOSYS,					/* 40 */
	EMULATE(s10_dup, 1 | RV_DEFAULT),	/* 41 */
	EMULATE(s10_pipe, 0 | RV_32RVAL2),	/* 42 */
	NOSYS,					/* 43 */
	NOSYS,					/* 44 */
	NOSYS,					/* 45 */
	NOSYS,					/* 46 */
	NOSYS,					/* 47 */
	NOSYS,					/* 48 */
	NOSYS,					/* 49 */
	NOSYS,					/* 50 */
	NOSYS,					/* 51 */
	NOSYS,					/* 52 */
	NOSYS,					/* 53 */
	EMULATE(s10_ioctl, 3 | RV_DEFAULT),	/* 54 */
	NOSYS,					/* 55 */
	NOSYS,					/* 56 */
	NOSYS,					/* 57 */
	EMULATE(s10_fdsync, 2 | RV_DEFAULT),	/* 58 */
	EMULATE(s10_execve, 3 | RV_DEFAULT),	/* 59 */
	NOSYS,					/* 60 */
	NOSYS,					/* 61 */
	EMULATE(s10_fcntl, 3 | RV_DEFAULT),	/* 62 */
	NOSYS,					/* 63 */
	NOSYS,					/* 64 */
	NOSYS,					/* 65 */
	NOSYS,					/* 66 */
	NOSYS,					/* 67 */
	NOSYS,					/* 68 */
	NOSYS,					/* 69 */
	NOSYS,					/* 70 */
	EMULATE(s10_acctctl, 3 | RV_DEFAULT),	/* 71 */
	NOSYS,					/* 72 */
	NOSYS,					/* 73 */
	NOSYS,					/* 74 */
	EMULATE(s10_issetugid, 0 | RV_DEFAULT),	/* 75 */
	EMULATE(s10_fsat, 6 | RV_DEFAULT),	/* 76 */
	NOSYS,					/* 77 */
	NOSYS,					/* 78 */
	EMULATE(s10_rmdir, 1 | RV_DEFAULT),	/* 79 */
	EMULATE(s10_mkdir, 2 | RV_DEFAULT),	/* 80 */
	EMULATE(s10_getdents, 3 | RV_DEFAULT),	/* 81 */
	NOSYS,					/* 82 */
	NOSYS,					/* 83 */
	NOSYS,					/* 84 */
	NOSYS,					/* 85 */
	NOSYS,					/* 86 */
	EMULATE(s10_poll, 3 | RV_DEFAULT),	/* 87 */
	EMULATE(s10_lstat, 2 | RV_DEFAULT),	/* 88 */
	EMULATE(s10_symlink, 2 | RV_DEFAULT),	/* 89 */
	EMULATE(s10_readlink, 3 | RV_DEFAULT),	/* 90 */
	NOSYS,					/* 91 */
	NOSYS,					/* 92 */
	EMULATE(s10_fchmod, 2 | RV_DEFAULT),	/* 93 */
	EMULATE(s10_fchown, 3 | RV_DEFAULT),	/* 94 */
	EMULATE(s10_sigprocmask, 3 | RV_DEFAULT),	/* 95 */
	EMULATE(s10_sigsuspend, 1 | RV_DEFAULT),	/* 96 */
	NOSYS,					/* 97 */
	EMULATE(s10_sigaction, 3 | RV_DEFAULT),	/* 98 */
	EMULATE(s10_sigpending, 2 | RV_DEFAULT),	/* 99 */
	NOSYS,					/* 100 */
	NOSYS,					/* 101 */
	NOSYS,					/* 102 */
	NOSYS,					/* 103 */
	NOSYS,					/* 104 */
	NOSYS,					/* 105 */
	NOSYS,					/* 106 */
	EMULATE(s10_waitid, 4 | RV_DEFAULT),	/* 107 */
	EMULATE(s10_sigsendsys, 2 | RV_DEFAULT),	/* 108 */
	NOSYS,					/* 109 */
	NOSYS,					/* 110 */
	NOSYS,					/* 111 */
	NOSYS,					/* 112 */
	NOSYS,					/* 113 */
	NOSYS,					/* 114 */
	NOSYS,					/* 115 */
	NOSYS,					/* 116 */
	NOSYS,					/* 117 */
	NOSYS,					/* 118 */
	NOSYS,					/* 119 */
	NOSYS,					/* 120 */
	NOSYS,					/* 121 */
	NOSYS,					/* 122 */
	/* Slots 123-126 are emulated only when __x86 is defined. */
#if defined(__x86)
	EMULATE(s10_xstat, 3 | RV_DEFAULT),	/* 123 */
	EMULATE(s10_lxstat, 3 | RV_DEFAULT),	/* 124 */
	EMULATE(s10_fxstat, 3 | RV_DEFAULT),	/* 125 */
	EMULATE(s10_xmknod, 4 | RV_DEFAULT),	/* 126 */
#else
	NOSYS,					/* 123 */
	NOSYS,					/* 124 */
	NOSYS,					/* 125 */
	NOSYS,					/* 126 */
#endif
	NOSYS,					/* 127 */
	NOSYS,					/* 128 */
	NOSYS,					/* 129 */
	EMULATE(s10_lchown, 3 | RV_DEFAULT),	/* 130 */
	NOSYS,					/* 131 */
	NOSYS,					/* 132 */
	NOSYS,					/* 133 */
	EMULATE(s10_rename, 2 | RV_DEFAULT),	/* 134 */
	EMULATE(s10_uname, 1 | RV_DEFAULT),	/* 135 */
	NOSYS,					/* 136 */
	EMULATE(s10_sysconfig, 1 | RV_DEFAULT),	/* 137 */
	NOSYS,					/* 138 */
	EMULATE(s10_sysinfo, 3 | RV_DEFAULT),	/* 139 */
	NOSYS,					/* 140 */
	NOSYS,					/* 141 */
	NOSYS,					/* 142 */
	EMULATE(s10_fork1, 0 | RV_32RVAL2),	/* 143 */
	EMULATE(s10_sigtimedwait, 3 | RV_DEFAULT),	/* 144 */
	NOSYS,					/* 145 */
	NOSYS,					/* 146 */
	EMULATE(s10_lwp_sema_wait, 1 | RV_DEFAULT),	/* 147 */
	NOSYS,					/* 148 */
	NOSYS,					/* 149 */
	NOSYS,					/* 150 */
	NOSYS,					/* 151 */
	NOSYS,					/* 152 */
	NOSYS,					/* 153 */
	EMULATE(s10_utimes, 2 | RV_DEFAULT),	/* 154 */
	NOSYS,					/* 155 */
	NOSYS,					/* 156 */
	NOSYS,					/* 157 */
	NOSYS,					/* 158 */
	EMULATE(s10_lwp_create, 3 | RV_DEFAULT),	/* 159 */
	NOSYS,					/* 160 */
	NOSYS,					/* 161 */
	NOSYS,					/* 162 */
	EMULATE(s10_lwp_kill, 2 | RV_DEFAULT),	/* 163 */
	NOSYS,					/* 164 */
	EMULATE(s10_lwp_sigmask, 3 | RV_32RVAL2),	/* 165 */
	/* Slot 166 is emulated only when __x86 is defined. */
#if defined(__x86)
	EMULATE(s10_lwp_private, 3 | RV_DEFAULT),	/* 166 */
#else
	NOSYS,					/* 166 */
#endif
	NOSYS,					/* 167 */
	NOSYS,					/* 168 */
	EMULATE(s10_lwp_mutex_lock, 1 | RV_DEFAULT),	/* 169 */
	NOSYS,					/* 170 */
	NOSYS,					/* 171 */
	NOSYS,					/* 172 */
	NOSYS,					/* 173 */
	EMULATE(s10_pwrite, 4 | RV_DEFAULT),	/* 174 */
	NOSYS,					/* 175 */
	NOSYS,					/* 176 */
	NOSYS,					/* 177 */
	NOSYS,					/* 178 */
	NOSYS,					/* 179 */
	NOSYS,					/* 180 */
	NOSYS,					/* 181 */
	NOSYS,					/* 182 */
	NOSYS,					/* 183 */
	NOSYS,					/* 184 */
	EMULATE(s10_acl, 4 | RV_DEFAULT),	/* 185 */
	EMULATE(s10_auditsys, 4 | RV_64RVAL),	/* 186 */
	NOSYS,					/* 187 */
	NOSYS,					/* 188 */
	NOSYS,					/* 189 */
	EMULATE(s10_sigqueue, 4 | RV_DEFAULT),	/* 190 */
	NOSYS,					/* 191 */
	NOSYS,					/* 192 */
	NOSYS,					/* 193 */
	NOSYS,					/* 194 */
	NOSYS,					/* 195 */
	NOSYS,					/* 196 */
	NOSYS,					/* 197 */
	NOSYS,					/* 198 */
	NOSYS,					/* 199 */
	EMULATE(s10_facl, 4 | RV_DEFAULT),	/* 200 */
	NOSYS,					/* 201 */
	NOSYS,					/* 202 */
	NOSYS,					/* 203 */
	NOSYS,					/* 204 */
	EMULATE(s10_signotify, 3 | RV_DEFAULT),	/* 205 */
	NOSYS,					/* 206 */
	NOSYS,					/* 207 */
	NOSYS,					/* 208 */
	NOSYS,					/* 209 */
	EMULATE(s10_lwp_mutex_timedlock, 2 | RV_DEFAULT),	/* 210 */
	NOSYS,					/* 211 */
	NOSYS,					/* 212 */
	/* Slot 213 is emulated only when _LP64 is not defined. */
#if defined(_LP64)
	NOSYS,					/* 213 */
#else
	EMULATE(s10_getdents64, 3 | RV_DEFAULT),	/* 213 */
#endif
	NOSYS,					/* 214 */
	/* Slots 215-217 are emulated only when _LP64 is not defined. */
#if defined(_LP64)
	NOSYS,					/* 215 */
	NOSYS,					/* 216 */
	NOSYS,					/* 217 */
#else
	EMULATE(s10_stat64, 2 | RV_DEFAULT),	/* 215 */
	EMULATE(s10_lstat64, 2 | RV_DEFAULT),	/* 216 */
	EMULATE(s10_fstat64, 2 | RV_DEFAULT),	/* 217 */
#endif
	NOSYS,					/* 218 */
	NOSYS,					/* 219 */
	NOSYS,					/* 220 */
	NOSYS,					/* 221 */
	NOSYS,					/* 222 */
	/* Slots 223-225 are emulated only when _LP64 is not defined. */
#if defined(_LP64)
	NOSYS,					/* 223 */
	NOSYS,					/* 224 */
	NOSYS,					/* 225 */
#else
	EMULATE(s10_pwrite64, 5 | RV_DEFAULT),	/* 223 */
	EMULATE(s10_creat64, 2 | RV_DEFAULT),	/* 224 */
	EMULATE(s10_open64, 3 | RV_DEFAULT),	/* 225 */
#endif
	NOSYS,					/* 226 */
	EMULATE(s10_zone, 5 | RV_DEFAULT),	/* 227 */
	NOSYS,					/* 228 */
	NOSYS,					/* 229 */
	EMULATE(s10_so_socket, 5 | RV_DEFAULT),	/* 230 */
	NOSYS,					/* 231 */
	NOSYS,					/* 232 */
	NOSYS,					/* 233 */
	EMULATE(s10_accept, 4 | RV_DEFAULT),	/* 234 */
	NOSYS,					/* 235 */
	NOSYS,					/* 236 */
	NOSYS,					/* 237 */
	NOSYS,					/* 238 */
	NOSYS,					/* 239 */
	NOSYS,					/* 240 */
	NOSYS,					/* 241 */
	NOSYS,					/* 242 */
	NOSYS,					/* 243 */
	NOSYS,					/* 244 */
	NOSYS,					/* 245 */
	NOSYS,					/* 246 */
	NOSYS,					/* 247 */
	NOSYS,					/* 248 */
	NOSYS,					/* 249 */
	NOSYS,					/* 250 */
	EMULATE(s10_lwp_mutex_trylock, 1 | RV_DEFAULT),	/* 251 */
	NOSYS,					/* 252 */
	NOSYS,					/* 253 */
	NOSYS,					/* 254 */
	NOSYS					/* 255 */
};
2230