1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2016 Toomas Soome <tsoome@me.com> 24 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. 25 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 26 */ 27 28 #include <errno.h> 29 #include <fcntl.h> 30 #include <dirent.h> 31 #include <stddef.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <strings.h> 35 #include <unistd.h> 36 #include <thread.h> 37 #include <sys/auxv.h> 38 #include <sys/brand.h> 39 #include <sys/inttypes.h> 40 #include <sys/lwp.h> 41 #include <sys/syscall.h> 42 #include <sys/systm.h> 43 #include <sys/utsname.h> 44 #include <sys/sysconfig.h> 45 #include <sys/systeminfo.h> 46 #include <sys/zone.h> 47 #include <sys/stat.h> 48 #include <sys/mntent.h> 49 #include <sys/ctfs.h> 50 #include <sys/priv.h> 51 #include <sys/acctctl.h> 52 #include <libgen.h> 53 #include <bsm/audit.h> 54 #include <sys/crypto/ioctl.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/zfs_ioctl.h> 57 #include <sys/ucontext.h> 58 #include <sys/mntio.h> 59 #include <sys/mnttab.h> 60 #include <sys/attr.h> 61 #include <sys/lofi.h> 62 #include <atomic.h> 63 #include <sys/acl.h> 64 #include <sys/socket.h> 65 66 #include <s10_brand.h> 67 #include <brand_misc.h> 68 #include <s10_misc.h> 69 #include <s10_signal.h> 70 71 /* 72 * See usr/src/lib/brand/shared/brand/common/brand_util.c for general 73 * emulation notes. 74 */ 75 76 static zoneid_t zoneid; 77 static boolean_t emul_global_zone = B_FALSE; 78 static s10_emul_bitmap_t emul_bitmap; 79 pid_t zone_init_pid; 80 81 /* 82 * S10_FEATURE_IS_PRESENT is a macro that helps facilitate conditional 83 * emulation. For each constant N defined in the s10_emulated_features 84 * enumeration in usr/src/uts/common/brand/solaris10/s10_brand.h, 85 * S10_FEATURE_IS_PRESENT(N) is true iff the feature/backport represented by N 86 * is present in the Solaris 10 image hosted within the zone. In other words, 87 * S10_FEATURE_IS_PRESENT(N) is true iff the file /usr/lib/brand/solaris10/M, 88 * where M is the enum value of N, was present in the zone when the zone booted. 89 * 90 * 91 * *** Sample Usage 92 * 93 * Suppose that you need to backport a fix to Solaris 10 and there is 94 * emulation in place for the fix. Suppose further that the emulation won't be 95 * needed if the fix is backported (i.e., if the fix is present in the hosted 96 * Solaris 10 environment, then the brand won't need the emulation). Then if 97 * you add a constant named "S10_FEATURE_X" to the end of the 98 * s10_emulated_features enumeration that represents the backported fix and 99 * S10_FEATURE_X evaluates to four, then you should create a file named 100 * /usr/lib/brand/solaris10/4 as part of your backport. Additionally, you 101 * should retain the aforementioned emulation but modify it so that it's 102 * performed only when S10_FEATURE_IS_PRESENT(S10_FEATURE_X) is false. Thus the 103 * emulation function should look something like the following: 104 * 105 * static int 106 * my_emul_function(sysret_t *rv, ...) 107 * { 108 * if (S10_FEATURE_IS_PRESENT(S10_FEATURE_X)) { 109 * // Don't emulate 110 * return (__systemcall(rv, ...)); 111 * } else { 112 * // Emulate whatever needs to be emulated when the 113 * // backport isn't present in the Solaris 10 image. 114 * } 115 * } 116 */ 117 #define S10_FEATURE_IS_PRESENT(s10_emulated_features_constant) \ 118 ((emul_bitmap[(s10_emulated_features_constant) >> 3] & \ 119 (1 << ((s10_emulated_features_constant) & 0x7))) != 0) 120 121 brand_sysent_table_t brand_sysent_table[]; 122 123 #define S10_UTS_RELEASE "5.10" 124 #define S10_UTS_VERSION "Generic_Virtual" 125 126 /* 127 * If the ioctl fd's major doesn't match "major", then pass through the 128 * ioctl, since it is not the expected device. major should be a 129 * pointer to a static dev_t initialized to -1, and devname should be 130 * the path of the device. 131 * 132 * Returns 1 if the ioctl was handled (in which case *err contains the 133 * error code), or 0 if it still needs handling. 134 */ 135 static int 136 passthru_otherdev_ioctl(dev_t *majordev, const char *devname, int *err, 137 sysret_t *rval, int fdes, int cmd, intptr_t arg) 138 { 139 struct stat sbuf; 140 141 if (*majordev == (dev_t)-1) { 142 if ((*err = __systemcall(rval, SYS_fstatat + 1024, 143 AT_FDCWD, devname, &sbuf, 0) != 0) != 0) 144 goto doioctl; 145 146 *majordev = major(sbuf.st_rdev); 147 } 148 149 if ((*err = __systemcall(rval, SYS_fstatat + 1024, fdes, 150 NULL, &sbuf, 0)) != 0) 151 goto doioctl; 152 153 if (major(sbuf.st_rdev) == *majordev) 154 return (0); 155 156 doioctl: 157 *err = (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 158 return (1); 159 } 160 161 /* 162 * Figures out the PID of init for the zone. Also returns a boolean 163 * indicating whether this process currently has that pid: if so, 164 * then at this moment, we are init. 165 */ 166 static boolean_t 167 get_initpid_info(void) 168 { 169 pid_t pid; 170 sysret_t rval; 171 int err; 172 173 /* 174 * Determine the current process PID and the PID of the zone's init. 175 * We use care not to call getpid() here, because we're not supposed 176 * to call getpid() until after the program is fully linked-- the 177 * first call to getpid() is a signal from the linker to debuggers 178 * that linking has been completed. 179 */ 180 if ((err = __systemcall(&rval, SYS_brand, 181 B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) { 182 brand_abort(err, "Failed to get init's pid"); 183 } 184 185 /* 186 * Note that we need to be cautious with the pid we get back-- 187 * it should not be stashed and used in place of getpid(), since 188 * we might fork(2). So we keep zone_init_pid and toss the pid 189 * we otherwise got. 190 */ 191 if (pid == zone_init_pid) 192 return (B_TRUE); 193 194 return (B_FALSE); 195 } 196 197 /* Free the thread-local storage provided by mntfs_get_mntentbuf(). */ 198 static void 199 mntfs_free_mntentbuf(void *arg) 200 { 201 struct mntentbuf *embufp = arg; 202 203 if (embufp == NULL) 204 return; 205 if (embufp->mbuf_emp) 206 free(embufp->mbuf_emp); 207 if (embufp->mbuf_buf) 208 free(embufp->mbuf_buf); 209 bzero(embufp, sizeof (struct mntentbuf)); 210 free(embufp); 211 } 212 213 /* Provide the thread-local storage required by mntfs_ioctl(). */ 214 static struct mntentbuf * 215 mntfs_get_mntentbuf(size_t size) 216 { 217 static mutex_t keylock; 218 static thread_key_t key; 219 static int once_per_keyname = 0; 220 void *tsd = NULL; 221 struct mntentbuf *embufp; 222 223 /* Create the key. */ 224 if (!once_per_keyname) { 225 (void) mutex_lock(&keylock); 226 if (!once_per_keyname) { 227 if (thr_keycreate(&key, mntfs_free_mntentbuf)) { 228 (void) mutex_unlock(&keylock); 229 return (NULL); 230 } else { 231 once_per_keyname++; 232 } 233 } 234 (void) mutex_unlock(&keylock); 235 } 236 237 /* 238 * The thread-specific datum for this key is the address of a struct 239 * mntentbuf. If this is the first time here then we allocate the struct 240 * and its contents, and associate its address with the thread; if there 241 * are any problems then we abort. 242 */ 243 if (thr_getspecific(key, &tsd)) 244 return (NULL); 245 if (tsd == NULL) { 246 if (!(embufp = calloc(1, sizeof (struct mntentbuf))) || 247 !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) || 248 thr_setspecific(key, embufp)) { 249 mntfs_free_mntentbuf(embufp); 250 return (NULL); 251 } 252 } else { 253 embufp = tsd; 254 } 255 256 /* Return the buffer, resizing it if necessary. */ 257 if (size > embufp->mbuf_bufsize) { 258 if (embufp->mbuf_buf) 259 free(embufp->mbuf_buf); 260 if ((embufp->mbuf_buf = malloc(size)) == NULL) { 261 embufp->mbuf_bufsize = 0; 262 return (NULL); 263 } else { 264 embufp->mbuf_bufsize = size; 265 } 266 } 267 return (embufp); 268 } 269 270 /* 271 * The MNTIOC_GETMNTENT command in this release differs from that in early 272 * versions of Solaris 10. 273 * 274 * Previously, the command would copy a pointer to a struct extmnttab to an 275 * address provided as an argument. The pointer would be somewhere within a 276 * mapping already present within the user's address space. In addition, the 277 * text to which the struct's members pointed would also be within a 278 * pre-existing mapping. Now, the user is required to allocate memory for both 279 * the struct and the text buffer, and to pass the address of each within a 280 * struct mntentbuf. In order to conceal these details from a Solaris 10 client 281 * we allocate some thread-local storage in which to create the necessary data 282 * structures; this is static, thread-safe memory that will be cleaned up 283 * without the caller's intervention. 284 * 285 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should 286 * not work for older clients. 287 */ 288 int 289 mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 290 { 291 int err; 292 struct stat statbuf; 293 struct mntentbuf *embufp; 294 static size_t bufsize = MNT_LINE_MAX; 295 296 /* Do not emulate mntfs commands from up-to-date clients. */ 297 if (S10_FEATURE_IS_PRESENT(S10_FEATURE_ALTERED_MNTFS_IOCTL)) 298 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 299 300 /* Do not emulate mntfs commands directed at other file systems. */ 301 if ((err = __systemcall(rval, SYS_fstatat + 1024, 302 fdes, NULL, &statbuf, 0)) != 0) 303 return (err); 304 if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0) 305 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 306 307 if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY) 308 return (EINVAL); 309 310 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL) 311 return (ENOMEM); 312 313 /* 314 * MNTIOC_GETEXTMNTENT advances the file pointer once it has 315 * successfully copied out the result to the address provided. We 316 * therefore need to check the user-supplied address now since the 317 * one we'll be providing is guaranteed to work. 318 */ 319 if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0) 320 return (EFAULT); 321 322 /* 323 * Keep retrying for as long as we fail for want of a large enough 324 * buffer. 325 */ 326 for (;;) { 327 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, 328 MNTIOC_GETEXTMNTENT, embufp)) != 0) 329 return (err); 330 331 if (rval->sys_rval1 == MNTFS_TOOLONG) { 332 /* The buffer wasn't large enough. */ 333 (void) atomic_swap_ulong((unsigned long *)&bufsize, 334 2 * embufp->mbuf_bufsize); 335 if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL) 336 return (ENOMEM); 337 } else { 338 break; 339 } 340 } 341 342 if (brand_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0) 343 return (EFAULT); 344 345 return (0); 346 } 347 348 /* 349 * Assign the structure member value from the s (source) structure to the 350 * d (dest) structure. 351 */ 352 #define struct_assign(d, s, val) (((d).val) = ((s).val)) 353 354 /* 355 * The CRYPTO_GET_FUNCTION_LIST parameter structure crypto_function_list_t 356 * changed between S10 and Nevada, so we have to emulate the old S10 357 * crypto_function_list_t structure when interposing on the ioctl syscall. 358 */ 359 typedef struct s10_crypto_function_list { 360 boolean_t fl_digest_init; 361 boolean_t fl_digest; 362 boolean_t fl_digest_update; 363 boolean_t fl_digest_key; 364 boolean_t fl_digest_final; 365 366 boolean_t fl_encrypt_init; 367 boolean_t fl_encrypt; 368 boolean_t fl_encrypt_update; 369 boolean_t fl_encrypt_final; 370 371 boolean_t fl_decrypt_init; 372 boolean_t fl_decrypt; 373 boolean_t fl_decrypt_update; 374 boolean_t fl_decrypt_final; 375 376 boolean_t fl_mac_init; 377 boolean_t fl_mac; 378 boolean_t fl_mac_update; 379 boolean_t fl_mac_final; 380 381 boolean_t fl_sign_init; 382 boolean_t fl_sign; 383 boolean_t fl_sign_update; 384 boolean_t fl_sign_final; 385 boolean_t fl_sign_recover_init; 386 boolean_t fl_sign_recover; 387 388 boolean_t fl_verify_init; 389 boolean_t fl_verify; 390 boolean_t fl_verify_update; 391 boolean_t fl_verify_final; 392 boolean_t fl_verify_recover_init; 393 boolean_t fl_verify_recover; 394 395 boolean_t fl_digest_encrypt_update; 396 boolean_t fl_decrypt_digest_update; 397 boolean_t fl_sign_encrypt_update; 398 boolean_t fl_decrypt_verify_update; 399 400 boolean_t fl_seed_random; 401 boolean_t fl_generate_random; 402 403 boolean_t fl_session_open; 404 boolean_t fl_session_close; 405 boolean_t fl_session_login; 406 boolean_t fl_session_logout; 407 408 boolean_t fl_object_create; 409 boolean_t fl_object_copy; 410 boolean_t fl_object_destroy; 411 boolean_t fl_object_get_size; 412 boolean_t fl_object_get_attribute_value; 413 boolean_t fl_object_set_attribute_value; 414 boolean_t fl_object_find_init; 415 boolean_t fl_object_find; 416 boolean_t fl_object_find_final; 417 418 boolean_t fl_key_generate; 419 boolean_t fl_key_generate_pair; 420 boolean_t fl_key_wrap; 421 boolean_t fl_key_unwrap; 422 boolean_t fl_key_derive; 423 424 boolean_t fl_init_token; 425 boolean_t fl_init_pin; 426 boolean_t fl_set_pin; 427 428 boolean_t prov_is_hash_limited; 429 uint32_t prov_hash_threshold; 430 uint32_t prov_hash_limit; 431 } s10_crypto_function_list_t; 432 433 typedef struct s10_crypto_get_function_list { 434 uint_t fl_return_value; 435 crypto_provider_id_t fl_provider_id; 436 s10_crypto_function_list_t fl_list; 437 } s10_crypto_get_function_list_t; 438 439 /* 440 * The structure returned by the CRYPTO_GET_FUNCTION_LIST ioctl on /dev/crypto 441 * increased in size due to: 442 * 6482533 Threshold for HW offload via PKCS11 interface 443 * between S10 and Nevada. This is a relatively simple process of filling 444 * in the S10 structure fields with the Nevada data. 445 * 446 * We stat the device to make sure that the ioctl is meant for /dev/crypto. 447 * 448 */ 449 static int 450 crypto_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 451 { 452 int err; 453 s10_crypto_get_function_list_t s10_param; 454 crypto_get_function_list_t native_param; 455 static dev_t crypto_dev = (dev_t)-1; 456 457 if (passthru_otherdev_ioctl(&crypto_dev, "/dev/crypto", &err, 458 rval, fdes, cmd, arg) == 1) 459 return (err); 460 461 if (brand_uucopy((const void *)arg, &s10_param, sizeof (s10_param)) 462 != 0) 463 return (EFAULT); 464 struct_assign(native_param, s10_param, fl_provider_id); 465 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, 466 &native_param)) != 0) 467 return (err); 468 469 struct_assign(s10_param, native_param, fl_return_value); 470 struct_assign(s10_param, native_param, fl_provider_id); 471 472 struct_assign(s10_param, native_param, fl_list.fl_digest_init); 473 struct_assign(s10_param, native_param, fl_list.fl_digest); 474 struct_assign(s10_param, native_param, fl_list.fl_digest_update); 475 struct_assign(s10_param, native_param, fl_list.fl_digest_key); 476 struct_assign(s10_param, native_param, fl_list.fl_digest_final); 477 478 struct_assign(s10_param, native_param, fl_list.fl_encrypt_init); 479 struct_assign(s10_param, native_param, fl_list.fl_encrypt); 480 struct_assign(s10_param, native_param, fl_list.fl_encrypt_update); 481 struct_assign(s10_param, native_param, fl_list.fl_encrypt_final); 482 483 struct_assign(s10_param, native_param, fl_list.fl_decrypt_init); 484 struct_assign(s10_param, native_param, fl_list.fl_decrypt); 485 struct_assign(s10_param, native_param, fl_list.fl_decrypt_update); 486 struct_assign(s10_param, native_param, fl_list.fl_decrypt_final); 487 488 struct_assign(s10_param, native_param, fl_list.fl_mac_init); 489 struct_assign(s10_param, native_param, fl_list.fl_mac); 490 struct_assign(s10_param, native_param, fl_list.fl_mac_update); 491 struct_assign(s10_param, native_param, fl_list.fl_mac_final); 492 493 struct_assign(s10_param, native_param, fl_list.fl_sign_init); 494 struct_assign(s10_param, native_param, fl_list.fl_sign); 495 struct_assign(s10_param, native_param, fl_list.fl_sign_update); 496 struct_assign(s10_param, native_param, fl_list.fl_sign_final); 497 struct_assign(s10_param, native_param, fl_list.fl_sign_recover_init); 498 struct_assign(s10_param, native_param, fl_list.fl_sign_recover); 499 500 struct_assign(s10_param, native_param, fl_list.fl_verify_init); 501 struct_assign(s10_param, native_param, fl_list.fl_verify); 502 struct_assign(s10_param, native_param, fl_list.fl_verify_update); 503 struct_assign(s10_param, native_param, fl_list.fl_verify_final); 504 struct_assign(s10_param, native_param, fl_list.fl_verify_recover_init); 505 struct_assign(s10_param, native_param, fl_list.fl_verify_recover); 506 507 struct_assign(s10_param, native_param, 508 fl_list.fl_digest_encrypt_update); 509 struct_assign(s10_param, native_param, 510 fl_list.fl_decrypt_digest_update); 511 struct_assign(s10_param, native_param, fl_list.fl_sign_encrypt_update); 512 struct_assign(s10_param, native_param, 513 fl_list.fl_decrypt_verify_update); 514 515 struct_assign(s10_param, native_param, fl_list.fl_seed_random); 516 struct_assign(s10_param, native_param, fl_list.fl_generate_random); 517 518 struct_assign(s10_param, native_param, fl_list.fl_session_open); 519 struct_assign(s10_param, native_param, fl_list.fl_session_close); 520 struct_assign(s10_param, native_param, fl_list.fl_session_login); 521 struct_assign(s10_param, native_param, fl_list.fl_session_logout); 522 523 struct_assign(s10_param, native_param, fl_list.fl_object_create); 524 struct_assign(s10_param, native_param, fl_list.fl_object_copy); 525 struct_assign(s10_param, native_param, fl_list.fl_object_destroy); 526 struct_assign(s10_param, native_param, fl_list.fl_object_get_size); 527 struct_assign(s10_param, native_param, 528 fl_list.fl_object_get_attribute_value); 529 struct_assign(s10_param, native_param, 530 fl_list.fl_object_set_attribute_value); 531 struct_assign(s10_param, native_param, fl_list.fl_object_find_init); 532 struct_assign(s10_param, native_param, fl_list.fl_object_find); 533 struct_assign(s10_param, native_param, fl_list.fl_object_find_final); 534 535 struct_assign(s10_param, native_param, fl_list.fl_key_generate); 536 struct_assign(s10_param, native_param, fl_list.fl_key_generate_pair); 537 struct_assign(s10_param, native_param, fl_list.fl_key_wrap); 538 struct_assign(s10_param, native_param, fl_list.fl_key_unwrap); 539 struct_assign(s10_param, native_param, fl_list.fl_key_derive); 540 541 struct_assign(s10_param, native_param, fl_list.fl_init_token); 542 struct_assign(s10_param, native_param, fl_list.fl_init_pin); 543 struct_assign(s10_param, native_param, fl_list.fl_set_pin); 544 545 struct_assign(s10_param, native_param, fl_list.prov_is_hash_limited); 546 struct_assign(s10_param, native_param, fl_list.prov_hash_threshold); 547 struct_assign(s10_param, native_param, fl_list.prov_hash_limit); 548 549 return (brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param))); 550 } 551 552 /* 553 * The process contract CT_TGET and CT_TSET parameter structure ct_param_t 554 * changed between S10 and Nevada, so we have to emulate the old S10 555 * ct_param_t structure when interposing on the ioctl syscall. 556 */ 557 typedef struct s10_ct_param { 558 uint32_t ctpm_id; 559 uint32_t ctpm_pad; 560 uint64_t ctpm_value; 561 } s10_ct_param_t; 562 563 /* 564 * We have to emulate process contract ioctls for init(1M) because the 565 * ioctl parameter structure changed between S10 and Nevada. This is 566 * a relatively simple process of filling Nevada structure fields, 567 * shuffling values, and initiating a native system call. 568 * 569 * For now, we'll assume that all consumers of CT_TGET and CT_TSET will 570 * need emulation. We'll issue a stat to make sure that the ioctl 571 * is meant for the contract file system. 572 * 573 */ 574 static int 575 ctfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 576 { 577 int err; 578 s10_ct_param_t s10param; 579 ct_param_t param; 580 struct stat statbuf; 581 582 if ((err = __systemcall(rval, SYS_fstatat + 1024, 583 fdes, NULL, &statbuf, 0)) != 0) 584 return (err); 585 if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0) 586 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 587 588 if (brand_uucopy((const void *)arg, &s10param, sizeof (s10param)) != 0) 589 return (EFAULT); 590 param.ctpm_id = s10param.ctpm_id; 591 param.ctpm_size = sizeof (uint64_t); 592 param.ctpm_value = &s10param.ctpm_value; 593 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, ¶m)) 594 != 0) 595 return (err); 596 597 if (cmd == CT_TGET) 598 return (brand_uucopy(&s10param, (void *)arg, 599 sizeof (s10param))); 600 601 return (0); 602 } 603 604 /* 605 * ZFS ioctls have changed in each Solaris 10 (S10) release as well as in 606 * Solaris Next. The brand wraps ZFS commands so that the native commands 607 * are used, but we want to be sure no command sneaks in that uses ZFS 608 * without our knowledge. We'll abort the process if we see a ZFS ioctl. 609 */ 610 static int 611 zfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 612 { 613 static dev_t zfs_dev = (dev_t)-1; 614 int err; 615 616 if (passthru_otherdev_ioctl(&zfs_dev, ZFS_DEV, &err, 617 rval, fdes, cmd, arg) == 1) 618 return (err); 619 620 brand_abort(0, "ZFS ioctl!"); 621 /*NOTREACHED*/ 622 return (0); 623 } 624 625 struct s10_lofi_ioctl { 626 uint32_t li_id; 627 boolean_t li_force; 628 char li_filename[MAXPATHLEN + 1]; 629 }; 630 631 static int 632 lofi_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 633 { 634 static dev_t lofi_dev = (dev_t)-1; 635 struct s10_lofi_ioctl s10_param; 636 struct lofi_ioctl native_param; 637 int err; 638 639 if (passthru_otherdev_ioctl(&lofi_dev, "/dev/lofictl", &err, 640 rval, fdes, cmd, arg) == 1) 641 return (err); 642 643 if (brand_uucopy((const void *)arg, &s10_param, 644 sizeof (s10_param)) != 0) 645 return (EFAULT); 646 647 /* 648 * Somewhat weirdly, EIO is what the S10 lofi driver would 649 * return for unrecognised cmds. 650 */ 651 if (cmd >= LOFI_CHECK_COMPRESSED) 652 return (EIO); 653 654 bzero(&native_param, sizeof (native_param)); 655 656 struct_assign(native_param, s10_param, li_id); 657 struct_assign(native_param, s10_param, li_force); 658 659 /* 660 * Careful here, this has changed from [MAXPATHLEN + 1] to 661 * [MAXPATHLEN]. 662 */ 663 bcopy(s10_param.li_filename, native_param.li_filename, 664 sizeof (native_param.li_filename)); 665 native_param.li_filename[MAXPATHLEN - 1] = '\0'; 666 667 err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, &native_param); 668 669 struct_assign(s10_param, native_param, li_id); 670 /* li_force is input-only */ 671 672 bcopy(native_param.li_filename, s10_param.li_filename, 673 sizeof (native_param.li_filename)); 674 675 (void) brand_uucopy(&s10_param, (void *)arg, sizeof (s10_param)); 676 return (err); 677 } 678 679 int 680 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg) 681 { 682 switch (cmd) { 683 case CRYPTO_GET_FUNCTION_LIST: 684 return (crypto_ioctl(rval, fdes, cmd, arg)); 685 case CT_TGET: 686 /*FALLTHRU*/ 687 case CT_TSET: 688 return (ctfs_ioctl(rval, fdes, cmd, arg)); 689 case MNTIOC_GETMNTENT: 690 /*FALLTHRU*/ 691 case MNTIOC_GETEXTMNTENT: 692 /*FALLTHRU*/ 693 case MNTIOC_GETMNTANY: 694 return (mntfs_ioctl(rval, fdes, cmd, arg)); 695 } 696 697 switch (cmd & ~0xff) { 698 case ZFS_IOC: 699 return (zfs_ioctl(rval, fdes, cmd, arg)); 700 701 case LOFI_IOC_BASE: 702 return (lofi_ioctl(rval, fdes, cmd, arg)); 703 704 default: 705 break; 706 } 707 708 return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)); 709 } 710 711 /* 712 * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when 713 * applied to files opened with O_APPEND. The offset argument is ignored and 714 * the buffer is appended to the target file in S10, whereas the current file 715 * position is ignored in Nevada (i.e., pwrite() acts as though the target file 716 * wasn't opened with O_APPEND). This is a result of the fix for CR 6655660 717 * (pwrite() must ignore the O_APPEND/FAPPEND flag). 718 * 719 * We emulate the old S10 pwrite() behavior by checking whether the target file 720 * was opened with O_APPEND. If it was, then invoke the write() system call 721 * instead of pwrite(); otherwise, invoke the pwrite() system call as usual. 722 */ 723 static int 724 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes, 725 off_t offset) 726 { 727 int err; 728 729 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0) 730 return (err); 731 if (rval->sys_rval1 & O_APPEND) 732 return (__systemcall(rval, SYS_write + 1024, fd, bufferp, 733 num_bytes)); 734 return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes, 735 offset)); 736 } 737 738 #if !defined(_LP64) 739 /* 740 * This is the large file version of the pwrite() system call for 32-bit 741 * processes. This exists for the same reason that s10_pwrite() exists; see 742 * the comment above s10_pwrite(). 743 */ 744 static int 745 s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes, 746 uint32_t offset_1, uint32_t offset_2) 747 { 748 int err; 749 750 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0) 751 return (err); 752 if (rval->sys_rval1 & O_APPEND) 753 return (__systemcall(rval, SYS_write + 1024, fd, bufferp, 754 num_bytes)); 755 return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp, 756 num_bytes, offset_1, offset_2)); 757 } 758 #endif /* !_LP64 */ 759 760 /* 761 * These are convenience macros that s10_getdents_common() uses. Both treat 762 * their arguments, which should be character pointers, as dirent pointers or 763 * dirent64 pointers and yield their d_name and d_reclen fields. These 764 * macros shouldn't be used outside of s10_getdents_common(). 765 */ 766 #define dirent_name(charptr) ((charptr) + name_offset) 767 #define dirent_reclen(charptr) \ 768 (*(unsigned short *)(uintptr_t)((charptr) + reclen_offset)) 769 770 /* 771 * This function contains code that is common to both s10_getdents() and 772 * s10_getdents64(). See the comment above s10_getdents() for details. 773 * 774 * rval, fd, buf, and nbyte should be passed unmodified from s10_getdents() 775 * and s10_getdents64(). getdents_syscall_id should be either SYS_getdents 776 * or SYS_getdents64. name_offset should be the the byte offset of 777 * the d_name field in the dirent structures passed to the kernel via the 778 * syscall represented by getdents_syscall_id. reclen_offset should be 779 * the byte offset of the d_reclen field in the aforementioned dirent 780 * structures. 781 */ 782 static int 783 s10_getdents_common(sysret_t *rval, int fd, char *buf, size_t nbyte, 784 int getdents_syscall_id, size_t name_offset, size_t reclen_offset) 785 { 786 int err; 787 size_t buf_size; 788 char *local_buf; 789 char *buf_current; 790 791 /* 792 * Use a special brand operation, B_S10_ISFDXATTRDIR, to determine 793 * whether the specified file descriptor refers to an extended file 794 * attribute directory. If it doesn't, then SYS_getdents won't 795 * reveal extended file attributes, in which case we can simply 796 * hand the syscall to the native kernel. 797 */ 798 if ((err = __systemcall(rval, SYS_brand + 1024, B_S10_ISFDXATTRDIR, 799 fd)) != 0) 800 return (err); 801 if (rval->sys_rval1 == 0) 802 return (__systemcall(rval, getdents_syscall_id + 1024, fd, buf, 803 nbyte)); 804 805 /* 806 * The file descriptor refers to an extended file attributes directory. 807 * We need to create a dirent buffer that's as large as buf into which 808 * the native SYS_getdents will store the special extended file 809 * attribute directory's entries. We can't dereference buf because 810 * it might be an invalid pointer! 811 */ 812 if (nbyte > MAXGETDENTS_SIZE) 813 nbyte = MAXGETDENTS_SIZE; 814 local_buf = (char *)malloc(nbyte); 815 if (local_buf == NULL) { 816 /* 817 * getdents(2) doesn't return an error code indicating a memory 818 * allocation error and it doesn't make sense to return any of 819 * its documented error codes for a malloc(3C) failure. We'll 820 * use ENOMEM even though getdents(2) doesn't use it because it 821 * best describes the failure. 822 */ 823 (void) B_TRUSS_POINT_3(rval, getdents_syscall_id, ENOMEM, fd, 824 buf, nbyte); 825 rval->sys_rval1 = -1; 826 rval->sys_rval2 = 0; 827 return (EIO); 828 } 829 830 /* 831 * Issue a native SYS_getdents syscall but use our local dirent buffer 832 * instead of buf. This will allow us to examine the returned dirent 833 * structures immediately and copy them to buf later. That way the 834 * calling process won't be able to see the dirent structures until 835 * we finish examining them. 836 */ 837 if ((err = __systemcall(rval, getdents_syscall_id + 1024, fd, local_buf, 838 nbyte)) != 0) { 839 free(local_buf); 840 return (err); 841 } 842 buf_size = rval->sys_rval1; 843 if (buf_size == 0) { 844 free(local_buf); 845 return (0); 846 } 847 848 /* 849 * Look for SUNWattr_ro (VIEW_READONLY) and SUNWattr_rw 850 * (VIEW_READWRITE) in the directory entries and remove them 851 * from the dirent buffer. 852 */ 853 for (buf_current = local_buf; 854 (size_t)(buf_current - local_buf) < buf_size; /* cstyle */) { 855 if (strcmp(dirent_name(buf_current), VIEW_READONLY) != 0 && 856 strcmp(dirent_name(buf_current), VIEW_READWRITE) != 0) { 857 /* 858 * The dirent refers to an attribute that should 859 * be visible to Solaris 10 processes. Keep it 860 * and examine the next entry in the buffer. 861 */ 862 buf_current += dirent_reclen(buf_current); 863 } else { 864 /* 865 * We found either SUNWattr_ro (VIEW_READONLY) 866 * or SUNWattr_rw (VIEW_READWRITE). Remove it 867 * from the dirent buffer by decrementing 868 * buf_size by the size of the entry and 869 * overwriting the entry with the remaining 870 * entries. 871 */ 872 buf_size -= dirent_reclen(buf_current); 873 (void) memmove(buf_current, buf_current + 874 dirent_reclen(buf_current), buf_size - 875 (size_t)(buf_current - local_buf)); 876 } 877 } 878 879 /* 880 * Copy local_buf into buf so that the calling process can see 881 * the results. 882 */ 883 if ((err = brand_uucopy(local_buf, buf, buf_size)) != 0) { 884 free(local_buf); 885 rval->sys_rval1 = -1; 886 rval->sys_rval2 = 0; 887 return (err); 888 } 889 rval->sys_rval1 = buf_size; 890 free(local_buf); 891 return (0); 892 } 893 894 /* 895 * Solaris Next added two special extended file attributes, SUNWattr_ro and 896 * SUNWattr_rw, which are called "extended system attributes". They have 897 * special semantics (e.g., a process cannot unlink SUNWattr_ro) and should 898 * not appear in solaris10-branded zones because no Solaris 10 applications, 899 * including system commands such as tar(1), are coded to correctly handle these 900 * special attributes. 901 * 902 * This emulation function solves the aforementioned problem by emulating 903 * the getdents(2) syscall and filtering both system attributes out of resulting 904 * directory entry lists. The emulation function only filters results when 905 * the given file descriptor refers to an extended file attribute directory. 906 * Filtering getdents(2) results is expensive because it requires dynamic 907 * memory allocation; however, the performance cost is tolerable because 908 * we don't expect Solaris 10 processes to frequently examine extended file 909 * attribute directories. 910 * 911 * The brand's emulation library needs two getdents(2) emulation functions 912 * because getdents(2) comes in two flavors: non-largefile-aware getdents(2) 913 * and largefile-aware getdents64(2). s10_getdents() handles the non-largefile- 914 * aware case for 32-bit processes and all getdents(2) syscalls for 64-bit 915 * processes (64-bit processes use largefile-aware interfaces by default). 916 * See s10_getdents64() below for the largefile-aware getdents64(2) emulation 917 * function for 32-bit processes. 918 */ 919 static int 920 s10_getdents(sysret_t *rval, int fd, struct dirent *buf, size_t nbyte) 921 { 922 return (s10_getdents_common(rval, fd, (char *)buf, nbyte, SYS_getdents, 923 offsetof(struct dirent, d_name), 924 offsetof(struct dirent, d_reclen))); 925 } 926 927 #ifndef _LP64 928 /* 929 * This is the largefile-aware version of getdents(2) for 32-bit processes. 930 * This exists for the same reason that s10_getdents() exists. See the comment 931 * above s10_getdents(). 932 */ 933 static int 934 s10_getdents64(sysret_t *rval, int fd, struct dirent64 *buf, size_t nbyte) 935 { 936 return (s10_getdents_common(rval, fd, (char *)buf, nbyte, 937 SYS_getdents64, offsetof(struct dirent64, d_name), 938 offsetof(struct dirent64, d_reclen))); 939 } 940 #endif /* !_LP64 */ 941 942 #define S10_TRIVIAL_ACL_CNT 6 943 #define NATIVE_TRIVIAL_ACL_CNT 3 944 945 /* 946 * Check if the ACL qualifies as a trivial ACL based on the native 947 * interpretation. 948 */ 949 static boolean_t 950 has_trivial_native_acl(int cmd, int cnt, const char *fname, int fd) 951 { 952 int i, err; 953 sysret_t rval; 954 ace_t buf[NATIVE_TRIVIAL_ACL_CNT]; 955 956 if (fname != NULL) 957 err = __systemcall(&rval, SYS_pathconf + 1024, fname, 958 _PC_ACL_ENABLED); 959 else 960 err = __systemcall(&rval, SYS_fpathconf + 1024, fd, 961 _PC_ACL_ENABLED); 962 if (err != 0 || rval.sys_rval1 != _ACL_ACE_ENABLED) 963 return (B_FALSE); 964 965 /* 966 * If we just got the ACL cnt, we don't need to get it again, its 967 * passed in as the cnt arg. 968 */ 969 if (cmd != ACE_GETACLCNT) { 970 if (fname != NULL) { 971 if (__systemcall(&rval, SYS_acl + 1024, fname, 972 ACE_GETACLCNT, 0, NULL) != 0) 973 return (B_FALSE); 974 } else { 975 if (__systemcall(&rval, SYS_facl + 1024, fd, 976 ACE_GETACLCNT, 0, NULL) != 0) 977 return (B_FALSE); 978 } 979 cnt = rval.sys_rval1; 980 } 981 982 if (cnt != NATIVE_TRIVIAL_ACL_CNT) 983 return (B_FALSE); 984 985 if (fname != NULL) { 986 if (__systemcall(&rval, SYS_acl + 1024, fname, ACE_GETACL, cnt, 987 buf) != 0) 988 return (B_FALSE); 989 } else { 990 if (__systemcall(&rval, SYS_facl + 1024, fd, ACE_GETACL, cnt, 991 buf) != 0) 992 return (B_FALSE); 993 } 994 995 /* 996 * The following is based on the logic from the native OS 997 * ace_trivial_common() to determine if the native ACL is trivial. 998 */ 999 for (i = 0; i < cnt; i++) { 1000 switch (buf[i].a_flags & ACE_TYPE_FLAGS) { 1001 case ACE_OWNER: 1002 case ACE_GROUP|ACE_IDENTIFIER_GROUP: 1003 case ACE_EVERYONE: 1004 break; 1005 default: 1006 return (B_FALSE); 1007 } 1008 1009 if (buf[i].a_flags & (ACE_FILE_INHERIT_ACE| 1010 ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| 1011 ACE_INHERIT_ONLY_ACE)) 1012 return (B_FALSE); 1013 1014 /* 1015 * Special check for some special bits 1016 * 1017 * Don't allow anybody to deny reading basic 1018 * attributes or a files ACL. 1019 */ 1020 if (buf[i].a_access_mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && 1021 buf[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE) 1022 return (B_FALSE); 1023 1024 /* 1025 * Delete permissions are never set by default 1026 */ 1027 if (buf[i].a_access_mask & (ACE_DELETE|ACE_DELETE_CHILD)) 1028 return (B_FALSE); 1029 /* 1030 * only allow owner@ to have 1031 * write_acl/write_owner/write_attributes/write_xattr/ 1032 */ 1033 if (buf[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE && 1034 (!(buf[i].a_flags & ACE_OWNER) && (buf[i].a_access_mask & 1035 (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| 1036 ACE_WRITE_NAMED_ATTRS)))) 1037 return (B_FALSE); 1038 1039 } 1040 1041 return (B_TRUE); 1042 } 1043 1044 /* 1045 * The following logic is based on the S10 adjust_ace_pair_common() code. 1046 */ 1047 static void 1048 s10_adjust_ace_mask(void *pair, size_t access_off, size_t pairsize, mode_t mode) 1049 { 1050 char *datap = (char *)pair; 1051 uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off); 1052 uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize + 1053 access_off); 1054 1055 if (mode & S_IROTH) 1056 *amask1 |= ACE_READ_DATA; 1057 else 1058 *amask0 |= ACE_READ_DATA; 1059 if (mode & S_IWOTH) 1060 *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA; 1061 else 1062 *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA; 1063 if (mode & S_IXOTH) 1064 *amask1 |= ACE_EXECUTE; 1065 else 1066 *amask0 |= ACE_EXECUTE; 1067 } 1068 1069 /* 1070 * Construct a trivial S10 style ACL. 1071 */ 1072 static int 1073 make_trivial_s10_acl(const char *fname, int fd, ace_t *bp) 1074 { 1075 int err; 1076 sysret_t rval; 1077 struct stat64 buf; 1078 ace_t trivial_s10_acl[] = { 1079 {(uint_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE}, 1080 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| 1081 ACE_WRITE_NAMED_ATTRS, ACE_OWNER, 1082 ACE_ACCESS_ALLOWED_ACE_TYPE}, 1083 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, 1084 ACE_ACCESS_DENIED_ACE_TYPE}, 1085 {(uint_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, 1086 ACE_ACCESS_ALLOWED_ACE_TYPE}, 1087 {(uint_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| 1088 ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, 1089 ACE_ACCESS_DENIED_ACE_TYPE}, 1090 {(uint_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES| 1091 ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE, ACE_EVERYONE, 1092 ACE_ACCESS_ALLOWED_ACE_TYPE} 1093 }; 1094 1095 if (fname != NULL) { 1096 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, AT_FDCWD, 1097 fname, &buf, 0)) != 0) 1098 return (err); 1099 } else { 1100 if ((err = __systemcall(&rval, SYS_fstatat64 + 1024, fd, 1101 NULL, &buf, 0)) != 0) 1102 return (err); 1103 } 1104 1105 s10_adjust_ace_mask(&trivial_s10_acl[0], offsetof(ace_t, a_access_mask), 1106 sizeof (ace_t), (buf.st_mode & 0700) >> 6); 1107 s10_adjust_ace_mask(&trivial_s10_acl[2], offsetof(ace_t, a_access_mask), 1108 sizeof (ace_t), (buf.st_mode & 0070) >> 3); 1109 s10_adjust_ace_mask(&trivial_s10_acl[4], offsetof(ace_t, a_access_mask), 1110 sizeof (ace_t), buf.st_mode & 0007); 1111 1112 if (brand_uucopy(&trivial_s10_acl, bp, sizeof (trivial_s10_acl)) != 0) 1113 return (EFAULT); 1114 1115 return (0); 1116 } 1117 1118 /* 1119 * The definition of a trivial ace-style ACL (used by ZFS and NFSv4) has been 1120 * simplified since S10. Instead of 6 entries on a trivial S10 ACE ACL we now 1121 * have 3 streamlined entries. The new, simpler trivial style confuses S10 1122 * commands such as 'ls -v' or 'cp -p' which don't see the expected S10 trivial 1123 * ACL entries and thus assume that there is a complex ACL on the file. 1124 * 1125 * See: PSARC/2010/029 Improved ACL interoperability 1126 * 1127 * Note that the trival ACL detection code is implemented in acl_trival() in 1128 * lib/libsec/common/aclutils.c. It always uses the acl() syscall (not the 1129 * facl syscall) to determine if an ACL is trivial. However, we emulate both 1130 * acl() and facl() so that the two provide consistent results. 1131 * 1132 * We don't currently try to emulate setting of ACLs since the primary 1133 * consumer of this feature is SMB or NFSv4 servers, neither of which are 1134 * supported in solaris10-branded zones. If ACLs are used they must be set on 1135 * files using the native OS interpretation. 1136 */ 1137 int 1138 s10_acl(sysret_t *rval, const char *fname, int cmd, int nentries, void *aclbufp) 1139 { 1140 int res; 1141 1142 res = __systemcall(rval, SYS_acl + 1024, fname, cmd, nentries, aclbufp); 1143 1144 switch (cmd) { 1145 case ACE_GETACLCNT: 1146 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT, 1147 rval->sys_rval1, fname, 0)) { 1148 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT; 1149 } 1150 break; 1151 case ACE_GETACL: 1152 if (res == 0 && 1153 has_trivial_native_acl(ACE_GETACL, 0, fname, 0) && 1154 nentries >= S10_TRIVIAL_ACL_CNT) { 1155 res = make_trivial_s10_acl(fname, 0, aclbufp); 1156 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT; 1157 } 1158 break; 1159 } 1160 1161 return (res); 1162 } 1163 1164 int 1165 s10_facl(sysret_t *rval, int fdes, int cmd, int nentries, void *aclbufp) 1166 { 1167 int res; 1168 1169 res = __systemcall(rval, SYS_facl + 1024, fdes, cmd, nentries, aclbufp); 1170 1171 switch (cmd) { 1172 case ACE_GETACLCNT: 1173 if (res == 0 && has_trivial_native_acl(ACE_GETACLCNT, 1174 rval->sys_rval1, NULL, fdes)) { 1175 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT; 1176 } 1177 break; 1178 case ACE_GETACL: 1179 if (res == 0 && 1180 has_trivial_native_acl(ACE_GETACL, 0, NULL, fdes) && 1181 nentries >= S10_TRIVIAL_ACL_CNT) { 1182 res = make_trivial_s10_acl(NULL, fdes, aclbufp); 1183 rval->sys_rval1 = S10_TRIVIAL_ACL_CNT; 1184 } 1185 break; 1186 } 1187 1188 return (res); 1189 } 1190 1191 #define S10_AC_PROC (0x1 << 28) 1192 #define S10_AC_TASK (0x2 << 28) 1193 #define S10_AC_FLOW (0x4 << 28) 1194 #define S10_AC_MODE(x) ((x) & 0xf0000000) 1195 #define S10_AC_OPTION(x) ((x) & 0x0fffffff) 1196 1197 /* 1198 * The mode shift, mode mask and option mask for acctctl have changed. The 1199 * mode is currently the top full byte and the option is the lower 3 full bytes. 1200 */ 1201 int 1202 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz) 1203 { 1204 int mode = S10_AC_MODE(cmd); 1205 int option = S10_AC_OPTION(cmd); 1206 1207 switch (mode) { 1208 case S10_AC_PROC: 1209 mode = AC_PROC; 1210 break; 1211 case S10_AC_TASK: 1212 mode = AC_TASK; 1213 break; 1214 case S10_AC_FLOW: 1215 mode = AC_FLOW; 1216 break; 1217 default: 1218 return (B_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf, 1219 bufsz)); 1220 } 1221 1222 return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf, 1223 bufsz)); 1224 } 1225 1226 /* 1227 * The Audit Policy parameters have changed due to: 1228 * 6466722 audituser and AUDIT_USER are defined, unused, undocumented and 1229 * should be removed. 1230 * 1231 * In S10 we had the following flag: 1232 * #define AUDIT_USER 0x0040 1233 * which doesn't exist in Solaris Next where the subsequent flags are shifted 1234 * down. For example, in S10 we had: 1235 * #define AUDIT_GROUP 0x0080 1236 * but on Solaris Next we have: 1237 * #define AUDIT_GROUP 0x0040 1238 * AUDIT_GROUP has the value AUDIT_USER had in S10 and all of the subsequent 1239 * bits are also shifted one place. 1240 * 1241 * When we're getting or setting the Audit Policy parameters we need to 1242 * shift the outgoing or incoming bits into their proper positions. Since 1243 * S10_AUDIT_USER was always unused, we always clear that bit on A_GETPOLICY. 1244 * 1245 * The command we care about, BSM_AUDITCTL, passes the most parameters (3), 1246 * so declare this function to take up to 4 args and just pass them on. 1247 * The number of parameters for s10_auditsys needs to be equal to the BSM_* 1248 * subcommand that has the most parameters, since we want to pass all 1249 * parameters through, regardless of which subcommands we interpose on. 1250 * 1251 * Note that the auditsys system call uses the SYSENT_AP macro wrapper instead 1252 * of the more common SYSENT_CI macro. This means the return value is a 1253 * SE_64RVAL so the syscall table uses RV_64RVAL. 1254 */ 1255 1256 #define S10_AUDIT_HMASK 0xffffffc0 1257 #define S10_AUDIT_LMASK 0x3f 1258 #define S10_AUC_NOSPACE 0x3 1259 1260 int 1261 s10_auditsys(sysret_t *rval, int bsmcmd, intptr_t a0, intptr_t a1, intptr_t a2) 1262 { 1263 int err; 1264 uint32_t m; 1265 1266 if (bsmcmd != BSM_AUDITCTL) 1267 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, 1268 a2)); 1269 1270 if ((int)a0 == A_GETPOLICY) { 1271 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, 1272 &m, a2)) != 0) 1273 return (err); 1274 m = ((m & S10_AUDIT_HMASK) << 1) | (m & S10_AUDIT_LMASK); 1275 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0) 1276 return (EFAULT); 1277 return (0); 1278 1279 } else if ((int)a0 == A_SETPOLICY) { 1280 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0) 1281 return (EFAULT); 1282 m = ((m >> 1) & S10_AUDIT_HMASK) | (m & S10_AUDIT_LMASK); 1283 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m, 1284 a2)); 1285 } else if ((int)a0 == A_GETCOND) { 1286 if ((err = __systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, 1287 &m, a2)) != 0) 1288 return (err); 1289 if (m == AUC_NOSPACE) 1290 m = S10_AUC_NOSPACE; 1291 if (brand_uucopy(&m, (void *)a1, sizeof (m)) != 0) 1292 return (EFAULT); 1293 return (0); 1294 } else if ((int)a0 == A_SETCOND) { 1295 if (brand_uucopy((const void *)a1, &m, sizeof (m)) != 0) 1296 return (EFAULT); 1297 if (m == S10_AUC_NOSPACE) 1298 m = AUC_NOSPACE; 1299 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, &m, 1300 a2)); 1301 } 1302 1303 return (__systemcall(rval, SYS_auditsys + 1024, bsmcmd, a0, a1, a2)); 1304 } 1305 1306 /* 1307 * Determine whether the executable passed to SYS_exec or SYS_execve is a 1308 * native executable. The s10_npreload.so invokes the B_S10_NATIVE brand 1309 * operation which patches up the processes exec info to eliminate any trace 1310 * of the wrapper. That will make pgrep and other commands that examine 1311 * process' executable names and command-line parameters work properly. 1312 */ 1313 static int 1314 s10_exec_native(sysret_t *rval, const char *fname, const char **argp, 1315 const char **envp) 1316 { 1317 const char *filename = fname; 1318 char path[64]; 1319 int err; 1320 1321 /* Get a copy of the executable we're trying to run */ 1322 path[0] = '\0'; 1323 (void) brand_uucopystr(filename, path, sizeof (path)); 1324 1325 /* Check if we're trying to run a native binary */ 1326 if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native", 1327 sizeof (path)) != 0) 1328 return (0); 1329 1330 /* Skip the first element in the argv array */ 1331 argp++; 1332 1333 /* 1334 * The the path of the dynamic linker is the second parameter 1335 * of s10_native_exec(). 1336 */ 1337 if (brand_uucopy(argp, &filename, sizeof (char *)) != 0) 1338 return (EFAULT); 1339 1340 /* If an exec call succeeds, it never returns */ 1341 err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename, 1342 argp, envp, NULL, NULL, NULL); 1343 brand_assert(err != 0); 1344 return (err); 1345 } 1346 1347 /* 1348 * Interpose on the SYS_exec syscall to detect native wrappers. 1349 */ 1350 int 1351 s10_exec(sysret_t *rval, const char *fname, const char **argp) 1352 { 1353 int err; 1354 1355 if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0) 1356 return (err); 1357 1358 /* If an exec call succeeds, it never returns */ 1359 err = __systemcall(rval, SYS_execve + 1024, fname, argp, NULL); 1360 brand_assert(err != 0); 1361 return (err); 1362 } 1363 1364 /* 1365 * Interpose on the SYS_execve syscall to detect native wrappers. 1366 */ 1367 int 1368 s10_execve(sysret_t *rval, const char *fname, const char **argp, 1369 const char **envp) 1370 { 1371 int err; 1372 1373 if ((err = s10_exec_native(rval, fname, argp, envp)) != 0) 1374 return (err); 1375 1376 /* If an exec call succeeds, it never returns */ 1377 err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp); 1378 brand_assert(err != 0); 1379 return (err); 1380 } 1381 1382 /* 1383 * S10's issetugid() syscall is now a subcode to privsys(). 1384 */ 1385 static int 1386 s10_issetugid(sysret_t *rval) 1387 { 1388 return (__systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID, 1389 0, 0, 0, 0, 0)); 1390 } 1391 1392 /* 1393 * S10's socket() syscall does not split type and flags 1394 */ 1395 static int 1396 s10_so_socket(sysret_t *rval, int domain, int type, int protocol, 1397 char *devpath, int version) 1398 { 1399 if ((type & ~SOCK_TYPE_MASK) != 0) { 1400 errno = EINVAL; 1401 return (-1); 1402 } 1403 return (__systemcall(rval, SYS_so_socket + 1024, domain, type, 1404 protocol, devpath, version)); 1405 } 1406 1407 /* 1408 * S10's pipe() syscall has a different calling convention 1409 */ 1410 static int 1411 s10_pipe(sysret_t *rval) 1412 { 1413 int fds[2], err; 1414 if ((err = __systemcall(rval, SYS_pipe + 1024, fds, 0)) != 0) 1415 return (err); 1416 1417 rval->sys_rval1 = fds[0]; 1418 rval->sys_rval2 = fds[1]; 1419 return (0); 1420 } 1421 1422 /* 1423 * S10's accept() syscall takes three arguments 1424 */ 1425 static int 1426 s10_accept(sysret_t *rval, int sock, struct sockaddr *addr, uint_t *addrlen, 1427 int version) 1428 { 1429 return (__systemcall(rval, SYS_accept + 1024, sock, addr, addrlen, 1430 version, 0)); 1431 } 1432 1433 static long 1434 s10_uname(sysret_t *rv, uintptr_t p1) 1435 { 1436 struct utsname un, *unp = (struct utsname *)p1; 1437 int rev, err; 1438 1439 if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0) 1440 return (err); 1441 1442 rev = atoi(&un.release[2]); 1443 brand_assert(rev >= 11); 1444 bzero(un.release, _SYS_NMLN); 1445 (void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN); 1446 bzero(un.version, _SYS_NMLN); 1447 (void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN); 1448 1449 /* copy out the modified uname info */ 1450 return (brand_uucopy(&un, unp, sizeof (un))); 1451 } 1452 1453 int 1454 s10_sysconfig(sysret_t *rv, int which) 1455 { 1456 long value; 1457 1458 /* 1459 * We must interpose on the sysconfig(2) requests 1460 * that deal with the realtime signal number range. 1461 * All others get passed to the native sysconfig(2). 1462 */ 1463 switch (which) { 1464 case _CONFIG_RTSIG_MAX: 1465 value = S10_SIGRTMAX - S10_SIGRTMIN + 1; 1466 break; 1467 case _CONFIG_SIGRT_MIN: 1468 value = S10_SIGRTMIN; 1469 break; 1470 case _CONFIG_SIGRT_MAX: 1471 value = S10_SIGRTMAX; 1472 break; 1473 default: 1474 return (__systemcall(rv, SYS_sysconfig + 1024, which)); 1475 } 1476 1477 (void) B_TRUSS_POINT_1(rv, SYS_sysconfig, 0, which); 1478 rv->sys_rval1 = value; 1479 rv->sys_rval2 = 0; 1480 1481 return (0); 1482 } 1483 1484 int 1485 s10_sysinfo(sysret_t *rv, int command, char *buf, long count) 1486 { 1487 char *value; 1488 int len; 1489 1490 /* 1491 * We must interpose on the sysinfo(2) commands SI_RELEASE and 1492 * SI_VERSION; all others get passed to the native sysinfo(2) 1493 * command. 1494 */ 1495 switch (command) { 1496 case SI_RELEASE: 1497 value = S10_UTS_RELEASE; 1498 break; 1499 1500 case SI_VERSION: 1501 value = S10_UTS_VERSION; 1502 break; 1503 1504 default: 1505 /* 1506 * The default action is to pass the command to the 1507 * native sysinfo(2) syscall. 1508 */ 1509 return (__systemcall(rv, SYS_systeminfo + 1024, 1510 command, buf, count)); 1511 } 1512 1513 len = strlen(value) + 1; 1514 if (count > 0) { 1515 if (brand_uucopystr(value, buf, count) != 0) 1516 return (EFAULT); 1517 1518 /* 1519 * Assure NULL termination of buf as brand_uucopystr() doesn't. 1520 */ 1521 if (len > count && brand_uucopy("\0", buf + (count - 1), 1) 1522 != 0) 1523 return (EFAULT); 1524 } 1525 1526 /* 1527 * On success, sysinfo(2) returns the size of buffer required to hold 1528 * the complete value plus its terminating NULL byte. 1529 */ 1530 (void) B_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count); 1531 rv->sys_rval1 = len; 1532 rv->sys_rval2 = 0; 1533 return (0); 1534 } 1535 1536 #if defined(__x86) 1537 #if defined(__amd64) 1538 /* 1539 * 64-bit x86 LWPs created by SYS_lwp_create start here if they need to set 1540 * their %fs registers to the legacy Solaris 10 selector value. 1541 * 1542 * This function does three things: 1543 * 1544 * 1. Trap to the kernel so that it can set %fs to the legacy Solaris 10 1545 * selector value. 1546 * 2. Read the LWP's true entry point (the entry point supplied by libc 1547 * when SYS_lwp_create was invoked) from %r14. 1548 * 3. Eliminate this function's stack frame and pass control to the LWP's 1549 * true entry point. 1550 * 1551 * See the comment above s10_lwp_create_correct_fs() (see below) for the reason 1552 * why this function exists. 1553 */ 1554 /*ARGSUSED*/ 1555 static void 1556 s10_lwp_create_entry_point(void *ulwp_structp) 1557 { 1558 sysret_t rval; 1559 1560 /* 1561 * The new LWP's %fs register is initially zero, but libc won't 1562 * function correctly when %fs is zero. Change the LWP's %fs register 1563 * via SYS_brand. 1564 */ 1565 (void) __systemcall(&rval, SYS_brand + 1024, B_S10_FSREGCORRECTION); 1566 1567 /* 1568 * Jump to the true entry point, which is stored in %r14. 1569 * Remove our stack frame before jumping so that 1570 * s10_lwp_create_entry_point() won't be seen in stack traces. 1571 * 1572 * NOTE: s10_lwp_create_entry_point() pushes %r12 onto its stack frame 1573 * so that it can use it as a temporary register. We don't restore %r12 1574 * in this assembly block because we don't care about its value (and 1575 * neither does _lwp_start()). Besides, the System V ABI AMD64 1576 * Actirecture Processor Supplement doesn't specify that %r12 should 1577 * have a special value when LWPs start, so we can ignore its value when 1578 * we jump to the true entry point. Furthermore, %r12 is a callee-saved 1579 * register, so the true entry point should push %r12 onto its stack 1580 * before using the register. We ignore %r14 after we read it for 1581 * similar reasons. 1582 * 1583 * NOTE: The compiler will generate a function epilogue for this 1584 * function despite the fact that the LWP will never execute it. 1585 * We could hand-code this entire function in assembly to eliminate 1586 * the epilogue, but the epilogue is only three or four instructions, 1587 * so we wouldn't save much space. Besides, why would we want 1588 * to create yet another ugly, hard-to-maintain assembly function when 1589 * we could write most of it in C? 1590 */ 1591 __asm__ __volatile__( 1592 "movq %0, %%rdi\n\t" /* pass ulwp_structp as arg1 */ 1593 "movq %%rbp, %%rsp\n\t" /* eliminate the stack frame */ 1594 "popq %%rbp\n\t" 1595 "jmp *%%r14\n\t" /* jump to the true entry point */ 1596 : : "r" (ulwp_structp)); 1597 /*NOTREACHED*/ 1598 } 1599 1600 /* 1601 * The S10 libc expects that %fs will be nonzero for new 64-bit x86 LWPs but the 1602 * Nevada kernel clears %fs for such LWPs. Unforunately, new LWPs do not issue 1603 * SYS_lwp_private (see s10_lwp_private() below) after they are created, so 1604 * we must ensure that new LWPs invoke a brand operation that sets %fs to a 1605 * nonzero value immediately after their creation. 1606 * 1607 * The easiest way to do this is to make new LWPs start at a special function, 1608 * s10_lwp_create_entry_point() (see its definition above), that invokes the 1609 * brand operation that corrects %fs. We'll store the entry points of new LWPs 1610 * in their %r14 registers so that s10_lwp_create_entry_point() can find and 1611 * call them after invoking the special brand operation. %r14 is a callee-saved 1612 * register; therefore, any functions invoked by s10_lwp_create_entry_point() 1613 * and all functions dealing with signals (e.g., sigacthandler()) will preserve 1614 * %r14 for s10_lwp_create_entry_point(). 1615 * 1616 * The Nevada kernel can safely work with nonzero %fs values because the kernel 1617 * configures per-thread %fs segment descriptors so that the legacy %fs selector 1618 * value will still work. See the comment in lwp_load() regarding %fs and 1619 * %fsbase in 64-bit x86 processes. 1620 * 1621 * This emulation exists thanks to CRs 6467491 and 6501650. 1622 */ 1623 static int 1624 s10_lwp_create_correct_fs(sysret_t *rval, ucontext_t *ucp, int flags, 1625 id_t *new_lwp) 1626 { 1627 ucontext_t s10_uc; 1628 1629 /* 1630 * Copy the supplied ucontext_t structure to the local stack 1631 * frame and store the new LWP's entry point (the value of %rip 1632 * stored in the ucontext_t) in the new LWP's %r14 register. 1633 * Then make s10_lwp_create_entry_point() the new LWP's entry 1634 * point. 1635 */ 1636 if (brand_uucopy(ucp, &s10_uc, sizeof (s10_uc)) != 0) 1637 return (EFAULT); 1638 1639 s10_uc.uc_mcontext.gregs[REG_R14] = s10_uc.uc_mcontext.gregs[REG_RIP]; 1640 s10_uc.uc_mcontext.gregs[REG_RIP] = (greg_t)s10_lwp_create_entry_point; 1641 1642 /* fix up the signal mask */ 1643 if (s10_uc.uc_flags & UC_SIGMASK) 1644 (void) s10sigset_to_native(&s10_uc.uc_sigmask, 1645 &s10_uc.uc_sigmask); 1646 1647 /* 1648 * Issue SYS_lwp_create to create the new LWP. We pass the 1649 * modified ucontext_t to make sure that the new LWP starts at 1650 * s10_lwp_create_entry_point(). 1651 */ 1652 return (__systemcall(rval, SYS_lwp_create + 1024, &s10_uc, 1653 flags, new_lwp)); 1654 } 1655 #endif /* __amd64 */ 1656 1657 /* 1658 * SYS_lwp_private is issued by libc_init() to set %fsbase in 64-bit x86 1659 * processes. The Nevada kernel sets %fs to zero but the S10 libc expects 1660 * %fs to be nonzero. We'll pass the issued system call to the kernel untouched 1661 * and invoke a brand operation to set %fs to the legacy S10 selector value. 1662 * 1663 * This emulation exists thanks to CRs 6467491 and 6501650. 1664 */ 1665 static int 1666 s10_lwp_private(sysret_t *rval, int cmd, int which, uintptr_t base) 1667 { 1668 #if defined(__amd64) 1669 int err; 1670 1671 /* 1672 * The current LWP's %fs register should be zero. Determine whether the 1673 * Solaris 10 libc with which we're working functions correctly when %fs 1674 * is zero by calling thr_main() after issuing the SYS_lwp_private 1675 * syscall. If thr_main() barfs (returns -1), then change the LWP's %fs 1676 * register via SYS_brand and patch brand_sysent_table so that issuing 1677 * SYS_lwp_create executes s10_lwp_create_correct_fs() rather than the 1678 * default s10_lwp_create(). s10_lwp_create_correct_fs() will 1679 * guarantee that new LWPs will have correct %fs values. 1680 */ 1681 if ((err = __systemcall(rval, SYS_lwp_private + 1024, cmd, which, 1682 base)) != 0) 1683 return (err); 1684 if (thr_main() == -1) { 1685 /* 1686 * SYS_lwp_private is only issued by libc_init(), which is 1687 * executed when libc is first loaded by ld.so.1. Thus we 1688 * are guaranteed to be single-threaded at this point. Even 1689 * if we were multithreaded at this point, writing a 64-bit 1690 * value to the st_callc field of a brand_sysent_table 1691 * entry is guaranteed to be atomic on 64-bit x86 chips 1692 * as long as the field is not split across cache lines 1693 * (It shouldn't be.). See chapter 8, section 1.1 of 1694 * "The Intel 64 and IA32 Architectures Software Developer's 1695 * Manual," Volume 3A for more details. 1696 */ 1697 brand_sysent_table[SYS_lwp_create].st_callc = 1698 (sysent_cb_t)s10_lwp_create_correct_fs; 1699 return (__systemcall(rval, SYS_brand + 1024, 1700 B_S10_FSREGCORRECTION)); 1701 } 1702 return (0); 1703 #else /* !__amd64 */ 1704 return (__systemcall(rval, SYS_lwp_private + 1024, cmd, which, base)); 1705 #endif /* !__amd64 */ 1706 } 1707 #endif /* __x86 */ 1708 1709 /* 1710 * The Opensolaris versions of lwp_mutex_timedlock() and lwp_mutex_trylock() 1711 * add an extra argument to the interfaces, a uintptr_t value for the mutex's 1712 * mutex_owner field. The Solaris 10 libc assigns the mutex_owner field at 1713 * user-level, so we just make the extra argument be zero in both syscalls. 1714 */ 1715 1716 static int 1717 s10_lwp_mutex_timedlock(sysret_t *rval, lwp_mutex_t *lp, timespec_t *tsp) 1718 { 1719 return (__systemcall(rval, SYS_lwp_mutex_timedlock + 1024, lp, tsp, 0)); 1720 } 1721 1722 static int 1723 s10_lwp_mutex_trylock(sysret_t *rval, lwp_mutex_t *lp) 1724 { 1725 return (__systemcall(rval, SYS_lwp_mutex_trylock + 1024, lp, 0)); 1726 } 1727 1728 /* 1729 * If the emul_global_zone flag is set then emulate some aspects of the 1730 * zone system call. In particular, emulate the global zone ID on the 1731 * ZONE_LOOKUP subcommand and emulate some of the global zone attributes 1732 * on the ZONE_GETATTR subcommand. If the flag is not set or we're performing 1733 * some other operation, simply pass the calls through. 1734 */ 1735 int 1736 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3, 1737 void *arg4) 1738 { 1739 char *aval; 1740 int len; 1741 zoneid_t zid; 1742 int attr; 1743 char *buf; 1744 size_t bufsize; 1745 1746 /* 1747 * We only emulate the zone syscall for a subset of specific commands, 1748 * otherwise we just pass the call through. 1749 */ 1750 if (!emul_global_zone) 1751 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, 1752 arg3, arg4)); 1753 1754 switch (cmd) { 1755 case ZONE_LOOKUP: 1756 (void) B_TRUSS_POINT_1(rval, SYS_zone, 0, cmd); 1757 rval->sys_rval1 = GLOBAL_ZONEID; 1758 rval->sys_rval2 = 0; 1759 return (0); 1760 1761 case ZONE_GETATTR: 1762 zid = (zoneid_t)(uintptr_t)arg1; 1763 attr = (int)(uintptr_t)arg2; 1764 buf = (char *)arg3; 1765 bufsize = (size_t)arg4; 1766 1767 /* 1768 * If the request is for the global zone then we're emulating 1769 * that, otherwise pass this thru. 1770 */ 1771 if (zid != GLOBAL_ZONEID) 1772 goto passthru; 1773 1774 switch (attr) { 1775 case ZONE_ATTR_NAME: 1776 aval = GLOBAL_ZONENAME; 1777 break; 1778 1779 case ZONE_ATTR_BRAND: 1780 aval = NATIVE_BRAND_NAME; 1781 break; 1782 default: 1783 /* 1784 * We only emulate a subset of the attrs, use the 1785 * real zone id to pass thru the rest. 1786 */ 1787 arg1 = (void *)(uintptr_t)zoneid; 1788 goto passthru; 1789 } 1790 1791 (void) B_TRUSS_POINT_5(rval, SYS_zone, 0, cmd, zid, attr, 1792 buf, bufsize); 1793 1794 len = strlen(aval) + 1; 1795 if (len > bufsize) 1796 return (ENAMETOOLONG); 1797 1798 if (buf != NULL) { 1799 if (len == 1) { 1800 if (brand_uucopy("\0", buf, 1) != 0) 1801 return (EFAULT); 1802 } else { 1803 if (brand_uucopystr(aval, buf, len) != 0) 1804 return (EFAULT); 1805 1806 /* 1807 * Assure NULL termination of "buf" as 1808 * brand_uucopystr() does NOT. 1809 */ 1810 if (brand_uucopy("\0", buf + (len - 1), 1) != 0) 1811 return (EFAULT); 1812 } 1813 } 1814 1815 rval->sys_rval1 = len; 1816 rval->sys_rval2 = 0; 1817 return (0); 1818 1819 default: 1820 break; 1821 } 1822 1823 passthru: 1824 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3, 1825 arg4)); 1826 } 1827 1828 /*ARGSUSED*/ 1829 int 1830 brand_init(int argc, char *argv[], char *envp[]) 1831 { 1832 sysret_t rval; 1833 ulong_t ldentry; 1834 int err; 1835 char *bname; 1836 1837 brand_pre_init(); 1838 1839 /* 1840 * Cache the pid of the zone's init process and determine if 1841 * we're init(1m) for the zone. Remember: we might be init 1842 * now, but as soon as we fork(2) we won't be. 1843 */ 1844 (void) get_initpid_info(); 1845 1846 /* get the current zoneid */ 1847 err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL); 1848 brand_assert(err == 0); 1849 zoneid = (zoneid_t)rval.sys_rval1; 1850 1851 /* Get the zone's emulation bitmap. */ 1852 if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid, 1853 S10_EMUL_BITMAP, emul_bitmap, sizeof (emul_bitmap))) != 0) { 1854 brand_abort(err, "The zone's patch level is unsupported"); 1855 /*NOTREACHED*/ 1856 } 1857 1858 bname = basename(argv[0]); 1859 1860 /* 1861 * In general we want the S10 commands that are zone-aware to continue 1862 * to behave as they normally do within a zone. Since these commands 1863 * are zone-aware, they should continue to "do the right thing". 1864 * However, some zone-aware commands aren't going to work the way 1865 * we expect them to inside the branded zone. In particular, the pkg 1866 * and patch commands will not properly manage all pkgs/patches 1867 * unless the commands think they are running in the global zone. For 1868 * these commands we want to emulate the global zone. 1869 * 1870 * We don't do any emulation for pkgcond since it is typically used 1871 * in pkg/patch postinstall scripts and we want those scripts to do 1872 * the right thing inside a zone. 1873 * 1874 * One issue is the handling of hollow pkgs. Since the pkgs are 1875 * hollow, they won't use pkgcond in their postinstall scripts. These 1876 * pkgs typically are installing drivers so we handle that by 1877 * replacing add_drv and rem_drv in the s10_boot script. 1878 */ 1879 if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 || 1880 strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0) 1881 emul_global_zone = B_TRUE; 1882 1883 ldentry = brand_post_init(S10_VERSION, argc, argv, envp); 1884 1885 brand_runexe(argv, ldentry); 1886 /*NOTREACHED*/ 1887 brand_abort(0, "brand_runexe() returned"); 1888 return (-1); 1889 } 1890 1891 /* 1892 * This table must have at least NSYSCALL entries in it. 1893 * 1894 * The second parameter of each entry in the brand_sysent_table 1895 * contains the number of parameters and flags that describe the 1896 * syscall return value encoding. See the block comments at the 1897 * top of this file for more information about the syscall return 1898 * value flags and when they should be used. 1899 */ 1900 brand_sysent_table_t brand_sysent_table[] = { 1901 #if defined(__sparc) && !defined(__sparcv9) 1902 EMULATE(brand_indir, 9 | RV_64RVAL), /* 0 */ 1903 #else 1904 NOSYS, /* 0 */ 1905 #endif 1906 NOSYS, /* 1 */ 1907 EMULATE(s10_forkall, 0 | RV_32RVAL2), /* 2 */ 1908 NOSYS, /* 3 */ 1909 NOSYS, /* 4 */ 1910 EMULATE(s10_open, 3 | RV_DEFAULT), /* 5 */ 1911 NOSYS, /* 6 */ 1912 EMULATE(s10_wait, 0 | RV_32RVAL2), /* 7 */ 1913 EMULATE(s10_creat, 2 | RV_DEFAULT), /* 8 */ 1914 EMULATE(s10_link, 2 | RV_DEFAULT), /* 9 */ 1915 EMULATE(s10_unlink, 1 | RV_DEFAULT), /* 10 */ 1916 EMULATE(s10_exec, 2 | RV_DEFAULT), /* 11 */ 1917 NOSYS, /* 12 */ 1918 NOSYS, /* 13 */ 1919 EMULATE(s10_mknod, 3 | RV_DEFAULT), /* 14 */ 1920 EMULATE(s10_chmod, 2 | RV_DEFAULT), /* 15 */ 1921 EMULATE(s10_chown, 3 | RV_DEFAULT), /* 16 */ 1922 NOSYS, /* 17 */ 1923 EMULATE(s10_stat, 2 | RV_DEFAULT), /* 18 */ 1924 NOSYS, /* 19 */ 1925 NOSYS, /* 20 */ 1926 NOSYS, /* 21 */ 1927 EMULATE(s10_umount, 1 | RV_DEFAULT), /* 22 */ 1928 NOSYS, /* 23 */ 1929 NOSYS, /* 24 */ 1930 NOSYS, /* 25 */ 1931 NOSYS, /* 26 */ 1932 NOSYS, /* 27 */ 1933 EMULATE(s10_fstat, 2 | RV_DEFAULT), /* 28 */ 1934 NOSYS, /* 29 */ 1935 EMULATE(s10_utime, 2 | RV_DEFAULT), /* 30 */ 1936 NOSYS, /* 31 */ 1937 NOSYS, /* 32 */ 1938 EMULATE(s10_access, 2 | RV_DEFAULT), /* 33 */ 1939 NOSYS, /* 34 */ 1940 NOSYS, /* 35 */ 1941 NOSYS, /* 36 */ 1942 EMULATE(s10_kill, 2 | RV_DEFAULT), /* 37 */ 1943 NOSYS, /* 38 */ 1944 NOSYS, /* 39 */ 1945 NOSYS, /* 40 */ 1946 EMULATE(s10_dup, 1 | RV_DEFAULT), /* 41 */ 1947 EMULATE(s10_pipe, 0 | RV_32RVAL2), /* 42 */ 1948 NOSYS, /* 43 */ 1949 NOSYS, /* 44 */ 1950 NOSYS, /* 45 */ 1951 NOSYS, /* 46 */ 1952 NOSYS, /* 47 */ 1953 NOSYS, /* 48 */ 1954 NOSYS, /* 49 */ 1955 NOSYS, /* 50 */ 1956 NOSYS, /* 51 */ 1957 NOSYS, /* 52 */ 1958 NOSYS, /* 53 */ 1959 EMULATE(s10_ioctl, 3 | RV_DEFAULT), /* 54 */ 1960 NOSYS, /* 55 */ 1961 NOSYS, /* 56 */ 1962 NOSYS, /* 57 */ 1963 NOSYS, /* 58 */ 1964 EMULATE(s10_execve, 3 | RV_DEFAULT), /* 59 */ 1965 NOSYS, /* 60 */ 1966 NOSYS, /* 61 */ 1967 NOSYS, /* 62 */ 1968 NOSYS, /* 63 */ 1969 NOSYS, /* 64 */ 1970 NOSYS, /* 65 */ 1971 NOSYS, /* 66 */ 1972 NOSYS, /* 67 */ 1973 NOSYS, /* 68 */ 1974 NOSYS, /* 69 */ 1975 NOSYS, /* 70 */ 1976 EMULATE(s10_acctctl, 3 | RV_DEFAULT), /* 71 */ 1977 NOSYS, /* 72 */ 1978 NOSYS, /* 73 */ 1979 NOSYS, /* 74 */ 1980 EMULATE(s10_issetugid, 0 | RV_DEFAULT), /* 75 */ 1981 EMULATE(s10_fsat, 6 | RV_DEFAULT), /* 76 */ 1982 NOSYS, /* 77 */ 1983 NOSYS, /* 78 */ 1984 EMULATE(s10_rmdir, 1 | RV_DEFAULT), /* 79 */ 1985 EMULATE(s10_mkdir, 2 | RV_DEFAULT), /* 80 */ 1986 EMULATE(s10_getdents, 3 | RV_DEFAULT), /* 81 */ 1987 NOSYS, /* 82 */ 1988 NOSYS, /* 83 */ 1989 NOSYS, /* 84 */ 1990 NOSYS, /* 85 */ 1991 NOSYS, /* 86 */ 1992 EMULATE(s10_poll, 3 | RV_DEFAULT), /* 87 */ 1993 EMULATE(s10_lstat, 2 | RV_DEFAULT), /* 88 */ 1994 EMULATE(s10_symlink, 2 | RV_DEFAULT), /* 89 */ 1995 EMULATE(s10_readlink, 3 | RV_DEFAULT), /* 90 */ 1996 NOSYS, /* 91 */ 1997 NOSYS, /* 92 */ 1998 EMULATE(s10_fchmod, 2 | RV_DEFAULT), /* 93 */ 1999 EMULATE(s10_fchown, 3 | RV_DEFAULT), /* 94 */ 2000 EMULATE(s10_sigprocmask, 3 | RV_DEFAULT), /* 95 */ 2001 EMULATE(s10_sigsuspend, 1 | RV_DEFAULT), /* 96 */ 2002 NOSYS, /* 97 */ 2003 EMULATE(s10_sigaction, 3 | RV_DEFAULT), /* 98 */ 2004 EMULATE(s10_sigpending, 2 | RV_DEFAULT), /* 99 */ 2005 NOSYS, /* 100 */ 2006 NOSYS, /* 101 */ 2007 NOSYS, /* 102 */ 2008 NOSYS, /* 103 */ 2009 NOSYS, /* 104 */ 2010 NOSYS, /* 105 */ 2011 NOSYS, /* 106 */ 2012 EMULATE(s10_waitid, 4 | RV_DEFAULT), /* 107 */ 2013 EMULATE(s10_sigsendsys, 2 | RV_DEFAULT), /* 108 */ 2014 NOSYS, /* 109 */ 2015 NOSYS, /* 110 */ 2016 NOSYS, /* 111 */ 2017 NOSYS, /* 112 */ 2018 NOSYS, /* 113 */ 2019 NOSYS, /* 114 */ 2020 NOSYS, /* 115 */ 2021 NOSYS, /* 116 */ 2022 NOSYS, /* 117 */ 2023 NOSYS, /* 118 */ 2024 NOSYS, /* 119 */ 2025 NOSYS, /* 120 */ 2026 NOSYS, /* 121 */ 2027 NOSYS, /* 122 */ 2028 #if defined(__x86) 2029 EMULATE(s10_xstat, 3 | RV_DEFAULT), /* 123 */ 2030 EMULATE(s10_lxstat, 3 | RV_DEFAULT), /* 124 */ 2031 EMULATE(s10_fxstat, 3 | RV_DEFAULT), /* 125 */ 2032 EMULATE(s10_xmknod, 4 | RV_DEFAULT), /* 126 */ 2033 #else 2034 NOSYS, /* 123 */ 2035 NOSYS, /* 124 */ 2036 NOSYS, /* 125 */ 2037 NOSYS, /* 126 */ 2038 #endif 2039 NOSYS, /* 127 */ 2040 NOSYS, /* 128 */ 2041 NOSYS, /* 129 */ 2042 EMULATE(s10_lchown, 3 | RV_DEFAULT), /* 130 */ 2043 NOSYS, /* 131 */ 2044 NOSYS, /* 132 */ 2045 NOSYS, /* 133 */ 2046 EMULATE(s10_rename, 2 | RV_DEFAULT), /* 134 */ 2047 EMULATE(s10_uname, 1 | RV_DEFAULT), /* 135 */ 2048 NOSYS, /* 136 */ 2049 EMULATE(s10_sysconfig, 1 | RV_DEFAULT), /* 137 */ 2050 NOSYS, /* 138 */ 2051 EMULATE(s10_sysinfo, 3 | RV_DEFAULT), /* 139 */ 2052 NOSYS, /* 140 */ 2053 NOSYS, /* 141 */ 2054 NOSYS, /* 142 */ 2055 EMULATE(s10_fork1, 0 | RV_32RVAL2), /* 143 */ 2056 EMULATE(s10_sigtimedwait, 3 | RV_DEFAULT), /* 144 */ 2057 NOSYS, /* 145 */ 2058 NOSYS, /* 146 */ 2059 EMULATE(s10_lwp_sema_wait, 1 | RV_DEFAULT), /* 147 */ 2060 NOSYS, /* 148 */ 2061 NOSYS, /* 149 */ 2062 NOSYS, /* 150 */ 2063 NOSYS, /* 151 */ 2064 NOSYS, /* 152 */ 2065 NOSYS, /* 153 */ 2066 EMULATE(s10_utimes, 2 | RV_DEFAULT), /* 154 */ 2067 NOSYS, /* 155 */ 2068 NOSYS, /* 156 */ 2069 NOSYS, /* 157 */ 2070 NOSYS, /* 158 */ 2071 EMULATE(s10_lwp_create, 3 | RV_DEFAULT), /* 159 */ 2072 NOSYS, /* 160 */ 2073 NOSYS, /* 161 */ 2074 NOSYS, /* 162 */ 2075 EMULATE(s10_lwp_kill, 2 | RV_DEFAULT), /* 163 */ 2076 NOSYS, /* 164 */ 2077 EMULATE(s10_lwp_sigmask, 3 | RV_32RVAL2), /* 165 */ 2078 #if defined(__x86) 2079 EMULATE(s10_lwp_private, 3 | RV_DEFAULT), /* 166 */ 2080 #else 2081 NOSYS, /* 166 */ 2082 #endif 2083 NOSYS, /* 167 */ 2084 NOSYS, /* 168 */ 2085 EMULATE(s10_lwp_mutex_lock, 1 | RV_DEFAULT), /* 169 */ 2086 NOSYS, /* 170 */ 2087 NOSYS, /* 171 */ 2088 NOSYS, /* 172 */ 2089 NOSYS, /* 173 */ 2090 EMULATE(s10_pwrite, 4 | RV_DEFAULT), /* 174 */ 2091 NOSYS, /* 175 */ 2092 NOSYS, /* 176 */ 2093 NOSYS, /* 177 */ 2094 NOSYS, /* 178 */ 2095 NOSYS, /* 179 */ 2096 NOSYS, /* 180 */ 2097 NOSYS, /* 181 */ 2098 NOSYS, /* 182 */ 2099 NOSYS, /* 183 */ 2100 NOSYS, /* 184 */ 2101 EMULATE(s10_acl, 4 | RV_DEFAULT), /* 185 */ 2102 EMULATE(s10_auditsys, 4 | RV_64RVAL), /* 186 */ 2103 NOSYS, /* 187 */ 2104 NOSYS, /* 188 */ 2105 NOSYS, /* 189 */ 2106 EMULATE(s10_sigqueue, 4 | RV_DEFAULT), /* 190 */ 2107 NOSYS, /* 191 */ 2108 NOSYS, /* 192 */ 2109 NOSYS, /* 193 */ 2110 NOSYS, /* 194 */ 2111 NOSYS, /* 195 */ 2112 NOSYS, /* 196 */ 2113 NOSYS, /* 197 */ 2114 NOSYS, /* 198 */ 2115 NOSYS, /* 199 */ 2116 EMULATE(s10_facl, 4 | RV_DEFAULT), /* 200 */ 2117 NOSYS, /* 201 */ 2118 NOSYS, /* 202 */ 2119 NOSYS, /* 203 */ 2120 NOSYS, /* 204 */ 2121 EMULATE(s10_signotify, 3 | RV_DEFAULT), /* 205 */ 2122 NOSYS, /* 206 */ 2123 NOSYS, /* 207 */ 2124 NOSYS, /* 208 */ 2125 NOSYS, /* 209 */ 2126 EMULATE(s10_lwp_mutex_timedlock, 2 | RV_DEFAULT), /* 210 */ 2127 NOSYS, /* 211 */ 2128 NOSYS, /* 212 */ 2129 #if defined(_LP64) 2130 NOSYS, /* 213 */ 2131 #else 2132 EMULATE(s10_getdents64, 3 | RV_DEFAULT), /* 213 */ 2133 #endif 2134 NOSYS, /* 214 */ 2135 #if defined(_LP64) 2136 NOSYS, /* 215 */ 2137 NOSYS, /* 216 */ 2138 NOSYS, /* 217 */ 2139 #else 2140 EMULATE(s10_stat64, 2 | RV_DEFAULT), /* 215 */ 2141 EMULATE(s10_lstat64, 2 | RV_DEFAULT), /* 216 */ 2142 EMULATE(s10_fstat64, 2 | RV_DEFAULT), /* 217 */ 2143 #endif 2144 NOSYS, /* 218 */ 2145 NOSYS, /* 219 */ 2146 NOSYS, /* 220 */ 2147 NOSYS, /* 221 */ 2148 NOSYS, /* 222 */ 2149 #if defined(_LP64) 2150 NOSYS, /* 223 */ 2151 NOSYS, /* 224 */ 2152 NOSYS, /* 225 */ 2153 #else 2154 EMULATE(s10_pwrite64, 5 | RV_DEFAULT), /* 223 */ 2155 EMULATE(s10_creat64, 2 | RV_DEFAULT), /* 224 */ 2156 EMULATE(s10_open64, 3 | RV_DEFAULT), /* 225 */ 2157 #endif 2158 NOSYS, /* 226 */ 2159 EMULATE(s10_zone, 5 | RV_DEFAULT), /* 227 */ 2160 NOSYS, /* 228 */ 2161 NOSYS, /* 229 */ 2162 EMULATE(s10_so_socket, 5 | RV_DEFAULT), /* 230 */ 2163 NOSYS, /* 231 */ 2164 NOSYS, /* 232 */ 2165 NOSYS, /* 233 */ 2166 EMULATE(s10_accept, 4 | RV_DEFAULT), /* 234 */ 2167 NOSYS, /* 235 */ 2168 NOSYS, /* 236 */ 2169 NOSYS, /* 237 */ 2170 NOSYS, /* 238 */ 2171 NOSYS, /* 239 */ 2172 NOSYS, /* 240 */ 2173 NOSYS, /* 241 */ 2174 NOSYS, /* 242 */ 2175 NOSYS, /* 243 */ 2176 NOSYS, /* 244 */ 2177 NOSYS, /* 245 */ 2178 NOSYS, /* 246 */ 2179 NOSYS, /* 247 */ 2180 NOSYS, /* 248 */ 2181 NOSYS, /* 249 */ 2182 NOSYS, /* 250 */ 2183 EMULATE(s10_lwp_mutex_trylock, 1 | RV_DEFAULT), /* 251 */ 2184 NOSYS, /* 252 */ 2185 NOSYS, /* 253 */ 2186 NOSYS, /* 254 */ 2187 NOSYS /* 255 */ 2188 }; 2189