1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
24 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25 */
26
27 #include <sys/errno.h>
28 #include <sys/exec.h>
29 #include <sys/file.h>
30 #include <sys/kmem.h>
31 #include <sys/modctl.h>
32 #include <sys/model.h>
33 #include <sys/proc.h>
34 #include <sys/syscall.h>
35 #include <sys/systm.h>
36 #include <sys/thread.h>
37 #include <sys/cmn_err.h>
38 #include <sys/archsystm.h>
39 #include <sys/pathname.h>
40 #include <sys/sunddi.h>
41
42 #include <sys/machbrand.h>
43 #include <sys/brand.h>
44 #include "s10_brand.h"
45
/*
 * Table of per-system-call flags, indexed by system call number.  _init()
 * allocates NSYSCALL zeroed bytes for it and stores 1 in the slot of every
 * system call the brand wants to interpose on; _fini() hands its address to
 * brand_solaris_fini().
 */
char *s10_emulation_table = NULL;
47
/*
 * Forward declarations for the brand operations defined later in this file.
 * They are needed here because the s10_brops table below takes their
 * addresses before the definitions appear.
 */
void	s10_init_brand_data(zone_t *);
void	s10_free_brand_data(zone_t *);
void	s10_setbrand(proc_t *);
int	s10_getattr(zone_t *, int, void *, size_t *);
int	s10_setattr(zone_t *, int, void *, size_t);
int	s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
		uintptr_t, uintptr_t, uintptr_t);
void	s10_copy_procdata(proc_t *, proc_t *);
void	s10_proc_exit(struct proc *, klwp_t *);
void	s10_exec();
int	s10_initlwp(klwp_t *);
void	s10_forklwp(klwp_t *, klwp_t *);
void	s10_freelwp(klwp_t *);
void	s10_lwpexit(klwp_t *);
int	s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
	long *, int, caddr_t, cred_t *, int);
void	s10_sigset_native_to_s10(sigset_t *);
void	s10_sigset_s10_to_native(sigset_t *);
66
/*
 * s10 brand operations vector.
 *
 * This is a positional initializer, so the entries must stay in the order
 * in which struct brand_ops declares its members (the structure itself is
 * declared outside this file).  All entries except two are s10_* functions
 * defined below: lwp_setrval is an external routine used as-is, and the
 * final entry, S10_NSIG, is a constant rather than a function.
 */
struct brand_ops s10_brops = {
	s10_init_brand_data,
	s10_free_brand_data,
	s10_brandsys,
	s10_setbrand,
	s10_getattr,
	s10_setattr,
	s10_copy_procdata,
	s10_proc_exit,
	s10_exec,
	lwp_setrval,
	s10_initlwp,
	s10_forklwp,
	s10_freelwp,
	s10_lwpexit,
	s10_elfexec,
	s10_sigset_native_to_s10,
	s10_sigset_s10_to_native,
	S10_NSIG,
};
88
/*
 * Machine-specific brand operations: the low-level system call entry
 * callbacks.  The set of callbacks depends on the platform:
 *
 *   sparc:       syscall and syscall32 callbacks only.
 *   amd64:       sysenter, int91, syscall and syscall32 callbacks.
 *   32-bit x86:  sysenter and syscall callbacks; the two slots the amd64
 *                variant fills with the int91 and syscall32 callbacks are
 *                left NULL.
 */
#ifdef sparc

struct brand_mach_ops s10_mops = {
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback
};

#else /* sparc */

#ifdef __amd64

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	s10_brand_int91_callback,
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback
};

#else /* ! __amd64 */

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	NULL,
	s10_brand_syscall_callback,
	NULL
};
#endif /* __amd64 */

#endif /* sparc */
118
/*
 * The brand descriptor itself.  It ties together the brand interface
 * version, the brand name ("solaris10"), the generic operations vector and
 * the machine-specific operations vector defined above.  Its address is
 * what the rest of this file passes to the brand_solaris_*() helpers.
 */
struct brand s10_brand = {
	BRAND_VER_1,
	"solaris10",
	&s10_brops,
	&s10_mops
};
125
/*
 * Module linkage record describing this module as a brand module.
 */
static struct modlbrand modlbrand = {
	&mod_brandops,		/* type of module */
	"Solaris 10 Brand",	/* description of module */
	&s10_brand		/* brand descriptor exported by this module */
};
131
/*
 * Top-level module linkage handed to mod_install() and mod_info(), and to
 * brand_solaris_fini() on unload.  It carries the linkage revision and a
 * NULL-terminated list holding the single modlbrand record above.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlbrand, NULL
};
135
136 void
s10_setbrand(proc_t * p)137 s10_setbrand(proc_t *p)
138 {
139 brand_solaris_setbrand(p, &s10_brand);
140 }
141
142 /*ARGSUSED*/
143 int
s10_getattr(zone_t * zone,int attr,void * buf,size_t * bufsize)144 s10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
145 {
146 ASSERT(zone->zone_brand == &s10_brand);
147 if (attr == S10_EMUL_BITMAP) {
148 if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t))
149 return (EINVAL);
150 if (copyout(((s10_zone_data_t *)zone->zone_brand_data)->
151 emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0)
152 return (EFAULT);
153 return (0);
154 }
155
156 return (EINVAL);
157 }
158
159 int
s10_setattr(zone_t * zone,int attr,void * buf,size_t bufsize)160 s10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
161 {
162 ASSERT(zone->zone_brand == &s10_brand);
163 if (attr == S10_EMUL_BITMAP) {
164 if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t))
165 return (EINVAL);
166 if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)->
167 emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0)
168 return (EFAULT);
169 return (0);
170 }
171
172 return (EINVAL);
173 }
174
175 #ifdef __amd64
176 /*
177 * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's
178 * libc expects %fs to be nonzero. This causes some committed
179 * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several
180 * libraries, including libdoor. This function sets the specified LWP's %fs
181 * register to the legacy S10 selector value (LWPFS_SEL).
182 *
183 * The best solution to the aforementioned problem is backporting CRs
184 * 6467491 to Solaris 10 so that 64-bit x86 Solaris 10 processes
185 * would accept zero for %fs. Backporting the CRs is a requirement for running
186 * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is
187 * nonzero. Such behavior breaks 64-bit processes because Xen has to fetch the
188 * FS segments' base addresses from the LWPs' GDTs, which are only capable of
189 * 32-bit addressing.
190 */
191 /*ARGSUSED*/
192 static void
s10_amd64_correct_fsreg(klwp_t * l)193 s10_amd64_correct_fsreg(klwp_t *l)
194 {
195 if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) {
196 kpreempt_disable();
197 l->lwp_pcb.pcb_fs = LWPFS_SEL;
198 l->lwp_pcb.pcb_rupdate = 1;
199 lwptot(l)->t_post_sys = 1; /* Guarantee update_sregs() */
200 kpreempt_enable();
201 }
202 }
203 #endif /* __amd64 */
204
205 /*
206 * Native processes are started with the native ld.so.1 as the command. This
207 * brand op is invoked by s10_npreload to fix up the command and arguments
208 * so that apps like pgrep or ps see the expected command strings.
209 */
210 int
s10_native(void * cmd,void * args)211 s10_native(void *cmd, void *args)
212 {
213 struct user *up = PTOU(curproc);
214 char cmd_buf[MAXCOMLEN + 1];
215 char arg_buf[PSARGSZ];
216
217 if (copyin(cmd, &cmd_buf, sizeof (cmd_buf)) != 0)
218 return (EFAULT);
219 if (copyin(args, &arg_buf, sizeof (arg_buf)) != 0)
220 return (EFAULT);
221
222 /*
223 * Make sure that the process' interpreter is the native dynamic linker.
224 * Convention dictates that native processes executing within solaris10-
225 * branded zones are interpreted by the native dynamic linker (the
226 * process and its arguments are specified as arguments to the dynamic
227 * linker). If this convention is violated (i.e.,
228 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be
229 * native), then do nothing and silently indicate success.
230 */
231 if (strcmp(up->u_comm, S10_LINKER_NAME) != 0)
232 return (0);
233
234 /*
235 * The sizeof has an extra value for the trailing '\0' so this covers
236 * the appended " " in the following strcmps.
237 */
238 if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ",
239 sizeof (BRAND_NATIVE_LINKER64)) != 0 &&
240 strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ",
241 sizeof (BRAND_NATIVE_LINKER32)) != 0)
242 return (0);
243
244 mutex_enter(&curproc->p_lock);
245 (void) strlcpy(up->u_comm, cmd_buf, sizeof (up->u_comm));
246 (void) strlcpy(up->u_psargs, arg_buf, sizeof (up->u_psargs));
247 mutex_exit(&curproc->p_lock);
248
249 return (0);
250 }
251
252 /*ARGSUSED*/
253 int
s10_brandsys(int cmd,int64_t * rval,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3,uintptr_t arg4,uintptr_t arg5,uintptr_t arg6)254 s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
255 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
256 {
257 proc_t *p = curproc;
258 int res;
259
260 *rval = 0;
261
262 if (cmd == B_S10_NATIVE)
263 return (s10_native((void *)arg1, (void *)arg2));
264
265 res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &s10_brand, S10_VERSION);
266 if (res >= 0)
267 return (res);
268
269 switch ((cmd)) {
270 case B_S10_PIDINFO:
271 /*
272 * The s10 brand needs to be able to get the pid of the
273 * current process and the pid of the zone's init, and it
274 * needs to do this on every process startup. Early in
275 * brand startup, we can't call getpid() because calls to
276 * getpid() represent a magical signal to some old-skool
277 * debuggers. By merging all of this into one call, we
278 * make this quite a bit cheaper and easier to handle in
279 * the brand module.
280 */
281 if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0)
282 return (EFAULT);
283 if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2,
284 sizeof (pid_t)) != 0)
285 return (EFAULT);
286 return (0);
287
288 case B_S10_ISFDXATTRDIR: {
289 /*
290 * This subcommand enables the userland brand emulation library
291 * to determine whether a file descriptor refers to an extended
292 * file attributes directory. There is no standard syscall or
293 * libc function that can make such a determination.
294 */
295 file_t *dir_filep;
296
297 dir_filep = getf((int)arg1);
298 if (dir_filep == NULL)
299 return (EBADF);
300 ASSERT(dir_filep->f_vnode != NULL);
301 *rval = IS_XATTRDIR(dir_filep->f_vnode);
302 releasef((int)arg1);
303 return (0);
304 }
305
306 #ifdef __amd64
307 case B_S10_FSREGCORRECTION:
308 /*
309 * This subcommand exists so that the SYS_lwp_private and
310 * SYS_lwp_create syscalls can manually set the current thread's
311 * %fs register to the legacy S10 selector value for 64-bit x86
312 * processes.
313 */
314 s10_amd64_correct_fsreg(ttolwp(curthread));
315 return (0);
316 #endif /* __amd64 */
317 }
318
319 return (EINVAL);
320 }
321
322 void
s10_copy_procdata(proc_t * child,proc_t * parent)323 s10_copy_procdata(proc_t *child, proc_t *parent)
324 {
325 brand_solaris_copy_procdata(child, parent, &s10_brand);
326 }
327
328 void
s10_proc_exit(struct proc * p,klwp_t * l)329 s10_proc_exit(struct proc *p, klwp_t *l)
330 {
331 brand_solaris_proc_exit(p, l, &s10_brand);
332 }
333
334 void
s10_exec()335 s10_exec()
336 {
337 brand_solaris_exec(&s10_brand);
338 }
339
340 int
s10_initlwp(klwp_t * l)341 s10_initlwp(klwp_t *l)
342 {
343 return (brand_solaris_initlwp(l, &s10_brand));
344 }
345
346 void
s10_forklwp(klwp_t * p,klwp_t * c)347 s10_forklwp(klwp_t *p, klwp_t *c)
348 {
349 brand_solaris_forklwp(p, c, &s10_brand);
350
351 #ifdef __amd64
352 /*
353 * Only correct the child's %fs register if the parent's %fs register
354 * is LWPFS_SEL. If the parent's %fs register is zero, then the Solaris
355 * 10 environment that we're emulating uses a version of libc that
356 * works when %fs is zero (i.e., it contains backports of CRs 6467491
357 * and 6501650).
358 */
359 if (p->lwp_pcb.pcb_fs == LWPFS_SEL)
360 s10_amd64_correct_fsreg(c);
361 #endif /* __amd64 */
362 }
363
364 void
s10_freelwp(klwp_t * l)365 s10_freelwp(klwp_t *l)
366 {
367 brand_solaris_freelwp(l, &s10_brand);
368 }
369
370 void
s10_lwpexit(klwp_t * l)371 s10_lwpexit(klwp_t *l)
372 {
373 brand_solaris_lwpexit(l, &s10_brand);
374 }
375
376 void
s10_free_brand_data(zone_t * zone)377 s10_free_brand_data(zone_t *zone)
378 {
379 kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t));
380 }
381
382 void
s10_init_brand_data(zone_t * zone)383 s10_init_brand_data(zone_t *zone)
384 {
385 ASSERT(zone->zone_brand == &s10_brand);
386 ASSERT(zone->zone_brand_data == NULL);
387 zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP);
388 }
389
390 int
s10_elfexec(vnode_t * vp,execa_t * uap,uarg_t * args,intpdata_t * idatap,int level,long * execsz,int setid,caddr_t exec_file,cred_t * cred,int brand_action)391 s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
392 int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
393 int brand_action)
394 {
395 return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
396 setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME,
397 S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32));
398 }
399
400 void
s10_sigset_native_to_s10(sigset_t * set)401 s10_sigset_native_to_s10(sigset_t *set)
402 {
403 int nativesig;
404 int s10sig;
405 sigset_t s10set;
406
407 /*
408 * Shortcut: we know the first 32 signals are the same in both
409 * s10 and native Solaris. Just assign the first word.
410 */
411 s10set.__sigbits[0] = set->__sigbits[0];
412 s10set.__sigbits[1] = 0;
413 s10set.__sigbits[2] = 0;
414 s10set.__sigbits[3] = 0;
415
416 /*
417 * Copy the remainder of the initial set of common signals.
418 */
419 for (nativesig = 33; nativesig < S10_SIGRTMIN; nativesig++)
420 if (sigismember(set, nativesig))
421 sigaddset(&s10set, nativesig);
422
423 /*
424 * Convert any native RT signals to their S10 values.
425 */
426 for (nativesig = _SIGRTMIN, s10sig = S10_SIGRTMIN;
427 nativesig <= _SIGRTMAX && s10sig <= S10_SIGRTMAX;
428 nativesig++, s10sig++) {
429 if (sigismember(set, nativesig))
430 sigaddset(&s10set, s10sig);
431 }
432
433 *set = s10set;
434 }
435
436 void
s10_sigset_s10_to_native(sigset_t * set)437 s10_sigset_s10_to_native(sigset_t *set)
438 {
439 int s10sig;
440 int nativesig;
441 sigset_t nativeset;
442
443 /*
444 * Shortcut: we know the first 32 signals are the same in both
445 * s10 and native Solaris. Just assign the first word.
446 */
447 nativeset.__sigbits[0] = set->__sigbits[0];
448 nativeset.__sigbits[1] = 0;
449 nativeset.__sigbits[2] = 0;
450 nativeset.__sigbits[3] = 0;
451
452 /*
453 * Copy the remainder of the initial set of common signals.
454 */
455 for (s10sig = 33; s10sig < S10_SIGRTMIN; s10sig++)
456 if (sigismember(set, s10sig))
457 sigaddset(&nativeset, s10sig);
458
459 /*
460 * Convert any S10 RT signals to their native values.
461 */
462 for (s10sig = S10_SIGRTMIN, nativesig = _SIGRTMIN;
463 s10sig <= S10_SIGRTMAX && nativesig <= _SIGRTMAX;
464 s10sig++, nativesig++) {
465 if (sigismember(set, s10sig))
466 sigaddset(&nativeset, nativesig);
467 }
468
469 *set = nativeset;
470 }
471
472 int
_init(void)473 _init(void)
474 {
475 int err;
476
477 /*
478 * Set up the table indicating which system calls we want to
479 * interpose on. We should probably build this automatically from
480 * a list of system calls that is shared with the user-space
481 * library.
482 */
483 s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP);
484 s10_emulation_table[S10_SYS_forkall] = 1; /* 2 */
485 s10_emulation_table[S10_SYS_open] = 1; /* 5 */
486 s10_emulation_table[S10_SYS_wait] = 1; /* 7 */
487 s10_emulation_table[S10_SYS_creat] = 1; /* 8 */
488 s10_emulation_table[S10_SYS_link] = 1; /* 9 */
489 s10_emulation_table[S10_SYS_unlink] = 1; /* 10 */
490 s10_emulation_table[S10_SYS_exec] = 1; /* 11 */
491 s10_emulation_table[S10_SYS_mknod] = 1; /* 14 */
492 s10_emulation_table[S10_SYS_chmod] = 1; /* 15 */
493 s10_emulation_table[S10_SYS_chown] = 1; /* 16 */
494 s10_emulation_table[S10_SYS_stat] = 1; /* 18 */
495 s10_emulation_table[S10_SYS_umount] = 1; /* 22 */
496 s10_emulation_table[S10_SYS_fstat] = 1; /* 28 */
497 s10_emulation_table[S10_SYS_utime] = 1; /* 30 */
498 s10_emulation_table[S10_SYS_access] = 1; /* 33 */
499 s10_emulation_table[SYS_kill] = 1; /* 37 */
500 s10_emulation_table[S10_SYS_dup] = 1; /* 41 */
501 s10_emulation_table[S10_SYS_pipe] = 1; /* 42 */
502 s10_emulation_table[SYS_ioctl] = 1; /* 54 */
503 s10_emulation_table[SYS_execve] = 1; /* 59 */
504 s10_emulation_table[SYS_acctctl] = 1; /* 71 */
505 s10_emulation_table[S10_SYS_issetugid] = 1; /* 75 */
506 s10_emulation_table[S10_SYS_fsat] = 1; /* 76 */
507 s10_emulation_table[S10_SYS_rmdir] = 1; /* 79 */
508 s10_emulation_table[S10_SYS_mkdir] = 1; /* 80 */
509 s10_emulation_table[SYS_getdents] = 1; /* 81 */
510 s10_emulation_table[S10_SYS_poll] = 1; /* 87 */
511 s10_emulation_table[S10_SYS_lstat] = 1; /* 88 */
512 s10_emulation_table[S10_SYS_symlink] = 1; /* 89 */
513 s10_emulation_table[S10_SYS_readlink] = 1; /* 90 */
514 s10_emulation_table[S10_SYS_fchmod] = 1; /* 93 */
515 s10_emulation_table[S10_SYS_fchown] = 1; /* 94 */
516 s10_emulation_table[SYS_sigprocmask] = 1; /* 95 */
517 s10_emulation_table[SYS_sigsuspend] = 1; /* 96 */
518 s10_emulation_table[SYS_sigaction] = 1; /* 98 */
519 s10_emulation_table[SYS_sigpending] = 1; /* 99 */
520 s10_emulation_table[SYS_waitid] = 1; /* 107 */
521 s10_emulation_table[SYS_sigsendsys] = 1; /* 108 */
522 #if defined(__x86)
523 s10_emulation_table[S10_SYS_xstat] = 1; /* 123 */
524 s10_emulation_table[S10_SYS_lxstat] = 1; /* 124 */
525 s10_emulation_table[S10_SYS_fxstat] = 1; /* 125 */
526 s10_emulation_table[S10_SYS_xmknod] = 1; /* 126 */
527 #endif
528 s10_emulation_table[S10_SYS_lchown] = 1; /* 130 */
529 s10_emulation_table[S10_SYS_rename] = 1; /* 134 */
530 s10_emulation_table[SYS_uname] = 1; /* 135 */
531 s10_emulation_table[SYS_sysconfig] = 1; /* 137 */
532 s10_emulation_table[SYS_systeminfo] = 1; /* 139 */
533 s10_emulation_table[S10_SYS_fork1] = 1; /* 143 */
534 s10_emulation_table[SYS_sigtimedwait] = 1; /* 144 */
535 s10_emulation_table[S10_SYS_lwp_sema_wait] = 1; /* 147 */
536 s10_emulation_table[S10_SYS_utimes] = 1; /* 154 */
537 s10_emulation_table[SYS_lwp_create] = 1; /* 159 */
538 s10_emulation_table[SYS_lwp_kill] = 1; /* 163 */
539 s10_emulation_table[SYS_lwp_sigmask] = 1; /* 165 */
540 #if defined(__amd64)
541 s10_emulation_table[SYS_lwp_private] = 1; /* 166 */
542 #endif /* __amd64 */
543 s10_emulation_table[S10_SYS_lwp_mutex_lock] = 1; /* 169 */
544 s10_emulation_table[SYS_pwrite] = 1; /* 174 */
545 s10_emulation_table[SYS_acl] = 1; /* 185 */
546 s10_emulation_table[SYS_auditsys] = 1; /* 186 */
547 s10_emulation_table[SYS_sigqueue] = 1; /* 190 */
548 s10_emulation_table[SYS_facl] = 1; /* 200 */
549 s10_emulation_table[SYS_signotify] = 1; /* 205 */
550 s10_emulation_table[SYS_lwp_mutex_timedlock] = 1; /* 210 */
551 s10_emulation_table[SYS_getdents64] = 1; /* 213 */
552 s10_emulation_table[S10_SYS_stat64] = 1; /* 215 */
553 s10_emulation_table[S10_SYS_lstat64] = 1; /* 216 */
554 s10_emulation_table[S10_SYS_fstat64] = 1; /* 217 */
555 s10_emulation_table[SYS_pwrite64] = 1; /* 223 */
556 s10_emulation_table[S10_SYS_creat64] = 1; /* 224 */
557 s10_emulation_table[S10_SYS_open64] = 1; /* 225 */
558 s10_emulation_table[SYS_zone] = 1; /* 227 */
559 s10_emulation_table[S10_SYS_so_socket] = 1; /* 230 */
560 s10_emulation_table[S10_SYS_accept] = 1; /* 234 */
561 s10_emulation_table[SYS_lwp_mutex_trylock] = 1; /* 251 */
562
563 err = mod_install(&modlinkage);
564 if (err) {
565 cmn_err(CE_WARN, "Couldn't install brand module");
566 kmem_free(s10_emulation_table, NSYSCALL);
567 }
568
569 return (err);
570 }
571
572 int
_info(struct modinfo * modinfop)573 _info(struct modinfo *modinfop)
574 {
575 return (mod_info(&modlinkage, modinfop));
576 }
577
578 int
_fini(void)579 _fini(void)
580 {
581 return (brand_solaris_fini(&s10_emulation_table, &modlinkage,
582 &s10_brand));
583 }
584