1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <unistd.h>
31 #include <sys/auxv.h>
32 #include <sys/bitmap.h>
33 #include <sys/brand.h>
34 #include <sys/inttypes.h>
35 #include <sys/lwp.h>
36 #include <sys/syscall.h>
37 #include <sys/systm.h>
38 #include <sys/utsname.h>
39 #include <fcntl.h>
40 #include <brand_misc.h>
41 #include <sys/brand.h>
42
43 extern brand_sysent_table_t brand_sysent_table[];
44
/*
 * File-scope record of the most recent abort request.  _brand_abort()
 * stores its four arguments here before it kills the process.  Nothing in
 * the visible code reads these back, so each declaration carries its own
 * lint directive to silence the "static unused" complaint.
 */
/*LINTED: static unused*/
static volatile const char *brand_abort_file;
/*LINTED: static unused*/
static volatile int brand_abort_line;
/*LINTED: static unused*/
static volatile const char *brand_abort_msg;
/*LINTED: static unused*/
static volatile int brand_abort_err;
53
54 /*
55 * Principles of emulation 101.
56 *
57 *
58 * *** Setting errno
59 *
 * Just don't do it.  This emulation library is loaded onto a
 * separate link map from the application whose address space we're
 * running in.  We have our own private copy of libc, so therefore,
 * the errno value accessible from here is also private and changing
 * it will not affect any errno value that the process whose address
 * space we are running in will see.  To return an error condition we
 * should return the errno value we'd like the system to return.
 * For more information about this see the comments in brand_misc.h.
 * Basically, when we return to the caller that initiated the system
 * call it's their responsibility to set errno.
70 *
71 *
72 * *** Recursion Considerations
73 *
74 * When emulating system calls we need to be very careful about what
75 * library calls we invoke. Library calls should be kept to a minimum.
76 * One issue is that library calls can invoke system calls, so if we're
77 * emulating a system call and we invoke a library call that depends on
78 * that system call we will probably enter a recursive loop, which would
79 * be bad.
80 *
81 *
82 * *** Return Values.
83 *
84 * See brand_misc.h.
85 *
86 * *** Agent lwp considerations
87 *
 * It is currently impossible to do any emulation for these system calls
 * when they are being invoked on behalf of an agent lwp.  To understand why
 * it's impossible you have to understand how agent lwp syscalls work.
91 *
 * The agent lwp syscall process works as follows:
 *   1  The controlling process stops the target.
 *   2  The controlling process injects an agent lwp which is also stopped.
 *      This agent lwp assumes the userland stack and register values
 *      of another stopped lwp in the current process.
 *   3  The controlling process configures the agent lwp to start
 *      executing the requested system call.
 *   4  The controlling process configures /proc to stop the agent lwp when
 *      it enters the requested system call.
 *   5  The controlling process allows the agent lwp to start executing.
 *   6  The agent lwp traps into the kernel to perform the requested system
 *      call and immediately stops.
 *   7  The controlling process copies all the arguments for the requested
 *      system call onto the agent lwp's stack.
 *   8  The controlling process configures /proc to stop the agent lwp
 *      when it completes the requested system call.
 *   9  The controlling process allows the agent lwp to start executing.
 *  10  The agent lwp executes the system call and then stops before
 *      returning to userland.
 *  11  The controlling process copies the return value and return arguments
 *      back from the agent lwp's stack.
 *  12  The controlling process destroys the agent lwp and restarts
 *      the target process.
115 *
 * The fundamental problem is that when the agent executes the requested
 * system call in step 5, if we're emulating that system call then the
 * lwp is redirected back to our emulation layer without blocking
 * in the kernel.  But our emulation layer can't access the arguments
 * for the system call because they haven't been copied to the stack
 * yet and they still only exist in the controlling process's address
 * space.  This prevents us from being able to do any emulation of
 * agent lwp system calls.  Hence, currently our brand trap interposition
 * callback (XXX_brand_syscall_callback_common) will detect if a system
 * call is being made by an agent lwp, and if this is the case it will
 * never redirect the system call to this emulation library.
127 *
 * In the future, if this proves to be a problem the easiest solution
 * would probably be to replace the branded versions of these applications
 * with their native counterparts.  Ie, truss, plimit, and pfiles could be
 * replaced with wrapper scripts that execute the native versions of these
 * applications.  In the case of plimit and pfiles this should be pretty
 * straightforward.  Truss would probably be more tricky since it can
 * execute applications which would be branded applications, so in that
 * case it might be necessary to create a loadable library which could
 * be LD_PRELOADed into truss and this library would interpose on the
 * exec() system call to allow truss to correctly execute branded
 * processes.  It should be pointed out that this solution could work
 * because "native agent lwps" (ie, agent lwps created by native
 * processes) can be treated differently from "branded agent lwps" (ie,
 * agent lwps created by branded processes), since native agent lwps
 * would presumably be making native system calls and hence not need
 * any interposition.
144 *
145 * *** General considerations
146 *
147 * One of the differences between the lx brand and the s10
148 * brand, is that the s10 brand only interposes on syscalls
149 * that need some kind of emulation, whereas the lx brand interposes
150 * on _all_ system calls. Lx branded system calls that don't need
151 * any emulation are then redirected back to the kernel from the
152 * userland library via the IN_KERNEL_SYSCALL macro. The lx-syscall
153 * dtrace provider depends on this behavior.
154 *
155 */
156
157 /*ARGSUSED*/
158 void
_brand_abort(int err,const char * msg,const char * file,int line)159 _brand_abort(int err, const char *msg, const char *file, int line)
160 {
161 sysret_t rval;
162
163 /* Save the error message into convenient globals */
164 brand_abort_err = err;
165 brand_abort_msg = msg;
166 brand_abort_file = file;
167 brand_abort_line = line;
168
169 /* kill ourselves */
170 abort();
171
172 /* If abort() didn't work, try something stronger. */
173 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL);
174 }
175
176 int
brand_uucopy(const void * from,void * to,size_t size)177 brand_uucopy(const void *from, void *to, size_t size)
178 {
179 sysret_t rval;
180
181 if (__systemcall(&rval, SYS_uucopy + 1024, from, to, size) != 0)
182 return (EFAULT);
183 return (0);
184 }
185
186 /*
187 * ATTENTION: uucopystr() does NOT ensure that string are null terminated!
188 */
189 int
brand_uucopystr(const void * from,void * to,size_t size)190 brand_uucopystr(const void *from, void *to, size_t size)
191 {
192 sysret_t rval;
193
194 if (__systemcall(&rval, SYS_uucopystr + 1024, from, to, size) != 0)
195 return (EFAULT);
196 return (0);
197 }
198
199 /*
200 * This function is defined to be NOSYS but it won't be called from the
201 * the kernel since the NOSYS system calls are not enabled in the kernel.
202 * Thus, the only time this function is called is directly from within the
203 * indirect system call path.
204 */
205 /*ARGSUSED*/
206 long
brand_unimpl(sysret_t * rv,uintptr_t p1)207 brand_unimpl(sysret_t *rv, uintptr_t p1)
208 {
209 sysret_t rval;
210
211 /*
212 * We'd like to print out some kind of error message here like
213 * "unsupported syscall", but we can't because it's not safe to
214 * assume that stderr or STDERR_FILENO actually points to something
215 * that is a terminal, and if we wrote to those files we could
216 * inadvertantly write to some applications open files, which would
217 * be bad.
218 *
219 * Normally, if an application calls an invalid system call
220 * it get a SIGSYS sent to it. So we'll just go ahead and send
221 * ourselves a signal here. Note that this is far from ideal since
222 * if the application has registered a signal handler, that signal
223 * handler may recieve a ucontext_t as the third parameter to
224 * indicate the context of the process when the signal was
225 * generated, and in this case that context will not be what the
226 * application is expecting. Hence, we should probably create a
227 * brandsys() kernel function that can deliver the signal to us
228 * with the correct ucontext_t.
229 */
230 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS);
231 return (ENOSYS);
232 }
233
234 #if defined(__sparc) && !defined(__sparcv9)
235 /*
236 * Yuck. For 32-bit sparc applications, handle indirect system calls.
237 * Note that we declare this interface to use the maximum number of
238 * system call arguments. If we recieve a system call that uses less
239 * arguments, then the additional arguments will be garbage, but they
240 * will also be ignored so that should be ok.
241 */
242 long
brand_indir(sysret_t * rv,int code,uintptr_t a0,uintptr_t a1,uintptr_t a2,uintptr_t a3,uintptr_t a4,uintptr_t a5,uintptr_t a6,uintptr_t a7)243 brand_indir(sysret_t *rv, int code,
244 uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4,
245 uintptr_t a5, uintptr_t a6, uintptr_t a7)
246 {
247 brand_sysent_table_t *sst = &(brand_sysent_table[code]);
248
249 brand_assert(code < NSYSCALL);
250 switch (sst->st_args & NARGS_MASK) {
251 case 0:
252 return ((sst->st_callc)(rv));
253 case 1:
254 return ((sst->st_callc)(rv, a0));
255 case 2:
256 return ((sst->st_callc)(rv, a0, a1));
257 case 3:
258 return ((sst->st_callc)(rv, a0, a1, a2));
259 case 4:
260 return ((sst->st_callc)(rv, a0, a1, a2, a3));
261 case 5:
262 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4));
263 case 6:
264 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5));
265 case 7:
266 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6));
267 case 8:
268 return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7));
269 }
270 brand_abort(0, "invalid entry in brand_sysent_table");
271 return (EINVAL);
272 }
273 #endif /* __sparc && !__sparcv9 */
274
/*
 * Tear down a libc stdio stream while leaving the file descriptor it was
 * using open.
 *
 * A NULL stream, a stream for which fileno() fails, or a failure to
 * duplicate the descriptor all cause a silent return with nothing
 * changed.  Otherwise the descriptor is duplicated, the stream is
 * fclose()d (which closes the original descriptor), and the duplicate is
 * moved back onto the original descriptor number.
 */
static void
brand_close_fh(FILE *file)
{
	int	orig_fd;
	int	spare_fd;

	if (file == NULL || (orig_fd = fileno(file)) < 0)
		return;

	/*
	 * We're a branded process but our handler isn't installed yet.
	 * The dup() syscall no longer exists, so fcntl(F_DUPFD) is used
	 * to obtain the temporary copy instead.
	 */
	spare_fd = fcntl(orig_fd, F_DUPFD, 0);
	if (spare_fd != -1) {
		(void) fclose(file);
		(void) dup2(spare_fd, orig_fd);
		(void) close(spare_fd);
	}
}
302
303 /*ARGSUSED*/
304 void
brand_pre_init()305 brand_pre_init()
306 {
307 int i;
308
309 /* Sanity check our translation table return value codes */
310 for (i = 0; i < NSYSCALL; i++) {
311 brand_sysent_table_t *est = &(brand_sysent_table[i]);
312 brand_assert(BIT_ONLYONESET(est->st_args & RV_MASK));
313 }
314
315 /*
316 * We need to shutdown all libc stdio. libc stdio normally goes to
317 * file descriptors, but since we're actually part of a another
318 * process we don't own these file descriptors and we can't make
319 * any assumptions about their state.
320 */
321 brand_close_fh(stdin);
322 brand_close_fh(stdout);
323 brand_close_fh(stderr);
324 }
325
326 /*ARGSUSED*/
327 ulong_t
brand_post_init(int version,int argc,char * argv[],char * envp[])328 brand_post_init(int version, int argc, char *argv[], char *envp[])
329 {
330 sysret_t rval;
331 brand_proc_reg_t reg;
332 brand_elf_data_t sed;
333 auxv_t *ap;
334 uintptr_t *p;
335 int err;
336
337 /*
338 * Register our syscall emulation table with the kernel.
339 * Note that we don't have to do invoke (syscall_number + 1024)
340 * until we've actually establised a syscall emulation callback
341 * handler address, which is what we're doing with this brand
342 * syscall.
343 */
344 reg.sbr_version = version;
345 #ifdef __x86
346 reg.sbr_handler = (caddr_t)brand_handler_table;
347 #else /* !__x86 */
348 reg.sbr_handler = (caddr_t)brand_handler;
349 #endif /* !__x86 */
350
351 if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, ®)) != 0) {
352 brand_abort(err, "Failed to brand current process");
353
354 /*NOTREACHED*/
355 }
356
357 /* Get data about the executable we're running from the kernel. */
358 if ((err = __systemcall(&rval, SYS_brand + 1024,
359 B_ELFDATA, (void *)&sed)) != 0) {
360 brand_abort(err,
361 "Failed to get required brand ELF data from the kernel");
362 /*NOTREACHED*/
363 }
364
365 /*
366 * Find the aux vector on the stack.
367 */
368 p = (uintptr_t *)envp;
369 while (*p != NULL)
370 p++;
371
372 /*
373 * p is now pointing at the 0 word after the environ pointers.
374 * After that is the aux vectors.
375 *
376 * The aux vectors are currently pointing to the brand emulation
377 * library and associated linker. We're going to change them to
378 * point to the brand executable and associated linker (or to no
379 * linker for static binaries). This matches the process data
380 * stored within the kernel and visible from /proc, which was
381 * all setup in sn1_elfexec(). We do this so that when a debugger
382 * attaches to the process it sees the process as a normal solaris
383 * process, this brand emulation library and everything on it's
384 * link map will not be visible, unless our librtld_db plugin
385 * is used. Note that this is very different from how Linux
386 * branded processes are implemented within lx branded zones.
387 * In that situation, the primary linkmap of the process is the
388 * brand emulation libraries linkmap, not the Linux applications
389 * linkmap.
390 *
391 * We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS
392 * aux vector. This flag told our linker that we don't have a
393 * primary link map. Now that our linker is done initializing, we
394 * want to clear this flag before we transfer control to the
395 * applications copy of the linker, since we want that linker to have
396 * a primary link map which will be the link map for the application
397 * we're running.
398 */
399 p++;
400 for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) {
401 switch (ap->a_type) {
402 case AT_BASE:
403 /* Hide AT_BASE if static binary */
404 if (sed.sed_base == NULL) {
405 ap->a_type = AT_IGNORE;
406 ap->a_un.a_val = NULL;
407 } else {
408 ap->a_un.a_val = sed.sed_base;
409 }
410 break;
411 case AT_ENTRY:
412 ap->a_un.a_val = sed.sed_entry;
413 break;
414 case AT_PHDR:
415 ap->a_un.a_val = sed.sed_phdr;
416 break;
417 case AT_PHENT:
418 ap->a_un.a_val = sed.sed_phent;
419 break;
420 case AT_PHNUM:
421 ap->a_un.a_val = sed.sed_phnum;
422 break;
423 case AT_SUN_AUXFLAGS:
424 ap->a_un.a_val &= ~AF_SUN_NOPLM;
425 break;
426 case AT_SUN_EMULATOR:
427 /*
428 * ld.so.1 inspects AT_SUN_EMULATOR to see if
429 * if it is the linker for the brand emulation
430 * library. Hide AT_SUN_EMULATOR, as the
431 * linker we are about to jump to is the linker
432 * for the binary.
433 */
434 ap->a_type = AT_IGNORE;
435 ap->a_un.a_val = NULL;
436 break;
437 case AT_SUN_LDDATA:
438 /* Hide AT_SUN_LDDATA if static binary */
439 if (sed.sed_lddata == NULL) {
440 ap->a_type = AT_IGNORE;
441 ap->a_un.a_val = NULL;
442 } else {
443 ap->a_un.a_val = sed.sed_lddata;
444 }
445 break;
446 default:
447 break;
448 }
449 }
450
451 return (sed.sed_ldentry);
452 }
453