xref: /illumos-gate/usr/src/lib/brand/shared/brand/common/brand_util.c (revision 1a2d662a91cee3bf82f41cd47c7ae6f3825d9db2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <unistd.h>
31 #include <sys/auxv.h>
32 #include <sys/bitmap.h>
33 #include <sys/brand.h>
34 #include <sys/inttypes.h>
35 #include <sys/lwp.h>
36 #include <sys/syscall.h>
37 #include <sys/systm.h>
38 #include <sys/utsname.h>
39 #include <fcntl.h>
40 #include <brand_misc.h>
41 #include <sys/brand.h>
42 
43 extern brand_sysent_table_t brand_sysent_table[];
44 
/*
 * Failure details recorded by _brand_abort() immediately before it kills
 * the process.  Nothing in this file reads these back (hence the LINTED
 * annotations); presumably they exist so the details can be inspected
 * post-mortem, e.g. from a core file.
 *
 * Every object is itself volatile-qualified so that the compiler must
 * perform the stores even though the values are never read.  For the two
 * string pointers the qualifier has to follow the '*': writing
 * "volatile const char *" would only qualify the characters pointed to
 * and would leave the pointer variable eligible for dead-store removal.
 */
/*LINTED: static unused*/
static volatile int		brand_abort_err;
/*LINTED: static unused*/
static const char *volatile	brand_abort_msg;
/*LINTED: static unused*/
static const char *volatile	brand_abort_file;
/*LINTED: static unused*/
static volatile int		brand_abort_line;
53 
54 /*
55  * Principles of emulation 101.
56  *
57  *
58  * *** Setting errno
59  *
60  * Just don't do it.  This emulation library is loaded onto a
61  * separate link map from the application whose address space we're
62  * running in.  We have our own private copy of libc; therefore,
63  * the errno value accessible from here is also private and changing
64  * it will not affect any errno value that the process whose address
65  * space we are running in will see.  To return an error condition we
66  * should return the errno value we'd like the system to return.
67  * For more information about this see the comments in brand_misc.h.
68  * Basically, when we return to the caller that initiated the system
69  * call it's their responsibility to set errno.
70  *
71  *
72  * *** Recursion Considerations
73  *
74  * When emulating system calls we need to be very careful about what
75  * library calls we invoke.  Library calls should be kept to a minimum.
76  * One issue is that library calls can invoke system calls, so if we're
77  * emulating a system call and we invoke a library call that depends on
78  * that system call we will probably enter a recursive loop, which would
79  * be bad.
80  *
81  *
82  * *** Return Values.
83  *
84  * See brand_misc.h.
85  *
86  * *** Agent lwp considerations
87  *
88  * It is currently impossible to do any emulation for these system calls
89  * when they are being invoked on behalf of an agent lwp.  To understand why
90  * it's impossible you have to understand how agent lwp syscalls work.
91  *
92  * The agent lwp syscall process works as follows:
93  *   1  The controlling process stops the target.
94  *   2  The controlling process injects an agent lwp which is also stopped.
95  *      This agent lwp assumes the userland stack and register values
96  *      of another stopped lwp in the current process.
97  *   3  The controlling process configures the agent lwp to start
98  *      executing the requested system call.
99  *   4  The controlling process configures /proc to stop the agent lwp when
100  *      it enters the requested system call.
101  *   5  The controlling process allows the agent lwp to start executing.
102  *   6  The agent lwp traps into the kernel to perform the requested system
103  *      call and immediately stops.
104  *   7  The controlling process copies all the arguments for the requested
105  *      system call onto the agent lwp's stack.
106  *   8  The controlling process configures /proc to stop the agent lwp
107  *      when it completes the requested system call.
108  *   9  The controlling process allows the agent lwp to start executing.
109  *  10  The agent lwp executes the system call and then stops before returning
110  *      to userland.
111  *  11  The controlling process copies the return value and return arguments
112  *      back from the agent lwp's stack.
113  *  12  The controlling process destroys the agent lwp and restarts
114  *      the target process.
115  *
116  * The fundamental problem is that when the agent executes the requested
117  * system call in step 5, if we're emulating that system call then the
118  * lwp is redirected back to our emulation layer without blocking
119  * in the kernel.  But our emulation layer can't access the arguments
120  * for the system call because they haven't been copied to the stack
121  * yet and they still only exist in the controlling process's address
122  * space.  This prevents us from being able to do any emulation of
123  * agent lwp system calls.  Hence, currently our brand trap interposition
124  * callback (XXX_brand_syscall_callback_common) will detect if a system
125  * call is being made by an agent lwp, and if this is the case it will
126  * never redirect the system call to this emulation library.
127  *
128  * In the future, if this proves to be a problem the easiest solution
129  * would probably be to replace the branded versions of these applications
130  * with their native counterparts.  I.e., truss, plimit, and pfiles could be
131  * replaced with wrapper scripts that execute the native versions of these
132  * applications.  In the case of plimit and pfiles this should be pretty
133  * straightforward.  Truss would probably be more tricky since it can
134  * execute applications which would be branded applications, so in that
135  * case it might be necessary to create a loadable library which could
136  * be LD_PRELOADed into truss and this library would interpose on the
137  * exec() system call to allow truss to correctly execute branded
138  * processes.  It should be pointed out that this solution could work
139  * because "native agent lwps" (ie, agent lwps created by native
140  * processes) can be treated differently from "branded agent lwps" (ie,
141  * agent lwps created by branded processes), since native agent lwps
142  * would presumably be making native system calls and hence not need
143  * any interposition.
144  *
145  * *** General considerations
146  *
147  * One of the differences between the lx brand and the s10
148  * brand, is that the s10 brand only interposes on syscalls
149  * that need some kind of emulation, whereas the lx brand interposes
150  * on _all_ system calls.  Lx branded system calls that don't need
151  * any emulation are then redirected back to the kernel from the
152  * userland library via the IN_KERNEL_SYSCALL macro.  The lx-syscall
153  * dtrace provider depends on this behavior.
154  *
155  */
156 
157 /*ARGSUSED*/
158 void
_brand_abort(int err,const char * msg,const char * file,int line)159 _brand_abort(int err, const char *msg, const char *file, int line)
160 {
161 	sysret_t rval;
162 
163 	/* Save the error message into convenient globals */
164 	brand_abort_err = err;
165 	brand_abort_msg = msg;
166 	brand_abort_file = file;
167 	brand_abort_line = line;
168 
169 	/* kill ourselves */
170 	abort();
171 
172 	/* If abort() didn't work, try something stronger. */
173 	(void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL);
174 }
175 
176 int
brand_uucopy(const void * from,void * to,size_t size)177 brand_uucopy(const void *from, void *to, size_t size)
178 {
179 	sysret_t rval;
180 
181 	if (__systemcall(&rval, SYS_uucopy + 1024, from, to, size) != 0)
182 		return (EFAULT);
183 	return (0);
184 }
185 
186 /*
187  * ATTENTION: uucopystr() does NOT ensure that string are null terminated!
188  */
189 int
brand_uucopystr(const void * from,void * to,size_t size)190 brand_uucopystr(const void *from, void *to, size_t size)
191 {
192 	sysret_t rval;
193 
194 	if (__systemcall(&rval, SYS_uucopystr + 1024, from, to, size) != 0)
195 		return (EFAULT);
196 	return (0);
197 }
198 
199 /*
200  * This function is defined to be NOSYS but it won't be called from the
201  * the kernel since the NOSYS system calls are not enabled in the kernel.
202  * Thus, the only time this function is called is directly from within the
203  * indirect system call path.
204  */
205 /*ARGSUSED*/
206 long
brand_unimpl(sysret_t * rv,uintptr_t p1)207 brand_unimpl(sysret_t *rv, uintptr_t p1)
208 {
209 	sysret_t rval;
210 
211 	/*
212 	 * We'd like to print out some kind of error message here like
213 	 * "unsupported syscall", but we can't because it's not safe to
214 	 * assume that stderr or STDERR_FILENO actually points to something
215 	 * that is a terminal, and if we wrote to those files we could
216 	 * inadvertantly write to some applications open files, which would
217 	 * be bad.
218 	 *
219 	 * Normally, if an application calls an invalid system call
220 	 * it get a SIGSYS sent to it.  So we'll just go ahead and send
221 	 * ourselves a signal here.  Note that this is far from ideal since
222 	 * if the application has registered a signal handler, that signal
223 	 * handler may recieve a ucontext_t as the third parameter to
224 	 * indicate the context of the process when the signal was
225 	 * generated, and in this case that context will not be what the
226 	 * application is expecting.  Hence, we should probably create a
227 	 * brandsys() kernel function that can deliver the signal to us
228 	 * with the correct ucontext_t.
229 	 */
230 	(void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS);
231 	return (ENOSYS);
232 }
233 
#if defined(__sparc) && !defined(__sparcv9)
/*
 * Yuck.  For 32-bit sparc applications, handle indirect system calls.
 *
 * rv is passed through to the emulation routine, code selects the entry
 * in brand_sysent_table, and a0..a7 are the system call arguments.  The
 * interface is declared with the maximum number of system call
 * arguments; if the selected system call takes fewer, the surplus
 * arguments are garbage, but they are never passed on, so that is
 * harmless.
 *
 * Returns whatever the selected emulation routine returns.  If the table
 * entry claims an unsupported argument count we abort; EINVAL is
 * returned only if brand_abort() were to come back.
 */
long
brand_indir(sysret_t *rv, int code,
    uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4,
    uintptr_t a5, uintptr_t a6, uintptr_t a7)
{
	brand_sysent_table_t *sst;

	/*
	 * code is a signed int, so both ends of the range must be checked,
	 * and the check must happen before code is used to form a pointer
	 * into the table.
	 */
	brand_assert(code >= 0 && code < NSYSCALL);
	sst = &(brand_sysent_table[code]);

	/* Dispatch with exactly as many arguments as the entry declares. */
	switch (sst->st_args & NARGS_MASK) {
	case 0:
		return ((sst->st_callc)(rv));
	case 1:
		return ((sst->st_callc)(rv, a0));
	case 2:
		return ((sst->st_callc)(rv, a0, a1));
	case 3:
		return ((sst->st_callc)(rv, a0, a1, a2));
	case 4:
		return ((sst->st_callc)(rv, a0, a1, a2, a3));
	case 5:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4));
	case 6:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5));
	case 7:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6));
	case 8:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7));
	default:
		/* Unsupported argument count; handled below. */
		break;
	}
	brand_abort(0, "invalid entry in brand_sysent_table");
	return (EINVAL);
}
#endif /* __sparc && !__sparcv9 */
274 
275 /*
276  * Close a libc file handle, but don't actually close the underlying
277  * file descriptor.
278  */
279 static void
brand_close_fh(FILE * file)280 brand_close_fh(FILE *file)
281 {
282 	int fd, fd_new;
283 
284 	if (file == NULL)
285 		return;
286 
287 	if ((fd = fileno(file)) < 0)
288 		return;
289 
290 	/*
291 	 * We're a branded process but our handler isn't installed yet.  We
292 	 * can't use the dup() syscall since it no longer exists. Unfortunately
293 	 * in an S10 branded zone, this will all pick up the old libc and not
294 	 * use emulation, which makes it fairly impossible to use safely.
295 	 */
296 	fd_new = fcntl(fd, F_DUPFD, 0);
297 	if (fd_new == -1)
298 		return;
299 
300 	(void) fclose(file);
301 	(void) dup2(fd_new, fd);
302 	(void) close(fd_new);
303 }
304 
305 /*ARGSUSED*/
306 void
brand_pre_init()307 brand_pre_init()
308 {
309 	int			i;
310 
311 	/* Sanity check our translation table return value codes */
312 	for (i = 0; i < NSYSCALL; i++) {
313 		brand_sysent_table_t *est = &(brand_sysent_table[i]);
314 		brand_assert(BIT_ONLYONESET(est->st_args & RV_MASK));
315 	}
316 
317 	/*
318 	 * We need to shutdown all libc stdio.  libc stdio normally goes to
319 	 * file descriptors, but since we're actually part of a another
320 	 * process we don't own these file descriptors and we can't make
321 	 * any assumptions about their state.
322 	 */
323 	brand_close_fh(stdin);
324 	brand_close_fh(stdout);
325 	brand_close_fh(stderr);
326 }
327 
328 /*ARGSUSED*/
329 ulong_t
brand_post_init(int version,int argc,char * argv[],char * envp[])330 brand_post_init(int version, int argc, char *argv[], char *envp[])
331 {
332 	sysret_t		rval;
333 	brand_proc_reg_t	reg;
334 	brand_elf_data_t	sed;
335 	auxv_t			*ap;
336 	uintptr_t		*p;
337 	int			err;
338 
339 	/*
340 	 * Register our syscall emulation table with the kernel.
341 	 * Note that we don't have to do invoke (syscall_number + 1024)
342 	 * until we've actually establised a syscall emulation callback
343 	 * handler address, which is what we're doing with this brand
344 	 * syscall.
345 	 */
346 	reg.sbr_version = version;
347 #ifdef	__x86
348 	reg.sbr_handler = (caddr_t)brand_handler_table;
349 #else	/* !__x86 */
350 	reg.sbr_handler = (caddr_t)brand_handler;
351 #endif	/* !__x86 */
352 
353 	if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, &reg)) != 0) {
354 		brand_abort(err, "Failed to brand current process");
355 
356 		/*NOTREACHED*/
357 	}
358 
359 	/* Get data about the executable we're running from the kernel. */
360 	if ((err = __systemcall(&rval, SYS_brand + 1024,
361 	    B_ELFDATA, (void *)&sed)) != 0) {
362 		brand_abort(err,
363 		    "Failed to get required brand ELF data from the kernel");
364 		/*NOTREACHED*/
365 	}
366 
367 	/*
368 	 * Find the aux vector on the stack.
369 	 */
370 	p = (uintptr_t *)envp;
371 	while (*p != 0)
372 		p++;
373 
374 	/*
375 	 * p is now pointing at the 0 word after the environ pointers.
376 	 * After that is the aux vectors.
377 	 *
378 	 * The aux vectors are currently pointing to the brand emulation
379 	 * library and associated linker.  We're going to change them to
380 	 * point to the brand executable and associated linker (or to no
381 	 * linker for static binaries).  This matches the process data
382 	 * stored within the kernel and visible from /proc, which was
383 	 * all setup in sn1_elfexec().  We do this so that when a debugger
384 	 * attaches to the process it sees the process as a normal solaris
385 	 * process, this brand emulation library and everything on it's
386 	 * link map will not be visible, unless our librtld_db plugin
387 	 * is used.  Note that this is very different from how Linux
388 	 * branded processes are implemented within lx branded zones.
389 	 * In that situation, the primary linkmap of the process is the
390 	 * brand emulation libraries linkmap, not the Linux applications
391 	 * linkmap.
392 	 *
393 	 * We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS
394 	 * aux vector.  This flag told our linker that we don't have a
395 	 * primary link map.  Now that our linker is done initializing, we
396 	 * want to clear this flag before we transfer control to the
397 	 * applications copy of the linker, since we want that linker to have
398 	 * a primary link map which will be the link map for the application
399 	 * we're running.
400 	 */
401 	p++;
402 	for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) {
403 		switch (ap->a_type) {
404 			case AT_BASE:
405 				/* Hide AT_BASE if static binary */
406 				if (sed.sed_base == 0) {
407 					ap->a_type = AT_IGNORE;
408 					ap->a_un.a_val = 0;
409 				} else {
410 					ap->a_un.a_val = sed.sed_base;
411 				}
412 				break;
413 			case AT_ENTRY:
414 				ap->a_un.a_val = sed.sed_entry;
415 				break;
416 			case AT_PHDR:
417 				ap->a_un.a_val = sed.sed_phdr;
418 				break;
419 			case AT_PHENT:
420 				ap->a_un.a_val = sed.sed_phent;
421 				break;
422 			case AT_PHNUM:
423 				ap->a_un.a_val = sed.sed_phnum;
424 				break;
425 			case AT_SUN_AUXFLAGS:
426 				ap->a_un.a_val &= ~AF_SUN_NOPLM;
427 				break;
428 			case AT_SUN_EMULATOR:
429 				/*
430 				 * ld.so.1 inspects AT_SUN_EMULATOR to see if
431 				 * if it is the linker for the brand emulation
432 				 * library.  Hide AT_SUN_EMULATOR, as the
433 				 * linker we are about to jump to is the linker
434 				 * for the binary.
435 				 */
436 				ap->a_type = AT_IGNORE;
437 				ap->a_un.a_val = 0;
438 				break;
439 			case AT_SUN_LDDATA:
440 				/* Hide AT_SUN_LDDATA if static binary */
441 				if (sed.sed_lddata == 0) {
442 					ap->a_type = AT_IGNORE;
443 					ap->a_un.a_val = 0;
444 				} else {
445 					ap->a_un.a_val = sed.sed_lddata;
446 				}
447 				break;
448 			default:
449 				break;
450 		}
451 	}
452 
453 	return (sed.sed_ldentry);
454 }
455