xref: /illumos-gate/usr/src/uts/common/exec/elf/elf.c (revision 14b24e2b79293068c8e016a69ef1d872fb5e2fd5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 /*
29  * Copyright 2016 Joyent, Inc.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/thread.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/vnode.h>
41 #include <sys/mman.h>
42 #include <sys/kmem.h>
43 #include <sys/proc.h>
44 #include <sys/pathname.h>
45 #include <sys/policy.h>
46 #include <sys/cmn_err.h>
47 #include <sys/systm.h>
48 #include <sys/elf.h>
49 #include <sys/vmsystm.h>
50 #include <sys/debug.h>
51 #include <sys/auxv.h>
52 #include <sys/exec.h>
53 #include <sys/prsystm.h>
54 #include <vm/as.h>
55 #include <vm/rm.h>
56 #include <vm/seg.h>
57 #include <vm/seg_vn.h>
58 #include <sys/modctl.h>
59 #include <sys/systeminfo.h>
60 #include <sys/vmparam.h>
61 #include <sys/machelf.h>
62 #include <sys/shm_impl.h>
63 #include <sys/archsystm.h>
64 #include <sys/fasttrap.h>
65 #include <sys/brand.h>
66 #include "elf_impl.h"
67 #include <sys/sdt.h>
68 #include <sys/siginfo.h>
69 #include <sys/random.h>
70 
71 #if defined(__x86)
72 #include <sys/comm_page_util.h>
73 #endif /* defined(__x86) */
74 
75 
76 extern int at_flags;
77 extern volatile size_t aslr_max_brk_skew;
78 
79 #define	ORIGIN_STR	"ORIGIN"
80 #define	ORIGIN_STR_SIZE	6
81 
82 static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
83 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
84     ssize_t *);
85 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
86     ssize_t *, caddr_t *, ssize_t *);
87 static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
88 static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
89     Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
90     caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
91 
92 typedef enum {
93 	STR_CTF,
94 	STR_SYMTAB,
95 	STR_DYNSYM,
96 	STR_STRTAB,
97 	STR_DYNSTR,
98 	STR_SHSTRTAB,
99 	STR_NUM
100 } shstrtype_t;
101 
102 static const char *shstrtab_data[] = {
103 	".SUNW_ctf",
104 	".symtab",
105 	".dynsym",
106 	".strtab",
107 	".dynstr",
108 	".shstrtab"
109 };
110 
111 typedef struct shstrtab {
112 	int	sst_ndx[STR_NUM];
113 	int	sst_cur;
114 } shstrtab_t;
115 
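/*
 * Helpers for building the section header string table written to core
 * files: shstrtab_ndx() lazily assigns an offset to each section name the
 * first time it is requested, and shstrtab_dump() writes the names out at
 * their recorded offsets.
 */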
116 static void
117 shstrtab_init(shstrtab_t *s)
118 {
119 	bzero(&s->sst_ndx, sizeof (s->sst_ndx));
120 	s->sst_cur = 1;
121 }
122 
123 static int
124 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
125 {
126 	int ret;
127 
128 	if ((ret = s->sst_ndx[type]) != 0)
129 		return (ret);
130 
131 	ret = s->sst_ndx[type] = s->sst_cur;
132 	s->sst_cur += strlen(shstrtab_data[type]) + 1;
133 
134 	return (ret);
135 }
136 
137 static size_t
138 shstrtab_size(const shstrtab_t *s)
139 {
140 	return (s->sst_cur);
141 }
142 
143 static void
144 shstrtab_dump(const shstrtab_t *s, char *buf)
145 {
146 	int i, ndx;
147 
148 	*buf = '\0';
149 	for (i = 0; i < STR_NUM; i++) {
150 		if ((ndx = s->sst_ndx[i]) != 0)
151 			(void) strcpy(buf + ndx, shstrtab_data[i]);
152 	}
153 }
154 
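/*
 * Validate a PT_SUNWDTRACE program header (it must be large enough and be
 * mapped readable, writable and executable) and record its mapped address
 * in args->thrptr for the DTrace fasttrap provider.
 */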
155 static int
156 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
157 {
158 	ASSERT(phdrp->p_type == PT_SUNWDTRACE);
159 
160 	/*
161 	 * See the comment in fasttrap.h for information on how to safely
162 	 * update this program header.
163 	 */
164 	if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
165 	    (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
166 		return (-1);
167 
168 	args->thrptr = phdrp->p_vaddr + base;
169 
170 	return (0);
171 }
172 
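/*
 * Apply a security-flag request from the object's dynamic section
 * (currently only DT_SUNW_ASLR) to the process's effective secflags,
 * subject to its lower/upper/inherit sets and secpolicy_psecflags().
 */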
173 static int
174 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
175 {
176 	uint_t flag;
177 
178 	switch (dt) {
179 	case DT_SUNW_ASLR:
180 		flag = PROC_SEC_ASLR;
181 		break;
182 	default:
183 		return (EINVAL);
184 	}
185 
186 	if (val == 0) {
187 		if (secflag_isset(p->p_secflags.psf_lower, flag))
188 			return (EPERM);
189 		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
190 		    secflag_isset(p->p_secflags.psf_inherit, flag))
191 			return (EPERM);
192 
193 		secflag_clear(&p->p_secflags.psf_effective, flag);
194 	} else {
195 		if (!secflag_isset(p->p_secflags.psf_upper, flag))
196 			return (EPERM);
197 
198 		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
199 		    !secflag_isset(p->p_secflags.psf_inherit, flag))
200 			return (EPERM);
201 
202 		secflag_set(&p->p_secflags.psf_effective, flag);
203 	}
204 
205 	return (0);
206 }
207 
208 /*
209  * Map in the executable pointed to by vp. Returns 0 on success.
210  */
211 int
212 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
213     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
214     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
215 {
216 	size_t		len;
217 	struct vattr	vat;
218 	caddr_t		phdrbase = NULL;
219 	ssize_t		phdrsize;
220 	int		nshdrs, shstrndx, nphdrs;
221 	int		error = 0;
222 	Phdr		*uphdr = NULL;
223 	Phdr		*junk = NULL;
224 	Phdr		*dynphdr = NULL;
225 	Phdr		*dtrphdr = NULL;
226 	uintptr_t	lddata;
227 	long		execsz;
228 	intptr_t	minaddr;
229 
230 	if (lddatap != NULL)
231 		*lddatap = NULL;
232 
233 	if (error = execpermissions(vp, &vat, args)) {
234 		uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
235 		return (error);
236 	}
237 
238 	if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
239 	    &nphdrs)) != 0 ||
240 	    (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
241 	    &phdrsize)) != 0) {
242 		uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
243 		return (error);
244 	}
245 
246 	if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
247 		uprintf("%s: Nothing to load in %s\n", exec_file, args->pathname);
248 		kmem_free(phdrbase, phdrsize);
249 		return (ENOEXEC);
250 	}
251 	if (lddatap != NULL)
252 		*lddatap = lddata;
253 
254 	if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
255 	    &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
256 	    len, &execsz, brksize)) {
257 		uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
258 		kmem_free(phdrbase, phdrsize);
259 		return (error);
260 	}
261 
262 	/*
263 	 * Inform our caller if the executable needs an interpreter.
264 	 */
265 	*interp = (dynphdr == NULL) ? 0 : 1;
266 
267 	/*
268 	 * If this is a statically linked executable, voffset should indicate
269 	 * the address of the executable itself (it normally holds the address
270 	 * of the interpreter).
271 	 */
272 	if (ehdr->e_type == ET_EXEC && *interp == 0)
273 		*voffset = minaddr;
274 
275 	if (uphdr != NULL) {
276 		*uphdr_vaddr = uphdr->p_vaddr;
277 	} else {
278 		*uphdr_vaddr = (Addr)-1;
279 	}
280 
281 	kmem_free(phdrbase, phdrsize);
282 	return (error);
283 }
284 
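/*
 * exec(2) handler for ELF objects: map in the executable (and its
 * PT_INTERP interpreter, if any), build the aux vector on the new stack
 * and establish the exec environment for the process.
 */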
285 /*ARGSUSED*/
286 int
287 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
288     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
289     int brand_action)
290 {
291 	caddr_t		phdrbase = NULL;
292 	caddr_t 	bssbase = 0;
293 	caddr_t 	brkbase = 0;
294 	size_t		brksize = 0;
295 	ssize_t		dlnsize;
296 	aux_entry_t	*aux;
297 	int		error;
298 	ssize_t		resid;
299 	int		fd = -1;
300 	intptr_t	voffset;
301 	Phdr		*intphdr = NULL;
302 	Phdr		*dynamicphdr = NULL;
303 	Phdr		*stphdr = NULL;
304 	Phdr		*uphdr = NULL;
305 	Phdr		*junk = NULL;
306 	size_t		len;
307 	ssize_t		phdrsize;
308 	int		postfixsize = 0;
309 	int		i, hsize;
310 	Phdr		*phdrp;
311 	Phdr		*dataphdrp = NULL;
312 	Phdr		*dtrphdr;
313 	Phdr		*capphdr = NULL;
314 	Cap		*cap = NULL;
315 	ssize_t		capsize;
316 	Dyn		*dyn = NULL;
317 	int		hasu = 0;
318 	int		hasauxv = 0;
319 	int		hasintp = 0;
320 	int		branded = 0;
321 
322 	struct proc *p = ttoproc(curthread);
323 	struct user *up = PTOU(p);
324 	struct bigwad {
325 		Ehdr	ehdr;
326 		aux_entry_t	elfargs[__KERN_NAUXV_IMPL];
327 		char		dl_name[MAXPATHLEN];
328 		char		pathbuf[MAXPATHLEN];
329 		struct vattr	vattr;
330 		struct execenv	exenv;
331 	} *bigwad;	/* kmem_alloc this behemoth so we don't blow stack */
332 	Ehdr		*ehdrp;
333 	int		nshdrs, shstrndx, nphdrs;
334 	char		*dlnp;
335 	char		*pathbufp;
336 	rlim64_t	limit;
337 	rlim64_t	roundlimit;
338 
339 	ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
340 
341 	bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
342 	ehdrp = &bigwad->ehdr;
343 	dlnp = bigwad->dl_name;
344 	pathbufp = bigwad->pathbuf;
345 
346 	/*
347 	 * Obtain ELF and program header information.
348 	 */
349 	if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
350 	    &nphdrs)) != 0 ||
351 	    (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
352 	    &phdrsize)) != 0)
353 		goto out;
354 
355 	/*
356 	 * Prevent executing an ELF file that has no entry point.
357 	 */
358 	if (ehdrp->e_entry == 0) {
359 		uprintf("%s: Bad entry point\n", exec_file);
360 		goto bad;
361 	}
362 
363 	/*
364 	 * Put data model that we're exec-ing to into the args passed to
365 	 * exec_args(), so it will know what it is copying to on new stack.
366 	 * Now that we know whether we are exec-ing a 32-bit or 64-bit
367 	 * executable, we can set execsz with the appropriate NCARGS.
368 	 */
369 #ifdef	_LP64
370 	if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
371 		args->to_model = DATAMODEL_ILP32;
372 		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
373 	} else {
374 		args->to_model = DATAMODEL_LP64;
375 		args->stk_prot &= ~PROT_EXEC;
376 #if defined(__i386) || defined(__amd64)
377 		args->dat_prot &= ~PROT_EXEC;
378 #endif
379 		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
380 	}
381 #else	/* _LP64 */
382 	args->to_model = DATAMODEL_ILP32;
383 	*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
384 #endif	/* _LP64 */
385 
386 	/*
387 	 * We delay invoking the brand callback until we've figured out
388 	 * what kind of elf binary we're trying to run, 32-bit or 64-bit.
389 	 * We do this because now the brand library can just check
390 	 * args->to_model to see if the target is 32-bit or 64-bit without
391 	 * having to duplicate all the code above.
392 	 *
393 	 * The level checks associated with brand handling below are used to
394 	 * prevent a loop since the brand elfexec function typically comes back
395 	 * through this function. We must check <= here since the nested
396 	 * handling in the #! interpreter code will increment the level before
397 	 * calling gexec to run the final elfexec interpreter.
398 	 */
399 	if ((level <= INTP_MAXDEPTH) &&
400 	    (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
401 		error = BROP(p)->b_elfexec(vp, uap, args,
402 		    idatap, level + 1, execsz, setid, exec_file, cred,
403 		    brand_action);
404 		goto out;
405 	}
406 
407 	/*
408 	 * Determine aux size now so that stack can be built
409 	 * in one shot (except actual copyout of aux image),
410 	 * determine any non-default stack protections,
411 	 * and still have this code be machine independent.
412 	 */
413 	hsize = ehdrp->e_phentsize;
414 	phdrp = (Phdr *)phdrbase;
415 	for (i = nphdrs; i > 0; i--) {
416 		switch (phdrp->p_type) {
417 		case PT_INTERP:
418 			hasauxv = hasintp = 1;
419 			break;
420 		case PT_PHDR:
421 			hasu = 1;
422 			break;
423 		case PT_SUNWSTACK:
424 			args->stk_prot = PROT_USER;
425 			if (phdrp->p_flags & PF_R)
426 				args->stk_prot |= PROT_READ;
427 			if (phdrp->p_flags & PF_W)
428 				args->stk_prot |= PROT_WRITE;
429 			if (phdrp->p_flags & PF_X)
430 				args->stk_prot |= PROT_EXEC;
431 			break;
432 		case PT_LOAD:
433 			dataphdrp = phdrp;
434 			break;
435 		case PT_SUNWCAP:
436 			capphdr = phdrp;
437 			break;
438 		case PT_DYNAMIC:
439 			dynamicphdr = phdrp;
440 			break;
441 		}
442 		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
443 	}
444 
445 	if (ehdrp->e_type != ET_EXEC) {
446 		dataphdrp = NULL;
447 		hasauxv = 1;
448 	}
449 
450 	/* Copy BSS permissions to args->dat_prot */
451 	if (dataphdrp != NULL) {
452 		args->dat_prot = PROT_USER;
453 		if (dataphdrp->p_flags & PF_R)
454 			args->dat_prot |= PROT_READ;
455 		if (dataphdrp->p_flags & PF_W)
456 			args->dat_prot |= PROT_WRITE;
457 		if (dataphdrp->p_flags & PF_X)
458 			args->dat_prot |= PROT_EXEC;
459 	}
460 
461 	/*
462 	 * If an aux vector will be required - reserve the space for
463 	 * it now.  This may be increased by exec_args if there are
464 	 * ISA-specific types (included in __KERN_NAUXV_IMPL).
465 	 */
466 	if (hasauxv) {
467 		/*
468 		 * If an AUX vector is being built - the base AUX
469 		 * entries are:
470 		 *
471 		 *	AT_BASE
472 		 *	AT_FLAGS
473 		 *	AT_PAGESZ
474 		 *	AT_SUN_AUXFLAGS
475 		 *	AT_SUN_HWCAP
476 		 *	AT_SUN_HWCAP2
477 		 *	AT_SUN_PLATFORM (added in stk_copyout)
478 		 *	AT_SUN_EXECNAME (added in stk_copyout)
479 		 *	AT_NULL
480 		 *
481 		 * total == 9
482 		 */
483 		if (hasintp && hasu) {
484 			/*
485 			 * Has PT_INTERP & PT_PHDR - the auxvectors that
486 			 * will be built are:
487 			 *
488 			 *	AT_PHDR
489 			 *	AT_PHENT
490 			 *	AT_PHNUM
491 			 *	AT_ENTRY
492 			 *	AT_LDDATA
493 			 *
494 			 * total = 5
495 			 */
496 			args->auxsize = (9 + 5) * sizeof (aux_entry_t);
497 		} else if (hasintp) {
498 			/*
499 			 * Has PT_INTERP but no PT_PHDR
500 			 *
501 			 *	AT_EXECFD
502 			 *	AT_LDDATA
503 			 *
504 			 * total = 2
505 			 */
506 			args->auxsize = (9 + 2) * sizeof (aux_entry_t);
507 		} else {
508 			args->auxsize = 9 * sizeof (aux_entry_t);
509 		}
510 	} else {
511 		args->auxsize = 0;
512 	}
513 
514 	/*
515 	 * If this binary is using an emulator, we need to add an
516 	 * AT_SUN_EMULATOR aux entry.
517 	 */
518 	if (args->emulator != NULL)
519 		args->auxsize += sizeof (aux_entry_t);
520 
521 	/*
522 	 * On supported kernels (x86_64) make room in the auxv for the
523 	 * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
524 	 * which do not provide such functionality.
525 	 */
526 #if defined(__amd64)
527 	args->auxsize += sizeof (aux_entry_t);
528 #endif /* defined(__amd64) */
529 
530 	if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
531 		branded = 1;
532 		/*
533 		 * We will be adding 4 entries to the aux vectors.  One for
534 		 * the brandname and 3 for the brand-specific aux vectors.
535 		 */
536 		args->auxsize += 4 * sizeof (aux_entry_t);
537 	}
538 
539 	/* If the binary has an explicit ASLR flag, it must be honoured */
540 	if ((dynamicphdr != NULL) &&
541 	    (dynamicphdr->p_filesz > 0)) {
542 		Dyn *dp;
543 		off_t i = 0;
544 
545 #define	DYN_STRIDE	100
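		/*
		 * Scan the dynamic section DYN_STRIDE entries at a time so
		 * the temporary buffer used for the scan stays bounded.
		 */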
546 		for (i = 0; i < dynamicphdr->p_filesz;
547 		    i += sizeof (*dyn) * DYN_STRIDE) {
548 			int ndyns = (dynamicphdr->p_filesz - i) / sizeof (*dyn);
549 			size_t dynsize;
550 
551 			ndyns = MIN(DYN_STRIDE, ndyns);
552 			dynsize = ndyns * sizeof (*dyn);
553 
554 			dyn = kmem_alloc(dynsize, KM_SLEEP);
555 
556 			if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
557 			    dynsize, (offset_t)(dynamicphdr->p_offset + i),
558 			    UIO_SYSSPACE, 0, (rlim64_t)0,
559 			    CRED(), &resid)) != 0) {
560 				uprintf("%s: cannot read .dynamic section\n",
561 				    exec_file);
562 				goto out;
563 			}
564 
565 			for (dp = dyn; dp < (dyn + ndyns); dp++) {
566 				if (dp->d_tag == DT_SUNW_ASLR) {
567 					if ((error = handle_secflag_dt(p,
568 					    DT_SUNW_ASLR,
569 					    dp->d_un.d_val)) != 0) {
570 						uprintf("%s: error setting "
571 						    "security-flag from "
572 						    "DT_SUNW_ASLR: %d\n",
573 						    exec_file, error);
574 						goto out;
575 					}
576 				}
577 			}
578 
579 			kmem_free(dyn, dynsize);
580 		}
581 	}
582 
583 	/* Hardware/Software capabilities */
584 	if (capphdr != NULL &&
585 	    (capsize = capphdr->p_filesz) > 0 &&
586 	    capsize <= 16 * sizeof (*cap)) {
587 		int ncaps = capsize / sizeof (*cap);
588 		Cap *cp;
589 
590 		cap = kmem_alloc(capsize, KM_SLEEP);
591 		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
592 		    capsize, (offset_t)capphdr->p_offset,
593 		    UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
594 			uprintf("%s: Cannot read capabilities section\n",
595 			    exec_file);
596 			goto out;
597 		}
598 		for (cp = cap; cp < cap + ncaps; cp++) {
599 			if (cp->c_tag == CA_SUNW_SF_1 &&
600 			    (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
601 				if (args->to_model == DATAMODEL_LP64)
602 					args->addr32 = 1;
603 				break;
604 			}
605 		}
606 	}
607 
608 	aux = bigwad->elfargs;
609 	/*
610 	 * Move args to the user's stack.
611 	 * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
612 	 */
613 	if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
614 		if (error == -1) {
615 			error = ENOEXEC;
616 			goto bad;
617 		}
618 		goto out;
619 	}
620 	/* we're single threaded after this point */
621 
622 	/*
623 	 * If this is an ET_DYN executable (shared object),
624 	 * determine its memory size so that mapelfexec() can load it.
625 	 */
626 	if (ehdrp->e_type == ET_DYN)
627 		len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
628 	else
629 		len = 0;
630 
631 	dtrphdr = NULL;
632 
633 	if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
634 	    &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
635 	    len, execsz, &brksize)) != 0)
636 		goto bad;
637 
638 	if (uphdr != NULL && intphdr == NULL)
639 		goto bad;
640 
641 	if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
642 		uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
643 		goto bad;
644 	}
645 
646 	if (intphdr != NULL) {
647 		size_t		len;
648 		uintptr_t	lddata;
649 		char		*p;
650 		struct vnode	*nvp;
651 
652 		dlnsize = intphdr->p_filesz;
653 
654 		if (dlnsize > MAXPATHLEN || dlnsize <= 0)
655 			goto bad;
656 
657 		/*
658 		 * Read in "interpreter" pathname.
659 		 */
660 		if ((error = vn_rdwr(UIO_READ, vp, dlnp, intphdr->p_filesz,
661 		    (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
662 		    CRED(), &resid)) != 0) {
663 			uprintf("%s: Cannot obtain interpreter pathname\n",
664 			    exec_file);
665 			goto bad;
666 		}
667 
668 		if (resid != 0 || dlnp[dlnsize - 1] != '\0')
669 			goto bad;
670 
671 		/*
672 		 * Search for '$ORIGIN' token in interpreter path.
673 		 * If found, expand it.
674 		 */
675 		for (p = dlnp; p = strchr(p, '$'); ) {
676 			uint_t	len, curlen;
677 			char	*_ptr;
678 
679 			if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
680 				continue;
681 
682 			/*
683 			 * We don't support $ORIGIN on setid programs to close
684 			 * a potential attack vector.
685 			 */
686 			if ((setid & EXECSETID_SETID) != 0) {
687 				error = ENOEXEC;
688 				goto bad;
689 			}
690 
691 			curlen = 0;
692 			len = p - dlnp - 1;
693 			if (len) {
694 				bcopy(dlnp, pathbufp, len);
695 				curlen += len;
696 			}
697 			if (_ptr = strrchr(args->pathname, '/')) {
698 				len = _ptr - args->pathname;
699 				if ((curlen + len) > MAXPATHLEN)
700 					break;
701 
702 				bcopy(args->pathname, &pathbufp[curlen], len);
703 				curlen += len;
704 			} else {
705 				/*
706 				 * executable is a basename found in the
707 			 * current directory.  So - just substitute
708 				 * '.' for ORIGIN.
709 				 */
710 				pathbufp[curlen] = '.';
711 				curlen++;
712 			}
713 			p += ORIGIN_STR_SIZE;
714 			len = strlen(p);
715 
716 			if ((curlen + len) > MAXPATHLEN)
717 				break;
718 			bcopy(p, &pathbufp[curlen], len);
719 			curlen += len;
720 			pathbufp[curlen++] = '\0';
721 			bcopy(pathbufp, dlnp, curlen);
722 		}
723 
724 		/*
725 		 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
726 		 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
727 		 * Just in case /usr is not mounted, change it now.
728 		 */
729 		if (strcmp(dlnp, USR_LIB_RTLD) == 0)
730 			dlnp += 4;
731 		error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
732 		if (error && dlnp != bigwad->dl_name) {
733 			/* new kernel, old user-level */
734 			error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
735 			    NULLVPP, &nvp);
736 		}
737 		if (error) {
738 			uprintf("%s: Cannot find %s\n", exec_file, dlnp);
739 			goto bad;
740 		}
741 
742 		/*
743 		 * Setup the "aux" vector.
744 		 */
745 		if (uphdr) {
746 			if (ehdrp->e_type == ET_DYN) {
747 				/* don't use the first page */
748 				bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
749 				bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
750 			} else {
751 				bigwad->exenv.ex_bssbase = bssbase;
752 				bigwad->exenv.ex_brkbase = brkbase;
753 			}
754 			bigwad->exenv.ex_brksize = brksize;
755 			bigwad->exenv.ex_magic = elfmagic;
756 			bigwad->exenv.ex_vp = vp;
757 			setexecenv(&bigwad->exenv);
758 
759 			ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
760 			ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
761 			ADDAUX(aux, AT_PHNUM, nphdrs)
762 			ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
763 		} else {
764 			if ((error = execopen(&vp, &fd)) != 0) {
765 				VN_RELE(nvp);
766 				goto bad;
767 			}
768 
769 			ADDAUX(aux, AT_EXECFD, fd)
770 		}
771 
772 		if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
773 			VN_RELE(nvp);
774 			uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
775 			goto bad;
776 		}
777 
778 		/*
779 		 * Now obtain the ELF header along with the entire program
780 		 * header contained in "nvp".
781 		 */
782 		kmem_free(phdrbase, phdrsize);
783 		phdrbase = NULL;
784 		if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
785 		    &shstrndx, &nphdrs)) != 0 ||
786 		    (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
787 		    &phdrsize)) != 0) {
788 			VN_RELE(nvp);
789 			uprintf("%s: Cannot read %s\n", exec_file, dlnp);
790 			goto bad;
791 		}
792 
793 		/*
794 		 * Determine memory size of the "interpreter's" loadable
795 		 * sections.  This size is then used to obtain the virtual
796 		 * address of a hole, in the user's address space, large
797 		 * enough to map the "interpreter".
798 		 */
799 		if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
800 			VN_RELE(nvp);
801 			uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
802 			goto bad;
803 		}
804 
805 		dtrphdr = NULL;
806 
807 		error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
808 		    &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
809 		    execsz, NULL);
810 		if (error || junk != NULL) {
811 			VN_RELE(nvp);
812 			uprintf("%s: Cannot map %s\n", exec_file, dlnp);
813 			goto bad;
814 		}
815 
816 		/*
817 		 * We use the DTrace program header to initialize the
818 		 * architecture-specific user per-LWP location. The dtrace
819 		 * fasttrap provider requires ready access to per-LWP scratch
820 		 * space. We assume that there is only one such program header
821 		 * in the interpreter.
822 		 */
823 		if (dtrphdr != NULL &&
824 		    dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
825 			VN_RELE(nvp);
826 			uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
827 			goto bad;
828 		}
829 
830 		VN_RELE(nvp);
831 		ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
832 	}
833 
834 	if (hasauxv) {
835 		int auxf = AF_SUN_HWCAPVERIFY;
836 
837 		/*
838 		 * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
839 		 * exec_args()
840 		 */
841 		ADDAUX(aux, AT_BASE, voffset)
842 		ADDAUX(aux, AT_FLAGS, at_flags)
843 		ADDAUX(aux, AT_PAGESZ, PAGESIZE)
844 		/*
845 		 * Linker flags. (security)
846 		 * p_flag not yet set at this time.
847 		 * We rely on gexec() to provide us with the information.
848 		 * If the application is set-uid but this is not reflected
849 		 * in a mismatch between real/effective uids/gids, then
850 		 * don't treat this as a set-uid exec.  So we care about
851 		 * the EXECSETID_UGIDS flag but not the ...SETID flag.
852 		 */
853 		if ((setid &= ~EXECSETID_SETID) != 0)
854 			auxf |= AF_SUN_SETUGID;
855 
856 		/*
857 		 * If we're running a native process from within a branded
858 		 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
859 		 * that the native ld.so.1 is able to link with the native
860 		 * libraries instead of using the brand libraries that are
861 		 * installed in the zone.  We only do this for processes
862 		 * which we trust because we see they are already running
863 		 * under pfexec (where uid != euid).  This prevents a
864 		 * malicious user within the zone from crafting a wrapper to
865 		 * run native suid commands with insecure libraries interposed.
866 		 */
867 		if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
868 		    (setid &= ~EXECSETID_SETID) != 0))
869 			auxf &= ~AF_SUN_SETUGID;
870 
871 		/*
872 		 * Record the user addr of the auxflags aux vector entry
873 		 * since brands may optionally want to manipulate this field.
874 		 */
875 		args->auxp_auxflags =
876 		    (char *)((char *)args->stackend +
877 		    ((char *)&aux->a_type -
878 		    (char *)bigwad->elfargs));
879 		ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
880 
881 		/*
882 		 * Hardware capability flag word (performance hints)
883 		 * Used for choosing faster library routines.
884 		 * (Potentially different between 32-bit and 64-bit ABIs)
885 		 */
886 #if defined(_LP64)
887 		if (args->to_model == DATAMODEL_NATIVE) {
888 			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
889 			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
890 		} else {
891 			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
892 			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
893 		}
894 #else
895 		ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
896 		ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
897 #endif
898 		if (branded) {
899 			/*
900 			 * Reserve space for the brand-private aux vectors,
901 			 * and record the user addr of that space.
902 			 */
903 			args->auxp_brand =
904 			    (char *)((char *)args->stackend +
905 			    ((char *)&aux->a_type -
906 			    (char *)bigwad->elfargs));
907 			ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
908 			ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
909 			ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
910 		}
911 
912 		/*
913 		 * Add the comm page auxv entry, mapping it in if needed.
914 		 */
915 #if defined(__amd64)
916 		if (args->commpage != NULL ||
917 		    (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
918 			ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
919 		} else {
920 			/*
921 			 * If the comm page cannot be mapped, pad out the auxv
922 			 * to satisfy later size checks.
923 			 */
924 			ADDAUX(aux, AT_NULL, 0)
925 		}
926 #endif /* defined(__amd64) */
927 
928 		ADDAUX(aux, AT_NULL, 0)
929 		postfixsize = (char *)aux - (char *)bigwad->elfargs;
930 
931 		/*
932 		 * We make assumptions above when we determine how many aux
933 		 * vector entries we will be adding. However, if we have an
934 		 * invalid elf file, it is possible that mapelfexec might
935 		 * behave differently (but not return an error), in which case
936 		 * the number of aux entries we actually add will be different.
937 		 * We detect that now and error out.
938 		 */
939 		if (postfixsize != args->auxsize) {
940 			DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
941 			    int, args->auxsize);
942 			goto bad;
943 		}
944 		ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
945 	}
946 
947 	/*
948 	 * For the 64-bit kernel, the limit is big enough that rounding it up
949 	 * to a page can overflow the 64-bit limit, so we check for btopr()
950 	 * overflowing here by comparing it with the unrounded limit in pages.
951 	 * If it hasn't overflowed, compare the exec size with the rounded up
952 	 * limit in pages.  Otherwise, just compare with the unrounded limit.
953 	 */
954 	limit = btop(p->p_vmem_ctl);
955 	roundlimit = btopr(p->p_vmem_ctl);
956 	if ((roundlimit > limit && *execsz > roundlimit) ||
957 	    (roundlimit < limit && *execsz > limit)) {
958 		mutex_enter(&p->p_lock);
959 		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
960 		    RCA_SAFE);
961 		mutex_exit(&p->p_lock);
962 		error = ENOMEM;
963 		goto bad;
964 	}
965 
966 	bzero(up->u_auxv, sizeof (up->u_auxv));
967 	up->u_commpagep = args->commpage;
968 	if (postfixsize) {
969 		int num_auxv;
970 
971 		/*
972 		 * Copy the aux vector to the user stack.
973 		 */
974 		error = execpoststack(args, bigwad->elfargs, postfixsize);
975 		if (error)
976 			goto bad;
977 
978 		/*
979 		 * Copy auxv to the process's user structure for use by /proc.
980 		 * If this is a branded process, the brand's exec routine will
981 		 * copy its private entries to the user structure later. It
982 		 * relies on the fact that the blank entries are at the end.
983 		 */
984 		num_auxv = postfixsize / sizeof (aux_entry_t);
985 		ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
986 		aux = bigwad->elfargs;
987 		for (i = 0; i < num_auxv; i++) {
988 			up->u_auxv[i].a_type = aux[i].a_type;
989 			up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
990 		}
991 	}
992 
993 	/*
994 	 * Pass back the starting address so we can set the program counter.
995 	 */
996 	args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
997 
998 	if (!uphdr) {
999 		if (ehdrp->e_type == ET_DYN) {
1000 			/*
1001 			 * If we are executing a shared library which doesn't
1002 			 * have an interpreter (probably ld.so.1) then
1003 			 * we don't set the brkbase now.  Instead we
1004 			 * delay its setting until the first call
1005 			 * via grow.c::brk().  This permits ld.so.1 to
1006 			 * initialize brkbase to the tail of the executable it
1007 			 * loads (which is where it needs to be).
1008 			 */
1009 			bigwad->exenv.ex_brkbase = (caddr_t)0;
1010 			bigwad->exenv.ex_bssbase = (caddr_t)0;
1011 			bigwad->exenv.ex_brksize = 0;
1012 		} else {
1013 			bigwad->exenv.ex_brkbase = brkbase;
1014 			bigwad->exenv.ex_bssbase = bssbase;
1015 			bigwad->exenv.ex_brksize = brksize;
1016 		}
1017 		bigwad->exenv.ex_magic = elfmagic;
1018 		bigwad->exenv.ex_vp = vp;
1019 		setexecenv(&bigwad->exenv);
1020 	}
1021 
1022 	ASSERT(error == 0);
1023 	goto out;
1024 
1025 bad:
1026 	if (fd != -1)		/* did we open the a.out yet */
1027 		(void) execclose(fd);
1028 
1029 	psignal(p, SIGKILL);
1030 
1031 	if (error == 0)
1032 		error = ENOEXEC;
1033 out:
1034 	if (phdrbase != NULL)
1035 		kmem_free(phdrbase, phdrsize);
1036 	if (cap != NULL)
1037 		kmem_free(cap, capsize);
1038 	kmem_free(bigwad, sizeof (struct bigwad));
1039 	return (error);
1040 }
1041 
1042 /*
1043  * Compute the memory size requirement for the ELF file.
1044  */
1045 static size_t
1046 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
1047 {
1048 	size_t	len;
1049 	Phdr	*phdrp = (Phdr *)phdrbase;
1050 	int	hsize = ehdrp->e_phentsize;
1051 	int	first = 1;
1052 	int	dfirst = 1;	/* first data segment */
1053 	uintptr_t loaddr = 0;
1054 	uintptr_t hiaddr = 0;
1055 	uintptr_t lo, hi;
1056 	int	i;
1057 
1058 	for (i = nphdrs; i > 0; i--) {
1059 		if (phdrp->p_type == PT_LOAD) {
1060 			lo = phdrp->p_vaddr;
1061 			hi = lo + phdrp->p_memsz;
1062 			if (first) {
1063 				loaddr = lo;
1064 				hiaddr = hi;
1065 				first = 0;
1066 			} else {
1067 				if (loaddr > lo)
1068 					loaddr = lo;
1069 				if (hiaddr < hi)
1070 					hiaddr = hi;
1071 			}
1072 
1073 			/*
1074 			 * save the address of the first data segment
1075 			 * of an object - used for the AT_SUN_LDDATA
1076 			 * aux entry.
1077 			 */
1078 			if ((lddata != NULL) && dfirst &&
1079 			    (phdrp->p_flags & PF_W)) {
1080 				*lddata = lo;
1081 				dfirst = 0;
1082 			}
1083 		}
1084 		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1085 	}
1086 
1087 	len = hiaddr - (loaddr & PAGEMASK);
1088 	len = roundup(len, PAGESIZE);
1089 
1090 	return (len);
1091 }
1092 
1093 /*
1094  * Read in the ELF header and program header table.
1095  * SUSV3 requires:
1096  *	ENOEXEC	File format is not recognized
1097  *	EINVAL	Format recognized but execution not supported
1098  */
1099 static int
1100 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
1101     int *nphdrs)
1102 {
1103 	int error;
1104 	ssize_t resid;
1105 
1106 	/*
1107 	 * We got here by the first two bytes in ident,
1108 	 * now read the entire ELF header.
1109 	 */
1110 	if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
1111 	    sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
1112 	    (rlim64_t)0, credp, &resid)) != 0)
1113 		return (error);
1114 
1115 	/*
1116 	 * Since a separate version is compiled for handling 32-bit and
1117 	 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1118 	 * doesn't need to be able to deal with 32-bit ELF files.
1119 	 */
1120 	if (resid != 0 ||
1121 	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1122 	    ehdr->e_ident[EI_MAG3] != ELFMAG3)
1123 		return (ENOEXEC);
1124 
1125 	if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1126 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1127 	    ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1128 #else
1129 	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1130 #endif
1131 	    !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1132 	    ehdr->e_flags))
1133 		return (EINVAL);
1134 
1135 	*nshdrs = ehdr->e_shnum;
1136 	*shstrndx = ehdr->e_shstrndx;
1137 	*nphdrs = ehdr->e_phnum;
1138 
1139 	/*
1140 	 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1141 	 * to read in the section header at index zero to access the true
1142 	 * values for those fields.
1143 	 */
1144 	if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1145 	    *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1146 		Shdr shdr;
1147 
1148 		if (ehdr->e_shoff == 0)
1149 			return (EINVAL);
1150 
1151 		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1152 		    sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1153 		    (rlim64_t)0, credp, &resid)) != 0)
1154 			return (error);
1155 
1156 		if (*nshdrs == 0)
1157 			*nshdrs = shdr.sh_size;
1158 		if (*shstrndx == SHN_XINDEX)
1159 			*shstrndx = shdr.sh_link;
1160 		if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1161 			*nphdrs = shdr.sh_info;
1162 	}
1163 
1164 	return (0);
1165 }
1166 
1167 #ifdef _ELF32_COMPAT
1168 extern size_t elf_nphdr_max;
1169 #else
1170 size_t elf_nphdr_max = 1000;
1171 #endif
1172 
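/*
 * Read the program header table into an allocated buffer, validating
 * e_phentsize first.  The caller is responsible for freeing *phbasep.
 */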
1173 static int
1174 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
1175     caddr_t *phbasep, ssize_t *phsizep)
1176 {
1177 	ssize_t resid, minsize;
1178 	int err;
1179 
1180 	/*
1181 	 * Since we're going to be using e_phentsize to iterate down the
1182 	 * array of program headers, it must be 8-byte aligned or else
1183 	 * we might cause a misaligned access. We use all members through
1184 	 * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so
1185 	 * e_phentsize must be at least large enough to include those
1186 	 * members.
1187 	 */
1188 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1189 	minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags);
1190 #else
1191 	minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz);
1192 #endif
1193 	if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3))
1194 		return (EINVAL);
1195 
1196 	*phsizep = nphdrs * ehdr->e_phentsize;
1197 
1198 	if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1199 		if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1200 			return (ENOMEM);
1201 	} else {
1202 		*phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1203 	}
1204 
1205 	if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep,
1206 	    (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1207 	    credp, &resid)) != 0) {
1208 		kmem_free(*phbasep, *phsizep);
1209 		*phbasep = NULL;
1210 		return (err);
1211 	}
1212 
1213 	return (0);
1214 }
1215 
1216 #ifdef _ELF32_COMPAT
1217 extern size_t elf_nshdr_max;
1218 extern size_t elf_shstrtab_max;
1219 #else
1220 size_t elf_nshdr_max = 10000;
1221 size_t elf_shstrtab_max = 100 * 1024;
1222 #endif
1223 
1224 
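/*
 * Read the section header table and the section header string table into
 * allocated buffers, validating e_shentsize and the string table index.
 * The caller is responsible for freeing both buffers.
 */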
1225 static int
1226 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
1227     int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep,
1228     char **shstrbasep, ssize_t *shstrsizep)
1229 {
1230 	ssize_t resid, minsize;
1231 	int err;
1232 	Shdr *shdr;
1233 
1234 	/*
1235 	 * Since we're going to be using e_shentsize to iterate down the
1236 	 * array of section headers, it must be 8-byte aligned or else
1237 	 * we might cause a misaligned access. We use all members through
1238 	 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1239 	 * must be at least large enough to include that member. The index
1240 	 * of the string table section must also be valid.
1241 	 */
1242 	minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize);
1243 	if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) ||
1244 	    shstrndx >= nshdrs)
1245 		return (EINVAL);
1246 
1247 	*shsizep = nshdrs * ehdr->e_shentsize;
1248 
1249 	if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1250 		if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1251 			return (ENOMEM);
1252 	} else {
1253 		*shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1254 	}
1255 
1256 	if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep,
1257 	    (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1258 	    credp, &resid)) != 0) {
1259 		kmem_free(*shbasep, *shsizep);
1260 		return (err);
1261 	}
1262 
1263 	/*
1264 	 * Pull the section string table out of the vnode; fail if the size
1265 	 * is zero.
1266 	 */
1267 	shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1268 	if ((*shstrsizep = shdr->sh_size) == 0) {
1269 		kmem_free(*shbasep, *shsizep);
1270 		return (EINVAL);
1271 	}
1272 
1273 	if (*shstrsizep > elf_shstrtab_max) {
1274 		if ((*shstrbasep = kmem_alloc(*shstrsizep,
1275 		    KM_NOSLEEP)) == NULL) {
1276 			kmem_free(*shbasep, *shsizep);
1277 			return (ENOMEM);
1278 		}
1279 	} else {
1280 		*shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1281 	}
1282 
1283 	if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1284 	    (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1285 	    credp, &resid)) != 0) {
1286 		kmem_free(*shbasep, *shsizep);
1287 		kmem_free(*shstrbasep, *shstrsizep);
1288 		return (err);
1289 	}
1290 
1291 	/*
1292 	 * Make sure the strtab is null-terminated to make sure we
1293 	 * don't run off the end of the table.
1294 	 */
1295 	(*shstrbasep)[*shstrsizep - 1] = '\0';
1296 
1297 	return (0);
1298 }
1299 
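/*
 * Walk the program headers, mapping each PT_LOAD segment into the address
 * space with execmap() and handing back pointers to any PT_PHDR, PT_INTERP,
 * PT_SHLIB and PT_SUNWDTRACE headers encountered.  For ET_DYN objects a
 * load offset is chosen (randomized when ASLR is enabled), and the brk base
 * may also be skewed by a random amount.
 */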
1300 static int
1301 mapelfexec(
1302 	vnode_t *vp,
1303 	Ehdr *ehdr,
1304 	int nphdrs,
1305 	caddr_t phdrbase,
1306 	Phdr **uphdr,
1307 	Phdr **intphdr,
1308 	Phdr **stphdr,
1309 	Phdr **dtphdr,
1310 	Phdr *dataphdrp,
1311 	caddr_t *bssbase,
1312 	caddr_t *brkbase,
1313 	intptr_t *voffset,
1314 	intptr_t *minaddr,
1315 	size_t len,
1316 	long *execsz,
1317 	size_t *brksize)
1318 {
1319 	Phdr *phdr;
1320 	int i, prot, error;
1321 	caddr_t addr = NULL;
1322 	size_t zfodsz;
1323 	int ptload = 0;
1324 	int page;
1325 	off_t offset;
1326 	int hsize = ehdr->e_phentsize;
1327 	caddr_t mintmp = (caddr_t)-1;
1328 	extern int use_brk_lpg;
1329 
1330 	if (ehdr->e_type == ET_DYN) {
1331 		secflagset_t flags = 0;
1332 		/*
1333 		 * Obtain the virtual address of a hole in the
1334 		 * address space to map the "interpreter".
1335 		 */
1336 		if (secflag_enabled(curproc, PROC_SEC_ASLR))
1337 			flags |= _MAP_RANDOMIZE;
1338 
1339 		map_addr(&addr, len, (offset_t)0, 1, flags);
1340 		if (addr == NULL)
1341 			return (ENOMEM);
1342 		*voffset = (intptr_t)addr;
1343 
1344 		/*
1345 		 * Calculate the minimum vaddr so it can be subtracted out.
1346 		 * According to the ELF specification, since PT_LOAD sections
1347 		 * must be sorted by increasing p_vaddr values, this is
1348 		 * guaranteed to be the first PT_LOAD section.
1349 		 */
1350 		phdr = (Phdr *)phdrbase;
1351 		for (i = nphdrs; i > 0; i--) {
1352 			if (phdr->p_type == PT_LOAD) {
1353 				*voffset -= (uintptr_t)phdr->p_vaddr;
1354 				break;
1355 			}
1356 			phdr = (Phdr *)((caddr_t)phdr + hsize);
1357 		}
1358 
1359 	} else {
1360 		*voffset = 0;
1361 	}
1362 	phdr = (Phdr *)phdrbase;
1363 	for (i = nphdrs; i > 0; i--) {
1364 		switch (phdr->p_type) {
1365 		case PT_LOAD:
1366 			if ((*intphdr != NULL) && (*uphdr == NULL))
1367 				return (0);
1368 
1369 			ptload = 1;
1370 			prot = PROT_USER;
1371 			if (phdr->p_flags & PF_R)
1372 				prot |= PROT_READ;
1373 			if (phdr->p_flags & PF_W)
1374 				prot |= PROT_WRITE;
1375 			if (phdr->p_flags & PF_X)
1376 				prot |= PROT_EXEC;
1377 
1378 			addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1379 
1380 			/*
1381 			 * Keep track of the segment with the lowest starting
1382 			 * address.
1383 			 */
1384 			if (addr < mintmp)
1385 				mintmp = addr;
1386 
1387 			zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1388 
1389 			offset = phdr->p_offset;
1390 			if (((uintptr_t)offset & PAGEOFFSET) ==
1391 			    ((uintptr_t)addr & PAGEOFFSET) &&
1392 			    (!(vp->v_flag & VNOMAP))) {
1393 				page = 1;
1394 			} else {
1395 				page = 0;
1396 			}
1397 
1398 			/*
1399 			 * Set the heap pagesize for OOB when the bss size
1400 			 * is known and use_brk_lpg is not 0.
1401 			 */
1402 			if (brksize != NULL && use_brk_lpg &&
1403 			    zfodsz != 0 && phdr == dataphdrp &&
1404 			    (prot & PROT_WRITE)) {
1405 				size_t tlen = P2NPHASE((uintptr_t)addr +
1406 				    phdr->p_filesz, PAGESIZE);
1407 
1408 				if (zfodsz > tlen) {
1409 					curproc->p_brkpageszc =
1410 					    page_szc(map_pgsz(MAPPGSZ_HEAP,
1411 					    curproc, addr + phdr->p_filesz +
1412 					    tlen, zfodsz - tlen, 0));
1413 				}
1414 			}
1415 
1416 			if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1417 			    (prot & PROT_WRITE)) {
1418 				uint_t	szc = curproc->p_brkpageszc;
1419 				size_t pgsz = page_get_pagesize(szc);
1420 				caddr_t ebss = addr + phdr->p_memsz;
1421 				/*
1422 				 * If we need extra space to keep the BSS an
1423 				 * integral number of pages in size, some of
1424 				 * that space may fall beyond p_brkbase, so we
1425 				 * need to set p_brksize to account for it
1426 				 * being (logically) part of the brk.
1427 				 */
1428 				size_t extra_zfodsz;
1429 
1430 				ASSERT(pgsz > PAGESIZE);
1431 
1432 				extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1433 
1434 				if (error = execmap(vp, addr, phdr->p_filesz,
1435 				    zfodsz + extra_zfodsz, phdr->p_offset,
1436 				    prot, page, szc))
1437 					goto bad;
1438 				if (brksize != NULL)
1439 					*brksize = extra_zfodsz;
1440 			} else {
1441 				if (error = execmap(vp, addr, phdr->p_filesz,
1442 				    zfodsz, phdr->p_offset, prot, page, 0))
1443 					goto bad;
1444 			}
1445 
1446 			if (bssbase != NULL && addr >= *bssbase &&
1447 			    phdr == dataphdrp) {
1448 				*bssbase = addr + phdr->p_filesz;
1449 			}
1450 			if (brkbase != NULL && addr >= *brkbase) {
1451 				*brkbase = addr + phdr->p_memsz;
1452 			}
1453 
1454 			*execsz += btopr(phdr->p_memsz);
1455 			break;
1456 
1457 		case PT_INTERP:
1458 			if (ptload)
1459 				goto bad;
1460 			*intphdr = phdr;
1461 			break;
1462 
1463 		case PT_SHLIB:
1464 			*stphdr = phdr;
1465 			break;
1466 
1467 		case PT_PHDR:
1468 			if (ptload)
1469 				goto bad;
1470 			*uphdr = phdr;
1471 			break;
1472 
1473 		case PT_NULL:
1474 		case PT_DYNAMIC:
1475 		case PT_NOTE:
1476 			break;
1477 
1478 		case PT_SUNWDTRACE:
1479 			if (dtphdr != NULL)
1480 				*dtphdr = phdr;
1481 			break;
1482 
1483 		default:
1484 			break;
1485 		}
1486 		phdr = (Phdr *)((caddr_t)phdr + hsize);
1487 	}
1488 
1489 	if (minaddr != NULL) {
1490 		ASSERT(mintmp != (caddr_t)-1);
1491 		*minaddr = (intptr_t)mintmp;
1492 	}
1493 
1494 	if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1495 		size_t off;
1496 		uintptr_t base = (uintptr_t)*brkbase;
1497 		uintptr_t oend = base + *brksize;
1498 
1499 		ASSERT(ISP2(aslr_max_brk_skew));
1500 
1501 		(void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1502 		base += P2PHASE(off, aslr_max_brk_skew);
1503 		base = P2ROUNDUP(base, PAGESIZE);
1504 		*brkbase = (caddr_t)base;
1505 		/*
1506 		 * Above, we set *brksize to account for the possibility we
1507 		 * had to grow the 'brk' in padding out the BSS to a page
1508 		 * boundary.
1509 		 *
1510 		 * We now need to adjust that based on where we now are
1511 		 * actually putting the brk.
1512 		 */
1513 		if (oend > base)
1514 			*brksize = oend - base;
1515 		else
1516 			*brksize = 0;
1517 	}
1518 
1519 	return (0);
1520 bad:
1521 	if (error == 0)
1522 		error = EINVAL;
1523 	return (error);
1524 }
1525 
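/*
 * Append a single "CORE" note (header, name and padded descriptor) to the
 * core file at *offsetp, advancing the offset past the data written.
 */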
1526 int
1527 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1528     rlim64_t rlimit, cred_t *credp)
1529 {
1530 	Note note;
1531 	int error;
1532 
1533 	bzero(&note, sizeof (note));
1534 	bcopy("CORE", note.name, 4);
1535 	note.nhdr.n_type = type;
1536 	/*
1537 	 * The System V ABI states that n_namesz must be the length of the
1538 	 * string that follows the Nhdr structure including the terminating
1539 	 * null. The ABI also specifies that sufficient padding should be
1540 	 * included so that the description that follows the name string
1541 	 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1542 	 * respectively. However, since this change was not made correctly
1543 	 * at the time of the 64-bit port, both 32- and 64-bit binaries
1544 	 * descriptions are only guaranteed to begin on a 4-byte boundary.
1545 	 */
1546 	note.nhdr.n_namesz = 5;
1547 	note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1548 
1549 	if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1550 	    sizeof (note), rlimit, credp))
1551 		return (error);
1552 
1553 	*offsetp += sizeof (note);
1554 
1555 	if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1556 	    note.nhdr.n_descsz, rlimit, credp))
1557 		return (error);
1558 
1559 	*offsetp += note.nhdr.n_descsz;
1560 	return (0);
1561 }
1562 
1563 /*
1564  * Copy the section data from one vnode to the section of another vnode.
1565  */
1566 static void
1567 copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
1568     void *buf, size_t size, cred_t *credp, rlim64_t rlimit)
1569 {
1570 	ssize_t resid;
1571 	size_t len, n = src->sh_size;
1572 	offset_t off = 0;
1573 
1574 	while (n != 0) {
1575 		len = MIN(size, n);
1576 		if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off,
1577 		    UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1578 		    resid >= len ||
1579 		    core_write(dst_vp, UIO_SYSSPACE, *doffset + off,
1580 		    buf, len - resid, rlimit, credp) != 0) {
1581 			dst->sh_size = 0;
1582 			dst->sh_offset = 0;
1583 			return;
1584 		}
1585 
1586 		ASSERT(n >= len - resid);
1587 
1588 		n -= len - resid;
1589 		off += len - resid;
1590 	}
1591 
1592 	*doffset += src->sh_size;
1593 }
1594 
1595 #ifdef _ELF32_COMPAT
1596 extern size_t elf_datasz_max;
1597 #else
1598 size_t elf_datasz_max = 1 * 1024 * 1024;
1599 #endif
1600 
1601 /*
1602  * This function processes mappings that correspond to load objects to
1603  * examine their respective sections for elfcore(). It's called once with
1604  * v set to NULL to count the number of sections that we're going to need
1605  * and then again with v set to some allocated buffer that we fill in with
1606  * all the section data.
1607  */
1608 static int
1609 process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
1610     Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp)
1611 {
1612 	vnode_t *lastvp = NULL;
1613 	struct seg *seg;
1614 	int i, j;
1615 	void *data = NULL;
1616 	size_t datasz = 0;
1617 	shstrtab_t shstrtab;
1618 	struct as *as = p->p_as;
1619 	int error = 0;
1620 
1621 	if (v != NULL)
1622 		shstrtab_init(&shstrtab);
1623 
1624 	i = 1;
1625 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1626 		uint_t prot;
1627 		vnode_t *mvp;
1628 		void *tmp = NULL;
1629 		caddr_t saddr = seg->s_base;
1630 		caddr_t naddr;
1631 		caddr_t eaddr;
1632 		size_t segsize;
1633 
1634 		Ehdr ehdr;
1635 		int nshdrs, shstrndx, nphdrs;
1636 		caddr_t shbase;
1637 		ssize_t shsize;
1638 		char *shstrbase;
1639 		ssize_t shstrsize;
1640 
1641 		Shdr *shdr;
1642 		const char *name;
1643 		size_t sz;
1644 		uintptr_t off;
1645 
1646 		int ctf_ndx = 0;
1647 		int symtab_ndx = 0;
1648 
1649 		/*
1650 		 * Since we're just looking for text segments of load
1651 		 * objects, we only care about the protection bits; we don't
1652 		 * care about the actual size of the segment so we use the
1653 		 * reserved size. If the segment's size is zero, there's
1654 		 * something fishy going on so we ignore this segment.
1655 		 */
1656 		if (seg->s_ops != &segvn_ops ||
1657 		    SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
1658 		    mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
1659 		    (segsize = pr_getsegsize(seg, 1)) == 0)
1660 			continue;
1661 
1662 		eaddr = saddr + segsize;
1663 		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
1664 		pr_getprot_done(&tmp);
1665 
1666 		/*
1667 		 * Skip this segment unless the protection bits look like
1668 		 * what we'd expect for a text segment.
1669 		 */
1670 		if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
1671 			continue;
1672 
1673 		if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx,
1674 		    &nphdrs) != 0 ||
1675 		    getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx,
1676 		    &shbase, &shsize, &shstrbase, &shstrsize) != 0)
1677 			continue;
1678 
1679 		off = ehdr.e_shentsize;
1680 		for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) {
1681 			Shdr *symtab = NULL, *strtab;
1682 
1683 			shdr = (Shdr *)(shbase + off);
1684 
1685 			if (shdr->sh_name >= shstrsize)
1686 				continue;
1687 
1688 			name = shstrbase + shdr->sh_name;
1689 
1690 			if (strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1691 				if ((content & CC_CONTENT_CTF) == 0 ||
1692 				    ctf_ndx != 0)
1693 					continue;
1694 
1695 				if (shdr->sh_link > 0 &&
1696 				    shdr->sh_link < nshdrs) {
1697 					symtab = (Shdr *)(shbase +
1698 					    shdr->sh_link * ehdr.e_shentsize);
1699 				}
1700 
1701 				if (v != NULL && i < nv - 1) {
1702 					if (shdr->sh_size > datasz &&
1703 					    shdr->sh_size <= elf_datasz_max) {
1704 						if (data != NULL)
1705 							kmem_free(data, datasz);
1706 
1707 						datasz = shdr->sh_size;
1708 						data = kmem_alloc(datasz,
1709 						    KM_SLEEP);
1710 					}
1711 
1712 					v[i].sh_name = shstrtab_ndx(&shstrtab,
1713 					    STR_CTF);
1714 					v[i].sh_addr = (Addr)(uintptr_t)saddr;
1715 					v[i].sh_type = SHT_PROGBITS;
1716 					v[i].sh_addralign = 4;
1717 					*doffsetp = roundup(*doffsetp,
1718 					    v[i].sh_addralign);
1719 					v[i].sh_offset = *doffsetp;
1720 					v[i].sh_size = shdr->sh_size;
1721 					if (symtab == NULL)  {
1722 						v[i].sh_link = 0;
1723 					} else if (symtab->sh_type ==
1724 					    SHT_SYMTAB &&
1725 					    symtab_ndx != 0) {
1726 						v[i].sh_link =
1727 						    symtab_ndx;
1728 					} else {
1729 						v[i].sh_link = i + 1;
1730 					}
1731 
1732 					copy_scn(shdr, mvp, &v[i], vp,
1733 					    doffsetp, data, datasz, credp,
1734 					    rlimit);
1735 				}
1736 
1737 				ctf_ndx = i++;
1738 
1739 				/*
1740 				 * We've already dumped the symtab.
1741 				 */
1742 				if (symtab != NULL &&
1743 				    symtab->sh_type == SHT_SYMTAB &&
1744 				    symtab_ndx != 0)
1745 					continue;
1746 
1747 			} else if (strcmp(name,
1748 			    shstrtab_data[STR_SYMTAB]) == 0) {
1749 				if ((content & CC_CONTENT_SYMTAB) == 0 ||
1750 				    symtab != 0)
1751 					continue;
1752 
1753 				symtab = shdr;
1754 			}
1755 
1756 			if (symtab != NULL) {
1757 				if ((symtab->sh_type != SHT_DYNSYM &&
1758 				    symtab->sh_type != SHT_SYMTAB) ||
1759 				    symtab->sh_link == 0 ||
1760 				    symtab->sh_link >= nshdrs)
1761 					continue;
1762 
1763 				strtab = (Shdr *)(shbase +
1764 				    symtab->sh_link * ehdr.e_shentsize);
1765 
1766 				if (strtab->sh_type != SHT_STRTAB)
1767 					continue;
1768 
1769 				if (v != NULL && i < nv - 2) {
1770 					sz = MAX(symtab->sh_size,
1771 					    strtab->sh_size);
1772 					if (sz > datasz &&
1773 					    sz <= elf_datasz_max) {
1774 						if (data != NULL)
1775 							kmem_free(data, datasz);
1776 
1777 						datasz = sz;
1778 						data = kmem_alloc(datasz,
1779 						    KM_SLEEP);
1780 					}
1781 
1782 					if (symtab->sh_type == SHT_DYNSYM) {
1783 						v[i].sh_name = shstrtab_ndx(
1784 						    &shstrtab, STR_DYNSYM);
1785 						v[i + 1].sh_name = shstrtab_ndx(
1786 						    &shstrtab, STR_DYNSTR);
1787 					} else {
1788 						v[i].sh_name = shstrtab_ndx(
1789 						    &shstrtab, STR_SYMTAB);
1790 						v[i + 1].sh_name = shstrtab_ndx(
1791 						    &shstrtab, STR_STRTAB);
1792 					}
1793 
1794 					v[i].sh_type = symtab->sh_type;
1795 					v[i].sh_addr = symtab->sh_addr;
1796 					if (ehdr.e_type == ET_DYN ||
1797 					    v[i].sh_addr == 0)
1798 						v[i].sh_addr +=
1799 						    (Addr)(uintptr_t)saddr;
1800 					v[i].sh_addralign =
1801 					    symtab->sh_addralign;
1802 					*doffsetp = roundup(*doffsetp,
1803 					    v[i].sh_addralign);
1804 					v[i].sh_offset = *doffsetp;
1805 					v[i].sh_size = symtab->sh_size;
1806 					v[i].sh_link = i + 1;
1807 					v[i].sh_entsize = symtab->sh_entsize;
1808 					v[i].sh_info = symtab->sh_info;
1809 
1810 					copy_scn(symtab, mvp, &v[i], vp,
1811 					    doffsetp, data, datasz, credp,
1812 					    rlimit);
1813 
1814 					v[i + 1].sh_type = SHT_STRTAB;
1815 					v[i + 1].sh_flags = SHF_STRINGS;
1816 					v[i + 1].sh_addr = symtab->sh_addr;
1817 					if (ehdr.e_type == ET_DYN ||
1818 					    v[i + 1].sh_addr == 0)
1819 						v[i + 1].sh_addr +=
1820 						    (Addr)(uintptr_t)saddr;
1821 					v[i + 1].sh_addralign =
1822 					    strtab->sh_addralign;
1823 					*doffsetp = roundup(*doffsetp,
1824 					    v[i + 1].sh_addralign);
1825 					v[i + 1].sh_offset = *doffsetp;
1826 					v[i + 1].sh_size = strtab->sh_size;
1827 
1828 					copy_scn(strtab, mvp, &v[i + 1], vp,
1829 					    doffsetp, data, datasz, credp,
1830 					    rlimit);
1831 				}
1832 
1833 				if (symtab->sh_type == SHT_SYMTAB)
1834 					symtab_ndx = i;
1835 				i += 2;
1836 			}
1837 		}
1838 
1839 		kmem_free(shstrbase, shstrsize);
1840 		kmem_free(shbase, shsize);
1841 
1842 		lastvp = mvp;
1843 	}
1844 
1845 	if (v == NULL) {
1846 		if (i == 1)
1847 			*nshdrsp = 0;
1848 		else
1849 			*nshdrsp = i + 1;
1850 		goto done;
1851 	}
1852 
1853 	if (i != nv - 1) {
1854 		cmn_err(CE_WARN, "elfcore: core dump failed for "
1855 		    "process %d; address space is changing", p->p_pid);
1856 		error = EIO;
1857 		goto done;
1858 	}
1859 
1860 	v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
1861 	v[i].sh_size = shstrtab_size(&shstrtab);
1862 	v[i].sh_addralign = 1;
1863 	*doffsetp = roundup(*doffsetp, v[i].sh_addralign);
1864 	v[i].sh_offset = *doffsetp;
1865 	v[i].sh_flags = SHF_STRINGS;
1866 	v[i].sh_type = SHT_STRTAB;
1867 
1868 	if (v[i].sh_size > datasz) {
1869 		if (data != NULL)
1870 			kmem_free(data, datasz);
1871 
1872 		datasz = v[i].sh_size;
1873 		data = kmem_alloc(datasz,
1874 		    KM_SLEEP);
1875 	}
1876 
1877 	shstrtab_dump(&shstrtab, data);
1878 
1879 	if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp,
1880 	    data, v[i].sh_size, rlimit, credp)) != 0)
1881 		goto done;
1882 
1883 	*doffsetp += v[i].sh_size;
1884 
1885 done:
1886 	if (data != NULL)
1887 		kmem_free(data, datasz);
1888 
1889 	return (error);
1890 }
1891 
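/*
 * Write an ELF core file describing process p to the vnode vp, subject to
 * the given resource limit and the requested core content.
 */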
1892 int
1893 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
1894     core_content_t content)
1895 {
1896 	offset_t poffset, soffset;
1897 	Off doffset;
1898 	int error, i, nphdrs, nshdrs;
1899 	int overflow = 0;
1900 	struct seg *seg;
1901 	struct as *as = p->p_as;
1902 	union {
1903 		Ehdr ehdr;
1904 		Phdr phdr[1];
1905 		Shdr shdr[1];
1906 	} *bigwad;
1907 	size_t bigsize;
1908 	size_t phdrsz, shdrsz;
1909 	Ehdr *ehdr;
1910 	Phdr *v;
1911 	caddr_t brkbase;
1912 	size_t brksize;
1913 	caddr_t stkbase;
1914 	size_t stksize;
1915 	int ntries = 0;
1916 	klwp_t *lwp = ttolwp(curthread);
1917 
1918 top:
1919 	/*
1920 	 * Make sure we have everything we need (registers, etc.).
1921 	 * All other lwps have already stopped and are in an orderly state.
1922 	 */
1923 	ASSERT(p == ttoproc(curthread));
1924 	prstop(0, 0);
1925 
1926 	AS_LOCK_ENTER(as, RW_WRITER);
1927 	nphdrs = prnsegs(as, 0) + 2;		/* two CORE note sections */
1928 
1929 	/*
1930 	 * Count the number of section headers we're going to need.
1931 	 */
1932 	nshdrs = 0;
1933 	if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
1934 		(void) process_scns(content, p, credp, NULL, NULL, NULL, 0,
1935 		    NULL, &nshdrs);
1936 	}
1937 	AS_LOCK_EXIT(as);
1938 
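	/*
	 * process_scns() yields either zero or a count large enough for at
	 * least one symtab/strtab pair plus the .shstrtab section, so a
	 * count of exactly one is impossible.
	 */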
1939 	ASSERT(nshdrs == 0 || nshdrs > 1);
1940 
1941 	/*
1942 	 * The core file contents may require zero section headers, but if
1943 	 * we overflow the 16 bits allotted to the program header count in
1944 	 * the ELF header, we'll need that section header at index zero.
1945 	 */
1946 	if (nshdrs == 0 && nphdrs >= PN_XNUM)
1947 		nshdrs = 1;
1948 
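	/*
	 * A single allocation, sized for the largest of the ELF header,
	 * the program header array, and the section header array, is
	 * reused for all three as the file is written out.
	 */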
1949 	phdrsz = nphdrs * sizeof (Phdr);
1950 	shdrsz = nshdrs * sizeof (Shdr);
1951 
1952 	bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz));
1953 	bigwad = kmem_alloc(bigsize, KM_SLEEP);
1954 
1955 	ehdr = &bigwad->ehdr;
1956 	bzero(ehdr, sizeof (*ehdr));
1957 
1958 	ehdr->e_ident[EI_MAG0] = ELFMAG0;
1959 	ehdr->e_ident[EI_MAG1] = ELFMAG1;
1960 	ehdr->e_ident[EI_MAG2] = ELFMAG2;
1961 	ehdr->e_ident[EI_MAG3] = ELFMAG3;
1962 	ehdr->e_ident[EI_CLASS] = ELFCLASS;
1963 	ehdr->e_type = ET_CORE;
1964 
1965 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1966 
1967 #if defined(__sparc)
1968 	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1969 	ehdr->e_machine = EM_SPARC;
1970 #elif defined(__i386) || defined(__i386_COMPAT)
1971 	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1972 	ehdr->e_machine = EM_386;
1973 #else
1974 #error "no recognized machine type is defined"
1975 #endif
1976 
1977 #else	/* !defined(_LP64) || defined(_ELF32_COMPAT) */
1978 
1979 #if defined(__sparc)
1980 	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1981 	ehdr->e_machine = EM_SPARCV9;
1982 #elif defined(__amd64)
1983 	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1984 	ehdr->e_machine = EM_AMD64;
1985 #else
1986 #error "no recognized 64-bit machine type is defined"
1987 #endif
1988 
1989 #endif	/* !defined(_LP64) || defined(_ELF32_COMPAT) */
1990 
1991 	/*
1992 	 * If the count of program headers or section headers or the index
1993 	 * of the section string table can't fit in the mere 16 bits
1994 	 * shortsightedly allotted to them in the ELF header, we use the
1995 	 * extended formats and put the real values in the section header
1996 	 * as index 0.
1997 	 */
1998 	ehdr->e_version = EV_CURRENT;
1999 	ehdr->e_ehsize = sizeof (Ehdr);
2000 
2001 	if (nphdrs >= PN_XNUM)
2002 		ehdr->e_phnum = PN_XNUM;
2003 	else
2004 		ehdr->e_phnum = (unsigned short)nphdrs;
2005 
2006 	ehdr->e_phoff = sizeof (Ehdr);
2007 	ehdr->e_phentsize = sizeof (Phdr);
2008 
2009 	if (nshdrs > 0) {
2010 		if (nshdrs >= SHN_LORESERVE)
2011 			ehdr->e_shnum = 0;
2012 		else
2013 			ehdr->e_shnum = (unsigned short)nshdrs;
2014 
2015 		if (nshdrs - 1 >= SHN_LORESERVE)
2016 			ehdr->e_shstrndx = SHN_XINDEX;
2017 		else
2018 			ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2019 
2020 		ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs;
2021 		ehdr->e_shentsize = sizeof (Shdr);
2022 	}
2023 
2024 	if ((error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2025 	    sizeof (Ehdr), rlimit, credp)) != 0)
2026 		goto done;
2027 
2028 	poffset = sizeof (Ehdr);
2029 	soffset = sizeof (Ehdr) + phdrsz;
2030 	doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2031 
2032 	v = &bigwad->phdr[0];
2033 	bzero(v, phdrsz);
2034 
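	/*
	 * Program headers 0 and 1 are the two note segments counted above;
	 * reserve word-aligned file space for their data here and write
	 * the data itself further below.
	 */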
2035 	setup_old_note_header(&v[0], p);
2036 	v[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2037 	doffset += v[0].p_filesz;
2038 
2039 	setup_note_header(&v[1], p);
2040 	v[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2041 	doffset += v[1].p_filesz;
2042 
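	/*
	 * Snapshot the heap and stack bounds under p_lock so the mapping
	 * classification below sees a consistent view of both.
	 */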
2043 	mutex_enter(&p->p_lock);
2044 
2045 	brkbase = p->p_brkbase;
2046 	brksize = p->p_brksize;
2047 
2048 	stkbase = p->p_usrstack - p->p_stksize;
2049 	stksize = p->p_stksize;
2050 
2051 	mutex_exit(&p->p_lock);
2052 
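	/*
	 * Walk every mapping in the address space, building a PT_LOAD
	 * program header for each range and deciding below which ones
	 * actually contribute data to the core file.
	 */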
2053 	AS_LOCK_ENTER(as, RW_WRITER);
2054 	i = 2;
2055 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2056 		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2057 		caddr_t saddr, naddr;
2058 		void *tmp = NULL;
2059 		extern struct seg_ops segspt_shmops;
2060 
2061 		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2062 			uint_t prot;
2063 			size_t size;
2064 			int type;
2065 			vnode_t *mvp;
2066 
2067 			prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2068 			prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2069 			if ((size = (size_t)(naddr - saddr)) == 0)
2070 				continue;
2071 			if (i == nphdrs) {
2072 				overflow++;
2073 				continue;
2074 			}
2075 			v[i].p_type = PT_LOAD;
2076 			v[i].p_vaddr = (Addr)(uintptr_t)saddr;
2077 			v[i].p_memsz = size;
2078 			if (prot & PROT_READ)
2079 				v[i].p_flags |= PF_R;
2080 			if (prot & PROT_WRITE)
2081 				v[i].p_flags |= PF_W;
2082 			if (prot & PROT_EXEC)
2083 				v[i].p_flags |= PF_X;
2084 
2085 			/*
2086 			 * Figure out which mappings to include in the core.
2087 			 */
2088 			type = SEGOP_GETTYPE(seg, saddr);
2089 
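			/*
			 * Each class of mapping (stack, heap, ISM/DISM,
			 * SysV shared memory, shared anonymous and file
			 * mappings, private anonymous memory, text,
			 * read-only data, and writable data) is included
			 * only if the corresponding CC_CONTENT_* bit is
			 * set in the requested content mask.
			 */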
2090 			if (saddr == stkbase && size == stksize) {
2091 				if (!(content & CC_CONTENT_STACK))
2092 					goto exclude;
2093 
2094 			} else if (saddr == brkbase && size == brksize) {
2095 				if (!(content & CC_CONTENT_HEAP))
2096 					goto exclude;
2097 
2098 			} else if (seg->s_ops == &segspt_shmops) {
2099 				if (type & MAP_NORESERVE) {
2100 					if (!(content & CC_CONTENT_DISM))
2101 						goto exclude;
2102 				} else {
2103 					if (!(content & CC_CONTENT_ISM))
2104 						goto exclude;
2105 				}
2106 
2107 			} else if (seg->s_ops != &segvn_ops) {
2108 				goto exclude;
2109 
2110 			} else if (type & MAP_SHARED) {
2111 				if (shmgetid(p, saddr) != SHMID_NONE) {
2112 					if (!(content & CC_CONTENT_SHM))
2113 						goto exclude;
2114 
2115 				} else if (SEGOP_GETVP(seg, seg->s_base,
2116 				    &mvp) != 0 || mvp == NULL ||
2117 				    mvp->v_type != VREG) {
2118 					if (!(content & CC_CONTENT_SHANON))
2119 						goto exclude;
2120 
2121 				} else {
2122 					if (!(content & CC_CONTENT_SHFILE))
2123 						goto exclude;
2124 				}
2125 
2126 			} else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2127 			    mvp == NULL || mvp->v_type != VREG) {
2128 				if (!(content & CC_CONTENT_ANON))
2129 					goto exclude;
2130 
2131 			} else if (prot == (PROT_READ | PROT_EXEC)) {
2132 				if (!(content & CC_CONTENT_TEXT))
2133 					goto exclude;
2134 
2135 			} else if (prot == PROT_READ) {
2136 				if (!(content & CC_CONTENT_RODATA))
2137 					goto exclude;
2138 
2139 			} else {
2140 				if (!(content & CC_CONTENT_DATA))
2141 					goto exclude;
2142 			}
2143 
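			/*
			 * Included mappings get word-aligned file space;
			 * excluded ones keep p_filesz == 0 but still get a
			 * program header.
			 */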
2144 			doffset = roundup(doffset, sizeof (Word));
2145 			v[i].p_offset = doffset;
2146 			v[i].p_filesz = size;
2147 			doffset += size;
2148 exclude:
2149 			i++;
2150 		}
2151 		ASSERT(tmp == NULL);
2152 	}
2153 	AS_LOCK_EXIT(as);
2154 
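	/*
	 * If the number of mappings no longer matches what the counting
	 * pass saw, the address space changed underneath us; retry once
	 * from the top before giving up.
	 */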
2155 	if (overflow || i != nphdrs) {
2156 		if (ntries++ == 0) {
2157 			kmem_free(bigwad, bigsize);
2158 			overflow = 0;
2159 			goto top;
2160 		}
2161 		cmn_err(CE_WARN, "elfcore: core dump failed for "
2162 		    "process %d; address space is changing", p->p_pid);
2163 		error = EIO;
2164 		goto done;
2165 	}
2166 
2167 	if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2168 	    v, phdrsz, rlimit, credp)) != 0)
2169 		goto done;
2170 
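	/*
	 * Write both note segments: the old-format notes first, then the
	 * current-format notes that reflect the requested core content.
	 */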
2171 	if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit,
2172 	    credp)) != 0)
2173 		goto done;
2174 
2175 	if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit,
2176 	    credp, content)) != 0)
2177 		goto done;
2178 
2179 	for (i = 2; i < nphdrs; i++) {
2180 		prkillinfo_t killinfo;
2181 		sigqueue_t *sq;
2182 		int sig, j;
2183 
2184 		if (v[i].p_filesz == 0)
2185 			continue;
2186 
2187 		/*
2188 		 * If dumping out this segment fails, rather than failing
2189 		 * the core dump entirely, we reset the size of the mapping
2190 		 * to zero to indicate that the data is absent from the core
2191 		 * file and OR in the PF_SUNW_FAILURE flag to differentiate
2192 		 * this from mappings that were excluded due to the core file
2193 		 * content settings.
2194 		 */
2195 		if ((error = core_seg(p, vp, v[i].p_offset,
2196 		    (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz,
2197 		    rlimit, credp)) == 0) {
2198 			continue;
2199 		}
2200 
2201 		if ((sig = lwp->lwp_cursig) == 0) {
2202 			/*
2203 			 * We failed due to something other than a signal.
2204 			 * Since the space reserved for the segment is now
2205 			 * unused, we stash the errno in the first four
2206 			 * bytes. This undocumented interface will let us
2207 			 * understand the nature of the failure.
2208 			 */
2209 			(void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2210 			    &error, sizeof (error), rlimit, credp);
2211 
2212 			v[i].p_filesz = 0;
2213 			v[i].p_flags |= PF_SUNW_FAILURE;
2214 			if ((error = core_write(vp, UIO_SYSSPACE,
2215 			    poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]),
2216 			    rlimit, credp)) != 0)
2217 				goto done;
2218 
2219 			continue;
2220 		}
2221 
2222 		/*
2223 		 * We took a signal.  We want to abort the dump entirely, but
2224 		 * we also want to indicate what failed and why.  We therefore
2225 		 * use the space reserved for the first failing segment to
2226 		 * write our error (which, for purposes of compatibility with
2227 		 * older core dump readers, we set to EINTR) followed by any
2228 		 * siginfo associated with the signal.
2229 		 */
2230 		bzero(&killinfo, sizeof (killinfo));
2231 		killinfo.prk_error = EINTR;
2232 
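		/*
		 * SIGKILL's queued siginfo is kept on the process
		 * (p_killsqp); any other signal's siginfo rides on the lwp.
		 */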
2233 		sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2234 
2235 		if (sq != NULL) {
2236 			bcopy(&sq->sq_info, &killinfo.prk_info,
2237 			    sizeof (sq->sq_info));
2238 		} else {
2239 			killinfo.prk_info.si_signo = lwp->lwp_cursig;
2240 			killinfo.prk_info.si_code = SI_NOINFO;
2241 		}
2242 
2243 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2244 		/*
2245 		 * If this is a 32-bit process, we need to translate from the
2246 		 * native siginfo to the 32-bit variant.  (Core readers must
2247 		 * always have the same data model as their target or must
2248 		 * be aware of -- and compensate for -- data model differences.)
2249 		 */
2250 		if (curproc->p_model == DATAMODEL_ILP32) {
2251 			siginfo32_t si32;
2252 
2253 			siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2254 			bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2255 		}
2256 #endif
2257 
2258 		(void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2259 		    &killinfo, sizeof (killinfo), rlimit, credp);
2260 
2261 		/*
2262 		 * For the segment on which we took the signal, indicate that
2263 		 * its data now refers to a siginfo.
2264 		 */
2265 		v[i].p_filesz = 0;
2266 		v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2267 		    PF_SUNW_SIGINFO;
2268 
2269 		/*
2270 		 * And for every other segment, indicate that its absence
2271 		 * is due to a signal.
2272 		 */
2273 		for (j = i + 1; j < nphdrs; j++) {
2274 			v[j].p_filesz = 0;
2275 			v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2276 		}
2277 
2278 		/*
2279 		 * Finally, write out our modified program headers.
2280 		 */
2281 		if ((error = core_write(vp, UIO_SYSSPACE,
2282 		    poffset + sizeof (v[i]) * i, &v[i],
2283 		    sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0)
2284 			goto done;
2285 
2286 		break;
2287 	}
2288 
2289 	if (nshdrs > 0) {
2290 		bzero(&bigwad->shdr[0], shdrsz);
2291 
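		/*
		 * Section header 0 carries the extended values that did
		 * not fit in the ELF header: the real section count in
		 * sh_size, the real string-table index in sh_link, and the
		 * real program header count in sh_info.
		 */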
2292 		if (nshdrs >= SHN_LORESERVE)
2293 			bigwad->shdr[0].sh_size = nshdrs;
2294 
2295 		if (nshdrs - 1 >= SHN_LORESERVE)
2296 			bigwad->shdr[0].sh_link = nshdrs - 1;
2297 
2298 		if (nphdrs >= PN_XNUM)
2299 			bigwad->shdr[0].sh_info = nphdrs;
2300 
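		/*
		 * With the address space locked again, process_scns() fills
		 * in the remaining section headers and writes their data,
		 * advancing doffset past each section's contents.
		 */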
2301 		if (nshdrs > 1) {
2302 			AS_LOCK_ENTER(as, RW_WRITER);
2303 			if ((error = process_scns(content, p, credp, vp,
2304 			    &bigwad->shdr[0], nshdrs, rlimit, &doffset,
2305 			    NULL)) != 0) {
2306 				AS_LOCK_EXIT(as);
2307 				goto done;
2308 			}
2309 			AS_LOCK_EXIT(as);
2310 		}
2311 
2312 		if ((error = core_write(vp, UIO_SYSSPACE, soffset,
2313 		    &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0)
2314 			goto done;
2315 	}
2316 
2317 done:
2318 	kmem_free(bigwad, bigsize);
2319 	return (error);
2320 }
2321 
2322 #ifndef	_ELF32_COMPAT
2323 
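/*
 * Exec switch and module linkage: register elfexec() and elfcore() as the
 * handlers for native ELF objects and, in the 64-bit kernel, register the
 * 32-bit compatibility handlers as well.
 */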
2324 static struct execsw esw = {
2325 #ifdef	_LP64
2326 	elf64magicstr,
2327 #else	/* _LP64 */
2328 	elf32magicstr,
2329 #endif	/* _LP64 */
2330 	0,
2331 	5,
2332 	elfexec,
2333 	elfcore
2334 };
2335 
2336 static struct modlexec modlexec = {
2337 	&mod_execops, "exec module for elf", &esw
2338 };
2339 
2340 #ifdef	_LP64
2341 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2342 			intpdata_t *idatap, int level, long *execsz,
2343 			int setid, caddr_t exec_file, cred_t *cred,
2344 			int brand_action);
2345 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2346 			rlim64_t rlimit, int sig, core_content_t content);
2347 
2348 static struct execsw esw32 = {
2349 	elf32magicstr,
2350 	0,
2351 	5,
2352 	elf32exec,
2353 	elf32core
2354 };
2355 
2356 static struct modlexec modlexec32 = {
2357 	&mod_execops, "32-bit exec module for elf", &esw32
2358 };
2359 #endif	/* _LP64 */
2360 
2361 static struct modlinkage modlinkage = {
2362 	MODREV_1,
2363 	(void *)&modlexec,
2364 #ifdef	_LP64
2365 	(void *)&modlexec32,
2366 #endif	/* _LP64 */
2367 	NULL
2368 };
2369 
2370 int
2371 _init(void)
2372 {
2373 	return (mod_install(&modlinkage));
2374 }
2375 
2376 int
2377 _fini(void)
2378 {
2379 	return (mod_remove(&modlinkage));
2380 }
2381 
2382 int
2383 _info(struct modinfo *modinfop)
2384 {
2385 	return (mod_info(&modlinkage, modinfop));
2386 }
2387 
2388 #endif	/* !_ELF32_COMPAT */
2389