xref: /illumos-gate/usr/src/uts/common/exec/elf/elf.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	   All Rights Reserved	*/
28 /*
29  * Copyright 2019, Joyent, Inc.
30  * Copyright 2022 Oxide Computer Company
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/thread.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/kmem.h>
44 #include <sys/proc.h>
45 #include <sys/pathname.h>
46 #include <sys/policy.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/elf.h>
50 #include <sys/vmsystm.h>
51 #include <sys/debug.h>
52 #include <sys/auxv.h>
53 #include <sys/exec.h>
54 #include <sys/prsystm.h>
55 #include <vm/as.h>
56 #include <vm/rm.h>
57 #include <vm/seg.h>
58 #include <vm/seg_vn.h>
59 #include <sys/modctl.h>
60 #include <sys/systeminfo.h>
61 #include <sys/vmparam.h>
62 #include <sys/machelf.h>
63 #include <sys/shm_impl.h>
64 #include <sys/archsystm.h>
65 #include <sys/fasttrap.h>
66 #include <sys/brand.h>
67 #include "elf_impl.h"
68 #include <sys/sdt.h>
69 #include <sys/siginfo.h>
70 #include <sys/random.h>
71 
72 #include <core_shstrtab.h>
73 
74 #if defined(__x86)
75 #include <sys/comm_page_util.h>
76 #include <sys/fp.h>
77 #endif /* defined(__x86) */
78 
79 
80 extern int at_flags;
81 extern volatile size_t aslr_max_brk_skew;
82 
83 #define	ORIGIN_STR	"ORIGIN"
84 #define	ORIGIN_STR_SIZE	6
85 
86 static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
87     uint_t *);
88 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
89     size_t *);
90 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
91     caddr_t *, size_t *, caddr_t *, size_t *);
92 static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
93 static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
94     Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
95     size_t, size_t *, size_t *);
96 
97 
98 #ifdef _ELF32_COMPAT
99 /* Link against the non-compat instances when compiling the 32-bit version. */
100 extern size_t elf_datasz_max;
101 extern size_t elf_zeropg_sz;
102 extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
103 extern uint_t elf_nphdr_max;
104 extern uint_t elf_nshdr_max;
105 extern size_t elf_shstrtab_max;
106 #else
107 size_t elf_datasz_max = 1 * 1024 * 1024;
108 size_t elf_zeropg_sz = 4 * 1024;
109 uint_t elf_nphdr_max = 1000;
110 uint_t elf_nshdr_max = 10000;
111 size_t elf_shstrtab_max = 100 * 1024;
112 #endif
113 
114 static int
115 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
116 {
117 	ASSERT(phdrp->p_type == PT_SUNWDTRACE);
118 
119 	/*
120 	 * See the comment in fasttrap.h for information on how to safely
121 	 * update this program header.
122 	 */
123 	if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
124 	    (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
125 		return (-1);
126 
127 	args->thrptr = phdrp->p_vaddr + base;
128 
129 	return (0);
130 }
131 
132 static int
133 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
134 {
135 	uint_t flag;
136 
137 	switch (dt) {
138 	case DT_SUNW_ASLR:
139 		flag = PROC_SEC_ASLR;
140 		break;
141 	default:
142 		return (EINVAL);
143 	}
144 
145 	if (val == 0) {
146 		if (secflag_isset(p->p_secflags.psf_lower, flag))
147 			return (EPERM);
148 		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
149 		    secflag_isset(p->p_secflags.psf_inherit, flag))
150 			return (EPERM);
151 
152 		secflag_clear(&p->p_secflags.psf_effective, flag);
153 	} else {
154 		if (!secflag_isset(p->p_secflags.psf_upper, flag))
155 			return (EPERM);
156 
157 		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
158 		    !secflag_isset(p->p_secflags.psf_inherit, flag))
159 			return (EPERM);
160 
161 		secflag_set(&p->p_secflags.psf_effective, flag);
162 	}
163 
164 	return (0);
165 }
166 
167 #ifndef _ELF32_COMPAT
168 void
169 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
170 {
171 	size_t target = MIN(sz, elf_datasz_max);
172 
173 	if (target > ctx->ecc_bufsz) {
174 		if (ctx->ecc_buf != NULL) {
175 			kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
176 		}
177 		ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
178 		ctx->ecc_bufsz = target;
179 	}
180 }
181 #endif /* _ELF32_COMPAT */
182 
183 /*
184  * Map in the executable pointed to by vp. Returns 0 on success.
185  */
186 int
187 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
188     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
189     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
190 {
191 	size_t		len, phdrsize;
192 	struct vattr	vat;
193 	caddr_t		phdrbase = NULL;
194 	uint_t		nshdrs, shstrndx, nphdrs;
195 	int		error = 0;
196 	Phdr		*uphdr = NULL;
197 	Phdr		*junk = NULL;
198 	Phdr		*dynphdr = NULL;
199 	Phdr		*dtrphdr = NULL;
200 	uintptr_t	lddata, minaddr;
201 	size_t		execsz;
202 
203 	if (lddatap != NULL)
204 		*lddatap = 0;
205 
206 	if (error = execpermissions(vp, &vat, args)) {
207 		uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
208 		return (error);
209 	}
210 
211 	if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
212 	    &nphdrs)) != 0 ||
213 	    (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
214 	    &phdrsize)) != 0) {
215 		uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
216 		return (error);
217 	}
218 
219 	if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
220 		uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
221 		kmem_free(phdrbase, phdrsize);
222 		return (ENOEXEC);
223 	}
224 	if (lddatap != NULL)
225 		*lddatap = lddata;
226 
227 	if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
228 	    &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
229 	    len, &execsz, brksize)) {
230 		uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
231 		if (uphdr != NULL && uphdr->p_flags == 0)
232 			kmem_free(uphdr, sizeof (Phdr));
233 		kmem_free(phdrbase, phdrsize);
234 		return (error);
235 	}
236 
237 	/*
238 	 * Inform our caller if the executable needs an interpreter.
239 	 */
240 	*interp = (dynphdr == NULL) ? 0 : 1;
241 
242 	/*
243 	 * If this is a statically linked executable, voffset should indicate
244 	 * the address of the executable itself (it normally holds the address
245 	 * of the interpreter).
246 	 */
247 	if (ehdr->e_type == ET_EXEC && *interp == 0)
248 		*voffset = minaddr;
249 
250 	if (uphdr != NULL) {
251 		*uphdr_vaddr = uphdr->p_vaddr;
252 
253 		if (uphdr->p_flags == 0)
254 			kmem_free(uphdr, sizeof (Phdr));
255 	} else {
256 		*uphdr_vaddr = (Addr)-1;
257 	}
258 
259 	kmem_free(phdrbase, phdrsize);
260 	return (error);
261 }
262 
263 int
264 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
265     int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
266     int brand_action)
267 {
268 	caddr_t		phdrbase = NULL;
269 	caddr_t		bssbase = 0;
270 	caddr_t		brkbase = 0;
271 	size_t		brksize = 0;
272 	size_t		dlnsize;
273 	aux_entry_t	*aux;
274 	int		error;
275 	ssize_t		resid;
276 	int		fd = -1;
277 	intptr_t	voffset;
278 	Phdr		*intphdr = NULL;
279 	Phdr		*dynamicphdr = NULL;
280 	Phdr		*stphdr = NULL;
281 	Phdr		*uphdr = NULL;
282 	Phdr		*junk = NULL;
283 	size_t		len;
284 	size_t		postfixsize = 0;
285 	size_t		i;
286 	Phdr		*phdrp;
287 	Phdr		*dataphdrp = NULL;
288 	Phdr		*dtrphdr;
289 	Phdr		*capphdr = NULL;
290 	Cap		*cap = NULL;
291 	size_t		capsize;
292 	int		hasu = 0;
293 	int		hasauxv = 0;
294 	int		hasintp = 0;
295 	int		branded = 0;
296 	boolean_t	dynuphdr = B_FALSE;
297 
298 	struct proc *p = ttoproc(curthread);
299 	struct user *up = PTOU(p);
300 	struct bigwad {
301 		Ehdr	ehdr;
302 		aux_entry_t	elfargs[__KERN_NAUXV_IMPL];
303 		char		dl_name[MAXPATHLEN];
304 		char		pathbuf[MAXPATHLEN];
305 		struct vattr	vattr;
306 		struct execenv	exenv;
307 	} *bigwad;	/* kmem_alloc this behemoth so we don't blow stack */
308 	Ehdr		*ehdrp;
309 	uint_t		nshdrs, shstrndx, nphdrs;
310 	size_t		phdrsize;
311 	char		*dlnp;
312 	char		*pathbufp;
313 	rlim64_t	limit;
314 	rlim64_t	roundlimit;
315 
316 	ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
317 
318 	bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
319 	ehdrp = &bigwad->ehdr;
320 	dlnp = bigwad->dl_name;
321 	pathbufp = bigwad->pathbuf;
322 
323 	/*
324 	 * Obtain ELF and program header information.
325 	 */
326 	if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
327 	    &nphdrs)) != 0 ||
328 	    (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
329 	    &phdrsize)) != 0)
330 		goto out;
331 
332 	/*
333 	 * Prevent executing an ELF file that has no entry point.
334 	 */
335 	if (ehdrp->e_entry == 0) {
336 		uprintf("%s: Bad entry point\n", exec_file);
337 		goto bad;
338 	}
339 
340 	/*
341 	 * Put data model that we're exec-ing to into the args passed to
342 	 * exec_args(), so it will know what it is copying to on new stack.
343 	 * Now that we know whether we are exec-ing a 32-bit or 64-bit
344 	 * executable, we can set execsz with the appropriate NCARGS.
345 	 */
346 #ifdef	_LP64
347 	if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
348 		args->to_model = DATAMODEL_ILP32;
349 		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
350 	} else {
351 		args->to_model = DATAMODEL_LP64;
352 		args->stk_prot &= ~PROT_EXEC;
353 #if defined(__x86)
354 		args->dat_prot &= ~PROT_EXEC;
355 #endif
356 		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
357 	}
358 #else	/* _LP64 */
359 	args->to_model = DATAMODEL_ILP32;
360 	*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
361 #endif	/* _LP64 */
362 
363 	/*
364 	 * We delay invoking the brand callback until we've figured out
365 	 * what kind of elf binary we're trying to run, 32-bit or 64-bit.
366 	 * We do this because now the brand library can just check
367 	 * args->to_model to see if the target is 32-bit or 64-bit without
368 	 * having do duplicate all the code above.
369 	 *
370 	 * The level checks associated with brand handling below are used to
371 	 * prevent a loop since the brand elfexec function typically comes back
372 	 * through this function. We must check <= here since the nested
373 	 * handling in the #! interpreter code will increment the level before
374 	 * calling gexec to run the final elfexec interpreter.
375 	 */
376 	if ((level <= INTP_MAXDEPTH) &&
377 	    (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
378 		error = BROP(p)->b_elfexec(vp, uap, args,
379 		    idatap, level + 1, execsz, setid, exec_file, cred,
380 		    brand_action);
381 		goto out;
382 	}
383 
384 	/*
385 	 * Determine aux size now so that stack can be built
386 	 * in one shot (except actual copyout of aux image),
387 	 * determine any non-default stack protections,
388 	 * and still have this code be machine independent.
389 	 */
390 	const uint_t hsize = ehdrp->e_phentsize;
391 	phdrp = (Phdr *)phdrbase;
392 	for (i = nphdrs; i > 0; i--) {
393 		switch (phdrp->p_type) {
394 		case PT_INTERP:
395 			hasauxv = hasintp = 1;
396 			break;
397 		case PT_PHDR:
398 			hasu = 1;
399 			break;
400 		case PT_SUNWSTACK:
401 			args->stk_prot = PROT_USER;
402 			if (phdrp->p_flags & PF_R)
403 				args->stk_prot |= PROT_READ;
404 			if (phdrp->p_flags & PF_W)
405 				args->stk_prot |= PROT_WRITE;
406 			if (phdrp->p_flags & PF_X)
407 				args->stk_prot |= PROT_EXEC;
408 			break;
409 		case PT_LOAD:
410 			dataphdrp = phdrp;
411 			break;
412 		case PT_SUNWCAP:
413 			capphdr = phdrp;
414 			break;
415 		case PT_DYNAMIC:
416 			dynamicphdr = phdrp;
417 			break;
418 		}
419 		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
420 	}
421 
422 	if (ehdrp->e_type != ET_EXEC) {
423 		dataphdrp = NULL;
424 		hasauxv = 1;
425 	}
426 
427 	/* Copy BSS permissions to args->dat_prot */
428 	if (dataphdrp != NULL) {
429 		args->dat_prot = PROT_USER;
430 		if (dataphdrp->p_flags & PF_R)
431 			args->dat_prot |= PROT_READ;
432 		if (dataphdrp->p_flags & PF_W)
433 			args->dat_prot |= PROT_WRITE;
434 		if (dataphdrp->p_flags & PF_X)
435 			args->dat_prot |= PROT_EXEC;
436 	}
437 
438 	/*
439 	 * If a auxvector will be required - reserve the space for
440 	 * it now.  This may be increased by exec_args if there are
441 	 * ISA-specific types (included in __KERN_NAUXV_IMPL).
442 	 */
443 	if (hasauxv) {
444 		/*
445 		 * If a AUX vector is being built - the base AUX
446 		 * entries are:
447 		 *
448 		 *	AT_BASE
449 		 *	AT_FLAGS
450 		 *	AT_PAGESZ
451 		 *	AT_SUN_AUXFLAGS
452 		 *	AT_SUN_HWCAP
453 		 *	AT_SUN_HWCAP2
454 		 *	AT_SUN_HWCAP3
455 		 *	AT_SUN_PLATFORM (added in stk_copyout)
456 		 *	AT_SUN_EXECNAME (added in stk_copyout)
457 		 *	AT_NULL
458 		 *
459 		 * total == 10
460 		 */
461 		if (hasintp && hasu) {
462 			/*
463 			 * Has PT_INTERP & PT_PHDR - the auxvectors that
464 			 * will be built are:
465 			 *
466 			 *	AT_PHDR
467 			 *	AT_PHENT
468 			 *	AT_PHNUM
469 			 *	AT_ENTRY
470 			 *	AT_LDDATA
471 			 *
472 			 * total = 5
473 			 */
474 			args->auxsize = (10 + 5) * sizeof (aux_entry_t);
475 		} else if (hasintp) {
476 			/*
477 			 * Has PT_INTERP but no PT_PHDR
478 			 *
479 			 *	AT_EXECFD
480 			 *	AT_LDDATA
481 			 *
482 			 * total = 2
483 			 */
484 			args->auxsize = (10 + 2) * sizeof (aux_entry_t);
485 		} else {
486 			args->auxsize = 10 * sizeof (aux_entry_t);
487 		}
488 	} else {
489 		args->auxsize = 0;
490 	}
491 
492 	/*
493 	 * If this binary is using an emulator, we need to add an
494 	 * AT_SUN_EMULATOR aux entry.
495 	 */
496 	if (args->emulator != NULL)
497 		args->auxsize += sizeof (aux_entry_t);
498 
499 	/*
500 	 * On supported kernels (x86_64) make room in the auxv for the
501 	 * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
502 	 * which do not provide such functionality.
503 	 *
504 	 * Additionally cover the floating point information AT_SUN_FPSIZE and
505 	 * AT_SUN_FPTYPE.
506 	 */
507 #if defined(__amd64)
508 	args->auxsize += 3 * sizeof (aux_entry_t);
509 #endif /* defined(__amd64) */
510 
511 	if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
512 		branded = 1;
513 		/*
514 		 * We will be adding 4 entries to the aux vectors.  One for
515 		 * the the brandname and 3 for the brand specific aux vectors.
516 		 */
517 		args->auxsize += 4 * sizeof (aux_entry_t);
518 	}
519 
520 	/* If the binary has an explicit ASLR flag, it must be honoured */
521 	if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
522 		const size_t dynfilesz = dynamicphdr->p_filesz;
523 		const size_t dynoffset = dynamicphdr->p_offset;
524 		Dyn *dyn, *dp;
525 
526 		if (dynoffset > MAXOFFSET_T ||
527 		    dynfilesz > MAXOFFSET_T ||
528 		    dynoffset + dynfilesz > MAXOFFSET_T) {
529 			uprintf("%s: cannot read full .dynamic section\n",
530 			    exec_file);
531 			error = EINVAL;
532 			goto out;
533 		}
534 
535 #define	DYN_STRIDE	100
536 		for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
537 			const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
538 			const size_t ndyns = MIN(DYN_STRIDE, remdyns);
539 			const size_t dynsize = ndyns * sizeof (*dyn);
540 
541 			dyn = kmem_alloc(dynsize, KM_SLEEP);
542 
543 			if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
544 			    (ssize_t)dynsize, (offset_t)(dynoffset + i),
545 			    UIO_SYSSPACE, 0, (rlim64_t)0,
546 			    CRED(), NULL)) != 0) {
547 				uprintf("%s: cannot read .dynamic section\n",
548 				    exec_file);
549 				goto out;
550 			}
551 
552 			for (dp = dyn; dp < (dyn + ndyns); dp++) {
553 				if (dp->d_tag == DT_SUNW_ASLR) {
554 					if ((error = handle_secflag_dt(p,
555 					    DT_SUNW_ASLR,
556 					    dp->d_un.d_val)) != 0) {
557 						uprintf("%s: error setting "
558 						    "security-flag from "
559 						    "DT_SUNW_ASLR: %d\n",
560 						    exec_file, error);
561 						goto out;
562 					}
563 				}
564 			}
565 
566 			kmem_free(dyn, dynsize);
567 		}
568 	}
569 
570 	/* Hardware/Software capabilities */
571 	if (capphdr != NULL &&
572 	    (capsize = capphdr->p_filesz) > 0 &&
573 	    capsize <= 16 * sizeof (*cap)) {
574 		const uint_t ncaps = capsize / sizeof (*cap);
575 		Cap *cp;
576 
577 		cap = kmem_alloc(capsize, KM_SLEEP);
578 		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
579 		    (ssize_t)capsize, (offset_t)capphdr->p_offset,
580 		    UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
581 			uprintf("%s: Cannot read capabilities section\n",
582 			    exec_file);
583 			goto out;
584 		}
585 		for (cp = cap; cp < cap + ncaps; cp++) {
586 			if (cp->c_tag == CA_SUNW_SF_1 &&
587 			    (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
588 				if (args->to_model == DATAMODEL_LP64)
589 					args->addr32 = 1;
590 				break;
591 			}
592 		}
593 	}
594 
595 	aux = bigwad->elfargs;
596 	/*
597 	 * Move args to the user's stack.
598 	 * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
599 	 */
600 	if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
601 		if (error == -1) {
602 			error = ENOEXEC;
603 			goto bad;
604 		}
605 		goto out;
606 	}
607 	/* we're single threaded after this point */
608 
609 	/*
610 	 * If this is an ET_DYN executable (shared object),
611 	 * determine its memory size so that mapelfexec() can load it.
612 	 */
613 	if (ehdrp->e_type == ET_DYN)
614 		len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
615 	else
616 		len = 0;
617 
618 	dtrphdr = NULL;
619 
620 	error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
621 	    &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
622 	    len, execsz, &brksize);
623 
624 	/*
625 	 * Our uphdr has been dynamically allocated if (and only if) its
626 	 * program header flags are clear.  To avoid leaks, this must be
627 	 * checked regardless of whether mapelfexec() emitted an error.
628 	 */
629 	dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
630 
631 	if (error != 0)
632 		goto bad;
633 
634 	if (uphdr != NULL && intphdr == NULL)
635 		goto bad;
636 
637 	if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
638 		uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
639 		goto bad;
640 	}
641 
642 	if (intphdr != NULL) {
643 		size_t		len;
644 		uintptr_t	lddata;
645 		char		*p;
646 		struct vnode	*nvp;
647 
648 		dlnsize = intphdr->p_filesz;
649 
650 		/*
651 		 * Make sure none of the component pieces of dlnsize result in
652 		 * an oversized or zeroed result.
653 		 */
654 		if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
655 		    dlnsize == 0 || dlnsize < intphdr->p_filesz) {
656 			goto bad;
657 		}
658 
659 		/*
660 		 * Read in "interpreter" pathname.
661 		 */
662 		if ((error = vn_rdwr(UIO_READ, vp, dlnp,
663 		    (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
664 		    UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
665 			uprintf("%s: Cannot obtain interpreter pathname\n",
666 			    exec_file);
667 			goto bad;
668 		}
669 
670 		if (resid != 0 || dlnp[dlnsize - 1] != '\0')
671 			goto bad;
672 
673 		/*
674 		 * Search for '$ORIGIN' token in interpreter path.
675 		 * If found, expand it.
676 		 */
677 		for (p = dlnp; p = strchr(p, '$'); ) {
678 			uint_t	len, curlen;
679 			char	*_ptr;
680 
681 			if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
682 				continue;
683 
684 			/*
685 			 * We don't support $ORIGIN on setid programs to close
686 			 * a potential attack vector.
687 			 */
688 			if ((setid & EXECSETID_SETID) != 0) {
689 				error = ENOEXEC;
690 				goto bad;
691 			}
692 
693 			curlen = 0;
694 			len = p - dlnp - 1;
695 			if (len) {
696 				bcopy(dlnp, pathbufp, len);
697 				curlen += len;
698 			}
699 			if (_ptr = strrchr(args->pathname, '/')) {
700 				len = _ptr - args->pathname;
701 				if ((curlen + len) > MAXPATHLEN)
702 					break;
703 
704 				bcopy(args->pathname, &pathbufp[curlen], len);
705 				curlen += len;
706 			} else {
707 				/*
708 				 * executable is a basename found in the
709 				 * current directory.  So - just substitue
710 				 * '.' for ORIGIN.
711 				 */
712 				pathbufp[curlen] = '.';
713 				curlen++;
714 			}
715 			p += ORIGIN_STR_SIZE;
716 			len = strlen(p);
717 
718 			if ((curlen + len) > MAXPATHLEN)
719 				break;
720 			bcopy(p, &pathbufp[curlen], len);
721 			curlen += len;
722 			pathbufp[curlen++] = '\0';
723 			bcopy(pathbufp, dlnp, curlen);
724 		}
725 
726 		/*
727 		 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
728 		 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
729 		 * Just in case /usr is not mounted, change it now.
730 		 */
731 		if (strcmp(dlnp, USR_LIB_RTLD) == 0)
732 			dlnp += 4;
733 		error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
734 		if (error && dlnp != bigwad->dl_name) {
735 			/* new kernel, old user-level */
736 			error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
737 			    NULLVPP, &nvp);
738 		}
739 		if (error) {
740 			uprintf("%s: Cannot find %s\n", exec_file, dlnp);
741 			goto bad;
742 		}
743 
744 		/*
745 		 * Setup the "aux" vector.
746 		 */
747 		if (uphdr) {
748 			if (ehdrp->e_type == ET_DYN) {
749 				/* don't use the first page */
750 				bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
751 				bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
752 			} else {
753 				bigwad->exenv.ex_bssbase = bssbase;
754 				bigwad->exenv.ex_brkbase = brkbase;
755 			}
756 			bigwad->exenv.ex_brksize = brksize;
757 			bigwad->exenv.ex_magic = elfmagic;
758 			bigwad->exenv.ex_vp = vp;
759 			setexecenv(&bigwad->exenv);
760 
761 			ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
762 			ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
763 			ADDAUX(aux, AT_PHNUM, nphdrs)
764 			ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
765 		} else {
766 			if ((error = execopen(&vp, &fd)) != 0) {
767 				VN_RELE(nvp);
768 				goto bad;
769 			}
770 
771 			ADDAUX(aux, AT_EXECFD, fd)
772 		}
773 
774 		if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
775 			VN_RELE(nvp);
776 			uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
777 			goto bad;
778 		}
779 
780 		/*
781 		 * Now obtain the ELF header along with the entire program
782 		 * header contained in "nvp".
783 		 */
784 		kmem_free(phdrbase, phdrsize);
785 		phdrbase = NULL;
786 		if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
787 		    &shstrndx, &nphdrs)) != 0 ||
788 		    (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
789 		    &phdrsize)) != 0) {
790 			VN_RELE(nvp);
791 			uprintf("%s: Cannot read %s\n", exec_file, dlnp);
792 			goto bad;
793 		}
794 
795 		/*
796 		 * Determine memory size of the "interpreter's" loadable
797 		 * sections.  This size is then used to obtain the virtual
798 		 * address of a hole, in the user's address space, large
799 		 * enough to map the "interpreter".
800 		 */
801 		if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
802 			VN_RELE(nvp);
803 			uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
804 			goto bad;
805 		}
806 
807 		dtrphdr = NULL;
808 
809 		error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
810 		    &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
811 		    execsz, NULL);
812 
813 		if (error || junk != NULL) {
814 			VN_RELE(nvp);
815 			uprintf("%s: Cannot map %s\n", exec_file, dlnp);
816 			goto bad;
817 		}
818 
819 		/*
820 		 * We use the DTrace program header to initialize the
821 		 * architecture-specific user per-LWP location. The dtrace
822 		 * fasttrap provider requires ready access to per-LWP scratch
823 		 * space. We assume that there is only one such program header
824 		 * in the interpreter.
825 		 */
826 		if (dtrphdr != NULL &&
827 		    dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
828 			VN_RELE(nvp);
829 			uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
830 			goto bad;
831 		}
832 
833 		VN_RELE(nvp);
834 		ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
835 	}
836 
837 	if (hasauxv) {
838 		int auxf = AF_SUN_HWCAPVERIFY;
839 #if defined(__amd64)
840 		size_t fpsize;
841 		int fptype;
842 #endif /* defined(__amd64) */
843 
844 		/*
845 		 * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
846 		 * exec_args()
847 		 */
848 		ADDAUX(aux, AT_BASE, voffset)
849 		ADDAUX(aux, AT_FLAGS, at_flags)
850 		ADDAUX(aux, AT_PAGESZ, PAGESIZE)
851 		/*
852 		 * Linker flags. (security)
853 		 * p_flag not yet set at this time.
854 		 * We rely on gexec() to provide us with the information.
855 		 * If the application is set-uid but this is not reflected
856 		 * in a mismatch between real/effective uids/gids, then
857 		 * don't treat this as a set-uid exec.  So we care about
858 		 * the EXECSETID_UGIDS flag but not the ...SETID flag.
859 		 */
860 		if ((setid &= ~EXECSETID_SETID) != 0)
861 			auxf |= AF_SUN_SETUGID;
862 
863 		/*
864 		 * If we're running a native process from within a branded
865 		 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
866 		 * that the native ld.so.1 is able to link with the native
867 		 * libraries instead of using the brand libraries that are
868 		 * installed in the zone.  We only do this for processes
869 		 * which we trust because we see they are already running
870 		 * under pfexec (where uid != euid).  This prevents a
871 		 * malicious user within the zone from crafting a wrapper to
872 		 * run native suid commands with unsecure libraries interposed.
873 		 */
874 		if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
875 		    (setid &= ~EXECSETID_SETID) != 0))
876 			auxf &= ~AF_SUN_SETUGID;
877 
878 		/*
879 		 * Record the user addr of the auxflags aux vector entry
880 		 * since brands may optionally want to manipulate this field.
881 		 */
882 		args->auxp_auxflags =
883 		    (char *)((char *)args->stackend +
884 		    ((char *)&aux->a_type -
885 		    (char *)bigwad->elfargs));
886 		ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
887 
888 		/*
889 		 * Hardware capability flag word (performance hints)
890 		 * Used for choosing faster library routines.
891 		 * (Potentially different between 32-bit and 64-bit ABIs)
892 		 */
893 		if (args->to_model == DATAMODEL_NATIVE) {
894 			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
895 			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
896 			ADDAUX(aux, AT_SUN_HWCAP3, auxv_hwcap_3)
897 		} else {
898 			ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
899 			ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
900 			ADDAUX(aux, AT_SUN_HWCAP3, auxv_hwcap32_3)
901 		}
902 
903 		if (branded) {
904 			/*
905 			 * Reserve space for the brand-private aux vectors,
906 			 * and record the user addr of that space.
907 			 */
908 			args->auxp_brand =
909 			    (char *)((char *)args->stackend +
910 			    ((char *)&aux->a_type -
911 			    (char *)bigwad->elfargs));
912 			ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
913 			ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
914 			ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
915 		}
916 
917 		/*
918 		 * Add the comm page auxv entry, mapping it in if needed. Also
919 		 * take care of the FPU entries.
920 		 */
921 #if defined(__amd64)
922 		if (args->commpage != (uintptr_t)NULL ||
923 		    (args->commpage = (uintptr_t)comm_page_mapin()) !=
924 		    (uintptr_t)NULL) {
925 			ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
926 		} else {
927 			/*
928 			 * If the comm page cannot be mapped, pad out the auxv
929 			 * to satisfy later size checks.
930 			 */
931 			ADDAUX(aux, AT_NULL, 0)
932 		}
933 
934 		fptype = AT_386_FPINFO_NONE;
935 		fpu_auxv_info(&fptype, &fpsize);
936 		if (fptype != AT_386_FPINFO_NONE) {
937 			ADDAUX(aux, AT_SUN_FPTYPE, fptype)
938 			ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
939 		} else {
940 			ADDAUX(aux, AT_NULL, 0)
941 			ADDAUX(aux, AT_NULL, 0)
942 		}
943 #endif /* defined(__amd64) */
944 
945 		ADDAUX(aux, AT_NULL, 0)
946 		postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
947 
948 		/*
949 		 * We make assumptions above when we determine how many aux
950 		 * vector entries we will be adding. However, if we have an
951 		 * invalid elf file, it is possible that mapelfexec might
952 		 * behave differently (but not return an error), in which case
953 		 * the number of aux entries we actually add will be different.
954 		 * We detect that now and error out.
955 		 */
956 		if (postfixsize != args->auxsize) {
957 			DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
958 			    size_t, args->auxsize);
959 			goto bad;
960 		}
961 		ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
962 	}
963 
964 	/*
965 	 * For the 64-bit kernel, the limit is big enough that rounding it up
966 	 * to a page can overflow the 64-bit limit, so we check for btopr()
967 	 * overflowing here by comparing it with the unrounded limit in pages.
968 	 * If it hasn't overflowed, compare the exec size with the rounded up
969 	 * limit in pages.  Otherwise, just compare with the unrounded limit.
970 	 */
971 	limit = btop(p->p_vmem_ctl);
972 	roundlimit = btopr(p->p_vmem_ctl);
973 	if ((roundlimit > limit && *execsz > roundlimit) ||
974 	    (roundlimit < limit && *execsz > limit)) {
975 		mutex_enter(&p->p_lock);
976 		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
977 		    RCA_SAFE);
978 		mutex_exit(&p->p_lock);
979 		error = ENOMEM;
980 		goto bad;
981 	}
982 
983 	bzero(up->u_auxv, sizeof (up->u_auxv));
984 	up->u_commpagep = args->commpage;
985 	if (postfixsize) {
986 		size_t num_auxv;
987 
988 		/*
989 		 * Copy the aux vector to the user stack.
990 		 */
991 		error = execpoststack(args, bigwad->elfargs, postfixsize);
992 		if (error)
993 			goto bad;
994 
995 		/*
996 		 * Copy auxv to the process's user structure for use by /proc.
997 		 * If this is a branded process, the brand's exec routine will
998 		 * copy it's private entries to the user structure later. It
999 		 * relies on the fact that the blank entries are at the end.
1000 		 */
1001 		num_auxv = postfixsize / sizeof (aux_entry_t);
1002 		ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1003 		aux = bigwad->elfargs;
1004 		for (i = 0; i < num_auxv; i++) {
1005 			up->u_auxv[i].a_type = aux[i].a_type;
1006 			up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1007 		}
1008 	}
1009 
1010 	/*
1011 	 * Pass back the starting address so we can set the program counter.
1012 	 */
1013 	args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1014 
1015 	if (!uphdr) {
1016 		if (ehdrp->e_type == ET_DYN) {
1017 			/*
1018 			 * If we are executing a shared library which doesn't
1019 			 * have a interpreter (probably ld.so.1) then
1020 			 * we don't set the brkbase now.  Instead we
1021 			 * delay it's setting until the first call
1022 			 * via grow.c::brk().  This permits ld.so.1 to
1023 			 * initialize brkbase to the tail of the executable it
1024 			 * loads (which is where it needs to be).
1025 			 */
1026 			bigwad->exenv.ex_brkbase = (caddr_t)0;
1027 			bigwad->exenv.ex_bssbase = (caddr_t)0;
1028 			bigwad->exenv.ex_brksize = 0;
1029 		} else {
1030 			bigwad->exenv.ex_brkbase = brkbase;
1031 			bigwad->exenv.ex_bssbase = bssbase;
1032 			bigwad->exenv.ex_brksize = brksize;
1033 		}
1034 		bigwad->exenv.ex_magic = elfmagic;
1035 		bigwad->exenv.ex_vp = vp;
1036 		setexecenv(&bigwad->exenv);
1037 	}
1038 
1039 	ASSERT(error == 0);
1040 	goto out;
1041 
1042 bad:
1043 	if (fd != -1)		/* did we open the a.out yet */
1044 		(void) execclose(fd);
1045 
1046 	psignal(p, SIGKILL);
1047 
1048 	if (error == 0)
1049 		error = ENOEXEC;
1050 out:
1051 	if (dynuphdr)
1052 		kmem_free(uphdr, sizeof (Phdr));
1053 	if (phdrbase != NULL)
1054 		kmem_free(phdrbase, phdrsize);
1055 	if (cap != NULL)
1056 		kmem_free(cap, capsize);
1057 	kmem_free(bigwad, sizeof (struct bigwad));
1058 	return (error);
1059 }
1060 
1061 /*
1062  * Compute the memory size requirement for the ELF file.
1063  */
1064 static size_t
1065 elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
1066     uintptr_t *lddata)
1067 {
1068 	const Phdr *phdrp = (Phdr *)phdrbase;
1069 	const uint_t hsize = ehdrp->e_phentsize;
1070 	boolean_t dfirst = B_TRUE;
1071 	uintptr_t loaddr = UINTPTR_MAX;
1072 	uintptr_t hiaddr = 0;
1073 	uint_t i;
1074 
1075 	for (i = nphdrs; i > 0; i--) {
1076 		if (phdrp->p_type == PT_LOAD) {
1077 			const uintptr_t lo = phdrp->p_vaddr;
1078 			const uintptr_t hi = lo + phdrp->p_memsz;
1079 
1080 			loaddr = MIN(lo, loaddr);
1081 			hiaddr = MAX(hi, hiaddr);
1082 
1083 			/*
1084 			 * save the address of the first data segment
1085 			 * of a object - used for the AT_SUNW_LDDATA
1086 			 * aux entry.
1087 			 */
1088 			if ((lddata != NULL) && dfirst &&
1089 			    (phdrp->p_flags & PF_W)) {
1090 				*lddata = lo;
1091 				dfirst = B_FALSE;
1092 			}
1093 		}
1094 		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1095 	}
1096 
1097 	if (hiaddr <= loaddr) {
1098 		/* No non-zero PT_LOAD segment found */
1099 		return (0);
1100 	}
1101 
1102 	return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
1103 }
1104 
1105 /*
1106  * Read in the ELF header and program header table.
1107  * SUSV3 requires:
1108  *	ENOEXEC	File format is not recognized
1109  *	EINVAL	Format recognized but execution not supported
1110  */
1111 static int
1112 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
1113     uint_t *shstrndx, uint_t *nphdrs)
1114 {
1115 	int error;
1116 	ssize_t resid;
1117 
1118 	/*
1119 	 * We got here by the first two bytes in ident,
1120 	 * now read the entire ELF header.
1121 	 */
1122 	if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
1123 	    sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
1124 	    (rlim64_t)0, credp, &resid)) != 0)
1125 		return (error);
1126 
1127 	/*
1128 	 * Since a separate version is compiled for handling 32-bit and
1129 	 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1130 	 * doesn't need to be able to deal with 32-bit ELF files.
1131 	 */
1132 	if (resid != 0 ||
1133 	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1134 	    ehdr->e_ident[EI_MAG3] != ELFMAG3)
1135 		return (ENOEXEC);
1136 
1137 	if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1138 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1139 	    ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1140 #else
1141 	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1142 #endif
1143 	    !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1144 	    ehdr->e_flags))
1145 		return (EINVAL);
1146 
1147 	*nshdrs = ehdr->e_shnum;
1148 	*shstrndx = ehdr->e_shstrndx;
1149 	*nphdrs = ehdr->e_phnum;
1150 
1151 	/*
1152 	 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1153 	 * to read in the section header at index zero to access the true
1154 	 * values for those fields.
1155 	 */
1156 	if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1157 	    *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1158 		Shdr shdr;
1159 
1160 		if (ehdr->e_shoff == 0)
1161 			return (EINVAL);
1162 
1163 		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1164 		    sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1165 		    (rlim64_t)0, credp, NULL)) != 0) {
1166 			return (error);
1167 		}
1168 
1169 		if (*nshdrs == 0)
1170 			*nshdrs = shdr.sh_size;
1171 		if (*shstrndx == SHN_XINDEX)
1172 			*shstrndx = shdr.sh_link;
1173 		if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1174 			*nphdrs = shdr.sh_info;
1175 	}
1176 
1177 	return (0);
1178 }
1179 
1180 /*
1181  * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1182  * so e_phentsize must be at least large enough to include those members.
1183  */
1184 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1185 #define	MINPHENTSZ	(offsetof(Phdr, p_flags) + \
1186 			sizeof (((Phdr *)NULL)->p_flags))
1187 #else
1188 #define	MINPHENTSZ	(offsetof(Phdr, p_memsz) + \
1189 			sizeof (((Phdr *)NULL)->p_memsz))
1190 #endif
1191 
1192 static int
1193 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1194     caddr_t *phbasep, size_t *phsizep)
1195 {
1196 	int err;
1197 
1198 	/*
1199 	 * Ensure that e_phentsize is large enough for required fields to be
1200 	 * accessible and will maintain 8-byte alignment.
1201 	 */
1202 	if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1203 		return (EINVAL);
1204 
1205 	*phsizep = nphdrs * ehdr->e_phentsize;
1206 
1207 	if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1208 		if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1209 			return (ENOMEM);
1210 	} else {
1211 		*phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1212 	}
1213 
1214 	if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1215 	    (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1216 	    credp, NULL)) != 0) {
1217 		kmem_free(*phbasep, *phsizep);
1218 		*phbasep = NULL;
1219 		return (err);
1220 	}
1221 
1222 	return (0);
1223 }
1224 
1225 #define	MINSHDRSZ	(offsetof(Shdr, sh_entsize) + \
1226 			sizeof (((Shdr *)NULL)->sh_entsize))
1227 
1228 static int
1229 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1230     uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1231     size_t *shstrsizep)
1232 {
1233 	int err;
1234 	Shdr *shdr;
1235 
1236 	/*
1237 	 * Since we're going to be using e_shentsize to iterate down the
1238 	 * array of section headers, it must be 8-byte aligned or else
1239 	 * a we might cause a misaligned access. We use all members through
1240 	 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1241 	 * must be at least large enough to include that member. The index
1242 	 * of the string table section must also be valid.
1243 	 */
1244 	if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1245 	    nshdrs == 0 || shstrndx >= nshdrs) {
1246 		return (EINVAL);
1247 	}
1248 
1249 	*shsizep = nshdrs * ehdr->e_shentsize;
1250 
1251 	if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1252 		if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1253 			return (ENOMEM);
1254 	} else {
1255 		*shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1256 	}
1257 
1258 	if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1259 	    (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1260 	    credp, NULL)) != 0) {
1261 		kmem_free(*shbasep, *shsizep);
1262 		return (err);
1263 	}
1264 
1265 	/*
1266 	 * Grab the section string table.  Walking through the shdrs is
1267 	 * pointless if their names cannot be interrogated.
1268 	 */
1269 	shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1270 	if ((*shstrsizep = shdr->sh_size) == 0) {
1271 		kmem_free(*shbasep, *shsizep);
1272 		return (EINVAL);
1273 	}
1274 
1275 	if (*shstrsizep > elf_shstrtab_max) {
1276 		if ((*shstrbasep = kmem_alloc(*shstrsizep,
1277 		    KM_NOSLEEP)) == NULL) {
1278 			kmem_free(*shbasep, *shsizep);
1279 			return (ENOMEM);
1280 		}
1281 	} else {
1282 		*shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1283 	}
1284 
1285 	if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1286 	    (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1287 	    credp, NULL)) != 0) {
1288 		kmem_free(*shbasep, *shsizep);
1289 		kmem_free(*shstrbasep, *shstrsizep);
1290 		return (err);
1291 	}
1292 
1293 	/*
1294 	 * Make sure the strtab is null-terminated to make sure we
1295 	 * don't run off the end of the table.
1296 	 */
1297 	(*shstrbasep)[*shstrsizep - 1] = '\0';
1298 
1299 	return (0);
1300 }
1301 
1302 int
1303 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1304     caddr_t *phbasep, size_t *phsizep)
1305 {
1306 	int error;
1307 	uint_t nshdrs, shstrndx;
1308 
1309 	if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1310 	    nphdrs)) != 0 ||
1311 	    (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1312 	    phsizep)) != 0) {
1313 		return (error);
1314 	}
1315 	return (0);
1316 }
1317 
1318 static int
1319 mapelfexec(
1320 	vnode_t *vp,
1321 	Ehdr *ehdr,
1322 	uint_t nphdrs,
1323 	caddr_t phdrbase,
1324 	Phdr **uphdr,
1325 	Phdr **intphdr,
1326 	Phdr **stphdr,
1327 	Phdr **dtphdr,
1328 	Phdr *dataphdrp,
1329 	caddr_t *bssbase,
1330 	caddr_t *brkbase,
1331 	intptr_t *voffset,
1332 	uintptr_t *minaddrp,
1333 	size_t len,
1334 	size_t *execsz,
1335 	size_t *brksize)
1336 {
1337 	Phdr *phdr;
1338 	int error, page, prot;
1339 	caddr_t addr = NULL;
1340 	caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1341 	uint_t i;
1342 	size_t zfodsz, memsz;
1343 	boolean_t ptload = B_FALSE;
1344 	off_t offset;
1345 	const uint_t hsize = ehdr->e_phentsize;
1346 	extern int use_brk_lpg;
1347 
1348 	if (ehdr->e_type == ET_DYN) {
1349 		secflagset_t flags = 0;
1350 		/*
1351 		 * Obtain the virtual address of a hole in the
1352 		 * address space to map the "interpreter".
1353 		 */
1354 		if (secflag_enabled(curproc, PROC_SEC_ASLR))
1355 			flags |= _MAP_RANDOMIZE;
1356 
1357 		map_addr(&addr, len, (offset_t)0, 1, flags);
1358 		if (addr == NULL)
1359 			return (ENOMEM);
1360 		*voffset = (intptr_t)addr;
1361 
1362 		/*
1363 		 * Calculate the minimum vaddr so it can be subtracted out.
1364 		 * According to the ELF specification, since PT_LOAD sections
1365 		 * must be sorted by increasing p_vaddr values, this is
1366 		 * guaranteed to be the first PT_LOAD section.
1367 		 */
1368 		phdr = (Phdr *)phdrbase;
1369 		for (i = nphdrs; i > 0; i--) {
1370 			if (phdr->p_type == PT_LOAD) {
1371 				*voffset -= (uintptr_t)phdr->p_vaddr;
1372 				break;
1373 			}
1374 			phdr = (Phdr *)((caddr_t)phdr + hsize);
1375 		}
1376 
1377 	} else {
1378 		*voffset = 0;
1379 	}
1380 
1381 	phdr = (Phdr *)phdrbase;
1382 	for (i = nphdrs; i > 0; i--) {
1383 		switch (phdr->p_type) {
1384 		case PT_LOAD:
1385 			ptload = B_TRUE;
1386 			prot = PROT_USER;
1387 			if (phdr->p_flags & PF_R)
1388 				prot |= PROT_READ;
1389 			if (phdr->p_flags & PF_W)
1390 				prot |= PROT_WRITE;
1391 			if (phdr->p_flags & PF_X)
1392 				prot |= PROT_EXEC;
1393 
1394 			addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1395 
1396 			if (*intphdr != NULL && uphdr != NULL &&
1397 			    *uphdr == NULL) {
1398 				/*
1399 				 * The PT_PHDR program header is, strictly
1400 				 * speaking, optional.  If we find that this
1401 				 * is missing, we will determine the location
1402 				 * of the program headers based on the address
1403 				 * of the lowest PT_LOAD segment (namely, this
1404 				 * one):  we subtract the p_offset to get to
1405 				 * the ELF header and then add back the program
1406 				 * header offset to get to the program headers.
1407 				 * We then cons up a Phdr that corresponds to
1408 				 * the (missing) PT_PHDR, setting the flags
1409 				 * to 0 to denote that this is artificial and
1410 				 * should (must) be freed by the caller.
1411 				 */
1412 				Phdr *cons;
1413 
1414 				cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1415 
1416 				cons->p_flags = 0;
1417 				cons->p_type = PT_PHDR;
1418 				cons->p_vaddr = ((uintptr_t)addr -
1419 				    phdr->p_offset) + ehdr->e_phoff;
1420 
1421 				*uphdr = cons;
1422 			}
1423 
1424 			/*
1425 			 * The ELF spec dictates that p_filesz may not be
1426 			 * larger than p_memsz in PT_LOAD segments.
1427 			 */
1428 			if (phdr->p_filesz > phdr->p_memsz) {
1429 				error = EINVAL;
1430 				goto bad;
1431 			}
1432 
1433 			/*
1434 			 * Keep track of the segment with the lowest starting
1435 			 * address.
1436 			 */
1437 			if (addr < minaddr)
1438 				minaddr = addr;
1439 
1440 			zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1441 
1442 			offset = phdr->p_offset;
1443 			if (((uintptr_t)offset & PAGEOFFSET) ==
1444 			    ((uintptr_t)addr & PAGEOFFSET) &&
1445 			    (!(vp->v_flag & VNOMAP))) {
1446 				page = 1;
1447 			} else {
1448 				page = 0;
1449 			}
1450 
1451 			/*
1452 			 * Set the heap pagesize for OOB when the bss size
1453 			 * is known and use_brk_lpg is not 0.
1454 			 */
1455 			if (brksize != NULL && use_brk_lpg &&
1456 			    zfodsz != 0 && phdr == dataphdrp &&
1457 			    (prot & PROT_WRITE)) {
1458 				const size_t tlen = P2NPHASE((uintptr_t)addr +
1459 				    phdr->p_filesz, PAGESIZE);
1460 
1461 				if (zfodsz > tlen) {
1462 					const caddr_t taddr = addr +
1463 					    phdr->p_filesz + tlen;
1464 
1465 					/*
1466 					 * Since a hole in the AS large enough
1467 					 * for this object as calculated by
1468 					 * elfsize() is available, we do not
1469 					 * need to fear overflow for 'taddr'.
1470 					 */
1471 					curproc->p_brkpageszc =
1472 					    page_szc(map_pgsz(MAPPGSZ_HEAP,
1473 					    curproc, taddr, zfodsz - tlen, 0));
1474 				}
1475 			}
1476 
1477 			if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1478 			    (prot & PROT_WRITE)) {
1479 				uint_t	szc = curproc->p_brkpageszc;
1480 				size_t pgsz = page_get_pagesize(szc);
1481 				caddr_t ebss = addr + phdr->p_memsz;
1482 				/*
1483 				 * If we need extra space to keep the BSS an
1484 				 * integral number of pages in size, some of
1485 				 * that space may fall beyond p_brkbase, so we
1486 				 * need to set p_brksize to account for it
1487 				 * being (logically) part of the brk.
1488 				 */
1489 				size_t extra_zfodsz;
1490 
1491 				ASSERT(pgsz > PAGESIZE);
1492 
1493 				extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1494 
1495 				if (error = execmap(vp, addr, phdr->p_filesz,
1496 				    zfodsz + extra_zfodsz, phdr->p_offset,
1497 				    prot, page, szc))
1498 					goto bad;
1499 				if (brksize != NULL)
1500 					*brksize = extra_zfodsz;
1501 			} else {
1502 				if (error = execmap(vp, addr, phdr->p_filesz,
1503 				    zfodsz, phdr->p_offset, prot, page, 0))
1504 					goto bad;
1505 			}
1506 
1507 			if (bssbase != NULL && addr >= *bssbase &&
1508 			    phdr == dataphdrp) {
1509 				*bssbase = addr + phdr->p_filesz;
1510 			}
1511 			if (brkbase != NULL && addr >= *brkbase) {
1512 				*brkbase = addr + phdr->p_memsz;
1513 			}
1514 
1515 			memsz = btopr(phdr->p_memsz);
1516 			if ((*execsz + memsz) < *execsz) {
1517 				error = ENOMEM;
1518 				goto bad;
1519 			}
1520 			*execsz += memsz;
1521 			break;
1522 
1523 		case PT_INTERP:
1524 			if (ptload)
1525 				goto bad;
1526 			*intphdr = phdr;
1527 			break;
1528 
1529 		case PT_SHLIB:
1530 			*stphdr = phdr;
1531 			break;
1532 
1533 		case PT_PHDR:
1534 			if (ptload || phdr->p_flags == 0)
1535 				goto bad;
1536 
1537 			if (uphdr != NULL)
1538 				*uphdr = phdr;
1539 
1540 			break;
1541 
1542 		case PT_NULL:
1543 		case PT_DYNAMIC:
1544 		case PT_NOTE:
1545 			break;
1546 
1547 		case PT_SUNWDTRACE:
1548 			if (dtphdr != NULL)
1549 				*dtphdr = phdr;
1550 			break;
1551 
1552 		default:
1553 			break;
1554 		}
1555 		phdr = (Phdr *)((caddr_t)phdr + hsize);
1556 	}
1557 
1558 	if (minaddrp != NULL) {
1559 		ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1560 		*minaddrp = (uintptr_t)minaddr;
1561 	}
1562 
1563 	if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1564 		size_t off;
1565 		uintptr_t base = (uintptr_t)*brkbase;
1566 		uintptr_t oend = base + *brksize;
1567 
1568 		ASSERT(ISP2(aslr_max_brk_skew));
1569 
1570 		(void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1571 		base += P2PHASE(off, aslr_max_brk_skew);
1572 		base = P2ROUNDUP(base, PAGESIZE);
1573 		*brkbase = (caddr_t)base;
1574 		/*
1575 		 * Above, we set *brksize to account for the possibility we
1576 		 * had to grow the 'brk' in padding out the BSS to a page
1577 		 * boundary.
1578 		 *
1579 		 * We now need to adjust that based on where we now are
1580 		 * actually putting the brk.
1581 		 */
1582 		if (oend > base)
1583 			*brksize = oend - base;
1584 		else
1585 			*brksize = 0;
1586 	}
1587 
1588 	return (0);
1589 bad:
1590 	if (error == 0)
1591 		error = EINVAL;
1592 	return (error);
1593 }
1594 
1595 int
1596 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1597     rlim64_t rlimit, cred_t *credp)
1598 {
1599 	Note note;
1600 	int error;
1601 
1602 	bzero(&note, sizeof (note));
1603 	bcopy("CORE", note.name, 4);
1604 	note.nhdr.n_type = type;
1605 	/*
1606 	 * The System V ABI states that n_namesz must be the length of the
1607 	 * string that follows the Nhdr structure including the terminating
1608 	 * null. The ABI also specifies that sufficient padding should be
1609 	 * included so that the description that follows the name string
1610 	 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1611 	 * respectively. However, since this change was not made correctly
1612 	 * at the time of the 64-bit port, both 32- and 64-bit binaries
1613 	 * descriptions are only guaranteed to begin on a 4-byte boundary.
1614 	 */
1615 	note.nhdr.n_namesz = 5;
1616 	note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1617 
1618 	if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1619 	    sizeof (note), rlimit, credp))
1620 		return (error);
1621 
1622 	*offsetp += sizeof (note);
1623 
1624 	if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1625 	    note.nhdr.n_descsz, rlimit, credp))
1626 		return (error);
1627 
1628 	*offsetp += note.nhdr.n_descsz;
1629 	return (0);
1630 }
1631 
1632 /*
1633  * Copy the section data from one vnode to the section of another vnode.
1634  */
1635 static void
1636 elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
1637 {
1638 	size_t n = src->sh_size;
1639 	u_offset_t off = 0;
1640 	const u_offset_t soff = src->sh_offset;
1641 	const u_offset_t doff = ctx->ecc_doffset;
1642 	void *buf = ctx->ecc_buf;
1643 	vnode_t *dst_vp = ctx->ecc_vp;
1644 	cred_t *credp = ctx->ecc_credp;
1645 
1646 	/* Protect the copy loop below from overflow on the offsets */
1647 	if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
1648 	    (n + soff) < n || (n + doff) < n) {
1649 		dst->sh_size = 0;
1650 		dst->sh_offset = 0;
1651 		return;
1652 	}
1653 
1654 	while (n != 0) {
1655 		const size_t len = MIN(ctx->ecc_bufsz, n);
1656 		ssize_t resid;
1657 
1658 		if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
1659 		    (offset_t)(soff + off),
1660 		    UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1661 		    resid >= len || resid < 0 ||
1662 		    core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
1663 		    buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
1664 			dst->sh_size = 0;
1665 			dst->sh_offset = 0;
1666 			return;
1667 		}
1668 
1669 		ASSERT(n >= len - resid);
1670 
1671 		n -= len - resid;
1672 		off += len - resid;
1673 	}
1674 
1675 	ctx->ecc_doffset += src->sh_size;
1676 }
1677 
1678 /*
1679  * Walk sections for a given ELF object, counting (or copying) those of
1680  * interest (CTF, symtab, strtab, .debug_*).
1681  */
1682 static int
1683 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1684     Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, uint_t *countp)
1685 {
1686 	Ehdr ehdr;
1687 	const core_content_t content = ctx->ecc_content;
1688 	cred_t *credp = ctx->ecc_credp;
1689 	Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1690 	uintptr_t off = 0;
1691 	uint_t nshdrs, shstrndx, nphdrs, count = 0;
1692 	u_offset_t *doffp = &ctx->ecc_doffset;
1693 	boolean_t ctf_link = B_FALSE;
1694 	caddr_t shbase;
1695 	size_t shsize, shstrsize;
1696 	char *shstrbase;
1697 	int error = 0;
1698 	const boolean_t justcounting = v == NULL;
1699 
1700 	*countp = 0;
1701 
1702 	if ((content &
1703 	    (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) == 0) {
1704 		return (0);
1705 	}
1706 
1707 	if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
1708 	    getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
1709 	    &shstrbase, &shstrsize) != 0) {
1710 		return (0);
1711 	}
1712 
1713 	/* Starting at index 1 skips SHT_NULL which is expected at index 0 */
1714 	off = ehdr.e_shentsize;
1715 	for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
1716 		Shdr *shdr, *symchk = NULL, *strchk;
1717 		const char *name;
1718 
1719 		shdr = (Shdr *)(shbase + off);
1720 		if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
1721 			continue;
1722 
1723 		name = shstrbase + shdr->sh_name;
1724 
1725 		if (ctf == NULL &&
1726 		    (content & CC_CONTENT_CTF) != 0 &&
1727 		    strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1728 			ctf = shdr;
1729 			if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
1730 				/* check linked symtab below */
1731 				symchk = (Shdr *)(shbase +
1732 				    shdr->sh_link * ehdr.e_shentsize);
1733 				ctf_link = B_TRUE;
1734 			} else {
1735 				continue;
1736 			}
1737 		} else if (symtab == NULL &&
1738 		    (content & CC_CONTENT_SYMTAB) != 0 &&
1739 		    strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
1740 			symchk = shdr;
1741 		} else if ((content & CC_CONTENT_DEBUG) != 0 &&
1742 		    strncmp(name, ".debug_", strlen(".debug_")) == 0) {
1743 			/*
1744 			 * The design of the above check is intentional. In
1745 			 * particular, we want to capture any sections that
1746 			 * begin with '.debug_' for a few reasons:
1747 			 *
1748 			 * 1) Various revisions to the DWARF spec end up
1749 			 * changing the set of section headers that exist. This
1750 			 * ensures that we don't need to change the kernel to
1751 			 * get a new version.
1752 			 *
1753 			 * 2) Other software uses .debug_ sections for things
1754 			 * which aren't DWARF. This allows them to be captured
1755 			 * as well.
1756 			 */
1757 			count++;
1758 
1759 			if (!justcounting) {
1760 				if (count > remain) {
1761 					error = ENOMEM;
1762 					goto done;
1763 				}
1764 
1765 				elf_ctx_resize_scratch(ctx, shdr->sh_size);
1766 
1767 				if (!shstrtab_ndx(shstrtab,
1768 				    name, &v[idx].sh_name)) {
1769 					error = ENOMEM;
1770 					goto done;
1771 				}
1772 
1773 				v[idx].sh_addr = (Addr)(uintptr_t)saddr;
1774 				v[idx].sh_type = shdr->sh_type;
1775 				v[idx].sh_addralign = shdr->sh_addralign;
1776 				*doffp = roundup(*doffp, v[idx].sh_addralign);
1777 				v[idx].sh_offset = *doffp;
1778 				v[idx].sh_size = shdr->sh_size;
1779 				v[idx].sh_link = 0;
1780 				v[idx].sh_entsize = shdr->sh_entsize;
1781 				v[idx].sh_info = shdr->sh_info;
1782 
1783 				elf_copy_scn(ctx, shdr, mvp, &v[idx]);
1784 				idx++;
1785 			}
1786 
1787 			continue;
1788 		} else {
1789 			continue;
1790 		}
1791 
1792 		ASSERT(symchk != NULL);
1793 		if ((symchk->sh_type != SHT_DYNSYM &&
1794 		    symchk->sh_type != SHT_SYMTAB) ||
1795 		    symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
1796 			ctf_link = B_FALSE;
1797 			continue;
1798 		}
1799 		strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
1800 		if (strchk->sh_type != SHT_STRTAB) {
1801 			ctf_link = B_FALSE;
1802 			continue;
1803 		}
1804 		symtab = symchk;
1805 		strtab = strchk;
1806 
1807 		if (symtab != NULL && ctf != NULL &&
1808 		    (content & CC_CONTENT_DEBUG) == 0) {
1809 			/* No other shdrs are of interest at this point */
1810 			break;
1811 		}
1812 	}
1813 
1814 	if (ctf != NULL)
1815 		count += 1;
1816 	if (symtab != NULL)
1817 		count += 2;
1818 
1819 	if (count > remain) {
1820 		count = remain;
1821 		if (!justcounting)
1822 			error = ENOMEM;
1823 		goto done;
1824 	}
1825 
1826 	if (justcounting)
1827 		goto done;
1828 
1829 	/* output CTF section */
1830 	if (ctf != NULL) {
1831 		elf_ctx_resize_scratch(ctx, ctf->sh_size);
1832 
1833 		if (!shstrtab_ndx(shstrtab,
1834 		    shstrtab_data[STR_CTF], &v[idx].sh_name)) {
1835 			error = ENOMEM;
1836 			goto done;
1837 		}
1838 		v[idx].sh_addr = (Addr)(uintptr_t)saddr;
1839 		v[idx].sh_type = SHT_PROGBITS;
1840 		v[idx].sh_addralign = 4;
1841 		*doffp = roundup(*doffp, v[idx].sh_addralign);
1842 		v[idx].sh_offset = *doffp;
1843 		v[idx].sh_size = ctf->sh_size;
1844 
1845 		if (ctf_link) {
1846 			/*
1847 			 * The linked symtab (and strtab) will be output
1848 			 * immediately after this CTF section.  Its shdr index
1849 			 * directly follows this one.
1850 			 */
1851 			v[idx].sh_link = idx + 1;
1852 			ASSERT(symtab != NULL);
1853 		} else {
1854 			v[idx].sh_link = 0;
1855 		}
1856 		elf_copy_scn(ctx, ctf, mvp, &v[idx]);
1857 		idx++;
1858 	}
1859 
1860 	/* output SYMTAB/STRTAB sections */
1861 	if (symtab != NULL) {
1862 		shstrtype_t symtab_type, strtab_type;
1863 		uint_t symtab_name, strtab_name;
1864 
1865 		elf_ctx_resize_scratch(ctx,
1866 		    MAX(symtab->sh_size, strtab->sh_size));
1867 
1868 		if (symtab->sh_type == SHT_DYNSYM) {
1869 			symtab_type = STR_DYNSYM;
1870 			strtab_type = STR_DYNSTR;
1871 		} else {
1872 			symtab_type = STR_SYMTAB;
1873 			strtab_type = STR_STRTAB;
1874 		}
1875 
1876 		if (!shstrtab_ndx(shstrtab,
1877 		    shstrtab_data[symtab_type], &symtab_name)) {
1878 			error = ENOMEM;
1879 			goto done;
1880 		}
1881 		if (!shstrtab_ndx(shstrtab,
1882 		    shstrtab_data[strtab_type], &strtab_name)) {
1883 			error = ENOMEM;
1884 			goto done;
1885 		}
1886 
1887 		v[idx].sh_name = symtab_name;
1888 		v[idx].sh_type = symtab->sh_type;
1889 		v[idx].sh_addr = symtab->sh_addr;
1890 		if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
1891 			v[idx].sh_addr += (Addr)(uintptr_t)saddr;
1892 		v[idx].sh_addralign = symtab->sh_addralign;
1893 		*doffp = roundup(*doffp, v[idx].sh_addralign);
1894 		v[idx].sh_offset = *doffp;
1895 		v[idx].sh_size = symtab->sh_size;
1896 		v[idx].sh_link = idx + 1;
1897 		v[idx].sh_entsize = symtab->sh_entsize;
1898 		v[idx].sh_info = symtab->sh_info;
1899 
1900 		elf_copy_scn(ctx, symtab, mvp, &v[idx]);
1901 		idx++;
1902 
1903 		v[idx].sh_name = strtab_name;
1904 		v[idx].sh_type = SHT_STRTAB;
1905 		v[idx].sh_flags = SHF_STRINGS;
1906 		v[idx].sh_addr = strtab->sh_addr;
1907 		if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
1908 			v[idx].sh_addr += (Addr)(uintptr_t)saddr;
1909 		v[idx].sh_addralign = strtab->sh_addralign;
1910 		*doffp = roundup(*doffp, v[idx].sh_addralign);
1911 		v[idx].sh_offset = *doffp;
1912 		v[idx].sh_size = strtab->sh_size;
1913 
1914 		elf_copy_scn(ctx, strtab, mvp, &v[idx]);
1915 		idx++;
1916 	}
1917 
1918 done:
1919 	kmem_free(shstrbase, shstrsize);
1920 	kmem_free(shbase, shsize);
1921 
1922 	if (error == 0)
1923 		*countp = count;
1924 
1925 	return (error);
1926 }
1927 
1928 /*
1929  * Walk mappings in process address space, examining those which correspond to
1930  * loaded objects.  It is called twice from elfcore: Once to simply count
1931  * relevant sections, and again later to copy those sections once an adequate
1932  * buffer has been allocated for the shdr details.
1933  */
1934 static int
1935 elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
1936 {
1937 	vnode_t *lastvp = NULL;
1938 	struct seg *seg;
1939 	uint_t idx = 0, remain;
1940 	shstrtab_t shstrtab;
1941 	struct as *as = ctx->ecc_p->p_as;
1942 	int error = 0;
1943 
1944 	ASSERT(AS_WRITE_HELD(as));
1945 
1946 	if (v != NULL) {
1947 		ASSERT(nv != 0);
1948 
1949 		if (!shstrtab_init(&shstrtab))
1950 			return (ENOMEM);
1951 		remain = nv;
1952 	} else {
1953 		ASSERT(nv == 0);
1954 
1955 		/*
1956 		 * The shdrs are being counted, rather than outputting them
1957 		 * into a buffer.  Leave room for two entries: the SHT_NULL at
1958 		 * index 0 and the shstrtab at the end.
1959 		 */
1960 		remain = UINT_MAX - 2;
1961 	}
1962 
1963 	/* Per the ELF spec, shdr index 0 is reserved. */
1964 	idx = 1;
1965 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1966 		vnode_t *mvp;
1967 		void *tmp = NULL;
1968 		caddr_t saddr = seg->s_base, naddr, eaddr;
1969 		size_t segsize;
1970 		uint_t count, prot;
1971 
1972 		/*
1973 		 * Since we're just looking for text segments of load
1974 		 * objects, we only care about the protection bits; we don't
1975 		 * care about the actual size of the segment so we use the
1976 		 * reserved size. If the segment's size is zero, there's
1977 		 * something fishy going on so we ignore this segment.
1978 		 */
1979 		if (seg->s_ops != &segvn_ops ||
1980 		    SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
1981 		    mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
1982 		    (segsize = pr_getsegsize(seg, 1)) == 0)
1983 			continue;
1984 
1985 		eaddr = saddr + segsize;
1986 		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
1987 		pr_getprot_done(&tmp);
1988 
1989 		/*
1990 		 * Skip this segment unless the protection bits look like
1991 		 * what we'd expect for a text segment.
1992 		 */
1993 		if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
1994 			continue;
1995 
1996 		error = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
1997 		    &shstrtab, &count);
1998 		if (error != 0)
1999 			goto done;
2000 
2001 		ASSERT(count <= remain);
2002 		ASSERT(v == NULL || (idx + count) < nv);
2003 
2004 		remain -= count;
2005 		idx += count;
2006 		lastvp = mvp;
2007 	}
2008 
2009 	if (v == NULL) {
2010 		if (idx == 1) {
2011 			*nshdrsp = 0;
2012 		} else {
2013 			/* Include room for the shrstrtab at the end */
2014 			*nshdrsp = idx + 1;
2015 		}
2016 		return (0);
2017 	}
2018 
2019 	if (idx != nv - 1) {
2020 		cmn_err(CE_WARN, "elfcore: core dump failed for "
2021 		    "process %d; address space is changing",
2022 		    ctx->ecc_p->p_pid);
2023 		error = EIO;
2024 		goto done;
2025 	}
2026 
2027 	if (!shstrtab_ndx(&shstrtab, shstrtab_data[STR_SHSTRTAB],
2028 	    &v[idx].sh_name)) {
2029 		error = ENOMEM;
2030 		goto done;
2031 	}
2032 	v[idx].sh_size = shstrtab_size(&shstrtab);
2033 	v[idx].sh_addralign = 1;
2034 	v[idx].sh_offset = ctx->ecc_doffset;
2035 	v[idx].sh_flags = SHF_STRINGS;
2036 	v[idx].sh_type = SHT_STRTAB;
2037 
2038 	elf_ctx_resize_scratch(ctx, v[idx].sh_size);
2039 	VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
2040 	shstrtab_dump(&shstrtab, ctx->ecc_buf);
2041 
2042 	error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
2043 	    ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
2044 	if (error == 0) {
2045 		ctx->ecc_doffset += v[idx].sh_size;
2046 	}
2047 
2048 done:
2049 	if (v != NULL)
2050 		shstrtab_fini(&shstrtab);
2051 
2052 	return (error);
2053 }
2054 
2055 int
2056 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2057     core_content_t content)
2058 {
2059 	u_offset_t poffset, soffset, doffset;
2060 	int error;
2061 	uint_t i, nphdrs, nshdrs;
2062 	struct seg *seg;
2063 	struct as *as = p->p_as;
2064 	void *bigwad, *zeropg = NULL;
2065 	size_t bigsize, phdrsz, shdrsz;
2066 	Ehdr *ehdr;
2067 	Phdr *phdr;
2068 	Shdr shdr0;
2069 	caddr_t brkbase, stkbase;
2070 	size_t brksize, stksize;
2071 	boolean_t overflowed = B_FALSE, retried = B_FALSE;
2072 	klwp_t *lwp = ttolwp(curthread);
2073 	elf_core_ctx_t ctx = {
2074 		.ecc_vp = vp,
2075 		.ecc_p = p,
2076 		.ecc_credp = credp,
2077 		.ecc_rlimit = rlimit,
2078 		.ecc_content = content,
2079 		.ecc_doffset = 0,
2080 		.ecc_buf = NULL,
2081 		.ecc_bufsz = 0
2082 	};
2083 
2084 top:
2085 	/*
2086 	 * Make sure we have everything we need (registers, etc.).
2087 	 * All other lwps have already stopped and are in an orderly state.
2088 	 */
2089 	ASSERT(p == ttoproc(curthread));
2090 	prstop(0, 0);
2091 
2092 	AS_LOCK_ENTER(as, RW_WRITER);
2093 	nphdrs = prnsegs(as, 0) + 2;		/* two CORE note sections */
2094 
2095 	/*
2096 	 * Count the number of section headers we're going to need.
2097 	 */
2098 	nshdrs = 0;
2099 	if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) {
2100 		VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2101 	}
2102 	AS_LOCK_EXIT(as);
2103 
2104 	/*
2105 	 * The core file contents may require zero section headers, but if
2106 	 * we overflow the 16 bits allotted to the program header count in
2107 	 * the ELF header, we'll need that program header at index zero.
2108 	 */
2109 	if (nshdrs == 0 && nphdrs >= PN_XNUM)
2110 		nshdrs = 1;
2111 
2112 	/*
2113 	 * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
2114 	 * or shdrs needed to produce the core file.  It is used for the three
2115 	 * tasks sequentially, not simultaneously, so it does not need space
2116 	 * for all three data at once, only the largest one.
2117 	 */
2118 	VERIFY(nphdrs >= 2);
2119 	phdrsz = nphdrs * sizeof (Phdr);
2120 	shdrsz = nshdrs * sizeof (Shdr);
2121 	bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2122 	bigwad = kmem_alloc(bigsize, KM_SLEEP);
2123 
2124 	ehdr = (Ehdr *)bigwad;
2125 	bzero(ehdr, sizeof (*ehdr));
2126 
2127 	ehdr->e_ident[EI_MAG0] = ELFMAG0;
2128 	ehdr->e_ident[EI_MAG1] = ELFMAG1;
2129 	ehdr->e_ident[EI_MAG2] = ELFMAG2;
2130 	ehdr->e_ident[EI_MAG3] = ELFMAG3;
2131 	ehdr->e_ident[EI_CLASS] = ELFCLASS;
2132 	ehdr->e_type = ET_CORE;
2133 
2134 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2135 
2136 #if defined(__sparc)
2137 	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2138 	ehdr->e_machine = EM_SPARC;
2139 #elif defined(__i386_COMPAT)
2140 	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2141 	ehdr->e_machine = EM_386;
2142 #else
2143 #error "no recognized machine type is defined"
2144 #endif
2145 
2146 #else	/* !defined(_LP64) || defined(_ELF32_COMPAT) */
2147 
2148 #if defined(__sparc)
2149 	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2150 	ehdr->e_machine = EM_SPARCV9;
2151 #elif defined(__amd64)
2152 	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2153 	ehdr->e_machine = EM_AMD64;
2154 #else
2155 #error "no recognized 64-bit machine type is defined"
2156 #endif
2157 
2158 #endif	/* !defined(_LP64) || defined(_ELF32_COMPAT) */
2159 
2160 	poffset = sizeof (Ehdr);
2161 	soffset = sizeof (Ehdr) + phdrsz;
2162 	doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2163 	bzero(&shdr0, sizeof (shdr0));
2164 
2165 	/*
2166 	 * If the count of program headers or section headers or the index
2167 	 * of the section string table can't fit in the mere 16 bits
2168 	 * shortsightedly allotted to them in the ELF header, we use the
2169 	 * extended formats and put the real values in the section header
2170 	 * as index 0.
2171 	 */
2172 	if (nphdrs >= PN_XNUM) {
2173 		ehdr->e_phnum = PN_XNUM;
2174 		shdr0.sh_info = nphdrs;
2175 	} else {
2176 		ehdr->e_phnum = (unsigned short)nphdrs;
2177 	}
2178 
2179 	if (nshdrs > 0) {
2180 		if (nshdrs >= SHN_LORESERVE) {
2181 			ehdr->e_shnum = 0;
2182 			shdr0.sh_size = nshdrs;
2183 		} else {
2184 			ehdr->e_shnum = (unsigned short)nshdrs;
2185 		}
2186 
2187 		if (nshdrs - 1 >= SHN_LORESERVE) {
2188 			ehdr->e_shstrndx = SHN_XINDEX;
2189 			shdr0.sh_link = nshdrs - 1;
2190 		} else {
2191 			ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2192 		}
2193 
2194 		ehdr->e_shoff = soffset;
2195 		ehdr->e_shentsize = sizeof (Shdr);
2196 	}
2197 
2198 	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
2199 	ehdr->e_version = EV_CURRENT;
2200 	ehdr->e_ehsize = sizeof (Ehdr);
2201 	ehdr->e_phoff = poffset;
2202 	ehdr->e_phentsize = sizeof (Phdr);
2203 
2204 	if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2205 	    sizeof (Ehdr), rlimit, credp)) {
2206 		goto done;
2207 	}
2208 
2209 	phdr = (Phdr *)bigwad;
2210 	bzero(phdr, phdrsz);
2211 
2212 	setup_old_note_header(&phdr[0], p);
2213 	phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2214 	doffset += phdr[0].p_filesz;
2215 
2216 	setup_note_header(&phdr[1], p);
2217 	phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2218 	doffset += phdr[1].p_filesz;
2219 
2220 	mutex_enter(&p->p_lock);
2221 
2222 	brkbase = p->p_brkbase;
2223 	brksize = p->p_brksize;
2224 
2225 	stkbase = p->p_usrstack - p->p_stksize;
2226 	stksize = p->p_stksize;
2227 
2228 	mutex_exit(&p->p_lock);
2229 
2230 	AS_LOCK_ENTER(as, RW_WRITER);
2231 	i = 2;
2232 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2233 		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2234 		caddr_t saddr, naddr;
2235 		void *tmp = NULL;
2236 		extern struct seg_ops segspt_shmops;
2237 
2238 		if ((seg->s_flags & S_HOLE) != 0) {
2239 			continue;
2240 		}
2241 
2242 		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2243 			uint_t prot;
2244 			size_t size;
2245 			int type;
2246 			vnode_t *mvp;
2247 
2248 			prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2249 			prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2250 			if ((size = (size_t)(naddr - saddr)) == 0) {
2251 				ASSERT(tmp == NULL);
2252 				continue;
2253 			} else if (i == nphdrs) {
2254 				pr_getprot_done(&tmp);
2255 				overflowed = B_TRUE;
2256 				break;
2257 			}
2258 			phdr[i].p_type = PT_LOAD;
2259 			phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2260 			phdr[i].p_memsz = size;
2261 			if (prot & PROT_READ)
2262 				phdr[i].p_flags |= PF_R;
2263 			if (prot & PROT_WRITE)
2264 				phdr[i].p_flags |= PF_W;
2265 			if (prot & PROT_EXEC)
2266 				phdr[i].p_flags |= PF_X;
2267 
2268 			/*
2269 			 * Figure out which mappings to include in the core.
2270 			 */
2271 			type = SEGOP_GETTYPE(seg, saddr);
2272 
2273 			if (saddr == stkbase && size == stksize) {
2274 				if (!(content & CC_CONTENT_STACK))
2275 					goto exclude;
2276 
2277 			} else if (saddr == brkbase && size == brksize) {
2278 				if (!(content & CC_CONTENT_HEAP))
2279 					goto exclude;
2280 
2281 			} else if (seg->s_ops == &segspt_shmops) {
2282 				if (type & MAP_NORESERVE) {
2283 					if (!(content & CC_CONTENT_DISM))
2284 						goto exclude;
2285 				} else {
2286 					if (!(content & CC_CONTENT_ISM))
2287 						goto exclude;
2288 				}
2289 
2290 			} else if (seg->s_ops != &segvn_ops) {
2291 				goto exclude;
2292 
2293 			} else if (type & MAP_SHARED) {
2294 				if (shmgetid(p, saddr) != SHMID_NONE) {
2295 					if (!(content & CC_CONTENT_SHM))
2296 						goto exclude;
2297 
2298 				} else if (SEGOP_GETVP(seg, seg->s_base,
2299 				    &mvp) != 0 || mvp == NULL ||
2300 				    mvp->v_type != VREG) {
2301 					if (!(content & CC_CONTENT_SHANON))
2302 						goto exclude;
2303 
2304 				} else {
2305 					if (!(content & CC_CONTENT_SHFILE))
2306 						goto exclude;
2307 				}
2308 
2309 			} else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2310 			    mvp == NULL || mvp->v_type != VREG) {
2311 				if (!(content & CC_CONTENT_ANON))
2312 					goto exclude;
2313 
2314 			} else if (prot == (PROT_READ | PROT_EXEC)) {
2315 				if (!(content & CC_CONTENT_TEXT))
2316 					goto exclude;
2317 
2318 			} else if (prot == PROT_READ) {
2319 				if (!(content & CC_CONTENT_RODATA))
2320 					goto exclude;
2321 
2322 			} else {
2323 				if (!(content & CC_CONTENT_DATA))
2324 					goto exclude;
2325 			}
2326 
2327 			doffset = roundup(doffset, sizeof (Word));
2328 			phdr[i].p_offset = doffset;
2329 			phdr[i].p_filesz = size;
2330 			doffset += size;
2331 exclude:
2332 			i++;
2333 		}
2334 		VERIFY(tmp == NULL);
2335 		if (overflowed)
2336 			break;
2337 	}
2338 	AS_LOCK_EXIT(as);
2339 
2340 	if (overflowed || i != nphdrs) {
2341 		if (!retried) {
2342 			retried = B_TRUE;
2343 			overflowed = B_FALSE;
2344 			kmem_free(bigwad, bigsize);
2345 			goto top;
2346 		}
2347 		cmn_err(CE_WARN, "elfcore: core dump failed for "
2348 		    "process %d; address space is changing", p->p_pid);
2349 		error = EIO;
2350 		goto done;
2351 	}
2352 
2353 	if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2354 	    phdr, phdrsz, rlimit, credp)) != 0) {
2355 		goto done;
2356 	}
2357 
2358 	if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2359 	    credp)) != 0) {
2360 		goto done;
2361 	}
2362 	if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2363 	    credp, content)) != 0) {
2364 		goto done;
2365 	}
2366 
2367 	for (i = 2; i < nphdrs; i++) {
2368 		prkillinfo_t killinfo;
2369 		sigqueue_t *sq;
2370 		int sig, j;
2371 
2372 		if (phdr[i].p_filesz == 0)
2373 			continue;
2374 
2375 		/*
2376 		 * If we hit a region that was mapped PROT_NONE then we cannot
2377 		 * continue dumping this normally as the kernel would be unable
2378 		 * to read from the page and that would result in us failing to
2379 		 * dump the page. As such, any region mapped PROT_NONE, we dump
2380 		 * as a zero-filled page such that this is still represented in
2381 		 * the map.
2382 		 *
2383 		 * If dumping out this segment fails, rather than failing
2384 		 * the core dump entirely, we reset the size of the mapping
2385 		 * to zero to indicate that the data is absent from the core
2386 		 * file and or in the PF_SUNW_FAILURE flag to differentiate
2387 		 * this from mappings that were excluded due to the core file
2388 		 * content settings.
2389 		 */
2390 		if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
2391 			size_t towrite = phdr[i].p_filesz;
2392 			size_t curoff = 0;
2393 
2394 			if (zeropg == NULL) {
2395 				zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
2396 			}
2397 
2398 			error = 0;
2399 			while (towrite != 0) {
2400 				size_t len = MIN(towrite, elf_zeropg_sz);
2401 
2402 				error = core_write(vp, UIO_SYSSPACE,
2403 				    phdr[i].p_offset + curoff, zeropg, len,
2404 				    rlimit, credp);
2405 				if (error != 0)
2406 					break;
2407 
2408 				towrite -= len;
2409 				curoff += len;
2410 			}
2411 		} else {
2412 			error = core_seg(p, vp, phdr[i].p_offset,
2413 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
2414 			    phdr[i].p_filesz, rlimit, credp);
2415 		}
2416 		if (error == 0)
2417 			continue;
2418 
2419 		if ((sig = lwp->lwp_cursig) == 0) {
2420 			/*
2421 			 * We failed due to something other than a signal.
2422 			 * Since the space reserved for the segment is now
2423 			 * unused, we stash the errno in the first four
2424 			 * bytes. This undocumented interface will let us
2425 			 * understand the nature of the failure.
2426 			 */
2427 			(void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2428 			    &error, sizeof (error), rlimit, credp);
2429 
2430 			phdr[i].p_filesz = 0;
2431 			phdr[i].p_flags |= PF_SUNW_FAILURE;
2432 			if ((error = core_write(vp, UIO_SYSSPACE,
2433 			    poffset + sizeof (Phdr) * i, &phdr[i],
2434 			    sizeof (Phdr), rlimit, credp)) != 0)
2435 				goto done;
2436 
2437 			continue;
2438 		}
2439 
2440 		/*
2441 		 * We took a signal.  We want to abort the dump entirely, but
2442 		 * we also want to indicate what failed and why.  We therefore
2443 		 * use the space reserved for the first failing segment to
2444 		 * write our error (which, for purposes of compatability with
2445 		 * older core dump readers, we set to EINTR) followed by any
2446 		 * siginfo associated with the signal.
2447 		 */
2448 		bzero(&killinfo, sizeof (killinfo));
2449 		killinfo.prk_error = EINTR;
2450 
2451 		sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2452 
2453 		if (sq != NULL) {
2454 			bcopy(&sq->sq_info, &killinfo.prk_info,
2455 			    sizeof (sq->sq_info));
2456 		} else {
2457 			killinfo.prk_info.si_signo = lwp->lwp_cursig;
2458 			killinfo.prk_info.si_code = SI_NOINFO;
2459 		}
2460 
2461 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2462 		/*
2463 		 * If this is a 32-bit process, we need to translate from the
2464 		 * native siginfo to the 32-bit variant.  (Core readers must
2465 		 * always have the same data model as their target or must
2466 		 * be aware of -- and compensate for -- data model differences.)
2467 		 */
2468 		if (curproc->p_model == DATAMODEL_ILP32) {
2469 			siginfo32_t si32;
2470 
2471 			siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2472 			bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2473 		}
2474 #endif
2475 
2476 		(void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2477 		    &killinfo, sizeof (killinfo), rlimit, credp);
2478 
2479 		/*
2480 		 * For the segment on which we took the signal, indicate that
2481 		 * its data now refers to a siginfo.
2482 		 */
2483 		phdr[i].p_filesz = 0;
2484 		phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2485 		    PF_SUNW_SIGINFO;
2486 
2487 		/*
2488 		 * And for every other segment, indicate that its absence
2489 		 * is due to a signal.
2490 		 */
2491 		for (j = i + 1; j < nphdrs; j++) {
2492 			phdr[j].p_filesz = 0;
2493 			phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2494 		}
2495 
2496 		/*
2497 		 * Finally, write out our modified program headers.
2498 		 */
2499 		if ((error = core_write(vp, UIO_SYSSPACE,
2500 		    poffset + sizeof (Phdr) * i, &phdr[i],
2501 		    sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2502 			goto done;
2503 		}
2504 
2505 		break;
2506 	}
2507 
2508 	if (nshdrs > 0) {
2509 		Shdr *shdr = (Shdr *)bigwad;
2510 
2511 		bzero(shdr, shdrsz);
2512 		if (nshdrs > 1) {
2513 			ctx.ecc_doffset = doffset;
2514 			AS_LOCK_ENTER(as, RW_WRITER);
2515 			error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2516 			AS_LOCK_EXIT(as);
2517 			if (error != 0) {
2518 				goto done;
2519 			}
2520 		}
2521 		/* Copy any extended format data destined for the first shdr */
2522 		bcopy(&shdr0, shdr, sizeof (shdr0));
2523 
2524 		error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2525 		    rlimit, credp);
2526 	}
2527 
2528 done:
2529 	if (zeropg != NULL)
2530 		kmem_free(zeropg, elf_zeropg_sz);
2531 	if (ctx.ecc_bufsz != 0)
2532 		kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2533 	kmem_free(bigwad, bigsize);
2534 	return (error);
2535 }
2536 
2537 #ifndef	_ELF32_COMPAT
2538 
/*
 * Exec switch entry for ELF objects of the kernel's native class: the
 * magic string is the 64-bit one when _LP64 is defined and the 32-bit
 * one otherwise.  The entry names elfexec and elfcore as its handlers.
 * The initializer is positional; the two integers are presumably the
 * offset and length of the magic number -- confirm against the
 * definition of struct execsw.
 */
static struct execsw esw = {
#ifdef	_LP64
	elf64magicstr,
#else	/* _LP64 */
	elf32magicstr,
#endif	/* _LP64 */
	0,
	5,
	elfexec,
	elfcore
};
2550 
/*
 * Module linkage record tying the native exec switch entry (esw) to
 * mod_execops, together with a human-readable description string.
 */
static struct modlexec modlexec = {
	&mod_execops, "exec module for elf", &esw
};
2554 
2555 #ifdef	_LP64
/*
 * The 32-bit exec and core entry points referenced by esw32.  They are
 * defined outside this compilation unit -- presumably by building this
 * same source with _ELF32_COMPAT defined; confirm against the build.
 */
extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
			intpdata_t *idatap, int level, size_t *execsz,
			int setid, caddr_t exec_file, cred_t *cred,
			int brand_action);
extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
			rlim64_t rlimit, int sig, core_content_t content);
2562 
/*
 * Exec switch entry for 32-bit ELF objects on a 64-bit kernel; only
 * compiled when _LP64 is defined.  Same layout as esw, but using the
 * 32-bit magic string and the elf32exec/elf32core handlers.
 */
static struct execsw esw32 = {
	elf32magicstr,
	0,
	5,
	elf32exec,
	elf32core
};
2570 
/*
 * Module linkage record for the 32-bit exec switch entry (esw32).
 */
static struct modlexec modlexec32 = {
	&mod_execops, "32-bit exec module for elf", &esw32
};
2574 #endif	/* _LP64 */
2575 
/*
 * Linkage handed to mod_install(), mod_remove() and mod_info() by the
 * entry points below.  It lists the native exec linkage and, on _LP64
 * kernels, the 32-bit one as well; the trailing NULL presumably
 * terminates the list -- confirm against struct modlinkage.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlexec,
#ifdef	_LP64
	(void *)&modlexec32,
#endif	/* _LP64 */
	NULL
};
2584 
2585 int
2586 _init(void)
2587 {
2588 	return (mod_install(&modlinkage));
2589 }
2590 
2591 int
2592 _fini(void)
2593 {
2594 	return (mod_remove(&modlinkage));
2595 }
2596 
2597 int
2598 _info(struct modinfo *modinfop)
2599 {
2600 	return (mod_info(&modlinkage, modinfop));
2601 }
2602 
2603 #endif	/* !_ELF32_COMPAT */
2604