xref: /illumos-gate/usr/src/uts/common/fs/proc/prsubr.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
26  * Copyright 2022 MNX Cloud, Inc.
27  * Copyright 2022 Oxide Computer Company
28  */
29 
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved	*/
32 
33 #include <sys/types.h>
34 #include <sys/t_lock.h>
35 #include <sys/param.h>
36 #include <sys/cmn_err.h>
37 #include <sys/cred.h>
38 #include <sys/priv.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/inline.h>
42 #include <sys/kmem.h>
43 #include <sys/mman.h>
44 #include <sys/proc.h>
45 #include <sys/brand.h>
46 #include <sys/sobject.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/uio.h>
50 #include <sys/var.h>
51 #include <sys/vfs.h>
52 #include <sys/vnode.h>
53 #include <sys/session.h>
54 #include <sys/pcb.h>
55 #include <sys/signal.h>
56 #include <sys/user.h>
57 #include <sys/disp.h>
58 #include <sys/class.h>
59 #include <sys/ts.h>
60 #include <sys/bitmap.h>
61 #include <sys/poll.h>
62 #include <sys/shm_impl.h>
63 #include <sys/fault.h>
64 #include <sys/syscall.h>
65 #include <sys/procfs.h>
66 #include <sys/processor.h>
67 #include <sys/cpuvar.h>
68 #include <sys/copyops.h>
69 #include <sys/time.h>
70 #include <sys/msacct.h>
71 #include <sys/flock_impl.h>
72 #include <sys/stropts.h>
73 #include <sys/strsubr.h>
74 #include <sys/pathname.h>
75 #include <sys/mode.h>
76 #include <sys/socketvar.h>
77 #include <sys/autoconf.h>
78 #include <sys/dtrace.h>
79 #include <sys/timod.h>
80 #include <sys/fs/namenode.h>
81 #include <netinet/udp.h>
82 #include <netinet/tcp.h>
83 #include <inet/cc.h>
84 #include <vm/as.h>
85 #include <vm/rm.h>
86 #include <vm/seg.h>
87 #include <vm/seg_vn.h>
88 #include <vm/seg_dev.h>
89 #include <vm/seg_spt.h>
90 #include <vm/page.h>
91 #include <sys/vmparam.h>
92 #include <sys/swap.h>
93 #include <fs/proc/prdata.h>
94 #include <sys/task.h>
95 #include <sys/project.h>
96 #include <sys/contract_impl.h>
97 #include <sys/contract/process.h>
98 #include <sys/contract/process_impl.h>
99 #include <sys/schedctl.h>
100 #include <sys/pool.h>
101 #include <sys/zone.h>
102 #include <sys/atomic.h>
103 #include <sys/sdt.h>
104 
105 #define	MAX_ITERS_SPIN	5
106 
107 typedef struct prpagev {
108 	uint_t *pg_protv;	/* vector of page permissions */
109 	char *pg_incore;	/* vector of incore flags */
110 	size_t pg_npages;	/* number of pages in protv and incore */
111 	ulong_t pg_pnbase;	/* pn within segment of first protv element */
112 } prpagev_t;
113 
114 size_t pagev_lim = 256 * 1024;	/* limit on number of pages in prpagev_t */
115 
116 extern struct seg_ops segdev_ops;	/* needs a header file */
117 extern struct seg_ops segspt_shmops;	/* needs a header file */
118 
119 static	int	set_watched_page(proc_t *, caddr_t, caddr_t, ulong_t, ulong_t);
120 static	void	clear_watched_page(proc_t *, caddr_t, caddr_t, ulong_t);
121 
122 /*
123  * Choose an lwp from the complete set of lwps for the process.
124  * This is called for any operation applied to the process
125  * file descriptor that requires an lwp to operate upon.
126  *
127  * Returns a pointer to the thread for the selected LWP,
128  * and with the dispatcher lock held for the thread.
129  *
130  * The algorithm for choosing an lwp is critical for /proc semantics;
131  * don't touch this code unless you know all of the implications.
132  */
133 kthread_t *
134 prchoose(proc_t *p)
135 {
136 	kthread_t *t;
137 	kthread_t *t_onproc = NULL;	/* running on processor */
138 	kthread_t *t_run = NULL;	/* runnable, on disp queue */
139 	kthread_t *t_sleep = NULL;	/* sleeping */
140 	kthread_t *t_hold = NULL;	/* sleeping, performing hold */
141 	kthread_t *t_susp = NULL;	/* suspended stop */
142 	kthread_t *t_jstop = NULL;	/* jobcontrol stop, w/o directed stop */
143 	kthread_t *t_jdstop = NULL;	/* jobcontrol stop with directed stop */
144 	kthread_t *t_req = NULL;	/* requested stop */
145 	kthread_t *t_istop = NULL;	/* event-of-interest stop */
146 	kthread_t *t_dtrace = NULL;	/* DTrace stop */
147 
148 	ASSERT(MUTEX_HELD(&p->p_lock));
149 
150 	/*
151 	 * If the agent lwp exists, it takes precedence over all others.
152 	 */
153 	if ((t = p->p_agenttp) != NULL) {
154 		thread_lock(t);
155 		return (t);
156 	}
157 
158 	if ((t = p->p_tlist) == NULL)	/* start at the head of the list */
159 		return (t);
160 	do {		/* for eacn lwp in the process */
161 		if (VSTOPPED(t)) {	/* virtually stopped */
162 			if (t_req == NULL)
163 				t_req = t;
164 			continue;
165 		}
166 
167 		/* If this is a process kernel thread, ignore it. */
168 		if ((t->t_proc_flag & TP_KTHREAD) != 0) {
169 			continue;
170 		}
171 
172 		thread_lock(t);		/* make sure thread is in good state */
173 		switch (t->t_state) {
174 		default:
175 			panic("prchoose: bad thread state %d, thread 0x%p",
176 			    t->t_state, (void *)t);
177 			/*NOTREACHED*/
178 		case TS_SLEEP:
179 			/* this is filthy */
180 			if (t->t_wchan == (caddr_t)&p->p_holdlwps &&
181 			    t->t_wchan0 == NULL) {
182 				if (t_hold == NULL)
183 					t_hold = t;
184 			} else {
185 				if (t_sleep == NULL)
186 					t_sleep = t;
187 			}
188 			break;
189 		case TS_RUN:
190 		case TS_WAIT:
191 			if (t_run == NULL)
192 				t_run = t;
193 			break;
194 		case TS_ONPROC:
195 			if (t_onproc == NULL)
196 				t_onproc = t;
197 			break;
198 		case TS_ZOMB:		/* last possible choice */
199 			break;
200 		case TS_STOPPED:
201 			switch (t->t_whystop) {
202 			case PR_SUSPENDED:
203 				if (t_susp == NULL)
204 					t_susp = t;
205 				break;
206 			case PR_JOBCONTROL:
207 				if (t->t_proc_flag & TP_PRSTOP) {
208 					if (t_jdstop == NULL)
209 						t_jdstop = t;
210 				} else {
211 					if (t_jstop == NULL)
212 						t_jstop = t;
213 				}
214 				break;
215 			case PR_REQUESTED:
216 				if (t->t_dtrace_stop && t_dtrace == NULL)
217 					t_dtrace = t;
218 				else if (t_req == NULL)
219 					t_req = t;
220 				break;
221 			case PR_SYSENTRY:
222 			case PR_SYSEXIT:
223 			case PR_SIGNALLED:
224 			case PR_FAULTED:
225 				/*
226 				 * Make an lwp calling exit() be the
227 				 * last lwp seen in the process.
228 				 */
229 				if (t_istop == NULL ||
230 				    (t_istop->t_whystop == PR_SYSENTRY &&
231 				    t_istop->t_whatstop == SYS_exit))
232 					t_istop = t;
233 				break;
234 			case PR_CHECKPOINT:	/* can't happen? */
235 				break;
236 			default:
237 				panic("prchoose: bad t_whystop %d, thread 0x%p",
238 				    t->t_whystop, (void *)t);
239 				/*NOTREACHED*/
240 			}
241 			break;
242 		}
243 		thread_unlock(t);
244 	} while ((t = t->t_forw) != p->p_tlist);
245 
246 	if (t_onproc)
247 		t = t_onproc;
248 	else if (t_run)
249 		t = t_run;
250 	else if (t_sleep)
251 		t = t_sleep;
252 	else if (t_jstop)
253 		t = t_jstop;
254 	else if (t_jdstop)
255 		t = t_jdstop;
256 	else if (t_istop)
257 		t = t_istop;
258 	else if (t_dtrace)
259 		t = t_dtrace;
260 	else if (t_req)
261 		t = t_req;
262 	else if (t_hold)
263 		t = t_hold;
264 	else if (t_susp)
265 		t = t_susp;
266 	else			/* TS_ZOMB */
267 		t = p->p_tlist;
268 
269 	if (t != NULL)
270 		thread_lock(t);
271 	return (t);
272 }
273 
274 /*
275  * Wakeup anyone sleeping on the /proc vnode for the process/lwp to stop.
276  * Also call pollwakeup() if any lwps are waiting in poll() for POLLPRI
277  * on the /proc file descriptor.  Called from stop() when a traced
278  * process stops on an event of interest.  Also called from exit()
279  * and prinvalidate() to indicate POLLHUP and POLLERR respectively.
280  */
281 void
282 prnotify(struct vnode *vp)
283 {
284 	prcommon_t *pcp = VTOP(vp)->pr_common;
285 
286 	mutex_enter(&pcp->prc_mutex);
287 	cv_broadcast(&pcp->prc_wait);
288 	mutex_exit(&pcp->prc_mutex);
289 	if (pcp->prc_flags & PRC_POLL) {
290 		/*
291 		 * We call pollwakeup() with POLLHUP to ensure that
292 		 * the pollers are awakened even if they are polling
293 		 * for nothing (i.e., waiting for the process to exit).
294 		 * This enables the use of the PRC_POLL flag for optimization
295 		 * (we can turn off PRC_POLL only if we know no pollers remain).
296 		 */
297 		pcp->prc_flags &= ~PRC_POLL;
298 		pollwakeup(&pcp->prc_pollhead, POLLHUP);
299 	}
300 }
301 
302 /* called immediately below, in prfree() */
303 static void
304 prfreenotify(vnode_t *vp)
305 {
306 	prnode_t *pnp;
307 	prcommon_t *pcp;
308 
309 	while (vp != NULL) {
310 		pnp = VTOP(vp);
311 		pcp = pnp->pr_common;
312 		ASSERT(pcp->prc_thread == NULL);
313 		pcp->prc_proc = NULL;
314 		/*
315 		 * We can't call prnotify() here because we are holding
316 		 * pidlock.  We assert that there is no need to.
317 		 */
318 		mutex_enter(&pcp->prc_mutex);
319 		cv_broadcast(&pcp->prc_wait);
320 		mutex_exit(&pcp->prc_mutex);
321 		ASSERT(!(pcp->prc_flags & PRC_POLL));
322 
323 		vp = pnp->pr_next;
324 		pnp->pr_next = NULL;
325 	}
326 }
327 
328 /*
329  * Called from a hook in freeproc() when a traced process is removed
330  * from the process table.  The proc-table pointers of all associated
331  * /proc vnodes are cleared to indicate that the process has gone away.
332  */
333 void
334 prfree(proc_t *p)
335 {
336 	uint_t slot = p->p_slot;
337 
338 	ASSERT(MUTEX_HELD(&pidlock));
339 
340 	/*
341 	 * Block the process against /proc so it can be freed.
342 	 * It cannot be freed while locked by some controlling process.
343 	 * Lock ordering:
344 	 *	pidlock -> pr_pidlock -> p->p_lock -> pcp->prc_mutex
345 	 */
346 	mutex_enter(&pr_pidlock);	/* protects pcp->prc_proc */
347 	mutex_enter(&p->p_lock);
348 	while (p->p_proc_flag & P_PR_LOCK) {
349 		mutex_exit(&pr_pidlock);
350 		cv_wait(&pr_pid_cv[slot], &p->p_lock);
351 		mutex_exit(&p->p_lock);
352 		mutex_enter(&pr_pidlock);
353 		mutex_enter(&p->p_lock);
354 	}
355 
356 	ASSERT(p->p_tlist == NULL);
357 
358 	prfreenotify(p->p_plist);
359 	p->p_plist = NULL;
360 
361 	prfreenotify(p->p_trace);
362 	p->p_trace = NULL;
363 
364 	/*
365 	 * We broadcast to wake up everyone waiting for this process.
366 	 * No one can reach this process from this point on.
367 	 */
368 	cv_broadcast(&pr_pid_cv[slot]);
369 
370 	mutex_exit(&p->p_lock);
371 	mutex_exit(&pr_pidlock);
372 }
373 
374 /*
375  * Called from a hook in exit() when a traced process is becoming a zombie.
376  */
377 void
378 prexit(proc_t *p)
379 {
380 	ASSERT(MUTEX_HELD(&p->p_lock));
381 
382 	if (pr_watch_active(p)) {
383 		pr_free_watchpoints(p);
384 		watch_disable(curthread);
385 	}
386 	/* pr_free_watched_pages() is called in exit(), after dropping p_lock */
387 	if (p->p_trace) {
388 		VTOP(p->p_trace)->pr_common->prc_flags |= PRC_DESTROY;
389 		prnotify(p->p_trace);
390 	}
391 	cv_broadcast(&pr_pid_cv[p->p_slot]);	/* pauselwps() */
392 }
393 
394 /*
395  * Called when a thread calls lwp_exit().
396  */
397 void
398 prlwpexit(kthread_t *t)
399 {
400 	vnode_t *vp;
401 	prnode_t *pnp;
402 	prcommon_t *pcp;
403 	proc_t *p = ttoproc(t);
404 	lwpent_t *lep = p->p_lwpdir[t->t_dslot].ld_entry;
405 
406 	ASSERT(t == curthread);
407 	ASSERT(MUTEX_HELD(&p->p_lock));
408 
409 	/*
410 	 * The process must be blocked against /proc to do this safely.
411 	 * The lwp must not disappear while the process is marked P_PR_LOCK.
412 	 * It is the caller's responsibility to have called prbarrier(p).
413 	 */
414 	ASSERT(!(p->p_proc_flag & P_PR_LOCK));
415 
416 	for (vp = p->p_plist; vp != NULL; vp = pnp->pr_next) {
417 		pnp = VTOP(vp);
418 		pcp = pnp->pr_common;
419 		if (pcp->prc_thread == t) {
420 			pcp->prc_thread = NULL;
421 			pcp->prc_flags |= PRC_DESTROY;
422 		}
423 	}
424 
425 	for (vp = lep->le_trace; vp != NULL; vp = pnp->pr_next) {
426 		pnp = VTOP(vp);
427 		pcp = pnp->pr_common;
428 		pcp->prc_thread = NULL;
429 		pcp->prc_flags |= PRC_DESTROY;
430 		prnotify(vp);
431 	}
432 
433 	if (p->p_trace)
434 		prnotify(p->p_trace);
435 }
436 
437 /*
438  * Called when a zombie thread is joined or when a
439  * detached lwp exits.  Called from lwp_hash_out().
440  */
441 void
442 prlwpfree(proc_t *p, lwpent_t *lep)
443 {
444 	vnode_t *vp;
445 	prnode_t *pnp;
446 	prcommon_t *pcp;
447 
448 	ASSERT(MUTEX_HELD(&p->p_lock));
449 
450 	/*
451 	 * The process must be blocked against /proc to do this safely.
452 	 * The lwp must not disappear while the process is marked P_PR_LOCK.
453 	 * It is the caller's responsibility to have called prbarrier(p).
454 	 */
455 	ASSERT(!(p->p_proc_flag & P_PR_LOCK));
456 
457 	vp = lep->le_trace;
458 	lep->le_trace = NULL;
459 	while (vp) {
460 		prnotify(vp);
461 		pnp = VTOP(vp);
462 		pcp = pnp->pr_common;
463 		ASSERT(pcp->prc_thread == NULL &&
464 		    (pcp->prc_flags & PRC_DESTROY));
465 		pcp->prc_tslot = -1;
466 		vp = pnp->pr_next;
467 		pnp->pr_next = NULL;
468 	}
469 
470 	if (p->p_trace)
471 		prnotify(p->p_trace);
472 }
473 
474 /*
475  * Called from a hook in exec() when a thread starts exec().
476  */
477 void
478 prexecstart(void)
479 {
480 	proc_t *p = ttoproc(curthread);
481 	klwp_t *lwp = ttolwp(curthread);
482 
483 	/*
484 	 * The P_PR_EXEC flag blocks /proc operations for
485 	 * the duration of the exec().
486 	 * We can't start exec() while the process is
487 	 * locked by /proc, so we call prbarrier().
488 	 * lwp_nostop keeps the process from being stopped
489 	 * via job control for the duration of the exec().
490 	 */
491 
492 	ASSERT(MUTEX_HELD(&p->p_lock));
493 	prbarrier(p);
494 	lwp->lwp_nostop++;
495 	p->p_proc_flag |= P_PR_EXEC;
496 }
497 
498 /*
499  * Called from a hook in exec() when a thread finishes exec().
500  * The thread may or may not have succeeded.  Some other thread
501  * may have beat it to the punch.
502  */
503 void
504 prexecend(void)
505 {
506 	proc_t *p = ttoproc(curthread);
507 	klwp_t *lwp = ttolwp(curthread);
508 	vnode_t *vp;
509 	prnode_t *pnp;
510 	prcommon_t *pcp;
511 	model_t model = p->p_model;
512 	id_t tid = curthread->t_tid;
513 	int tslot = curthread->t_dslot;
514 
515 	ASSERT(MUTEX_HELD(&p->p_lock));
516 
517 	lwp->lwp_nostop--;
518 	if (p->p_flag & SEXITLWPS) {
519 		/*
520 		 * We are on our way to exiting because some
521 		 * other thread beat us in the race to exec().
522 		 * Don't clear the P_PR_EXEC flag in this case.
523 		 */
524 		return;
525 	}
526 
527 	/*
528 	 * Wake up anyone waiting in /proc for the process to complete exec().
529 	 */
530 	p->p_proc_flag &= ~P_PR_EXEC;
531 	if ((vp = p->p_trace) != NULL) {
532 		pcp = VTOP(vp)->pr_common;
533 		mutex_enter(&pcp->prc_mutex);
534 		cv_broadcast(&pcp->prc_wait);
535 		mutex_exit(&pcp->prc_mutex);
536 		for (; vp != NULL; vp = pnp->pr_next) {
537 			pnp = VTOP(vp);
538 			pnp->pr_common->prc_datamodel = model;
539 		}
540 	}
541 	if ((vp = p->p_lwpdir[tslot].ld_entry->le_trace) != NULL) {
542 		/*
543 		 * We dealt with the process common above.
544 		 */
545 		ASSERT(p->p_trace != NULL);
546 		pcp = VTOP(vp)->pr_common;
547 		mutex_enter(&pcp->prc_mutex);
548 		cv_broadcast(&pcp->prc_wait);
549 		mutex_exit(&pcp->prc_mutex);
550 		for (; vp != NULL; vp = pnp->pr_next) {
551 			pnp = VTOP(vp);
552 			pcp = pnp->pr_common;
553 			pcp->prc_datamodel = model;
554 			pcp->prc_tid = tid;
555 			pcp->prc_tslot = tslot;
556 		}
557 	}
558 }
559 
560 /*
561  * Called from a hook in relvm() just before freeing the address space.
562  * We free all the watched areas now.
563  */
564 void
565 prrelvm(void)
566 {
567 	proc_t *p = ttoproc(curthread);
568 
569 	mutex_enter(&p->p_lock);
570 	prbarrier(p);	/* block all other /proc operations */
571 	if (pr_watch_active(p)) {
572 		pr_free_watchpoints(p);
573 		watch_disable(curthread);
574 	}
575 	mutex_exit(&p->p_lock);
576 	pr_free_watched_pages(p);
577 }
578 
579 /*
580  * Called from hooks in exec-related code when a traced process
581  * attempts to exec(2) a setuid/setgid program or an unreadable
582  * file.  Rather than fail the exec we invalidate the associated
583  * /proc vnodes so that subsequent attempts to use them will fail.
584  *
585  * All /proc vnodes, except directory vnodes, are retained on a linked
586  * list (rooted at p_plist in the process structure) until last close.
587  *
588  * A controlling process must re-open the /proc files in order to
589  * regain control.
590  */
591 void
592 prinvalidate(struct user *up)
593 {
594 	kthread_t *t = curthread;
595 	proc_t *p = ttoproc(t);
596 	vnode_t *vp;
597 	prnode_t *pnp;
598 	int writers = 0;
599 
600 	mutex_enter(&p->p_lock);
601 	prbarrier(p);	/* block all other /proc operations */
602 
603 	/*
604 	 * At this moment, there can be only one lwp in the process.
605 	 */
606 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
607 
608 	/*
609 	 * Invalidate any currently active /proc vnodes.
610 	 */
611 	for (vp = p->p_plist; vp != NULL; vp = pnp->pr_next) {
612 		pnp = VTOP(vp);
613 		switch (pnp->pr_type) {
614 		case PR_PSINFO:		/* these files can read by anyone */
615 		case PR_LPSINFO:
616 		case PR_LWPSINFO:
617 		case PR_LWPDIR:
618 		case PR_LWPIDDIR:
619 		case PR_USAGE:
620 		case PR_LUSAGE:
621 		case PR_LWPUSAGE:
622 			break;
623 		default:
624 			pnp->pr_flags |= PR_INVAL;
625 			break;
626 		}
627 	}
628 	/*
629 	 * Wake up anyone waiting for the process or lwp.
630 	 * p->p_trace is guaranteed to be non-NULL if there
631 	 * are any open /proc files for this process.
632 	 */
633 	if ((vp = p->p_trace) != NULL) {
634 		prcommon_t *pcp = VTOP(vp)->pr_pcommon;
635 
636 		prnotify(vp);
637 		/*
638 		 * Are there any writers?
639 		 */
640 		if ((writers = pcp->prc_writers) != 0) {
641 			/*
642 			 * Clear the exclusive open flag (old /proc interface).
643 			 * Set prc_selfopens equal to prc_writers so that
644 			 * the next O_EXCL|O_WRITE open will succeed
645 			 * even with existing (though invalid) writers.
646 			 * prclose() must decrement prc_selfopens when
647 			 * the invalid files are closed.
648 			 */
649 			pcp->prc_flags &= ~PRC_EXCL;
650 			ASSERT(pcp->prc_selfopens <= writers);
651 			pcp->prc_selfopens = writers;
652 		}
653 	}
654 	vp = p->p_lwpdir[t->t_dslot].ld_entry->le_trace;
655 	while (vp != NULL) {
656 		/*
657 		 * We should not invalidate the lwpiddir vnodes,
658 		 * but the necessities of maintaining the old
659 		 * ioctl()-based version of /proc require it.
660 		 */
661 		pnp = VTOP(vp);
662 		pnp->pr_flags |= PR_INVAL;
663 		prnotify(vp);
664 		vp = pnp->pr_next;
665 	}
666 
667 	/*
668 	 * If any tracing flags are in effect and any vnodes are open for
669 	 * writing then set the requested-stop and run-on-last-close flags.
670 	 * Otherwise, clear all tracing flags.
671 	 */
672 	t->t_proc_flag &= ~TP_PAUSE;
673 	if ((p->p_proc_flag & P_PR_TRACE) && writers) {
674 		t->t_proc_flag |= TP_PRSTOP;
675 		aston(t);		/* so ISSIG will see the flag */
676 		p->p_proc_flag |= P_PR_RUNLCL;
677 	} else {
678 		premptyset(&up->u_entrymask);		/* syscalls */
679 		premptyset(&up->u_exitmask);
680 		up->u_systrap = 0;
681 		premptyset(&p->p_sigmask);		/* signals */
682 		premptyset(&p->p_fltmask);		/* faults */
683 		t->t_proc_flag &= ~(TP_PRSTOP|TP_PRVSTOP|TP_STOPPING);
684 		p->p_proc_flag &= ~(P_PR_RUNLCL|P_PR_KILLCL|P_PR_TRACE);
685 		prnostep(ttolwp(t));
686 	}
687 
688 	mutex_exit(&p->p_lock);
689 }
690 
691 /*
692  * Acquire the controlled process's p_lock and mark it P_PR_LOCK.
693  * Return with pr_pidlock held in all cases.
694  * Return with p_lock held if the the process still exists.
695  * Return value is the process pointer if the process still exists, else NULL.
696  * If we lock the process, give ourself kernel priority to avoid deadlocks;
697  * this is undone in prunlock().
698  */
699 proc_t *
700 pr_p_lock(prnode_t *pnp)
701 {
702 	proc_t *p;
703 	prcommon_t *pcp;
704 
705 	mutex_enter(&pr_pidlock);
706 	if ((pcp = pnp->pr_pcommon) == NULL || (p = pcp->prc_proc) == NULL)
707 		return (NULL);
708 	mutex_enter(&p->p_lock);
709 	while (p->p_proc_flag & P_PR_LOCK) {
710 		/*
711 		 * This cv/mutex pair is persistent even if
712 		 * the process disappears while we sleep.
713 		 */
714 		kcondvar_t *cv = &pr_pid_cv[p->p_slot];
715 		kmutex_t *mp = &p->p_lock;
716 
717 		mutex_exit(&pr_pidlock);
718 		cv_wait(cv, mp);
719 		mutex_exit(mp);
720 		mutex_enter(&pr_pidlock);
721 		if (pcp->prc_proc == NULL)
722 			return (NULL);
723 		ASSERT(p == pcp->prc_proc);
724 		mutex_enter(&p->p_lock);
725 	}
726 	p->p_proc_flag |= P_PR_LOCK;
727 	return (p);
728 }
729 
730 /*
731  * Lock the target process by setting P_PR_LOCK and grabbing p->p_lock.
732  * This prevents any lwp of the process from disappearing and
733  * blocks most operations that a process can perform on itself.
734  * Returns 0 on success, a non-zero error number on failure.
735  *
736  * 'zdisp' is ZYES or ZNO to indicate whether prlock() should succeed when
737  * the subject process is a zombie (ZYES) or fail for zombies (ZNO).
738  *
739  * error returns:
740  *	ENOENT: process or lwp has disappeared or process is exiting
741  *		(or has become a zombie and zdisp == ZNO).
742  *	EAGAIN: procfs vnode has become invalid.
743  *	EINTR:  signal arrived while waiting for exec to complete.
744  */
745 int
746 prlock(prnode_t *pnp, int zdisp)
747 {
748 	prcommon_t *pcp;
749 	proc_t *p;
750 
751 again:
752 	pcp = pnp->pr_common;
753 	p = pr_p_lock(pnp);
754 	mutex_exit(&pr_pidlock);
755 
756 	/*
757 	 * Return ENOENT immediately if there is no process.
758 	 */
759 	if (p == NULL)
760 		return (ENOENT);
761 
762 	ASSERT(p == pcp->prc_proc && p->p_stat != 0 && p->p_stat != SIDL);
763 
764 	/*
765 	 * Return ENOENT if process entered zombie state or is exiting
766 	 * and the 'zdisp' flag is set to ZNO indicating not to lock zombies.
767 	 */
768 	if (zdisp == ZNO &&
769 	    ((pcp->prc_flags & PRC_DESTROY) || (p->p_flag & SEXITING))) {
770 		prunlock(pnp);
771 		return (ENOENT);
772 	}
773 
774 	/*
775 	 * If lwp-specific, check to see if lwp has disappeared.
776 	 */
777 	if (pcp->prc_flags & PRC_LWP) {
778 		if ((zdisp == ZNO && (pcp->prc_flags & PRC_DESTROY)) ||
779 		    pcp->prc_tslot == -1) {
780 			prunlock(pnp);
781 			return (ENOENT);
782 		}
783 	}
784 
785 	/*
786 	 * Return EAGAIN if we have encountered a security violation.
787 	 * (The process exec'd a set-id or unreadable executable file.)
788 	 */
789 	if (pnp->pr_flags & PR_INVAL) {
790 		prunlock(pnp);
791 		return (EAGAIN);
792 	}
793 
794 	/*
795 	 * If process is undergoing an exec(), wait for
796 	 * completion and then start all over again.
797 	 */
798 	if (p->p_proc_flag & P_PR_EXEC) {
799 		pcp = pnp->pr_pcommon;	/* Put on the correct sleep queue */
800 		mutex_enter(&pcp->prc_mutex);
801 		prunlock(pnp);
802 		if (!cv_wait_sig(&pcp->prc_wait, &pcp->prc_mutex)) {
803 			mutex_exit(&pcp->prc_mutex);
804 			return (EINTR);
805 		}
806 		mutex_exit(&pcp->prc_mutex);
807 		goto again;
808 	}
809 
810 	/*
811 	 * We return holding p->p_lock.
812 	 */
813 	return (0);
814 }
815 
816 /*
817  * Undo prlock() and pr_p_lock().
818  * p->p_lock is still held; pr_pidlock is no longer held.
819  *
820  * prunmark() drops the P_PR_LOCK flag and wakes up another thread,
821  * if any, waiting for the flag to be dropped; it retains p->p_lock.
822  *
823  * prunlock() calls prunmark() and then drops p->p_lock.
824  */
825 void
826 prunmark(proc_t *p)
827 {
828 	ASSERT(p->p_proc_flag & P_PR_LOCK);
829 	ASSERT(MUTEX_HELD(&p->p_lock));
830 
831 	cv_signal(&pr_pid_cv[p->p_slot]);
832 	p->p_proc_flag &= ~P_PR_LOCK;
833 }
834 
835 void
836 prunlock(prnode_t *pnp)
837 {
838 	prcommon_t *pcp = pnp->pr_common;
839 	proc_t *p = pcp->prc_proc;
840 
841 	/*
842 	 * If we (or someone) gave it a SIGKILL, and it is not
843 	 * already a zombie, set it running unconditionally.
844 	 */
845 	if ((p->p_flag & SKILLED) &&
846 	    !(p->p_flag & SEXITING) &&
847 	    !(pcp->prc_flags & PRC_DESTROY) &&
848 	    !((pcp->prc_flags & PRC_LWP) && pcp->prc_tslot == -1))
849 		(void) pr_setrun(pnp, 0);
850 	prunmark(p);
851 	mutex_exit(&p->p_lock);
852 }
853 
854 /*
855  * Called while holding p->p_lock to delay until the process is unlocked.
856  * We enter holding p->p_lock; p->p_lock is dropped and reacquired.
857  * The process cannot become locked again until p->p_lock is dropped.
858  */
859 void
860 prbarrier(proc_t *p)
861 {
862 	ASSERT(MUTEX_HELD(&p->p_lock));
863 
864 	if (p->p_proc_flag & P_PR_LOCK) {
865 		/* The process is locked; delay until not locked */
866 		uint_t slot = p->p_slot;
867 
868 		while (p->p_proc_flag & P_PR_LOCK)
869 			cv_wait(&pr_pid_cv[slot], &p->p_lock);
870 		cv_signal(&pr_pid_cv[slot]);
871 	}
872 }
873 
874 /*
875  * Return process/lwp status.
876  * The u-block is mapped in by this routine and unmapped at the end.
877  */
878 void
879 prgetstatus(proc_t *p, pstatus_t *sp, zone_t *zp)
880 {
881 	kthread_t *t;
882 
883 	ASSERT(MUTEX_HELD(&p->p_lock));
884 
885 	t = prchoose(p);	/* returns locked thread */
886 	ASSERT(t != NULL);
887 	thread_unlock(t);
888 
889 	/* just bzero the process part, prgetlwpstatus() does the rest */
890 	bzero(sp, sizeof (pstatus_t) - sizeof (lwpstatus_t));
891 	sp->pr_nlwp = p->p_lwpcnt;
892 	sp->pr_nzomb = p->p_zombcnt;
893 	prassignset(&sp->pr_sigpend, &p->p_sig);
894 	sp->pr_brkbase = (uintptr_t)p->p_brkbase;
895 	sp->pr_brksize = p->p_brksize;
896 	sp->pr_stkbase = (uintptr_t)prgetstackbase(p);
897 	sp->pr_stksize = p->p_stksize;
898 	sp->pr_pid = p->p_pid;
899 	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
900 	    (p->p_flag & SZONETOP)) {
901 		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
902 		/*
903 		 * Inside local zones, fake zsched's pid as parent pids for
904 		 * processes which reference processes outside of the zone.
905 		 */
906 		sp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
907 	} else {
908 		sp->pr_ppid = p->p_ppid;
909 	}
910 	sp->pr_pgid  = p->p_pgrp;
911 	sp->pr_sid   = p->p_sessp->s_sid;
912 	sp->pr_taskid = p->p_task->tk_tkid;
913 	sp->pr_projid = p->p_task->tk_proj->kpj_id;
914 	sp->pr_zoneid = p->p_zone->zone_id;
915 	hrt2ts(mstate_aggr_state(p, LMS_USER), &sp->pr_utime);
916 	hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &sp->pr_stime);
917 	TICK_TO_TIMESTRUC(p->p_cutime, &sp->pr_cutime);
918 	TICK_TO_TIMESTRUC(p->p_cstime, &sp->pr_cstime);
919 	prassignset(&sp->pr_sigtrace, &p->p_sigmask);
920 	prassignset(&sp->pr_flttrace, &p->p_fltmask);
921 	prassignset(&sp->pr_sysentry, &PTOU(p)->u_entrymask);
922 	prassignset(&sp->pr_sysexit, &PTOU(p)->u_exitmask);
923 	switch (p->p_model) {
924 	case DATAMODEL_ILP32:
925 		sp->pr_dmodel = PR_MODEL_ILP32;
926 		break;
927 	case DATAMODEL_LP64:
928 		sp->pr_dmodel = PR_MODEL_LP64;
929 		break;
930 	}
931 	if (p->p_agenttp)
932 		sp->pr_agentid = p->p_agenttp->t_tid;
933 
934 	/* get the chosen lwp's status */
935 	prgetlwpstatus(t, &sp->pr_lwp, zp);
936 
937 	/* replicate the flags */
938 	sp->pr_flags = sp->pr_lwp.pr_flags;
939 }
940 
941 /*
942  * Query mask of held signals for a given thread.
943  *
944  * This makes use of schedctl_sigblock() to query if userspace has requested
945  * that all maskable signals be held.  While it would be tempting to call
946  * schedctl_finish_sigblock() and apply that update to t->t_hold, it cannot be
947  * done safely without the risk of racing with the thread under consideration.
948  */
949 void
950 prgethold(kthread_t *t, sigset_t *sp)
951 {
952 	k_sigset_t set;
953 
954 	if (schedctl_sigblock(t)) {
955 		set.__sigbits[0] = FILLSET0 & ~CANTMASK0;
956 		set.__sigbits[1] = FILLSET1 & ~CANTMASK1;
957 		set.__sigbits[2] = FILLSET2 & ~CANTMASK2;
958 	} else {
959 		set = t->t_hold;
960 	}
961 	sigktou(&set, sp);
962 }
963 
964 #ifdef _SYSCALL32_IMPL
965 void
966 prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp)
967 {
968 	proc_t *p = ttoproc(t);
969 	klwp_t *lwp = ttolwp(t);
970 	struct mstate *ms = &lwp->lwp_mstate;
971 	hrtime_t usr, sys;
972 	int flags;
973 	ulong_t instr;
974 
975 	ASSERT(MUTEX_HELD(&p->p_lock));
976 
977 	bzero(sp, sizeof (*sp));
978 	flags = 0L;
979 	if (t->t_state == TS_STOPPED) {
980 		flags |= PR_STOPPED;
981 		if ((t->t_schedflag & TS_PSTART) == 0)
982 			flags |= PR_ISTOP;
983 	} else if (VSTOPPED(t)) {
984 		flags |= PR_STOPPED|PR_ISTOP;
985 	}
986 	if (!(flags & PR_ISTOP) && (t->t_proc_flag & TP_PRSTOP))
987 		flags |= PR_DSTOP;
988 	if (lwp->lwp_asleep)
989 		flags |= PR_ASLEEP;
990 	if (t == p->p_agenttp)
991 		flags |= PR_AGENT;
992 	if (!(t->t_proc_flag & TP_TWAIT))
993 		flags |= PR_DETACH;
994 	if (t->t_proc_flag & TP_DAEMON)
995 		flags |= PR_DAEMON;
996 	if (p->p_proc_flag & P_PR_FORK)
997 		flags |= PR_FORK;
998 	if (p->p_proc_flag & P_PR_RUNLCL)
999 		flags |= PR_RLC;
1000 	if (p->p_proc_flag & P_PR_KILLCL)
1001 		flags |= PR_KLC;
1002 	if (p->p_proc_flag & P_PR_ASYNC)
1003 		flags |= PR_ASYNC;
1004 	if (p->p_proc_flag & P_PR_BPTADJ)
1005 		flags |= PR_BPTADJ;
1006 	if (p->p_proc_flag & P_PR_PTRACE)
1007 		flags |= PR_PTRACE;
1008 	if (p->p_flag & SMSACCT)
1009 		flags |= PR_MSACCT;
1010 	if (p->p_flag & SMSFORK)
1011 		flags |= PR_MSFORK;
1012 	if (p->p_flag & SVFWAIT)
1013 		flags |= PR_VFORKP;
1014 	sp->pr_flags = flags;
1015 	if (VSTOPPED(t)) {
1016 		sp->pr_why   = PR_REQUESTED;
1017 		sp->pr_what  = 0;
1018 	} else {
1019 		sp->pr_why   = t->t_whystop;
1020 		sp->pr_what  = t->t_whatstop;
1021 	}
1022 	sp->pr_lwpid = t->t_tid;
1023 	sp->pr_cursig  = lwp->lwp_cursig;
1024 	prassignset(&sp->pr_lwppend, &t->t_sig);
1025 	prgethold(t, &sp->pr_lwphold);
1026 	if (t->t_whystop == PR_FAULTED) {
1027 		siginfo_kto32(&lwp->lwp_siginfo, &sp->pr_info);
1028 		if (t->t_whatstop == FLTPAGE)
1029 			sp->pr_info.si_addr =
1030 			    (caddr32_t)(uintptr_t)lwp->lwp_siginfo.si_addr;
1031 	} else if (lwp->lwp_curinfo)
1032 		siginfo_kto32(&lwp->lwp_curinfo->sq_info, &sp->pr_info);
1033 	if (SI_FROMUSER(&lwp->lwp_siginfo) && zp->zone_id != GLOBAL_ZONEID &&
1034 	    sp->pr_info.si_zoneid != zp->zone_id) {
1035 		sp->pr_info.si_pid = zp->zone_zsched->p_pid;
1036 		sp->pr_info.si_uid = 0;
1037 		sp->pr_info.si_ctid = -1;
1038 		sp->pr_info.si_zoneid = zp->zone_id;
1039 	}
1040 	sp->pr_altstack.ss_sp =
1041 	    (caddr32_t)(uintptr_t)lwp->lwp_sigaltstack.ss_sp;
1042 	sp->pr_altstack.ss_size = (size32_t)lwp->lwp_sigaltstack.ss_size;
1043 	sp->pr_altstack.ss_flags = (int32_t)lwp->lwp_sigaltstack.ss_flags;
1044 	prgetaction32(p, PTOU(p), lwp->lwp_cursig, &sp->pr_action);
1045 	sp->pr_oldcontext = (caddr32_t)lwp->lwp_oldcontext;
1046 	sp->pr_ustack = (caddr32_t)lwp->lwp_ustack;
1047 	(void) strncpy(sp->pr_clname, sclass[t->t_cid].cl_name,
1048 	    sizeof (sp->pr_clname) - 1);
1049 	if (flags & PR_STOPPED)
1050 		hrt2ts32(t->t_stoptime, &sp->pr_tstamp);
1051 	usr = ms->ms_acct[LMS_USER];
1052 	sys = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
1053 	scalehrtime(&usr);
1054 	scalehrtime(&sys);
1055 	hrt2ts32(usr, &sp->pr_utime);
1056 	hrt2ts32(sys, &sp->pr_stime);
1057 
1058 	/*
1059 	 * Fetch the current instruction, if not a system process.
1060 	 * We don't attempt this unless the lwp is stopped.
1061 	 */
1062 	if ((p->p_flag & SSYS) || p->p_as == &kas)
1063 		sp->pr_flags |= (PR_ISSYS|PR_PCINVAL);
1064 	else if (!(flags & PR_STOPPED))
1065 		sp->pr_flags |= PR_PCINVAL;
1066 	else if (!prfetchinstr(lwp, &instr))
1067 		sp->pr_flags |= PR_PCINVAL;
1068 	else
1069 		sp->pr_instr = (uint32_t)instr;
1070 
1071 	/*
1072 	 * Drop p_lock while touching the lwp's stack.
1073 	 */
1074 	mutex_exit(&p->p_lock);
1075 	if (prisstep(lwp))
1076 		sp->pr_flags |= PR_STEP;
1077 	if ((flags & (PR_STOPPED|PR_ASLEEP)) && t->t_sysnum) {
1078 		int i;
1079 
1080 		sp->pr_syscall = get_syscall32_args(lwp,
1081 		    (int *)sp->pr_sysarg, &i);
1082 		sp->pr_nsysarg = (ushort_t)i;
1083 	}
1084 	if ((flags & PR_STOPPED) || t == curthread)
1085 		prgetprregs32(lwp, sp->pr_reg);
1086 	if ((t->t_state == TS_STOPPED && t->t_whystop == PR_SYSEXIT) ||
1087 	    (flags & PR_VFORKP)) {
1088 		long r1, r2;
1089 		user_t *up;
1090 		auxv_t *auxp;
1091 		int i;
1092 
1093 		sp->pr_errno = prgetrvals(lwp, &r1, &r2);
1094 		if (sp->pr_errno == 0) {
1095 			sp->pr_rval1 = (int32_t)r1;
1096 			sp->pr_rval2 = (int32_t)r2;
1097 			sp->pr_errpriv = PRIV_NONE;
1098 		} else
1099 			sp->pr_errpriv = lwp->lwp_badpriv;
1100 
1101 		if (t->t_sysnum == SYS_execve) {
1102 			up = PTOU(p);
1103 			sp->pr_sysarg[0] = 0;
1104 			sp->pr_sysarg[1] = (caddr32_t)up->u_argv;
1105 			sp->pr_sysarg[2] = (caddr32_t)up->u_envp;
1106 			sp->pr_sysarg[3] = 0;
1107 			for (i = 0, auxp = up->u_auxv;
1108 			    i < sizeof (up->u_auxv) / sizeof (up->u_auxv[0]);
1109 			    i++, auxp++) {
1110 				if (auxp->a_type == AT_SUN_EXECNAME) {
1111 					sp->pr_sysarg[0] =
1112 					    (caddr32_t)
1113 					    (uintptr_t)auxp->a_un.a_ptr;
1114 					break;
1115 				}
1116 			}
1117 		}
1118 	}
1119 	if (prhasfp())
1120 		prgetprfpregs32(lwp, &sp->pr_fpreg);
1121 	mutex_enter(&p->p_lock);
1122 }
1123 
1124 void
1125 prgetstatus32(proc_t *p, pstatus32_t *sp, zone_t *zp)
1126 {
1127 	kthread_t *t;
1128 
1129 	ASSERT(MUTEX_HELD(&p->p_lock));
1130 
1131 	t = prchoose(p);	/* returns locked thread */
1132 	ASSERT(t != NULL);
1133 	thread_unlock(t);
1134 
1135 	/* just bzero the process part, prgetlwpstatus32() does the rest */
1136 	bzero(sp, sizeof (pstatus32_t) - sizeof (lwpstatus32_t));
1137 	sp->pr_nlwp = p->p_lwpcnt;
1138 	sp->pr_nzomb = p->p_zombcnt;
1139 	prassignset(&sp->pr_sigpend, &p->p_sig);
1140 	sp->pr_brkbase = (uint32_t)(uintptr_t)p->p_brkbase;
1141 	sp->pr_brksize = (uint32_t)p->p_brksize;
1142 	sp->pr_stkbase = (uint32_t)(uintptr_t)prgetstackbase(p);
1143 	sp->pr_stksize = (uint32_t)p->p_stksize;
1144 	sp->pr_pid   = p->p_pid;
1145 	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
1146 	    (p->p_flag & SZONETOP)) {
1147 		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
1148 		/*
1149 		 * Inside local zones, fake zsched's pid as parent pids for
1150 		 * processes which reference processes outside of the zone.
1151 		 */
1152 		sp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
1153 	} else {
1154 		sp->pr_ppid = p->p_ppid;
1155 	}
1156 	sp->pr_pgid  = p->p_pgrp;
1157 	sp->pr_sid   = p->p_sessp->s_sid;
1158 	sp->pr_taskid = p->p_task->tk_tkid;
1159 	sp->pr_projid = p->p_task->tk_proj->kpj_id;
1160 	sp->pr_zoneid = p->p_zone->zone_id;
1161 	hrt2ts32(mstate_aggr_state(p, LMS_USER), &sp->pr_utime);
1162 	hrt2ts32(mstate_aggr_state(p, LMS_SYSTEM), &sp->pr_stime);
1163 	TICK_TO_TIMESTRUC32(p->p_cutime, &sp->pr_cutime);
1164 	TICK_TO_TIMESTRUC32(p->p_cstime, &sp->pr_cstime);
1165 	prassignset(&sp->pr_sigtrace, &p->p_sigmask);
1166 	prassignset(&sp->pr_flttrace, &p->p_fltmask);
1167 	prassignset(&sp->pr_sysentry, &PTOU(p)->u_entrymask);
1168 	prassignset(&sp->pr_sysexit, &PTOU(p)->u_exitmask);
1169 	switch (p->p_model) {
1170 	case DATAMODEL_ILP32:
1171 		sp->pr_dmodel = PR_MODEL_ILP32;
1172 		break;
1173 	case DATAMODEL_LP64:
1174 		sp->pr_dmodel = PR_MODEL_LP64;
1175 		break;
1176 	}
1177 	if (p->p_agenttp)
1178 		sp->pr_agentid = p->p_agenttp->t_tid;
1179 
1180 	/* get the chosen lwp's status */
1181 	prgetlwpstatus32(t, &sp->pr_lwp, zp);
1182 
1183 	/* replicate the flags */
1184 	sp->pr_flags = sp->pr_lwp.pr_flags;
1185 }
1186 #endif	/* _SYSCALL32_IMPL */
1187 
/*
 * Return lwp status.
 *
 * Fills in the native lwpstatus_t 'sp' for the lwp backing thread 't'.
 *
 * The caller must hold p->p_lock for t's process.  The lock is dropped
 * part way through (while the lwp's stack and registers are examined)
 * and is re-acquired before returning, so the caller gets it back held.
 *
 * 'zp' is the zone on whose behalf the status is reported: when it is
 * not the global zone and the signal information names a sender from a
 * different zone, the sender's identity in pr_info is replaced with
 * that of the zone's zsched process.
 */
void
prgetlwpstatus(kthread_t *t, lwpstatus_t *sp, zone_t *zp)
{
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct mstate *ms = &lwp->lwp_mstate;
	hrtime_t usr, sys;
	int flags;
	ulong_t instr;

	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * Start from an all-zero structure; fields that are only set
	 * conditionally below therefore default to zero.
	 */
	bzero(sp, sizeof (*sp));

	/*
	 * Derive the PR_* flags: first the stop state of this lwp, then
	 * other per-lwp attributes, then process-wide modes.
	 */
	flags = 0L;
	if (t->t_state == TS_STOPPED) {
		flags |= PR_STOPPED;
		if ((t->t_schedflag & TS_PSTART) == 0)
			flags |= PR_ISTOP;
	} else if (VSTOPPED(t)) {
		flags |= PR_STOPPED|PR_ISTOP;
	}
	if (!(flags & PR_ISTOP) && (t->t_proc_flag & TP_PRSTOP))
		flags |= PR_DSTOP;
	if (lwp->lwp_asleep)
		flags |= PR_ASLEEP;
	if (t == p->p_agenttp)
		flags |= PR_AGENT;
	if (!(t->t_proc_flag & TP_TWAIT))
		flags |= PR_DETACH;
	if (t->t_proc_flag & TP_DAEMON)
		flags |= PR_DAEMON;
	if (p->p_proc_flag & P_PR_FORK)
		flags |= PR_FORK;
	if (p->p_proc_flag & P_PR_RUNLCL)
		flags |= PR_RLC;
	if (p->p_proc_flag & P_PR_KILLCL)
		flags |= PR_KLC;
	if (p->p_proc_flag & P_PR_ASYNC)
		flags |= PR_ASYNC;
	if (p->p_proc_flag & P_PR_BPTADJ)
		flags |= PR_BPTADJ;
	if (p->p_proc_flag & P_PR_PTRACE)
		flags |= PR_PTRACE;
	if (p->p_flag & SMSACCT)
		flags |= PR_MSACCT;
	if (p->p_flag & SMSFORK)
		flags |= PR_MSFORK;
	if (p->p_flag & SVFWAIT)
		flags |= PR_VFORKP;
	if (p->p_pgidp->pid_pgorphaned)
		flags |= PR_ORPHAN;
	if (p->p_pidflag & CLDNOSIGCHLD)
		flags |= PR_NOSIGCHLD;
	if (p->p_pidflag & CLDWAITPID)
		flags |= PR_WAITPID;
	sp->pr_flags = flags;

	/* A VSTOPPED lwp is always reported as stopped on request. */
	if (VSTOPPED(t)) {
		sp->pr_why   = PR_REQUESTED;
		sp->pr_what  = 0;
	} else {
		sp->pr_why   = t->t_whystop;
		sp->pr_what  = t->t_whatstop;
	}
	sp->pr_lwpid = t->t_tid;
	sp->pr_cursig  = lwp->lwp_cursig;
	prassignset(&sp->pr_lwppend, &t->t_sig);
	prgethold(t, &sp->pr_lwphold);

	/*
	 * Signal information: for a fault stop use the lwp's siginfo,
	 * otherwise the info queued with the current signal, if any.
	 */
	if (t->t_whystop == PR_FAULTED)
		bcopy(&lwp->lwp_siginfo,
		    &sp->pr_info, sizeof (k_siginfo_t));
	else if (lwp->lwp_curinfo)
		bcopy(&lwp->lwp_curinfo->sq_info,
		    &sp->pr_info, sizeof (k_siginfo_t));

	/*
	 * For a non-global zone 'zp', hide the identity of a sender from
	 * another zone by substituting zsched's pid, uid 0 and no contract.
	 * NOTE(review): SI_FROMUSER() is evaluated on lwp->lwp_siginfo even
	 * when pr_info was copied from lwp_curinfo -- confirm intent.
	 */
	if (SI_FROMUSER(&lwp->lwp_siginfo) && zp->zone_id != GLOBAL_ZONEID &&
	    sp->pr_info.si_zoneid != zp->zone_id) {
		sp->pr_info.si_pid = zp->zone_zsched->p_pid;
		sp->pr_info.si_uid = 0;
		sp->pr_info.si_ctid = -1;
		sp->pr_info.si_zoneid = zp->zone_id;
	}
	sp->pr_altstack = lwp->lwp_sigaltstack;
	prgetaction(p, PTOU(p), lwp->lwp_cursig, &sp->pr_action);
	sp->pr_oldcontext = (uintptr_t)lwp->lwp_oldcontext;
	sp->pr_ustack = lwp->lwp_ustack;
	/*
	 * Copy at most size - 1 bytes of the class name; the final byte
	 * stays zero from the bzero() above, so pr_clname is terminated.
	 */
	(void) strncpy(sp->pr_clname, sclass[t->t_cid].cl_name,
	    sizeof (sp->pr_clname) - 1);
	if (flags & PR_STOPPED)
		hrt2ts(t->t_stoptime, &sp->pr_tstamp);

	/* User time, and system time (system plus trap), from microstates. */
	usr = ms->ms_acct[LMS_USER];
	sys = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
	scalehrtime(&usr);
	scalehrtime(&sys);
	hrt2ts(usr, &sp->pr_utime);
	hrt2ts(sys, &sp->pr_stime);

	/*
	 * Fetch the current instruction, if not a system process.
	 * We don't attempt this unless the lwp is stopped.
	 * PR_PCINVAL is set whenever pr_instr could not be filled in.
	 */
	if ((p->p_flag & SSYS) || p->p_as == &kas)
		sp->pr_flags |= (PR_ISSYS|PR_PCINVAL);
	else if (!(flags & PR_STOPPED))
		sp->pr_flags |= PR_PCINVAL;
	else if (!prfetchinstr(lwp, &instr))
		sp->pr_flags |= PR_PCINVAL;
	else
		sp->pr_instr = instr;

	/*
	 * Drop p_lock while touching the lwp's stack.
	 */
	mutex_exit(&p->p_lock);
	if (prisstep(lwp))
		sp->pr_flags |= PR_STEP;
	/* System call number and arguments, if the lwp is in a system call. */
	if ((flags & (PR_STOPPED|PR_ASLEEP)) && t->t_sysnum) {
		int i;

		sp->pr_syscall = get_syscall_args(lwp,
		    (long *)sp->pr_sysarg, &i);
		sp->pr_nsysarg = (ushort_t)i;
	}
	/* Registers are captured only for a stopped lwp or the caller itself. */
	if ((flags & PR_STOPPED) || t == curthread)
		prgetprregs(lwp, sp->pr_reg);
	/*
	 * Report system call results for an lwp stopped on system call
	 * exit, or whenever the process has SVFWAIT set (PR_VFORKP).
	 */
	if ((t->t_state == TS_STOPPED && t->t_whystop == PR_SYSEXIT) ||
	    (flags & PR_VFORKP)) {
		user_t *up;
		auxv_t *auxp;
		int i;

		sp->pr_errno = prgetrvals(lwp, &sp->pr_rval1, &sp->pr_rval2);
		if (sp->pr_errno == 0)
			sp->pr_errpriv = PRIV_NONE;
		else
			sp->pr_errpriv = lwp->lwp_badpriv;

		/*
		 * For execve, replace the recorded arguments with values
		 * from the u-block: the AT_SUN_EXECNAME aux vector entry
		 * (or 0 if absent), u_argv, u_envp, and 0.
		 */
		if (t->t_sysnum == SYS_execve) {
			up = PTOU(p);
			sp->pr_sysarg[0] = 0;
			sp->pr_sysarg[1] = (uintptr_t)up->u_argv;
			sp->pr_sysarg[2] = (uintptr_t)up->u_envp;
			sp->pr_sysarg[3] = 0;
			for (i = 0, auxp = up->u_auxv;
			    i < sizeof (up->u_auxv) / sizeof (up->u_auxv[0]);
			    i++, auxp++) {
				if (auxp->a_type == AT_SUN_EXECNAME) {
					sp->pr_sysarg[0] =
					    (uintptr_t)auxp->a_un.a_ptr;
					break;
				}
			}
		}
	}
	if (prhasfp())
		prgetprfpregs(lwp, &sp->pr_fpreg);
	/* Re-acquire p_lock so the caller's locking state is unchanged. */
	mutex_enter(&p->p_lock);
}
1347 
1348 /*
1349  * Get the sigaction structure for the specified signal.  The u-block
1350  * must already have been mapped in by the caller.
1351  */
1352 void
1353 prgetaction(proc_t *p, user_t *up, uint_t sig, struct sigaction *sp)
1354 {
1355 	int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;
1356 
1357 	bzero(sp, sizeof (*sp));
1358 
1359 	if (sig != 0 && (unsigned)sig < nsig) {
1360 		sp->sa_handler = up->u_signal[sig-1];
1361 		prassignset(&sp->sa_mask, &up->u_sigmask[sig-1]);
1362 		if (sigismember(&up->u_sigonstack, sig))
1363 			sp->sa_flags |= SA_ONSTACK;
1364 		if (sigismember(&up->u_sigresethand, sig))
1365 			sp->sa_flags |= SA_RESETHAND;
1366 		if (sigismember(&up->u_sigrestart, sig))
1367 			sp->sa_flags |= SA_RESTART;
1368 		if (sigismember(&p->p_siginfo, sig))
1369 			sp->sa_flags |= SA_SIGINFO;
1370 		if (sigismember(&up->u_signodefer, sig))
1371 			sp->sa_flags |= SA_NODEFER;
1372 		if (sig == SIGCLD) {
1373 			if (p->p_flag & SNOWAIT)
1374 				sp->sa_flags |= SA_NOCLDWAIT;
1375 			if ((p->p_flag & SJCTL) == 0)
1376 				sp->sa_flags |= SA_NOCLDSTOP;
1377 		}
1378 	}
1379 }
1380 
1381 #ifdef _SYSCALL32_IMPL
1382 void
1383 prgetaction32(proc_t *p, user_t *up, uint_t sig, struct sigaction32 *sp)
1384 {
1385 	int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;
1386 
1387 	bzero(sp, sizeof (*sp));
1388 
1389 	if (sig != 0 && (unsigned)sig < nsig) {
1390 		sp->sa_handler = (caddr32_t)(uintptr_t)up->u_signal[sig-1];
1391 		prassignset(&sp->sa_mask, &up->u_sigmask[sig-1]);
1392 		if (sigismember(&up->u_sigonstack, sig))
1393 			sp->sa_flags |= SA_ONSTACK;
1394 		if (sigismember(&up->u_sigresethand, sig))
1395 			sp->sa_flags |= SA_RESETHAND;
1396 		if (sigismember(&up->u_sigrestart, sig))
1397 			sp->sa_flags |= SA_RESTART;
1398 		if (sigismember(&p->p_siginfo, sig))
1399 			sp->sa_flags |= SA_SIGINFO;
1400 		if (sigismember(&up->u_signodefer, sig))
1401 			sp->sa_flags |= SA_NODEFER;
1402 		if (sig == SIGCLD) {
1403 			if (p->p_flag & SNOWAIT)
1404 				sp->sa_flags |= SA_NOCLDWAIT;
1405 			if ((p->p_flag & SJCTL) == 0)
1406 				sp->sa_flags |= SA_NOCLDSTOP;
1407 		}
1408 	}
1409 }
1410 #endif	/* _SYSCALL32_IMPL */
1411 
1412 /*
1413  * Count the number of segments in this process's address space.
1414  */
1415 uint_t
1416 prnsegs(struct as *as, int reserved)
1417 {
1418 	uint_t n = 0;
1419 	struct seg *seg;
1420 
1421 	ASSERT(as != &kas && AS_WRITE_HELD(as));
1422 
1423 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1424 		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
1425 		caddr_t saddr, naddr;
1426 		void *tmp = NULL;
1427 
1428 		if ((seg->s_flags & S_HOLE) != 0) {
1429 			continue;
1430 		}
1431 
1432 		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
1433 			(void) pr_getprot(seg, reserved, &tmp,
1434 			    &saddr, &naddr, eaddr);
1435 			if (saddr != naddr) {
1436 				n++;
1437 				/*
1438 				 * prnsegs() was formerly designated to return
1439 				 * an 'int' despite having no ability or use
1440 				 * for negative results.  As part of changing
1441 				 * it to 'uint_t', keep the old effective limit
1442 				 * of INT_MAX in place.
1443 				 */
1444 				if (n == INT_MAX) {
1445 					pr_getprot_done(&tmp);
1446 					ASSERT(tmp == NULL);
1447 					return (n);
1448 				}
1449 			}
1450 		}
1451 
1452 		ASSERT(tmp == NULL);
1453 	}
1454 
1455 	return (n);
1456 }
1457 
/*
 * Write the decimal representation of 'n', with no leading zeros, to 's'.
 *
 * If 'len' exceeds the number of digits written, the remainder of the
 * first 'len' bytes of 's' is filled with null characters; otherwise no
 * terminator is added and the digits are never truncated.
 *
 * Returns the number of digits written.
 */
int
pr_u32tos(uint32_t n, char *s, int len)
{
	char digits[11];	/* a uint32_t needs at most 10 digits */
	char *dp = &digits[sizeof (digits)];
	char *limit = s + len;
	int ndigits;
	int i;

	/* Generate the digits least-significant first, filling backwards. */
	do {
		*--dp = (char)(n % 10 + '0');
		n /= 10;
	} while (n != 0);

	ndigits = (int)(&digits[sizeof (digits)] - dp);

	for (i = 0; i < ndigits; i++)
		*s++ = dp[i];

	/* optional null padding out to 'len' bytes */
	while (s < limit)
		*s++ = '\0';

	return (ndigits);
}
1486 
/*
 * Write the decimal representation of 'n', with no leading zeros, to 's'.
 * No terminating null is written; the caller appends whatever follows.
 *
 * Returns the number of digits written.
 */
static int
pr_u64tos(uint64_t n, char *s)
{
	char digits[21];	/* a uint64_t needs at most 20 digits */
	char *dp = &digits[sizeof (digits)];
	int ndigits;
	int i;

	/* Generate the digits least-significant first, filling backwards. */
	do {
		*--dp = (char)(n % 10 + '0');
		n /= 10;
	} while (n != 0);

	ndigits = (int)(&digits[sizeof (digits)] - dp);

	for (i = 0; i < ndigits; i++)
		s[i] = dp[i];

	return (ndigits);
}
1511 
1512 /*
1513  * Similar to getf() / getf_gen(), but for the specified process.  On success,
1514  * returns the fp with fp->f_count incremented.  The caller MUST call
1515  * closef(fp) on the returned fp after completing any actions using that fp.
1516  * We return a reference-held (fp->f_count bumped) file_t so no other closef()
1517  * can invoke destructive VOP_CLOSE actions while we're inspecting the
1518  * process's FD.
1519  *
1520  * Returns NULL for errors: either an empty process-table slot post-fi_lock
1521  * and UF_ENTER, or too many mutex_tryenter() failures on the file_t's f_tlock.
1522  * Both failure modes have DTrace probes.
1523  *
1524  * The current design of the procfs "close" code path uses the following lock
1525  * order of:
1526  *
1527  *   1: (file_t) f_tlock
1528  *   2: (proc_t) p_lock AND setting p->p_proc_flag's P_PR_LOCK
1529  *
1530  * That happens because closef() holds f_tlock while calling fop_close(),
1531  * which can be prclose(), which currently waits on and sets P_PR_LOCK at its
1532  * beginning.
1533  *
1534  * That lock order creates a challenge for pr_getf, which needs to take those
1535  * locks in the opposite order when the fd points to a procfs file descriptor.
1536  * The solution chosen here is to use mutex_tryenter on f_tlock and retry some
1537  * (limited) number of times, failing if we don't get both locks.
1538  *
1539  * The cases where this can fail are rare, and all involve a procfs caller
1540  * asking for info (eg. FDINFO) on another procfs FD.  In these cases,
1541  * returning EBADF (which results from a NULL return from pr_getf()) is
1542  * acceptable.
1543  *
 * One can increase the number of tries in pr_getf_maxtries if one is worried
 * about the contentious case.
1546  */
1547 
uint64_t pr_getf_tryfails; /* Bumped for statistic purposes. */
int pr_getf_maxtries = 3;  /* So you can tune it from /etc/system */

/*
 * Look up file descriptor 'fd' in process 'p' and return its file_t with
 * f_count incremented, or NULL if the descriptor is out of range, the
 * slot is empty, or f_tlock could not be taken within pr_getf_maxtries
 * attempts.  If 'flag' is non-NULL it receives the slot's uf_flag on
 * success.
 *
 * Called with p->p_lock held and P_PR_LOCK set.  p_lock is dropped while
 * the fd table is examined and is held again on return.
 */
file_t *
pr_getf(proc_t *p, uint_t fd, short *flag)
{
	uf_entry_t *ufp;
	uf_info_t *fip;
	file_t *fp;
	int tries = 0;	/* number of failed mutex_tryenter() attempts */

	ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK));

retry:
	fip = P_FINFO(p);

	/* Out-of-range descriptor: fail without touching any other lock. */
	if (fd >= fip->fi_nfiles)
		return (NULL);

	/* Trade p_lock for fi_lock and the fd slot for the lookup. */
	mutex_exit(&p->p_lock);
	mutex_enter(&fip->fi_lock);
	UF_ENTER(ufp, fip, fd);
	if ((fp = ufp->uf_file) != NULL && fp->f_count > 0) {
		/*
		 * f_tlock is only tried, never waited for, because of the
		 * lock ordering described in the comment above.
		 */
		if (mutex_tryenter(&fp->f_tlock)) {
			ASSERT(fp->f_count > 0);
			fp->f_count++;
			mutex_exit(&fp->f_tlock);
			if (flag != NULL)
				*flag = ufp->uf_flag;
		} else {
			/*
			 * Note the number of mutex_trylock attempts.
			 *
			 * The exit path will catch this and try again if we
			 * are below the retry threshold (pr_getf_maxtries).
			 */
			tries++;
			pr_getf_tryfails++;
			/*
			 * If we hit pr_getf_maxtries, we'll return NULL.
			 * DTrace scripts looking for this sort of failure
			 * should check when arg1 is pr_getf_maxtries.
			 */
			DTRACE_PROBE2(pr_getf_tryfail, file_t *, fp, int,
			    tries);
			fp = NULL;
		}
	} else {
		fp = NULL;
		/* If we fail here, someone else closed this FD. */
		DTRACE_PROBE1(pr_getf_emptyslot, int, tries);
		tries = pr_getf_maxtries; /* Don't bother retrying. */
	}
	/* Release in reverse order and take p_lock back before deciding. */
	UF_EXIT(ufp);
	mutex_exit(&fip->fi_lock);
	mutex_enter(&p->p_lock);

	/* Use goto instead of tail-recursion so we can keep "tries" around. */
	if (fp == NULL) {
		/* "tries" counts failures so far; retry while under the limit. */
		if (tries < pr_getf_maxtries)
			goto retry;
	} else {
		/*
		 * Probes here will detect successes after arg1's number of
		 * mutex_tryenter() calls.
		 */
		DTRACE_PROBE2(pr_getf_trysuccess, file_t *, fp, int, tries + 1);
	}

	return (fp);
}
1620 
1621 
1622 /*
1623  * Just as pr_getf() is a little unusual in how it goes about making the file_t
1624  * safe for procfs consumers to access it, so too is pr_releasef() for safely
1625  * releasing that "hold".  The "hold" is unlike normal file descriptor activity
1626  * -- procfs is just an interloper here, wanting access to the vnode_t without
1627  * risk of a racing close() disrupting the state.  Just as pr_getf() avoids some
1628  * of the typical file_t behavior (such as auditing) when establishing its hold,
1629  * so too should pr_releasef().  It should not go through the motions of
1630  * closef() (since it is not a true close()) unless racing activity causes it to
1631  * be the last actor holding the refcount above zero.
1632  *
1633  * Under normal circumstances, we expect to find file_t`f_count > 1 after
1634  * the successful pr_getf() call.  We are, after all, accessing a resource
1635  * already held by the process in question.  We would also expect to rarely race
 * with a close() of the underlying fd, meaning that file_t`f_count > 1 would
 * still hold at pr_releasef() time.  That would mean we only need to decrement
1638  * f_count, leaving it to the process to later close the fd (thus triggering
1639  * VOP_CLOSE(), etc).
1640  *
1641  * It is only when that process manages to close() the fd while we have it
1642  * "held" in procfs that we must make a trip through the traditional closef()
1643  * logic to ensure proper tear-down of the file_t.
1644  */
1645 void
1646 pr_releasef(file_t *fp)
1647 {
1648 	mutex_enter(&fp->f_tlock);
1649 	if (fp->f_count > 1) {
1650 		/*
1651 		 * This is the most common case: The file is still held open by
1652 		 * the process, and we simply need to release our hold by
1653 		 * decrementing f_count
1654 		 */
1655 		fp->f_count--;
1656 		mutex_exit(&fp->f_tlock);
1657 	} else {
1658 		/*
1659 		 * A rare occasion: The process snuck a close() of this file
1660 		 * while we were doing our business in procfs.  Given that
1661 		 * f_count == 1, we are the only one with a reference to the
1662 		 * file_t and need to take a trip through closef() to free it.
1663 		 */
1664 		mutex_exit(&fp->f_tlock);
1665 		(void) closef(fp);
1666 	}
1667 }
1668 
1669 void
1670 pr_object_name(char *name, vnode_t *vp, struct vattr *vattr)
1671 {
1672 	char *s = name;
1673 	struct vfs *vfsp;
1674 	struct vfssw *vfsswp;
1675 
1676 	if ((vfsp = vp->v_vfsp) != NULL &&
1677 	    ((vfsswp = vfssw + vfsp->vfs_fstype), vfsswp->vsw_name) &&
1678 	    *vfsswp->vsw_name) {
1679 		(void) strcpy(s, vfsswp->vsw_name);
1680 		s += strlen(s);
1681 		*s++ = '.';
1682 	}
1683 	s += pr_u32tos(getmajor(vattr->va_fsid), s, 0);
1684 	*s++ = '.';
1685 	s += pr_u32tos(getminor(vattr->va_fsid), s, 0);
1686 	*s++ = '.';
1687 	s += pr_u64tos(vattr->va_nodeid, s);
1688 	*s++ = '\0';
1689 }
1690 
1691 struct seg *
1692 break_seg(proc_t *p)
1693 {
1694 	caddr_t addr = p->p_brkbase;
1695 	struct seg *seg;
1696 	struct vnode *vp;
1697 
1698 	if (p->p_brksize != 0)
1699 		addr += p->p_brksize - 1;
1700 	seg = as_segat(p->p_as, addr);
1701 	if (seg != NULL && seg->s_ops == &segvn_ops &&
1702 	    (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL))
1703 		return (seg);
1704 	return (NULL);
1705 }
1706 
1707 /*
1708  * Implementation of service functions to handle procfs generic chained
1709  * copyout buffers.
1710  */
/*
 * Header at the front of each chained copyout buffer.  The data area
 * follows the header directly within the same allocation; see
 * PIOL_DATABUF().
 */
typedef struct pr_iobuf_list {
	list_node_t	piol_link;	/* buffer linkage */
	size_t		piol_size;	/* total size (header + data) */
	size_t		piol_usedsize;	/* amount to copy out from this buf */
} piol_t;

/* Largest allocation (header included) used for any single buffer. */
#define	MAPSIZE	(64 * 1024)
/* Address of the data area that immediately follows header 'iol'. */
#define	PIOL_DATABUF(iol)	((void *)(&(iol)[1]))
1719 
1720 void
1721 pr_iol_initlist(list_t *iolhead, size_t itemsize, int n)
1722 {
1723 	piol_t	*iol;
1724 	size_t	initial_size = MIN(1, n) * itemsize;
1725 
1726 	list_create(iolhead, sizeof (piol_t), offsetof(piol_t, piol_link));
1727 
1728 	ASSERT(list_head(iolhead) == NULL);
1729 	ASSERT(itemsize < MAPSIZE - sizeof (*iol));
1730 	ASSERT(initial_size > 0);
1731 
1732 	/*
1733 	 * Someone creating chained copyout buffers may ask for less than
1734 	 * MAPSIZE if the amount of data to be buffered is known to be
1735 	 * smaller than that.
1736 	 * But in order to prevent involuntary self-denial of service,
1737 	 * the requested input size is clamped at MAPSIZE.
1738 	 */
1739 	initial_size = MIN(MAPSIZE, initial_size + sizeof (*iol));
1740 	iol = kmem_alloc(initial_size, KM_SLEEP);
1741 	list_insert_head(iolhead, iol);
1742 	iol->piol_usedsize = 0;
1743 	iol->piol_size = initial_size;
1744 }
1745 
1746 void *
1747 pr_iol_newbuf(list_t *iolhead, size_t itemsize)
1748 {
1749 	piol_t	*iol;
1750 	char	*new;
1751 
1752 	ASSERT(itemsize < MAPSIZE - sizeof (*iol));
1753 	ASSERT(list_head(iolhead) != NULL);
1754 
1755 	iol = (piol_t *)list_tail(iolhead);
1756 
1757 	if (iol->piol_size <
1758 	    iol->piol_usedsize + sizeof (*iol) + itemsize) {
1759 		/*
1760 		 * Out of space in the current buffer. Allocate more.
1761 		 */
1762 		piol_t *newiol;
1763 
1764 		newiol = kmem_alloc(MAPSIZE, KM_SLEEP);
1765 		newiol->piol_size = MAPSIZE;
1766 		newiol->piol_usedsize = 0;
1767 
1768 		list_insert_after(iolhead, iol, newiol);
1769 		iol = list_next(iolhead, iol);
1770 		ASSERT(iol == newiol);
1771 	}
1772 	new = (char *)PIOL_DATABUF(iol) + iol->piol_usedsize;
1773 	iol->piol_usedsize += itemsize;
1774 	bzero(new, itemsize);
1775 	return (new);
1776 }
1777 
1778 void
1779 pr_iol_freelist(list_t *iolhead)
1780 {
1781 	piol_t	*iol;
1782 
1783 	while ((iol = list_head(iolhead)) != NULL) {
1784 		list_remove(iolhead, iol);
1785 		kmem_free(iol, iol->piol_size);
1786 	}
1787 	list_destroy(iolhead);
1788 }
1789 
1790 int
1791 pr_iol_copyout_and_free(list_t *iolhead, caddr_t *tgt, int errin)
1792 {
1793 	int error = errin;
1794 	piol_t	*iol;
1795 
1796 	while ((iol = list_head(iolhead)) != NULL) {
1797 		list_remove(iolhead, iol);
1798 		if (!error) {
1799 			if (copyout(PIOL_DATABUF(iol), *tgt,
1800 			    iol->piol_usedsize))
1801 				error = EFAULT;
1802 			*tgt += iol->piol_usedsize;
1803 		}
1804 		kmem_free(iol, iol->piol_size);
1805 	}
1806 	list_destroy(iolhead);
1807 
1808 	return (error);
1809 }
1810 
1811 int
1812 pr_iol_uiomove_and_free(list_t *iolhead, uio_t *uiop, int errin)
1813 {
1814 	offset_t	off = uiop->uio_offset;
1815 	char		*base;
1816 	size_t		size;
1817 	piol_t		*iol;
1818 	int		error = errin;
1819 
1820 	while ((iol = list_head(iolhead)) != NULL) {
1821 		list_remove(iolhead, iol);
1822 		base = PIOL_DATABUF(iol);
1823 		size = iol->piol_usedsize;
1824 		if (off <= size && error == 0 && uiop->uio_resid > 0)
1825 			error = uiomove(base + off, size - off,
1826 			    UIO_READ, uiop);
1827 		off = MAX(0, off - (offset_t)size);
1828 		kmem_free(iol, iol->piol_size);
1829 	}
1830 	list_destroy(iolhead);
1831 
1832 	return (error);
1833 }
1834 
/*
 * Return an array of structures with memory map information.
 * We allocate here; the caller must deallocate.
 *
 * One prmap_t is appended to the chained buffer list 'iolhead' for every
 * non-empty range that pr_getprot() reports within each non-hole segment
 * of p's address space.  The list is initialized here even when the
 * address space has no segments.  'reserved' is passed to
 * pr_getsegsize() and pr_getprot() and, when set, also causes the stack
 * mapping to be reported as starting at the maximum stack size.
 *
 * The address space must not be the kernel's and must be held as writer.
 * Always returns 0.
 */
int
prgetmap(proc_t *p, int reserved, list_t *iolhead)
{
	struct as *as = p->p_as;
	prmap_t *mp;
	struct seg *seg;
	struct seg *brkseg, *stkseg;
	struct vnode *vp;
	struct vattr vattr;
	uint_t prot;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	/*
	 * Request an initial buffer size that doesn't waste memory
	 * if the address space has only a small number of segments.
	 */
	pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));

	if ((seg = AS_SEGFIRST(as)) == NULL)
		return (0);

	/* Identify the break and stack segments once, for flagging below. */
	brkseg = break_seg(p);
	stkseg = as_segat(as, prgetstackbase(p));

	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
		caddr_t saddr, naddr;
		void *tmp = NULL;	/* state carried between pr_getprot() calls */

		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			prot = pr_getprot(seg, reserved, &tmp,
			    &saddr, &naddr, eaddr);
			/* An empty range produces no map entry. */
			if (saddr == naddr)
				continue;

			/* pr_iol_newbuf() returns zeroed space for the entry. */
			mp = pr_iol_newbuf(iolhead, sizeof (*mp));

			mp->pr_vaddr = (uintptr_t)saddr;
			mp->pr_size = naddr - saddr;
			mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
			/* Translate protections and mapping type into MA_* flags. */
			mp->pr_mflags = 0;
			if (prot & PROT_READ)
				mp->pr_mflags |= MA_READ;
			if (prot & PROT_WRITE)
				mp->pr_mflags |= MA_WRITE;
			if (prot & PROT_EXEC)
				mp->pr_mflags |= MA_EXEC;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
				mp->pr_mflags |= MA_SHARED;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
				mp->pr_mflags |= MA_NORESERVE;
			/*
			 * Anonymous: a segspt_shmops segment, or a segvn
			 * segment for which no vnode can be obtained.
			 */
			if (seg->s_ops == &segspt_shmops ||
			    (seg->s_ops == &segvn_ops &&
			    (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
				mp->pr_mflags |= MA_ANON;
			if (seg == brkseg)
				mp->pr_mflags |= MA_BREAK;
			else if (seg == stkseg) {
				mp->pr_mflags |= MA_STACK;
				if (reserved) {
					/*
					 * Report the stack as beginning at
					 * its page-rounded size limit
					 * (p_stk_ctl) rather than at its
					 * current extent.
					 */
					size_t maxstack =
					    ((size_t)p->p_stk_ctl +
					    PAGEOFFSET) & PAGEMASK;
					mp->pr_vaddr =
					    (uintptr_t)prgetstackbase(p) +
					    p->p_stksize - maxstack;
					mp->pr_size = (uintptr_t)naddr -
					    mp->pr_vaddr;
				}
			}
			if (seg->s_ops == &segspt_shmops)
				mp->pr_mflags |= MA_ISM | MA_SHM;
			mp->pr_pagesize = PAGESIZE;

			/*
			 * Manufacture a filename for the "object" directory.
			 * Only regular files mapped by segvn get a name:
			 * "a.out" for the process's executable, otherwise
			 * the name built by pr_object_name().
			 */
			vattr.va_mask = AT_FSID|AT_NODEID;
			if (seg->s_ops == &segvn_ops &&
			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
			    vp != NULL && vp->v_type == VREG &&
			    VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
				if (vp == p->p_exec)
					(void) strcpy(mp->pr_mapname, "a.out");
				else
					pr_object_name(mp->pr_mapname,
					    vp, &vattr);
			}

			/*
			 * Get the SysV shared memory id, if any.
			 * SHMID_FREE is reported as -1 but the mapping is
			 * still flagged MA_SHM; any other outcome without
			 * an id also yields -1.
			 */
			if ((mp->pr_mflags & MA_SHARED) && p->p_segacct &&
			    (mp->pr_shmid = shmgetid(p, seg->s_base)) !=
			    SHMID_NONE) {
				if (mp->pr_shmid == SHMID_FREE)
					mp->pr_shmid = -1;

				mp->pr_mflags |= MA_SHM;
			} else {
				mp->pr_shmid = -1;
			}
		}
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	return (0);
}
1952 
1953 #ifdef _SYSCALL32_IMPL
/*
 * ILP32 counterpart of prgetmap(): append one prmap32_t to the chained
 * buffer list 'iolhead' for every non-empty range that pr_getprot()
 * reports within each non-hole segment of p's address space.  Addresses
 * and sizes are narrowed to their 32-bit types.  The list is initialized
 * here even when the address space has no segments; the caller frees it.
 *
 * The address space must not be the kernel's and must be held as writer.
 * Always returns 0.
 */
int
prgetmap32(proc_t *p, int reserved, list_t *iolhead)
{
	struct as *as = p->p_as;
	prmap32_t *mp;
	struct seg *seg;
	struct seg *brkseg, *stkseg;
	struct vnode *vp;
	struct vattr vattr;
	uint_t prot;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	/*
	 * Request an initial buffer size that doesn't waste memory
	 * if the address space has only a small number of segments.
	 */
	pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));

	if ((seg = AS_SEGFIRST(as)) == NULL)
		return (0);

	/* Identify the break and stack segments once, for flagging below. */
	brkseg = break_seg(p);
	stkseg = as_segat(as, prgetstackbase(p));

	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
		caddr_t saddr, naddr;
		void *tmp = NULL;	/* state carried between pr_getprot() calls */

		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			prot = pr_getprot(seg, reserved, &tmp,
			    &saddr, &naddr, eaddr);
			/* An empty range produces no map entry. */
			if (saddr == naddr)
				continue;

			/* pr_iol_newbuf() returns zeroed space for the entry. */
			mp = pr_iol_newbuf(iolhead, sizeof (*mp));

			mp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
			mp->pr_size = (size32_t)(naddr - saddr);
			mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
			/* Translate protections and mapping type into MA_* flags. */
			mp->pr_mflags = 0;
			if (prot & PROT_READ)
				mp->pr_mflags |= MA_READ;
			if (prot & PROT_WRITE)
				mp->pr_mflags |= MA_WRITE;
			if (prot & PROT_EXEC)
				mp->pr_mflags |= MA_EXEC;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
				mp->pr_mflags |= MA_SHARED;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
				mp->pr_mflags |= MA_NORESERVE;
			/*
			 * Anonymous: a segspt_shmops segment, or a segvn
			 * segment for which no vnode can be obtained.
			 */
			if (seg->s_ops == &segspt_shmops ||
			    (seg->s_ops == &segvn_ops &&
			    (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
				mp->pr_mflags |= MA_ANON;
			if (seg == brkseg)
				mp->pr_mflags |= MA_BREAK;
			else if (seg == stkseg) {
				mp->pr_mflags |= MA_STACK;
				if (reserved) {
					/*
					 * Report the stack as beginning at
					 * its page-rounded size limit
					 * (p_stk_ctl) rather than at its
					 * current extent.  The arithmetic is
					 * done at full width and narrowed
					 * only when stored.
					 */
					size_t maxstack =
					    ((size_t)p->p_stk_ctl +
					    PAGEOFFSET) & PAGEMASK;
					uintptr_t vaddr =
					    (uintptr_t)prgetstackbase(p) +
					    p->p_stksize - maxstack;
					mp->pr_vaddr = (caddr32_t)vaddr;
					mp->pr_size = (size32_t)
					    ((uintptr_t)naddr - vaddr);
				}
			}
			if (seg->s_ops == &segspt_shmops)
				mp->pr_mflags |= MA_ISM | MA_SHM;
			mp->pr_pagesize = PAGESIZE;

			/*
			 * Manufacture a filename for the "object" directory.
			 * Only regular files mapped by segvn get a name:
			 * "a.out" for the process's executable, otherwise
			 * the name built by pr_object_name().
			 */
			vattr.va_mask = AT_FSID|AT_NODEID;
			if (seg->s_ops == &segvn_ops &&
			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
			    vp != NULL && vp->v_type == VREG &&
			    VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
				if (vp == p->p_exec)
					(void) strcpy(mp->pr_mapname, "a.out");
				else
					pr_object_name(mp->pr_mapname,
					    vp, &vattr);
			}

			/*
			 * Get the SysV shared memory id, if any.
			 * SHMID_FREE is reported as -1 but the mapping is
			 * still flagged MA_SHM; any other outcome without
			 * an id also yields -1.
			 */
			if ((mp->pr_mflags & MA_SHARED) && p->p_segacct &&
			    (mp->pr_shmid = shmgetid(p, seg->s_base)) !=
			    SHMID_NONE) {
				if (mp->pr_shmid == SHMID_FREE)
					mp->pr_shmid = -1;

				mp->pr_mflags |= MA_SHM;
			} else {
				mp->pr_shmid = -1;
			}
		}
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	return (0);
}
2068 #endif	/* _SYSCALL32_IMPL */
2069 
2070 /*
2071  * Return the size of the /proc page data file.
2072  */
2073 size_t
2074 prpdsize(struct as *as)
2075 {
2076 	struct seg *seg;
2077 	size_t size;
2078 
2079 	ASSERT(as != &kas && AS_WRITE_HELD(as));
2080 
2081 	if ((seg = AS_SEGFIRST(as)) == NULL)
2082 		return (0);
2083 
2084 	size = sizeof (prpageheader_t);
2085 	do {
2086 		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2087 		caddr_t saddr, naddr;
2088 		void *tmp = NULL;
2089 		size_t npage;
2090 
2091 		if ((seg->s_flags & S_HOLE) != 0) {
2092 			continue;
2093 		}
2094 
2095 		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2096 			(void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2097 			if ((npage = (naddr - saddr) / PAGESIZE) != 0)
2098 				size += sizeof (prasmap_t) + round8(npage);
2099 		}
2100 		ASSERT(tmp == NULL);
2101 	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2102 
2103 	return (size);
2104 }
2105 
2106 #ifdef _SYSCALL32_IMPL
2107 size_t
2108 prpdsize32(struct as *as)
2109 {
2110 	struct seg *seg;
2111 	size_t size;
2112 
2113 	ASSERT(as != &kas && AS_WRITE_HELD(as));
2114 
2115 	if ((seg = AS_SEGFIRST(as)) == NULL)
2116 		return (0);
2117 
2118 	size = sizeof (prpageheader32_t);
2119 	do {
2120 		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2121 		caddr_t saddr, naddr;
2122 		void *tmp = NULL;
2123 		size_t npage;
2124 
2125 		if ((seg->s_flags & S_HOLE) != 0) {
2126 			continue;
2127 		}
2128 
2129 		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2130 			(void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2131 			if ((npage = (naddr - saddr) / PAGESIZE) != 0)
2132 				size += sizeof (prasmap32_t) + round8(npage);
2133 		}
2134 		ASSERT(tmp == NULL);
2135 	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2136 
2137 	return (size);
2138 }
2139 #endif	/* _SYSCALL32_IMPL */
2140 
/*
 * Read page data information.
 *
 * Builds a snapshot of the address space of process p in a temporary
 * kernel buffer: a prpageheader_t followed, for each mapping, by a
 * prasmap_t and the data hat_getstat() stores right after it.  The
 * populated part of the buffer is then copied out through uiop.
 *
 * Returns 0 on success (including an address space with no segments, in
 * which case nothing is copied), E2BIG if uiop has less room than the size
 * computed by prpdsize(), EINTR if a signal is pending when the snapshot
 * has to be retried, or the error returned by uiomove().
 */
int
prpdread(proc_t *p, uint_t hatid, struct uio *uiop)
{
	struct as *as = p->p_as;
	caddr_t buf;
	size_t size;
	prpageheader_t *php;
	prasmap_t *pmp;
	struct seg *seg;
	int error;

again:
	AS_LOCK_ENTER(as, RW_WRITER);

	if ((seg = AS_SEGFIRST(as)) == NULL) {
		AS_LOCK_EXIT(as);
		return (0);
	}
	/* Size the buffer from a first pass over the segments. */
	size = prpdsize(as);
	if (uiop->uio_resid < size) {
		AS_LOCK_EXIT(as);
		return (E2BIG);
	}

	buf = kmem_zalloc(size, KM_SLEEP);
	php = (prpageheader_t *)buf;
	/* The first map record starts immediately after the header. */
	pmp = (prasmap_t *)(buf + sizeof (prpageheader_t));

	hrt2ts(gethrtime(), &php->pr_tstamp);
	php->pr_nmap = 0;
	php->pr_npage = 0;
	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
		caddr_t saddr, naddr;
		void *tmp = NULL;

		/* Segments flagged S_HOLE produce no map records. */
		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			struct vnode *vp;
			struct vattr vattr;
			size_t len;
			size_t npage;
			uint_t prot;
			uintptr_t next;

			prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
			if ((len = (size_t)(naddr - saddr)) == 0)
				continue;
			npage = len / PAGESIZE;
			/* Where the next record would start in the buffer. */
			next = (uintptr_t)(pmp + 1) + round8(npage);
			/*
			 * It's possible that the address space can change
			 * subtly even though we're holding as->a_lock
			 * due to the nondeterminism of page_exists() in
			 * the presence of asynchronously flushed pages or
			 * mapped files whose sizes are changing.
			 * page_exists() may be called indirectly from
			 * pr_getprot() by a SEGOP_INCORE() routine.
			 * If this happens we need to make sure we don't
			 * overrun the buffer whose size we computed based
			 * on the initial iteration through the segments.
			 * Once we've detected an overflow, we need to clean
			 * up the temporary memory allocated in pr_getprot()
			 * and retry. If there's a pending signal, we return
			 * EINTR so that this thread can be dislodged if
			 * a latent bug causes us to spin indefinitely.
			 */
			if (next > (uintptr_t)buf + size) {
				pr_getprot_done(&tmp);
				AS_LOCK_EXIT(as);

				kmem_free(buf, size);

				if (ISSIG(curthread, JUSTLOOKING))
					return (EINTR);

				goto again;
			}

			php->pr_nmap++;
			php->pr_npage += npage;
			pmp->pr_vaddr = (uintptr_t)saddr;
			pmp->pr_npage = npage;
			pmp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
			/* Translate protections and mapping type to MA_* */
			pmp->pr_mflags = 0;
			if (prot & PROT_READ)
				pmp->pr_mflags |= MA_READ;
			if (prot & PROT_WRITE)
				pmp->pr_mflags |= MA_WRITE;
			if (prot & PROT_EXEC)
				pmp->pr_mflags |= MA_EXEC;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
				pmp->pr_mflags |= MA_SHARED;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
				pmp->pr_mflags |= MA_NORESERVE;
			/*
			 * Shared-memory (segspt) segments and segvn segments
			 * with no vnode are reported as anonymous.
			 */
			if (seg->s_ops == &segspt_shmops ||
			    (seg->s_ops == &segvn_ops &&
			    (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
				pmp->pr_mflags |= MA_ANON;
			if (seg->s_ops == &segspt_shmops)
				pmp->pr_mflags |= MA_ISM | MA_SHM;
			pmp->pr_pagesize = PAGESIZE;
			/*
			 * Manufacture a filename for the "object" directory.
			 */
			vattr.va_mask = AT_FSID|AT_NODEID;
			if (seg->s_ops == &segvn_ops &&
			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
			    vp != NULL && vp->v_type == VREG &&
			    VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
				if (vp == p->p_exec)
					(void) strcpy(pmp->pr_mapname, "a.out");
				else
					pr_object_name(pmp->pr_mapname,
					    vp, &vattr);
			}

			/*
			 * Get the SysV shared memory id, if any.
			 */
			if ((pmp->pr_mflags & MA_SHARED) && p->p_segacct &&
			    (pmp->pr_shmid = shmgetid(p, seg->s_base)) !=
			    SHMID_NONE) {
				if (pmp->pr_shmid == SHMID_FREE)
					pmp->pr_shmid = -1;

				pmp->pr_mflags |= MA_SHM;
			} else {
				pmp->pr_shmid = -1;
			}

			/* Per-page data goes right after the map record. */
			hat_getstat(as, saddr, len, hatid,
			    (char *)(pmp + 1), HAT_SYNC_ZERORM);
			pmp = (prasmap_t *)next;
		}
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	AS_LOCK_EXIT(as);

	/* Copy out only the part of the buffer that was filled in. */
	ASSERT((uintptr_t)pmp <= (uintptr_t)buf + size);
	error = uiomove(buf, (caddr_t)pmp - buf, UIO_READ, uiop);
	kmem_free(buf, size);

	return (error);
}
2293 
2294 #ifdef _SYSCALL32_IMPL
/*
 * 32-bit flavour of prpdread().
 *
 * Builds a snapshot of the address space of process p in a temporary
 * kernel buffer: a prpageheader32_t followed, for each mapping, by a
 * prasmap32_t and the data hat_getstat() stores right after it.  The
 * populated part of the buffer is then copied out through uiop.
 *
 * Returns 0 on success (including an address space with no segments, in
 * which case nothing is copied), E2BIG if uiop has less room than the size
 * computed by prpdsize32(), EINTR if a signal is pending when the snapshot
 * has to be retried, or the error returned by uiomove().
 */
int
prpdread32(proc_t *p, uint_t hatid, struct uio *uiop)
{
	struct as *as = p->p_as;
	caddr_t buf;
	size_t size;
	prpageheader32_t *php;
	prasmap32_t *pmp;
	struct seg *seg;
	int error;

again:
	AS_LOCK_ENTER(as, RW_WRITER);

	if ((seg = AS_SEGFIRST(as)) == NULL) {
		AS_LOCK_EXIT(as);
		return (0);
	}
	/* Size the buffer from a first pass over the segments. */
	size = prpdsize32(as);
	if (uiop->uio_resid < size) {
		AS_LOCK_EXIT(as);
		return (E2BIG);
	}

	buf = kmem_zalloc(size, KM_SLEEP);
	php = (prpageheader32_t *)buf;
	/* The first map record starts immediately after the header. */
	pmp = (prasmap32_t *)(buf + sizeof (prpageheader32_t));

	hrt2ts32(gethrtime(), &php->pr_tstamp);
	php->pr_nmap = 0;
	php->pr_npage = 0;
	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
		caddr_t saddr, naddr;
		void *tmp = NULL;

		/* Segments flagged S_HOLE produce no map records. */
		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			struct vnode *vp;
			struct vattr vattr;
			size_t len;
			size_t npage;
			uint_t prot;
			uintptr_t next;

			prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
			if ((len = (size_t)(naddr - saddr)) == 0)
				continue;
			npage = len / PAGESIZE;
			/* Where the next record would start in the buffer. */
			next = (uintptr_t)(pmp + 1) + round8(npage);
			/*
			 * It's possible that the address space can change
			 * subtly even though we're holding as->a_lock
			 * due to the nondeterminism of page_exists() in
			 * the presence of asynchronously flushed pages or
			 * mapped files whose sizes are changing.
			 * page_exists() may be called indirectly from
			 * pr_getprot() by a SEGOP_INCORE() routine.
			 * If this happens we need to make sure we don't
			 * overrun the buffer whose size we computed based
			 * on the initial iteration through the segments.
			 * Once we've detected an overflow, we need to clean
			 * up the temporary memory allocated in pr_getprot()
			 * and retry. If there's a pending signal, we return
			 * EINTR so that this thread can be dislodged if
			 * a latent bug causes us to spin indefinitely.
			 */
			if (next > (uintptr_t)buf + size) {
				pr_getprot_done(&tmp);
				AS_LOCK_EXIT(as);

				kmem_free(buf, size);

				if (ISSIG(curthread, JUSTLOOKING))
					return (EINTR);

				goto again;
			}

			php->pr_nmap++;
			php->pr_npage += npage;
			/* Address and page count are narrowed to 32 bits. */
			pmp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
			pmp->pr_npage = (size32_t)npage;
			pmp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
			/* Translate protections and mapping type to MA_* */
			pmp->pr_mflags = 0;
			if (prot & PROT_READ)
				pmp->pr_mflags |= MA_READ;
			if (prot & PROT_WRITE)
				pmp->pr_mflags |= MA_WRITE;
			if (prot & PROT_EXEC)
				pmp->pr_mflags |= MA_EXEC;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
				pmp->pr_mflags |= MA_SHARED;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
				pmp->pr_mflags |= MA_NORESERVE;
			/*
			 * Shared-memory (segspt) segments and segvn segments
			 * with no vnode are reported as anonymous.
			 */
			if (seg->s_ops == &segspt_shmops ||
			    (seg->s_ops == &segvn_ops &&
			    (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
				pmp->pr_mflags |= MA_ANON;
			if (seg->s_ops == &segspt_shmops)
				pmp->pr_mflags |= MA_ISM | MA_SHM;
			pmp->pr_pagesize = PAGESIZE;
			/*
			 * Manufacture a filename for the "object" directory.
			 */
			vattr.va_mask = AT_FSID|AT_NODEID;
			if (seg->s_ops == &segvn_ops &&
			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
			    vp != NULL && vp->v_type == VREG &&
			    VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
				if (vp == p->p_exec)
					(void) strcpy(pmp->pr_mapname, "a.out");
				else
					pr_object_name(pmp->pr_mapname,
					    vp, &vattr);
			}

			/*
			 * Get the SysV shared memory id, if any.
			 */
			if ((pmp->pr_mflags & MA_SHARED) && p->p_segacct &&
			    (pmp->pr_shmid = shmgetid(p, seg->s_base)) !=
			    SHMID_NONE) {
				if (pmp->pr_shmid == SHMID_FREE)
					pmp->pr_shmid = -1;

				pmp->pr_mflags |= MA_SHM;
			} else {
				pmp->pr_shmid = -1;
			}

			/* Per-page data goes right after the map record. */
			hat_getstat(as, saddr, len, hatid,
			    (char *)(pmp + 1), HAT_SYNC_ZERORM);
			pmp = (prasmap32_t *)next;
		}
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	AS_LOCK_EXIT(as);

	/* Copy out only the part of the buffer that was filled in. */
	ASSERT((uintptr_t)pmp <= (uintptr_t)buf + size);
	error = uiomove(buf, (caddr_t)pmp - buf, UIO_READ, uiop);
	kmem_free(buf, size);

	return (error);
}
2444 #endif	/* _SYSCALL32_IMPL */
2445 
2446 ushort_t
2447 prgetpctcpu(uint64_t pct)
2448 {
2449 	/*
2450 	 * The value returned will be relevant in the zone of the examiner,
2451 	 * which may not be the same as the zone which performed the procfs
2452 	 * mount.
2453 	 */
2454 	int nonline = zone_ncpus_online_get(curproc->p_zone);
2455 
2456 	/*
2457 	 * Prorate over online cpus so we don't exceed 100%
2458 	 */
2459 	if (nonline > 1)
2460 		pct /= nonline;
2461 	pct >>= 16;		/* convert to 16-bit scaled integer */
2462 	if (pct > 0x8000)	/* might happen, due to rounding */
2463 		pct = 0x8000;
2464 	return ((ushort_t)pct);
2465 }
2466 
2467 /*
2468  * Return information used by ps(1).
2469  */
2470 void
2471 prgetpsinfo(proc_t *p, psinfo_t *psp)
2472 {
2473 	kthread_t *t;
2474 	struct cred *cred;
2475 	hrtime_t hrutime, hrstime;
2476 
2477 	ASSERT(MUTEX_HELD(&p->p_lock));
2478 
2479 	if ((t = prchoose(p)) == NULL)	/* returns locked thread */
2480 		bzero(psp, sizeof (*psp));
2481 	else {
2482 		thread_unlock(t);
2483 		bzero(psp, sizeof (*psp) - sizeof (psp->pr_lwp));
2484 	}
2485 
2486 	/*
2487 	 * only export SSYS and SMSACCT; everything else is off-limits to
2488 	 * userland apps.
2489 	 */
2490 	psp->pr_flag = p->p_flag & (SSYS | SMSACCT);
2491 	psp->pr_nlwp = p->p_lwpcnt;
2492 	psp->pr_nzomb = p->p_zombcnt;
2493 	mutex_enter(&p->p_crlock);
2494 	cred = p->p_cred;
2495 	psp->pr_uid = crgetruid(cred);
2496 	psp->pr_euid = crgetuid(cred);
2497 	psp->pr_gid = crgetrgid(cred);
2498 	psp->pr_egid = crgetgid(cred);
2499 	mutex_exit(&p->p_crlock);
2500 	psp->pr_pid = p->p_pid;
2501 	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
2502 	    (p->p_flag & SZONETOP)) {
2503 		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
2504 		/*
2505 		 * Inside local zones, fake zsched's pid as parent pids for
2506 		 * processes which reference processes outside of the zone.
2507 		 */
2508 		psp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
2509 	} else {
2510 		psp->pr_ppid = p->p_ppid;
2511 	}
2512 	psp->pr_pgid = p->p_pgrp;
2513 	psp->pr_sid = p->p_sessp->s_sid;
2514 	psp->pr_taskid = p->p_task->tk_tkid;
2515 	psp->pr_projid = p->p_task->tk_proj->kpj_id;
2516 	psp->pr_poolid = p->p_pool->pool_id;
2517 	psp->pr_zoneid = p->p_zone->zone_id;
2518 	if ((psp->pr_contract = PRCTID(p)) == 0)
2519 		psp->pr_contract = -1;
2520 	psp->pr_addr = (uintptr_t)prgetpsaddr(p);
2521 	switch (p->p_model) {
2522 	case DATAMODEL_ILP32:
2523 		psp->pr_dmodel = PR_MODEL_ILP32;
2524 		break;
2525 	case DATAMODEL_LP64:
2526 		psp->pr_dmodel = PR_MODEL_LP64;
2527 		break;
2528 	}
2529 	hrutime = mstate_aggr_state(p, LMS_USER);
2530 	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
2531 	hrt2ts((hrutime + hrstime), &psp->pr_time);
2532 	TICK_TO_TIMESTRUC(p->p_cutime + p->p_cstime, &psp->pr_ctime);
2533 
2534 	if (t == NULL) {
2535 		int wcode = p->p_wcode;		/* must be atomic read */
2536 
2537 		if (wcode)
2538 			psp->pr_wstat = wstat(wcode, p->p_wdata);
2539 		psp->pr_ttydev = PRNODEV;
2540 		psp->pr_lwp.pr_state = SZOMB;
2541 		psp->pr_lwp.pr_sname = 'Z';
2542 		psp->pr_lwp.pr_bindpro = PBIND_NONE;
2543 		psp->pr_lwp.pr_bindpset = PS_NONE;
2544 	} else {
2545 		user_t *up = PTOU(p);
2546 		struct as *as;
2547 		dev_t d;
2548 		extern dev_t rwsconsdev, rconsdev, uconsdev;
2549 
2550 		d = cttydev(p);
2551 		/*
2552 		 * If the controlling terminal is the real
2553 		 * or workstation console device, map to what the
2554 		 * user thinks is the console device. Handle case when
2555 		 * rwsconsdev or rconsdev is set to NODEV for Starfire.
2556 		 */
2557 		if ((d == rwsconsdev || d == rconsdev) && d != NODEV)
2558 			d = uconsdev;
2559 		psp->pr_ttydev = (d == NODEV) ? PRNODEV : d;
2560 		psp->pr_start = up->u_start;
2561 		bcopy(up->u_comm, psp->pr_fname,
2562 		    MIN(sizeof (up->u_comm), sizeof (psp->pr_fname)-1));
2563 		bcopy(up->u_psargs, psp->pr_psargs,
2564 		    MIN(PRARGSZ-1, PSARGSZ));
2565 		psp->pr_argc = up->u_argc;
2566 		psp->pr_argv = up->u_argv;
2567 		psp->pr_envp = up->u_envp;
2568 
2569 		/* get the chosen lwp's lwpsinfo */
2570 		prgetlwpsinfo(t, &psp->pr_lwp);
2571 
2572 		/* compute %cpu for the process */
2573 		if (p->p_lwpcnt == 1)
2574 			psp->pr_pctcpu = psp->pr_lwp.pr_pctcpu;
2575 		else {
2576 			uint64_t pct = 0;
2577 			hrtime_t cur_time = gethrtime_unscaled();
2578 
2579 			t = p->p_tlist;
2580 			do {
2581 				pct += cpu_update_pct(t, cur_time);
2582 			} while ((t = t->t_forw) != p->p_tlist);
2583 
2584 			psp->pr_pctcpu = prgetpctcpu(pct);
2585 		}
2586 		if ((p->p_flag & SSYS) || (as = p->p_as) == &kas) {
2587 			psp->pr_size = 0;
2588 			psp->pr_rssize = 0;
2589 		} else {
2590 			mutex_exit(&p->p_lock);
2591 			AS_LOCK_ENTER(as, RW_READER);
2592 			psp->pr_size = btopr(as->a_resvsize) *
2593 			    (PAGESIZE / 1024);
2594 			psp->pr_rssize = rm_asrss(as) * (PAGESIZE / 1024);
2595 			psp->pr_pctmem = rm_pctmemory(as);
2596 			AS_LOCK_EXIT(as);
2597 			mutex_enter(&p->p_lock);
2598 		}
2599 	}
2600 }
2601 
2602 static size_t
2603 prfdinfomisc(list_t *data, uint_t type, const void *val, size_t vlen)
2604 {
2605 	pr_misc_header_t *misc;
2606 	size_t len;
2607 
2608 	len = PRFDINFO_ROUNDUP(sizeof (*misc) + vlen);
2609 
2610 	if (data != NULL) {
2611 		misc = pr_iol_newbuf(data, len);
2612 		misc->pr_misc_type = type;
2613 		misc->pr_misc_size = len;
2614 		misc++;
2615 		bcopy((char *)val, (char *)misc, vlen);
2616 	}
2617 
2618 	return (len);
2619 }
2620 
2621 /*
2622  * There's no elegant way to determine if a character device
2623  * supports TLI, so just check a hardcoded list of known TLI
2624  * devices.
2625  */
2626 
2627 static boolean_t
2628 pristli(vnode_t *vp)
2629 {
2630 	static const char *tlidevs[] = {
2631 	    "udp", "udp6", "tcp", "tcp6"
2632 	};
2633 	char *devname;
2634 	uint_t i;
2635 
2636 	ASSERT(vp != NULL);
2637 
2638 	if (vp->v_type != VCHR || vp->v_stream == NULL || vp->v_rdev == 0)
2639 		return (B_FALSE);
2640 
2641 	if ((devname = mod_major_to_name(getmajor(vp->v_rdev))) == NULL)
2642 		return (B_FALSE);
2643 
2644 	for (i = 0; i < ARRAY_SIZE(tlidevs); i++) {
2645 		if (strcmp(devname, tlidevs[i]) == 0)
2646 			return (B_TRUE);
2647 	}
2648 
2649 	return (B_FALSE);
2650 }
2651 
2652 static size_t
2653 prfdinfopath(proc_t *p, vnode_t *vp, list_t *data, cred_t *cred)
2654 {
2655 	char *pathname;
2656 	size_t pathlen;
2657 	size_t sz = 0;
2658 
2659 	/*
2660 	 * The global zone's path to a file in a non-global zone can exceed
2661 	 * MAXPATHLEN.
2662 	 */
2663 	pathlen = MAXPATHLEN * 2 + 1;
2664 	pathname = kmem_alloc(pathlen, KM_SLEEP);
2665 
2666 	if (vnodetopath(NULL, vp, pathname, pathlen, cred) == 0) {
2667 		sz += prfdinfomisc(data, PR_PATHNAME,
2668 		    pathname, strlen(pathname) + 1);
2669 	}
2670 
2671 	kmem_free(pathname, pathlen);
2672 
2673 	return (sz);
2674 }
2675 
2676 static size_t
2677 prfdinfotlisockopt(vnode_t *vp, list_t *data, cred_t *cred)
2678 {
2679 	strcmd_t strcmd;
2680 	int32_t rval;
2681 	size_t sz = 0;
2682 
2683 	strcmd.sc_cmd = TI_GETMYNAME;
2684 	strcmd.sc_timeout = 1;
2685 	strcmd.sc_len = STRCMDBUFSIZE;
2686 
2687 	if (VOP_IOCTL(vp, _I_CMD, (intptr_t)&strcmd, FKIOCTL, cred,
2688 	    &rval, NULL) == 0 && strcmd.sc_len > 0) {
2689 		sz += prfdinfomisc(data, PR_SOCKETNAME, strcmd.sc_buf,
2690 		    strcmd.sc_len);
2691 	}
2692 
2693 	strcmd.sc_cmd = TI_GETPEERNAME;
2694 	strcmd.sc_timeout = 1;
2695 	strcmd.sc_len = STRCMDBUFSIZE;
2696 
2697 	if (VOP_IOCTL(vp, _I_CMD, (intptr_t)&strcmd, FKIOCTL, cred,
2698 	    &rval, NULL) == 0 && strcmd.sc_len > 0) {
2699 		sz += prfdinfomisc(data, PR_PEERSOCKNAME, strcmd.sc_buf,
2700 		    strcmd.sc_len);
2701 	}
2702 
2703 	return (sz);
2704 }
2705 
2706 static size_t
2707 prfdinfosockopt(vnode_t *vp, list_t *data, cred_t *cred)
2708 {
2709 	sonode_t *so;
2710 	socklen_t vlen;
2711 	size_t sz = 0;
2712 	uint_t i;
2713 
2714 	if (vp->v_stream != NULL) {
2715 		so = VTOSO(vp->v_stream->sd_vnode);
2716 
2717 		if (so->so_version == SOV_STREAM)
2718 			so = NULL;
2719 	} else {
2720 		so = VTOSO(vp);
2721 	}
2722 
2723 	if (so == NULL)
2724 		return (0);
2725 
2726 	DTRACE_PROBE1(sonode, sonode_t *, so);
2727 
2728 	/* prmisc - PR_SOCKETNAME */
2729 
2730 	struct sockaddr_storage buf;
2731 	struct sockaddr *name = (struct sockaddr *)&buf;
2732 
2733 	vlen = sizeof (buf);
2734 	if (SOP_GETSOCKNAME(so, name, &vlen, cred) == 0 && vlen > 0)
2735 		sz += prfdinfomisc(data, PR_SOCKETNAME, name, vlen);
2736 
2737 	/* prmisc - PR_PEERSOCKNAME */
2738 
2739 	vlen = sizeof (buf);
2740 	if (SOP_GETPEERNAME(so, name, &vlen, B_FALSE, cred) == 0 && vlen > 0)
2741 		sz += prfdinfomisc(data, PR_PEERSOCKNAME, name, vlen);
2742 
2743 	/* prmisc - PR_SOCKOPTS_BOOL_OPTS */
2744 
2745 	static struct boolopt {
2746 		int		level;
2747 		int		opt;
2748 		int		bopt;
2749 	} boolopts[] = {
2750 		{ SOL_SOCKET, SO_DEBUG,		PR_SO_DEBUG },
2751 		{ SOL_SOCKET, SO_REUSEADDR,	PR_SO_REUSEADDR },
2752 #ifdef SO_REUSEPORT
2753 		/* SmartOS and OmniOS have SO_REUSEPORT */
2754 		{ SOL_SOCKET, SO_REUSEPORT,	PR_SO_REUSEPORT },
2755 #endif
2756 		{ SOL_SOCKET, SO_KEEPALIVE,	PR_SO_KEEPALIVE },
2757 		{ SOL_SOCKET, SO_DONTROUTE,	PR_SO_DONTROUTE },
2758 		{ SOL_SOCKET, SO_BROADCAST,	PR_SO_BROADCAST },
2759 		{ SOL_SOCKET, SO_OOBINLINE,	PR_SO_OOBINLINE },
2760 		{ SOL_SOCKET, SO_DGRAM_ERRIND,	PR_SO_DGRAM_ERRIND },
2761 		{ SOL_SOCKET, SO_ALLZONES,	PR_SO_ALLZONES },
2762 		{ SOL_SOCKET, SO_MAC_EXEMPT,	PR_SO_MAC_EXEMPT },
2763 		{ SOL_SOCKET, SO_MAC_IMPLICIT,	PR_SO_MAC_IMPLICIT },
2764 		{ SOL_SOCKET, SO_EXCLBIND,	PR_SO_EXCLBIND },
2765 		{ SOL_SOCKET, SO_VRRP,		PR_SO_VRRP },
2766 		{ IPPROTO_UDP, UDP_NAT_T_ENDPOINT,
2767 		    PR_UDP_NAT_T_ENDPOINT }
2768 	};
2769 	prsockopts_bool_opts_t opts;
2770 	int val;
2771 
2772 	if (data != NULL) {
2773 		opts.prsock_bool_opts = 0;
2774 
2775 		for (i = 0; i < ARRAY_SIZE(boolopts); i++) {
2776 			vlen = sizeof (val);
2777 			if (SOP_GETSOCKOPT(so, boolopts[i].level,
2778 			    boolopts[i].opt, &val, &vlen, 0, cred) == 0 &&
2779 			    val != 0) {
2780 				opts.prsock_bool_opts |= boolopts[i].bopt;
2781 			}
2782 		}
2783 	}
2784 
2785 	sz += prfdinfomisc(data, PR_SOCKOPTS_BOOL_OPTS, &opts, sizeof (opts));
2786 
2787 	/* prmisc - PR_SOCKOPT_LINGER */
2788 
2789 	struct linger l;
2790 
2791 	vlen = sizeof (l);
2792 	if (SOP_GETSOCKOPT(so, SOL_SOCKET, SO_LINGER, &l, &vlen,
2793 	    0, cred) == 0 && vlen > 0) {
2794 		sz += prfdinfomisc(data, PR_SOCKOPT_LINGER, &l, vlen);
2795 	}
2796 
2797 	/* prmisc - PR_SOCKOPT_* int types */
2798 
2799 	static struct sopt {
2800 		int		level;
2801 		int		opt;
2802 		int		bopt;
2803 	} sopts[] = {
2804 		{ SOL_SOCKET, SO_TYPE,		PR_SOCKOPT_TYPE },
2805 		{ SOL_SOCKET, SO_SNDBUF,	PR_SOCKOPT_SNDBUF },
2806 		{ SOL_SOCKET, SO_RCVBUF,	PR_SOCKOPT_RCVBUF }
2807 	};
2808 
2809 	for (i = 0; i < ARRAY_SIZE(sopts); i++) {
2810 		vlen = sizeof (val);
2811 		if (SOP_GETSOCKOPT(so, sopts[i].level, sopts[i].opt,
2812 		    &val, &vlen, 0, cred) == 0 && vlen > 0) {
2813 			sz += prfdinfomisc(data, sopts[i].bopt, &val, vlen);
2814 		}
2815 	}
2816 
2817 	/* prmisc - PR_SOCKOPT_IP_NEXTHOP */
2818 
2819 	in_addr_t nexthop_val;
2820 
2821 	vlen = sizeof (nexthop_val);
2822 	if (SOP_GETSOCKOPT(so, IPPROTO_IP, IP_NEXTHOP,
2823 	    &nexthop_val, &vlen, 0, cred) == 0 && vlen > 0) {
2824 		sz += prfdinfomisc(data, PR_SOCKOPT_IP_NEXTHOP,
2825 		    &nexthop_val, vlen);
2826 	}
2827 
2828 	/* prmisc - PR_SOCKOPT_IPV6_NEXTHOP */
2829 
2830 	struct sockaddr_in6 nexthop6_val;
2831 
2832 	vlen = sizeof (nexthop6_val);
2833 	if (SOP_GETSOCKOPT(so, IPPROTO_IPV6, IPV6_NEXTHOP,
2834 	    &nexthop6_val, &vlen, 0, cred) == 0 && vlen > 0) {
2835 		sz += prfdinfomisc(data, PR_SOCKOPT_IPV6_NEXTHOP,
2836 		    &nexthop6_val, vlen);
2837 	}
2838 
2839 	/* prmisc - PR_SOCKOPT_TCP_CONGESTION */
2840 
2841 	char cong[CC_ALGO_NAME_MAX];
2842 
2843 	vlen = sizeof (cong);
2844 	if (SOP_GETSOCKOPT(so, IPPROTO_TCP, TCP_CONGESTION,
2845 	    &cong, &vlen, 0, cred) == 0 && vlen > 0) {
2846 		sz += prfdinfomisc(data, PR_SOCKOPT_TCP_CONGESTION, cong, vlen);
2847 	}
2848 
2849 	/* prmisc - PR_SOCKFILTERS_PRIV */
2850 
2851 	struct fil_info fi;
2852 
2853 	vlen = sizeof (fi);
2854 	if (SOP_GETSOCKOPT(so, SOL_FILTER, FIL_LIST,
2855 	    &fi, &vlen, 0, cred) == 0 && vlen != 0) {
2856 		pr_misc_header_t *misc;
2857 		size_t len;
2858 
2859 		/*
2860 		 * We limit the number of returned filters to 32.
2861 		 * This is the maximum number that pfiles will print
2862 		 * anyway.
2863 		 */
2864 		vlen = MIN(32, fi.fi_pos + 1);
2865 		vlen *= sizeof (fi);
2866 
2867 		len = PRFDINFO_ROUNDUP(sizeof (*misc) + vlen);
2868 		sz += len;
2869 
2870 		if (data != NULL) {
2871 			/*
2872 			 * So that the filter list can be built incrementally,
2873 			 * prfdinfomisc() is not used here. Instead we
2874 			 * allocate a buffer directly on the copyout list using
2875 			 * pr_iol_newbuf()
2876 			 */
2877 			misc = pr_iol_newbuf(data, len);
2878 			misc->pr_misc_type = PR_SOCKFILTERS_PRIV;
2879 			misc->pr_misc_size = len;
2880 			misc++;
2881 			len = vlen;
2882 			if (SOP_GETSOCKOPT(so, SOL_FILTER, FIL_LIST,
2883 			    misc, &vlen, 0, cred) == 0) {
2884 				/*
2885 				 * In case the number of filters has reduced
2886 				 * since the first call, explicitly zero out
2887 				 * any unpopulated space.
2888 				 */
2889 				if (vlen < len)
2890 					bzero(misc + vlen, len - vlen);
2891 			} else {
2892 				/* Something went wrong, zero out the result */
2893 				bzero(misc, vlen);
2894 			}
2895 		}
2896 	}
2897 
2898 	return (sz);
2899 }
2900 
/*
 * State passed through nm_walk_mounts() to the prfdinfo_nm_path() callback.
 */
typedef struct prfdinfo_nm_path_cbdata {
	proc_t		*nmp_p;		/* passed to prfdinfopath() */
	u_offset_t	nmp_sz;		/* sum of record sizes so far */
	list_t		*nmp_data;	/* output list; may be NULL */
} prfdinfo_nm_path_cbdata_t;
2906 
2907 static int
2908 prfdinfo_nm_path(const struct namenode *np, cred_t *cred, void *arg)
2909 {
2910 	prfdinfo_nm_path_cbdata_t *cb = arg;
2911 
2912 	cb->nmp_sz += prfdinfopath(cb->nmp_p, np->nm_vnode, cb->nmp_data, cred);
2913 
2914 	return (0);
2915 }
2916 
2917 u_offset_t
2918 prgetfdinfosize(proc_t *p, vnode_t *vp, cred_t *cred)
2919 {
2920 	u_offset_t sz;
2921 
2922 	/*
2923 	 * All fdinfo files will be at least this big -
2924 	 * sizeof fdinfo struct + zero length trailer
2925 	 */
2926 	sz = offsetof(prfdinfo_t, pr_misc) + sizeof (pr_misc_header_t);
2927 
2928 	/* Pathname */
2929 	switch (vp->v_type) {
2930 	case VDOOR: {
2931 		prfdinfo_nm_path_cbdata_t cb = {
2932 			.nmp_p		= p,
2933 			.nmp_data	= NULL,
2934 			.nmp_sz		= 0
2935 		};
2936 
2937 		(void) nm_walk_mounts(vp, prfdinfo_nm_path, cred, &cb);
2938 		sz += cb.nmp_sz;
2939 		break;
2940 	}
2941 	case VSOCK:
2942 		break;
2943 	default:
2944 		sz += prfdinfopath(p, vp, NULL, cred);
2945 	}
2946 
2947 	/* Socket options */
2948 	if (vp->v_type == VSOCK)
2949 		sz += prfdinfosockopt(vp, NULL, cred);
2950 
2951 	/* TLI/XTI sockets */
2952 	if (pristli(vp))
2953 		sz += prfdinfotlisockopt(vp, NULL, cred);
2954 
2955 	return (sz);
2956 }
2957 
/*
 * Fill in the fixed part of the fdinfo structure for vnode vp and append
 * the variable-length miscellaneous records (paths, socket names and
 * options) to the 'data' list, finishing with a zero-size terminator
 * record.
 *
 * 'cred' is the caller's credential and 'file_cred' is the credential for
 * the file being inspected.  Failures of the individual lookups are not
 * reported to the caller; the affected fields simply keep the defaults set
 * at the top of the function.  Always returns 0.
 */
int
prgetfdinfo(proc_t *p, vnode_t *vp, prfdinfo_t *fdinfo, cred_t *cred,
    cred_t *file_cred, list_t *data)
{
	vattr_t vattr;
	int error;

	/*
	 * The buffer has been initialised to zero by pr_iol_newbuf().
	 * Initialise defaults for any values that should not default to zero.
	 */
	fdinfo->pr_uid = (uid_t)-1;
	fdinfo->pr_gid = (gid_t)-1;
	fdinfo->pr_size = -1;
	fdinfo->pr_locktype = F_UNLCK;
	fdinfo->pr_lockpid = -1;
	fdinfo->pr_locksysid = -1;
	fdinfo->pr_peerpid = -1;

	/* Offset */

	/*
	 * pr_offset has already been set from the underlying file_t.
	 * Check if it is plausible and reset to -1 if not.
	 */
	if (fdinfo->pr_offset != -1 &&
	    VOP_SEEK(vp, 0, (offset_t *)&fdinfo->pr_offset, NULL) != 0)
		fdinfo->pr_offset = -1;

	/*
	 * Attributes
	 *
	 * We have two cred_t structures available here.
	 * 'cred' is the caller's credential, and 'file_cred' is the credential
	 * for the file being inspected.
	 *
	 * When looking up the file attributes, file_cred is used in order
	 * that the correct ownership is set for doors and FIFOs. Since the
	 * caller has permission to read the fdinfo file in proc, this does
	 * not expose any additional information.
	 */
	vattr.va_mask = AT_STAT;
	if (VOP_GETATTR(vp, &vattr, 0, file_cred, NULL) == 0) {
		fdinfo->pr_major = getmajor(vattr.va_fsid);
		fdinfo->pr_minor = getminor(vattr.va_fsid);
		fdinfo->pr_rmajor = getmajor(vattr.va_rdev);
		fdinfo->pr_rminor = getminor(vattr.va_rdev);
		fdinfo->pr_ino = (ino64_t)vattr.va_nodeid;
		fdinfo->pr_size = (off64_t)vattr.va_size;
		fdinfo->pr_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
		fdinfo->pr_uid = vattr.va_uid;
		fdinfo->pr_gid = vattr.va_gid;
		if (vp->v_type == VSOCK)
			fdinfo->pr_fileflags |= sock_getfasync(vp);
	}

	/* locks */

	flock64_t bf;

	/*
	 * Issue F_GETLK with a write-lock request over a zeroed flock; if
	 * the type comes back as anything other than F_UNLCK, record the
	 * lock that was reported.
	 */
	bzero(&bf, sizeof (bf));
	bf.l_type = F_WRLCK;

	if (VOP_FRLOCK(vp, F_GETLK, &bf,
	    (uint16_t)(fdinfo->pr_fileflags & 0xffff), 0, NULL,
	    cred, NULL) == 0 && bf.l_type != F_UNLCK) {
		fdinfo->pr_locktype = bf.l_type;
		fdinfo->pr_lockpid = bf.l_pid;
		fdinfo->pr_locksysid = bf.l_sysid;
	}

	/* peer cred */

	k_peercred_t kpc;

	/*
	 * kpc is only examined below when error is 0; every path through
	 * this switch sets error.
	 */
	switch (vp->v_type) {
	case VFIFO:
	case VSOCK: {
		int32_t rval;

		error = VOP_IOCTL(vp, _I_GETPEERCRED, (intptr_t)&kpc,
		    FKIOCTL, cred, &rval, NULL);
		break;
	}
	case VCHR: {
		struct strioctl strioc;
		int32_t rval;

		/* Only character devices that are streams are queried. */
		if (vp->v_stream == NULL) {
			error = ENOTSUP;
			break;
		}
		strioc.ic_cmd = _I_GETPEERCRED;
		strioc.ic_timout = INFTIM;
		strioc.ic_len = (int)sizeof (k_peercred_t);
		strioc.ic_dp = (char *)&kpc;

		error = strdoioctl(vp->v_stream, &strioc, FNATIVE | FKIOCTL,
		    STR_NOSIG | K_TO_K, cred, &rval);
		break;
	}
	default:
		error = ENOTSUP;
		break;
	}

	if (error == 0 && kpc.pc_cr != NULL) {
		proc_t *peerp;

		fdinfo->pr_peerpid = kpc.pc_cpid;

		/* Only the pid is needed; release the returned credential. */
		crfree(kpc.pc_cr);

		/*
		 * Look the peer up under pidlock, then take its p_lock
		 * before dropping pidlock and copying its command name.
		 */
		mutex_enter(&pidlock);
		if ((peerp = prfind(fdinfo->pr_peerpid)) != NULL) {
			user_t *up;

			mutex_enter(&peerp->p_lock);
			mutex_exit(&pidlock);

			up = PTOU(peerp);
			bcopy(up->u_comm, fdinfo->pr_peername,
			    MIN(sizeof (up->u_comm),
			    sizeof (fdinfo->pr_peername) - 1));

			mutex_exit(&peerp->p_lock);
		} else {
			mutex_exit(&pidlock);
		}
	}

	/* pathname */

	switch (vp->v_type) {
	case VDOOR: {
		prfdinfo_nm_path_cbdata_t cb = {
			.nmp_p		= p,
			.nmp_data	= data,
			.nmp_sz		= 0
		};

		(void) nm_walk_mounts(vp, prfdinfo_nm_path, cred, &cb);
		break;
	}
	case VSOCK:
		/*
		 * Don't attempt to determine the path for a socket as the
		 * vnode has no associated v_path. It will cause a linear scan
		 * of the dnlc table and result in no path being found.
		 */
		break;
	default:
		(void) prfdinfopath(p, vp, data, cred);
	}

	/* socket options */
	if (vp->v_type == VSOCK)
		(void) prfdinfosockopt(vp, data, cred);

	/* TLI/XTI stream sockets */
	if (pristli(vp))
		(void) prfdinfotlisockopt(vp, data, cred);

	/*
	 * Add a terminating header with a zero size.
	 */
	pr_misc_header_t *misc;

	misc = pr_iol_newbuf(data, sizeof (*misc));
	misc->pr_misc_size = 0;
	misc->pr_misc_type = (uint_t)-1;

	return (0);
}
3132 
3133 #ifdef _SYSCALL32_IMPL
3134 void
3135 prgetpsinfo32(proc_t *p, psinfo32_t *psp)
3136 {
3137 	kthread_t *t;
3138 	struct cred *cred;
3139 	hrtime_t hrutime, hrstime;
3140 
3141 	ASSERT(MUTEX_HELD(&p->p_lock));
3142 
3143 	if ((t = prchoose(p)) == NULL)	/* returns locked thread */
3144 		bzero(psp, sizeof (*psp));
3145 	else {
3146 		thread_unlock(t);
3147 		bzero(psp, sizeof (*psp) - sizeof (psp->pr_lwp));
3148 	}
3149 
3150 	/*
3151 	 * only export SSYS and SMSACCT; everything else is off-limits to
3152 	 * userland apps.
3153 	 */
3154 	psp->pr_flag = p->p_flag & (SSYS | SMSACCT);
3155 	psp->pr_nlwp = p->p_lwpcnt;
3156 	psp->pr_nzomb = p->p_zombcnt;
3157 	mutex_enter(&p->p_crlock);
3158 	cred = p->p_cred;
3159 	psp->pr_uid = crgetruid(cred);
3160 	psp->pr_euid = crgetuid(cred);
3161 	psp->pr_gid = crgetrgid(cred);
3162 	psp->pr_egid = crgetgid(cred);
3163 	mutex_exit(&p->p_crlock);
3164 	psp->pr_pid = p->p_pid;
3165 	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
3166 	    (p->p_flag & SZONETOP)) {
3167 		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
3168 		/*
3169 		 * Inside local zones, fake zsched's pid as parent pids for
3170 		 * processes which reference processes outside of the zone.
3171 		 */
3172 		psp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
3173 	} else {
3174 		psp->pr_ppid = p->p_ppid;
3175 	}
3176 	psp->pr_pgid = p->p_pgrp;
3177 	psp->pr_sid = p->p_sessp->s_sid;
3178 	psp->pr_taskid = p->p_task->tk_tkid;
3179 	psp->pr_projid = p->p_task->tk_proj->kpj_id;
3180 	psp->pr_poolid = p->p_pool->pool_id;
3181 	psp->pr_zoneid = p->p_zone->zone_id;
3182 	if ((psp->pr_contract = PRCTID(p)) == 0)
3183 		psp->pr_contract = -1;
3184 	psp->pr_addr = 0;	/* cannot represent 64-bit addr in 32 bits */
3185 	switch (p->p_model) {
3186 	case DATAMODEL_ILP32:
3187 		psp->pr_dmodel = PR_MODEL_ILP32;
3188 		break;
3189 	case DATAMODEL_LP64:
3190 		psp->pr_dmodel = PR_MODEL_LP64;
3191 		break;
3192 	}
3193 	hrutime = mstate_aggr_state(p, LMS_USER);
3194 	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
3195 	hrt2ts32(hrutime + hrstime, &psp->pr_time);
3196 	TICK_TO_TIMESTRUC32(p->p_cutime + p->p_cstime, &psp->pr_ctime);
3197 
3198 	if (t == NULL) {
3199 		extern int wstat(int, int);	/* needs a header file */
3200 		int wcode = p->p_wcode;		/* must be atomic read */
3201 
3202 		if (wcode)
3203 			psp->pr_wstat = wstat(wcode, p->p_wdata);
3204 		psp->pr_ttydev = PRNODEV32;
3205 		psp->pr_lwp.pr_state = SZOMB;
3206 		psp->pr_lwp.pr_sname = 'Z';
3207 	} else {
3208 		user_t *up = PTOU(p);
3209 		struct as *as;
3210 		dev_t d;
3211 		extern dev_t rwsconsdev, rconsdev, uconsdev;
3212 
3213 		d = cttydev(p);
3214 		/*
3215 		 * If the controlling terminal is the real
3216 		 * or workstation console device, map to what the
3217 		 * user thinks is the console device. Handle case when
3218 		 * rwsconsdev or rconsdev is set to NODEV for Starfire.
3219 		 */
3220 		if ((d == rwsconsdev || d == rconsdev) && d != NODEV)
3221 			d = uconsdev;
3222 		(void) cmpldev(&psp->pr_ttydev, d);
3223 		TIMESPEC_TO_TIMESPEC32(&psp->pr_start, &up->u_start);
3224 		bcopy(up->u_comm, psp->pr_fname,
3225 		    MIN(sizeof (up->u_comm), sizeof (psp->pr_fname)-1));
3226 		bcopy(up->u_psargs, psp->pr_psargs,
3227 		    MIN(PRARGSZ-1, PSARGSZ));
3228 		psp->pr_argc = up->u_argc;
3229 		psp->pr_argv = (caddr32_t)up->u_argv;
3230 		psp->pr_envp = (caddr32_t)up->u_envp;
3231 
3232 		/* get the chosen lwp's lwpsinfo */
3233 		prgetlwpsinfo32(t, &psp->pr_lwp);
3234 
3235 		/* compute %cpu for the process */
3236 		if (p->p_lwpcnt == 1)
3237 			psp->pr_pctcpu = psp->pr_lwp.pr_pctcpu;
3238 		else {
3239 			uint64_t pct = 0;
3240 			hrtime_t cur_time;
3241 
3242 			t = p->p_tlist;
3243 			cur_time = gethrtime_unscaled();
3244 			do {
3245 				pct += cpu_update_pct(t, cur_time);
3246 			} while ((t = t->t_forw) != p->p_tlist);
3247 
3248 			psp->pr_pctcpu = prgetpctcpu(pct);
3249 		}
3250 		if ((p->p_flag & SSYS) || (as = p->p_as) == &kas) {
3251 			psp->pr_size = 0;
3252 			psp->pr_rssize = 0;
3253 		} else {
3254 			mutex_exit(&p->p_lock);
3255 			AS_LOCK_ENTER(as, RW_READER);
3256 			psp->pr_size = (size32_t)
3257 			    (btopr(as->a_resvsize) * (PAGESIZE / 1024));
3258 			psp->pr_rssize = (size32_t)
3259 			    (rm_asrss(as) * (PAGESIZE / 1024));
3260 			psp->pr_pctmem = rm_pctmemory(as);
3261 			AS_LOCK_EXIT(as);
3262 			mutex_enter(&p->p_lock);
3263 		}
3264 	}
3265 
3266 	/*
3267 	 * If we are looking at an LP64 process, zero out
3268 	 * the fields that cannot be represented in ILP32.
3269 	 */
3270 	if (p->p_model != DATAMODEL_ILP32) {
3271 		psp->pr_size = 0;
3272 		psp->pr_rssize = 0;
3273 		psp->pr_argv = 0;
3274 		psp->pr_envp = 0;
3275 	}
3276 }
3277 
3278 #endif	/* _SYSCALL32_IMPL */
3279 
3280 void
3281 prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp)
3282 {
3283 	klwp_t *lwp = ttolwp(t);
3284 	sobj_ops_t *sobj;
3285 	char c, state;
3286 	uint64_t pct;
3287 	int retval, niceval;
3288 	hrtime_t hrutime, hrstime;
3289 
3290 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
3291 
3292 	bzero(psp, sizeof (*psp));
3293 
3294 	psp->pr_flag = 0;	/* lwpsinfo_t.pr_flag is deprecated */
3295 	psp->pr_lwpid = t->t_tid;
3296 	psp->pr_addr = (uintptr_t)t;
3297 	psp->pr_wchan = (uintptr_t)t->t_wchan;
3298 
3299 	/* map the thread state enum into a process state enum */
3300 	state = VSTOPPED(t) ? TS_STOPPED : t->t_state;
3301 	switch (state) {
3302 	case TS_SLEEP:		state = SSLEEP;		c = 'S';	break;
3303 	case TS_RUN:		state = SRUN;		c = 'R';	break;
3304 	case TS_ONPROC:		state = SONPROC;	c = 'O';	break;
3305 	case TS_ZOMB:		state = SZOMB;		c = 'Z';	break;
3306 	case TS_STOPPED:	state = SSTOP;		c = 'T';	break;
3307 	case TS_WAIT:		state = SWAIT;		c = 'W';	break;
3308 	default:		state = 0;		c = '?';	break;
3309 	}
3310 	psp->pr_state = state;
3311 	psp->pr_sname = c;
3312 	if ((sobj = t->t_sobj_ops) != NULL)
3313 		psp->pr_stype = SOBJ_TYPE(sobj);
3314 	retval = CL_DONICE(t, NULL, 0, &niceval);
3315 	if (retval == 0) {
3316 		psp->pr_oldpri = v.v_maxsyspri - t->t_pri;
3317 		psp->pr_nice = niceval + NZERO;
3318 	}
3319 	psp->pr_syscall = t->t_sysnum;
3320 	psp->pr_pri = t->t_pri;
3321 	psp->pr_start.tv_sec = t->t_start;
3322 	psp->pr_start.tv_nsec = 0L;
3323 	hrutime = lwp->lwp_mstate.ms_acct[LMS_USER];
3324 	scalehrtime(&hrutime);
3325 	hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] +
3326 	    lwp->lwp_mstate.ms_acct[LMS_TRAP];
3327 	scalehrtime(&hrstime);
3328 	hrt2ts(hrutime + hrstime, &psp->pr_time);
3329 	/* compute %cpu for the lwp */
3330 	pct = cpu_update_pct(t, gethrtime_unscaled());
3331 	psp->pr_pctcpu = prgetpctcpu(pct);
3332 	psp->pr_cpu = (psp->pr_pctcpu*100 + 0x6000) >> 15;	/* [0..99] */
3333 	if (psp->pr_cpu > 99)
3334 		psp->pr_cpu = 99;
3335 
3336 	(void) strncpy(psp->pr_clname, sclass[t->t_cid].cl_name,
3337 	    sizeof (psp->pr_clname) - 1);
3338 	bzero(psp->pr_name, sizeof (psp->pr_name));	/* XXX ??? */
3339 	psp->pr_onpro = t->t_cpu->cpu_id;
3340 	psp->pr_bindpro = t->t_bind_cpu;
3341 	psp->pr_bindpset = t->t_bind_pset;
3342 	psp->pr_lgrp = t->t_lpl->lpl_lgrpid;
3343 }
3344 
3345 #ifdef _SYSCALL32_IMPL
3346 void
3347 prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp)
3348 {
3349 	klwp_t *lwp = ttolwp(t);
3350 	sobj_ops_t *sobj;
3351 	char c, state;
3352 	uint64_t pct;
3353 	int retval, niceval;
3354 	hrtime_t hrutime, hrstime;
3355 
3356 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
3357 
3358 	bzero(psp, sizeof (*psp));
3359 
3360 	psp->pr_flag = 0;	/* lwpsinfo_t.pr_flag is deprecated */
3361 	psp->pr_lwpid = t->t_tid;
3362 	psp->pr_addr = 0;	/* cannot represent 64-bit addr in 32 bits */
3363 	psp->pr_wchan = 0;	/* cannot represent 64-bit addr in 32 bits */
3364 
3365 	/* map the thread state enum into a process state enum */
3366 	state = VSTOPPED(t) ? TS_STOPPED : t->t_state;
3367 	switch (state) {
3368 	case TS_SLEEP:		state = SSLEEP;		c = 'S';	break;
3369 	case TS_RUN:		state = SRUN;		c = 'R';	break;
3370 	case TS_ONPROC:		state = SONPROC;	c = 'O';	break;
3371 	case TS_ZOMB:		state = SZOMB;		c = 'Z';	break;
3372 	case TS_STOPPED:	state = SSTOP;		c = 'T';	break;
3373 	case TS_WAIT:		state = SWAIT;		c = 'W';	break;
3374 	default:		state = 0;		c = '?';	break;
3375 	}
3376 	psp->pr_state = state;
3377 	psp->pr_sname = c;
3378 	if ((sobj = t->t_sobj_ops) != NULL)
3379 		psp->pr_stype = SOBJ_TYPE(sobj);
3380 	retval = CL_DONICE(t, NULL, 0, &niceval);
3381 	if (retval == 0) {
3382 		psp->pr_oldpri = v.v_maxsyspri - t->t_pri;
3383 		psp->pr_nice = niceval + NZERO;
3384 	} else {
3385 		psp->pr_oldpri = 0;
3386 		psp->pr_nice = 0;
3387 	}
3388 	psp->pr_syscall = t->t_sysnum;
3389 	psp->pr_pri = t->t_pri;
3390 	psp->pr_start.tv_sec = (time32_t)t->t_start;
3391 	psp->pr_start.tv_nsec = 0L;
3392 	hrutime = lwp->lwp_mstate.ms_acct[LMS_USER];
3393 	scalehrtime(&hrutime);
3394 	hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] +
3395 	    lwp->lwp_mstate.ms_acct[LMS_TRAP];
3396 	scalehrtime(&hrstime);
3397 	hrt2ts32(hrutime + hrstime, &psp->pr_time);
3398 	/* compute %cpu for the lwp */
3399 	pct = cpu_update_pct(t, gethrtime_unscaled());
3400 	psp->pr_pctcpu = prgetpctcpu(pct);
3401 	psp->pr_cpu = (psp->pr_pctcpu*100 + 0x6000) >> 15;	/* [0..99] */
3402 	if (psp->pr_cpu > 99)
3403 		psp->pr_cpu = 99;
3404 
3405 	(void) strncpy(psp->pr_clname, sclass[t->t_cid].cl_name,
3406 	    sizeof (psp->pr_clname) - 1);
3407 	bzero(psp->pr_name, sizeof (psp->pr_name));	/* XXX ??? */
3408 	psp->pr_onpro = t->t_cpu->cpu_id;
3409 	psp->pr_bindpro = t->t_bind_cpu;
3410 	psp->pr_bindpset = t->t_bind_pset;
3411 	psp->pr_lgrp = t->t_lpl->lpl_lgrpid;
3412 }
3413 #endif	/* _SYSCALL32_IMPL */
3414 
3415 #ifdef _SYSCALL32_IMPL
3416 
/*
 * Helpers for converting a native procfs structure into its 32-bit
 * counterpart.  Each takes a source pointer (s), a destination pointer (d)
 * and the name of a member present in both structures.  Every macro
 * expands to a single statement (without a trailing semicolon) so that it
 * may be used safely in unbraced if/else bodies, and every pointer
 * argument is parenthesized so that arbitrary expressions may be passed.
 *
 *	PR_COPY_FIELD		plain assignment of the member
 *	PR_COPY_FIELD_ILP32	assignment only when s->pr_dmodel is
 *				PR_MODEL_ILP32; otherwise d is untouched
 *	PR_COPY_TIMESPEC	member conversion via TIMESPEC_TO_TIMESPEC32()
 *	PR_COPY_BUF		byte copy of sizeof (d->field) bytes
 *	PR_IGNORE_FIELD		documents a member that is deliberately
 *				not copied
 */
#define	PR_COPY_FIELD(s, d, field)	((d)->field = (s)->field)

#define	PR_COPY_FIELD_ILP32(s, d, field)			\
	do {							\
		if ((s)->pr_dmodel == PR_MODEL_ILP32)		\
			(d)->field = (s)->field;		\
	} while (0)

#define	PR_COPY_TIMESPEC(s, d, field)					\
	do {								\
		TIMESPEC_TO_TIMESPEC32(&(d)->field, &(s)->field);	\
	} while (0)

#define	PR_COPY_BUF(s, d, field)				\
	bcopy((s)->field, (d)->field, sizeof ((d)->field))

#define	PR_IGNORE_FIELD(s, d, field)	((void)0)
3431 
3432 void
3433 lwpsinfo_kto32(const struct lwpsinfo *src, struct lwpsinfo32 *dest)
3434 {
3435 	bzero(dest, sizeof (*dest));
3436 
3437 	PR_COPY_FIELD(src, dest, pr_flag);
3438 	PR_COPY_FIELD(src, dest, pr_lwpid);
3439 	PR_IGNORE_FIELD(src, dest, pr_addr);
3440 	PR_IGNORE_FIELD(src, dest, pr_wchan);
3441 	PR_COPY_FIELD(src, dest, pr_stype);
3442 	PR_COPY_FIELD(src, dest, pr_state);
3443 	PR_COPY_FIELD(src, dest, pr_sname);
3444 	PR_COPY_FIELD(src, dest, pr_nice);
3445 	PR_COPY_FIELD(src, dest, pr_syscall);
3446 	PR_COPY_FIELD(src, dest, pr_oldpri);
3447 	PR_COPY_FIELD(src, dest, pr_cpu);
3448 	PR_COPY_FIELD(src, dest, pr_pri);
3449 	PR_COPY_FIELD(src, dest, pr_pctcpu);
3450 	PR_COPY_TIMESPEC(src, dest, pr_start);
3451 	PR_COPY_BUF(src, dest, pr_clname);
3452 	PR_COPY_BUF(src, dest, pr_name);
3453 	PR_COPY_FIELD(src, dest, pr_onpro);
3454 	PR_COPY_FIELD(src, dest, pr_bindpro);
3455 	PR_COPY_FIELD(src, dest, pr_bindpset);
3456 	PR_COPY_FIELD(src, dest, pr_lgrp);
3457 }
3458 
3459 void
3460 psinfo_kto32(const struct psinfo *src, struct psinfo32 *dest)
3461 {
3462 	bzero(dest, sizeof (*dest));
3463 
3464 	PR_COPY_FIELD(src, dest, pr_flag);
3465 	PR_COPY_FIELD(src, dest, pr_nlwp);
3466 	PR_COPY_FIELD(src, dest, pr_pid);
3467 	PR_COPY_FIELD(src, dest, pr_ppid);
3468 	PR_COPY_FIELD(src, dest, pr_pgid);
3469 	PR_COPY_FIELD(src, dest, pr_sid);
3470 	PR_COPY_FIELD(src, dest, pr_uid);
3471 	PR_COPY_FIELD(src, dest, pr_euid);
3472 	PR_COPY_FIELD(src, dest, pr_gid);
3473 	PR_COPY_FIELD(src, dest, pr_egid);
3474 	PR_IGNORE_FIELD(src, dest, pr_addr);
3475 	PR_COPY_FIELD_ILP32(src, dest, pr_size);
3476 	PR_COPY_FIELD_ILP32(src, dest, pr_rssize);
3477 	PR_COPY_FIELD(src, dest, pr_ttydev);
3478 	PR_COPY_FIELD(src, dest, pr_pctcpu);
3479 	PR_COPY_FIELD(src, dest, pr_pctmem);
3480 	PR_COPY_TIMESPEC(src, dest, pr_start);
3481 	PR_COPY_TIMESPEC(src, dest, pr_time);
3482 	PR_COPY_TIMESPEC(src, dest, pr_ctime);
3483 	PR_COPY_BUF(src, dest, pr_fname);
3484 	PR_COPY_BUF(src, dest, pr_psargs);
3485 	PR_COPY_FIELD(src, dest, pr_wstat);
3486 	PR_COPY_FIELD(src, dest, pr_argc);
3487 	PR_COPY_FIELD_ILP32(src, dest, pr_argv);
3488 	PR_COPY_FIELD_ILP32(src, dest, pr_envp);
3489 	PR_COPY_FIELD(src, dest, pr_dmodel);
3490 	PR_COPY_FIELD(src, dest, pr_taskid);
3491 	PR_COPY_FIELD(src, dest, pr_projid);
3492 	PR_COPY_FIELD(src, dest, pr_nzomb);
3493 	PR_COPY_FIELD(src, dest, pr_poolid);
3494 	PR_COPY_FIELD(src, dest, pr_contract);
3495 	PR_COPY_FIELD(src, dest, pr_poolid);
3496 	PR_COPY_FIELD(src, dest, pr_poolid);
3497 
3498 	lwpsinfo_kto32(&src->pr_lwp, &dest->pr_lwp);
3499 }
3500 
3501 #undef	PR_COPY_FIELD
3502 #undef	PR_COPY_FIELD_ILP32
3503 #undef	PR_COPY_TIMESPEC
3504 #undef	PR_COPY_BUF
3505 #undef	PR_IGNORE_FIELD
3506 
3507 #endif	/* _SYSCALL32_IMPL */
3508 
3509 /*
3510  * This used to get called when microstate accounting was disabled but
3511  * microstate information was requested.  Since Microstate accounting is on
3512  * regardless of the proc flags, this simply makes it appear to procfs that
3513  * microstate accounting is on.  This is relatively meaningless since you
3514  * can't turn it off, but this is here for the sake of appearances.
3515  */
3516 
3517 /*ARGSUSED*/
3518 void
3519 estimate_msacct(kthread_t *t, hrtime_t curtime)
3520 {
3521 	proc_t *p;
3522 
3523 	if (t == NULL)
3524 		return;
3525 
3526 	p = ttoproc(t);
3527 	ASSERT(MUTEX_HELD(&p->p_lock));
3528 
3529 	/*
3530 	 * A system process (p0) could be referenced if the thread is
3531 	 * in the process of exiting.  Don't turn on microstate accounting
3532 	 * in that case.
3533 	 */
3534 	if (p->p_flag & SSYS)
3535 		return;
3536 
3537 	/*
3538 	 * Loop through all the LWPs (kernel threads) in the process.
3539 	 */
3540 	t = p->p_tlist;
3541 	do {
3542 		t->t_proc_flag |= TP_MSACCT;
3543 	} while ((t = t->t_forw) != p->p_tlist);
3544 
3545 	p->p_flag |= SMSACCT;			/* set process-wide MSACCT */
3546 }
3547 
3548 /*
3549  * It's not really possible to disable microstate accounting anymore.
3550  * However, this routine simply turns off the ms accounting flags in a process
3551  * This way procfs can still pretend to turn microstate accounting on and
3552  * off for a process, but it actually doesn't do anything.  This is
3553  * a neutered form of preemptive idiot-proofing.
3554  */
3555 void
3556 disable_msacct(proc_t *p)
3557 {
3558 	kthread_t *t;
3559 
3560 	ASSERT(MUTEX_HELD(&p->p_lock));
3561 
3562 	p->p_flag &= ~SMSACCT;		/* clear process-wide MSACCT */
3563 	/*
3564 	 * Loop through all the LWPs (kernel threads) in the process.
3565 	 */
3566 	if ((t = p->p_tlist) != NULL) {
3567 		do {
3568 			/* clear per-thread flag */
3569 			t->t_proc_flag &= ~TP_MSACCT;
3570 		} while ((t = t->t_forw) != p->p_tlist);
3571 	}
3572 }
3573 
3574 /*
3575  * Return resource usage information.
3576  */
3577 void
3578 prgetusage(kthread_t *t, prhusage_t *pup)
3579 {
3580 	klwp_t *lwp = ttolwp(t);
3581 	hrtime_t *mstimep;
3582 	struct mstate *ms = &lwp->lwp_mstate;
3583 	int state;
3584 	int i;
3585 	hrtime_t curtime;
3586 	hrtime_t waitrq;
3587 	hrtime_t tmp1;
3588 
3589 	curtime = gethrtime_unscaled();
3590 
3591 	pup->pr_lwpid	= t->t_tid;
3592 	pup->pr_count	= 1;
3593 	pup->pr_create	= ms->ms_start;
3594 	pup->pr_term    = ms->ms_term;
3595 	scalehrtime(&pup->pr_create);
3596 	scalehrtime(&pup->pr_term);
3597 	if (ms->ms_term == 0) {
3598 		pup->pr_rtime = curtime - ms->ms_start;
3599 		scalehrtime(&pup->pr_rtime);
3600 	} else {
3601 		pup->pr_rtime = ms->ms_term - ms->ms_start;
3602 		scalehrtime(&pup->pr_rtime);
3603 	}
3604 
3605 
3606 	pup->pr_utime    = ms->ms_acct[LMS_USER];
3607 	pup->pr_stime    = ms->ms_acct[LMS_SYSTEM];
3608 	pup->pr_ttime    = ms->ms_acct[LMS_TRAP];
3609 	pup->pr_tftime   = ms->ms_acct[LMS_TFAULT];
3610 	pup->pr_dftime   = ms->ms_acct[LMS_DFAULT];
3611 	pup->pr_kftime   = ms->ms_acct[LMS_KFAULT];
3612 	pup->pr_ltime    = ms->ms_acct[LMS_USER_LOCK];
3613 	pup->pr_slptime  = ms->ms_acct[LMS_SLEEP];
3614 	pup->pr_wtime    = ms->ms_acct[LMS_WAIT_CPU];
3615 	pup->pr_stoptime = ms->ms_acct[LMS_STOPPED];
3616 
3617 	prscaleusage(pup);
3618 
3619 	/*
3620 	 * Adjust for time waiting in the dispatcher queue.
3621 	 */
3622 	waitrq = t->t_waitrq;	/* hopefully atomic */
3623 	if (waitrq != 0) {
3624 		if (waitrq > curtime) {
3625 			curtime = gethrtime_unscaled();
3626 		}
3627 		tmp1 = curtime - waitrq;
3628 		scalehrtime(&tmp1);
3629 		pup->pr_wtime += tmp1;
3630 		curtime = waitrq;
3631 	}
3632 
3633 	/*
3634 	 * Adjust for time spent in current microstate.
3635 	 */
3636 	if (ms->ms_state_start > curtime) {
3637 		curtime = gethrtime_unscaled();
3638 	}
3639 
3640 	i = 0;
3641 	do {
3642 		switch (state = t->t_mstate) {
3643 		case LMS_SLEEP:
3644 			/*
3645 			 * Update the timer for the current sleep state.
3646 			 */
3647 			switch (state = ms->ms_prev) {
3648 			case LMS_TFAULT:
3649 			case LMS_DFAULT:
3650 			case LMS_KFAULT:
3651 			case LMS_USER_LOCK:
3652 				break;
3653 			default:
3654 				state = LMS_SLEEP;
3655 				break;
3656 			}
3657 			break;
3658 		case LMS_TFAULT:
3659 		case LMS_DFAULT:
3660 		case LMS_KFAULT:
3661 		case LMS_USER_LOCK:
3662 			state = LMS_SYSTEM;
3663 			break;
3664 		}
3665 		switch (state) {
3666 		case LMS_USER:		mstimep = &pup->pr_utime;	break;
3667 		case LMS_SYSTEM:	mstimep = &pup->pr_stime;	break;
3668 		case LMS_TRAP:		mstimep = &pup->pr_ttime;	break;
3669 		case LMS_TFAULT:	mstimep = &pup->pr_tftime;	break;
3670 		case LMS_DFAULT:	mstimep = &pup->pr_dftime;	break;
3671 		case LMS_KFAULT:	mstimep = &pup->pr_kftime;	break;
3672 		case LMS_USER_LOCK:	mstimep = &pup->pr_ltime;	break;
3673 		case LMS_SLEEP:		mstimep = &pup->pr_slptime;	break;
3674 		case LMS_WAIT_CPU:	mstimep = &pup->pr_wtime;	break;
3675 		case LMS_STOPPED:	mstimep = &pup->pr_stoptime;	break;
3676 		default:		panic("prgetusage: unknown microstate");
3677 		}
3678 		tmp1 = curtime - ms->ms_state_start;
3679 		if (tmp1 < 0) {
3680 			curtime = gethrtime_unscaled();
3681 			i++;
3682 			continue;
3683 		}
3684 		scalehrtime(&tmp1);
3685 	} while (tmp1 < 0 && i < MAX_ITERS_SPIN);
3686 
3687 	*mstimep += tmp1;
3688 
3689 	/* update pup timestamp */
3690 	pup->pr_tstamp = curtime;
3691 	scalehrtime(&pup->pr_tstamp);
3692 
3693 	/*
3694 	 * Resource usage counters.
3695 	 */
3696 	pup->pr_minf  = lwp->lwp_ru.minflt;
3697 	pup->pr_majf  = lwp->lwp_ru.majflt;
3698 	pup->pr_nswap = lwp->lwp_ru.nswap;
3699 	pup->pr_inblk = lwp->lwp_ru.inblock;
3700 	pup->pr_oublk = lwp->lwp_ru.oublock;
3701 	pup->pr_msnd  = lwp->lwp_ru.msgsnd;
3702 	pup->pr_mrcv  = lwp->lwp_ru.msgrcv;
3703 	pup->pr_sigs  = lwp->lwp_ru.nsignals;
3704 	pup->pr_vctx  = lwp->lwp_ru.nvcsw;
3705 	pup->pr_ictx  = lwp->lwp_ru.nivcsw;
3706 	pup->pr_sysc  = lwp->lwp_ru.sysc;
3707 	pup->pr_ioch  = lwp->lwp_ru.ioch;
3708 }
3709 
3710 /*
3711  * Convert ms_acct stats from unscaled high-res time to nanoseconds
3712  */
3713 void
3714 prscaleusage(prhusage_t *usg)
3715 {
3716 	scalehrtime(&usg->pr_utime);
3717 	scalehrtime(&usg->pr_stime);
3718 	scalehrtime(&usg->pr_ttime);
3719 	scalehrtime(&usg->pr_tftime);
3720 	scalehrtime(&usg->pr_dftime);
3721 	scalehrtime(&usg->pr_kftime);
3722 	scalehrtime(&usg->pr_ltime);
3723 	scalehrtime(&usg->pr_slptime);
3724 	scalehrtime(&usg->pr_wtime);
3725 	scalehrtime(&usg->pr_stoptime);
3726 }
3727 
3728 
3729 /*
3730  * Sum resource usage information.
3731  */
3732 void
3733 praddusage(kthread_t *t, prhusage_t *pup)
3734 {
3735 	klwp_t *lwp = ttolwp(t);
3736 	hrtime_t *mstimep;
3737 	struct mstate *ms = &lwp->lwp_mstate;
3738 	int state;
3739 	int i;
3740 	hrtime_t curtime;
3741 	hrtime_t waitrq;
3742 	hrtime_t tmp;
3743 	prhusage_t conv;
3744 
3745 	curtime = gethrtime_unscaled();
3746 
3747 	if (ms->ms_term == 0) {
3748 		tmp = curtime - ms->ms_start;
3749 		scalehrtime(&tmp);
3750 		pup->pr_rtime += tmp;
3751 	} else {
3752 		tmp = ms->ms_term - ms->ms_start;
3753 		scalehrtime(&tmp);
3754 		pup->pr_rtime += tmp;
3755 	}
3756 
3757 	conv.pr_utime = ms->ms_acct[LMS_USER];
3758 	conv.pr_stime = ms->ms_acct[LMS_SYSTEM];
3759 	conv.pr_ttime = ms->ms_acct[LMS_TRAP];
3760 	conv.pr_tftime = ms->ms_acct[LMS_TFAULT];
3761 	conv.pr_dftime = ms->ms_acct[LMS_DFAULT];
3762 	conv.pr_kftime = ms->ms_acct[LMS_KFAULT];
3763 	conv.pr_ltime = ms->ms_acct[LMS_USER_LOCK];
3764 	conv.pr_slptime = ms->ms_acct[LMS_SLEEP];
3765 	conv.pr_wtime = ms->ms_acct[LMS_WAIT_CPU];
3766 	conv.pr_stoptime = ms->ms_acct[LMS_STOPPED];
3767 
3768 	prscaleusage(&conv);
3769 
3770 	pup->pr_utime	+= conv.pr_utime;
3771 	pup->pr_stime	+= conv.pr_stime;
3772 	pup->pr_ttime	+= conv.pr_ttime;
3773 	pup->pr_tftime	+= conv.pr_tftime;
3774 	pup->pr_dftime	+= conv.pr_dftime;
3775 	pup->pr_kftime	+= conv.pr_kftime;
3776 	pup->pr_ltime	+= conv.pr_ltime;
3777 	pup->pr_slptime	+= conv.pr_slptime;
3778 	pup->pr_wtime	+= conv.pr_wtime;
3779 	pup->pr_stoptime += conv.pr_stoptime;
3780 
3781 	/*
3782 	 * Adjust for time waiting in the dispatcher queue.
3783 	 */
3784 	waitrq = t->t_waitrq;	/* hopefully atomic */
3785 	if (waitrq != 0) {
3786 		if (waitrq > curtime) {
3787 			curtime = gethrtime_unscaled();
3788 		}
3789 		tmp = curtime - waitrq;
3790 		scalehrtime(&tmp);
3791 		pup->pr_wtime += tmp;
3792 		curtime = waitrq;
3793 	}
3794 
3795 	/*
3796 	 * Adjust for time spent in current microstate.
3797 	 */
3798 	if (ms->ms_state_start > curtime) {
3799 		curtime = gethrtime_unscaled();
3800 	}
3801 
3802 	i = 0;
3803 	do {
3804 		switch (state = t->t_mstate) {
3805 		case LMS_SLEEP:
3806 			/*
3807 			 * Update the timer for the current sleep state.
3808 			 */
3809 			switch (state = ms->ms_prev) {
3810 			case LMS_TFAULT:
3811 			case LMS_DFAULT:
3812 			case LMS_KFAULT:
3813 			case LMS_USER_LOCK:
3814 				break;
3815 			default:
3816 				state = LMS_SLEEP;
3817 				break;
3818 			}
3819 			break;
3820 		case LMS_TFAULT:
3821 		case LMS_DFAULT:
3822 		case LMS_KFAULT:
3823 		case LMS_USER_LOCK:
3824 			state = LMS_SYSTEM;
3825 			break;
3826 		}
3827 		switch (state) {
3828 		case LMS_USER:		mstimep = &pup->pr_utime;	break;
3829 		case LMS_SYSTEM:	mstimep = &pup->pr_stime;	break;
3830 		case LMS_TRAP:		mstimep = &pup->pr_ttime;	break;
3831 		case LMS_TFAULT:	mstimep = &pup->pr_tftime;	break;
3832 		case LMS_DFAULT:	mstimep = &pup->pr_dftime;	break;
3833 		case LMS_KFAULT:	mstimep = &pup->pr_kftime;	break;
3834 		case LMS_USER_LOCK:	mstimep = &pup->pr_ltime;	break;
3835 		case LMS_SLEEP:		mstimep = &pup->pr_slptime;	break;
3836 		case LMS_WAIT_CPU:	mstimep = &pup->pr_wtime;	break;
3837 		case LMS_STOPPED:	mstimep = &pup->pr_stoptime;	break;
3838 		default:		panic("praddusage: unknown microstate");
3839 		}
3840 		tmp = curtime - ms->ms_state_start;
3841 		if (tmp < 0) {
3842 			curtime = gethrtime_unscaled();
3843 			i++;
3844 			continue;
3845 		}
3846 		scalehrtime(&tmp);
3847 	} while (tmp < 0 && i < MAX_ITERS_SPIN);
3848 
3849 	*mstimep += tmp;
3850 
3851 	/* update pup timestamp */
3852 	pup->pr_tstamp = curtime;
3853 	scalehrtime(&pup->pr_tstamp);
3854 
3855 	/*
3856 	 * Resource usage counters.
3857 	 */
3858 	pup->pr_minf  += lwp->lwp_ru.minflt;
3859 	pup->pr_majf  += lwp->lwp_ru.majflt;
3860 	pup->pr_nswap += lwp->lwp_ru.nswap;
3861 	pup->pr_inblk += lwp->lwp_ru.inblock;
3862 	pup->pr_oublk += lwp->lwp_ru.oublock;
3863 	pup->pr_msnd  += lwp->lwp_ru.msgsnd;
3864 	pup->pr_mrcv  += lwp->lwp_ru.msgrcv;
3865 	pup->pr_sigs  += lwp->lwp_ru.nsignals;
3866 	pup->pr_vctx  += lwp->lwp_ru.nvcsw;
3867 	pup->pr_ictx  += lwp->lwp_ru.nivcsw;
3868 	pup->pr_sysc  += lwp->lwp_ru.sysc;
3869 	pup->pr_ioch  += lwp->lwp_ru.ioch;
3870 }
3871 
3872 /*
3873  * Convert a prhusage_t to a prusage_t.
3874  * This means convert each hrtime_t to a timestruc_t
3875  * and copy the count fields uint64_t => ulong_t.
3876  */
3877 void
3878 prcvtusage(prhusage_t *pup, prusage_t *upup)
3879 {
3880 	uint64_t *ullp;
3881 	ulong_t *ulp;
3882 	int i;
3883 
3884 	upup->pr_lwpid = pup->pr_lwpid;
3885 	upup->pr_count = pup->pr_count;
3886 
3887 	hrt2ts(pup->pr_tstamp,	&upup->pr_tstamp);
3888 	hrt2ts(pup->pr_create,	&upup->pr_create);
3889 	hrt2ts(pup->pr_term,	&upup->pr_term);
3890 	hrt2ts(pup->pr_rtime,	&upup->pr_rtime);
3891 	hrt2ts(pup->pr_utime,	&upup->pr_utime);
3892 	hrt2ts(pup->pr_stime,	&upup->pr_stime);
3893 	hrt2ts(pup->pr_ttime,	&upup->pr_ttime);
3894 	hrt2ts(pup->pr_tftime,	&upup->pr_tftime);
3895 	hrt2ts(pup->pr_dftime,	&upup->pr_dftime);
3896 	hrt2ts(pup->pr_kftime,	&upup->pr_kftime);
3897 	hrt2ts(pup->pr_ltime,	&upup->pr_ltime);
3898 	hrt2ts(pup->pr_slptime,	&upup->pr_slptime);
3899 	hrt2ts(pup->pr_wtime,	&upup->pr_wtime);
3900 	hrt2ts(pup->pr_stoptime, &upup->pr_stoptime);
3901 	bzero(upup->filltime, sizeof (upup->filltime));
3902 
3903 	ullp = &pup->pr_minf;
3904 	ulp = &upup->pr_minf;
3905 	for (i = 0; i < 22; i++)
3906 		*ulp++ = (ulong_t)*ullp++;
3907 }
3908 
3909 #ifdef _SYSCALL32_IMPL
3910 void
3911 prcvtusage32(prhusage_t *pup, prusage32_t *upup)
3912 {
3913 	uint64_t *ullp;
3914 	uint32_t *ulp;
3915 	int i;
3916 
3917 	upup->pr_lwpid = pup->pr_lwpid;
3918 	upup->pr_count = pup->pr_count;
3919 
3920 	hrt2ts32(pup->pr_tstamp,	&upup->pr_tstamp);
3921 	hrt2ts32(pup->pr_create,	&upup->pr_create);
3922 	hrt2ts32(pup->pr_term,		&upup->pr_term);
3923 	hrt2ts32(pup->pr_rtime,		&upup->pr_rtime);
3924 	hrt2ts32(pup->pr_utime,		&upup->pr_utime);
3925 	hrt2ts32(pup->pr_stime,		&upup->pr_stime);
3926 	hrt2ts32(pup->pr_ttime,		&upup->pr_ttime);
3927 	hrt2ts32(pup->pr_tftime,	&upup->pr_tftime);
3928 	hrt2ts32(pup->pr_dftime,	&upup->pr_dftime);
3929 	hrt2ts32(pup->pr_kftime,	&upup->pr_kftime);
3930 	hrt2ts32(pup->pr_ltime,		&upup->pr_ltime);
3931 	hrt2ts32(pup->pr_slptime,	&upup->pr_slptime);
3932 	hrt2ts32(pup->pr_wtime,		&upup->pr_wtime);
3933 	hrt2ts32(pup->pr_stoptime,	&upup->pr_stoptime);
3934 	bzero(upup->filltime, sizeof (upup->filltime));
3935 
3936 	ullp = &pup->pr_minf;
3937 	ulp = &upup->pr_minf;
3938 	for (i = 0; i < 22; i++)
3939 		*ulp++ = (uint32_t)*ullp++;
3940 }
3941 #endif	/* _SYSCALL32_IMPL */
3942 
/*
 * Determine whether a set is empty.
 * Returns 1 if all n words starting at sp are zero (or n is 0),
 * and 0 as soon as a non-zero word is found.
 */
int
setisempty(uint32_t *sp, uint_t n)
{
	uint_t i;

	for (i = 0; i < n; i++) {
		if (sp[i] != 0)
			return (0);
	}
	return (1);
}
3954 
3955 /*
3956  * Utility routine for establishing a watched area in the process.
3957  * Keep the list of watched areas sorted by virtual address.
3958  */
3959 int
3960 set_watched_area(proc_t *p, struct watched_area *pwa)
3961 {
3962 	caddr_t vaddr = pwa->wa_vaddr;
3963 	caddr_t eaddr = pwa->wa_eaddr;
3964 	ulong_t flags = pwa->wa_flags;
3965 	struct watched_area *target;
3966 	avl_index_t where;
3967 	int error = 0;
3968 
3969 	/* we must not be holding p->p_lock, but the process must be locked */
3970 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
3971 	ASSERT(p->p_proc_flag & P_PR_LOCK);
3972 
3973 	/*
3974 	 * If this is our first watchpoint, enable watchpoints for the process.
3975 	 */
3976 	if (!pr_watch_active(p)) {
3977 		kthread_t *t;
3978 
3979 		mutex_enter(&p->p_lock);
3980 		if ((t = p->p_tlist) != NULL) {
3981 			do {
3982 				watch_enable(t);
3983 			} while ((t = t->t_forw) != p->p_tlist);
3984 		}
3985 		mutex_exit(&p->p_lock);
3986 	}
3987 
3988 	target = pr_find_watched_area(p, pwa, &where);
3989 	if (target != NULL) {
3990 		/*
3991 		 * We discovered an existing, overlapping watched area.
3992 		 * Allow it only if it is an exact match.
3993 		 */
3994 		if (target->wa_vaddr != vaddr ||
3995 		    target->wa_eaddr != eaddr)
3996 			error = EINVAL;
3997 		else if (target->wa_flags != flags) {
3998 			error = set_watched_page(p, vaddr, eaddr,
3999 			    flags, target->wa_flags);
4000 			target->wa_flags = flags;
4001 		}
4002 		kmem_free(pwa, sizeof (struct watched_area));
4003 	} else {
4004 		avl_insert(&p->p_warea, pwa, where);
4005 		error = set_watched_page(p, vaddr, eaddr, flags, 0);
4006 	}
4007 
4008 	return (error);
4009 }
4010 
4011 /*
4012  * Utility routine for clearing a watched area in the process.
4013  * Must be an exact match of the virtual address.
4014  * size and flags don't matter.
4015  */
4016 int
4017 clear_watched_area(proc_t *p, struct watched_area *pwa)
4018 {
4019 	struct watched_area *found;
4020 
4021 	/* we must not be holding p->p_lock, but the process must be locked */
4022 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
4023 	ASSERT(p->p_proc_flag & P_PR_LOCK);
4024 
4025 
4026 	if (!pr_watch_active(p)) {
4027 		kmem_free(pwa, sizeof (struct watched_area));
4028 		return (0);
4029 	}
4030 
4031 	/*
4032 	 * Look for a matching address in the watched areas.  If a match is
4033 	 * found, clear the old watched area and adjust the watched page(s).  It
4034 	 * is not an error if there is no match.
4035 	 */
4036 	if ((found = pr_find_watched_area(p, pwa, NULL)) != NULL &&
4037 	    found->wa_vaddr == pwa->wa_vaddr) {
4038 		clear_watched_page(p, found->wa_vaddr, found->wa_eaddr,
4039 		    found->wa_flags);
4040 		avl_remove(&p->p_warea, found);
4041 		kmem_free(found, sizeof (struct watched_area));
4042 	}
4043 
4044 	kmem_free(pwa, sizeof (struct watched_area));
4045 
4046 	/*
4047 	 * If we removed the last watched area from the process, disable
4048 	 * watchpoints.
4049 	 */
4050 	if (!pr_watch_active(p)) {
4051 		kthread_t *t;
4052 
4053 		mutex_enter(&p->p_lock);
4054 		if ((t = p->p_tlist) != NULL) {
4055 			do {
4056 				watch_disable(t);
4057 			} while ((t = t->t_forw) != p->p_tlist);
4058 		}
4059 		mutex_exit(&p->p_lock);
4060 	}
4061 
4062 	return (0);
4063 }
4064 
4065 /*
4066  * Frees all the watched_area structures
4067  */
4068 void
4069 pr_free_watchpoints(proc_t *p)
4070 {
4071 	struct watched_area *delp;
4072 	void *cookie;
4073 
4074 	cookie = NULL;
4075 	while ((delp = avl_destroy_nodes(&p->p_warea, &cookie)) != NULL)
4076 		kmem_free(delp, sizeof (struct watched_area));
4077 
4078 	avl_destroy(&p->p_warea);
4079 }
4080 
4081 /*
4082  * This one is called by the traced process to unwatch all the
4083  * pages while deallocating the list of watched_page structs.
4084  */
4085 void
4086 pr_free_watched_pages(proc_t *p)
4087 {
4088 	struct as *as = p->p_as;
4089 	struct watched_page *pwp;
4090 	uint_t prot;
4091 	int    retrycnt, err;
4092 	void *cookie;
4093 
4094 	if (as == NULL || avl_numnodes(&as->a_wpage) == 0)
4095 		return;
4096 
4097 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
4098 	AS_LOCK_ENTER(as, RW_WRITER);
4099 
4100 	pwp = avl_first(&as->a_wpage);
4101 
4102 	cookie = NULL;
4103 	while ((pwp = avl_destroy_nodes(&as->a_wpage, &cookie)) != NULL) {
4104 		retrycnt = 0;
4105 		if ((prot = pwp->wp_oprot) != 0) {
4106 			caddr_t addr = pwp->wp_vaddr;
4107 			struct seg *seg;
4108 		retry:
4109 
4110 			if ((pwp->wp_prot != prot ||
4111 			    (pwp->wp_flags & WP_NOWATCH)) &&
4112 			    (seg = as_segat(as, addr)) != NULL) {
4113 				err = SEGOP_SETPROT(seg, addr, PAGESIZE, prot);
4114 				if (err == IE_RETRY) {
4115 					ASSERT(retrycnt == 0);
4116 					retrycnt++;
4117 					goto retry;
4118 				}
4119 			}
4120 		}
4121 		kmem_free(pwp, sizeof (struct watched_page));
4122 	}
4123 
4124 	avl_destroy(&as->a_wpage);
4125 	p->p_wprot = NULL;
4126 
4127 	AS_LOCK_EXIT(as);
4128 }
4129 
/*
 * Insert a watched area into the list of watched pages.
 * If oflags is zero then we are adding a new watched area.
 * Otherwise we are changing the flags of an existing watched area.
 *
 * The area is [vaddr, eaddr) and may touch several pages.  Every page
 * it touches gets (or reuses) a watched_page node in the tree; that
 * node's per-access counters are decremented for each WA_* bit set in
 * oflags and then incremented for each WA_* bit set in flags.
 *
 * Returns 0 on success, or E2BIG when the tree is found to hold more
 * than prnwatch nodes.
 *
 * NOTE(review): the E2BIG check is repeated for every page of the area
 * and the error path only frees the unused pre-allocated nodes; counter
 * changes already made on earlier pages are not undone here.  Presumably
 * the caller copes with that -- verify.
 */
static int
set_watched_page(proc_t *p, caddr_t vaddr, caddr_t eaddr,
    ulong_t flags, ulong_t oflags)
{
	struct as *as = p->p_as;
	avl_tree_t *pwp_tree;
	struct watched_page *pwp, *newpwp;
	struct watched_page tpw;
	avl_index_t where;
	struct seg *seg;
	uint_t prot;
	caddr_t addr;

	/*
	 * We need to pre-allocate a list of structures before we grab the
	 * address space lock to avoid calling kmem_alloc(KM_SLEEP) with locks
	 * held.
	 *
	 * One zeroed node is allocated per page in the range (starting from
	 * the page containing vaddr); the spares are chained through wp_list
	 * with newpwp as the head.
	 */
	newpwp = NULL;
	for (addr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	    addr < eaddr; addr += PAGESIZE) {
		pwp = kmem_zalloc(sizeof (struct watched_page), KM_SLEEP);
		pwp->wp_list = newpwp;
		newpwp = pwp;
	}

	AS_LOCK_ENTER(as, RW_WRITER);

	/*
	 * Search for an existing watched page to contain the watched area.
	 * If none is found, grab a new one from the available list
	 * and insert it in the active list, keeping the list sorted
	 * by user-level virtual address.
	 *
	 * While SVFWAIT is set in p_flag the process's own p_wpage tree is
	 * used instead of the address space's a_wpage tree.
	 */
	if (p->p_flag & SVFWAIT)
		pwp_tree = &p->p_wpage;
	else
		pwp_tree = &as->a_wpage;

again:
	/* Executed once per page of the watched area. */
	if (avl_numnodes(pwp_tree) > prnwatch) {
		AS_LOCK_EXIT(as);
		/* Release whatever pre-allocated nodes were not consumed. */
		while (newpwp != NULL) {
			pwp = newpwp->wp_list;
			kmem_free(newpwp, sizeof (struct watched_page));
			newpwp = pwp;
		}
		return (E2BIG);
	}

	tpw.wp_vaddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(pwp_tree, &tpw, &where)) == NULL) {
		/*
		 * No node for this page yet: take one from the spare list.
		 * newpwp is dereferenced without a NULL check; this relies
		 * on the pre-allocation loop above having produced one node
		 * for every page of the range.
		 */
		pwp = newpwp;
		newpwp = newpwp->wp_list;
		pwp->wp_list = NULL;
		pwp->wp_vaddr = (caddr_t)((uintptr_t)vaddr &
		    (uintptr_t)PAGEMASK);
		avl_insert(pwp_tree, pwp, where);
	}

	ASSERT(vaddr >= pwp->wp_vaddr && vaddr < pwp->wp_vaddr + PAGESIZE);

	/* Drop the counts contributed by the old flags, if any ... */
	if (oflags & WA_READ)
		pwp->wp_read--;
	if (oflags & WA_WRITE)
		pwp->wp_write--;
	if (oflags & WA_EXEC)
		pwp->wp_exec--;

	ASSERT(pwp->wp_read >= 0);
	ASSERT(pwp->wp_write >= 0);
	ASSERT(pwp->wp_exec >= 0);

	/* ... and add the counts for the new flags. */
	if (flags & WA_READ)
		pwp->wp_read++;
	if (flags & WA_WRITE)
		pwp->wp_write++;
	if (flags & WA_EXEC)
		pwp->wp_exec++;

	/* Page protections are only touched when SVFWAIT is clear. */
	if (!(p->p_flag & SVFWAIT)) {
		vaddr = pwp->wp_vaddr;
		/*
		 * wp_oprot == 0 is treated as "original protections not
		 * recorded yet": fetch them from the segment, if one maps
		 * this address, and remember them as both the original and
		 * the current protections.
		 */
		if (pwp->wp_oprot == 0 &&
		    (seg = as_segat(as, vaddr)) != NULL) {
			SEGOP_GETPROT(seg, vaddr, 0, &prot);
			pwp->wp_oprot = (uchar_t)prot;
			pwp->wp_prot = (uchar_t)prot;
		}
		if (pwp->wp_oprot != 0) {
			/*
			 * Derive the protections the page should have from
			 * the original ones: a read or exec watch strips all
			 * of read/write/exec, a write watch strips only
			 * write.
			 */
			prot = pwp->wp_oprot;
			if (pwp->wp_read)
				prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
			if (pwp->wp_write)
				prot &= ~PROT_WRITE;
			if (pwp->wp_exec)
				prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
			/*
			 * If the protections changed, push the page onto the
			 * p->p_wprot list (linked through wp_list) unless it
			 * is already there (WP_SETPROT) or WP_NOWATCH is set.
			 * The list is only built here; presumably it is
			 * drained elsewhere to apply the new protections.
			 */
			if (!(pwp->wp_flags & WP_NOWATCH) &&
			    pwp->wp_prot != prot &&
			    (pwp->wp_flags & WP_SETPROT) == 0) {
				pwp->wp_flags |= WP_SETPROT;
				pwp->wp_list = p->p_wprot;
				p->p_wprot = pwp;
			}
			pwp->wp_prot = (uchar_t)prot;
		}
	}

	/*
	 * If the watched area extends into the next page then do
	 * it over again with the virtual address of the next page.
	 */
	if ((vaddr = pwp->wp_vaddr + PAGESIZE) < eaddr)
		goto again;

	AS_LOCK_EXIT(as);

	/*
	 * Free any pages we may have over-allocated
	 */
	while (newpwp != NULL) {
		pwp = newpwp->wp_list;
		kmem_free(newpwp, sizeof (struct watched_page));
		newpwp = pwp;
	}

	return (0);
}
4262 
4263 /*
4264  * Remove a watched area from the list of watched pages.
4265  * A watched area may extend over more than one page.
4266  */
4267 static void
4268 clear_watched_page(proc_t *p, caddr_t vaddr, caddr_t eaddr, ulong_t flags)
4269 {
4270 	struct as *as = p->p_as;
4271 	struct watched_page *pwp;
4272 	struct watched_page tpw;
4273 	avl_tree_t *tree;
4274 	avl_index_t where;
4275 
4276 	AS_LOCK_ENTER(as, RW_WRITER);
4277 
4278 	if (p->p_flag & SVFWAIT)
4279 		tree = &p->p_wpage;
4280 	else
4281 		tree = &as->a_wpage;
4282 
4283 	tpw.wp_vaddr = vaddr =
4284 	    (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
4285 	pwp = avl_find(tree, &tpw, &where);
4286 	if (pwp == NULL)
4287 		pwp = avl_nearest(tree, where, AVL_AFTER);
4288 
4289 	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
4290 		ASSERT(vaddr <=  pwp->wp_vaddr);
4291 
4292 		if (flags & WA_READ)
4293 			pwp->wp_read--;
4294 		if (flags & WA_WRITE)
4295 			pwp->wp_write--;
4296 		if (flags & WA_EXEC)
4297 			pwp->wp_exec--;
4298 
4299 		if (pwp->wp_read + pwp->wp_write + pwp->wp_exec != 0) {
4300 			/*
4301 			 * Reset the hat layer's protections on this page.
4302 			 */
4303 			if (pwp->wp_oprot != 0) {
4304 				uint_t prot = pwp->wp_oprot;
4305 
4306 				if (pwp->wp_read)
4307 					prot &=
4308 					    ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4309 				if (pwp->wp_write)
4310 					prot &= ~PROT_WRITE;
4311 				if (pwp->wp_exec)
4312 					prot &=
4313 					    ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4314 				if (!(pwp->wp_flags & WP_NOWATCH) &&
4315 				    pwp->wp_prot != prot &&
4316 				    (pwp->wp_flags & WP_SETPROT) == 0) {
4317 					pwp->wp_flags |= WP_SETPROT;
4318 					pwp->wp_list = p->p_wprot;
4319 					p->p_wprot = pwp;
4320 				}
4321 				pwp->wp_prot = (uchar_t)prot;
4322 			}
4323 		} else {
4324 			/*
4325 			 * No watched areas remain in this page.
4326 			 * Reset everything to normal.
4327 			 */
4328 			if (pwp->wp_oprot != 0) {
4329 				pwp->wp_prot = pwp->wp_oprot;
4330 				if ((pwp->wp_flags & WP_SETPROT) == 0) {
4331 					pwp->wp_flags |= WP_SETPROT;
4332 					pwp->wp_list = p->p_wprot;
4333 					p->p_wprot = pwp;
4334 				}
4335 			}
4336 		}
4337 
4338 		pwp = AVL_NEXT(tree, pwp);
4339 	}
4340 
4341 	AS_LOCK_EXIT(as);
4342 }
4343 
4344 /*
4345  * Return the original protections for the specified page.
4346  */
4347 static void
4348 getwatchprot(struct as *as, caddr_t addr, uint_t *prot)
4349 {
4350 	struct watched_page *pwp;
4351 	struct watched_page tpw;
4352 
4353 	ASSERT(AS_LOCK_HELD(as));
4354 
4355 	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
4356 	if ((pwp = avl_find(&as->a_wpage, &tpw, NULL)) != NULL)
4357 		*prot = pwp->wp_oprot;
4358 }
4359 
4360 static prpagev_t *
4361 pr_pagev_create(struct seg *seg, int check_noreserve)
4362 {
4363 	prpagev_t *pagev = kmem_alloc(sizeof (prpagev_t), KM_SLEEP);
4364 	size_t total_pages = seg_pages(seg);
4365 
4366 	/*
4367 	 * Limit the size of our vectors to pagev_lim pages at a time.  We need
4368 	 * 4 or 5 bytes of storage per page, so this means we limit ourself
4369 	 * to about a megabyte of kernel heap by default.
4370 	 */
4371 	pagev->pg_npages = MIN(total_pages, pagev_lim);
4372 	pagev->pg_pnbase = 0;
4373 
4374 	pagev->pg_protv =
4375 	    kmem_alloc(pagev->pg_npages * sizeof (uint_t), KM_SLEEP);
4376 
4377 	if (check_noreserve)
4378 		pagev->pg_incore =
4379 		    kmem_alloc(pagev->pg_npages * sizeof (char), KM_SLEEP);
4380 	else
4381 		pagev->pg_incore = NULL;
4382 
4383 	return (pagev);
4384 }
4385 
4386 static void
4387 pr_pagev_destroy(prpagev_t *pagev)
4388 {
4389 	if (pagev->pg_incore != NULL)
4390 		kmem_free(pagev->pg_incore, pagev->pg_npages * sizeof (char));
4391 
4392 	kmem_free(pagev->pg_protv, pagev->pg_npages * sizeof (uint_t));
4393 	kmem_free(pagev, sizeof (prpagev_t));
4394 }
4395 
/*
 * Load the page vector with information about the pages of seg starting
 * at addr.  A single load covers at most pg_npages pages and never
 * extends past eaddr.
 *
 * Without an incore array the function fills pg_protv for the window
 * that starts at addr and returns addr unchanged.
 *
 * With an incore array it additionally searches forward for the first
 * page whose incore byte is nonzero, reloading the vector window by
 * window as needed.  It returns the address of that page, or an address
 * that has reached eaddr when no such page exists.  pg_protv is filled
 * for the last window that was loaded.
 *
 * If addr == eaddr on entry, eaddr is returned and the vector is left
 * untouched.  On any other return pg_pnbase holds the page number of the
 * first page the vector describes.
 */
static caddr_t
pr_pagev_fill(prpagev_t *pagev, struct seg *seg, caddr_t addr, caddr_t eaddr)
{
	ulong_t lastpg = seg_page(seg, eaddr - 1);
	ulong_t pn, pnlim;
	caddr_t saddr;
	size_t len;

	ASSERT(addr >= seg->s_base && addr <= eaddr);

	if (addr == eaddr)
		return (eaddr);

refill:
	ASSERT(addr < eaddr);
	/* The window is [pg_pnbase, pnlim) in page numbers. */
	pagev->pg_pnbase = seg_page(seg, addr);
	pnlim = pagev->pg_pnbase + pagev->pg_npages;
	saddr = addr;

	/*
	 * If the last page of interest falls inside the window, only cover
	 * up to eaddr; otherwise cover a full window.
	 */
	if (lastpg < pnlim)
		len = (size_t)(eaddr - addr);
	else
		len = pagev->pg_npages * PAGESIZE;

	if (pagev->pg_incore != NULL) {
		/*
		 * INCORE cleverly has different semantics than GETPROT:
		 * it returns info on pages up to but NOT including addr + len.
		 */
		SEGOP_INCORE(seg, addr, len, pagev->pg_incore);
		pn = pagev->pg_pnbase;

		do {
			/*
			 * Guilty knowledge here:  We know that segvn_incore
			 * returns more than just the low-order bit that
			 * indicates the page is actually in memory.  If any
			 * bits are set, then the page has backing store.
			 */
			if (pagev->pg_incore[pn++ - pagev->pg_pnbase])
				goto out;

		} while ((addr += PAGESIZE) < eaddr && pn < pnlim);

		/*
		 * If we examined all the pages in the vector but we're not
		 * at the end of the segment, take another lap.
		 */
		if (addr < eaddr)
			goto refill;
	}

	/*
	 * Need to take len - 1 because addr + len is the address of the
	 * first byte of the page just past the end of what we want.
	 *
	 * Protections are fetched starting at saddr, the base of the current
	 * window, which may be lower than the addr that is returned.
	 */
out:
	SEGOP_GETPROT(seg, saddr, len - 1, pagev->pg_protv);
	return (addr);
}
4456 
/*
 * Find the next run of consecutive pages in seg that share the same
 * protections, using (and refilling as needed) the page vector.
 *
 * saddrp - in: the address to start looking from; out: the address at
 *	    which the run that was found begins.  When the vector has an
 *	    incore array, pages with a zero incore byte are skipped first,
 *	    so the run may begin beyond the address passed in.
 * eaddr  - the end of the range being examined.
 * protp  - out: the protections of the run, after getwatchprot() has had
 *	    the chance to substitute the original protections of a watched
 *	    page.  Set to 0 when no page with a nonzero incore byte was
 *	    found before eaddr.
 *
 * Returns the address at which the run ends: the first page with
 * different protections, the first page with a zero incore byte, the
 * point where a vector refill skipped ahead, or an address that has
 * reached eaddr.
 */
static caddr_t
pr_pagev_nextprot(prpagev_t *pagev, struct seg *seg,
    caddr_t *saddrp, caddr_t eaddr, uint_t *protp)
{
	/*
	 * Our starting address is either the specified address, or the base
	 * address from the start of the pagev.  If the latter is greater,
	 * this means a previous call to pr_pagev_fill has already scanned
	 * further than the end of the previous mapping.
	 */
	caddr_t base = seg->s_base + pagev->pg_pnbase * PAGESIZE;
	caddr_t addr = MAX(*saddrp, base);
	ulong_t pn = seg_page(seg, addr);
	uint_t prot, nprot;

	/*
	 * If we're dealing with noreserve pages, then advance addr to
	 * the address of the next page which has backing store.
	 */
	if (pagev->pg_incore != NULL) {
		while (pagev->pg_incore[pn - pagev->pg_pnbase] == 0) {
			/* Ran off the end without finding a backed page. */
			if ((addr += PAGESIZE) == eaddr) {
				*saddrp = addr;
				prot = 0;
				goto out;
			}
			/*
			 * Stepped past the vector's window: refill it.  The
			 * fill itself skips ahead to the next backed page,
			 * or returns eaddr if there is none.
			 */
			if (++pn == pagev->pg_pnbase + pagev->pg_npages) {
				addr = pr_pagev_fill(pagev, seg, addr, eaddr);
				if (addr == eaddr) {
					*saddrp = addr;
					prot = 0;
					goto out;
				}
				pn = seg_page(seg, addr);
			}
		}
	}

	/*
	 * Get the protections on the page corresponding to addr.
	 */
	pn = seg_page(seg, addr);
	ASSERT(pn >= pagev->pg_pnbase);
	ASSERT(pn < (pagev->pg_pnbase + pagev->pg_npages));

	prot = pagev->pg_protv[pn - pagev->pg_pnbase];
	/* A watched page reports its original protections instead. */
	getwatchprot(seg->s_as, addr, &prot);
	*saddrp = addr;

	/*
	 * Now loop until we find a backed page with different protections
	 * or we reach the end of this segment.
	 */
	while ((addr += PAGESIZE) < eaddr) {
		/*
		 * If pn has advanced to the page number following what we
		 * have information on, refill the page vector and reset
		 * addr and pn.  If pr_pagev_fill does not return the
		 * address of the next page, we have a discontiguity and
		 * thus have reached the end of the current mapping.
		 */
		if (++pn == pagev->pg_pnbase + pagev->pg_npages) {
			caddr_t naddr = pr_pagev_fill(pagev, seg, addr, eaddr);
			if (naddr != addr)
				goto out;
			pn = seg_page(seg, addr);
		}

		/*
		 * The previous page's protections are in prot, and it has
		 * backing.  If this page is MAP_NORESERVE and has no backing,
		 * then end this mapping and return the previous protections.
		 */
		if (pagev->pg_incore != NULL &&
		    pagev->pg_incore[pn - pagev->pg_pnbase] == 0)
			break;

		/*
		 * Otherwise end the mapping if this page's protections (nprot)
		 * are different than those in the previous page (prot).
		 */
		nprot = pagev->pg_protv[pn - pagev->pg_pnbase];
		getwatchprot(seg->s_as, addr, &nprot);

		if (nprot != prot)
			break;
	}

out:
	*protp = prot;
	return (addr);
}
4549 
4550 size_t
4551 pr_getsegsize(struct seg *seg, int reserved)
4552 {
4553 	size_t size = seg->s_size;
4554 
4555 	/*
4556 	 * If we're interested in the reserved space, return the size of the
4557 	 * segment itself.  Everything else in this function is a special case
4558 	 * to determine the actual underlying size of various segment types.
4559 	 */
4560 	if (reserved)
4561 		return (size);
4562 
4563 	/*
4564 	 * If this is a segvn mapping of a regular file, return the smaller
4565 	 * of the segment size and the remaining size of the file beyond
4566 	 * the file offset corresponding to seg->s_base.
4567 	 */
4568 	if (seg->s_ops == &segvn_ops) {
4569 		vattr_t vattr;
4570 		vnode_t *vp;
4571 
4572 		vattr.va_mask = AT_SIZE;
4573 
4574 		if (SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
4575 		    vp != NULL && vp->v_type == VREG &&
4576 		    VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
4577 
4578 			u_offset_t fsize = vattr.va_size;
4579 			u_offset_t offset = SEGOP_GETOFFSET(seg, seg->s_base);
4580 
4581 			if (fsize < offset)
4582 				fsize = 0;
4583 			else
4584 				fsize -= offset;
4585 
4586 			fsize = roundup(fsize, (u_offset_t)PAGESIZE);
4587 
4588 			if (fsize < (u_offset_t)size)
4589 				size = (size_t)fsize;
4590 		}
4591 
4592 		return (size);
4593 	}
4594 
4595 	/*
4596 	 * If this is an ISM shared segment, don't include pages that are
4597 	 * beyond the real size of the spt segment that backs it.
4598 	 */
4599 	if (seg->s_ops == &segspt_shmops)
4600 		return (MIN(spt_realsize(seg), size));
4601 
4602 	/*
4603 	 * If this is segment is a mapping from /dev/null, then this is a
4604 	 * reservation of virtual address space and has no actual size.
4605 	 * Such segments are backed by segdev and have type set to neither
4606 	 * MAP_SHARED nor MAP_PRIVATE.
4607 	 */
4608 	if (seg->s_ops == &segdev_ops &&
4609 	    ((SEGOP_GETTYPE(seg, seg->s_base) &
4610 	    (MAP_SHARED | MAP_PRIVATE)) == 0))
4611 		return (0);
4612 
4613 	/*
4614 	 * If this segment doesn't match one of the special types we handle,
4615 	 * just return the size of the segment itself.
4616 	 */
4617 	return (size);
4618 }
4619 
/*
 * Return the protections of the next range of pages in seg that share
 * the same protections, for use in walking a segment range by range.
 *
 * seg      - the segment being walked; its address space must be
 *	      write-locked (asserted).
 * reserved - if nonzero, MAP_NORESERVE pages are not filtered out;
 *	      if zero, unbacked MAP_NORESERVE pages of anonymous segvn
 *	      segments are left out of the reported ranges.
 * tmp      - opaque state carried between calls.  *tmp is asserted to be
 *	      NULL when a page vector is created (which can only happen on
 *	      a call made with *saddrp == seg->s_base) and asserted to be
 *	      non-NULL on later calls that need the vector.  It is released
 *	      through pr_getprot_done() as soon as the returned range
 *	      reaches eaddr.  NOTE(review): a caller that abandons the walk
 *	      early presumably has to call pr_getprot_done() itself.
 * saddrp   - in: where to start; out: start of the range found, which
 *	      may be later than the input when unbacked pages are skipped.
 * naddrp   - out: end of the range found (<= eaddr).
 * eaddr    - end of the part of the segment being walked.
 *
 * The return value is the protections of the range, with the original
 * protections substituted for pages that are being watched.  It is 0
 * when no backed page was found before eaddr.
 */
uint_t
pr_getprot(struct seg *seg, int reserved, void **tmp,
    caddr_t *saddrp, caddr_t *naddrp, caddr_t eaddr)
{
	struct as *as = seg->s_as;

	caddr_t saddr = *saddrp;
	caddr_t naddr;

	int check_noreserve;
	uint_t prot;

	/*
	 * View of seg->s_data under each of the driver-private types we
	 * peek into below.
	 */
	union {
		struct segvn_data *svd;
		struct segdev_data *sdp;
		void *data;
	} s;

	s.data = seg->s_data;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(saddr >= seg->s_base && saddr < eaddr);
	ASSERT(eaddr <= seg->s_base + seg->s_size);

	/*
	 * Don't include MAP_NORESERVE pages in the address range
	 * unless their mappings have actually materialized.
	 * We cheat by knowing that segvn is the only segment
	 * driver that supports MAP_NORESERVE.
	 */
	check_noreserve =
	    (!reserved && seg->s_ops == &segvn_ops && s.svd != NULL &&
	    (s.svd->vp == NULL || s.svd->vp->v_type != VREG) &&
	    (s.svd->flags & MAP_NORESERVE));

	/*
	 * Examine every page only as a last resort.  We use guilty knowledge
	 * of segvn and segdev to avoid this: if there are no per-page
	 * protections present in the segment and we don't care about
	 * MAP_NORESERVE, then s_data->prot is the prot for the whole segment.
	 */
	if (!check_noreserve && saddr == seg->s_base &&
	    seg->s_ops == &segvn_ops && s.svd != NULL && s.svd->pageprot == 0) {
		/* Fast path: one range covering everything up to eaddr. */
		prot = s.svd->prot;
		getwatchprot(as, saddr, &prot);
		naddr = eaddr;

	} else if (saddr == seg->s_base && seg->s_ops == &segdev_ops &&
	    s.sdp != NULL && s.sdp->pageprot == 0) {
		/* Same fast path for segdev. */
		prot = s.sdp->prot;
		getwatchprot(as, saddr, &prot);
		naddr = eaddr;

	} else {
		prpagev_t *pagev;

		/*
		 * If addr is sitting at the start of the segment, then
		 * create a page vector to store protection and incore
		 * information for pages in the segment, and fill it.
		 * Otherwise, we expect *tmp to address the prpagev_t
		 * allocated by a previous call to this function.
		 */
		if (saddr == seg->s_base) {
			pagev = pr_pagev_create(seg, check_noreserve);
			saddr = pr_pagev_fill(pagev, seg, saddr, eaddr);

			ASSERT(*tmp == NULL);
			*tmp = pagev;

			ASSERT(saddr <= eaddr);
			*saddrp = saddr;

			/*
			 * The fill found no backed page at all: report an
			 * empty range at eaddr with no protections.
			 */
			if (saddr == eaddr) {
				naddr = saddr;
				prot = 0;
				goto out;
			}

		} else {
			ASSERT(*tmp != NULL);
			pagev = (prpagev_t *)*tmp;
		}

		naddr = pr_pagev_nextprot(pagev, seg, saddrp, eaddr, &prot);
		ASSERT(naddr <= eaddr);
	}

out:
	/* The walk is complete; release the page vector, if there is one. */
	if (naddr == eaddr)
		pr_getprot_done(tmp);
	*naddrp = naddr;
	return (prot);
}
4714 
4715 void
4716 pr_getprot_done(void **tmp)
4717 {
4718 	if (*tmp != NULL) {
4719 		pr_pagev_destroy((prpagev_t *)*tmp);
4720 		*tmp = NULL;
4721 	}
4722 }
4723 
4724 /*
4725  * Return true iff the vnode is a /proc file from the object directory.
4726  */
4727 int
4728 pr_isobject(vnode_t *vp)
4729 {
4730 	return (vn_matchops(vp, prvnodeops) && VTOP(vp)->pr_type == PR_OBJECT);
4731 }
4732 
4733 /*
4734  * Return true iff the vnode is a /proc file opened by the process itself.
4735  */
4736 int
4737 pr_isself(vnode_t *vp)
4738 {
4739 	/*
4740 	 * XXX: To retain binary compatibility with the old
4741 	 * ioctl()-based version of /proc, we exempt self-opens
4742 	 * of /proc/<pid> from being marked close-on-exec.
4743 	 */
4744 	return (vn_matchops(vp, prvnodeops) &&
4745 	    (VTOP(vp)->pr_flags & PR_ISSELF) &&
4746 	    VTOP(vp)->pr_type != PR_PIDDIR);
4747 }
4748 
4749 static ssize_t
4750 pr_getpagesize(struct seg *seg, caddr_t saddr, caddr_t *naddrp, caddr_t eaddr)
4751 {
4752 	ssize_t pagesize, hatsize;
4753 
4754 	ASSERT(AS_WRITE_HELD(seg->s_as));
4755 	ASSERT(IS_P2ALIGNED(saddr, PAGESIZE));
4756 	ASSERT(IS_P2ALIGNED(eaddr, PAGESIZE));
4757 	ASSERT(saddr < eaddr);
4758 
4759 	pagesize = hatsize = hat_getpagesize(seg->s_as->a_hat, saddr);
4760 	ASSERT(pagesize == -1 || IS_P2ALIGNED(pagesize, pagesize));
4761 	ASSERT(pagesize != 0);
4762 
4763 	if (pagesize == -1)
4764 		pagesize = PAGESIZE;
4765 
4766 	saddr += P2NPHASE((uintptr_t)saddr, pagesize);
4767 
4768 	while (saddr < eaddr) {
4769 		if (hatsize != hat_getpagesize(seg->s_as->a_hat, saddr))
4770 			break;
4771 		ASSERT(IS_P2ALIGNED(saddr, pagesize));
4772 		saddr += pagesize;
4773 	}
4774 
4775 	*naddrp = ((saddr < eaddr) ? saddr : eaddr);
4776 	return (hatsize);
4777 }
4778 
/*
 * Return an array of structures with extended memory map information.
 * We allocate here; the caller must deallocate.
 *
 * One prxmap_t is appended to iolhead for every range of a segment that
 * has uniform protections and a uniform HAT page size.  Segments flagged
 * S_HOLE are left out.  The process's address space must not be the
 * kernel's and must be write-locked by the caller (asserted).
 *
 * Always returns 0.
 */
int
prgetxmap(proc_t *p, list_t *iolhead)
{
	struct as *as = p->p_as;
	prxmap_t *mp;
	struct seg *seg;
	struct seg *brkseg, *stkseg;
	struct vnode *vp;
	struct vattr vattr;
	uint_t prot;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	/*
	 * Request an initial buffer size that doesn't waste memory
	 * if the address space has only a small number of segments.
	 */
	pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));

	if ((seg = AS_SEGFIRST(as)) == NULL)
		return (0);

	/* The segments that get tagged MA_BREAK and MA_STACK below. */
	brkseg = break_seg(p);
	stkseg = as_segat(as, prgetstackbase(p));

	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
		caddr_t saddr, naddr, baddr;
		void *tmp = NULL;
		ssize_t psz;
		char *parr;
		uint64_t npages;
		uint64_t pagenum;

		/*
		 * Skip hole segments.  In a do-while loop the continue
		 * jumps to the loop condition, so we still advance to the
		 * next segment.
		 */
		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}
		/*
		 * Segment loop part one: iterate from the base of the segment
		 * to its end, pausing at each address boundary (baddr) between
		 * ranges that have different virtual memory protections.
		 */
		for (saddr = seg->s_base; saddr < eaddr; saddr = baddr) {
			prot = pr_getprot(seg, 0, &tmp, &saddr, &baddr, eaddr);
			ASSERT(baddr >= saddr && baddr <= eaddr);

			/*
			 * Segment loop part two: iterate from the current
			 * position to the end of the protection boundary,
			 * pausing at each address boundary (naddr) between
			 * ranges that have different underlying page sizes.
			 */
			for (; saddr < baddr; saddr = naddr) {
				psz = pr_getpagesize(seg, saddr, &naddr, baddr);
				ASSERT(naddr >= saddr && naddr <= baddr);

				/*
				 * Fields that are not assigned below
				 * (pr_ino and pr_mapname for ranges with no
				 * regular file behind them) and the counters
				 * that are only incremented keep whatever
				 * value pr_iol_newbuf() hands back;
				 * presumably the buffer is zeroed -- verify.
				 */
				mp = pr_iol_newbuf(iolhead, sizeof (*mp));

				mp->pr_vaddr = (uintptr_t)saddr;
				mp->pr_size = naddr - saddr;
				mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
				mp->pr_mflags = 0;
				if (prot & PROT_READ)
					mp->pr_mflags |= MA_READ;
				if (prot & PROT_WRITE)
					mp->pr_mflags |= MA_WRITE;
				if (prot & PROT_EXEC)
					mp->pr_mflags |= MA_EXEC;
				if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
					mp->pr_mflags |= MA_SHARED;
				if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
					mp->pr_mflags |= MA_NORESERVE;
				/*
				 * Anonymous: an ISM segment, or a segvn
				 * segment with no vnode behind it.
				 */
				if (seg->s_ops == &segspt_shmops ||
				    (seg->s_ops == &segvn_ops &&
				    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
				    vp == NULL)))
					mp->pr_mflags |= MA_ANON;
				if (seg == brkseg)
					mp->pr_mflags |= MA_BREAK;
				else if (seg == stkseg)
					mp->pr_mflags |= MA_STACK;
				if (seg->s_ops == &segspt_shmops)
					mp->pr_mflags |= MA_ISM | MA_SHM;

				/*
				 * pr_pagesize is always the base page size;
				 * pr_hatpagesize reports what the HAT uses,
				 * with 0 standing in for "unknown" (-1).
				 */
				mp->pr_pagesize = PAGESIZE;
				if (psz == -1) {
					mp->pr_hatpagesize = 0;
				} else {
					mp->pr_hatpagesize = psz;
				}

				/*
				 * Manufacture a filename for the "object" dir.
				 * Only mappings of regular files through
				 * segvn get one; everything else keeps
				 * pr_dev == PRNODEV.
				 */
				mp->pr_dev = PRNODEV;
				vattr.va_mask = AT_FSID|AT_NODEID;
				if (seg->s_ops == &segvn_ops &&
				    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
				    vp != NULL && vp->v_type == VREG &&
				    VOP_GETATTR(vp, &vattr, 0, CRED(),
				    NULL) == 0) {
					mp->pr_dev = vattr.va_fsid;
					mp->pr_ino = vattr.va_nodeid;
					if (vp == p->p_exec)
						(void) strcpy(mp->pr_mapname,
						    "a.out");
					else
						pr_object_name(mp->pr_mapname,
						    vp, &vattr);
				}

				/*
				 * Get the SysV shared memory id, if any.
				 * SHMID_FREE is reported as -1 but the
				 * mapping is still flagged MA_SHM.
				 */
				if ((mp->pr_mflags & MA_SHARED) &&
				    p->p_segacct && (mp->pr_shmid = shmgetid(p,
				    seg->s_base)) != SHMID_NONE) {
					if (mp->pr_shmid == SHMID_FREE)
						mp->pr_shmid = -1;

					mp->pr_mflags |= MA_SHM;
				} else {
					mp->pr_shmid = -1;
				}

				/*
				 * Count resident, anonymous and locked pages
				 * using one status byte per page of the
				 * range, obtained from the segment driver.
				 */
				npages = ((uintptr_t)(naddr - saddr)) >>
				    PAGESHIFT;
				parr = kmem_zalloc(npages, KM_SLEEP);

				SEGOP_INCORE(seg, saddr, naddr - saddr, parr);

				for (pagenum = 0; pagenum < npages; pagenum++) {
					if (parr[pagenum] & SEG_PAGE_INCORE)
						mp->pr_rss++;
					if (parr[pagenum] & SEG_PAGE_ANON)
						mp->pr_anon++;
					if (parr[pagenum] & SEG_PAGE_LOCKED)
						mp->pr_locked++;
				}
				kmem_free(parr, npages);
			}
		}
		/* pr_getprot() frees its state once it reaches eaddr. */
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	return (0);
}
4930 
4931 /*
4932  * Return the process's credentials.  We don't need a 32-bit equivalent of
4933  * this function because prcred_t and prcred32_t are actually the same.
4934  */
4935 void
4936 prgetcred(proc_t *p, prcred_t *pcrp)
4937 {
4938 	mutex_enter(&p->p_crlock);
4939 	cred2prcred(p->p_cred, pcrp);
4940 	mutex_exit(&p->p_crlock);
4941 }
4942 
4943 void
4944 prgetsecflags(proc_t *p, prsecflags_t *psfp)
4945 {
4946 	ASSERT(psfp != NULL);
4947 
4948 	bzero(psfp, sizeof (*psfp));
4949 	psfp->pr_version = PRSECFLAGS_VERSION_CURRENT;
4950 	psfp->pr_lower = p->p_secflags.psf_lower;
4951 	psfp->pr_upper = p->p_secflags.psf_upper;
4952 	psfp->pr_effective = p->p_secflags.psf_effective;
4953 	psfp->pr_inherit = p->p_secflags.psf_inherit;
4954 }
4955 
4956 /*
4957  * Compute actual size of the prpriv_t structure.
4958  */
4959 
4960 size_t
4961 prgetprivsize(void)
4962 {
4963 	return (priv_prgetprivsize(NULL));
4964 }
4965 
4966 /*
4967  * Return the process's privileges.  We don't need a 32-bit equivalent of
4968  * this function because prpriv_t and prpriv32_t are actually the same.
4969  */
4970 void
4971 prgetpriv(proc_t *p, prpriv_t *pprp)
4972 {
4973 	mutex_enter(&p->p_crlock);
4974 	cred2prpriv(p->p_cred, pprp);
4975 	mutex_exit(&p->p_crlock);
4976 }
4977 
4978 #ifdef _SYSCALL32_IMPL
/*
 * Return an array of structures with HAT memory map information.
 * We allocate here; the caller must deallocate.
 *
 * This is the variant that produces prxmap32_t records.  One record is
 * appended to iolhead for every range of a segment that has uniform
 * protections and a uniform HAT page size.  Segments flagged S_HOLE are
 * left out.  The process's address space must not be the kernel's and
 * must be write-locked by the caller (asserted).
 *
 * Always returns 0.
 */
int
prgetxmap32(proc_t *p, list_t *iolhead)
{
	struct as *as = p->p_as;
	prxmap32_t *mp;
	struct seg *seg;
	struct seg *brkseg, *stkseg;
	struct vnode *vp;
	struct vattr vattr;
	uint_t prot;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	/*
	 * Request an initial buffer size that doesn't waste memory
	 * if the address space has only a small number of segments.
	 */
	pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));

	if ((seg = AS_SEGFIRST(as)) == NULL)
		return (0);

	/* The segments that get tagged MA_BREAK and MA_STACK below. */
	brkseg = break_seg(p);
	stkseg = as_segat(as, prgetstackbase(p));

	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
		caddr_t saddr, naddr, baddr;
		void *tmp = NULL;
		ssize_t psz;
		char *parr;
		uint64_t npages;
		uint64_t pagenum;

		/*
		 * Skip hole segments.  In a do-while loop the continue
		 * jumps to the loop condition, so we still advance to the
		 * next segment.
		 */
		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		/*
		 * Segment loop part one: iterate from the base of the segment
		 * to its end, pausing at each address boundary (baddr) between
		 * ranges that have different virtual memory protections.
		 */
		for (saddr = seg->s_base; saddr < eaddr; saddr = baddr) {
			prot = pr_getprot(seg, 0, &tmp, &saddr, &baddr, eaddr);
			ASSERT(baddr >= saddr && baddr <= eaddr);

			/*
			 * Segment loop part two: iterate from the current
			 * position to the end of the protection boundary,
			 * pausing at each address boundary (naddr) between
			 * ranges that have different underlying page sizes.
			 */
			for (; saddr < baddr; saddr = naddr) {
				psz = pr_getpagesize(seg, saddr, &naddr, baddr);
				ASSERT(naddr >= saddr && naddr <= baddr);

				/*
				 * Fields that are not assigned below
				 * (pr_ino and pr_mapname for ranges with no
				 * regular file behind them) and the counters
				 * that are only incremented keep whatever
				 * value pr_iol_newbuf() hands back;
				 * presumably the buffer is zeroed -- verify.
				 */
				mp = pr_iol_newbuf(iolhead, sizeof (*mp));

				/* Address and size are narrowed to 32 bits. */
				mp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
				mp->pr_size = (size32_t)(naddr - saddr);
				mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
				mp->pr_mflags = 0;
				if (prot & PROT_READ)
					mp->pr_mflags |= MA_READ;
				if (prot & PROT_WRITE)
					mp->pr_mflags |= MA_WRITE;
				if (prot & PROT_EXEC)
					mp->pr_mflags |= MA_EXEC;
				if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
					mp->pr_mflags |= MA_SHARED;
				if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
					mp->pr_mflags |= MA_NORESERVE;
				/*
				 * Anonymous: an ISM segment, or a segvn
				 * segment with no vnode behind it.
				 */
				if (seg->s_ops == &segspt_shmops ||
				    (seg->s_ops == &segvn_ops &&
				    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
				    vp == NULL)))
					mp->pr_mflags |= MA_ANON;
				if (seg == brkseg)
					mp->pr_mflags |= MA_BREAK;
				else if (seg == stkseg)
					mp->pr_mflags |= MA_STACK;
				if (seg->s_ops == &segspt_shmops)
					mp->pr_mflags |= MA_ISM | MA_SHM;

				/*
				 * pr_pagesize is always the base page size;
				 * pr_hatpagesize reports what the HAT uses,
				 * with 0 standing in for "unknown" (-1).
				 */
				mp->pr_pagesize = PAGESIZE;
				if (psz == -1) {
					mp->pr_hatpagesize = 0;
				} else {
					mp->pr_hatpagesize = psz;
				}

				/*
				 * Manufacture a filename for the "object" dir.
				 * Only mappings of regular files through
				 * segvn get one; everything else keeps
				 * pr_dev == PRNODEV32.  The result of
				 * compressing the device number with
				 * cmpldev() is deliberately ignored.
				 */
				mp->pr_dev = PRNODEV32;
				vattr.va_mask = AT_FSID|AT_NODEID;
				if (seg->s_ops == &segvn_ops &&
				    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
				    vp != NULL && vp->v_type == VREG &&
				    VOP_GETATTR(vp, &vattr, 0, CRED(),
				    NULL) == 0) {
					(void) cmpldev(&mp->pr_dev,
					    vattr.va_fsid);
					mp->pr_ino = vattr.va_nodeid;
					if (vp == p->p_exec)
						(void) strcpy(mp->pr_mapname,
						    "a.out");
					else
						pr_object_name(mp->pr_mapname,
						    vp, &vattr);
				}

				/*
				 * Get the SysV shared memory id, if any.
				 * SHMID_FREE is reported as -1 but the
				 * mapping is still flagged MA_SHM.
				 */
				if ((mp->pr_mflags & MA_SHARED) &&
				    p->p_segacct && (mp->pr_shmid = shmgetid(p,
				    seg->s_base)) != SHMID_NONE) {
					if (mp->pr_shmid == SHMID_FREE)
						mp->pr_shmid = -1;

					mp->pr_mflags |= MA_SHM;
				} else {
					mp->pr_shmid = -1;
				}

				/*
				 * Count resident, anonymous and locked pages
				 * using one status byte per page of the
				 * range, obtained from the segment driver.
				 */
				npages = ((uintptr_t)(naddr - saddr)) >>
				    PAGESHIFT;
				parr = kmem_zalloc(npages, KM_SLEEP);

				SEGOP_INCORE(seg, saddr, naddr - saddr, parr);

				for (pagenum = 0; pagenum < npages; pagenum++) {
					if (parr[pagenum] & SEG_PAGE_INCORE)
						mp->pr_rss++;
					if (parr[pagenum] & SEG_PAGE_ANON)
						mp->pr_anon++;
					if (parr[pagenum] & SEG_PAGE_LOCKED)
						mp->pr_locked++;
				}
				kmem_free(parr, npages);
			}
		}
		/* pr_getprot() frees its state once it reaches eaddr. */
		ASSERT(tmp == NULL);
	} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

	return (0);
}
5132 #endif	/* _SYSCALL32_IMPL */
5133