/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 MNX Cloud, Inc.
 * Copyright 2025 Oxide Computer Company
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/priv.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/brand.h>
#include <sys/sobject.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/session.h>
#include <sys/pcb.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/ts.h>
#include <sys/bitmap.h>
#include <sys/poll.h>
#include <sys/shm_impl.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/procfs.h>
#include <sys/processor.h>
#include <sys/cpuvar.h>
#include <sys/copyops.h>
#include <sys/time.h>
#include <sys/msacct.h>
#include <sys/flock_impl.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/pathname.h>
#include <sys/mode.h>
#include <sys/socketvar.h>
#include <sys/autoconf.h>
#include <sys/dtrace.h>
#include <sys/timod.h>
#include <sys/fs/namenode.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <inet/cc.h>
#include <vm/as.h>
#include <vm/rm.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_spt.h>
#include <vm/page.h>
#include <sys/vmparam.h>
#include <sys/swap.h>
#include <fs/proc/prdata.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/contract_impl.h>
#include <sys/contract/process.h>
#include <sys/contract/process_impl.h>
#include <sys/schedctl.h>
#include <sys/pool.h>
#include <sys/zone.h>
#include <sys/atomic.h>
#include <sys/sdt.h>

#define	MAX_ITERS_SPIN	5

typedef struct prpagev {
	uint_t *pg_protv;	/* vector of page permissions */
	char *pg_incore;	/* vector of incore flags */
	size_t pg_npages;	/* number of pages in protv and incore */
	ulong_t pg_pnbase;	/* pn within segment of first protv element */
} prpagev_t;

size_t pagev_lim = 256 * 1024;	/* limit on number of pages in prpagev_t */

extern struct seg_ops segdev_ops;	/* needs a header file */
extern struct seg_ops segspt_shmops;	/* needs a header file */

static int set_watched_page(proc_t *, caddr_t, caddr_t, ulong_t, ulong_t);
static void clear_watched_page(proc_t *, caddr_t, caddr_t, ulong_t);

/*
 * Choose an lwp from the complete set of lwps for the process.
 * This is called for any operation applied to the process
 * file descriptor that requires an lwp to operate upon.
 *
 * Returns a pointer to the thread for the selected LWP,
 * and with the dispatcher lock held for the thread.
 *
 * The algorithm for choosing an lwp is critical for /proc semantics;
 * don't touch this code unless you know all of the implications.
 */
kthread_t *
prchoose(proc_t *p)
{
	kthread_t *t;
	kthread_t *t_onproc = NULL;	/* running on processor */
	kthread_t *t_run = NULL;	/* runnable, on disp queue */
	kthread_t *t_sleep = NULL;	/* sleeping */
	kthread_t *t_hold = NULL;	/* sleeping, performing hold */
	kthread_t *t_susp = NULL;	/* suspended stop */
	kthread_t *t_jstop = NULL;	/* jobcontrol stop, w/o directed stop */
	kthread_t *t_jdstop = NULL;	/* jobcontrol stop with directed stop */
	kthread_t *t_req = NULL;	/* requested stop */
	kthread_t *t_istop = NULL;	/* event-of-interest stop */
	kthread_t *t_dtrace = NULL;	/* DTrace stop */

	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * If the agent lwp exists, it takes precedence over all others.
	 */
	if ((t = p->p_agenttp) != NULL) {
		thread_lock(t);
		return (t);
	}

	if ((t = p->p_tlist) == NULL)	/* start at the head of the list */
		return (t);
	do {			/* for each lwp in the process */
		if (VSTOPPED(t)) {	/* virtually stopped */
			if (t_req == NULL)
				t_req = t;
			continue;
		}

		/* If this is a process kernel thread, ignore it. */
		if ((t->t_proc_flag & TP_KTHREAD) != 0) {
			continue;
		}

		thread_lock(t);		/* make sure thread is in good state */
		switch (t->t_state) {
		default:
			panic("prchoose: bad thread state %d, thread 0x%p",
			    t->t_state, (void *)t);
			/*NOTREACHED*/
		case TS_SLEEP:
			/* this is filthy */
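			/*
			 * An lwp sleeping on p_holdlwps with no user-level
			 * wait channel (t_wchan0 == NULL) is parked in a
			 * hold operation such as holdlwps(); classify it
			 * separately from ordinary sleepers.
			 */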
			if (t->t_wchan == (caddr_t)&p->p_holdlwps &&
			    t->t_wchan0 == NULL) {
				if (t_hold == NULL)
					t_hold = t;
			} else {
				if (t_sleep == NULL)
					t_sleep = t;
			}
			break;
		case TS_RUN:
		case TS_WAIT:
			if (t_run == NULL)
				t_run = t;
			break;
		case TS_ONPROC:
			if (t_onproc == NULL)
				t_onproc = t;
			break;
		case TS_ZOMB:		/* last possible choice */
			break;
		case TS_STOPPED:
			switch (t->t_whystop) {
			case PR_SUSPENDED:
				if (t_susp == NULL)
					t_susp = t;
				break;
			case PR_JOBCONTROL:
				if (t->t_proc_flag & TP_PRSTOP) {
					if (t_jdstop == NULL)
						t_jdstop = t;
				} else {
					if (t_jstop == NULL)
						t_jstop = t;
				}
				break;
			case PR_REQUESTED:
				if (t->t_dtrace_stop && t_dtrace == NULL)
					t_dtrace = t;
				else if (t_req == NULL)
					t_req = t;
				break;
			case PR_SYSENTRY:
			case PR_SYSEXIT:
			case PR_SIGNALLED:
			case PR_FAULTED:
				/*
				 * Make an lwp calling exit() be the
				 * last lwp seen in the process.
				 */
				if (t_istop == NULL ||
				    (t_istop->t_whystop == PR_SYSENTRY &&
				    t_istop->t_whatstop == SYS_exit))
					t_istop = t;
				break;
			case PR_CHECKPOINT:	/* can't happen? */
				break;
			default:
				panic("prchoose: bad t_whystop %d, thread 0x%p",
				    t->t_whystop, (void *)t);
				/*NOTREACHED*/
			}
			break;
		}
		thread_unlock(t);
	} while ((t = t->t_forw) != p->p_tlist);

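	/*
	 * Pick the most interesting of the candidates gathered above;
	 * the order of these tests defines the precedence: on-CPU first,
	 * then runnable, then sleeping, then the various flavors of
	 * stopped lwps, with a zombie chosen only as a last resort.
	 */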
	if (t_onproc)
		t = t_onproc;
	else if (t_run)
		t = t_run;
	else if (t_sleep)
		t = t_sleep;
	else if (t_jstop)
		t = t_jstop;
	else if (t_jdstop)
		t = t_jdstop;
	else if (t_istop)
		t = t_istop;
	else if (t_dtrace)
		t = t_dtrace;
	else if (t_req)
		t = t_req;
	else if (t_hold)
		t = t_hold;
	else if (t_susp)
		t = t_susp;
	else			/* TS_ZOMB */
		t = p->p_tlist;

	if (t != NULL)
		thread_lock(t);
	return (t);
}

/*
 * Wakeup anyone sleeping on the /proc vnode for the process/lwp to stop.
 * Also call pollwakeup() if any lwps are waiting in poll() for POLLPRI
 * on the /proc file descriptor. Called from stop() when a traced
 * process stops on an event of interest. Also called from exit()
 * and prinvalidate() to indicate POLLHUP and POLLERR respectively.
 */
void
prnotify(struct vnode *vp)
{
	prcommon_t *pcp = VTOP(vp)->pr_common;

	mutex_enter(&pcp->prc_mutex);
	cv_broadcast(&pcp->prc_wait);
	mutex_exit(&pcp->prc_mutex);
	if (pcp->prc_flags & PRC_POLL) {
		/*
		 * We call pollwakeup() with POLLHUP to ensure that
		 * the pollers are awakened even if they are polling
		 * for nothing (i.e., waiting for the process to exit).
		 * This enables the use of the PRC_POLL flag for optimization
		 * (we can turn off PRC_POLL only if we know no pollers remain).
		 */
		pcp->prc_flags &= ~PRC_POLL;
		pollwakeup(&pcp->prc_pollhead, POLLHUP);
	}
}

/* called immediately below, in prfree() */
static void
prfreenotify(vnode_t *vp)
{
	prnode_t *pnp;
	prcommon_t *pcp;

	while (vp != NULL) {
		pnp = VTOP(vp);
		pcp = pnp->pr_common;
		ASSERT(pcp->prc_thread == NULL);
		pcp->prc_proc = NULL;
		/*
		 * We can't call prnotify() here because we are holding
		 * pidlock. We assert that there is no need to.
		 */
		mutex_enter(&pcp->prc_mutex);
		cv_broadcast(&pcp->prc_wait);
		mutex_exit(&pcp->prc_mutex);
		ASSERT(!(pcp->prc_flags & PRC_POLL));

		vp = pnp->pr_next;
		pnp->pr_next = NULL;
	}
}

/*
 * Called from a hook in freeproc() when a traced process is removed
 * from the process table. The proc-table pointers of all associated
 * /proc vnodes are cleared to indicate that the process has gone away.
 */
void
prfree(proc_t *p)
{
	uint_t slot = p->p_slot;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Block the process against /proc so it can be freed.
	 * It cannot be freed while locked by some controlling process.
	 * Lock ordering:
	 *	pidlock -> pr_pidlock -> p->p_lock -> pcp->prc_mutex
	 */
	mutex_enter(&pr_pidlock);	/* protects pcp->prc_proc */
	mutex_enter(&p->p_lock);
	while (p->p_proc_flag & P_PR_LOCK) {
		mutex_exit(&pr_pidlock);
		cv_wait(&pr_pid_cv[slot], &p->p_lock);
		mutex_exit(&p->p_lock);
		mutex_enter(&pr_pidlock);
		mutex_enter(&p->p_lock);
	}

	ASSERT(p->p_tlist == NULL);

	prfreenotify(p->p_plist);
	p->p_plist = NULL;

	prfreenotify(p->p_trace);
	p->p_trace = NULL;

	/*
	 * We broadcast to wake up everyone waiting for this process.
	 * No one can reach this process from this point on.
	 */
	cv_broadcast(&pr_pid_cv[slot]);

	mutex_exit(&p->p_lock);
	mutex_exit(&pr_pidlock);
}

/*
 * Called from a hook in exit() when a traced process is becoming a zombie.
 */
void
prexit(proc_t *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));

	if (pr_watch_active(p)) {
		pr_free_watchpoints(p);
		watch_disable(curthread);
	}
	/* pr_free_watched_pages() is called in exit(), after dropping p_lock */
	if (p->p_trace) {
		VTOP(p->p_trace)->pr_common->prc_flags |= PRC_DESTROY;
		prnotify(p->p_trace);
	}
	cv_broadcast(&pr_pid_cv[p->p_slot]);	/* pauselwps() */
}

/*
 * Called when a thread calls lwp_exit().
 */
void
prlwpexit(kthread_t *t)
{
	vnode_t *vp;
	prnode_t *pnp;
	prcommon_t *pcp;
	proc_t *p = ttoproc(t);
	lwpent_t *lep = p->p_lwpdir[t->t_dslot].ld_entry;

	ASSERT(t == curthread);
	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * The process must be blocked against /proc to do this safely.
	 * The lwp must not disappear while the process is marked P_PR_LOCK.
	 * It is the caller's responsibility to have called prbarrier(p).
	 */
	ASSERT(!(p->p_proc_flag & P_PR_LOCK));

	for (vp = p->p_plist; vp != NULL; vp = pnp->pr_next) {
		pnp = VTOP(vp);
		pcp = pnp->pr_common;
		if (pcp->prc_thread == t) {
			pcp->prc_thread = NULL;
			pcp->prc_flags |= PRC_DESTROY;
		}
	}

	for (vp = lep->le_trace; vp != NULL; vp = pnp->pr_next) {
		pnp = VTOP(vp);
		pcp = pnp->pr_common;
		pcp->prc_thread = NULL;
		pcp->prc_flags |= PRC_DESTROY;
		prnotify(vp);
	}

	if (p->p_trace)
		prnotify(p->p_trace);
}

/*
 * Called when a zombie thread is joined or when a
 * detached lwp exits. Called from lwp_hash_out().
 */
void
prlwpfree(proc_t *p, lwpent_t *lep)
{
	vnode_t *vp;
	prnode_t *pnp;
	prcommon_t *pcp;

	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * The process must be blocked against /proc to do this safely.
	 * The lwp must not disappear while the process is marked P_PR_LOCK.
	 * It is the caller's responsibility to have called prbarrier(p).
	 */
	ASSERT(!(p->p_proc_flag & P_PR_LOCK));

	vp = lep->le_trace;
	lep->le_trace = NULL;
	while (vp) {
		prnotify(vp);
		pnp = VTOP(vp);
		pcp = pnp->pr_common;
		ASSERT(pcp->prc_thread == NULL &&
		    (pcp->prc_flags & PRC_DESTROY));
		pcp->prc_tslot = -1;
		vp = pnp->pr_next;
		pnp->pr_next = NULL;
	}

	if (p->p_trace)
		prnotify(p->p_trace);
}

/*
 * Called from a hook in exec() when a thread starts exec().
 */
void
prexecstart(void)
{
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);

	/*
	 * The P_PR_EXEC flag blocks /proc operations for
	 * the duration of the exec().
	 * We can't start exec() while the process is
	 * locked by /proc, so we call prbarrier().
	 * lwp_nostop keeps the process from being stopped
	 * via job control for the duration of the exec().
	 */

	ASSERT(MUTEX_HELD(&p->p_lock));
	prbarrier(p);
	lwp->lwp_nostop++;
	p->p_proc_flag |= P_PR_EXEC;
}

/*
 * Called from a hook in exec() when a thread finishes exec().
 * The thread may or may not have succeeded. Some other thread
 * may have beat it to the punch.
 */
void
prexecend(void)
{
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	vnode_t *vp;
	prnode_t *pnp;
	prcommon_t *pcp;
	model_t model = p->p_model;
	id_t tid = curthread->t_tid;
	int tslot = curthread->t_dslot;

	ASSERT(MUTEX_HELD(&p->p_lock));

	lwp->lwp_nostop--;
	if (p->p_flag & SEXITLWPS) {
		/*
		 * We are on our way to exiting because some
		 * other thread beat us in the race to exec().
		 * Don't clear the P_PR_EXEC flag in this case.
		 */
		return;
	}

	/*
	 * Wake up anyone waiting in /proc for the process to complete exec().
	 */
	p->p_proc_flag &= ~P_PR_EXEC;
	if ((vp = p->p_trace) != NULL) {
		pcp = VTOP(vp)->pr_common;
		mutex_enter(&pcp->prc_mutex);
		cv_broadcast(&pcp->prc_wait);
		mutex_exit(&pcp->prc_mutex);
		for (; vp != NULL; vp = pnp->pr_next) {
			pnp = VTOP(vp);
			pnp->pr_common->prc_datamodel = model;
		}
	}
	if ((vp = p->p_lwpdir[tslot].ld_entry->le_trace) != NULL) {
		/*
		 * We dealt with the process common above.
		 */
		ASSERT(p->p_trace != NULL);
		pcp = VTOP(vp)->pr_common;
		mutex_enter(&pcp->prc_mutex);
		cv_broadcast(&pcp->prc_wait);
		mutex_exit(&pcp->prc_mutex);
		for (; vp != NULL; vp = pnp->pr_next) {
			pnp = VTOP(vp);
			pcp = pnp->pr_common;
			pcp->prc_datamodel = model;
			pcp->prc_tid = tid;
			pcp->prc_tslot = tslot;
		}
	}
}

/*
 * Called from a hook in relvm() just before freeing the address space.
 * We free all the watched areas now.
 */
void
prrelvm(void)
{
	proc_t *p = ttoproc(curthread);

	mutex_enter(&p->p_lock);
	prbarrier(p);	/* block all other /proc operations */
	if (pr_watch_active(p)) {
		pr_free_watchpoints(p);
		watch_disable(curthread);
	}
	mutex_exit(&p->p_lock);
	pr_free_watched_pages(p);
}

/*
 * Called from hooks in exec-related code when a traced process
 * attempts to exec(2) a setuid/setgid program or an unreadable
 * file. Rather than fail the exec we invalidate the associated
 * /proc vnodes so that subsequent attempts to use them will fail.
 *
 * All /proc vnodes, except directory vnodes, are retained on a linked
 * list (rooted at p_plist in the process structure) until last close.
 *
 * A controlling process must re-open the /proc files in order to
 * regain control.
 */
void
prinvalidate(struct user *up)
{
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	vnode_t *vp;
	prnode_t *pnp;
	int writers = 0;

	mutex_enter(&p->p_lock);
	prbarrier(p);	/* block all other /proc operations */

	/*
	 * At this moment, there can be only one lwp in the process.
	 */
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);

	/*
	 * Invalidate any currently active /proc vnodes.
	 */
	for (vp = p->p_plist; vp != NULL; vp = pnp->pr_next) {
		pnp = VTOP(vp);
		switch (pnp->pr_type) {
		case PR_PSINFO:		/* these files can be read by anyone */
		case PR_LPSINFO:
		case PR_LWPSINFO:
		case PR_LWPDIR:
		case PR_LWPIDDIR:
		case PR_USAGE:
		case PR_LUSAGE:
		case PR_LWPUSAGE:
			break;
		default:
			pnp->pr_flags |= PR_INVAL;
			break;
		}
	}
	/*
	 * Wake up anyone waiting for the process or lwp.
	 * p->p_trace is guaranteed to be non-NULL if there
	 * are any open /proc files for this process.
	 */
	if ((vp = p->p_trace) != NULL) {
		prcommon_t *pcp = VTOP(vp)->pr_pcommon;

		prnotify(vp);
		/*
		 * Are there any writers?
		 */
		if ((writers = pcp->prc_writers) != 0) {
			/*
			 * Clear the exclusive open flag (old /proc interface).
			 * Set prc_selfopens equal to prc_writers so that
			 * the next O_EXCL|O_WRITE open will succeed
			 * even with existing (though invalid) writers.
			 * prclose() must decrement prc_selfopens when
			 * the invalid files are closed.
			 */
			pcp->prc_flags &= ~PRC_EXCL;
			ASSERT(pcp->prc_selfopens <= writers);
			pcp->prc_selfopens = writers;
		}
	}
	vp = p->p_lwpdir[t->t_dslot].ld_entry->le_trace;
	while (vp != NULL) {
		/*
		 * We should not invalidate the lwpiddir vnodes,
		 * but the necessities of maintaining the old
		 * ioctl()-based version of /proc require it.
		 */
		pnp = VTOP(vp);
		pnp->pr_flags |= PR_INVAL;
		prnotify(vp);
		vp = pnp->pr_next;
	}

	/*
	 * If any tracing flags are in effect and any vnodes are open for
	 * writing then set the requested-stop and run-on-last-close flags.
	 * Otherwise, clear all tracing flags.
	 */
	t->t_proc_flag &= ~TP_PAUSE;
	if ((p->p_proc_flag & P_PR_TRACE) && writers) {
		t->t_proc_flag |= TP_PRSTOP;
		aston(t);	/* so ISSIG will see the flag */
		p->p_proc_flag |= P_PR_RUNLCL;
	} else {
		premptyset(&up->u_entrymask);		/* syscalls */
		premptyset(&up->u_exitmask);
		up->u_systrap = 0;
		premptyset(&p->p_sigmask);		/* signals */
		premptyset(&p->p_fltmask);		/* faults */
		t->t_proc_flag &= ~(TP_PRSTOP|TP_PRVSTOP|TP_STOPPING);
		p->p_proc_flag &= ~(P_PR_RUNLCL|P_PR_KILLCL|P_PR_TRACE);
		prnostep(ttolwp(t));
	}

	mutex_exit(&p->p_lock);
}

/*
 * Acquire the controlled process's p_lock and mark it P_PR_LOCK.
 * Return with pr_pidlock held in all cases.
 * Return with p_lock held if the process still exists.
 * Return value is the process pointer if the process still exists, else NULL.
 * If we lock the process, give ourselves kernel priority to avoid deadlocks;
 * this is undone in prunlock().
 */
proc_t *
pr_p_lock(prnode_t *pnp)
{
	proc_t *p;
	prcommon_t *pcp;

	mutex_enter(&pr_pidlock);
	if ((pcp = pnp->pr_pcommon) == NULL || (p = pcp->prc_proc) == NULL)
		return (NULL);
	mutex_enter(&p->p_lock);
	while (p->p_proc_flag & P_PR_LOCK) {
		/*
		 * This cv/mutex pair is persistent even if
		 * the process disappears while we sleep.
		 */
		kcondvar_t *cv = &pr_pid_cv[p->p_slot];
		kmutex_t *mp = &p->p_lock;

		mutex_exit(&pr_pidlock);
		cv_wait(cv, mp);
		mutex_exit(mp);
		mutex_enter(&pr_pidlock);
		if (pcp->prc_proc == NULL)
			return (NULL);
		ASSERT(p == pcp->prc_proc);
		mutex_enter(&p->p_lock);
	}
	p->p_proc_flag |= P_PR_LOCK;
	return (p);
}

/*
 * Lock the target process by setting P_PR_LOCK and grabbing p->p_lock.
 * This prevents any lwp of the process from disappearing and
 * blocks most operations that a process can perform on itself.
 * Returns 0 on success, a non-zero error number on failure.
 *
 * 'zdisp' is ZYES or ZNO to indicate whether prlock() should succeed when
 * the subject process is a zombie (ZYES) or fail for zombies (ZNO).
 *
 * error returns:
 *	ENOENT: process or lwp has disappeared or process is exiting
 *		(or has become a zombie and zdisp == ZNO).
 *	EAGAIN: procfs vnode has become invalid.
 *	EINTR: signal arrived while waiting for exec to complete.
 */
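/*
 * A sketch of the canonical caller pattern, as seen throughout the procfs
 * vnode operations:
 *
 *	if ((error = prlock(pnp, ZNO)) != 0)
 *		return (error);
 *	... operate on the locked process; p->p_lock is held ...
 *	prunlock(pnp);
 */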
int
prlock(prnode_t *pnp, int zdisp)
{
	prcommon_t *pcp;
	proc_t *p;

again:
	pcp = pnp->pr_common;
	p = pr_p_lock(pnp);
	mutex_exit(&pr_pidlock);

	/*
	 * Return ENOENT immediately if there is no process.
	 */
	if (p == NULL)
		return (ENOENT);

	ASSERT(p == pcp->prc_proc && p->p_stat != 0 && p->p_stat != SIDL);

	/*
	 * Return ENOENT if process entered zombie state or is exiting
	 * and the 'zdisp' flag is set to ZNO indicating not to lock zombies.
	 */
	if (zdisp == ZNO &&
	    ((pcp->prc_flags & PRC_DESTROY) || (p->p_flag & SEXITING))) {
		prunlock(pnp);
		return (ENOENT);
	}

	/*
	 * If lwp-specific, check to see if lwp has disappeared.
	 */
	if (pcp->prc_flags & PRC_LWP) {
		if ((zdisp == ZNO && (pcp->prc_flags & PRC_DESTROY)) ||
		    pcp->prc_tslot == -1) {
			prunlock(pnp);
			return (ENOENT);
		}
	}

	/*
	 * Return EAGAIN if we have encountered a security violation.
	 * (The process exec'd a set-id or unreadable executable file.)
	 */
	if (pnp->pr_flags & PR_INVAL) {
		prunlock(pnp);
		return (EAGAIN);
	}

	/*
	 * If process is undergoing an exec(), wait for
	 * completion and then start all over again.
	 */
	if (p->p_proc_flag & P_PR_EXEC) {
		pcp = pnp->pr_pcommon;	/* Put on the correct sleep queue */
		mutex_enter(&pcp->prc_mutex);
		prunlock(pnp);
		if (!cv_wait_sig(&pcp->prc_wait, &pcp->prc_mutex)) {
			mutex_exit(&pcp->prc_mutex);
			return (EINTR);
		}
		mutex_exit(&pcp->prc_mutex);
		goto again;
	}

	/*
	 * We return holding p->p_lock.
	 */
	return (0);
}

/*
 * Undo prlock() and pr_p_lock().
 * p->p_lock is still held; pr_pidlock is no longer held.
 *
 * prunmark() drops the P_PR_LOCK flag and wakes up another thread,
 * if any, waiting for the flag to be dropped; it retains p->p_lock.
 *
 * prunlock() calls prunmark() and then drops p->p_lock.
 */
void
prunmark(proc_t *p)
{
	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));

	cv_signal(&pr_pid_cv[p->p_slot]);
	p->p_proc_flag &= ~P_PR_LOCK;
}

void
prunlock(prnode_t *pnp)
{
	prcommon_t *pcp = pnp->pr_common;
	proc_t *p = pcp->prc_proc;

	/*
	 * If we (or someone) gave it a SIGKILL, and it is not
	 * already a zombie, set it running unconditionally.
	 */
	if ((p->p_flag & SKILLED) &&
	    !(p->p_flag & SEXITING) &&
	    !(pcp->prc_flags & PRC_DESTROY) &&
	    !((pcp->prc_flags & PRC_LWP) && pcp->prc_tslot == -1)) {
		int err = pr_setrun(pnp, 0);
		/*
		 * EBUSY here means either the process was not stopped by /proc
		 * or there is an agent lwp. If there's an agent lwp, we don't
		 * need to do anything as it will run and witness the SIGKILL.
		 * However, if there's no agent lwp and the process was not
		 * stopped by /proc, it may have been stopped by SIGSTOP; try
		 * getting lwps running with TS_XSTART to undo SIGSTOP effect.
		 *
		 * Notably, other TS_* bits are inappropriate here:
		 * * Do not set TS_PSTART; pr_setrun() above would have already
		 *   set this if it did anything for this process.
		 * * Do not set TS_CSTART or TS_UNPAUSE; lwps may be stopped by
		 *   PR_SUSPEND for many reasons. Some cases, like holdlwps(),
		 *   will resume the process before the corresponding syscall
		 *   returns. Other cases, like dumping core, the suspender
		 *   will tear down the lwps as it completes.
		 * * Do not set TS_RESUME out of caution; not sure about the
		 *   consequences of a process going away during CPR resume and
		 *   CPR should set the process running eventually.
		 * * Do not set TS_CREATE because lwp creation expects threads
		 *   to remain paused until lwp completes.
		 */
		if (err == EBUSY && p->p_agenttp == NULL) {
			runlwps(p, TS_XSTART);
		}
	}
	prunmark(p);
	mutex_exit(&p->p_lock);
}

/*
 * Called while holding p->p_lock to delay until the process is unlocked.
 * We enter holding p->p_lock; p->p_lock is dropped and reacquired.
 * The process cannot become locked again until p->p_lock is dropped.
 */
void
prbarrier(proc_t *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));

	if (p->p_proc_flag & P_PR_LOCK) {
		/* The process is locked; delay until not locked */
		uint_t slot = p->p_slot;

		while (p->p_proc_flag & P_PR_LOCK)
			cv_wait(&pr_pid_cv[slot], &p->p_lock);
		cv_signal(&pr_pid_cv[slot]);
	}
}
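
/*
 * A sketch of the typical caller pattern, as used by prexecstart(),
 * prrelvm() and prinvalidate() above:
 *
 *	mutex_enter(&p->p_lock);
 *	prbarrier(p);		... wait for any /proc controller to let go
 *	... operate on the process ...
 *	mutex_exit(&p->p_lock);
 */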

/*
 * Return process/lwp status.
 * The u-block is mapped in by this routine and unmapped at the end.
 */
void
prgetstatus(proc_t *p, pstatus_t *sp, zone_t *zp)
{
	kthread_t *t;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = prchoose(p);	/* returns locked thread */
	ASSERT(t != NULL);
	thread_unlock(t);

	/* just bzero the process part, prgetlwpstatus() does the rest */
	bzero(sp, sizeof (pstatus_t) - sizeof (lwpstatus_t));
	sp->pr_nlwp = p->p_lwpcnt;
	sp->pr_nzomb = p->p_zombcnt;
	prassignset(&sp->pr_sigpend, &p->p_sig);
	sp->pr_brkbase = (uintptr_t)p->p_brkbase;
	sp->pr_brksize = p->p_brksize;
	sp->pr_stkbase = (uintptr_t)prgetstackbase(p);
	sp->pr_stksize = p->p_stksize;
	sp->pr_pid = p->p_pid;
	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
	    (p->p_flag & SZONETOP)) {
		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
		/*
		 * Inside local zones, fake zsched's pid as parent pids for
		 * processes which reference processes outside of the zone.
		 */
		sp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
	} else {
		sp->pr_ppid = p->p_ppid;
	}
	sp->pr_pgid = p->p_pgrp;
	sp->pr_sid = p->p_sessp->s_sid;
	sp->pr_taskid = p->p_task->tk_tkid;
	sp->pr_projid = p->p_task->tk_proj->kpj_id;
	sp->pr_zoneid = p->p_zone->zone_id;
	hrt2ts(mstate_aggr_state(p, LMS_USER), &sp->pr_utime);
	hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &sp->pr_stime);
	TICK_TO_TIMESTRUC(p->p_cutime, &sp->pr_cutime);
	TICK_TO_TIMESTRUC(p->p_cstime, &sp->pr_cstime);
	prassignset(&sp->pr_sigtrace, &p->p_sigmask);
	prassignset(&sp->pr_flttrace, &p->p_fltmask);
	prassignset(&sp->pr_sysentry, &PTOU(p)->u_entrymask);
	prassignset(&sp->pr_sysexit, &PTOU(p)->u_exitmask);
	switch (p->p_model) {
	case DATAMODEL_ILP32:
		sp->pr_dmodel = PR_MODEL_ILP32;
		break;
	case DATAMODEL_LP64:
		sp->pr_dmodel = PR_MODEL_LP64;
		break;
	}
	if (p->p_agenttp)
		sp->pr_agentid = p->p_agenttp->t_tid;

	/* get the chosen lwp's status */
	prgetlwpstatus(t, &sp->pr_lwp, zp);

	/* replicate the flags */
	sp->pr_flags = sp->pr_lwp.pr_flags;
}

/*
 * Query mask of held signals for a given thread.
 *
 * This makes use of schedctl_sigblock() to query if userspace has requested
 * that all maskable signals be held. While it would be tempting to call
 * schedctl_finish_sigblock() and apply that update to t->t_hold, it cannot be
 * done safely without the risk of racing with the thread under consideration.
 */
void
prgethold(kthread_t *t, sigset_t *sp)
{
	k_sigset_t set;

	if (schedctl_sigblock(t)) {
		set.__sigbits[0] = FILLSET0 & ~CANTMASK0;
		set.__sigbits[1] = FILLSET1 & ~CANTMASK1;
		set.__sigbits[2] = FILLSET2 & ~CANTMASK2;
	} else {
		set = t->t_hold;
	}
	sigktou(&set, sp);
}

#ifdef _SYSCALL32_IMPL
void
prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp)
{
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct mstate *ms = &lwp->lwp_mstate;
	hrtime_t usr, sys;
	int flags;
	ulong_t instr;

	ASSERT(MUTEX_HELD(&p->p_lock));

	bzero(sp, sizeof (*sp));
	flags = 0L;
	if (t->t_state == TS_STOPPED) {
		flags |= PR_STOPPED;
		if ((t->t_schedflag & TS_PSTART) == 0)
			flags |= PR_ISTOP;
	} else if (VSTOPPED(t)) {
		flags |= PR_STOPPED|PR_ISTOP;
	}
	if (!(flags & PR_ISTOP) && (t->t_proc_flag & TP_PRSTOP))
		flags |= PR_DSTOP;
	if (lwp->lwp_asleep)
		flags |= PR_ASLEEP;
	if (t == p->p_agenttp)
		flags |= PR_AGENT;
	if (!(t->t_proc_flag & TP_TWAIT))
		flags |= PR_DETACH;
	if (t->t_proc_flag & TP_DAEMON)
		flags |= PR_DAEMON;
	if (p->p_proc_flag & P_PR_FORK)
		flags |= PR_FORK;
	if (p->p_proc_flag & P_PR_RUNLCL)
		flags |= PR_RLC;
	if (p->p_proc_flag & P_PR_KILLCL)
		flags |= PR_KLC;
	if (p->p_proc_flag & P_PR_ASYNC)
		flags |= PR_ASYNC;
	if (p->p_proc_flag & P_PR_BPTADJ)
		flags |= PR_BPTADJ;
	if (p->p_proc_flag & P_PR_PTRACE)
		flags |= PR_PTRACE;
	if (p->p_flag & SMSACCT)
		flags |= PR_MSACCT;
	if (p->p_flag & SMSFORK)
		flags |= PR_MSFORK;
	if (p->p_flag & SVFWAIT)
		flags |= PR_VFORKP;
	sp->pr_flags = flags;
	if (VSTOPPED(t)) {
		sp->pr_why = PR_REQUESTED;
		sp->pr_what = 0;
	} else {
		sp->pr_why = t->t_whystop;
		sp->pr_what = t->t_whatstop;
	}
	sp->pr_lwpid = t->t_tid;
	sp->pr_cursig = lwp->lwp_cursig;
	prassignset(&sp->pr_lwppend, &t->t_sig);
	prgethold(t, &sp->pr_lwphold);
	if (t->t_whystop == PR_FAULTED) {
		siginfo_kto32(&lwp->lwp_siginfo, &sp->pr_info);
		if (t->t_whatstop == FLTPAGE)
			sp->pr_info.si_addr =
			    (caddr32_t)(uintptr_t)lwp->lwp_siginfo.si_addr;
	} else if (lwp->lwp_curinfo)
		siginfo_kto32(&lwp->lwp_curinfo->sq_info, &sp->pr_info);
	if (SI_FROMUSER(&lwp->lwp_siginfo) && zp->zone_id != GLOBAL_ZONEID &&
	    sp->pr_info.si_zoneid != zp->zone_id) {
		sp->pr_info.si_pid = zp->zone_zsched->p_pid;
		sp->pr_info.si_uid = 0;
		sp->pr_info.si_ctid = -1;
		sp->pr_info.si_zoneid = zp->zone_id;
	}
	sp->pr_altstack.ss_sp =
	    (caddr32_t)(uintptr_t)lwp->lwp_sigaltstack.ss_sp;
	sp->pr_altstack.ss_size = (size32_t)lwp->lwp_sigaltstack.ss_size;
	sp->pr_altstack.ss_flags = (int32_t)lwp->lwp_sigaltstack.ss_flags;
	prgetaction32(p, PTOU(p), lwp->lwp_cursig, &sp->pr_action);
	sp->pr_oldcontext = (caddr32_t)lwp->lwp_oldcontext;
	sp->pr_ustack = (caddr32_t)lwp->lwp_ustack;
	(void) strncpy(sp->pr_clname, sclass[t->t_cid].cl_name,
	    sizeof (sp->pr_clname) - 1);
	if (flags & PR_STOPPED)
		hrt2ts32(t->t_stoptime, &sp->pr_tstamp);
	usr = ms->ms_acct[LMS_USER];
	sys = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
	scalehrtime(&usr);
	scalehrtime(&sys);
	hrt2ts32(usr, &sp->pr_utime);
	hrt2ts32(sys, &sp->pr_stime);

	/*
	 * Fetch the current instruction, if not a system process.
	 * We don't attempt this unless the lwp is stopped.
	 */
	if ((p->p_flag & SSYS) || p->p_as == &kas)
		sp->pr_flags |= (PR_ISSYS|PR_PCINVAL);
	else if (!(flags & PR_STOPPED))
		sp->pr_flags |= PR_PCINVAL;
	else if (!prfetchinstr(lwp, &instr))
		sp->pr_flags |= PR_PCINVAL;
	else
		sp->pr_instr = (uint32_t)instr;

	/*
	 * Drop p_lock while touching the lwp's stack.
	 */
	mutex_exit(&p->p_lock);
	if (prisstep(lwp))
		sp->pr_flags |= PR_STEP;
	if ((flags & (PR_STOPPED|PR_ASLEEP)) && t->t_sysnum) {
		int i;

		sp->pr_syscall = get_syscall32_args(lwp,
		    (int *)sp->pr_sysarg, &i);
		sp->pr_nsysarg = (ushort_t)i;
	}
	if ((flags & PR_STOPPED) || t == curthread)
		prgetprregs32(lwp, sp->pr_reg);
	if ((t->t_state == TS_STOPPED && t->t_whystop == PR_SYSEXIT) ||
	    (flags & PR_VFORKP)) {
		long r1, r2;
		user_t *up;
		auxv_t *auxp;
		int i;

		sp->pr_errno = prgetrvals(lwp, &r1, &r2);
		if (sp->pr_errno == 0) {
			sp->pr_rval1 = (int32_t)r1;
			sp->pr_rval2 = (int32_t)r2;
			sp->pr_errpriv = PRIV_NONE;
		} else
			sp->pr_errpriv = lwp->lwp_badpriv;

		if (t->t_sysnum == SYS_execve) {
			up = PTOU(p);
			sp->pr_sysarg[0] = 0;
			sp->pr_sysarg[1] = (caddr32_t)up->u_argv;
			sp->pr_sysarg[2] = (caddr32_t)up->u_envp;
			sp->pr_sysarg[3] = 0;
			for (i = 0, auxp = up->u_auxv;
			    i < sizeof (up->u_auxv) / sizeof (up->u_auxv[0]);
			    i++, auxp++) {
				if (auxp->a_type == AT_SUN_EXECNAME) {
					sp->pr_sysarg[0] =
					    (caddr32_t)
					    (uintptr_t)auxp->a_un.a_ptr;
					break;
				}
			}
		}
	}
	if (prhasfp())
		prgetprfpregs32(lwp, &sp->pr_fpreg);
	mutex_enter(&p->p_lock);
}

void
prgetstatus32(proc_t *p, pstatus32_t *sp, zone_t *zp)
{
	kthread_t *t;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = prchoose(p);	/* returns locked thread */
	ASSERT(t != NULL);
	thread_unlock(t);

	/* just bzero the process part, prgetlwpstatus32() does the rest */
	bzero(sp, sizeof (pstatus32_t) - sizeof (lwpstatus32_t));
	sp->pr_nlwp = p->p_lwpcnt;
	sp->pr_nzomb = p->p_zombcnt;
	prassignset(&sp->pr_sigpend, &p->p_sig);
	sp->pr_brkbase = (uint32_t)(uintptr_t)p->p_brkbase;
	sp->pr_brksize = (uint32_t)p->p_brksize;
	sp->pr_stkbase = (uint32_t)(uintptr_t)prgetstackbase(p);
	sp->pr_stksize = (uint32_t)p->p_stksize;
	sp->pr_pid = p->p_pid;
	if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
	    (p->p_flag & SZONETOP)) {
		ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
		/*
		 * Inside local zones, fake zsched's pid as parent pids for
		 * processes which reference processes outside of the zone.
		 */
		sp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
	} else {
		sp->pr_ppid = p->p_ppid;
	}
	sp->pr_pgid = p->p_pgrp;
	sp->pr_sid = p->p_sessp->s_sid;
	sp->pr_taskid = p->p_task->tk_tkid;
	sp->pr_projid = p->p_task->tk_proj->kpj_id;
	sp->pr_zoneid = p->p_zone->zone_id;
	hrt2ts32(mstate_aggr_state(p, LMS_USER), &sp->pr_utime);
	hrt2ts32(mstate_aggr_state(p, LMS_SYSTEM), &sp->pr_stime);
	TICK_TO_TIMESTRUC32(p->p_cutime, &sp->pr_cutime);
	TICK_TO_TIMESTRUC32(p->p_cstime, &sp->pr_cstime);
	prassignset(&sp->pr_sigtrace, &p->p_sigmask);
	prassignset(&sp->pr_flttrace, &p->p_fltmask);
	prassignset(&sp->pr_sysentry, &PTOU(p)->u_entrymask);
	prassignset(&sp->pr_sysexit, &PTOU(p)->u_exitmask);
	switch (p->p_model) {
	case DATAMODEL_ILP32:
		sp->pr_dmodel = PR_MODEL_ILP32;
		break;
	case DATAMODEL_LP64:
		sp->pr_dmodel = PR_MODEL_LP64;
		break;
	}
	if (p->p_agenttp)
		sp->pr_agentid = p->p_agenttp->t_tid;

	/* get the chosen lwp's status */
	prgetlwpstatus32(t, &sp->pr_lwp, zp);

	/* replicate the flags */
	sp->pr_flags = sp->pr_lwp.pr_flags;
}
#endif	/* _SYSCALL32_IMPL */

/*
 * Return lwp status.
 */
void
prgetlwpstatus(kthread_t *t, lwpstatus_t *sp, zone_t *zp)
{
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct mstate *ms = &lwp->lwp_mstate;
	hrtime_t usr, sys;
	int flags;
	ulong_t instr;

	ASSERT(MUTEX_HELD(&p->p_lock));

	bzero(sp, sizeof (*sp));
	flags = 0L;
	if (t->t_state == TS_STOPPED) {
		flags |= PR_STOPPED;
		if ((t->t_schedflag & TS_PSTART) == 0)
			flags |= PR_ISTOP;
	} else if (VSTOPPED(t)) {
		flags |= PR_STOPPED|PR_ISTOP;
	}
	if (!(flags & PR_ISTOP) && (t->t_proc_flag & TP_PRSTOP))
		flags |= PR_DSTOP;
	if (lwp->lwp_asleep)
		flags |= PR_ASLEEP;
	if (t == p->p_agenttp)
		flags |= PR_AGENT;
	if (!(t->t_proc_flag & TP_TWAIT))
		flags |= PR_DETACH;
	if (t->t_proc_flag & TP_DAEMON)
		flags |= PR_DAEMON;
	if (p->p_proc_flag & P_PR_FORK)
		flags |= PR_FORK;
	if (p->p_proc_flag & P_PR_RUNLCL)
		flags |= PR_RLC;
	if (p->p_proc_flag & P_PR_KILLCL)
		flags |= PR_KLC;
	if (p->p_proc_flag & P_PR_ASYNC)
		flags |= PR_ASYNC;
	if (p->p_proc_flag & P_PR_BPTADJ)
		flags |= PR_BPTADJ;
	if (p->p_proc_flag & P_PR_PTRACE)
		flags |= PR_PTRACE;
	if (p->p_flag & SMSACCT)
		flags |= PR_MSACCT;
	if (p->p_flag & SMSFORK)
		flags |= PR_MSFORK;
	if (p->p_flag & SVFWAIT)
		flags |= PR_VFORKP;
	if (p->p_pgidp->pid_pgorphaned)
		flags |= PR_ORPHAN;
	if (p->p_pidflag & CLDNOSIGCHLD)
		flags |= PR_NOSIGCHLD;
	if (p->p_pidflag & CLDWAITPID)
		flags |= PR_WAITPID;
	sp->pr_flags = flags;
	if (VSTOPPED(t)) {
		sp->pr_why = PR_REQUESTED;
		sp->pr_what = 0;
	} else {
		sp->pr_why = t->t_whystop;
		sp->pr_what = t->t_whatstop;
	}
	sp->pr_lwpid = t->t_tid;
	sp->pr_cursig = lwp->lwp_cursig;
	prassignset(&sp->pr_lwppend, &t->t_sig);
	prgethold(t, &sp->pr_lwphold);
	if (t->t_whystop == PR_FAULTED)
		bcopy(&lwp->lwp_siginfo,
		    &sp->pr_info, sizeof (k_siginfo_t));
	else if (lwp->lwp_curinfo)
		bcopy(&lwp->lwp_curinfo->sq_info,
		    &sp->pr_info, sizeof (k_siginfo_t));
	if (SI_FROMUSER(&lwp->lwp_siginfo) && zp->zone_id != GLOBAL_ZONEID &&
	    sp->pr_info.si_zoneid != zp->zone_id) {
		sp->pr_info.si_pid = zp->zone_zsched->p_pid;
		sp->pr_info.si_uid = 0;
		sp->pr_info.si_ctid = -1;
		sp->pr_info.si_zoneid = zp->zone_id;
	}
	sp->pr_altstack = lwp->lwp_sigaltstack;
	prgetaction(p, PTOU(p), lwp->lwp_cursig, &sp->pr_action);
	sp->pr_oldcontext = (uintptr_t)lwp->lwp_oldcontext;
	sp->pr_ustack = lwp->lwp_ustack;
	(void) strncpy(sp->pr_clname, sclass[t->t_cid].cl_name,
	    sizeof (sp->pr_clname) - 1);
	if (flags & PR_STOPPED)
		hrt2ts(t->t_stoptime, &sp->pr_tstamp);
	usr = ms->ms_acct[LMS_USER];
	sys = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
	scalehrtime(&usr);
	scalehrtime(&sys);
	hrt2ts(usr, &sp->pr_utime);
	hrt2ts(sys, &sp->pr_stime);

	/*
	 * Fetch the current instruction, if not a system process.
	 * We don't attempt this unless the lwp is stopped.
	 */
	if ((p->p_flag & SSYS) || p->p_as == &kas)
		sp->pr_flags |= (PR_ISSYS|PR_PCINVAL);
	else if (!(flags & PR_STOPPED))
		sp->pr_flags |= PR_PCINVAL;
	else if (!prfetchinstr(lwp, &instr))
		sp->pr_flags |= PR_PCINVAL;
	else
		sp->pr_instr = instr;

	/*
	 * Drop p_lock while touching the lwp's stack.
	 */
	mutex_exit(&p->p_lock);
	if (prisstep(lwp))
		sp->pr_flags |= PR_STEP;
	if ((flags & (PR_STOPPED|PR_ASLEEP)) && t->t_sysnum) {
		int i;

		sp->pr_syscall = get_syscall_args(lwp,
		    (long *)sp->pr_sysarg, &i);
		sp->pr_nsysarg = (ushort_t)i;
	}
	if ((flags & PR_STOPPED) || t == curthread)
		prgetprregs(lwp, sp->pr_reg);
	if ((t->t_state == TS_STOPPED && t->t_whystop == PR_SYSEXIT) ||
	    (flags & PR_VFORKP)) {
		user_t *up;
		auxv_t *auxp;
		int i;

		sp->pr_errno = prgetrvals(lwp, &sp->pr_rval1, &sp->pr_rval2);
		if (sp->pr_errno == 0)
			sp->pr_errpriv = PRIV_NONE;
		else
			sp->pr_errpriv = lwp->lwp_badpriv;

		if (t->t_sysnum == SYS_execve) {
			up = PTOU(p);
			sp->pr_sysarg[0] = 0;
			sp->pr_sysarg[1] = (uintptr_t)up->u_argv;
			sp->pr_sysarg[2] = (uintptr_t)up->u_envp;
			sp->pr_sysarg[3] = 0;
			for (i = 0, auxp = up->u_auxv;
			    i < sizeof (up->u_auxv) / sizeof (up->u_auxv[0]);
			    i++, auxp++) {
				if (auxp->a_type == AT_SUN_EXECNAME) {
					sp->pr_sysarg[0] =
					    (uintptr_t)auxp->a_un.a_ptr;
					break;
				}
			}
		}
	}
	if (prhasfp())
		prgetprfpregs(lwp, &sp->pr_fpreg);
	mutex_enter(&p->p_lock);
}

/*
 * Get the sigaction structure for the specified signal. The u-block
 * must already have been mapped in by the caller.
 */
void
prgetaction(proc_t *p, user_t *up, uint_t sig, struct sigaction *sp)
{
	int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;

	bzero(sp, sizeof (*sp));

	if (sig != 0 && (unsigned)sig < nsig) {
		sp->sa_handler = up->u_signal[sig-1];
		prassignset(&sp->sa_mask, &up->u_sigmask[sig-1]);
		if (sigismember(&up->u_sigonstack, sig))
			sp->sa_flags |= SA_ONSTACK;
		if (sigismember(&up->u_sigresethand, sig))
			sp->sa_flags |= SA_RESETHAND;
		if (sigismember(&up->u_sigrestart, sig))
			sp->sa_flags |= SA_RESTART;
		if (sigismember(&p->p_siginfo, sig))
			sp->sa_flags |= SA_SIGINFO;
		if (sigismember(&up->u_signodefer, sig))
			sp->sa_flags |= SA_NODEFER;
		if (sig == SIGCLD) {
			if (p->p_flag & SNOWAIT)
				sp->sa_flags |= SA_NOCLDWAIT;
			if ((p->p_flag & SJCTL) == 0)
				sp->sa_flags |= SA_NOCLDSTOP;
		}
	}
}

#ifdef _SYSCALL32_IMPL
void
prgetaction32(proc_t *p, user_t *up, uint_t sig, struct sigaction32 *sp)
{
	int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;

	bzero(sp, sizeof (*sp));

	if (sig != 0 && (unsigned)sig < nsig) {
		sp->sa_handler = (caddr32_t)(uintptr_t)up->u_signal[sig-1];
		prassignset(&sp->sa_mask, &up->u_sigmask[sig-1]);
		if (sigismember(&up->u_sigonstack, sig))
			sp->sa_flags |= SA_ONSTACK;
		if (sigismember(&up->u_sigresethand, sig))
			sp->sa_flags |= SA_RESETHAND;
		if (sigismember(&up->u_sigrestart, sig))
			sp->sa_flags |= SA_RESTART;
		if (sigismember(&p->p_siginfo, sig))
			sp->sa_flags |= SA_SIGINFO;
		if (sigismember(&up->u_signodefer, sig))
			sp->sa_flags |= SA_NODEFER;
		if (sig == SIGCLD) {
			if (p->p_flag & SNOWAIT)
				sp->sa_flags |= SA_NOCLDWAIT;
			if ((p->p_flag & SJCTL) == 0)
				sp->sa_flags |= SA_NOCLDSTOP;
		}
	}
}
#endif	/* _SYSCALL32_IMPL */

/*
 * Count the number of segments in this process's address space.
 */
uint_t
prnsegs(struct as *as, int reserved)
{
	uint_t n = 0;
	struct seg *seg;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
		caddr_t saddr, naddr;
		void *tmp = NULL;

		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			(void) pr_getprot(seg, reserved, &tmp,
			    &saddr, &naddr, eaddr);
			if (saddr != naddr) {
				n++;
				/*
				 * prnsegs() was formerly designated to return
				 * an 'int' despite having no ability or use
				 * for negative results. As part of changing
				 * it to 'uint_t', keep the old effective limit
				 * of INT_MAX in place.
				 */
				if (n == INT_MAX) {
					pr_getprot_done(&tmp);
					ASSERT(tmp == NULL);
					return (n);
				}
			}
		}

		ASSERT(tmp == NULL);
	}

	return (n);
}

/*
 * Convert uint32_t to decimal string w/o leading zeros.
 * Add trailing null characters if 'len' is greater than string length.
 * Return the string length.
 */
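/*
 * Example: pr_u32tos(305, buf, 8) stores "305" plus five trailing NUL
 * pad bytes in buf and returns 3.
 */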
int
pr_u32tos(uint32_t n, char *s, int len)
{
	char cbuf[11];		/* 32-bit unsigned integer fits in 10 digits */
	char *cp = cbuf;
	char *end = s + len;

	do {
		*cp++ = (char)(n % 10 + '0');
		n /= 10;
	} while (n);

	len = (int)(cp - cbuf);

	do {
		*s++ = *--cp;
	} while (cp > cbuf);

	while (s < end)		/* optional pad */
		*s++ = '\0';

	return (len);
}

/*
 * Convert uint64_t to decimal string w/o leading zeros.
 * Return the string length.
 */
static int
pr_u64tos(uint64_t n, char *s)
{
	char cbuf[21];		/* 64-bit unsigned integer fits in 20 digits */
	char *cp = cbuf;
	int len;

	do {
		*cp++ = (char)(n % 10 + '0');
		n /= 10;
	} while (n);

	len = (int)(cp - cbuf);

	do {
		*s++ = *--cp;
	} while (cp > cbuf);

	return (len);
}

/*
 * Similar to getf() / getf_gen(), but for the specified process. On success,
 * returns the fp with fp->f_count incremented. The caller MUST call
 * pr_releasef(fp) on the returned fp after completing any actions using
 * that fp (see pr_releasef() below).
 * We return a reference-held (fp->f_count bumped) file_t so no other closef()
 * can invoke destructive VOP_CLOSE actions while we're inspecting the
 * process's FD.
 *
 * Returns NULL for errors: either an empty process-table slot post-fi_lock
 * and UF_ENTER, or too many mutex_tryenter() failures on the file_t's f_tlock.
 * Both failure modes have DTrace probes.
 *
 * The current design of the procfs "close" code path uses the following lock
 * order of:
 *
 *	1: (file_t) f_tlock
 *	2: (proc_t) p_lock AND setting p->p_proc_flag's P_PR_LOCK
 *
 * That happens because closef() holds f_tlock while calling fop_close(),
 * which can be prclose(), which currently waits on and sets P_PR_LOCK at its
 * beginning.
 *
 * That lock order creates a challenge for pr_getf, which needs to take those
 * locks in the opposite order when the fd points to a procfs file descriptor.
 * The solution chosen here is to use mutex_tryenter on f_tlock and retry some
 * (limited) number of times, failing if we don't get both locks.
 *
 * The cases where this can fail are rare, and all involve a procfs caller
 * asking for info (eg. FDINFO) on another procfs FD. In these cases,
 * returning EBADF (which results from a NULL return from pr_getf()) is
 * acceptable.
 *
 * One can increase the number of tries in pr_getf_maxtries if one is worried
 * about the contentious case.
 */

uint64_t pr_getf_tryfails;	/* Bumped for statistic purposes. */
int pr_getf_maxtries = 3;	/* So you can tune it from /etc/system */

file_t *
pr_getf(proc_t *p, uint_t fd, short *flag)
{
	uf_entry_t *ufp;
	uf_info_t *fip;
	file_t *fp;
	int tries = 0;

	ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK));

retry:
	fip = P_FINFO(p);

	if (fd >= fip->fi_nfiles)
		return (NULL);

	mutex_exit(&p->p_lock);
	mutex_enter(&fip->fi_lock);
	UF_ENTER(ufp, fip, fd);
	if ((fp = ufp->uf_file) != NULL && fp->f_count > 0) {
		if (mutex_tryenter(&fp->f_tlock)) {
			ASSERT(fp->f_count > 0);
			fp->f_count++;
			mutex_exit(&fp->f_tlock);
			if (flag != NULL)
				*flag = ufp->uf_flag;
		} else {
			/*
			 * Note the number of mutex_trylock attempts.
			 *
			 * The exit path will catch this and try again if we
			 * are below the retry threshold (pr_getf_maxtries).
			 */
			tries++;
			pr_getf_tryfails++;
			/*
			 * If we hit pr_getf_maxtries, we'll return NULL.
			 * DTrace scripts looking for this sort of failure
			 * should check when arg1 is pr_getf_maxtries.
			 */
			DTRACE_PROBE2(pr_getf_tryfail, file_t *, fp, int,
			    tries);
			fp = NULL;
		}
	} else {
		fp = NULL;
		/* If we fail here, someone else closed this FD. */
		DTRACE_PROBE1(pr_getf_emptyslot, int, tries);
		tries = pr_getf_maxtries;	/* Don't bother retrying. */
	}
	UF_EXIT(ufp);
	mutex_exit(&fip->fi_lock);
	mutex_enter(&p->p_lock);

	/* Use goto instead of tail-recursion so we can keep "tries" around. */
	if (fp == NULL) {
		/* "tries" starts at 1. */
		if (tries < pr_getf_maxtries)
			goto retry;
	} else {
		/*
		 * Probes here will detect successes after arg1's number of
		 * mutex_tryenter() calls.
		 */
		DTRACE_PROBE2(pr_getf_trysuccess, file_t *, fp, int, tries + 1);
	}

	return (fp);
}
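
/*
 * Typical consumer (sketch): take a hold on the target process's fd,
 * inspect it, then release the hold:
 *
 *	if ((fp = pr_getf(p, fd, &flag)) == NULL)
 *		return (EBADF);
 *	... examine fp->f_vnode ...
 *	pr_releasef(fp);
 */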

/*
 * Just as pr_getf() is a little unusual in how it goes about making the file_t
 * safe for procfs consumers to access it, so too is pr_releasef() for safely
 * releasing that "hold". The "hold" is unlike normal file descriptor activity
 * -- procfs is just an interloper here, wanting access to the vnode_t without
 * risk of a racing close() disrupting the state. Just as pr_getf() avoids some
 * of the typical file_t behavior (such as auditing) when establishing its
 * hold, so too should pr_releasef(). It should not go through the motions of
 * closef() (since it is not a true close()) unless racing activity causes it
 * to be the last actor holding the refcount above zero.
 *
 * Under normal circumstances, we expect to find file_t`f_count > 1 after
 * the successful pr_getf() call. We are, after all, accessing a resource
 * already held by the process in question. We would also expect to rarely race
 * with a close() of the underlying fd, meaning that file_t`f_count > 1 would
 * still hold at pr_releasef() time. That would mean we only need to decrement
 * f_count, leaving it to the process to later close the fd (thus triggering
 * VOP_CLOSE(), etc).
 *
 * It is only when that process manages to close() the fd while we have it
 * "held" in procfs that we must make a trip through the traditional closef()
 * logic to ensure proper tear-down of the file_t.
 */
void
pr_releasef(file_t *fp)
{
	mutex_enter(&fp->f_tlock);
	if (fp->f_count > 1) {
		/*
		 * This is the most common case: The file is still held open by
		 * the process, and we simply need to release our hold by
		 * decrementing f_count
		 */
		fp->f_count--;
		mutex_exit(&fp->f_tlock);
	} else {
		/*
		 * A rare occasion: The process snuck a close() of this file
		 * while we were doing our business in procfs. Given that
		 * f_count == 1, we are the only one with a reference to the
		 * file_t and need to take a trip through closef() to free it.
		 */
		mutex_exit(&fp->f_tlock);
		(void) closef(fp);
	}
}

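/*
 * Construct the name used for an entry in the /proc object directory:
 * "<fstype>.<major>.<minor>.<nodeid>", e.g. "ufs.136.7.12345"; the fstype
 * prefix is omitted when the vfs type name cannot be determined.
 */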
void
pr_object_name(char *name, vnode_t *vp, struct vattr *vattr)
{
	char *s = name;
	struct vfs *vfsp;
	struct vfssw *vfsswp;

	if ((vfsp = vp->v_vfsp) != NULL &&
	    ((vfsswp = vfssw + vfsp->vfs_fstype), vfsswp->vsw_name) &&
	    *vfsswp->vsw_name) {
		(void) strcpy(s, vfsswp->vsw_name);
		s += strlen(s);
		*s++ = '.';
	}
	s += pr_u32tos(getmajor(vattr->va_fsid), s, 0);
	*s++ = '.';
	s += pr_u32tos(getminor(vattr->va_fsid), s, 0);
	*s++ = '.';
	s += pr_u64tos(vattr->va_nodeid, s);
	*s++ = '\0';
}

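/*
 * Return the segment containing the process break (heap), or NULL if the
 * break does not fall within an anonymous (vnode-less) segvn segment.
 */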
struct seg *
break_seg(proc_t *p)
{
	caddr_t addr = p->p_brkbase;
	struct seg *seg;
	struct vnode *vp;

	if (p->p_brksize != 0)
		addr += p->p_brksize - 1;
	seg = as_segat(p->p_as, addr);
	if (seg != NULL && seg->s_ops == &segvn_ops &&
	    (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL))
		return (seg);
	return (NULL);
}

/*
 * Implementation of service functions to handle procfs generic chained
 * copyout buffers.
 */
typedef struct pr_iobuf_list {
	list_node_t	piol_link;	/* buffer linkage */
	size_t		piol_size;	/* total size (header + data) */
	size_t		piol_usedsize;	/* amount to copy out from this buf */
} piol_t;

#define	MAPSIZE	(64 * 1024)
#define	PIOL_DATABUF(iol)	((void *)(&(iol)[1]))

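/*
 * Usage sketch (prgetmap() below is a real example): build up a list of
 * fixed-size items, then drain it to userspace in one pass:
 *
 *	list_t head;
 *	prmap_t *mp;
 *
 *	pr_iol_initlist(&head, sizeof (*mp), expected_item_count);
 *	for each item:
 *		mp = pr_iol_newbuf(&head, sizeof (*mp));
 *		... fill in *mp ...
 *	error = pr_iol_copyout_and_free(&head, &uaddr, error);
 */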
void
pr_iol_initlist(list_t *iolhead, size_t itemsize, int n)
{
	piol_t *iol;
	size_t initial_size = MIN(1, n) * itemsize;

	list_create(iolhead, sizeof (piol_t), offsetof(piol_t, piol_link));

	ASSERT(list_head(iolhead) == NULL);
	ASSERT(itemsize < MAPSIZE - sizeof (*iol));
	ASSERT(initial_size > 0);

	/*
	 * Someone creating chained copyout buffers may ask for less than
	 * MAPSIZE if the amount of data to be buffered is known to be
	 * smaller than that.
	 * But in order to prevent involuntary self-denial of service,
	 * the requested input size is clamped at MAPSIZE.
	 */
	initial_size = MIN(MAPSIZE, initial_size + sizeof (*iol));
	iol = kmem_alloc(initial_size, KM_SLEEP);
	list_insert_head(iolhead, iol);
	iol->piol_usedsize = 0;
	iol->piol_size = initial_size;
}

void *
pr_iol_newbuf(list_t *iolhead, size_t itemsize)
{
	piol_t *iol;
	char *new;

	ASSERT(itemsize < MAPSIZE - sizeof (*iol));
	ASSERT(list_head(iolhead) != NULL);

	iol = (piol_t *)list_tail(iolhead);

	if (iol->piol_size <
	    iol->piol_usedsize + sizeof (*iol) + itemsize) {
		/*
		 * Out of space in the current buffer. Allocate more.
		 */
		piol_t *newiol;

		newiol = kmem_alloc(MAPSIZE, KM_SLEEP);
		newiol->piol_size = MAPSIZE;
		newiol->piol_usedsize = 0;

		list_insert_after(iolhead, iol, newiol);
		iol = list_next(iolhead, iol);
		ASSERT(iol == newiol);
	}
	new = (char *)PIOL_DATABUF(iol) + iol->piol_usedsize;
	iol->piol_usedsize += itemsize;
	bzero(new, itemsize);
	return (new);
}

void
pr_iol_freelist(list_t *iolhead)
{
	piol_t *iol;

	while ((iol = list_head(iolhead)) != NULL) {
		list_remove(iolhead, iol);
		kmem_free(iol, iol->piol_size);
	}
	list_destroy(iolhead);
}

int
pr_iol_copyout_and_free(list_t *iolhead, caddr_t *tgt, int errin)
{
	int error = errin;
	piol_t *iol;

	while ((iol = list_head(iolhead)) != NULL) {
		list_remove(iolhead, iol);
		if (!error) {
			if (copyout(PIOL_DATABUF(iol), *tgt,
			    iol->piol_usedsize))
				error = EFAULT;
			*tgt += iol->piol_usedsize;
		}
		kmem_free(iol, iol->piol_size);
	}
	list_destroy(iolhead);

	return (error);
}

int
pr_iol_uiomove_and_free(list_t *iolhead, uio_t *uiop, int errin)
{
	offset_t off = uiop->uio_offset;
	char *base;
	size_t size;
	piol_t *iol;
	int error = errin;

	while ((iol = list_head(iolhead)) != NULL) {
		list_remove(iolhead, iol);
		base = PIOL_DATABUF(iol);
		size = iol->piol_usedsize;
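		/*
		 * 'off' is the remaining initial uio offset: move data only
		 * once it falls within the current buffer, then count it
		 * down to zero so subsequent buffers are copied from their
		 * start.
		 */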
		if (off <= size && error == 0 && uiop->uio_resid > 0)
			error = uiomove(base + off, size - off,
			    UIO_READ, uiop);
		off = MAX(0, off - (offset_t)size);
		kmem_free(iol, iol->piol_size);
	}
	list_destroy(iolhead);

	return (error);
}

/*
 * Return an array of structures with memory map information.
 * We allocate here; the caller must deallocate.
 */
int
prgetmap(proc_t *p, int reserved, list_t *iolhead)
{
	struct as *as = p->p_as;
	prmap_t *mp;
	struct seg *seg;
	struct seg *brkseg, *stkseg;
	struct vnode *vp;
	struct vattr vattr;
	uint_t prot;

	ASSERT(as != &kas && AS_WRITE_HELD(as));

	/*
	 * Request an initial buffer size that doesn't waste memory
	 * if the address space has only a small number of segments.
	 */
	pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));

	if ((seg = AS_SEGFIRST(as)) == NULL)
		return (0);

	brkseg = break_seg(p);
	stkseg = as_segat(as, prgetstackbase(p));

	do {
		caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
		caddr_t saddr, naddr;
		void *tmp = NULL;

		if ((seg->s_flags & S_HOLE) != 0) {
			continue;
		}

		for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
			prot = pr_getprot(seg, reserved, &tmp,
			    &saddr, &naddr, eaddr);
			if (saddr == naddr)
				continue;

			mp = pr_iol_newbuf(iolhead, sizeof (*mp));

			mp->pr_vaddr = (uintptr_t)saddr;
			mp->pr_size = naddr - saddr;
			mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
			mp->pr_mflags = 0;
			if (prot & PROT_READ)
				mp->pr_mflags |= MA_READ;
			if (prot & PROT_WRITE)
				mp->pr_mflags |= MA_WRITE;
			if (prot & PROT_EXEC)
				mp->pr_mflags |= MA_EXEC;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
				mp->pr_mflags |= MA_SHARED;
			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
				mp->pr_mflags |= MA_NORESERVE;
			if (seg->s_ops == &segspt_shmops ||
			    (seg->s_ops == &segvn_ops &&
			    (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
				mp->pr_mflags |= MA_ANON;
			if (seg == brkseg)
				mp->pr_mflags |= MA_BREAK;
			else if (seg == stkseg) {
				mp->pr_mflags |= MA_STACK;
1929 if (reserved) {
1930 size_t maxstack =
1931 ((size_t)p->p_stk_ctl +
1932 PAGEOFFSET) & PAGEMASK;
1933 mp->pr_vaddr =
1934 (uintptr_t)prgetstackbase(p) +
1935 p->p_stksize - maxstack;
1936 mp->pr_size = (uintptr_t)naddr -
1937 mp->pr_vaddr;
1938 }
1939 }
1940 if (seg->s_ops == &segspt_shmops)
1941 mp->pr_mflags |= MA_ISM | MA_SHM;
1942 mp->pr_pagesize = PAGESIZE;
1943
1944 /*
1945 * Manufacture a filename for the "object" directory.
1946 */
1947 vattr.va_mask = AT_FSID|AT_NODEID;
1948 if (seg->s_ops == &segvn_ops &&
1949 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
1950 vp != NULL && vp->v_type == VREG &&
1951 VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
1952 if (vp == p->p_exec)
1953 (void) strcpy(mp->pr_mapname, "a.out");
1954 else
1955 pr_object_name(mp->pr_mapname,
1956 vp, &vattr);
1957 }
1958
1959 /*
1960 * Get the SysV shared memory id, if any.
1961 */
1962 if ((mp->pr_mflags & MA_SHARED) && p->p_segacct &&
1963 (mp->pr_shmid = shmgetid(p, seg->s_base)) !=
1964 SHMID_NONE) {
1965 if (mp->pr_shmid == SHMID_FREE)
1966 mp->pr_shmid = -1;
1967
1968 mp->pr_mflags |= MA_SHM;
1969 } else {
1970 mp->pr_shmid = -1;
1971 }
1972 }
1973 ASSERT(tmp == NULL);
1974 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
1975
1976 return (0);
1977 }
1978
1979 #ifdef _SYSCALL32_IMPL
1980 int
1981 prgetmap32(proc_t *p, int reserved, list_t *iolhead)
1982 {
1983 struct as *as = p->p_as;
1984 prmap32_t *mp;
1985 struct seg *seg;
1986 struct seg *brkseg, *stkseg;
1987 struct vnode *vp;
1988 struct vattr vattr;
1989 uint_t prot;
1990
1991 ASSERT(as != &kas && AS_WRITE_HELD(as));
1992
1993 /*
1994 * Request an initial buffer size that doesn't waste memory
1995 * if the address space has only a small number of segments.
1996 */
1997 pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));
1998
1999 if ((seg = AS_SEGFIRST(as)) == NULL)
2000 return (0);
2001
2002 brkseg = break_seg(p);
2003 stkseg = as_segat(as, prgetstackbase(p));
2004
2005 do {
2006 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, reserved);
2007 caddr_t saddr, naddr;
2008 void *tmp = NULL;
2009
2010 if ((seg->s_flags & S_HOLE) != 0) {
2011 continue;
2012 }
2013
2014 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2015 prot = pr_getprot(seg, reserved, &tmp,
2016 &saddr, &naddr, eaddr);
2017 if (saddr == naddr)
2018 continue;
2019
2020 mp = pr_iol_newbuf(iolhead, sizeof (*mp));
2021
2022 mp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
2023 mp->pr_size = (size32_t)(naddr - saddr);
2024 mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
2025 mp->pr_mflags = 0;
2026 if (prot & PROT_READ)
2027 mp->pr_mflags |= MA_READ;
2028 if (prot & PROT_WRITE)
2029 mp->pr_mflags |= MA_WRITE;
2030 if (prot & PROT_EXEC)
2031 mp->pr_mflags |= MA_EXEC;
2032 if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
2033 mp->pr_mflags |= MA_SHARED;
2034 if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
2035 mp->pr_mflags |= MA_NORESERVE;
2036 if (seg->s_ops == &segspt_shmops ||
2037 (seg->s_ops == &segvn_ops &&
2038 (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
2039 mp->pr_mflags |= MA_ANON;
2040 if (seg == brkseg)
2041 mp->pr_mflags |= MA_BREAK;
2042 else if (seg == stkseg) {
2043 mp->pr_mflags |= MA_STACK;
2044 if (reserved) {
2045 size_t maxstack =
2046 ((size_t)p->p_stk_ctl +
2047 PAGEOFFSET) & PAGEMASK;
2048 uintptr_t vaddr =
2049 (uintptr_t)prgetstackbase(p) +
2050 p->p_stksize - maxstack;
2051 mp->pr_vaddr = (caddr32_t)vaddr;
2052 mp->pr_size = (size32_t)
2053 ((uintptr_t)naddr - vaddr);
2054 }
2055 }
2056 if (seg->s_ops == &segspt_shmops)
2057 mp->pr_mflags |= MA_ISM | MA_SHM;
2058 mp->pr_pagesize = PAGESIZE;
2059
2060 /*
2061 * Manufacture a filename for the "object" directory.
2062 */
2063 vattr.va_mask = AT_FSID|AT_NODEID;
2064 if (seg->s_ops == &segvn_ops &&
2065 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
2066 vp != NULL && vp->v_type == VREG &&
2067 VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
2068 if (vp == p->p_exec)
2069 (void) strcpy(mp->pr_mapname, "a.out");
2070 else
2071 pr_object_name(mp->pr_mapname,
2072 vp, &vattr);
2073 }
2074
2075 /*
2076 * Get the SysV shared memory id, if any.
2077 */
2078 if ((mp->pr_mflags & MA_SHARED) && p->p_segacct &&
2079 (mp->pr_shmid = shmgetid(p, seg->s_base)) !=
2080 SHMID_NONE) {
2081 if (mp->pr_shmid == SHMID_FREE)
2082 mp->pr_shmid = -1;
2083
2084 mp->pr_mflags |= MA_SHM;
2085 } else {
2086 mp->pr_shmid = -1;
2087 }
2088 }
2089 ASSERT(tmp == NULL);
2090 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2091
2092 return (0);
2093 }
2094 #endif /* _SYSCALL32_IMPL */
2095
2096 /*
2097 * Return the size of the /proc page data file.
2098 */
2099 size_t
2100 prpdsize(struct as *as)
2101 {
2102 struct seg *seg;
2103 size_t size;
2104
2105 ASSERT(as != &kas && AS_WRITE_HELD(as));
2106
2107 if ((seg = AS_SEGFIRST(as)) == NULL)
2108 return (0);
2109
2110 size = sizeof (prpageheader_t);
2111 do {
2112 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2113 caddr_t saddr, naddr;
2114 void *tmp = NULL;
2115 size_t npage;
2116
2117 if ((seg->s_flags & S_HOLE) != 0) {
2118 continue;
2119 }
2120
2121 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2122 (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2123 if ((npage = (naddr - saddr) / PAGESIZE) != 0)
2124 size += sizeof (prasmap_t) + round8(npage);
2125 }
2126 ASSERT(tmp == NULL);
2127 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2128
2129 return (size);
2130 }
2131
2132 #ifdef _SYSCALL32_IMPL
2133 size_t
2134 prpdsize32(struct as *as)
2135 {
2136 struct seg *seg;
2137 size_t size;
2138
2139 ASSERT(as != &kas && AS_WRITE_HELD(as));
2140
2141 if ((seg = AS_SEGFIRST(as)) == NULL)
2142 return (0);
2143
2144 size = sizeof (prpageheader32_t);
2145 do {
2146 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2147 caddr_t saddr, naddr;
2148 void *tmp = NULL;
2149 size_t npage;
2150
2151 if ((seg->s_flags & S_HOLE) != 0) {
2152 continue;
2153 }
2154
2155 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2156 (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2157 if ((npage = (naddr - saddr) / PAGESIZE) != 0)
2158 size += sizeof (prasmap32_t) + round8(npage);
2159 }
2160 ASSERT(tmp == NULL);
2161 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2162
2163 return (size);
2164 }
2165 #endif /* _SYSCALL32_IMPL */
2166
2167 /*
2168 * Read page data information.
2169 */
2170 int
2171 prpdread(proc_t *p, uint_t hatid, struct uio *uiop)
2172 {
2173 struct as *as = p->p_as;
2174 caddr_t buf;
2175 size_t size;
2176 prpageheader_t *php;
2177 prasmap_t *pmp;
2178 struct seg *seg;
2179 int error;
2180
2181 again:
2182 AS_LOCK_ENTER(as, RW_WRITER);
2183
2184 if ((seg = AS_SEGFIRST(as)) == NULL) {
2185 AS_LOCK_EXIT(as);
2186 return (0);
2187 }
2188 size = prpdsize(as);
2189 if (uiop->uio_resid < size) {
2190 AS_LOCK_EXIT(as);
2191 return (E2BIG);
2192 }
2193
2194 buf = kmem_zalloc(size, KM_SLEEP);
2195 php = (prpageheader_t *)buf;
2196 pmp = (prasmap_t *)(buf + sizeof (prpageheader_t));
2197
2198 hrt2ts(gethrtime(), &php->pr_tstamp);
2199 php->pr_nmap = 0;
2200 php->pr_npage = 0;
2201 do {
2202 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2203 caddr_t saddr, naddr;
2204 void *tmp = NULL;
2205
2206 if ((seg->s_flags & S_HOLE) != 0) {
2207 continue;
2208 }
2209
2210 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2211 struct vnode *vp;
2212 struct vattr vattr;
2213 size_t len;
2214 size_t npage;
2215 uint_t prot;
2216 uintptr_t next;
2217
2218 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2219 if ((len = (size_t)(naddr - saddr)) == 0)
2220 continue;
2221 npage = len / PAGESIZE;
2222 next = (uintptr_t)(pmp + 1) + round8(npage);
2223 /*
2224 * It's possible that the address space can change
2225 * subtly even though we're holding as->a_lock
2226 * due to the nondeterminism of page_exists() in
2227 * the presence of asynchronously flushed pages or
2228 * mapped files whose sizes are changing.
2229 * page_exists() may be called indirectly from
2230 * pr_getprot() by a SEGOP_INCORE() routine.
2231 * If this happens we need to make sure we don't
2232 * overrun the buffer whose size we computed based
2233 * on the initial iteration through the segments.
2234 * Once we've detected an overflow, we need to clean
2235 * up the temporary memory allocated in pr_getprot()
2236 * and retry. If there's a pending signal, we return
2237 * EINTR so that this thread can be dislodged if
2238 * a latent bug causes us to spin indefinitely.
2239 */
2240 if (next > (uintptr_t)buf + size) {
2241 pr_getprot_done(&tmp);
2242 AS_LOCK_EXIT(as);
2243
2244 kmem_free(buf, size);
2245
2246 if (ISSIG(curthread, JUSTLOOKING))
2247 return (EINTR);
2248
2249 goto again;
2250 }
2251
2252 php->pr_nmap++;
2253 php->pr_npage += npage;
2254 pmp->pr_vaddr = (uintptr_t)saddr;
2255 pmp->pr_npage = npage;
2256 pmp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
2257 pmp->pr_mflags = 0;
2258 if (prot & PROT_READ)
2259 pmp->pr_mflags |= MA_READ;
2260 if (prot & PROT_WRITE)
2261 pmp->pr_mflags |= MA_WRITE;
2262 if (prot & PROT_EXEC)
2263 pmp->pr_mflags |= MA_EXEC;
2264 if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
2265 pmp->pr_mflags |= MA_SHARED;
2266 if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
2267 pmp->pr_mflags |= MA_NORESERVE;
2268 if (seg->s_ops == &segspt_shmops ||
2269 (seg->s_ops == &segvn_ops &&
2270 (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
2271 pmp->pr_mflags |= MA_ANON;
2272 if (seg->s_ops == &segspt_shmops)
2273 pmp->pr_mflags |= MA_ISM | MA_SHM;
2274 pmp->pr_pagesize = PAGESIZE;
2275 /*
2276 * Manufacture a filename for the "object" directory.
2277 */
2278 vattr.va_mask = AT_FSID|AT_NODEID;
2279 if (seg->s_ops == &segvn_ops &&
2280 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
2281 vp != NULL && vp->v_type == VREG &&
2282 VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
2283 if (vp == p->p_exec)
2284 (void) strcpy(pmp->pr_mapname, "a.out");
2285 else
2286 pr_object_name(pmp->pr_mapname,
2287 vp, &vattr);
2288 }
2289
2290 /*
2291 * Get the SysV shared memory id, if any.
2292 */
2293 if ((pmp->pr_mflags & MA_SHARED) && p->p_segacct &&
2294 (pmp->pr_shmid = shmgetid(p, seg->s_base)) !=
2295 SHMID_NONE) {
2296 if (pmp->pr_shmid == SHMID_FREE)
2297 pmp->pr_shmid = -1;
2298
2299 pmp->pr_mflags |= MA_SHM;
2300 } else {
2301 pmp->pr_shmid = -1;
2302 }
2303
2304 hat_getstat(as, saddr, len, hatid,
2305 (char *)(pmp + 1), HAT_SYNC_ZERORM);
2306 pmp = (prasmap_t *)next;
2307 }
2308 ASSERT(tmp == NULL);
2309 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2310
2311 AS_LOCK_EXIT(as);
2312
2313 ASSERT((uintptr_t)pmp <= (uintptr_t)buf + size);
2314 error = uiomove(buf, (caddr_t)pmp - buf, UIO_READ, uiop);
2315 kmem_free(buf, size);
2316
2317 return (error);
2318 }
2319
2320 #ifdef _SYSCALL32_IMPL
2321 int
2322 prpdread32(proc_t *p, uint_t hatid, struct uio *uiop)
2323 {
2324 struct as *as = p->p_as;
2325 caddr_t buf;
2326 size_t size;
2327 prpageheader32_t *php;
2328 prasmap32_t *pmp;
2329 struct seg *seg;
2330 int error;
2331
2332 again:
2333 AS_LOCK_ENTER(as, RW_WRITER);
2334
2335 if ((seg = AS_SEGFIRST(as)) == NULL) {
2336 AS_LOCK_EXIT(as);
2337 return (0);
2338 }
2339 size = prpdsize32(as);
2340 if (uiop->uio_resid < size) {
2341 AS_LOCK_EXIT(as);
2342 return (E2BIG);
2343 }
2344
2345 buf = kmem_zalloc(size, KM_SLEEP);
2346 php = (prpageheader32_t *)buf;
2347 pmp = (prasmap32_t *)(buf + sizeof (prpageheader32_t));
2348
2349 hrt2ts32(gethrtime(), &php->pr_tstamp);
2350 php->pr_nmap = 0;
2351 php->pr_npage = 0;
2352 do {
2353 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2354 caddr_t saddr, naddr;
2355 void *tmp = NULL;
2356
2357 if ((seg->s_flags & S_HOLE) != 0) {
2358 continue;
2359 }
2360
2361 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2362 struct vnode *vp;
2363 struct vattr vattr;
2364 size_t len;
2365 size_t npage;
2366 uint_t prot;
2367 uintptr_t next;
2368
2369 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2370 if ((len = (size_t)(naddr - saddr)) == 0)
2371 continue;
2372 npage = len / PAGESIZE;
2373 next = (uintptr_t)(pmp + 1) + round8(npage);
2374 /*
2375 * It's possible that the address space can change
2376 * subtly even though we're holding as->a_lock
2377 * due to the nondeterminism of page_exists() in
2378 * the presence of asynchronously flushed pages or
2379 * mapped files whose sizes are changing.
2380 * page_exists() may be called indirectly from
2381 * pr_getprot() by a SEGOP_INCORE() routine.
2382 * If this happens we need to make sure we don't
2383 * overrun the buffer whose size we computed based
2384 * on the initial iteration through the segments.
2385 * Once we've detected an overflow, we need to clean
2386 * up the temporary memory allocated in pr_getprot()
2387 * and retry. If there's a pending signal, we return
2388 * EINTR so that this thread can be dislodged if
2389 * a latent bug causes us to spin indefinitely.
2390 */
2391 if (next > (uintptr_t)buf + size) {
2392 pr_getprot_done(&tmp);
2393 AS_LOCK_EXIT(as);
2394
2395 kmem_free(buf, size);
2396
2397 if (ISSIG(curthread, JUSTLOOKING))
2398 return (EINTR);
2399
2400 goto again;
2401 }
2402
2403 php->pr_nmap++;
2404 php->pr_npage += npage;
2405 pmp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
2406 pmp->pr_npage = (size32_t)npage;
2407 pmp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
2408 pmp->pr_mflags = 0;
2409 if (prot & PROT_READ)
2410 pmp->pr_mflags |= MA_READ;
2411 if (prot & PROT_WRITE)
2412 pmp->pr_mflags |= MA_WRITE;
2413 if (prot & PROT_EXEC)
2414 pmp->pr_mflags |= MA_EXEC;
2415 if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
2416 pmp->pr_mflags |= MA_SHARED;
2417 if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
2418 pmp->pr_mflags |= MA_NORESERVE;
2419 if (seg->s_ops == &segspt_shmops ||
2420 (seg->s_ops == &segvn_ops &&
2421 (SEGOP_GETVP(seg, saddr, &vp) != 0 || vp == NULL)))
2422 pmp->pr_mflags |= MA_ANON;
2423 if (seg->s_ops == &segspt_shmops)
2424 pmp->pr_mflags |= MA_ISM | MA_SHM;
2425 pmp->pr_pagesize = PAGESIZE;
2426 /*
2427 * Manufacture a filename for the "object" directory.
2428 */
2429 vattr.va_mask = AT_FSID|AT_NODEID;
2430 if (seg->s_ops == &segvn_ops &&
2431 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
2432 vp != NULL && vp->v_type == VREG &&
2433 VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
2434 if (vp == p->p_exec)
2435 (void) strcpy(pmp->pr_mapname, "a.out");
2436 else
2437 pr_object_name(pmp->pr_mapname,
2438 vp, &vattr);
2439 }
2440
2441 /*
2442 * Get the SysV shared memory id, if any.
2443 */
2444 if ((pmp->pr_mflags & MA_SHARED) && p->p_segacct &&
2445 (pmp->pr_shmid = shmgetid(p, seg->s_base)) !=
2446 SHMID_NONE) {
2447 if (pmp->pr_shmid == SHMID_FREE)
2448 pmp->pr_shmid = -1;
2449
2450 pmp->pr_mflags |= MA_SHM;
2451 } else {
2452 pmp->pr_shmid = -1;
2453 }
2454
2455 hat_getstat(as, saddr, len, hatid,
2456 (char *)(pmp + 1), HAT_SYNC_ZERORM);
2457 pmp = (prasmap32_t *)next;
2458 }
2459 ASSERT(tmp == NULL);
2460 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2461
2462 AS_LOCK_EXIT(as);
2463
2464 ASSERT((uintptr_t)pmp <= (uintptr_t)buf + size);
2465 error = uiomove(buf, (caddr_t)pmp - buf, UIO_READ, uiop);
2466 kmem_free(buf, size);
2467
2468 return (error);
2469 }
2470 #endif /* _SYSCALL32_IMPL */
2471
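/*
 * Convert an accumulated cpu usage value into the 16-bit binary fraction
 * used in /proc (0x8000 corresponds to 100%), prorating over the cpus
 * online in the examiner's zone so the result cannot exceed 100%.
 */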
2472 ushort_t
2473 prgetpctcpu(uint64_t pct)
2474 {
2475 /*
2476 * The value returned will be relevant in the zone of the examiner,
2477 * which may not be the same as the zone which performed the procfs
2478 * mount.
2479 */
2480 int nonline = zone_ncpus_online_get(curproc->p_zone);
2481
2482 /*
2483 * Prorate over online cpus so we don't exceed 100%
2484 */
2485 if (nonline > 1)
2486 pct /= nonline;
2487 pct >>= 16; /* convert to 16-bit scaled integer */
2488 if (pct > 0x8000) /* might happen, due to rounding */
2489 pct = 0x8000;
2490 return ((ushort_t)pct);
2491 }
2492
2493 /*
2494 * Return information used by ps(1).
2495 */
2496 void
2497 prgetpsinfo(proc_t *p, psinfo_t *psp)
2498 {
2499 kthread_t *t;
2500 struct cred *cred;
2501 hrtime_t hrutime, hrstime;
2502
2503 ASSERT(MUTEX_HELD(&p->p_lock));
2504
2505 if ((t = prchoose(p)) == NULL) /* returns locked thread */
2506 bzero(psp, sizeof (*psp));
2507 else {
2508 thread_unlock(t);
2509 bzero(psp, sizeof (*psp) - sizeof (psp->pr_lwp));
2510 }
2511
2512 /*
2513 * only export SSYS and SMSACCT; everything else is off-limits to
2514 * userland apps.
2515 */
2516 psp->pr_flag = p->p_flag & (SSYS | SMSACCT);
2517 psp->pr_nlwp = p->p_lwpcnt;
2518 psp->pr_nzomb = p->p_zombcnt;
2519 mutex_enter(&p->p_crlock);
2520 cred = p->p_cred;
2521 psp->pr_uid = crgetruid(cred);
2522 psp->pr_euid = crgetuid(cred);
2523 psp->pr_gid = crgetrgid(cred);
2524 psp->pr_egid = crgetgid(cred);
2525 mutex_exit(&p->p_crlock);
2526 psp->pr_pid = p->p_pid;
2527 if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
2528 (p->p_flag & SZONETOP)) {
2529 ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
2530 /*
2531 * Inside local zones, fake zsched's pid as parent pids for
2532 * processes which reference processes outside of the zone.
2533 */
2534 psp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
2535 } else {
2536 psp->pr_ppid = p->p_ppid;
2537 }
2538 psp->pr_pgid = p->p_pgrp;
2539 psp->pr_sid = p->p_sessp->s_sid;
2540 psp->pr_taskid = p->p_task->tk_tkid;
2541 psp->pr_projid = p->p_task->tk_proj->kpj_id;
2542 psp->pr_poolid = p->p_pool->pool_id;
2543 psp->pr_zoneid = p->p_zone->zone_id;
2544 if ((psp->pr_contract = PRCTID(p)) == 0)
2545 psp->pr_contract = -1;
2546 psp->pr_addr = (uintptr_t)prgetpsaddr(p);
2547 switch (p->p_model) {
2548 case DATAMODEL_ILP32:
2549 psp->pr_dmodel = PR_MODEL_ILP32;
2550 break;
2551 case DATAMODEL_LP64:
2552 psp->pr_dmodel = PR_MODEL_LP64;
2553 break;
2554 }
2555 hrutime = mstate_aggr_state(p, LMS_USER);
2556 hrstime = mstate_aggr_state(p, LMS_SYSTEM);
2557 hrt2ts((hrutime + hrstime), &psp->pr_time);
2558 TICK_TO_TIMESTRUC(p->p_cutime + p->p_cstime, &psp->pr_ctime);
2559
2560 if (t == NULL) {
2561 int wcode = p->p_wcode; /* must be atomic read */
2562
2563 if (wcode)
2564 psp->pr_wstat = wstat(wcode, p->p_wdata);
2565 psp->pr_ttydev = PRNODEV;
2566 psp->pr_lwp.pr_state = SZOMB;
2567 psp->pr_lwp.pr_sname = 'Z';
2568 psp->pr_lwp.pr_bindpro = PBIND_NONE;
2569 psp->pr_lwp.pr_bindpset = PS_NONE;
2570 } else {
2571 user_t *up = PTOU(p);
2572 struct as *as;
2573 dev_t d;
2574 extern dev_t rwsconsdev, rconsdev, uconsdev;
2575
2576 d = cttydev(p);
2577 /*
2578 * If the controlling terminal is the real
2579 * or workstation console device, map to what the
2580 * user thinks is the console device. Handle case when
2581 * rwsconsdev or rconsdev is set to NODEV for Starfire.
2582 */
2583 if ((d == rwsconsdev || d == rconsdev) && d != NODEV)
2584 d = uconsdev;
2585 psp->pr_ttydev = (d == NODEV) ? PRNODEV : d;
2586 psp->pr_start = up->u_start;
2587 bcopy(up->u_comm, psp->pr_fname,
2588 MIN(sizeof (up->u_comm), sizeof (psp->pr_fname)-1));
2589 bcopy(up->u_psargs, psp->pr_psargs,
2590 MIN(PRARGSZ-1, PSARGSZ));
2591 psp->pr_argc = up->u_argc;
2592 psp->pr_argv = up->u_argv;
2593 psp->pr_envp = up->u_envp;
2594
2595 /* get the chosen lwp's lwpsinfo */
2596 prgetlwpsinfo(t, &psp->pr_lwp);
2597
2598 /* compute %cpu for the process */
2599 if (p->p_lwpcnt == 1)
2600 psp->pr_pctcpu = psp->pr_lwp.pr_pctcpu;
2601 else {
2602 uint64_t pct = 0;
2603 hrtime_t cur_time = gethrtime_unscaled();
2604
2605 t = p->p_tlist;
2606 do {
2607 pct += cpu_update_pct(t, cur_time);
2608 } while ((t = t->t_forw) != p->p_tlist);
2609
2610 psp->pr_pctcpu = prgetpctcpu(pct);
2611 }
2612 if ((p->p_flag & SSYS) || (as = p->p_as) == &kas) {
2613 psp->pr_size = 0;
2614 psp->pr_rssize = 0;
2615 } else {
2616 mutex_exit(&p->p_lock);
2617 AS_LOCK_ENTER(as, RW_READER);
2618 psp->pr_size = btopr(as->a_resvsize) *
2619 (PAGESIZE / 1024);
2620 psp->pr_rssize = rm_asrss(as) * (PAGESIZE / 1024);
2621 psp->pr_pctmem = rm_pctmemory(as);
2622 AS_LOCK_EXIT(as);
2623 mutex_enter(&p->p_lock);
2624 }
2625 }
2626 }
2627
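/*
 * Append a miscellaneous item (a pr_misc_header_t followed by the value)
 * to the fdinfo data list, or, when data is NULL, just compute the space
 * the item would occupy. Returns the rounded-up item length either way.
 */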
2628 static size_t
2629 prfdinfomisc(list_t *data, uint_t type, const void *val, size_t vlen)
2630 {
2631 pr_misc_header_t *misc;
2632 size_t len;
2633
2634 len = PRFDINFO_ROUNDUP(sizeof (*misc) + vlen);
2635
2636 if (data != NULL) {
2637 misc = pr_iol_newbuf(data, len);
2638 misc->pr_misc_type = type;
2639 misc->pr_misc_size = len;
2640 misc++;
2641 bcopy((char *)val, (char *)misc, vlen);
2642 }
2643
2644 return (len);
2645 }
2646
2647 /*
2648 * There's no elegant way to determine if a character device
2649 * supports TLI, so just check a hardcoded list of known TLI
2650 * devices.
2651 */
2652
2653 static boolean_t
2654 pristli(vnode_t *vp)
2655 {
2656 static const char *tlidevs[] = {
2657 "udp", "udp6", "tcp", "tcp6"
2658 };
2659 char *devname;
2660 uint_t i;
2661
2662 ASSERT(vp != NULL);
2663
2664 if (vp->v_type != VCHR || vp->v_stream == NULL || vp->v_rdev == 0)
2665 return (B_FALSE);
2666
2667 if ((devname = mod_major_to_name(getmajor(vp->v_rdev))) == NULL)
2668 return (B_FALSE);
2669
2670 for (i = 0; i < ARRAY_SIZE(tlidevs); i++) {
2671 if (strcmp(devname, tlidevs[i]) == 0)
2672 return (B_TRUE);
2673 }
2674
2675 return (B_FALSE);
2676 }
2677
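/*
 * Add the vnode's resolved pathname to the data list as a PR_PATHNAME
 * item; returns the number of bytes added, or 0 if no path was found.
 */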
2678 static size_t
2679 prfdinfopath(proc_t *p, vnode_t *vp, list_t *data, cred_t *cred)
2680 {
2681 char *pathname;
2682 size_t pathlen;
2683 size_t sz = 0;
2684
2685 /*
2686 * The global zone's path to a file in a non-global zone can exceed
2687 * MAXPATHLEN.
2688 */
2689 pathlen = MAXPATHLEN * 2 + 1;
2690 pathname = kmem_alloc(pathlen, KM_SLEEP);
2691
2692 if (vnodetopath(NULL, vp, pathname, pathlen, cred) == 0) {
2693 sz += prfdinfomisc(data, PR_PATHNAME,
2694 pathname, strlen(pathname) + 1);
2695 }
2696
2697 kmem_free(pathname, pathlen);
2698
2699 return (sz);
2700 }
2701
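/*
 * Fetch the local and peer names of a TLI/XTI endpoint using the
 * TI_GETMYNAME and TI_GETPEERNAME stream ioctls and add them to the
 * data list.
 */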
2702 static size_t
2703 prfdinfotlisockopt(vnode_t *vp, list_t *data, cred_t *cred)
2704 {
2705 strcmd_t strcmd;
2706 int32_t rval;
2707 size_t sz = 0;
2708
2709 strcmd.sc_cmd = TI_GETMYNAME;
2710 strcmd.sc_timeout = 1;
2711 strcmd.sc_len = STRCMDBUFSIZE;
2712
2713 if (VOP_IOCTL(vp, _I_CMD, (intptr_t)&strcmd, FKIOCTL, cred,
2714 &rval, NULL) == 0 && strcmd.sc_len > 0) {
2715 sz += prfdinfomisc(data, PR_SOCKETNAME, strcmd.sc_buf,
2716 strcmd.sc_len);
2717 }
2718
2719 strcmd.sc_cmd = TI_GETPEERNAME;
2720 strcmd.sc_timeout = 1;
2721 strcmd.sc_len = STRCMDBUFSIZE;
2722
2723 if (VOP_IOCTL(vp, _I_CMD, (intptr_t)&strcmd, FKIOCTL, cred,
2724 &rval, NULL) == 0 && strcmd.sc_len > 0) {
2725 sz += prfdinfomisc(data, PR_PEERSOCKNAME, strcmd.sc_buf,
2726 strcmd.sc_len);
2727 }
2728
2729 return (sz);
2730 }
2731
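/*
 * Collect the socket names, boolean and integer socket options, and any
 * attached socket filters for a socket vnode, adding each as a
 * miscellaneous item (or just sizing them when data is NULL).
 */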
2732 static size_t
2733 prfdinfosockopt(vnode_t *vp, list_t *data, cred_t *cred)
2734 {
2735 sonode_t *so;
2736 socklen_t vlen;
2737 size_t sz = 0;
2738 uint_t i;
2739
2740 if (vp->v_stream != NULL) {
2741 so = VTOSO(vp->v_stream->sd_vnode);
2742
2743 if (so->so_version == SOV_STREAM)
2744 so = NULL;
2745 } else {
2746 so = VTOSO(vp);
2747 }
2748
2749 if (so == NULL)
2750 return (0);
2751
2752 DTRACE_PROBE1(sonode, sonode_t *, so);
2753
2754 /* prmisc - PR_SOCKETNAME */
2755
2756 struct sockaddr_storage buf;
2757 struct sockaddr *name = (struct sockaddr *)&buf;
2758
2759 vlen = sizeof (buf);
2760 if (SOP_GETSOCKNAME(so, name, &vlen, cred) == 0 && vlen > 0)
2761 sz += prfdinfomisc(data, PR_SOCKETNAME, name, vlen);
2762
2763 /* prmisc - PR_PEERSOCKNAME */
2764
2765 vlen = sizeof (buf);
2766 if (SOP_GETPEERNAME(so, name, &vlen, B_FALSE, cred) == 0 && vlen > 0)
2767 sz += prfdinfomisc(data, PR_PEERSOCKNAME, name, vlen);
2768
2769 /* prmisc - PR_SOCKOPTS_BOOL_OPTS */
2770
2771 static struct boolopt {
2772 int level;
2773 int opt;
2774 int bopt;
2775 } boolopts[] = {
2776 { SOL_SOCKET, SO_DEBUG, PR_SO_DEBUG },
2777 { SOL_SOCKET, SO_REUSEADDR, PR_SO_REUSEADDR },
2778 #ifdef SO_REUSEPORT
2779 /* SmartOS and OmniOS have SO_REUSEPORT */
2780 { SOL_SOCKET, SO_REUSEPORT, PR_SO_REUSEPORT },
2781 #endif
2782 { SOL_SOCKET, SO_KEEPALIVE, PR_SO_KEEPALIVE },
2783 { SOL_SOCKET, SO_DONTROUTE, PR_SO_DONTROUTE },
2784 { SOL_SOCKET, SO_BROADCAST, PR_SO_BROADCAST },
2785 { SOL_SOCKET, SO_OOBINLINE, PR_SO_OOBINLINE },
2786 { SOL_SOCKET, SO_DGRAM_ERRIND, PR_SO_DGRAM_ERRIND },
2787 { SOL_SOCKET, SO_ALLZONES, PR_SO_ALLZONES },
2788 { SOL_SOCKET, SO_MAC_EXEMPT, PR_SO_MAC_EXEMPT },
2789 { SOL_SOCKET, SO_MAC_IMPLICIT, PR_SO_MAC_IMPLICIT },
2790 { SOL_SOCKET, SO_EXCLBIND, PR_SO_EXCLBIND },
2791 { SOL_SOCKET, SO_VRRP, PR_SO_VRRP },
2792 { IPPROTO_UDP, UDP_NAT_T_ENDPOINT,
2793 PR_UDP_NAT_T_ENDPOINT }
2794 };
2795 prsockopts_bool_opts_t opts;
2796 int val;
2797
2798 if (data != NULL) {
2799 opts.prsock_bool_opts = 0;
2800
2801 for (i = 0; i < ARRAY_SIZE(boolopts); i++) {
2802 vlen = sizeof (val);
2803 if (SOP_GETSOCKOPT(so, boolopts[i].level,
2804 boolopts[i].opt, &val, &vlen, 0, cred) == 0 &&
2805 val != 0) {
2806 opts.prsock_bool_opts |= boolopts[i].bopt;
2807 }
2808 }
2809 }
2810
2811 sz += prfdinfomisc(data, PR_SOCKOPTS_BOOL_OPTS, &opts, sizeof (opts));
2812
2813 /* prmisc - PR_SOCKOPT_LINGER */
2814
2815 struct linger l;
2816
2817 vlen = sizeof (l);
2818 if (SOP_GETSOCKOPT(so, SOL_SOCKET, SO_LINGER, &l, &vlen,
2819 0, cred) == 0 && vlen > 0) {
2820 sz += prfdinfomisc(data, PR_SOCKOPT_LINGER, &l, vlen);
2821 }
2822
2823 /* prmisc - PR_SOCKOPT_* int types */
2824
2825 static struct sopt {
2826 int level;
2827 int opt;
2828 int bopt;
2829 } sopts[] = {
2830 { SOL_SOCKET, SO_TYPE, PR_SOCKOPT_TYPE },
2831 { SOL_SOCKET, SO_SNDBUF, PR_SOCKOPT_SNDBUF },
2832 { SOL_SOCKET, SO_RCVBUF, PR_SOCKOPT_RCVBUF }
2833 };
2834
2835 for (i = 0; i < ARRAY_SIZE(sopts); i++) {
2836 vlen = sizeof (val);
2837 if (SOP_GETSOCKOPT(so, sopts[i].level, sopts[i].opt,
2838 &val, &vlen, 0, cred) == 0 && vlen > 0) {
2839 sz += prfdinfomisc(data, sopts[i].bopt, &val, vlen);
2840 }
2841 }
2842
2843 /* prmisc - PR_SOCKOPT_IP_NEXTHOP */
2844
2845 in_addr_t nexthop_val;
2846
2847 vlen = sizeof (nexthop_val);
2848 if (SOP_GETSOCKOPT(so, IPPROTO_IP, IP_NEXTHOP,
2849 &nexthop_val, &vlen, 0, cred) == 0 && vlen > 0) {
2850 sz += prfdinfomisc(data, PR_SOCKOPT_IP_NEXTHOP,
2851 &nexthop_val, vlen);
2852 }
2853
2854 /* prmisc - PR_SOCKOPT_IPV6_NEXTHOP */
2855
2856 struct sockaddr_in6 nexthop6_val;
2857
2858 vlen = sizeof (nexthop6_val);
2859 if (SOP_GETSOCKOPT(so, IPPROTO_IPV6, IPV6_NEXTHOP,
2860 &nexthop6_val, &vlen, 0, cred) == 0 && vlen > 0) {
2861 sz += prfdinfomisc(data, PR_SOCKOPT_IPV6_NEXTHOP,
2862 &nexthop6_val, vlen);
2863 }
2864
2865 /* prmisc - PR_SOCKOPT_TCP_CONGESTION */
2866
2867 char cong[CC_ALGO_NAME_MAX];
2868
2869 vlen = sizeof (cong);
2870 if (SOP_GETSOCKOPT(so, IPPROTO_TCP, TCP_CONGESTION,
2871 &cong, &vlen, 0, cred) == 0 && vlen > 0) {
2872 sz += prfdinfomisc(data, PR_SOCKOPT_TCP_CONGESTION, cong, vlen);
2873 }
2874
2875 /* prmisc - PR_SOCKFILTERS_PRIV */
2876
2877 struct fil_info fi;
2878
2879 vlen = sizeof (fi);
2880 if (SOP_GETSOCKOPT(so, SOL_FILTER, FIL_LIST,
2881 &fi, &vlen, 0, cred) == 0 && vlen != 0) {
2882 pr_misc_header_t *misc;
2883 size_t len;
2884
2885 /*
2886 * We limit the number of returned filters to 32.
2887 * This is the maximum number that pfiles will print
2888 * anyway.
2889 */
2890 vlen = MIN(32, fi.fi_pos + 1);
2891 vlen *= sizeof (fi);
2892
2893 len = PRFDINFO_ROUNDUP(sizeof (*misc) + vlen);
2894 sz += len;
2895
2896 if (data != NULL) {
2897 /*
2898 * So that the filter list can be built incrementally,
2899 * prfdinfomisc() is not used here. Instead we
2900 * allocate a buffer directly on the copyout list using
2901 * pr_iol_newbuf().
2902 */
2903 misc = pr_iol_newbuf(data, len);
2904 misc->pr_misc_type = PR_SOCKFILTERS_PRIV;
2905 misc->pr_misc_size = len;
2906 misc++;
2907 len = vlen;
2908 if (SOP_GETSOCKOPT(so, SOL_FILTER, FIL_LIST,
2909 misc, &vlen, 0, cred) == 0) {
2910 /*
2911 * In case the number of filters has reduced
2912 * since the first call, explicitly zero out
2913 * any unpopulated space.
2914 */
2915 if (vlen < len)
2916 bzero((char *)misc + vlen, len - vlen);
2917 } else {
2918 /* Something went wrong, zero out the result */
2919 bzero(misc, vlen);
2920 }
2921 }
2922 }
2923
2924 return (sz);
2925 }
2926
2927 typedef struct prfdinfo_nm_path_cbdata {
2928 proc_t *nmp_p;
2929 u_offset_t nmp_sz;
2930 list_t *nmp_data;
2931 } prfdinfo_nm_path_cbdata_t;
2932
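/*
 * nm_walk_mounts() callback: accumulate pathname data for the vnode
 * underlying a namefs (door) mount.
 */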
2933 static int
2934 prfdinfo_nm_path(const struct namenode *np, cred_t *cred, void *arg)
2935 {
2936 prfdinfo_nm_path_cbdata_t *cb = arg;
2937
2938 cb->nmp_sz += prfdinfopath(cb->nmp_p, np->nm_vnode, cb->nmp_data, cred);
2939
2940 return (0);
2941 }
2942
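/*
 * Compute the size of the fdinfo file for a vnode: the fixed prfdinfo_t
 * header plus all variable-length items and the zero-length trailer.
 */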
2943 u_offset_t
2944 prgetfdinfosize(proc_t *p, vnode_t *vp, cred_t *cred)
2945 {
2946 u_offset_t sz;
2947
2948 /*
2949 * All fdinfo files will be at least this big -
2950 * sizeof fdinfo struct + zero length trailer
2951 */
2952 sz = offsetof(prfdinfo_t, pr_misc) + sizeof (pr_misc_header_t);
2953
2954 /* Pathname */
2955 switch (vp->v_type) {
2956 case VDOOR: {
2957 prfdinfo_nm_path_cbdata_t cb = {
2958 .nmp_p = p,
2959 .nmp_data = NULL,
2960 .nmp_sz = 0
2961 };
2962
2963 (void) nm_walk_mounts(vp, prfdinfo_nm_path, cred, &cb);
2964 sz += cb.nmp_sz;
2965 break;
2966 }
2967 case VSOCK:
2968 break;
2969 default:
2970 sz += prfdinfopath(p, vp, NULL, cred);
2971 }
2972
2973 /* Socket options */
2974 if (vp->v_type == VSOCK)
2975 sz += prfdinfosockopt(vp, NULL, cred);
2976
2977 /* TLI/XTI sockets */
2978 if (pristli(vp))
2979 sz += prfdinfotlisockopt(vp, NULL, cred);
2980
2981 return (sz);
2982 }
2983
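/*
 * Fill in the fixed portion of a prfdinfo_t (attributes, locks, peer
 * credentials) and append the variable-length items (pathname, socket
 * options, TLI names) to the data list, terminated by a zero-sized
 * miscellaneous header.
 */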
2984 int
2985 prgetfdinfo(proc_t *p, vnode_t *vp, prfdinfo_t *fdinfo, cred_t *cred,
2986 cred_t *file_cred, list_t *data)
2987 {
2988 vattr_t vattr;
2989 int error;
2990
2991 /*
2992 * The buffer has been initialised to zero by pr_iol_newbuf().
2993 * Initialise defaults for any values that should not default to zero.
2994 */
2995 fdinfo->pr_uid = (uid_t)-1;
2996 fdinfo->pr_gid = (gid_t)-1;
2997 fdinfo->pr_size = -1;
2998 fdinfo->pr_locktype = F_UNLCK;
2999 fdinfo->pr_lockpid = -1;
3000 fdinfo->pr_locksysid = -1;
3001 fdinfo->pr_peerpid = -1;
3002
3003 /* Offset */
3004
3005 /*
3006 * pr_offset has already been set from the underlying file_t.
3007 * Check if it is plausible and reset to -1 if not.
3008 */
3009 if (fdinfo->pr_offset != -1 &&
3010 VOP_SEEK(vp, 0, (offset_t *)&fdinfo->pr_offset, NULL) != 0)
3011 fdinfo->pr_offset = -1;
3012
3013 /*
3014 * Attributes
3015 *
3016 * We have two cred_t structures available here.
3017 * 'cred' is the caller's credential, and 'file_cred' is the credential
3018 * for the file being inspected.
3019 *
3020 * When looking up the file attributes, file_cred is used in order
3021 * that the correct ownership is set for doors and FIFOs. Since the
3022 * caller has permission to read the fdinfo file in proc, this does
3023 * not expose any additional information.
3024 */
3025 vattr.va_mask = AT_STAT;
3026 if (VOP_GETATTR(vp, &vattr, 0, file_cred, NULL) == 0) {
3027 fdinfo->pr_major = getmajor(vattr.va_fsid);
3028 fdinfo->pr_minor = getminor(vattr.va_fsid);
3029 fdinfo->pr_rmajor = getmajor(vattr.va_rdev);
3030 fdinfo->pr_rminor = getminor(vattr.va_rdev);
3031 fdinfo->pr_ino = (ino64_t)vattr.va_nodeid;
3032 fdinfo->pr_size = (off64_t)vattr.va_size;
3033 fdinfo->pr_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
3034 fdinfo->pr_uid = vattr.va_uid;
3035 fdinfo->pr_gid = vattr.va_gid;
3036 if (vp->v_type == VSOCK)
3037 fdinfo->pr_fileflags |= sock_getfasync(vp);
3038 }
3039
3040 /* locks */
3041
3042 flock64_t bf;
3043
3044 bzero(&bf, sizeof (bf));
3045 bf.l_type = F_WRLCK;
3046
3047 if (VOP_FRLOCK(vp, F_GETLK, &bf,
3048 (uint16_t)(fdinfo->pr_fileflags & 0xffff), 0, NULL,
3049 cred, NULL) == 0 && bf.l_type != F_UNLCK) {
3050 fdinfo->pr_locktype = bf.l_type;
3051 fdinfo->pr_lockpid = bf.l_pid;
3052 fdinfo->pr_locksysid = bf.l_sysid;
3053 }
3054
3055 /* peer cred */
3056
3057 k_peercred_t kpc;
3058
3059 switch (vp->v_type) {
3060 case VFIFO:
3061 case VSOCK: {
3062 int32_t rval;
3063
3064 error = VOP_IOCTL(vp, _I_GETPEERCRED, (intptr_t)&kpc,
3065 FKIOCTL, cred, &rval, NULL);
3066 break;
3067 }
3068 case VCHR: {
3069 struct strioctl strioc;
3070 int32_t rval;
3071
3072 if (vp->v_stream == NULL) {
3073 error = ENOTSUP;
3074 break;
3075 }
3076 strioc.ic_cmd = _I_GETPEERCRED;
3077 strioc.ic_timout = INFTIM;
3078 strioc.ic_len = (int)sizeof (k_peercred_t);
3079 strioc.ic_dp = (char *)&kpc;
3080
3081 error = strdoioctl(vp->v_stream, &strioc, FNATIVE | FKIOCTL,
3082 STR_NOSIG | K_TO_K, cred, &rval);
3083 break;
3084 }
3085 default:
3086 error = ENOTSUP;
3087 break;
3088 }
3089
3090 if (error == 0 && kpc.pc_cr != NULL) {
3091 proc_t *peerp;
3092
3093 fdinfo->pr_peerpid = kpc.pc_cpid;
3094
3095 crfree(kpc.pc_cr);
3096
3097 mutex_enter(&pidlock);
3098 if ((peerp = prfind(fdinfo->pr_peerpid)) != NULL) {
3099 user_t *up;
3100
3101 mutex_enter(&peerp->p_lock);
3102 mutex_exit(&pidlock);
3103
3104 up = PTOU(peerp);
3105 bcopy(up->u_comm, fdinfo->pr_peername,
3106 MIN(sizeof (up->u_comm),
3107 sizeof (fdinfo->pr_peername) - 1));
3108
3109 mutex_exit(&peerp->p_lock);
3110 } else {
3111 mutex_exit(&pidlock);
3112 }
3113 }
3114
3115 /* pathname */
3116
3117 switch (vp->v_type) {
3118 case VDOOR: {
3119 prfdinfo_nm_path_cbdata_t cb = {
3120 .nmp_p = p,
3121 .nmp_data = data,
3122 .nmp_sz = 0
3123 };
3124
3125 (void) nm_walk_mounts(vp, prfdinfo_nm_path, cred, &cb);
3126 break;
3127 }
3128 case VSOCK:
3129 /*
3130 * Don't attempt to determine the path for a socket as the
3131 * vnode has no associated v_path. It will cause a linear scan
3132 * of the dnlc table and result in no path being found.
3133 */
3134 break;
3135 default:
3136 (void) prfdinfopath(p, vp, data, cred);
3137 }
3138
3139 /* socket options */
3140 if (vp->v_type == VSOCK)
3141 (void) prfdinfosockopt(vp, data, cred);
3142
3143 /* TLI/XTI stream sockets */
3144 if (pristli(vp))
3145 (void) prfdinfotlisockopt(vp, data, cred);
3146
3147 /*
3148 * Add a terminating header with a zero size.
3149 */
3150 pr_misc_header_t *misc;
3151
3152 misc = pr_iol_newbuf(data, sizeof (*misc));
3153 misc->pr_misc_size = 0;
3154 misc->pr_misc_type = (uint_t)-1;
3155
3156 return (0);
3157 }
3158
3159 #ifdef _SYSCALL32_IMPL
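/*
 * Return the ILP32 form of the information used by ps(1).
 */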
3160 void
3161 prgetpsinfo32(proc_t *p, psinfo32_t *psp)
3162 {
3163 kthread_t *t;
3164 struct cred *cred;
3165 hrtime_t hrutime, hrstime;
3166
3167 ASSERT(MUTEX_HELD(&p->p_lock));
3168
3169 if ((t = prchoose(p)) == NULL) /* returns locked thread */
3170 bzero(psp, sizeof (*psp));
3171 else {
3172 thread_unlock(t);
3173 bzero(psp, sizeof (*psp) - sizeof (psp->pr_lwp));
3174 }
3175
3176 /*
3177 * only export SSYS and SMSACCT; everything else is off-limits to
3178 * userland apps.
3179 */
3180 psp->pr_flag = p->p_flag & (SSYS | SMSACCT);
3181 psp->pr_nlwp = p->p_lwpcnt;
3182 psp->pr_nzomb = p->p_zombcnt;
3183 mutex_enter(&p->p_crlock);
3184 cred = p->p_cred;
3185 psp->pr_uid = crgetruid(cred);
3186 psp->pr_euid = crgetuid(cred);
3187 psp->pr_gid = crgetrgid(cred);
3188 psp->pr_egid = crgetgid(cred);
3189 mutex_exit(&p->p_crlock);
3190 psp->pr_pid = p->p_pid;
3191 if (curproc->p_zone->zone_id != GLOBAL_ZONEID &&
3192 (p->p_flag & SZONETOP)) {
3193 ASSERT(p->p_zone->zone_id != GLOBAL_ZONEID);
3194 /*
3195 * Inside local zones, fake zsched's pid as parent pids for
3196 * processes which reference processes outside of the zone.
3197 */
3198 psp->pr_ppid = curproc->p_zone->zone_zsched->p_pid;
3199 } else {
3200 psp->pr_ppid = p->p_ppid;
3201 }
3202 psp->pr_pgid = p->p_pgrp;
3203 psp->pr_sid = p->p_sessp->s_sid;
3204 psp->pr_taskid = p->p_task->tk_tkid;
3205 psp->pr_projid = p->p_task->tk_proj->kpj_id;
3206 psp->pr_poolid = p->p_pool->pool_id;
3207 psp->pr_zoneid = p->p_zone->zone_id;
3208 if ((psp->pr_contract = PRCTID(p)) == 0)
3209 psp->pr_contract = -1;
3210 psp->pr_addr = 0; /* cannot represent 64-bit addr in 32 bits */
3211 switch (p->p_model) {
3212 case DATAMODEL_ILP32:
3213 psp->pr_dmodel = PR_MODEL_ILP32;
3214 break;
3215 case DATAMODEL_LP64:
3216 psp->pr_dmodel = PR_MODEL_LP64;
3217 break;
3218 }
3219 hrutime = mstate_aggr_state(p, LMS_USER);
3220 hrstime = mstate_aggr_state(p, LMS_SYSTEM);
3221 hrt2ts32(hrutime + hrstime, &psp->pr_time);
3222 TICK_TO_TIMESTRUC32(p->p_cutime + p->p_cstime, &psp->pr_ctime);
3223
3224 if (t == NULL) {
3225 extern int wstat(int, int); /* needs a header file */
3226 int wcode = p->p_wcode; /* must be atomic read */
3227
3228 if (wcode)
3229 psp->pr_wstat = wstat(wcode, p->p_wdata);
3230 psp->pr_ttydev = PRNODEV32;
3231 psp->pr_lwp.pr_state = SZOMB;
3232 psp->pr_lwp.pr_sname = 'Z';
3233 } else {
3234 user_t *up = PTOU(p);
3235 struct as *as;
3236 dev_t d;
3237 extern dev_t rwsconsdev, rconsdev, uconsdev;
3238
3239 d = cttydev(p);
3240 /*
3241 * If the controlling terminal is the real
3242 * or workstation console device, map to what the
3243 * user thinks is the console device. Handle case when
3244 * rwsconsdev or rconsdev is set to NODEV for Starfire.
3245 */
3246 if ((d == rwsconsdev || d == rconsdev) && d != NODEV)
3247 d = uconsdev;
3248 (void) cmpldev(&psp->pr_ttydev, d);
3249 TIMESPEC_TO_TIMESPEC32(&psp->pr_start, &up->u_start);
3250 bcopy(up->u_comm, psp->pr_fname,
3251 MIN(sizeof (up->u_comm), sizeof (psp->pr_fname)-1));
3252 bcopy(up->u_psargs, psp->pr_psargs,
3253 MIN(PRARGSZ-1, PSARGSZ));
3254 psp->pr_argc = up->u_argc;
3255 psp->pr_argv = (caddr32_t)up->u_argv;
3256 psp->pr_envp = (caddr32_t)up->u_envp;
3257
3258 /* get the chosen lwp's lwpsinfo */
3259 prgetlwpsinfo32(t, &psp->pr_lwp);
3260
3261 /* compute %cpu for the process */
3262 if (p->p_lwpcnt == 1)
3263 psp->pr_pctcpu = psp->pr_lwp.pr_pctcpu;
3264 else {
3265 uint64_t pct = 0;
3266 hrtime_t cur_time;
3267
3268 t = p->p_tlist;
3269 cur_time = gethrtime_unscaled();
3270 do {
3271 pct += cpu_update_pct(t, cur_time);
3272 } while ((t = t->t_forw) != p->p_tlist);
3273
3274 psp->pr_pctcpu = prgetpctcpu(pct);
3275 }
3276 if ((p->p_flag & SSYS) || (as = p->p_as) == &kas) {
3277 psp->pr_size = 0;
3278 psp->pr_rssize = 0;
3279 } else {
3280 mutex_exit(&p->p_lock);
3281 AS_LOCK_ENTER(as, RW_READER);
3282 psp->pr_size = (size32_t)
3283 (btopr(as->a_resvsize) * (PAGESIZE / 1024));
3284 psp->pr_rssize = (size32_t)
3285 (rm_asrss(as) * (PAGESIZE / 1024));
3286 psp->pr_pctmem = rm_pctmemory(as);
3287 AS_LOCK_EXIT(as);
3288 mutex_enter(&p->p_lock);
3289 }
3290 }
3291
3292 /*
3293 * If we are looking at an LP64 process, zero out
3294 * the fields that cannot be represented in ILP32.
3295 */
3296 if (p->p_model != DATAMODEL_ILP32) {
3297 psp->pr_size = 0;
3298 psp->pr_rssize = 0;
3299 psp->pr_argv = 0;
3300 psp->pr_envp = 0;
3301 }
3302 }
3303
3304 #endif /* _SYSCALL32_IMPL */
3305
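/*
 * Return per-lwp information used by ps(1); this fills the lwpsinfo_t
 * that prgetpsinfo() embeds for the representative lwp.
 */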
3306 void
3307 prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp)
3308 {
3309 klwp_t *lwp = ttolwp(t);
3310 sobj_ops_t *sobj;
3311 char c, state;
3312 uint64_t pct;
3313 int retval, niceval;
3314 hrtime_t hrutime, hrstime;
3315
3316 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
3317
3318 bzero(psp, sizeof (*psp));
3319
3320 psp->pr_flag = 0; /* lwpsinfo_t.pr_flag is deprecated */
3321 psp->pr_lwpid = t->t_tid;
3322 psp->pr_addr = (uintptr_t)t;
3323 psp->pr_wchan = (uintptr_t)t->t_wchan;
3324
3325 /* map the thread state enum into a process state enum */
3326 state = VSTOPPED(t) ? TS_STOPPED : t->t_state;
3327 switch (state) {
3328 case TS_SLEEP: state = SSLEEP; c = 'S'; break;
3329 case TS_RUN: state = SRUN; c = 'R'; break;
3330 case TS_ONPROC: state = SONPROC; c = 'O'; break;
3331 case TS_ZOMB: state = SZOMB; c = 'Z'; break;
3332 case TS_STOPPED: state = SSTOP; c = 'T'; break;
3333 case TS_WAIT: state = SWAIT; c = 'W'; break;
3334 default: state = 0; c = '?'; break;
3335 }
3336 psp->pr_state = state;
3337 psp->pr_sname = c;
3338 if ((sobj = t->t_sobj_ops) != NULL)
3339 psp->pr_stype = SOBJ_TYPE(sobj);
3340 retval = CL_DONICE(t, NULL, 0, &niceval);
3341 if (retval == 0) {
3342 psp->pr_oldpri = v.v_maxsyspri - t->t_pri;
3343 psp->pr_nice = niceval + NZERO;
3344 }
3345 psp->pr_syscall = t->t_sysnum;
3346 psp->pr_pri = t->t_pri;
3347 psp->pr_start.tv_sec = t->t_start;
3348 psp->pr_start.tv_nsec = 0L;
3349 hrutime = lwp->lwp_mstate.ms_acct[LMS_USER];
3350 scalehrtime(&hrutime);
3351 hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] +
3352 lwp->lwp_mstate.ms_acct[LMS_TRAP];
3353 scalehrtime(&hrstime);
3354 hrt2ts(hrutime + hrstime, &psp->pr_time);
3355 /* compute %cpu for the lwp */
3356 pct = cpu_update_pct(t, gethrtime_unscaled());
3357 psp->pr_pctcpu = prgetpctcpu(pct);
3358 psp->pr_cpu = (psp->pr_pctcpu*100 + 0x6000) >> 15; /* [0..99] */
3359 if (psp->pr_cpu > 99)
3360 psp->pr_cpu = 99;
3361
3362 (void) strncpy(psp->pr_clname, sclass[t->t_cid].cl_name,
3363 sizeof (psp->pr_clname) - 1);
3364 bzero(psp->pr_name, sizeof (psp->pr_name)); /* XXX ??? */
3365 psp->pr_onpro = t->t_cpu->cpu_id;
3366 psp->pr_bindpro = t->t_bind_cpu;
3367 psp->pr_bindpset = t->t_bind_pset;
3368 psp->pr_lgrp = t->t_lpl->lpl_lgrpid;
3369 }
3370
3371 #ifdef _SYSCALL32_IMPL
3372 void
3373 prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp)
3374 {
3375 klwp_t *lwp = ttolwp(t);
3376 sobj_ops_t *sobj;
3377 char c, state;
3378 uint64_t pct;
3379 int retval, niceval;
3380 hrtime_t hrutime, hrstime;
3381
3382 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
3383
3384 bzero(psp, sizeof (*psp));
3385
3386 psp->pr_flag = 0; /* lwpsinfo_t.pr_flag is deprecated */
3387 psp->pr_lwpid = t->t_tid;
3388 psp->pr_addr = 0; /* cannot represent 64-bit addr in 32 bits */
3389 psp->pr_wchan = 0; /* cannot represent 64-bit addr in 32 bits */
3390
3391 /* map the thread state enum into a process state enum */
3392 state = VSTOPPED(t) ? TS_STOPPED : t->t_state;
3393 switch (state) {
3394 case TS_SLEEP: state = SSLEEP; c = 'S'; break;
3395 case TS_RUN: state = SRUN; c = 'R'; break;
3396 case TS_ONPROC: state = SONPROC; c = 'O'; break;
3397 case TS_ZOMB: state = SZOMB; c = 'Z'; break;
3398 case TS_STOPPED: state = SSTOP; c = 'T'; break;
3399 case TS_WAIT: state = SWAIT; c = 'W'; break;
3400 default: state = 0; c = '?'; break;
3401 }
3402 psp->pr_state = state;
3403 psp->pr_sname = c;
3404 if ((sobj = t->t_sobj_ops) != NULL)
3405 psp->pr_stype = SOBJ_TYPE(sobj);
3406 retval = CL_DONICE(t, NULL, 0, &niceval);
3407 if (retval == 0) {
3408 psp->pr_oldpri = v.v_maxsyspri - t->t_pri;
3409 psp->pr_nice = niceval + NZERO;
3410 } else {
3411 psp->pr_oldpri = 0;
3412 psp->pr_nice = 0;
3413 }
3414 psp->pr_syscall = t->t_sysnum;
3415 psp->pr_pri = t->t_pri;
3416 psp->pr_start.tv_sec = (time32_t)t->t_start;
3417 psp->pr_start.tv_nsec = 0L;
3418 hrutime = lwp->lwp_mstate.ms_acct[LMS_USER];
3419 scalehrtime(&hrutime);
3420 hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] +
3421 lwp->lwp_mstate.ms_acct[LMS_TRAP];
3422 scalehrtime(&hrstime);
3423 hrt2ts32(hrutime + hrstime, &psp->pr_time);
3424 /* compute %cpu for the lwp */
3425 pct = cpu_update_pct(t, gethrtime_unscaled());
3426 psp->pr_pctcpu = prgetpctcpu(pct);
3427 psp->pr_cpu = (psp->pr_pctcpu*100 + 0x6000) >> 15; /* [0..99] */
3428 if (psp->pr_cpu > 99)
3429 psp->pr_cpu = 99;
3430
3431 (void) strncpy(psp->pr_clname, sclass[t->t_cid].cl_name,
3432 sizeof (psp->pr_clname) - 1);
3433 bzero(psp->pr_name, sizeof (psp->pr_name)); /* XXX ??? */
3434 psp->pr_onpro = t->t_cpu->cpu_id;
3435 psp->pr_bindpro = t->t_bind_cpu;
3436 psp->pr_bindpset = t->t_bind_pset;
3437 psp->pr_lgrp = t->t_lpl->lpl_lgrpid;
3438 }
3439 #endif /* _SYSCALL32_IMPL */
3440
3441 #ifdef _SYSCALL32_IMPL
3442
3443 #define PR_COPY_FIELD(s, d, field) d->field = s->field
3444
3445 #define PR_COPY_FIELD_ILP32(s, d, field) \
3446 if (s->pr_dmodel == PR_MODEL_ILP32) { \
3447 d->field = s->field; \
3448 }
3449
3450 #define PR_COPY_TIMESPEC(s, d, field) \
3451 TIMESPEC_TO_TIMESPEC32(&d->field, &s->field);
3452
3453 #define PR_COPY_BUF(s, d, field) \
3454 bcopy(s->field, d->field, sizeof (d->field));
3455
3456 #define PR_IGNORE_FIELD(s, d, field)
3457
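/*
 * Field-by-field translators from the native structures to their ILP32
 * counterparts. Pointer-sized fields are either zeroed or copied only
 * for ILP32 processes, since they cannot be represented in 32 bits
 * otherwise.
 */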
3458 void
3459 lwpsinfo_kto32(const struct lwpsinfo *src, struct lwpsinfo32 *dest)
3460 {
3461 bzero(dest, sizeof (*dest));
3462
3463 PR_COPY_FIELD(src, dest, pr_flag);
3464 PR_COPY_FIELD(src, dest, pr_lwpid);
3465 PR_IGNORE_FIELD(src, dest, pr_addr);
3466 PR_IGNORE_FIELD(src, dest, pr_wchan);
3467 PR_COPY_FIELD(src, dest, pr_stype);
3468 PR_COPY_FIELD(src, dest, pr_state);
3469 PR_COPY_FIELD(src, dest, pr_sname);
3470 PR_COPY_FIELD(src, dest, pr_nice);
3471 PR_COPY_FIELD(src, dest, pr_syscall);
3472 PR_COPY_FIELD(src, dest, pr_oldpri);
3473 PR_COPY_FIELD(src, dest, pr_cpu);
3474 PR_COPY_FIELD(src, dest, pr_pri);
3475 PR_COPY_FIELD(src, dest, pr_pctcpu);
3476 PR_COPY_TIMESPEC(src, dest, pr_start);
3477 PR_COPY_BUF(src, dest, pr_clname);
3478 PR_COPY_BUF(src, dest, pr_name);
3479 PR_COPY_FIELD(src, dest, pr_onpro);
3480 PR_COPY_FIELD(src, dest, pr_bindpro);
3481 PR_COPY_FIELD(src, dest, pr_bindpset);
3482 PR_COPY_FIELD(src, dest, pr_lgrp);
3483 }
3484
3485 void
3486 psinfo_kto32(const struct psinfo *src, struct psinfo32 *dest)
3487 {
3488 bzero(dest, sizeof (*dest));
3489
3490 PR_COPY_FIELD(src, dest, pr_flag);
3491 PR_COPY_FIELD(src, dest, pr_nlwp);
3492 PR_COPY_FIELD(src, dest, pr_pid);
3493 PR_COPY_FIELD(src, dest, pr_ppid);
3494 PR_COPY_FIELD(src, dest, pr_pgid);
3495 PR_COPY_FIELD(src, dest, pr_sid);
3496 PR_COPY_FIELD(src, dest, pr_uid);
3497 PR_COPY_FIELD(src, dest, pr_euid);
3498 PR_COPY_FIELD(src, dest, pr_gid);
3499 PR_COPY_FIELD(src, dest, pr_egid);
3500 PR_IGNORE_FIELD(src, dest, pr_addr);
3501 PR_COPY_FIELD_ILP32(src, dest, pr_size);
3502 PR_COPY_FIELD_ILP32(src, dest, pr_rssize);
3503 PR_COPY_FIELD(src, dest, pr_ttydev);
3504 PR_COPY_FIELD(src, dest, pr_pctcpu);
3505 PR_COPY_FIELD(src, dest, pr_pctmem);
3506 PR_COPY_TIMESPEC(src, dest, pr_start);
3507 PR_COPY_TIMESPEC(src, dest, pr_time);
3508 PR_COPY_TIMESPEC(src, dest, pr_ctime);
3509 PR_COPY_BUF(src, dest, pr_fname);
3510 PR_COPY_BUF(src, dest, pr_psargs);
3511 PR_COPY_FIELD(src, dest, pr_wstat);
3512 PR_COPY_FIELD(src, dest, pr_argc);
3513 PR_COPY_FIELD_ILP32(src, dest, pr_argv);
3514 PR_COPY_FIELD_ILP32(src, dest, pr_envp);
3515 PR_COPY_FIELD(src, dest, pr_dmodel);
3516 PR_COPY_FIELD(src, dest, pr_taskid);
3517 PR_COPY_FIELD(src, dest, pr_projid);
3518 PR_COPY_FIELD(src, dest, pr_nzomb);
3519 PR_COPY_FIELD(src, dest, pr_poolid);
3520 PR_COPY_FIELD(src, dest, pr_contract);
3521 PR_COPY_FIELD(src, dest, pr_zoneid);
3523
3524 lwpsinfo_kto32(&src->pr_lwp, &dest->pr_lwp);
3525 }
3526
3527 #undef PR_COPY_FIELD
3528 #undef PR_COPY_FIELD_ILP32
3529 #undef PR_COPY_TIMESPEC
3530 #undef PR_COPY_BUF
3531 #undef PR_IGNORE_FIELD
3532
3533 #endif /* _SYSCALL32_IMPL */
3534
3535 /*
3536 * This used to get called when microstate accounting was disabled but
3537 * microstate information was requested. Since microstate accounting is on
3538 * regardless of the proc flags, this simply makes it appear to procfs that
3539 * microstate accounting is on. This is relatively meaningless since you
3540 * can't turn it off, but this is here for the sake of appearances.
3541 */
3542
3543 /*ARGSUSED*/
3544 void
3545 estimate_msacct(kthread_t *t, hrtime_t curtime)
3546 {
3547 proc_t *p;
3548
3549 if (t == NULL)
3550 return;
3551
3552 p = ttoproc(t);
3553 ASSERT(MUTEX_HELD(&p->p_lock));
3554
3555 /*
3556 * A system process (p0) could be referenced if the thread is
3557 * in the process of exiting. Don't turn on microstate accounting
3558 * in that case.
3559 */
3560 if (p->p_flag & SSYS)
3561 return;
3562
3563 /*
3564 * Loop through all the LWPs (kernel threads) in the process.
3565 */
3566 t = p->p_tlist;
3567 do {
3568 t->t_proc_flag |= TP_MSACCT;
3569 } while ((t = t->t_forw) != p->p_tlist);
3570
3571 p->p_flag |= SMSACCT; /* set process-wide MSACCT */
3572 }
3573
3574 /*
3575 * It's not really possible to disable microstate accounting anymore.
3576 * However, this routine simply turns off the ms accounting flags in a process.
3577 * This way procfs can still pretend to turn microstate accounting on and
3578 * off for a process, but it actually doesn't do anything. This is
3579 * a neutered form of preemptive idiot-proofing.
3580 */
3581 void
3582 disable_msacct(proc_t *p)
3583 {
3584 kthread_t *t;
3585
3586 ASSERT(MUTEX_HELD(&p->p_lock));
3587
3588 p->p_flag &= ~SMSACCT; /* clear process-wide MSACCT */
3589 /*
3590 * Loop through all the LWPs (kernel threads) in the process.
3591 */
3592 if ((t = p->p_tlist) != NULL) {
3593 do {
3594 /* clear per-thread flag */
3595 t->t_proc_flag &= ~TP_MSACCT;
3596 } while ((t = t->t_forw) != p->p_tlist);
3597 }
3598 }
3599
3600 /*
3601 * Return resource usage information.
3602 */
3603 void
3604 prgetusage(kthread_t *t, prhusage_t *pup)
3605 {
3606 klwp_t *lwp = ttolwp(t);
3607 hrtime_t *mstimep;
3608 struct mstate *ms = &lwp->lwp_mstate;
3609 int state;
3610 int i;
3611 hrtime_t curtime;
3612 hrtime_t waitrq;
3613 hrtime_t tmp1;
3614
3615 curtime = gethrtime_unscaled();
3616
3617 pup->pr_lwpid = t->t_tid;
3618 pup->pr_count = 1;
3619 pup->pr_create = ms->ms_start;
3620 pup->pr_term = ms->ms_term;
3621 scalehrtime(&pup->pr_create);
3622 scalehrtime(&pup->pr_term);
3623 if (ms->ms_term == 0) {
3624 pup->pr_rtime = curtime - ms->ms_start;
3625 scalehrtime(&pup->pr_rtime);
3626 } else {
3627 pup->pr_rtime = ms->ms_term - ms->ms_start;
3628 scalehrtime(&pup->pr_rtime);
3629 }
3630
3631
3632 pup->pr_utime = ms->ms_acct[LMS_USER];
3633 pup->pr_stime = ms->ms_acct[LMS_SYSTEM];
3634 pup->pr_ttime = ms->ms_acct[LMS_TRAP];
3635 pup->pr_tftime = ms->ms_acct[LMS_TFAULT];
3636 pup->pr_dftime = ms->ms_acct[LMS_DFAULT];
3637 pup->pr_kftime = ms->ms_acct[LMS_KFAULT];
3638 pup->pr_ltime = ms->ms_acct[LMS_USER_LOCK];
3639 pup->pr_slptime = ms->ms_acct[LMS_SLEEP];
3640 pup->pr_wtime = ms->ms_acct[LMS_WAIT_CPU];
3641 pup->pr_stoptime = ms->ms_acct[LMS_STOPPED];
3642
3643 prscaleusage(pup);
3644
3645 /*
3646 * Adjust for time waiting in the dispatcher queue.
3647 */
3648 waitrq = t->t_waitrq; /* hopefully atomic */
3649 if (waitrq != 0) {
3650 if (waitrq > curtime) {
3651 curtime = gethrtime_unscaled();
3652 }
3653 tmp1 = curtime - waitrq;
3654 scalehrtime(&tmp1);
3655 pup->pr_wtime += tmp1;
3656 curtime = waitrq;
3657 }
3658
3659 /*
3660 * Adjust for time spent in current microstate.
3661 */
3662 if (ms->ms_state_start > curtime) {
3663 curtime = gethrtime_unscaled();
3664 }
3665
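	/*
	 * Charge the time spent in the current microstate to the proper
	 * accumulator. Unscaled hrtime is not guaranteed monotonic when
	 * read across cpus, so if the delta comes out negative, refresh
	 * curtime and retry, up to MAX_ITERS_SPIN times.
	 */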
3666 i = 0;
3667 do {
3668 switch (state = t->t_mstate) {
3669 case LMS_SLEEP:
3670 /*
3671 * Update the timer for the current sleep state.
3672 */
3673 switch (state = ms->ms_prev) {
3674 case LMS_TFAULT:
3675 case LMS_DFAULT:
3676 case LMS_KFAULT:
3677 case LMS_USER_LOCK:
3678 break;
3679 default:
3680 state = LMS_SLEEP;
3681 break;
3682 }
3683 break;
3684 case LMS_TFAULT:
3685 case LMS_DFAULT:
3686 case LMS_KFAULT:
3687 case LMS_USER_LOCK:
3688 state = LMS_SYSTEM;
3689 break;
3690 }
3691 switch (state) {
3692 case LMS_USER: mstimep = &pup->pr_utime; break;
3693 case LMS_SYSTEM: mstimep = &pup->pr_stime; break;
3694 case LMS_TRAP: mstimep = &pup->pr_ttime; break;
3695 case LMS_TFAULT: mstimep = &pup->pr_tftime; break;
3696 case LMS_DFAULT: mstimep = &pup->pr_dftime; break;
3697 case LMS_KFAULT: mstimep = &pup->pr_kftime; break;
3698 case LMS_USER_LOCK: mstimep = &pup->pr_ltime; break;
3699 case LMS_SLEEP: mstimep = &pup->pr_slptime; break;
3700 case LMS_WAIT_CPU: mstimep = &pup->pr_wtime; break;
3701 case LMS_STOPPED: mstimep = &pup->pr_stoptime; break;
3702 default: panic("prgetusage: unknown microstate");
3703 }
3704 tmp1 = curtime - ms->ms_state_start;
3705 if (tmp1 < 0) {
3706 curtime = gethrtime_unscaled();
3707 i++;
3708 continue;
3709 }
3710 scalehrtime(&tmp1);
3711 } while (tmp1 < 0 && i < MAX_ITERS_SPIN);
3712
3713 *mstimep += tmp1;
3714
3715 /* update pup timestamp */
3716 pup->pr_tstamp = curtime;
3717 scalehrtime(&pup->pr_tstamp);
3718
3719 /*
3720 * Resource usage counters.
3721 */
3722 pup->pr_minf = lwp->lwp_ru.minflt;
3723 pup->pr_majf = lwp->lwp_ru.majflt;
3724 pup->pr_nswap = lwp->lwp_ru.nswap;
3725 pup->pr_inblk = lwp->lwp_ru.inblock;
3726 pup->pr_oublk = lwp->lwp_ru.oublock;
3727 pup->pr_msnd = lwp->lwp_ru.msgsnd;
3728 pup->pr_mrcv = lwp->lwp_ru.msgrcv;
3729 pup->pr_sigs = lwp->lwp_ru.nsignals;
3730 pup->pr_vctx = lwp->lwp_ru.nvcsw;
3731 pup->pr_ictx = lwp->lwp_ru.nivcsw;
3732 pup->pr_sysc = lwp->lwp_ru.sysc;
3733 pup->pr_ioch = lwp->lwp_ru.ioch;
3734 }
3735
3736 /*
3737 * Convert ms_acct stats from unscaled high-res time to nanoseconds
3738 */
3739 void
3740 prscaleusage(prhusage_t *usg)
3741 {
3742 scalehrtime(&usg->pr_utime);
3743 scalehrtime(&usg->pr_stime);
3744 scalehrtime(&usg->pr_ttime);
3745 scalehrtime(&usg->pr_tftime);
3746 scalehrtime(&usg->pr_dftime);
3747 scalehrtime(&usg->pr_kftime);
3748 scalehrtime(&usg->pr_ltime);
3749 scalehrtime(&usg->pr_slptime);
3750 scalehrtime(&usg->pr_wtime);
3751 scalehrtime(&usg->pr_stoptime);
3752 }
3753
3754
3755 /*
3756 * Sum resource usage information.
3757 */
3758 void
3759 praddusage(kthread_t *t, prhusage_t *pup)
3760 {
3761 klwp_t *lwp = ttolwp(t);
3762 hrtime_t *mstimep;
3763 struct mstate *ms = &lwp->lwp_mstate;
3764 int state;
3765 int i;
3766 hrtime_t curtime;
3767 hrtime_t waitrq;
3768 hrtime_t tmp;
3769 prhusage_t conv;
3770
3771 curtime = gethrtime_unscaled();
3772
3773 if (ms->ms_term == 0) {
3774 tmp = curtime - ms->ms_start;
3775 scalehrtime(&tmp);
3776 pup->pr_rtime += tmp;
3777 } else {
3778 tmp = ms->ms_term - ms->ms_start;
3779 scalehrtime(&tmp);
3780 pup->pr_rtime += tmp;
3781 }
3782
3783 conv.pr_utime = ms->ms_acct[LMS_USER];
3784 conv.pr_stime = ms->ms_acct[LMS_SYSTEM];
3785 conv.pr_ttime = ms->ms_acct[LMS_TRAP];
3786 conv.pr_tftime = ms->ms_acct[LMS_TFAULT];
3787 conv.pr_dftime = ms->ms_acct[LMS_DFAULT];
3788 conv.pr_kftime = ms->ms_acct[LMS_KFAULT];
3789 conv.pr_ltime = ms->ms_acct[LMS_USER_LOCK];
3790 conv.pr_slptime = ms->ms_acct[LMS_SLEEP];
3791 conv.pr_wtime = ms->ms_acct[LMS_WAIT_CPU];
3792 conv.pr_stoptime = ms->ms_acct[LMS_STOPPED];
3793
3794 prscaleusage(&conv);
3795
3796 pup->pr_utime += conv.pr_utime;
3797 pup->pr_stime += conv.pr_stime;
3798 pup->pr_ttime += conv.pr_ttime;
3799 pup->pr_tftime += conv.pr_tftime;
3800 pup->pr_dftime += conv.pr_dftime;
3801 pup->pr_kftime += conv.pr_kftime;
3802 pup->pr_ltime += conv.pr_ltime;
3803 pup->pr_slptime += conv.pr_slptime;
3804 pup->pr_wtime += conv.pr_wtime;
3805 pup->pr_stoptime += conv.pr_stoptime;
3806
3807 /*
3808 * Adjust for time waiting in the dispatcher queue.
3809 */
3810 waitrq = t->t_waitrq; /* hopefully atomic */
3811 if (waitrq != 0) {
3812 if (waitrq > curtime) {
3813 curtime = gethrtime_unscaled();
3814 }
3815 tmp = curtime - waitrq;
3816 scalehrtime(&tmp);
3817 pup->pr_wtime += tmp;
3818 curtime = waitrq;
3819 }
3820
3821 /*
3822 * Adjust for time spent in current microstate.
3823 */
3824 if (ms->ms_state_start > curtime) {
3825 curtime = gethrtime_unscaled();
3826 }
3827
3828 i = 0;
3829 do {
3830 switch (state = t->t_mstate) {
3831 case LMS_SLEEP:
3832 /*
3833 * Update the timer for the current sleep state.
3834 */
3835 switch (state = ms->ms_prev) {
3836 case LMS_TFAULT:
3837 case LMS_DFAULT:
3838 case LMS_KFAULT:
3839 case LMS_USER_LOCK:
3840 break;
3841 default:
3842 state = LMS_SLEEP;
3843 break;
3844 }
3845 break;
3846 case LMS_TFAULT:
3847 case LMS_DFAULT:
3848 case LMS_KFAULT:
3849 case LMS_USER_LOCK:
3850 state = LMS_SYSTEM;
3851 break;
3852 }
3853 switch (state) {
3854 case LMS_USER: mstimep = &pup->pr_utime; break;
3855 case LMS_SYSTEM: mstimep = &pup->pr_stime; break;
3856 case LMS_TRAP: mstimep = &pup->pr_ttime; break;
3857 case LMS_TFAULT: mstimep = &pup->pr_tftime; break;
3858 case LMS_DFAULT: mstimep = &pup->pr_dftime; break;
3859 case LMS_KFAULT: mstimep = &pup->pr_kftime; break;
3860 case LMS_USER_LOCK: mstimep = &pup->pr_ltime; break;
3861 case LMS_SLEEP: mstimep = &pup->pr_slptime; break;
3862 case LMS_WAIT_CPU: mstimep = &pup->pr_wtime; break;
3863 case LMS_STOPPED: mstimep = &pup->pr_stoptime; break;
3864 default: panic("praddusage: unknown microstate");
3865 }
3866 tmp = curtime - ms->ms_state_start;
3867 if (tmp < 0) {
3868 curtime = gethrtime_unscaled();
3869 i++;
3870 continue;
3871 }
3872 scalehrtime(&tmp);
3873 } while (tmp < 0 && i < MAX_ITERS_SPIN);
3874
3875 *mstimep += tmp;
3876
3877 /* update pup timestamp */
3878 pup->pr_tstamp = curtime;
3879 scalehrtime(&pup->pr_tstamp);
3880
3881 /*
3882 * Resource usage counters.
3883 */
3884 pup->pr_minf += lwp->lwp_ru.minflt;
3885 pup->pr_majf += lwp->lwp_ru.majflt;
3886 pup->pr_nswap += lwp->lwp_ru.nswap;
3887 pup->pr_inblk += lwp->lwp_ru.inblock;
3888 pup->pr_oublk += lwp->lwp_ru.oublock;
3889 pup->pr_msnd += lwp->lwp_ru.msgsnd;
3890 pup->pr_mrcv += lwp->lwp_ru.msgrcv;
3891 pup->pr_sigs += lwp->lwp_ru.nsignals;
3892 pup->pr_vctx += lwp->lwp_ru.nvcsw;
3893 pup->pr_ictx += lwp->lwp_ru.nivcsw;
3894 pup->pr_sysc += lwp->lwp_ru.sysc;
3895 pup->pr_ioch += lwp->lwp_ru.ioch;
3896 }
3897
3898 /*
3899 * Convert a prhusage_t to a prusage_t.
3900 * This means convert each hrtime_t to a timestruc_t
3901 * and copy the count fields uint64_t => ulong_t.
3902 */
3903 void
3904 prcvtusage(prhusage_t *pup, prusage_t *upup)
3905 {
3906 uint64_t *ullp;
3907 ulong_t *ulp;
3908 int i;
3909
3910 upup->pr_lwpid = pup->pr_lwpid;
3911 upup->pr_count = pup->pr_count;
3912
3913 hrt2ts(pup->pr_tstamp, &upup->pr_tstamp);
3914 hrt2ts(pup->pr_create, &upup->pr_create);
3915 hrt2ts(pup->pr_term, &upup->pr_term);
3916 hrt2ts(pup->pr_rtime, &upup->pr_rtime);
3917 hrt2ts(pup->pr_utime, &upup->pr_utime);
3918 hrt2ts(pup->pr_stime, &upup->pr_stime);
3919 hrt2ts(pup->pr_ttime, &upup->pr_ttime);
3920 hrt2ts(pup->pr_tftime, &upup->pr_tftime);
3921 hrt2ts(pup->pr_dftime, &upup->pr_dftime);
3922 hrt2ts(pup->pr_kftime, &upup->pr_kftime);
3923 hrt2ts(pup->pr_ltime, &upup->pr_ltime);
3924 hrt2ts(pup->pr_slptime, &upup->pr_slptime);
3925 hrt2ts(pup->pr_wtime, &upup->pr_wtime);
3926 hrt2ts(pup->pr_stoptime, &upup->pr_stoptime);
3927 bzero(upup->filltime, sizeof (upup->filltime));
3928
3929 ullp = &pup->pr_minf;
3930 ulp = &upup->pr_minf;
3931 for (i = 0; i < 22; i++)
3932 *ulp++ = (ulong_t)*ullp++;
3933 }
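
/*
 * For reference, hrt2ts() above splits a scaled (nanosecond) hrtime_t
 * into a timestruc_t; the equivalent arithmetic, as a sketch, is:
 *
 *	ts->tv_sec = hrt / NANOSEC;
 *	ts->tv_nsec = hrt % NANOSEC;
 *
 * The loop bound of 22 covers the named counters from pr_minf through
 * pr_ioch plus the trailing filler words, which are contiguous in both
 * structures.
 */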
3934
3935 #ifdef _SYSCALL32_IMPL
3936 void
3937 prcvtusage32(prhusage_t *pup, prusage32_t *upup)
3938 {
3939 uint64_t *ullp;
3940 uint32_t *ulp;
3941 int i;
3942
3943 upup->pr_lwpid = pup->pr_lwpid;
3944 upup->pr_count = pup->pr_count;
3945
3946 hrt2ts32(pup->pr_tstamp, &upup->pr_tstamp);
3947 hrt2ts32(pup->pr_create, &upup->pr_create);
3948 hrt2ts32(pup->pr_term, &upup->pr_term);
3949 hrt2ts32(pup->pr_rtime, &upup->pr_rtime);
3950 hrt2ts32(pup->pr_utime, &upup->pr_utime);
3951 hrt2ts32(pup->pr_stime, &upup->pr_stime);
3952 hrt2ts32(pup->pr_ttime, &upup->pr_ttime);
3953 hrt2ts32(pup->pr_tftime, &upup->pr_tftime);
3954 hrt2ts32(pup->pr_dftime, &upup->pr_dftime);
3955 hrt2ts32(pup->pr_kftime, &upup->pr_kftime);
3956 hrt2ts32(pup->pr_ltime, &upup->pr_ltime);
3957 hrt2ts32(pup->pr_slptime, &upup->pr_slptime);
3958 hrt2ts32(pup->pr_wtime, &upup->pr_wtime);
3959 hrt2ts32(pup->pr_stoptime, &upup->pr_stoptime);
3960 bzero(upup->filltime, sizeof (upup->filltime));
3961
3962 ullp = &pup->pr_minf;
3963 ulp = &upup->pr_minf;
3964 for (i = 0; i < 22; i++)
3965 *ulp++ = (uint32_t)*ullp++;
3966 }
3967 #endif /* _SYSCALL32_IMPL */
3968
3969 /*
3970 * Determine whether a set is empty.
3971 */
3972 int
3973 setisempty(uint32_t *sp, uint_t n)
3974 {
3975 while (n--)
3976 if (*sp++)
3977 return (0);
3978 return (1);
3979 }
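
/*
 * Usage sketch (the word count depends on the set type being tested):
 *
 *	if (setisempty((uint32_t *)&p->p_sig,
 *	    sizeof (p->p_sig) / sizeof (uint32_t)))
 *		... no signals are pending ...
 */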
3980
3981 /*
3982 * Utility routine for establishing a watched area in the process.
3983 * Keep the list of watched areas sorted by virtual address.
3984 */
3985 int
3986 set_watched_area(proc_t *p, struct watched_area *pwa)
3987 {
3988 caddr_t vaddr = pwa->wa_vaddr;
3989 caddr_t eaddr = pwa->wa_eaddr;
3990 ulong_t flags = pwa->wa_flags;
3991 struct watched_area *target;
3992 avl_index_t where;
3993 int error = 0;
3994
3995 /* we must not be holding p->p_lock, but the process must be locked */
3996 ASSERT(MUTEX_NOT_HELD(&p->p_lock));
3997 ASSERT(p->p_proc_flag & P_PR_LOCK);
3998
3999 /*
4000 * If this is our first watchpoint, enable watchpoints for the process.
4001 */
4002 if (!pr_watch_active(p)) {
4003 kthread_t *t;
4004
4005 mutex_enter(&p->p_lock);
4006 if ((t = p->p_tlist) != NULL) {
4007 do {
4008 watch_enable(t);
4009 } while ((t = t->t_forw) != p->p_tlist);
4010 }
4011 mutex_exit(&p->p_lock);
4012 }
4013
4014 target = pr_find_watched_area(p, pwa, &where);
4015 if (target != NULL) {
4016 /*
4017 * We discovered an existing, overlapping watched area.
4018 * Allow it only if it is an exact match.
4019 */
4020 if (target->wa_vaddr != vaddr ||
4021 target->wa_eaddr != eaddr)
4022 error = EINVAL;
4023 else if (target->wa_flags != flags) {
4024 error = set_watched_page(p, vaddr, eaddr,
4025 flags, target->wa_flags);
4026 target->wa_flags = flags;
4027 }
4028 kmem_free(pwa, sizeof (struct watched_area));
4029 } else {
4030 avl_insert(&p->p_warea, pwa, where);
4031 error = set_watched_page(p, vaddr, eaddr, flags, 0);
4032 }
4033
4034 return (error);
4035 }
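
/*
 * Caller sketch (cf. the PCWATCH control operation): the watched_area
 * is allocated by the caller and ownership passes to set_watched_area(),
 * which frees it itself when an overlapping area already exists:
 *
 *	struct watched_area *pwa =
 *	    kmem_alloc(sizeof (struct watched_area), KM_SLEEP);
 *	pwa->wa_vaddr = vaddr;
 *	pwa->wa_eaddr = vaddr + size;
 *	pwa->wa_flags = WA_READ | WA_WRITE;
 *	error = set_watched_area(p, pwa);
 */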
4036
4037 /*
4038 * Utility routine for clearing a watched area in the process.
4039 * The virtual address must be an exact match; the size and flags
4040 * are ignored.
4041 */
4042 int
4043 clear_watched_area(proc_t *p, struct watched_area *pwa)
4044 {
4045 struct watched_area *found;
4046
4047 /* we must not be holding p->p_lock, but the process must be locked */
4048 ASSERT(MUTEX_NOT_HELD(&p->p_lock));
4049 ASSERT(p->p_proc_flag & P_PR_LOCK);
4050
4052 if (!pr_watch_active(p)) {
4053 kmem_free(pwa, sizeof (struct watched_area));
4054 return (0);
4055 }
4056
4057 /*
4058 * Look for a matching address in the watched areas. If a match is
4059 * found, clear the old watched area and adjust the watched page(s). It
4060 * is not an error if there is no match.
4061 */
4062 if ((found = pr_find_watched_area(p, pwa, NULL)) != NULL &&
4063 found->wa_vaddr == pwa->wa_vaddr) {
4064 clear_watched_page(p, found->wa_vaddr, found->wa_eaddr,
4065 found->wa_flags);
4066 avl_remove(&p->p_warea, found);
4067 kmem_free(found, sizeof (struct watched_area));
4068 }
4069
4070 kmem_free(pwa, sizeof (struct watched_area));
4071
4072 /*
4073 * If we removed the last watched area from the process, disable
4074 * watchpoints.
4075 */
4076 if (!pr_watch_active(p)) {
4077 kthread_t *t;
4078
4079 mutex_enter(&p->p_lock);
4080 if ((t = p->p_tlist) != NULL) {
4081 do {
4082 watch_disable(t);
4083 } while ((t = t->t_forw) != p->p_tlist);
4084 }
4085 mutex_exit(&p->p_lock);
4086 }
4087
4088 return (0);
4089 }
4090
4091 /*
4092 * Frees all the watched_area structures
4093 */
4094 void
4095 pr_free_watchpoints(proc_t *p)
4096 {
4097 struct watched_area *delp;
4098 void *cookie;
4099
4100 cookie = NULL;
4101 while ((delp = avl_destroy_nodes(&p->p_warea, &cookie)) != NULL)
4102 kmem_free(delp, sizeof (struct watched_area));
4103
4104 avl_destroy(&p->p_warea);
4105 }
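
/*
 * The avl_destroy_nodes()/avl_destroy() pairing above is the standard
 * idiom for draining an AVL tree without paying per-node removal cost;
 * the general pattern, as a sketch:
 *
 *	void *cookie = NULL;
 *	while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
 *		kmem_free(node, sizeof (*node));
 *	avl_destroy(tree);
 */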
4106
4107 /*
4108 * Called from the traced process itself to unwatch all the
4109 * pages while deallocating the list of watched_page structs.
4110 */
4111 void
4112 pr_free_watched_pages(proc_t *p)
4113 {
4114 struct as *as = p->p_as;
4115 struct watched_page *pwp;
4116 uint_t prot;
4117 int retrycnt, err;
4118 void *cookie;
4119
4120 if (as == NULL || avl_numnodes(&as->a_wpage) == 0)
4121 return;
4122
4123 ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
4124 AS_LOCK_ENTER(as, RW_WRITER);
4125
4128 cookie = NULL;
4129 while ((pwp = avl_destroy_nodes(&as->a_wpage, &cookie)) != NULL) {
4130 retrycnt = 0;
4131 if ((prot = pwp->wp_oprot) != 0) {
4132 caddr_t addr = pwp->wp_vaddr;
4133 struct seg *seg;
4134 retry:
4135
4136 if ((pwp->wp_prot != prot ||
4137 (pwp->wp_flags & WP_NOWATCH)) &&
4138 (seg = as_segat(as, addr)) != NULL) {
4139 err = SEGOP_SETPROT(seg, addr, PAGESIZE, prot);
4140 if (err == IE_RETRY) {
4141 ASSERT(retrycnt == 0);
4142 retrycnt++;
4143 goto retry;
4144 }
4145 }
4146 }
4147 kmem_free(pwp, sizeof (struct watched_page));
4148 }
4149
4150 avl_destroy(&as->a_wpage);
4151 p->p_wprot = NULL;
4152
4153 AS_LOCK_EXIT(as);
4154 }
4155
4156 /*
4157 * Insert a watched area into the list of watched pages.
4158 * If oflags is zero then we are adding a new watched area.
4159 * Otherwise we are changing the flags of an existing watched area.
4160 */
4161 static int
4162 set_watched_page(proc_t *p, caddr_t vaddr, caddr_t eaddr,
4163 ulong_t flags, ulong_t oflags)
4164 {
4165 struct as *as = p->p_as;
4166 avl_tree_t *pwp_tree;
4167 struct watched_page *pwp, *newpwp;
4168 struct watched_page tpw;
4169 avl_index_t where;
4170 struct seg *seg;
4171 uint_t prot;
4172 caddr_t addr;
4173
4174 /*
4175 * We need to pre-allocate a list of structures before we grab the
4176 * address space lock to avoid calling kmem_alloc(KM_SLEEP) with locks
4177 * held.
4178 */
4179 newpwp = NULL;
4180 for (addr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
4181 addr < eaddr; addr += PAGESIZE) {
4182 pwp = kmem_zalloc(sizeof (struct watched_page), KM_SLEEP);
4183 pwp->wp_list = newpwp;
4184 newpwp = pwp;
4185 }
4186
4187 AS_LOCK_ENTER(as, RW_WRITER);
4188
4189 /*
4190 * Search for an existing watched page covering the watched area.
4191 * If none is found, grab a new one from the pre-allocated list
4192 * and insert it in the active tree, which is kept sorted
4193 * by user-level virtual address.
4194 */
4195 if (p->p_flag & SVFWAIT)
4196 pwp_tree = &p->p_wpage;
4197 else
4198 pwp_tree = &as->a_wpage;
4199
4200 again:
4201 if (avl_numnodes(pwp_tree) > prnwatch) {
4202 AS_LOCK_EXIT(as);
4203 while (newpwp != NULL) {
4204 pwp = newpwp->wp_list;
4205 kmem_free(newpwp, sizeof (struct watched_page));
4206 newpwp = pwp;
4207 }
4208 return (E2BIG);
4209 }
4210
4211 tpw.wp_vaddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
4212 if ((pwp = avl_find(pwp_tree, &tpw, &where)) == NULL) {
4213 pwp = newpwp;
4214 newpwp = newpwp->wp_list;
4215 pwp->wp_list = NULL;
4216 pwp->wp_vaddr = (caddr_t)((uintptr_t)vaddr &
4217 (uintptr_t)PAGEMASK);
4218 avl_insert(pwp_tree, pwp, where);
4219 }
4220
4221 ASSERT(vaddr >= pwp->wp_vaddr && vaddr < pwp->wp_vaddr + PAGESIZE);
4222
4223 if (oflags & WA_READ)
4224 pwp->wp_read--;
4225 if (oflags & WA_WRITE)
4226 pwp->wp_write--;
4227 if (oflags & WA_EXEC)
4228 pwp->wp_exec--;
4229
4230 ASSERT(pwp->wp_read >= 0);
4231 ASSERT(pwp->wp_write >= 0);
4232 ASSERT(pwp->wp_exec >= 0);
4233
4234 if (flags & WA_READ)
4235 pwp->wp_read++;
4236 if (flags & WA_WRITE)
4237 pwp->wp_write++;
4238 if (flags & WA_EXEC)
4239 pwp->wp_exec++;
4240
4241 if (!(p->p_flag & SVFWAIT)) {
4242 vaddr = pwp->wp_vaddr;
4243 if (pwp->wp_oprot == 0 &&
4244 (seg = as_segat(as, vaddr)) != NULL) {
4245 SEGOP_GETPROT(seg, vaddr, 0, &prot);
4246 pwp->wp_oprot = (uchar_t)prot;
4247 pwp->wp_prot = (uchar_t)prot;
4248 }
4249 if (pwp->wp_oprot != 0) {
4250 prot = pwp->wp_oprot;
4251 if (pwp->wp_read)
4252 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4253 if (pwp->wp_write)
4254 prot &= ~PROT_WRITE;
4255 if (pwp->wp_exec)
4256 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4257 if (!(pwp->wp_flags & WP_NOWATCH) &&
4258 pwp->wp_prot != prot &&
4259 (pwp->wp_flags & WP_SETPROT) == 0) {
4260 pwp->wp_flags |= WP_SETPROT;
4261 pwp->wp_list = p->p_wprot;
4262 p->p_wprot = pwp;
4263 }
4264 pwp->wp_prot = (uchar_t)prot;
4265 }
4266 }
4267
4268 /*
4269 * If the watched area extends into the next page then do
4270 * it over again with the virtual address of the next page.
4271 */
4272 if ((vaddr = pwp->wp_vaddr + PAGESIZE) < eaddr)
4273 goto again;
4274
4275 AS_LOCK_EXIT(as);
4276
4277 /*
4278 * Free any pages we may have over-allocated
4279 */
4280 while (newpwp != NULL) {
4281 pwp = newpwp->wp_list;
4282 kmem_free(newpwp, sizeof (struct watched_page));
4283 newpwp = pwp;
4284 }
4285
4286 return (0);
4287 }
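
/*
 * How watch flags map to page protections in set_watched_page() above:
 * a read watchpoint must fault on the watched access, so it strips all
 * permissions (on most MMUs write or execute access also implies a
 * read); a write watchpoint need only strip write permission; an exec
 * watchpoint likewise strips all permissions.  In summary:
 *
 *	wp_read != 0	->  prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC)
 *	wp_write != 0	->  prot &= ~PROT_WRITE
 *	wp_exec != 0	->  prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC)
 *
 * The modified protections are applied lazily via the p_wprot list
 * and the WP_SETPROT flag.
 */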
4288
4289 /*
4290 * Remove a watched area from the list of watched pages.
4291 * A watched area may extend over more than one page.
4292 */
4293 static void
4294 clear_watched_page(proc_t *p, caddr_t vaddr, caddr_t eaddr, ulong_t flags)
4295 {
4296 struct as *as = p->p_as;
4297 struct watched_page *pwp;
4298 struct watched_page tpw;
4299 avl_tree_t *tree;
4300 avl_index_t where;
4301
4302 AS_LOCK_ENTER(as, RW_WRITER);
4303
4304 if (p->p_flag & SVFWAIT)
4305 tree = &p->p_wpage;
4306 else
4307 tree = &as->a_wpage;
4308
4309 tpw.wp_vaddr = vaddr =
4310 (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
4311 pwp = avl_find(tree, &tpw, &where);
4312 if (pwp == NULL)
4313 pwp = avl_nearest(tree, where, AVL_AFTER);
4314
4315 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
4316 ASSERT(vaddr <= pwp->wp_vaddr);
4317
4318 if (flags & WA_READ)
4319 pwp->wp_read--;
4320 if (flags & WA_WRITE)
4321 pwp->wp_write--;
4322 if (flags & WA_EXEC)
4323 pwp->wp_exec--;
4324
4325 if (pwp->wp_read + pwp->wp_write + pwp->wp_exec != 0) {
4326 /*
4327 * Reset the hat layer's protections on this page.
4328 */
4329 if (pwp->wp_oprot != 0) {
4330 uint_t prot = pwp->wp_oprot;
4331
4332 if (pwp->wp_read)
4333 prot &=
4334 ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4335 if (pwp->wp_write)
4336 prot &= ~PROT_WRITE;
4337 if (pwp->wp_exec)
4338 prot &=
4339 ~(PROT_READ|PROT_WRITE|PROT_EXEC);
4340 if (!(pwp->wp_flags & WP_NOWATCH) &&
4341 pwp->wp_prot != prot &&
4342 (pwp->wp_flags & WP_SETPROT) == 0) {
4343 pwp->wp_flags |= WP_SETPROT;
4344 pwp->wp_list = p->p_wprot;
4345 p->p_wprot = pwp;
4346 }
4347 pwp->wp_prot = (uchar_t)prot;
4348 }
4349 } else {
4350 /*
4351 * No watched areas remain in this page.
4352 * Reset everything to normal.
4353 */
4354 if (pwp->wp_oprot != 0) {
4355 pwp->wp_prot = pwp->wp_oprot;
4356 if ((pwp->wp_flags & WP_SETPROT) == 0) {
4357 pwp->wp_flags |= WP_SETPROT;
4358 pwp->wp_list = p->p_wprot;
4359 p->p_wprot = pwp;
4360 }
4361 }
4362 }
4363
4364 pwp = AVL_NEXT(tree, pwp);
4365 }
4366
4367 AS_LOCK_EXIT(as);
4368 }
4369
4370 /*
4371 * Return the original protections for the specified page.
4372 */
4373 static void
4374 getwatchprot(struct as *as, caddr_t addr, uint_t *prot)
4375 {
4376 struct watched_page *pwp;
4377 struct watched_page tpw;
4378
4379 ASSERT(AS_LOCK_HELD(as));
4380
4381 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
4382 if ((pwp = avl_find(&as->a_wpage, &tpw, NULL)) != NULL)
4383 *prot = pwp->wp_oprot;
4384 }
4385
4386 static prpagev_t *
4387 pr_pagev_create(struct seg *seg, int check_noreserve)
4388 {
4389 prpagev_t *pagev = kmem_alloc(sizeof (prpagev_t), KM_SLEEP);
4390 size_t total_pages = seg_pages(seg);
4391
4392 /*
4393 * Limit the size of our vectors to pagev_lim pages at a time.  We need
4394 * 4 or 5 bytes of storage per page (a uint_t of protections plus an
4395 * optional incore byte), so the 256K-page default costs roughly a megabyte of kernel heap.
4396 */
4397 pagev->pg_npages = MIN(total_pages, pagev_lim);
4398 pagev->pg_pnbase = 0;
4399
4400 pagev->pg_protv =
4401 kmem_alloc(pagev->pg_npages * sizeof (uint_t), KM_SLEEP);
4402
4403 if (check_noreserve)
4404 pagev->pg_incore =
4405 kmem_alloc(pagev->pg_npages * sizeof (char), KM_SLEEP);
4406 else
4407 pagev->pg_incore = NULL;
4408
4409 return (pagev);
4410 }
4411
4412 static void
4413 pr_pagev_destroy(prpagev_t *pagev)
4414 {
4415 if (pagev->pg_incore != NULL)
4416 kmem_free(pagev->pg_incore, pagev->pg_npages * sizeof (char));
4417
4418 kmem_free(pagev->pg_protv, pagev->pg_npages * sizeof (uint_t));
4419 kmem_free(pagev, sizeof (prpagev_t));
4420 }
4421
4422 static caddr_t
4423 pr_pagev_fill(prpagev_t *pagev, struct seg *seg, caddr_t addr, caddr_t eaddr)
4424 {
4425 ulong_t lastpg = seg_page(seg, eaddr - 1);
4426 ulong_t pn, pnlim;
4427 caddr_t saddr;
4428 size_t len;
4429
4430 ASSERT(addr >= seg->s_base && addr <= eaddr);
4431
4432 if (addr == eaddr)
4433 return (eaddr);
4434
4435 refill:
4436 ASSERT(addr < eaddr);
4437 pagev->pg_pnbase = seg_page(seg, addr);
4438 pnlim = pagev->pg_pnbase + pagev->pg_npages;
4439 saddr = addr;
4440
4441 if (lastpg < pnlim)
4442 len = (size_t)(eaddr - addr);
4443 else
4444 len = pagev->pg_npages * PAGESIZE;
4445
4446 if (pagev->pg_incore != NULL) {
4447 /*
4448 * INCORE cleverly has different semantics than GETPROT:
4449 * it returns info on pages up to but NOT including addr + len.
4450 */
4451 SEGOP_INCORE(seg, addr, len, pagev->pg_incore);
4452 pn = pagev->pg_pnbase;
4453
4454 do {
4455 /*
4456 * Guilty knowledge here: We know that segvn_incore
4457 * returns more than just the low-order bit that
4458 * indicates the page is actually in memory. If any
4459 * bits are set, then the page has backing store.
4460 */
4461 if (pagev->pg_incore[pn++ - pagev->pg_pnbase])
4462 goto out;
4463
4464 } while ((addr += PAGESIZE) < eaddr && pn < pnlim);
4465
4466 /*
4467 * If we examined all the pages in the vector but we're not
4468 * at the end of the segment, take another lap.
4469 */
4470 if (addr < eaddr)
4471 goto refill;
4472 }
4473
4474 /*
4475 * Need to take len - 1 because addr + len is the address of the
4476 * first byte of the page just past the end of what we want.
4477 */
4478 out:
4479 SEGOP_GETPROT(seg, saddr, len - 1, pagev->pg_protv);
4480 return (addr);
4481 }
4482
4483 static caddr_t
4484 pr_pagev_nextprot(prpagev_t *pagev, struct seg *seg,
4485 caddr_t *saddrp, caddr_t eaddr, uint_t *protp)
4486 {
4487 /*
4488 * Our starting address is either the specified address, or the base
4489 * address from the start of the pagev. If the latter is greater,
4490 * this means a previous call to pr_pagev_fill has already scanned
4491 * further than the end of the previous mapping.
4492 */
4493 caddr_t base = seg->s_base + pagev->pg_pnbase * PAGESIZE;
4494 caddr_t addr = MAX(*saddrp, base);
4495 ulong_t pn = seg_page(seg, addr);
4496 uint_t prot, nprot;
4497
4498 /*
4499 * If we're dealing with noreserve pages, then advance addr to
4500 * the address of the next page which has backing store.
4501 */
4502 if (pagev->pg_incore != NULL) {
4503 while (pagev->pg_incore[pn - pagev->pg_pnbase] == 0) {
4504 if ((addr += PAGESIZE) == eaddr) {
4505 *saddrp = addr;
4506 prot = 0;
4507 goto out;
4508 }
4509 if (++pn == pagev->pg_pnbase + pagev->pg_npages) {
4510 addr = pr_pagev_fill(pagev, seg, addr, eaddr);
4511 if (addr == eaddr) {
4512 *saddrp = addr;
4513 prot = 0;
4514 goto out;
4515 }
4516 pn = seg_page(seg, addr);
4517 }
4518 }
4519 }
4520
4521 /*
4522 * Get the protections on the page corresponding to addr.
4523 */
4524 pn = seg_page(seg, addr);
4525 ASSERT(pn >= pagev->pg_pnbase);
4526 ASSERT(pn < (pagev->pg_pnbase + pagev->pg_npages));
4527
4528 prot = pagev->pg_protv[pn - pagev->pg_pnbase];
4529 getwatchprot(seg->s_as, addr, &prot);
4530 *saddrp = addr;
4531
4532 /*
4533 * Now loop until we find a backed page with different protections
4534 * or we reach the end of this segment.
4535 */
4536 while ((addr += PAGESIZE) < eaddr) {
4537 /*
4538 * If pn has advanced to the page number following what we
4539 * have information on, refill the page vector and reset
4540 * addr and pn. If pr_pagev_fill does not return the
4541 * address of the next page, we have a discontiguity and
4542 * thus have reached the end of the current mapping.
4543 */
4544 if (++pn == pagev->pg_pnbase + pagev->pg_npages) {
4545 caddr_t naddr = pr_pagev_fill(pagev, seg, addr, eaddr);
4546 if (naddr != addr)
4547 goto out;
4548 pn = seg_page(seg, addr);
4549 }
4550
4551 /*
4552 * The previous page's protections are in prot, and it has
4553 * backing. If this page is MAP_NORESERVE and has no backing,
4554 * then end this mapping and return the previous protections.
4555 */
4556 if (pagev->pg_incore != NULL &&
4557 pagev->pg_incore[pn - pagev->pg_pnbase] == 0)
4558 break;
4559
4560 /*
4561 * Otherwise end the mapping if this page's protections (nprot)
4562 * are different than those in the previous page (prot).
4563 */
4564 nprot = pagev->pg_protv[pn - pagev->pg_pnbase];
4565 getwatchprot(seg->s_as, addr, &nprot);
4566
4567 if (nprot != prot)
4568 break;
4569 }
4570
4571 out:
4572 *protp = prot;
4573 return (addr);
4574 }
4575
4576 size_t
4577 pr_getsegsize(struct seg *seg, int reserved)
4578 {
4579 size_t size = seg->s_size;
4580
4581 /*
4582 * If we're interested in the reserved space, return the size of the
4583 * segment itself. Everything else in this function is a special case
4584 * to determine the actual underlying size of various segment types.
4585 */
4586 if (reserved)
4587 return (size);
4588
4589 /*
4590 * If this is a segvn mapping of a regular file, return the smaller
4591 * of the segment size and the remaining size of the file beyond
4592 * the file offset corresponding to seg->s_base.
4593 */
4594 if (seg->s_ops == &segvn_ops) {
4595 vattr_t vattr;
4596 vnode_t *vp;
4597
4598 vattr.va_mask = AT_SIZE;
4599
4600 if (SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
4601 vp != NULL && vp->v_type == VREG &&
4602 VOP_GETATTR(vp, &vattr, 0, CRED(), NULL) == 0) {
4603
4604 u_offset_t fsize = vattr.va_size;
4605 u_offset_t offset = SEGOP_GETOFFSET(seg, seg->s_base);
4606
4607 if (fsize < offset)
4608 fsize = 0;
4609 else
4610 fsize -= offset;
4611
4612 fsize = roundup(fsize, (u_offset_t)PAGESIZE);
4613
4614 if (fsize < (u_offset_t)size)
4615 size = (size_t)fsize;
4616 }
4617
4618 return (size);
4619 }
4620
4621 /*
4622 * If this is an ISM shared segment, don't include pages that are
4623 * beyond the real size of the spt segment that backs it.
4624 */
4625 if (seg->s_ops == &segspt_shmops)
4626 return (MIN(spt_realsize(seg), size));
4627
4628 /*
4629 * If this segment is a mapping from /dev/null, then this is a
4630 * reservation of virtual address space and has no actual size.
4631 * Such segments are backed by segdev and have type set to neither
4632 * MAP_SHARED nor MAP_PRIVATE.
4633 */
4634 if (seg->s_ops == &segdev_ops &&
4635 ((SEGOP_GETTYPE(seg, seg->s_base) &
4636 (MAP_SHARED | MAP_PRIVATE)) == 0))
4637 return (0);
4638
4639 /*
4640 * If this segment doesn't match one of the special types we handle,
4641 * just return the size of the segment itself.
4642 */
4643 return (size);
4644 }
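
/*
 * Worked example with hypothetical numbers: a 5K regular file mapped
 * from offset 0 into a 16K segment gives fsize = roundup(5K, 4K) = 8K,
 * so with reserved == 0 pr_getsegsize() reports 8K rather than the
 * full 16K reservation; with reserved != 0 it reports 16K.
 */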
4645
4646 uint_t
4647 pr_getprot(struct seg *seg, int reserved, void **tmp,
4648 caddr_t *saddrp, caddr_t *naddrp, caddr_t eaddr)
4649 {
4650 struct as *as = seg->s_as;
4651
4652 caddr_t saddr = *saddrp;
4653 caddr_t naddr;
4654
4655 int check_noreserve;
4656 uint_t prot;
4657
4658 union {
4659 struct segvn_data *svd;
4660 struct segdev_data *sdp;
4661 void *data;
4662 } s;
4663
4664 s.data = seg->s_data;
4665
4666 ASSERT(AS_WRITE_HELD(as));
4667 ASSERT(saddr >= seg->s_base && saddr < eaddr);
4668 ASSERT(eaddr <= seg->s_base + seg->s_size);
4669
4670 /*
4671 * Don't include MAP_NORESERVE pages in the address range
4672 * unless their mappings have actually materialized.
4673 * We cheat by knowing that segvn is the only segment
4674 * driver that supports MAP_NORESERVE.
4675 */
4676 check_noreserve =
4677 (!reserved && seg->s_ops == &segvn_ops && s.svd != NULL &&
4678 (s.svd->vp == NULL || s.svd->vp->v_type != VREG) &&
4679 (s.svd->flags & MAP_NORESERVE));
4680
4681 /*
4682 * Examine every page only as a last resort. We use guilty knowledge
4683 * of segvn and segdev to avoid this: if there are no per-page
4684 * protections present in the segment and we don't care about
4685 * MAP_NORESERVE, then s_data->prot is the prot for the whole segment.
4686 */
4687 if (!check_noreserve && saddr == seg->s_base &&
4688 seg->s_ops == &segvn_ops && s.svd != NULL && s.svd->pageprot == 0) {
4689 prot = s.svd->prot;
4690 getwatchprot(as, saddr, &prot);
4691 naddr = eaddr;
4692
4693 } else if (saddr == seg->s_base && seg->s_ops == &segdev_ops &&
4694 s.sdp != NULL && s.sdp->pageprot == 0) {
4695 prot = s.sdp->prot;
4696 getwatchprot(as, saddr, &prot);
4697 naddr = eaddr;
4698
4699 } else {
4700 prpagev_t *pagev;
4701
4702 /*
4703 * If addr is sitting at the start of the segment, then
4704 * create a page vector to store protection and incore
4705 * information for pages in the segment, and fill it.
4706 * Otherwise, we expect *tmp to address the prpagev_t
4707 * allocated by a previous call to this function.
4708 */
4709 if (saddr == seg->s_base) {
4710 pagev = pr_pagev_create(seg, check_noreserve);
4711 saddr = pr_pagev_fill(pagev, seg, saddr, eaddr);
4712
4713 ASSERT(*tmp == NULL);
4714 *tmp = pagev;
4715
4716 ASSERT(saddr <= eaddr);
4717 *saddrp = saddr;
4718
4719 if (saddr == eaddr) {
4720 naddr = saddr;
4721 prot = 0;
4722 goto out;
4723 }
4724
4725 } else {
4726 ASSERT(*tmp != NULL);
4727 pagev = (prpagev_t *)*tmp;
4728 }
4729
4730 naddr = pr_pagev_nextprot(pagev, seg, saddrp, eaddr, &prot);
4731 ASSERT(naddr <= eaddr);
4732 }
4733
4734 out:
4735 if (naddr == eaddr)
4736 pr_getprot_done(tmp);
4737 *naddrp = naddr;
4738 return (prot);
4739 }
4740
4741 void
4742 pr_getprot_done(void **tmp)
4743 {
4744 if (*tmp != NULL) {
4745 pr_pagev_destroy((prpagev_t *)*tmp);
4746 *tmp = NULL;
4747 }
4748 }
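
/*
 * The *tmp cookie threads the prpagev_t state through successive
 * pr_getprot() calls on the same segment.  The iteration pattern, as
 * used by prgetxmap() below (locking and error handling elided):
 *
 *	void *tmp = NULL;
 *	for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
 *		prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
 *		... report the range [saddr, naddr) with prot ...
 *	}
 *	ASSERT(tmp == NULL);	(freed once naddr reached eaddr)
 */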
4749
4750 /*
4751 * Return true iff the vnode is a /proc file from the object directory.
4752 */
4753 int
4754 pr_isobject(vnode_t *vp)
4755 {
4756 return (vn_matchops(vp, prvnodeops) && VTOP(vp)->pr_type == PR_OBJECT);
4757 }
4758
4759 /*
4760 * Return true iff the vnode is a /proc file opened by the process itself.
4761 */
4762 int
4763 pr_isself(vnode_t *vp)
4764 {
4765 /*
4766 * XXX: To retain binary compatibility with the old
4767 * ioctl()-based version of /proc, we exempt self-opens
4768 * of /proc/<pid> from being marked close-on-exec.
4769 */
4770 return (vn_matchops(vp, prvnodeops) &&
4771 (VTOP(vp)->pr_flags & PR_ISSELF) &&
4772 VTOP(vp)->pr_type != PR_PIDDIR);
4773 }
4774
4775 static ssize_t
4776 pr_getpagesize(struct seg *seg, caddr_t saddr, caddr_t *naddrp, caddr_t eaddr)
4777 {
4778 ssize_t pagesize, hatsize;
4779
4780 ASSERT(AS_WRITE_HELD(seg->s_as));
4781 ASSERT(IS_P2ALIGNED(saddr, PAGESIZE));
4782 ASSERT(IS_P2ALIGNED(eaddr, PAGESIZE));
4783 ASSERT(saddr < eaddr);
4784
4785 pagesize = hatsize = hat_getpagesize(seg->s_as->a_hat, saddr);
4786 ASSERT(pagesize == -1 || IS_P2ALIGNED(pagesize, pagesize));
4787 ASSERT(pagesize != 0);
4788
4789 if (pagesize == -1)
4790 pagesize = PAGESIZE;
4791
4792 saddr += P2NPHASE((uintptr_t)saddr, pagesize);
4793
4794 while (saddr < eaddr) {
4795 if (hatsize != hat_getpagesize(seg->s_as->a_hat, saddr))
4796 break;
4797 ASSERT(IS_P2ALIGNED(saddr, pagesize));
4798 saddr += pagesize;
4799 }
4800
4801 *naddrp = ((saddr < eaddr) ? saddr : eaddr);
4802 return (hatsize);
4803 }
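
/*
 * Example, as a sketch: if the HAT maps [saddr, saddr + 2M) with a 2M
 * large page and the range beyond it with 4K pages, a call starting at
 * saddr returns 2M and sets *naddrp to saddr + 2M, so each run of
 * uniform page size is reported as a separate range.
 */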
4804
4805 /*
4806 * Return an array of structures with extended memory map information.
4807 * We allocate here; the caller must deallocate.
4808 */
4809 int
4810 prgetxmap(proc_t *p, list_t *iolhead)
4811 {
4812 struct as *as = p->p_as;
4813 prxmap_t *mp;
4814 struct seg *seg;
4815 struct seg *brkseg, *stkseg;
4816 struct vnode *vp;
4817 struct vattr vattr;
4818 uint_t prot;
4819
4820 ASSERT(as != &kas && AS_WRITE_HELD(as));
4821
4822 /*
4823 * Request an initial buffer size that doesn't waste memory
4824 * if the address space has only a small number of segments.
4825 */
4826 pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));
4827
4828 if ((seg = AS_SEGFIRST(as)) == NULL)
4829 return (0);
4830
4831 brkseg = break_seg(p);
4832 stkseg = as_segat(as, prgetstackbase(p));
4833
4834 do {
4835 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
4836 caddr_t saddr, naddr, baddr;
4837 void *tmp = NULL;
4838 ssize_t psz;
4839 char *parr;
4840 uint64_t npages;
4841 uint64_t pagenum;
4842
4843 if ((seg->s_flags & S_HOLE) != 0) {
4844 continue;
4845 }
4846 /*
4847 * Segment loop part one: iterate from the base of the segment
4848 * to its end, pausing at each address boundary (baddr) between
4849 * ranges that have different virtual memory protections.
4850 */
4851 for (saddr = seg->s_base; saddr < eaddr; saddr = baddr) {
4852 prot = pr_getprot(seg, 0, &tmp, &saddr, &baddr, eaddr);
4853 ASSERT(baddr >= saddr && baddr <= eaddr);
4854
4855 /*
4856 * Segment loop part two: iterate from the current
4857 * position to the end of the protection boundary,
4858 * pausing at each address boundary (naddr) between
4859 * ranges that have different underlying page sizes.
4860 */
4861 for (; saddr < baddr; saddr = naddr) {
4862 psz = pr_getpagesize(seg, saddr, &naddr, baddr);
4863 ASSERT(naddr >= saddr && naddr <= baddr);
4864
4865 mp = pr_iol_newbuf(iolhead, sizeof (*mp));
4866
4867 mp->pr_vaddr = (uintptr_t)saddr;
4868 mp->pr_size = naddr - saddr;
4869 mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
4870 mp->pr_mflags = 0;
4871 if (prot & PROT_READ)
4872 mp->pr_mflags |= MA_READ;
4873 if (prot & PROT_WRITE)
4874 mp->pr_mflags |= MA_WRITE;
4875 if (prot & PROT_EXEC)
4876 mp->pr_mflags |= MA_EXEC;
4877 if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
4878 mp->pr_mflags |= MA_SHARED;
4879 if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
4880 mp->pr_mflags |= MA_NORESERVE;
4881 if (seg->s_ops == &segspt_shmops ||
4882 (seg->s_ops == &segvn_ops &&
4883 (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
4884 vp == NULL)))
4885 mp->pr_mflags |= MA_ANON;
4886 if (seg == brkseg)
4887 mp->pr_mflags |= MA_BREAK;
4888 else if (seg == stkseg)
4889 mp->pr_mflags |= MA_STACK;
4890 if (seg->s_ops == &segspt_shmops)
4891 mp->pr_mflags |= MA_ISM | MA_SHM;
4892
4893 mp->pr_pagesize = PAGESIZE;
4894 if (psz == -1) {
4895 mp->pr_hatpagesize = 0;
4896 } else {
4897 mp->pr_hatpagesize = psz;
4898 }
4899
4900 /*
4901 * Manufacture a filename for the "object" dir.
4902 */
4903 mp->pr_dev = PRNODEV;
4904 vattr.va_mask = AT_FSID|AT_NODEID;
4905 if (seg->s_ops == &segvn_ops &&
4906 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
4907 vp != NULL && vp->v_type == VREG &&
4908 VOP_GETATTR(vp, &vattr, 0, CRED(),
4909 NULL) == 0) {
4910 mp->pr_dev = vattr.va_fsid;
4911 mp->pr_ino = vattr.va_nodeid;
4912 if (vp == p->p_exec)
4913 (void) strcpy(mp->pr_mapname,
4914 "a.out");
4915 else
4916 pr_object_name(mp->pr_mapname,
4917 vp, &vattr);
4918 }
4919
4920 /*
4921 * Get the SysV shared memory id, if any.
4922 */
4923 if ((mp->pr_mflags & MA_SHARED) &&
4924 p->p_segacct && (mp->pr_shmid = shmgetid(p,
4925 seg->s_base)) != SHMID_NONE) {
4926 if (mp->pr_shmid == SHMID_FREE)
4927 mp->pr_shmid = -1;
4928
4929 mp->pr_mflags |= MA_SHM;
4930 } else {
4931 mp->pr_shmid = -1;
4932 }
4933
4934 npages = ((uintptr_t)(naddr - saddr)) >>
4935 PAGESHIFT;
4936 parr = kmem_zalloc(npages, KM_SLEEP);
4937
4938 SEGOP_INCORE(seg, saddr, naddr - saddr, parr);
4939
4940 for (pagenum = 0; pagenum < npages; pagenum++) {
4941 if (parr[pagenum] & SEG_PAGE_INCORE)
4942 mp->pr_rss++;
4943 if (parr[pagenum] & SEG_PAGE_ANON)
4944 mp->pr_anon++;
4945 if (parr[pagenum] & SEG_PAGE_LOCKED)
4946 mp->pr_locked++;
4947 }
4948 kmem_free(parr, npages);
4949 }
4950 }
4951 ASSERT(tmp == NULL);
4952 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
4953
4954 return (0);
4955 }
4956
4957 /*
4958 * Return the process's credentials. We don't need a 32-bit equivalent of
4959 * this function because prcred_t and prcred32_t are actually the same.
4960 */
4961 void
4962 prgetcred(proc_t *p, prcred_t *pcrp)
4963 {
4964 mutex_enter(&p->p_crlock);
4965 cred2prcred(p->p_cred, pcrp);
4966 mutex_exit(&p->p_crlock);
4967 }
4968
4969 void
4970 prgetsecflags(proc_t *p, prsecflags_t *psfp)
4971 {
4972 ASSERT(psfp != NULL);
4973
4974 bzero(psfp, sizeof (*psfp));
4975 psfp->pr_version = PRSECFLAGS_VERSION_CURRENT;
4976 psfp->pr_lower = p->p_secflags.psf_lower;
4977 psfp->pr_upper = p->p_secflags.psf_upper;
4978 psfp->pr_effective = p->p_secflags.psf_effective;
4979 psfp->pr_inherit = p->p_secflags.psf_inherit;
4980 }
4981
4982 /*
4983 * Compute actual size of the prpriv_t structure.
4984 */
4985
4986 size_t
4987 prgetprivsize(void)
4988 {
4989 return (priv_prgetprivsize(NULL));
4990 }
4991
4992 /*
4993 * Return the process's privileges. We don't need a 32-bit equivalent of
4994 * this function because prpriv_t and prpriv32_t are actually the same.
4995 */
4996 void
4997 prgetpriv(proc_t *p, prpriv_t *pprp)
4998 {
4999 mutex_enter(&p->p_crlock);
5000 cred2prpriv(p->p_cred, pprp);
5001 mutex_exit(&p->p_crlock);
5002 }
5003
5004 #ifdef _SYSCALL32_IMPL
5005 /*
5006 * Return an array of structures with extended memory map information,
5007 * 32-bit version.  We allocate here; the caller must deallocate.
5008 */
5009 int
5010 prgetxmap32(proc_t *p, list_t *iolhead)
5011 {
5012 struct as *as = p->p_as;
5013 prxmap32_t *mp;
5014 struct seg *seg;
5015 struct seg *brkseg, *stkseg;
5016 struct vnode *vp;
5017 struct vattr vattr;
5018 uint_t prot;
5019
5020 ASSERT(as != &kas && AS_WRITE_HELD(as));
5021
5022 /*
5023 * Request an initial buffer size that doesn't waste memory
5024 * if the address space has only a small number of segments.
5025 */
5026 pr_iol_initlist(iolhead, sizeof (*mp), avl_numnodes(&as->a_segtree));
5027
5028 if ((seg = AS_SEGFIRST(as)) == NULL)
5029 return (0);
5030
5031 brkseg = break_seg(p);
5032 stkseg = as_segat(as, prgetstackbase(p));
5033
5034 do {
5035 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
5036 caddr_t saddr, naddr, baddr;
5037 void *tmp = NULL;
5038 ssize_t psz;
5039 char *parr;
5040 uint64_t npages;
5041 uint64_t pagenum;
5042
5043 if ((seg->s_flags & S_HOLE) != 0) {
5044 continue;
5045 }
5046
5047 /*
5048 * Segment loop part one: iterate from the base of the segment
5049 * to its end, pausing at each address boundary (baddr) between
5050 * ranges that have different virtual memory protections.
5051 */
5052 for (saddr = seg->s_base; saddr < eaddr; saddr = baddr) {
5053 prot = pr_getprot(seg, 0, &tmp, &saddr, &baddr, eaddr);
5054 ASSERT(baddr >= saddr && baddr <= eaddr);
5055
5056 /*
5057 * Segment loop part two: iterate from the current
5058 * position to the end of the protection boundary,
5059 * pausing at each address boundary (naddr) between
5060 * ranges that have different underlying page sizes.
5061 */
5062 for (; saddr < baddr; saddr = naddr) {
5063 psz = pr_getpagesize(seg, saddr, &naddr, baddr);
5064 ASSERT(naddr >= saddr && naddr <= baddr);
5065
5066 mp = pr_iol_newbuf(iolhead, sizeof (*mp));
5067
5068 mp->pr_vaddr = (caddr32_t)(uintptr_t)saddr;
5069 mp->pr_size = (size32_t)(naddr - saddr);
5070 mp->pr_offset = SEGOP_GETOFFSET(seg, saddr);
5071 mp->pr_mflags = 0;
5072 if (prot & PROT_READ)
5073 mp->pr_mflags |= MA_READ;
5074 if (prot & PROT_WRITE)
5075 mp->pr_mflags |= MA_WRITE;
5076 if (prot & PROT_EXEC)
5077 mp->pr_mflags |= MA_EXEC;
5078 if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
5079 mp->pr_mflags |= MA_SHARED;
5080 if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
5081 mp->pr_mflags |= MA_NORESERVE;
5082 if (seg->s_ops == &segspt_shmops ||
5083 (seg->s_ops == &segvn_ops &&
5084 (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
5085 vp == NULL)))
5086 mp->pr_mflags |= MA_ANON;
5087 if (seg == brkseg)
5088 mp->pr_mflags |= MA_BREAK;
5089 else if (seg == stkseg)
5090 mp->pr_mflags |= MA_STACK;
5091 if (seg->s_ops == &segspt_shmops)
5092 mp->pr_mflags |= MA_ISM | MA_SHM;
5093
5094 mp->pr_pagesize = PAGESIZE;
5095 if (psz == -1) {
5096 mp->pr_hatpagesize = 0;
5097 } else {
5098 mp->pr_hatpagesize = psz;
5099 }
5100
5101 /*
5102 * Manufacture a filename for the "object" dir.
5103 */
5104 mp->pr_dev = PRNODEV32;
5105 vattr.va_mask = AT_FSID|AT_NODEID;
5106 if (seg->s_ops == &segvn_ops &&
5107 SEGOP_GETVP(seg, saddr, &vp) == 0 &&
5108 vp != NULL && vp->v_type == VREG &&
5109 VOP_GETATTR(vp, &vattr, 0, CRED(),
5110 NULL) == 0) {
5111 (void) cmpldev(&mp->pr_dev,
5112 vattr.va_fsid);
5113 mp->pr_ino = vattr.va_nodeid;
5114 if (vp == p->p_exec)
5115 (void) strcpy(mp->pr_mapname,
5116 "a.out");
5117 else
5118 pr_object_name(mp->pr_mapname,
5119 vp, &vattr);
5120 }
5121
5122 /*
5123 * Get the SysV shared memory id, if any.
5124 */
5125 if ((mp->pr_mflags & MA_SHARED) &&
5126 p->p_segacct && (mp->pr_shmid = shmgetid(p,
5127 seg->s_base)) != SHMID_NONE) {
5128 if (mp->pr_shmid == SHMID_FREE)
5129 mp->pr_shmid = -1;
5130
5131 mp->pr_mflags |= MA_SHM;
5132 } else {
5133 mp->pr_shmid = -1;
5134 }
5135
5136 npages = ((uintptr_t)(naddr - saddr)) >>
5137 PAGESHIFT;
5138 parr = kmem_zalloc(npages, KM_SLEEP);
5139
5140 SEGOP_INCORE(seg, saddr, naddr - saddr, parr);
5141
5142 for (pagenum = 0; pagenum < npages; pagenum++) {
5143 if (parr[pagenum] & SEG_PAGE_INCORE)
5144 mp->pr_rss++;
5145 if (parr[pagenum] & SEG_PAGE_ANON)
5146 mp->pr_anon++;
5147 if (parr[pagenum] & SEG_PAGE_LOCKED)
5148 mp->pr_locked++;
5149 }
5150 kmem_free(parr, npages);
5151 }
5152 }
5153 ASSERT(tmp == NULL);
5154 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
5155
5156 return (0);
5157 }
5158 #endif /* _SYSCALL32_IMPL */
5159