xref: /illumos-gate/usr/src/lib/libdtrace/common/dt_proc.c (revision 24da5b34f49324ed742a340010ed5bd3d4e06625)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * DTrace Process Control
31  *
32  * This file provides a set of routines that permit libdtrace and its clients
33  * to create and grab process handles using libproc, and to share these handles
34  * between library mechanisms that need libproc access, such as ustack(), and
35  * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
36  * The library provides several mechanisms in the libproc control layer:
37  *
38  * Reference Counting: The library code and client code can independently grab
39  * the same process handles without interfering with one another.  Only when
40  * the reference count drops to zero and the handle is not being cached (see
41  * below for more information on caching) will Prelease() be called on it.
42  *
43  * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
44  * the reference count drops to zero, the handle is not immediately released.
45  * Instead, libproc handles are maintained on dph_lrulist in order from most-
46  * recently accessed to least-recently accessed.  Idle handles are maintained
47  * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
48  * to ustack() to avoid the overhead of releasing and re-grabbing processes.
49  *
50  * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
51  * or created by dt_proc_create(), a control thread is created to provide
52  * callbacks on process exit and symbol table caching on dlopen()s.
53  *
54  * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
55  * are provided to synchronize access to the libproc handle between libdtrace
56  * code and client code and the control thread's use of the ps_prochandle.
57  *
58  * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
59  * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
60  * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
61  * synchronization between libdtrace control threads and the client thread.
62  *
63  * The ps_prochandles themselves are maintained along with a dt_proc_t struct
64  * in a hash table indexed by PID.  This provides basic locking and reference
65  * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
66  * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
67  * the current limit on the number of actively cached entries.
68  *
69  * The control thread for a process establishes breakpoints at the rtld_db
70  * locations of interest, updates mappings and symbol tables at these points,
71  * and handles exec and fork (by always following the parent).  The control
72  * thread automatically exits when the process dies or control is lost.
73  *
74  * A simple notification mechanism is provided for libdtrace clients using
75  * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
76  * such an event occurs, the dt_proc_t itself is enqueued on a notification
77  * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
78  * up using this condition and will then call the client handler as necessary.
79  */
80 
81 #include <sys/wait.h>
82 #include <sys/lwp.h>
83 #include <strings.h>
84 #include <signal.h>
85 #include <assert.h>
86 #include <errno.h>
87 
88 #include <dt_proc.h>
89 #include <dt_pid.h>
90 #include <dt_impl.h>
91 
/*
 * Classify a system call number as one of the exec or fork variants that the
 * control thread traces.  The argument is parenthesized at every use so that
 * an expression binding more loosely than '==' (e.g. "x & mask") is compared
 * as a whole.  Note that 'w' is evaluated more than once, so it must not have
 * side effects.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_exec || (w) == SYS_execve)
#define	IS_SYS_FORK(w)	((w) == SYS_vfork || (w) == SYS_fork1 ||	\
			(w) == SYS_forkall || (w) == SYS_forksys)
95 
96 static dt_bkpt_t *
97 dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
98 {
99 	struct ps_prochandle *P = dpr->dpr_proc;
100 	dt_bkpt_t *dbp;
101 
102 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
103 
104 	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
105 		dbp->dbp_func = func;
106 		dbp->dbp_data = data;
107 		dbp->dbp_addr = addr;
108 
109 		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
110 			dbp->dbp_active = B_TRUE;
111 
112 		dt_list_append(&dpr->dpr_bps, dbp);
113 	}
114 
115 	return (dbp);
116 }
117 
118 static void
119 dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
120 {
121 	int state = Pstate(dpr->dpr_proc);
122 	dt_bkpt_t *dbp, *nbp;
123 
124 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
125 
126 	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
127 		if (delbkpts && dbp->dbp_active &&
128 		    state != PS_LOST && state != PS_UNDEAD) {
129 			(void) Pdelbkpt(dpr->dpr_proc,
130 			    dbp->dbp_addr, dbp->dbp_instr);
131 		}
132 		nbp = dt_list_next(dbp);
133 		dt_list_delete(&dpr->dpr_bps, dbp);
134 		dt_free(dpr->dpr_hdl, dbp);
135 	}
136 }
137 
138 static void
139 dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
140 {
141 	const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp;
142 	dt_bkpt_t *dbp;
143 
144 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
145 
146 	for (dbp = dt_list_next(&dpr->dpr_bps);
147 	    dbp != NULL; dbp = dt_list_next(dbp)) {
148 		if (psp->pr_reg[R_PC] == dbp->dbp_addr)
149 			break;
150 	}
151 
152 	if (dbp == NULL) {
153 		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
154 		    (int)dpr->dpr_pid, (ulong_t)psp->pr_reg[R_PC]);
155 		return;
156 	}
157 
158 	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
159 	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
160 
161 	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
162 	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
163 }
164 
165 void
166 dt_proc_bpenable(dt_proc_t *dpr)
167 {
168 	dt_bkpt_t *dbp;
169 
170 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
171 
172 	for (dbp = dt_list_next(&dpr->dpr_bps);
173 	    dbp != NULL; dbp = dt_list_next(dbp)) {
174 		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
175 		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
176 			dbp->dbp_active = B_TRUE;
177 	}
178 
179 	dt_dprintf("breakpoints enabled\n");
180 }
181 
182 void
183 dt_proc_bpdisable(dt_proc_t *dpr)
184 {
185 	dt_bkpt_t *dbp;
186 
187 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
188 
189 	for (dbp = dt_list_next(&dpr->dpr_bps);
190 	    dbp != NULL; dbp = dt_list_next(dbp)) {
191 		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
192 		    dbp->dbp_addr, dbp->dbp_instr) == 0)
193 			dbp->dbp_active = B_FALSE;
194 	}
195 
196 	dt_dprintf("breakpoints disabled\n");
197 }
198 
199 static void
200 dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
201     const char *msg)
202 {
203 	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
204 
205 	if (dprn == NULL) {
206 		dt_dprintf("failed to allocate notification for %d %s\n",
207 		    (int)dpr->dpr_pid, msg);
208 	} else {
209 		dprn->dprn_dpr = dpr;
210 		if (msg == NULL)
211 			dprn->dprn_errmsg[0] = '\0';
212 		else
213 			(void) strlcpy(dprn->dprn_errmsg, msg,
214 			    sizeof (dprn->dprn_errmsg));
215 
216 		(void) pthread_mutex_lock(&dph->dph_lock);
217 
218 		dprn->dprn_next = dph->dph_notify;
219 		dph->dph_notify = dprn;
220 
221 		(void) pthread_cond_broadcast(&dph->dph_cv);
222 		(void) pthread_mutex_unlock(&dph->dph_lock);
223 	}
224 }
225 
226 /*
227  * Check to see if the control thread was requested to stop when the victim
228  * process reached a particular event (why) rather than continuing the victim.
229  * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
230  * If 'why' is not set, this function returns immediately and does nothing.
231  */
232 static void
233 dt_proc_stop(dt_proc_t *dpr, uint8_t why)
234 {
235 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
236 	assert(why != DT_PROC_STOP_IDLE);
237 
238 	if (dpr->dpr_stop & why) {
239 		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
240 		dpr->dpr_stop &= ~why;
241 
242 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
243 
244 		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
245 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
246 	}
247 }
248 
249 /*ARGSUSED*/
250 static void
251 dt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
252 {
253 	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
254 	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
255 }
256 
257 static void
258 dt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
259 {
260 	rd_event_msg_t rdm;
261 	rd_err_e err;
262 
263 	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
264 		dt_dprintf("pid %d: failed to get %s event message: %s\n",
265 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
266 		return;
267 	}
268 
269 	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
270 	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
271 
272 	switch (rdm.type) {
273 	case RD_DLACTIVITY:
274 		if (rdm.u.state != RD_CONSISTENT)
275 			break;
276 
277 		Pupdate_syms(dpr->dpr_proc);
278 		if (dt_pid_create_probes_module(dtp, dpr) != 0)
279 			dt_proc_notify(dtp, dtp->dt_procs, dpr,
280 			    dpr->dpr_errmsg);
281 
282 		break;
283 	case RD_PREINIT:
284 		Pupdate_syms(dpr->dpr_proc);
285 		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
286 		break;
287 	case RD_POSTINIT:
288 		Pupdate_syms(dpr->dpr_proc);
289 		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
290 		break;
291 	}
292 }
293 
294 static void
295 dt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
296 {
297 	rd_notify_t rdn;
298 	rd_err_e err;
299 
300 	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
301 		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
302 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
303 		return;
304 	}
305 
306 	if (rdn.type != RD_NOTIFY_BPT) {
307 		dt_dprintf("pid %d: event %s has unexpected type %d\n",
308 		    (int)dpr->dpr_pid, evname, rdn.type);
309 		return;
310 	}
311 
312 	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
313 	    (dt_bkpt_f *)dt_proc_rdevent, (void *)evname);
314 }
315 
316 /*
317  * Common code for enabling events associated with the run-time linker after
318  * attaching to a process or after a victim process completes an exec(2).
319  */
320 static void
321 dt_proc_attach(dt_proc_t *dpr, int exec)
322 {
323 	const pstatus_t *psp = Pstatus(dpr->dpr_proc);
324 	rd_err_e err;
325 	GElf_Sym sym;
326 
327 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
328 
329 	if (exec) {
330 		if (psp->pr_lwp.pr_errno != 0)
331 			return; /* exec failed: nothing needs to be done */
332 
333 		dt_proc_bpdestroy(dpr, B_FALSE);
334 		Preset_maps(dpr->dpr_proc);
335 	}
336 
337 	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
338 	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
339 		dt_proc_rdwatch(dpr, RD_PREINIT, "RD_PREINIT");
340 		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
341 		dt_proc_rdwatch(dpr, RD_DLACTIVITY, "RD_DLACTIVITY");
342 	} else {
343 		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
344 		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
345 		    "rtld_db agent initialization failed");
346 	}
347 
348 	Pupdate_maps(dpr->dpr_proc);
349 
350 	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
351 	    "a.out", "main", &sym, NULL) == 0) {
352 		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
353 		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
354 	} else {
355 		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
356 		    (int)dpr->dpr_pid, strerror(errno));
357 	}
358 }
359 
360 /*
361  * Wait for a stopped process to be set running again by some other debugger.
362  * This is typically not required by /proc-based debuggers, since the usual
363  * model is that one debugger controls one victim.  But DTrace, as usual, has
364  * its own needs: the stop() action assumes that prun(1) or some other tool
365  * will be applied to resume the victim process.  This could be solved by
366  * adding a PCWRUN directive to /proc, but that seems like overkill unless
367  * other debuggers end up needing this functionality, so we implement a cheap
368  * equivalent to PCWRUN using the set of existing kernel mechanisms.
369  *
370  * Our intent is really not just to wait for the victim to run, but rather to
371  * wait for it to run and then stop again for a reason other than the current
372  * PR_REQUESTED stop.  Since PCWSTOP/Pstopstatus() can be applied repeatedly
373  * to a stopped process and will return the same result without affecting the
374  * victim, we can just perform these operations repeatedly until Pstate()
375  * changes, the representative LWP ID changes, or the stop timestamp advances.
376  * dt_proc_control() will then rediscover the new state and continue as usual.
377  * When the process is still stopped in the same exact state, we sleep for a
378  * brief interval before waiting again so as not to spin consuming CPU cycles.
379  */
380 static void
381 dt_proc_waitrun(dt_proc_t *dpr)
382 {
383 	struct ps_prochandle *P = dpr->dpr_proc;
384 	const lwpstatus_t *psp = &Pstatus(P)->pr_lwp;
385 
386 	int krflag = psp->pr_flags & (PR_KLC | PR_RLC);
387 	timestruc_t tstamp = psp->pr_tstamp;
388 	lwpid_t lwpid = psp->pr_lwpid;
389 
390 	const long wstop = PCWSTOP;
391 	int pfd = Pctlfd(P);
392 
393 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
394 	assert(psp->pr_flags & PR_STOPPED);
395 	assert(Pstate(P) == PS_STOP);
396 
397 	/*
398 	 * While we are waiting for the victim to run, clear PR_KLC and PR_RLC
399 	 * so that if the libdtrace client is killed, the victim stays stopped.
400 	 * dt_proc_destroy() will also observe this and perform PRELEASE_HANG.
401 	 */
402 	(void) Punsetflags(P, krflag);
403 	Psync(P);
404 
405 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
406 
407 	while (!dpr->dpr_quit) {
408 		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
409 			continue; /* check dpr_quit and continue waiting */
410 
411 		(void) pthread_mutex_lock(&dpr->dpr_lock);
412 		(void) Pstopstatus(P, PCNULL, 0);
413 		psp = &Pstatus(P)->pr_lwp;
414 
415 		/*
416 		 * If we've reached a new state, found a new representative, or
417 		 * the stop timestamp has changed, restore PR_KLC/PR_RLC to its
418 		 * original setting and then return with dpr_lock held.
419 		 */
420 		if (Pstate(P) != PS_STOP || psp->pr_lwpid != lwpid ||
421 		    bcmp(&psp->pr_tstamp, &tstamp, sizeof (tstamp)) != 0) {
422 			(void) Psetflags(P, krflag);
423 			Psync(P);
424 			return;
425 		}
426 
427 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
428 		(void) poll(NULL, 0, MILLISEC / 2);
429 	}
430 
431 	(void) pthread_mutex_lock(&dpr->dpr_lock);
432 }
433 
/*
 * Start-up argument for dt_proc_control().  dt_proc_create_thread() fills one
 * in on its own stack and passes its address to pthread_create(); the control
 * thread reads both fields when it starts.
 */
typedef struct dt_proc_control_data {
	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
	dt_proc_t *dpcd_proc;			/* process to control */
} dt_proc_control_data_t;
438 
439 /*
440  * Main loop for all victim process control threads.  We initialize all the
441  * appropriate /proc control mechanisms, and then enter a loop waiting for
442  * the process to stop on an event or die.  We process any events by calling
443  * appropriate subroutines, and exit when the victim dies or we lose control.
444  *
445  * The control thread synchronizes the use of dpr_proc with other libdtrace
446  * threads using dpr_lock.  We hold the lock for all of our operations except
447  * waiting while the process is running: this is accomplished by writing a
448  * PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.  If the
449  * libdtrace client wishes to exit or abort our wait, SIGCANCEL can be used.
450  */
451 static void *
452 dt_proc_control(void *arg)
453 {
454 	dt_proc_control_data_t *datap = arg;
455 	dtrace_hdl_t *dtp = datap->dpcd_hdl;
456 	dt_proc_t *dpr = datap->dpcd_proc;
457 	dt_proc_hash_t *dph = dpr->dpr_hdl->dt_procs;
458 	struct ps_prochandle *P = dpr->dpr_proc;
459 
460 	int pfd = Pctlfd(P);
461 	int pid = dpr->dpr_pid;
462 
463 	const long wstop = PCWSTOP;
464 	int notify = B_FALSE;
465 
466 	/*
467 	 * We disable the POSIX thread cancellation mechanism so that the
468 	 * client program using libdtrace can't accidentally cancel our thread.
469 	 * dt_proc_destroy() uses SIGCANCEL explicitly to simply poke us out
470 	 * of PCWSTOP with EINTR, at which point we will see dpr_quit and exit.
471 	 */
472 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
473 
474 	/*
475 	 * Set up the corresponding process for tracing by libdtrace.  We want
476 	 * to be able to catch breakpoints and efficiently single-step over
477 	 * them, and we need to enable librtld_db to watch libdl activity.
478 	 */
479 	(void) pthread_mutex_lock(&dpr->dpr_lock);
480 
481 	(void) Punsetflags(P, PR_ASYNC);	/* require synchronous mode */
482 	(void) Psetflags(P, PR_BPTADJ);		/* always adjust eip on x86 */
483 	(void) Punsetflags(P, PR_FORK);		/* do not inherit on fork */
484 
485 	(void) Pfault(P, FLTBPT, B_TRUE);	/* always trace breakpoints */
486 	(void) Pfault(P, FLTTRACE, B_TRUE);	/* always trace single-step */
487 
488 	/*
489 	 * We must trace exit from exec() system calls so that if the exec is
490 	 * successful, we can reset our breakpoints and re-initialize libproc.
491 	 */
492 	(void) Psysexit(P, SYS_exec, B_TRUE);
493 	(void) Psysexit(P, SYS_execve, B_TRUE);
494 
495 	/*
496 	 * We must trace entry and exit for fork() system calls in order to
497 	 * disable our breakpoints temporarily during the fork.  We do not set
498 	 * the PR_FORK flag, so if fork succeeds the child begins executing and
499 	 * does not inherit any other tracing behaviors or a control thread.
500 	 */
501 	(void) Psysentry(P, SYS_vfork, B_TRUE);
502 	(void) Psysexit(P, SYS_vfork, B_TRUE);
503 	(void) Psysentry(P, SYS_fork1, B_TRUE);
504 	(void) Psysexit(P, SYS_fork1, B_TRUE);
505 	(void) Psysentry(P, SYS_forkall, B_TRUE);
506 	(void) Psysexit(P, SYS_forkall, B_TRUE);
507 	(void) Psysentry(P, SYS_forksys, B_TRUE);
508 	(void) Psysexit(P, SYS_forksys, B_TRUE);
509 
510 	Psync(P);				/* enable all /proc changes */
511 	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */
512 
513 	/*
514 	 * If PR_KLC is set, we created the process; otherwise we grabbed it.
515 	 * Check for an appropriate stop request and wait for dt_proc_continue.
516 	 */
517 	if (Pstatus(P)->pr_flags & PR_KLC)
518 		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
519 	else
520 		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);
521 
522 	if (Psetrun(P, 0, 0) == -1) {
523 		dt_dprintf("pid %d: failed to set running: %s\n",
524 		    (int)dpr->dpr_pid, strerror(errno));
525 	}
526 
527 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
528 
529 	/*
530 	 * Wait for the process corresponding to this control thread to stop,
531 	 * process the event, and then set it running again.  We want to sleep
532 	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
533 	 * ps_prochandle in the meantime (e.g. ustack()).  To do this, we write
534 	 * a PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.
535 	 * Once the process stops, we wake up, grab dpr_lock, and then call
536 	 * Pwait() (which will return immediately) and do our processing.
537 	 */
538 	while (!dpr->dpr_quit) {
539 		const lwpstatus_t *psp;
540 
541 		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
542 			continue; /* check dpr_quit and continue waiting */
543 
544 		(void) pthread_mutex_lock(&dpr->dpr_lock);
545 pwait_locked:
546 		if (Pstopstatus(P, PCNULL, 0) == -1 && errno == EINTR) {
547 			(void) pthread_mutex_unlock(&dpr->dpr_lock);
548 			continue; /* check dpr_quit and continue waiting */
549 		}
550 
551 		switch (Pstate(P)) {
552 		case PS_STOP:
553 			psp = &Pstatus(P)->pr_lwp;
554 
555 			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
556 			    pid, psp->pr_why, psp->pr_what);
557 
558 			/*
559 			 * If the process stops showing PR_REQUESTED, then the
560 			 * DTrace stop() action was applied to it or another
561 			 * debugging utility (e.g. pstop(1)) asked it to stop.
562 			 * In either case, the user's intention is for the
563 			 * process to remain stopped until another external
564 			 * mechanism (e.g. prun(1)) is applied.  So instead of
565 			 * setting the process running ourself, we wait for
566 			 * someone else to do so.  Once that happens, we return
567 			 * to our normal loop waiting for an event of interest.
568 			 */
569 			if (psp->pr_why == PR_REQUESTED) {
570 				dt_proc_waitrun(dpr);
571 				(void) pthread_mutex_unlock(&dpr->dpr_lock);
572 				continue;
573 			}
574 
575 			/*
576 			 * If the process stops showing one of the events that
577 			 * we are tracing, perform the appropriate response.
578 			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
579 			 * PR_JOBCONTROL by design: if one of these conditions
580 			 * occurs, we will fall through to Psetrun() but the
581 			 * process will remain stopped in the kernel by the
582 			 * corresponding mechanism (e.g. job control stop).
583 			 */
584 			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
585 				dt_proc_bpmatch(dtp, dpr);
586 			else if (psp->pr_why == PR_SYSENTRY &&
587 			    IS_SYS_FORK(psp->pr_what))
588 				dt_proc_bpdisable(dpr);
589 			else if (psp->pr_why == PR_SYSEXIT &&
590 			    IS_SYS_FORK(psp->pr_what))
591 				dt_proc_bpenable(dpr);
592 			else if (psp->pr_why == PR_SYSEXIT &&
593 			    IS_SYS_EXEC(psp->pr_what))
594 				dt_proc_attach(dpr, B_TRUE);
595 			break;
596 
597 		case PS_LOST:
598 			if (Preopen(P) == 0)
599 				goto pwait_locked;
600 
601 			dt_dprintf("pid %d: proc lost: %s\n",
602 			    pid, strerror(errno));
603 
604 			dpr->dpr_quit = B_TRUE;
605 			notify = B_TRUE;
606 			break;
607 
608 		case PS_UNDEAD:
609 			dt_dprintf("pid %d: proc died\n", pid);
610 			dpr->dpr_quit = B_TRUE;
611 			notify = B_TRUE;
612 			break;
613 		}
614 
615 		if (Pstate(P) != PS_UNDEAD && Psetrun(P, 0, 0) == -1) {
616 			dt_dprintf("pid %d: failed to set running: %s\n",
617 			    (int)dpr->dpr_pid, strerror(errno));
618 		}
619 
620 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
621 	}
622 
623 	/*
624 	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
625 	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
626 	 */
627 	if (notify)
628 		dt_proc_notify(dtp, dph, dpr, NULL);
629 
630 	/*
631 	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
632 	 * dpr_tid to indicate the control thread has exited, and notify any
633 	 * waiting thread in dt_proc_destroy() that we have succesfully exited.
634 	 */
635 	(void) pthread_mutex_lock(&dpr->dpr_lock);
636 
637 	dt_proc_bpdestroy(dpr, B_TRUE);
638 	dpr->dpr_done = B_TRUE;
639 	dpr->dpr_tid = 0;
640 
641 	(void) pthread_cond_broadcast(&dpr->dpr_cv);
642 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
643 
644 	return (NULL);
645 }
646 
647 /*PRINTFLIKE3*/
648 static struct ps_prochandle *
649 dt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
650 {
651 	va_list ap;
652 
653 	va_start(ap, format);
654 	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
655 	va_end(ap);
656 
657 	if (dpr->dpr_proc != NULL)
658 		Prelease(dpr->dpr_proc, 0);
659 
660 	dt_free(dtp, dpr);
661 	(void) dt_set_errno(dtp, EDT_COMPILER);
662 	return (NULL);
663 }
664 
665 dt_proc_t *
666 dt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
667 {
668 	dt_proc_hash_t *dph = dtp->dt_procs;
669 	pid_t pid = Pstatus(P)->pr_pid;
670 	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
671 
672 	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
673 		if (dpr->dpr_pid == pid)
674 			break;
675 		else
676 			dpp = &dpr->dpr_hash;
677 	}
678 
679 	assert(dpr != NULL);
680 	assert(dpr->dpr_proc == P);
681 
682 	if (remove)
683 		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
684 
685 	return (dpr);
686 }
687 
688 static void
689 dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
690 {
691 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
692 	dt_proc_hash_t *dph = dtp->dt_procs;
693 	dt_proc_notify_t *npr, **npp;
694 	int rflag;
695 
696 	assert(dpr != NULL);
697 
698 	/*
699 	 * If neither PR_KLC nor PR_RLC is set, then the process is stopped by
700 	 * an external debugger and we were waiting in dt_proc_waitrun().
701 	 * Leave the process in this condition using PRELEASE_HANG.
702 	 */
703 	if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) {
704 		dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid);
705 		rflag = PRELEASE_HANG;
706 	} else {
707 		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
708 		rflag = 0; /* apply kill or run-on-last-close */
709 	}
710 
711 	if (dpr->dpr_tid) {
712 		/*
713 		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
714 		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
715 		 * long-term /proc system call.  Our daemon threads have POSIX
716 		 * cancellation disabled, so EINTR will be the only effect.  We
717 		 * then wait for dpr_done to indicate the thread has exited.
718 		 *
719 		 * We can't use pthread_kill() to send SIGCANCEL because the
720 		 * interface forbids it and we can't use pthread_cancel()
721 		 * because with cancellation disabled it won't actually
722 		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
723 		 * to do the job.  This is all built on evil knowledge of
724 		 * the details of the cancellation mechanism in libc.
725 		 */
726 		(void) pthread_mutex_lock(&dpr->dpr_lock);
727 		dpr->dpr_quit = B_TRUE;
728 		(void) _lwp_kill(dpr->dpr_tid, SIGCANCEL);
729 
730 		/*
731 		 * If the process is currently idling in dt_proc_stop(), re-
732 		 * enable breakpoints and poke it into running again.
733 		 */
734 		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
735 			dt_proc_bpenable(dpr);
736 			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
737 			(void) pthread_cond_broadcast(&dpr->dpr_cv);
738 		}
739 
740 		while (!dpr->dpr_done)
741 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
742 
743 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
744 	}
745 
746 	/*
747 	 * Before we free the process structure, remove this dt_proc_t from the
748 	 * lookup hash, and then walk the dt_proc_hash_t's notification list
749 	 * and remove this dt_proc_t if it is enqueued.
750 	 */
751 	(void) pthread_mutex_lock(&dph->dph_lock);
752 	(void) dt_proc_lookup(dtp, P, B_TRUE);
753 	npp = &dph->dph_notify;
754 
755 	while ((npr = *npp) != NULL) {
756 		if (npr->dprn_dpr == dpr) {
757 			*npp = npr->dprn_next;
758 			dt_free(dtp, npr);
759 		} else {
760 			npp = &npr->dprn_next;
761 		}
762 	}
763 
764 	(void) pthread_mutex_unlock(&dph->dph_lock);
765 
766 	/*
767 	 * Remove the dt_proc_list from the LRU list, release the underlying
768 	 * libproc handle, and free our dt_proc_t data structure.
769 	 */
770 	if (dpr->dpr_cacheable) {
771 		assert(dph->dph_lrucnt != 0);
772 		dph->dph_lrucnt--;
773 	}
774 
775 	dt_list_delete(&dph->dph_lrulist, dpr);
776 	Prelease(dpr->dpr_proc, rflag);
777 	dt_free(dtp, dpr);
778 }
779 
780 static int
781 dt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
782 {
783 	dt_proc_control_data_t data;
784 	sigset_t nset, oset;
785 	pthread_attr_t a;
786 	int err;
787 
788 	(void) pthread_mutex_lock(&dpr->dpr_lock);
789 	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */
790 
791 	(void) pthread_attr_init(&a);
792 	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
793 
794 	(void) sigfillset(&nset);
795 	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
796 	(void) sigdelset(&nset, SIGCANCEL);	/* see dt_proc_destroy() */
797 
798 	data.dpcd_hdl = dtp;
799 	data.dpcd_proc = dpr;
800 
801 	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
802 	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
803 	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
804 
805 	/*
806 	 * If the control thread was created, then wait on dpr_cv for either
807 	 * dpr_done to be set (the victim died or the control thread failed)
808 	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
809 	 * stopped by /proc and the control thread is at the rendezvous event.
810 	 * On success, we return with the process and control thread stopped:
811 	 * the caller can then apply dt_proc_continue() to resume both.
812 	 */
813 	if (err == 0) {
814 		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
815 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
816 
817 		/*
818 		 * If dpr_done is set, the control thread aborted before it
819 		 * reached the rendezvous event.  This is either due to PS_LOST
820 		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
821 		 * small amount of useful information to help figure it out.
822 		 */
823 		if (dpr->dpr_done) {
824 			const psinfo_t *prp = Ppsinfo(dpr->dpr_proc);
825 			int stat = prp ? prp->pr_wstat : 0;
826 			int pid = dpr->dpr_pid;
827 
828 			if (Pstate(dpr->dpr_proc) == PS_LOST) {
829 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
830 				    "failed to control pid %d: process exec'd "
831 				    "set-id or unobservable program\n", pid);
832 			} else if (WIFSIGNALED(stat)) {
833 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
834 				    "failed to control pid %d: process died "
835 				    "from signal %d\n", pid, WTERMSIG(stat));
836 			} else {
837 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
838 				    "failed to control pid %d: process exited "
839 				    "with status %d\n", pid, WEXITSTATUS(stat));
840 			}
841 
842 			err = ESRCH; /* cause grab() or create() to fail */
843 		}
844 	} else {
845 		(void) dt_proc_error(dpr->dpr_hdl, dpr,
846 		    "failed to create control thread for process-id %d: %s\n",
847 		    (int)dpr->dpr_pid, strerror(err));
848 	}
849 
850 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
851 	(void) pthread_attr_destroy(&a);
852 
853 	return (err);
854 }
855 
856 struct ps_prochandle *
857 dt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
858 {
859 	dt_proc_hash_t *dph = dtp->dt_procs;
860 	dt_proc_t *dpr;
861 	int err;
862 
863 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
864 		return (NULL); /* errno is set for us */
865 
866 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
867 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
868 
869 	if ((dpr->dpr_proc = Pcreate(file, argv, &err, NULL, 0)) == NULL) {
870 		return (dt_proc_error(dtp, dpr,
871 		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
872 	}
873 
874 	dpr->dpr_hdl = dtp;
875 	dpr->dpr_pid = Pstatus(dpr->dpr_proc)->pr_pid;
876 
877 	(void) Punsetflags(dpr->dpr_proc, PR_RLC);
878 	(void) Psetflags(dpr->dpr_proc, PR_KLC);
879 
880 	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
881 		return (NULL); /* dt_proc_error() has been called for us */
882 
883 	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
884 	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
885 	dt_list_prepend(&dph->dph_lrulist, dpr);
886 
887 	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
888 	dpr->dpr_refs++;
889 
890 	return (dpr->dpr_proc);
891 }
892 
893 struct ps_prochandle *
894 dt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
895 {
896 	dt_proc_hash_t *dph = dtp->dt_procs;
897 	uint_t h = pid & (dph->dph_hashlen - 1);
898 	dt_proc_t *dpr, *opr;
899 	int err;
900 
901 	/*
902 	 * Search the hash table for the pid.  If it is already grabbed or
903 	 * created, move the handle to the front of the lrulist, increment
904 	 * the reference count, and return the existing ps_prochandle.
905 	 */
906 	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
907 		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
908 			/*
909 			 * If the cached handle was opened read-only and
910 			 * this request is for a writeable handle, mark
911 			 * the cached handle as stale and open a new handle.
912 			 * Since it's stale, unmark it as cacheable.
913 			 */
914 			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
915 				dt_dprintf("upgrading pid %d\n", (int)pid);
916 				dpr->dpr_stale = B_TRUE;
917 				dpr->dpr_cacheable = B_FALSE;
918 				dph->dph_lrucnt--;
919 				break;
920 			}
921 
922 			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
923 			dt_list_delete(&dph->dph_lrulist, dpr);
924 			dt_list_prepend(&dph->dph_lrulist, dpr);
925 			dpr->dpr_refs++;
926 			return (dpr->dpr_proc);
927 		}
928 	}
929 
930 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
931 		return (NULL); /* errno is set for us */
932 
933 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
934 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
935 
936 	if ((dpr->dpr_proc = Pgrab(pid, flags, &err)) == NULL) {
937 		return (dt_proc_error(dtp, dpr,
938 		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
939 	}
940 
941 	dpr->dpr_hdl = dtp;
942 	dpr->dpr_pid = pid;
943 
944 	(void) Punsetflags(dpr->dpr_proc, PR_KLC);
945 	(void) Psetflags(dpr->dpr_proc, PR_RLC);
946 
947 	/*
948 	 * If we are attempting to grab the process without a monitor
949 	 * thread, then mark the process cacheable only if it's being
950 	 * grabbed read-only.  If we're currently caching more process
951 	 * handles than dph_lrulim permits, attempt to find the
952 	 * least-recently-used handle that is currently unreferenced and
953 	 * release it from the cache.  Otherwise we are grabbing the process
954 	 * for control: create a control thread for this process and store
955 	 * its ID in dpr->dpr_tid.
956 	 */
957 	if (nomonitor || (flags & PGRAB_RDONLY)) {
958 		if (dph->dph_lrucnt >= dph->dph_lrulim) {
959 			for (opr = dt_list_prev(&dph->dph_lrulist);
960 			    opr != NULL; opr = dt_list_prev(opr)) {
961 				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
962 					dt_proc_destroy(dtp, opr->dpr_proc);
963 					break;
964 				}
965 			}
966 		}
967 
968 		if (flags & PGRAB_RDONLY) {
969 			dpr->dpr_cacheable = B_TRUE;
970 			dpr->dpr_rdonly = B_TRUE;
971 			dph->dph_lrucnt++;
972 		}
973 
974 	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
975 		return (NULL); /* dt_proc_error() has been called for us */
976 
977 	dpr->dpr_hash = dph->dph_hash[h];
978 	dph->dph_hash[h] = dpr;
979 	dt_list_prepend(&dph->dph_lrulist, dpr);
980 
981 	dt_dprintf("grabbed pid %d\n", (int)pid);
982 	dpr->dpr_refs++;
983 
984 	return (dpr->dpr_proc);
985 }
986 
987 void
988 dt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
989 {
990 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
991 	dt_proc_hash_t *dph = dtp->dt_procs;
992 
993 	assert(dpr != NULL);
994 	assert(dpr->dpr_refs != 0);
995 
996 	if (--dpr->dpr_refs == 0 &&
997 	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
998 		dt_proc_destroy(dtp, P);
999 }
1000 
1001 void
1002 dt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1003 {
1004 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1005 
1006 	(void) pthread_mutex_lock(&dpr->dpr_lock);
1007 
1008 	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
1009 		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
1010 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
1011 	}
1012 
1013 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
1014 }
1015 
1016 void
1017 dt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1018 {
1019 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1020 	int err = pthread_mutex_lock(&dpr->dpr_lock);
1021 	assert(err == 0); /* check for recursion */
1022 }
1023 
1024 void
1025 dt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1026 {
1027 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1028 	int err = pthread_mutex_unlock(&dpr->dpr_lock);
1029 	assert(err == 0); /* check for unheld lock */
1030 }
1031 
1032 void
1033 dt_proc_hash_create(dtrace_hdl_t *dtp)
1034 {
1035 	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
1036 	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
1037 
1038 		(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
1039 		(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
1040 
1041 		dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
1042 		dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
1043 	}
1044 }
1045 
1046 void
1047 dt_proc_hash_destroy(dtrace_hdl_t *dtp)
1048 {
1049 	dt_proc_hash_t *dph = dtp->dt_procs;
1050 	dt_proc_t *dpr;
1051 
1052 	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
1053 		dt_proc_destroy(dtp, dpr->dpr_proc);
1054 
1055 	dtp->dt_procs = NULL;
1056 	dt_free(dtp, dph);
1057 }
1058 
1059 struct ps_prochandle *
1060 dtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
1061 {
1062 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1063 	struct ps_prochandle *P = dt_proc_create(dtp, file, argv);
1064 
1065 	if (P != NULL && idp != NULL && idp->di_id == 0)
1066 		idp->di_id = Pstatus(P)->pr_pid; /* $target = created pid */
1067 
1068 	return (P);
1069 }
1070 
1071 struct ps_prochandle *
1072 dtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1073 {
1074 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1075 	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1076 
1077 	if (P != NULL && idp != NULL && idp->di_id == 0)
1078 		idp->di_id = pid; /* $target = grabbed pid */
1079 
1080 	return (P);
1081 }
1082 
/*
 * Forward to dt_proc_release() with the same handle and libproc handle.
 */
1083 void
1084 dtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1085 {
1086 	dt_proc_release(dtp, P);
1087 }
1088 
/*
 * Forward to dt_proc_continue() with the same handle and libproc handle.
 */
1089 void
1090 dtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1091 {
1092 	dt_proc_continue(dtp, P);
1093 }
1094