xref: /titanic_51/usr/src/lib/libdtrace/common/dt_proc.c (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * DTrace Process Control
32  *
33  * This file provides a set of routines that permit libdtrace and its clients
34  * to create and grab process handles using libproc, and to share these handles
35  * between library mechanisms that need libproc access, such as ustack(), and
36  * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
37  * The library provides several mechanisms in the libproc control layer:
38  *
39  * Reference Counting: The library code and client code can independently grab
40  * the same process handles without interfering with one another.  Only when
41  * the reference count drops to zero and the handle is not being cached (see
42  * below for more information on caching) will Prelease() be called on it.
43  *
44  * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
45  * the reference count drops to zero, the handle is not immediately released.
46  * Instead, libproc handles are maintained on dph_lrulist in order from most-
47  * recently accessed to least-recently accessed.  Idle handles are maintained
48  * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
49  * to ustack() to avoid the overhead of releasing and re-grabbing processes.
50  *
51  * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
52  * or created by dt_proc_create(), a control thread is created to provide
53  * callbacks on process exit and symbol table caching on dlopen()s.
54  *
55  * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
56  * are provided to synchronize access to the libproc handle between libdtrace
57  * code and client code and the control thread's use of the ps_prochandle.
58  *
59  * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
60  * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
61  * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
62  * synchronization between libdtrace control threads and the client thread.
63  *
64  * The ps_prochandles themselves are maintained along with a dt_proc_t struct
65  * in a hash table indexed by PID.  This provides basic locking and reference
66  * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
67  * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
68  * the current limit on the number of actively cached entries.
69  *
70  * The control thread for a process establishes breakpoints at the rtld_db
71  * locations of interest, updates mappings and symbol tables at these points,
72  * and handles exec and fork (by always following the parent).  The control
73  * thread automatically exits when the process dies or control is lost.
74  *
75  * A simple notification mechanism is provided for libdtrace clients using
76  * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
77  * such an event occurs, the dt_proc_t itself is enqueued on a notification
78  * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
79  * up using this condition and will then call the client handler as necessary.
80  */
81 
82 #include <sys/wait.h>
83 #include <sys/lwp.h>
84 #include <strings.h>
85 #include <signal.h>
86 #include <assert.h>
87 #include <errno.h>
88 
89 #include <dt_proc.h>
90 #include <dt_pid.h>
91 #include <dt_impl.h>
92 
/*
 * Classify a system call number 'w' as one of the exec or fork variants that
 * the control thread traces.  The argument is fully parenthesized so that an
 * expression argument (e.g. one using '|' or '?:') compares as a whole.  Note
 * that 'w' is evaluated more than once, so it must not have side effects.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_exec || (w) == SYS_execve)
#define	IS_SYS_FORK(w)	\
	((w) == SYS_vfork || (w) == SYS_fork1 || (w) == SYS_forkall)
95 
96 static dt_bkpt_t *
97 dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
98 {
99 	struct ps_prochandle *P = dpr->dpr_proc;
100 	dt_bkpt_t *dbp;
101 
102 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
103 
104 	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
105 		dbp->dbp_func = func;
106 		dbp->dbp_data = data;
107 		dbp->dbp_addr = addr;
108 
109 		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
110 			dbp->dbp_active = B_TRUE;
111 
112 		dt_list_append(&dpr->dpr_bps, dbp);
113 	}
114 
115 	return (dbp);
116 }
117 
118 static void
119 dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
120 {
121 	int state = Pstate(dpr->dpr_proc);
122 	dt_bkpt_t *dbp, *nbp;
123 
124 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
125 
126 	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
127 		if (delbkpts && dbp->dbp_active &&
128 		    state != PS_LOST && state != PS_UNDEAD) {
129 			(void) Pdelbkpt(dpr->dpr_proc,
130 			    dbp->dbp_addr, dbp->dbp_instr);
131 		}
132 		nbp = dt_list_next(dbp);
133 		dt_list_delete(&dpr->dpr_bps, dbp);
134 		dt_free(dpr->dpr_hdl, dbp);
135 	}
136 }
137 
138 static void
139 dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
140 {
141 	const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp;
142 	dt_bkpt_t *dbp;
143 
144 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
145 
146 	for (dbp = dt_list_next(&dpr->dpr_bps);
147 	    dbp != NULL; dbp = dt_list_next(dbp)) {
148 		if (psp->pr_reg[R_PC] == dbp->dbp_addr)
149 			break;
150 	}
151 
152 	if (dbp == NULL) {
153 		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
154 		    (int)dpr->dpr_pid, (ulong_t)psp->pr_reg[R_PC]);
155 		return;
156 	}
157 
158 	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
159 	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
160 
161 	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
162 	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
163 }
164 
165 void
166 dt_proc_bpenable(dt_proc_t *dpr)
167 {
168 	dt_bkpt_t *dbp;
169 
170 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
171 
172 	for (dbp = dt_list_next(&dpr->dpr_bps);
173 	    dbp != NULL; dbp = dt_list_next(dbp)) {
174 		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
175 		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
176 			dbp->dbp_active = B_TRUE;
177 	}
178 
179 	dt_dprintf("breakpoints enabled\n");
180 }
181 
182 void
183 dt_proc_bpdisable(dt_proc_t *dpr)
184 {
185 	dt_bkpt_t *dbp;
186 
187 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
188 
189 	for (dbp = dt_list_next(&dpr->dpr_bps);
190 	    dbp != NULL; dbp = dt_list_next(dbp)) {
191 		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
192 		    dbp->dbp_addr, dbp->dbp_instr) == 0)
193 			dbp->dbp_active = B_FALSE;
194 	}
195 
196 	dt_dprintf("breakpoints disabled\n");
197 }
198 
199 static void
200 dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
201     const char *msg)
202 {
203 	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
204 
205 	if (dprn == NULL) {
206 		dt_dprintf("failed to allocate notification for %d %s\n",
207 		    (int)dpr->dpr_pid, msg);
208 	} else {
209 		dprn->dprn_dpr = dpr;
210 		if (msg == NULL)
211 			dprn->dprn_errmsg[0] = '\0';
212 		else
213 			(void) strlcpy(dprn->dprn_errmsg, msg,
214 			    sizeof (dprn->dprn_errmsg));
215 
216 		(void) pthread_mutex_lock(&dph->dph_lock);
217 
218 		dprn->dprn_next = dph->dph_notify;
219 		dph->dph_notify = dprn;
220 
221 		(void) pthread_cond_broadcast(&dph->dph_cv);
222 		(void) pthread_mutex_unlock(&dph->dph_lock);
223 	}
224 }
225 
226 /*
227  * Check to see if the control thread was requested to stop when the victim
228  * process reached a particular event (why) rather than continuing the victim.
229  * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
230  * If 'why' is not set, this function returns immediately and does nothing.
231  */
232 static void
233 dt_proc_stop(dt_proc_t *dpr, uint8_t why)
234 {
235 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
236 	assert(why != DT_PROC_STOP_IDLE);
237 
238 	if (dpr->dpr_stop & why) {
239 		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
240 		dpr->dpr_stop &= ~why;
241 
242 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
243 
244 		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
245 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
246 	}
247 }
248 
249 /*ARGSUSED*/
250 static void
251 dt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
252 {
253 	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
254 	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
255 }
256 
257 static void
258 dt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
259 {
260 	rd_event_msg_t rdm;
261 	rd_err_e err;
262 
263 	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
264 		dt_dprintf("pid %d: failed to get %s event message: %s\n",
265 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
266 		return;
267 	}
268 
269 	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
270 	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
271 
272 	switch (rdm.type) {
273 	case RD_DLACTIVITY:
274 		if (rdm.u.state != RD_CONSISTENT)
275 			break;
276 
277 		Pupdate_syms(dpr->dpr_proc);
278 		if (dt_pid_create_probes_module(dtp, dpr) != 0)
279 			dt_proc_notify(dtp, dtp->dt_procs, dpr,
280 			    dpr->dpr_errmsg);
281 
282 		break;
283 	case RD_PREINIT:
284 		Pupdate_syms(dpr->dpr_proc);
285 		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
286 		break;
287 	case RD_POSTINIT:
288 		Pupdate_syms(dpr->dpr_proc);
289 		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
290 		break;
291 	}
292 }
293 
294 static void
295 dt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
296 {
297 	rd_notify_t rdn;
298 	rd_err_e err;
299 
300 	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
301 		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
302 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
303 		return;
304 	}
305 
306 	if (rdn.type != RD_NOTIFY_BPT) {
307 		dt_dprintf("pid %d: event %s has unexpected type %d\n",
308 		    (int)dpr->dpr_pid, evname, rdn.type);
309 		return;
310 	}
311 
312 	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
313 	    (dt_bkpt_f *)dt_proc_rdevent, (void *)evname);
314 }
315 
316 /*
317  * Common code for enabling events associated with the run-time linker after
318  * attaching to a process or after a victim process completes an exec(2).
319  */
320 static void
321 dt_proc_attach(dt_proc_t *dpr, int exec)
322 {
323 	const pstatus_t *psp = Pstatus(dpr->dpr_proc);
324 	rd_err_e err;
325 	GElf_Sym sym;
326 
327 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
328 
329 	if (exec) {
330 		if (psp->pr_lwp.pr_errno != 0)
331 			return; /* exec failed: nothing needs to be done */
332 
333 		dt_proc_bpdestroy(dpr, B_FALSE);
334 		Preset_maps(dpr->dpr_proc);
335 	}
336 
337 	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
338 	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
339 		dt_proc_rdwatch(dpr, RD_PREINIT, "RD_PREINIT");
340 		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
341 		dt_proc_rdwatch(dpr, RD_DLACTIVITY, "RD_DLACTIVITY");
342 	} else {
343 		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
344 		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
345 		    "rtld_db agent initialization failed");
346 	}
347 
348 	Pupdate_maps(dpr->dpr_proc);
349 
350 	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
351 	    "a.out", "main", &sym, NULL) == 0) {
352 		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
353 		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
354 	} else {
355 		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
356 		    (int)dpr->dpr_pid, strerror(errno));
357 	}
358 }
359 
/*
 * Wait for a stopped process to be set running again by some other debugger.
 * This is typically not required by /proc-based debuggers, since the usual
 * model is that one debugger controls one victim.  But DTrace, as usual, has
 * its own needs: the stop() action assumes that prun(1) or some other tool
 * will be applied to resume the victim process.  This could be solved by
 * adding a PCWRUN directive to /proc, but that seems like overkill unless
 * other debuggers end up needing this functionality, so we implement a cheap
 * equivalent to PCWRUN using the set of existing kernel mechanisms.
 *
 * Our intent is really not just to wait for the victim to run, but rather to
 * wait for it to run and then stop again for a reason other than the current
 * PR_REQUESTED stop.  Since PCWSTOP/Pstopstatus() can be applied repeatedly
 * to a stopped process and will return the same result without affecting the
 * victim, we can just perform these operations repeatedly until Pstate()
 * changes, the representative LWP ID changes, or the stop timestamp advances.
 * dt_proc_control() will then rediscover the new state and continue as usual.
 * When the process is still stopped in the same exact state, we sleep for a
 * brief interval before waiting again so as not to spin consuming CPU cycles.
 *
 * Locking: the caller must hold dpr_lock.  The lock is dropped while we wait
 * and is always held again when this function returns, whether we return
 * because the victim's state changed or because dpr_quit was observed.
 */
static void
dt_proc_waitrun(dt_proc_t *dpr)
{
	struct ps_prochandle *P = dpr->dpr_proc;
	const lwpstatus_t *psp = &Pstatus(P)->pr_lwp;

	/*
	 * Snapshot the attributes of the current stop so that we can later
	 * tell whether the victim has moved on, and remember which of the
	 * kill/run-on-last-close flags were set so they can be restored.
	 */
	int krflag = psp->pr_flags & (PR_KLC | PR_RLC);
	timestruc_t tstamp = psp->pr_tstamp;
	lwpid_t lwpid = psp->pr_lwpid;

	const long wstop = PCWSTOP;
	int pfd = Pctlfd(P);

	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
	assert(psp->pr_flags & PR_STOPPED);
	assert(Pstate(P) == PS_STOP);

	/*
	 * While we are waiting for the victim to run, clear PR_KLC and PR_RLC
	 * so that if the libdtrace client is killed, the victim stays stopped.
	 * dt_proc_destroy() will also observe this and perform PRELEASE_HANG.
	 */
	(void) Punsetflags(P, krflag);
	Psync(P);

	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	/* dpr_quit is polled here without dpr_lock held */
	while (!dpr->dpr_quit) {
		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
			continue; /* check dpr_quit and continue waiting */

		(void) pthread_mutex_lock(&dpr->dpr_lock);
		(void) Pstopstatus(P, PCNULL, 0);
		psp = &Pstatus(P)->pr_lwp;

		/*
		 * If we've reached a new state, found a new representative, or
		 * the stop timestamp has changed, restore PR_KLC/PR_RLC to its
		 * original setting and then return with dpr_lock held.
		 */
		if (Pstate(P) != PS_STOP || psp->pr_lwpid != lwpid ||
		    bcmp(&psp->pr_tstamp, &tstamp, sizeof (tstamp)) != 0) {
			(void) Psetflags(P, krflag);
			Psync(P);
			return;
		}

		/* same stop as before: drop the lock and back off briefly */
		(void) pthread_mutex_unlock(&dpr->dpr_lock);
		(void) poll(NULL, 0, MILLISEC / 2);
	}

	/*
	 * We were asked to quit.  Re-take dpr_lock so the caller's locking
	 * expectations hold; PR_KLC/PR_RLC are left cleared on this path.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);
}
433 
/*
 * Argument bundle handed to dt_proc_control() when a control thread is
 * created; dt_proc_control() copies both fields into locals on entry.
 */
typedef struct dt_proc_control_data {
	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
	dt_proc_t *dpcd_proc;			/* process to control */
} dt_proc_control_data_t;
438 
/*
 * Main loop for all victim process control threads.  We initialize all the
 * appropriate /proc control mechanisms, and then enter a loop waiting for
 * the process to stop on an event or die.  We process any events by calling
 * appropriate subroutines, and exit when the victim dies or we lose control.
 *
 * The control thread synchronizes the use of dpr_proc with other libdtrace
 * threads using dpr_lock.  We hold the lock for all of our operations except
 * waiting while the process is running: this is accomplished by writing a
 * PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.  If the
 * libdtrace client wishes to exit or abort our wait, SIGCANCEL can be used.
 *
 * 'arg' points to a dt_proc_control_data_t naming the DTrace handle and the
 * dt_proc_t to control.  The return value is always NULL; completion is
 * reported by setting dpr_done and broadcasting on dpr_cv.
 */
static void *
dt_proc_control(void *arg)
{
	dt_proc_control_data_t *datap = arg;
	dtrace_hdl_t *dtp = datap->dpcd_hdl;
	dt_proc_t *dpr = datap->dpcd_proc;
	dt_proc_hash_t *dph = dpr->dpr_hdl->dt_procs;
	struct ps_prochandle *P = dpr->dpr_proc;

	int pfd = Pctlfd(P);
	int pid = dpr->dpr_pid;

	const long wstop = PCWSTOP;
	int notify = B_FALSE;	/* set if we exit due to PS_LOST or PS_UNDEAD */

	/*
	 * We disable the POSIX thread cancellation mechanism so that the
	 * client program using libdtrace can't accidentally cancel our thread.
	 * dt_proc_destroy() uses SIGCANCEL explicitly to simply poke us out
	 * of PCWSTOP with EINTR, at which point we will see dpr_quit and exit.
	 */
	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

	/*
	 * Set up the corresponding process for tracing by libdtrace.  We want
	 * to be able to catch breakpoints and efficiently single-step over
	 * them, and we need to enable librtld_db to watch libdl activity.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	(void) Punsetflags(P, PR_ASYNC);	/* require synchronous mode */
	(void) Psetflags(P, PR_BPTADJ);		/* always adjust eip on x86 */
	(void) Punsetflags(P, PR_FORK);		/* do not inherit on fork */

	(void) Pfault(P, FLTBPT, B_TRUE);	/* always trace breakpoints */
	(void) Pfault(P, FLTTRACE, B_TRUE);	/* always trace single-step */

	/*
	 * We must trace exit from exec() system calls so that if the exec is
	 * successful, we can reset our breakpoints and re-initialize libproc.
	 */
	(void) Psysexit(P, SYS_exec, B_TRUE);
	(void) Psysexit(P, SYS_execve, B_TRUE);

	/*
	 * We must trace entry and exit for fork() system calls in order to
	 * disable our breakpoints temporarily during the fork.  We do not set
	 * the PR_FORK flag, so if fork succeeds the child begins executing and
	 * does not inherit any other tracing behaviors or a control thread.
	 */
	(void) Psysentry(P, SYS_vfork, B_TRUE);
	(void) Psysexit(P, SYS_vfork, B_TRUE);
	(void) Psysentry(P, SYS_fork1, B_TRUE);
	(void) Psysexit(P, SYS_fork1, B_TRUE);
	(void) Psysentry(P, SYS_forkall, B_TRUE);
	(void) Psysexit(P, SYS_forkall, B_TRUE);

	Psync(P);				/* enable all /proc changes */
	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */

	/*
	 * If PR_KLC is set, we created the process; otherwise we grabbed it.
	 * Check for an appropriate stop request and wait for dt_proc_continue.
	 */
	if (Pstatus(P)->pr_flags & PR_KLC)
		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
	else
		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);

	if (Psetrun(P, 0, 0) == -1) {
		dt_dprintf("pid %d: failed to set running: %s\n",
		    (int)dpr->dpr_pid, strerror(errno));
	}

	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	/*
	 * Wait for the process corresponding to this control thread to stop,
	 * process the event, and then set it running again.  We want to sleep
	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
	 * ps_prochandle in the meantime (e.g. ustack()).  To do this, we write
	 * a PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.
	 * Once the process stops, we wake up, grab dpr_lock, and then call
	 * Pwait() (which will return immediately) and do our processing.
	 */
	while (!dpr->dpr_quit) {
		const lwpstatus_t *psp;

		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
			continue; /* check dpr_quit and continue waiting */

		(void) pthread_mutex_lock(&dpr->dpr_lock);
		/*
		 * Re-entry point used after a successful Preopen() below:
		 * dpr_lock is already held, so only the status refresh and
		 * state dispatch need to be repeated.
		 */
pwait_locked:
		if (Pstopstatus(P, PCNULL, 0) == -1 && errno == EINTR) {
			(void) pthread_mutex_unlock(&dpr->dpr_lock);
			continue; /* check dpr_quit and continue waiting */
		}

		switch (Pstate(P)) {
		case PS_STOP:
			psp = &Pstatus(P)->pr_lwp;

			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
			    pid, psp->pr_why, psp->pr_what);

			/*
			 * If the process stops showing PR_REQUESTED, then the
			 * DTrace stop() action was applied to it or another
			 * debugging utility (e.g. pstop(1)) asked it to stop.
			 * In either case, the user's intention is for the
			 * process to remain stopped until another external
			 * mechanism (e.g. prun(1)) is applied.  So instead of
			 * setting the process running ourself, we wait for
			 * someone else to do so.  Once that happens, we return
			 * to our normal loop waiting for an event of interest.
			 */
			if (psp->pr_why == PR_REQUESTED) {
				/* dt_proc_waitrun() returns with lock held */
				dt_proc_waitrun(dpr);
				(void) pthread_mutex_unlock(&dpr->dpr_lock);
				continue;
			}

			/*
			 * If the process stops showing one of the events that
			 * we are tracing, perform the appropriate response.
			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
			 * PR_JOBCONTROL by design: if one of these conditions
			 * occurs, we will fall through to Psetrun() but the
			 * process will remain stopped in the kernel by the
			 * corresponding mechanism (e.g. job control stop).
			 */
			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
				dt_proc_bpmatch(dtp, dpr);
			else if (psp->pr_why == PR_SYSENTRY &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpdisable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpenable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_EXEC(psp->pr_what))
				dt_proc_attach(dpr, B_TRUE);
			break;

		case PS_LOST:
			/* try to regain control before giving up */
			if (Preopen(P) == 0)
				goto pwait_locked;

			dt_dprintf("pid %d: proc lost: %s\n",
			    pid, strerror(errno));

			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;

		case PS_UNDEAD:
			dt_dprintf("pid %d: proc died\n", pid);
			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;
		}

		/* resume the victim unless it has already died */
		if (Pstate(P) != PS_UNDEAD && Psetrun(P, 0, 0) == -1) {
			dt_dprintf("pid %d: failed to set running: %s\n",
			    (int)dpr->dpr_pid, strerror(errno));
		}

		(void) pthread_mutex_unlock(&dpr->dpr_lock);
	}

	/*
	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
	 */
	if (notify)
		dt_proc_notify(dtp, dph, dpr, NULL);

	/*
	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
	 * dpr_tid to indicate the control thread has exited, and notify any
	 * waiting thread in dt_proc_destroy() that we have successfully exited.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	dt_proc_bpdestroy(dpr, B_TRUE);
	dpr->dpr_done = B_TRUE;
	dpr->dpr_tid = 0;

	(void) pthread_cond_broadcast(&dpr->dpr_cv);
	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	return (NULL);
}
644 
645 /*PRINTFLIKE3*/
646 static struct ps_prochandle *
647 dt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
648 {
649 	va_list ap;
650 
651 	va_start(ap, format);
652 	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
653 	va_end(ap);
654 
655 	if (dpr->dpr_proc != NULL)
656 		Prelease(dpr->dpr_proc, 0);
657 
658 	dt_free(dtp, dpr);
659 	(void) dt_set_errno(dtp, EDT_COMPILER);
660 	return (NULL);
661 }
662 
663 dt_proc_t *
664 dt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
665 {
666 	dt_proc_hash_t *dph = dtp->dt_procs;
667 	pid_t pid = Pstatus(P)->pr_pid;
668 	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
669 
670 	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
671 		if (dpr->dpr_pid == pid)
672 			break;
673 		else
674 			dpp = &dpr->dpr_hash;
675 	}
676 
677 	assert(dpr != NULL);
678 	assert(dpr->dpr_proc == P);
679 
680 	if (remove)
681 		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
682 
683 	return (dpr);
684 }
685 
/*
 * Tear down the dt_proc_t that owns libproc handle 'P': stop its control
 * thread (if it has one) and wait for it to finish, unlink the dt_proc_t from
 * the pid hash and from the pending notification list, drop it from the LRU
 * list, release the libproc handle, and free the structure.
 */
static void
dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
{
	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
	dt_proc_hash_t *dph = dtp->dt_procs;
	dt_proc_notify_t *npr, **npp;
	int rflag;	/* flags for the final Prelease() */

	assert(dpr != NULL);

	/*
	 * If neither PR_KLC nor PR_RLC is set, then the process is stopped by
	 * an external debugger and we were waiting in dt_proc_waitrun().
	 * Leave the process in this condition using PRELEASE_HANG.
	 */
	if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) {
		dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid);
		rflag = PRELEASE_HANG;
	} else {
		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
		rflag = 0; /* apply kill or run-on-last-close */
	}

	if (dpr->dpr_tid) {
		/*
		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
		 * long-term /proc system call.  Our daemon threads have POSIX
		 * cancellation disabled, so EINTR will be the only effect.  We
		 * then wait for dpr_done to indicate the thread has exited.
		 *
		 * We can't use pthread_kill() to send SIGCANCEL because the
		 * interface forbids it and we can't use pthread_cancel()
		 * because with cancellation disabled it won't actually
		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
		 * to do the job.  This is all built on evil knowledge of
		 * the details of the cancellation mechanism in libc.
		 */
		(void) pthread_mutex_lock(&dpr->dpr_lock);
		dpr->dpr_quit = B_TRUE;
		(void) _lwp_kill(dpr->dpr_tid, SIGCANCEL);

		/*
		 * If the process is currently idling in dt_proc_stop(), re-
		 * enable breakpoints and poke it into running again.
		 */
		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
			dt_proc_bpenable(dpr);
			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
			(void) pthread_cond_broadcast(&dpr->dpr_cv);
		}

		/* the control thread sets dpr_done just before it exits */
		while (!dpr->dpr_done)
			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);

		(void) pthread_mutex_unlock(&dpr->dpr_lock);
	}

	/*
	 * Before we free the process structure, remove this dt_proc_t from the
	 * lookup hash, and then walk the dt_proc_hash_t's notification list
	 * and remove this dt_proc_t if it is enqueued.
	 */
	(void) pthread_mutex_lock(&dph->dph_lock);
	(void) dt_proc_lookup(dtp, P, B_TRUE);
	npp = &dph->dph_notify;

	/* npp addresses the link to npr so that entries can be spliced out */
	while ((npr = *npp) != NULL) {
		if (npr->dprn_dpr == dpr) {
			*npp = npr->dprn_next;
			dt_free(dtp, npr);
		} else {
			npp = &npr->dprn_next;
		}
	}

	(void) pthread_mutex_unlock(&dph->dph_lock);

	/*
	 * Remove the dt_proc_t from the LRU list, release the underlying
	 * libproc handle, and free our dt_proc_t data structure.
	 */
	if (dpr->dpr_cacheable) {
		assert(dph->dph_lrucnt != 0);
		dph->dph_lrucnt--;
	}

	dt_list_delete(&dph->dph_lrulist, dpr);
	Prelease(dpr->dpr_proc, rflag);
	dt_free(dtp, dpr);
}
777 
778 static int
779 dt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
780 {
781 	dt_proc_control_data_t data;
782 	sigset_t nset, oset;
783 	pthread_attr_t a;
784 	int err;
785 
786 	(void) pthread_mutex_lock(&dpr->dpr_lock);
787 	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */
788 
789 	(void) pthread_attr_init(&a);
790 	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
791 
792 	(void) sigfillset(&nset);
793 	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
794 	(void) sigdelset(&nset, SIGCANCEL);	/* see dt_proc_destroy() */
795 
796 	data.dpcd_hdl = dtp;
797 	data.dpcd_proc = dpr;
798 
799 	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
800 	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
801 	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
802 
803 	/*
804 	 * If the control thread was created, then wait on dpr_cv for either
805 	 * dpr_done to be set (the victim died or the control thread failed)
806 	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
807 	 * stopped by /proc and the control thread is at the rendezvous event.
808 	 * On success, we return with the process and control thread stopped:
809 	 * the caller can then apply dt_proc_continue() to resume both.
810 	 */
811 	if (err == 0) {
812 		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
813 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
814 
815 		/*
816 		 * If dpr_done is set, the control thread aborted before it
817 		 * reached the rendezvous event.  This is either due to PS_LOST
818 		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
819 		 * small amount of useful information to help figure it out.
820 		 */
821 		if (dpr->dpr_done) {
822 			const psinfo_t *prp = Ppsinfo(dpr->dpr_proc);
823 			int stat = prp ? prp->pr_wstat : 0;
824 			int pid = dpr->dpr_pid;
825 
826 			if (Pstate(dpr->dpr_proc) == PS_LOST) {
827 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
828 				    "failed to control pid %d: process exec'd "
829 				    "set-id or unobservable program\n", pid);
830 			} else if (WIFSIGNALED(stat)) {
831 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
832 				    "failed to control pid %d: process died "
833 				    "from signal %d\n", pid, WTERMSIG(stat));
834 			} else {
835 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
836 				    "failed to control pid %d: process exited "
837 				    "with status %d\n", pid, WEXITSTATUS(stat));
838 			}
839 
840 			err = ESRCH; /* cause grab() or create() to fail */
841 		}
842 	} else {
843 		(void) dt_proc_error(dpr->dpr_hdl, dpr,
844 		    "failed to create control thread for process-id %d: %s\n",
845 		    (int)dpr->dpr_pid, strerror(err));
846 	}
847 
848 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
849 	(void) pthread_attr_destroy(&a);
850 
851 	return (err);
852 }
853 
854 struct ps_prochandle *
855 dt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
856 {
857 	dt_proc_hash_t *dph = dtp->dt_procs;
858 	dt_proc_t *dpr;
859 	int err;
860 
861 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
862 		return (NULL); /* errno is set for us */
863 
864 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
865 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
866 
867 	if ((dpr->dpr_proc = Pcreate(file, argv, &err, NULL, 0)) == NULL) {
868 		return (dt_proc_error(dtp, dpr,
869 		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
870 	}
871 
872 	dpr->dpr_hdl = dtp;
873 	dpr->dpr_pid = Pstatus(dpr->dpr_proc)->pr_pid;
874 
875 	(void) Punsetflags(dpr->dpr_proc, PR_RLC);
876 	(void) Psetflags(dpr->dpr_proc, PR_KLC);
877 
878 	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
879 		return (NULL); /* dt_proc_error() has been called for us */
880 
881 	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
882 	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
883 	dt_list_prepend(&dph->dph_lrulist, dpr);
884 
885 	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
886 	dpr->dpr_refs++;
887 
888 	return (dpr->dpr_proc);
889 }
890 
891 struct ps_prochandle *
892 dt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
893 {
894 	dt_proc_hash_t *dph = dtp->dt_procs;
895 	uint_t h = pid & (dph->dph_hashlen - 1);
896 	dt_proc_t *dpr, *opr;
897 	int err;
898 
899 	/*
900 	 * Search the hash table for the pid.  If it is already grabbed or
901 	 * created, move the handle to the front of the lrulist, increment
902 	 * the reference count, and return the existing ps_prochandle.
903 	 */
904 	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
905 		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
906 			/*
907 			 * If the cached handle was opened read-only and
908 			 * this request is for a writeable handle, mark
909 			 * the cached handle as stale and open a new handle.
910 			 * Since it's stale, unmark it as cacheable.
911 			 */
912 			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
913 				dt_dprintf("upgrading pid %d\n", (int)pid);
914 				dpr->dpr_stale = B_TRUE;
915 				dpr->dpr_cacheable = B_FALSE;
916 				dph->dph_lrucnt--;
917 				break;
918 			}
919 
920 			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
921 			dt_list_delete(&dph->dph_lrulist, dpr);
922 			dt_list_prepend(&dph->dph_lrulist, dpr);
923 			dpr->dpr_refs++;
924 			return (dpr->dpr_proc);
925 		}
926 	}
927 
928 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
929 		return (NULL); /* errno is set for us */
930 
931 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
932 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
933 
934 	if ((dpr->dpr_proc = Pgrab(pid, flags, &err)) == NULL) {
935 		return (dt_proc_error(dtp, dpr,
936 		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
937 	}
938 
939 	dpr->dpr_hdl = dtp;
940 	dpr->dpr_pid = pid;
941 
942 	(void) Punsetflags(dpr->dpr_proc, PR_KLC);
943 	(void) Psetflags(dpr->dpr_proc, PR_RLC);
944 
945 	/*
946 	 * If we are attempting to grab the process without a monitor
947 	 * thread, then mark the process cacheable only if it's being
948 	 * grabbed read-only.  If we're currently caching more process
949 	 * handles than dph_lrulim permits, attempt to find the
950 	 * least-recently-used handle that is currently unreferenced and
951 	 * release it from the cache.  Otherwise we are grabbing the process
952 	 * for control: create a control thread for this process and store
953 	 * its ID in dpr->dpr_tid.
954 	 */
955 	if (nomonitor || (flags & PGRAB_RDONLY)) {
956 		if (dph->dph_lrucnt >= dph->dph_lrulim) {
957 			for (opr = dt_list_prev(&dph->dph_lrulist);
958 			    opr != NULL; opr = dt_list_prev(opr)) {
959 				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
960 					dt_proc_destroy(dtp, opr->dpr_proc);
961 					break;
962 				}
963 			}
964 		}
965 
966 		if (flags & PGRAB_RDONLY) {
967 			dpr->dpr_cacheable = B_TRUE;
968 			dpr->dpr_rdonly = B_TRUE;
969 			dph->dph_lrucnt++;
970 		}
971 
972 	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
973 		return (NULL); /* dt_proc_error() has been called for us */
974 
975 	dpr->dpr_hash = dph->dph_hash[h];
976 	dph->dph_hash[h] = dpr;
977 	dt_list_prepend(&dph->dph_lrulist, dpr);
978 
979 	dt_dprintf("grabbed pid %d\n", (int)pid);
980 	dpr->dpr_refs++;
981 
982 	return (dpr->dpr_proc);
983 }
984 
985 void
986 dt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
987 {
988 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
989 	dt_proc_hash_t *dph = dtp->dt_procs;
990 
991 	assert(dpr != NULL);
992 	assert(dpr->dpr_refs != 0);
993 
994 	if (--dpr->dpr_refs == 0 &&
995 	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
996 		dt_proc_destroy(dtp, P);
997 }
998 
999 void
1000 dt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1001 {
1002 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1003 
1004 	(void) pthread_mutex_lock(&dpr->dpr_lock);
1005 
1006 	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
1007 		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
1008 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
1009 	}
1010 
1011 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
1012 }
1013 
1014 void
1015 dt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1016 {
1017 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1018 	int err = pthread_mutex_lock(&dpr->dpr_lock);
1019 	assert(err == 0); /* check for recursion */
1020 }
1021 
1022 void
1023 dt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1024 {
1025 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1026 	int err = pthread_mutex_unlock(&dpr->dpr_lock);
1027 	assert(err == 0); /* check for unheld lock */
1028 }
1029 
1030 void
1031 dt_proc_hash_create(dtrace_hdl_t *dtp)
1032 {
1033 	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
1034 	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
1035 
1036 		(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
1037 		(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
1038 
1039 		dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
1040 		dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
1041 	}
1042 }
1043 
1044 void
1045 dt_proc_hash_destroy(dtrace_hdl_t *dtp)
1046 {
1047 	dt_proc_hash_t *dph = dtp->dt_procs;
1048 	dt_proc_t *dpr;
1049 
1050 	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
1051 		dt_proc_destroy(dtp, dpr->dpr_proc);
1052 
1053 	dtp->dt_procs = NULL;
1054 	dt_free(dtp, dph);
1055 }
1056 
1057 struct ps_prochandle *
1058 dtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
1059 {
1060 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1061 	struct ps_prochandle *P = dt_proc_create(dtp, file, argv);
1062 
1063 	if (P != NULL && idp != NULL && idp->di_id == 0)
1064 		idp->di_id = Pstatus(P)->pr_pid; /* $target = created pid */
1065 
1066 	return (P);
1067 }
1068 
1069 struct ps_prochandle *
1070 dtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1071 {
1072 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1073 	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1074 
1075 	if (P != NULL && idp != NULL && idp->di_id == 0)
1076 		idp->di_id = pid; /* $target = grabbed pid */
1077 
1078 	return (P);
1079 }
1080 
1081 void
1082 dtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1083 {
1084 	dt_proc_release(dtp, P);
1085 }
1086 
1087 void
1088 dtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1089 {
1090 	dt_proc_continue(dtp, P);
1091 }
1092