xref: /freebsd/cddl/contrib/opensolaris/lib/libdtrace/common/dt_proc.c (revision a64729f5077d77e13b9497cb33ecb3c82e606ee8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  */
30 
31 /*
32  * DTrace Process Control
33  *
34  * This file provides a set of routines that permit libdtrace and its clients
35  * to create and grab process handles using libproc, and to share these handles
36  * between library mechanisms that need libproc access, such as ustack(), and
37  * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
38  * The library provides several mechanisms in the libproc control layer:
39  *
40  * Reference Counting: The library code and client code can independently grab
41  * the same process handles without interfering with one another.  Only when
42  * the reference count drops to zero and the handle is not being cached (see
43  * below for more information on caching) will Prelease() be called on it.
44  *
45  * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
46  * the reference count drops to zero, the handle is not immediately released.
47  * Instead, libproc handles are maintained on dph_lrulist in order from most-
48  * recently accessed to least-recently accessed.  Idle handles are maintained
49  * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
50  * to ustack() to avoid the overhead of releasing and re-grabbing processes.
51  *
52  * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
53  * or created by dt_proc_create(), a control thread is created to provide
54  * callbacks on process exit and symbol table caching on dlopen()s.
55  *
56  * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
57  * are provided to synchronize access to the libproc handle between libdtrace
58  * code and client code and the control thread's use of the ps_prochandle.
59  *
60  * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
61  * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
62  * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
63  * synchronization between libdtrace control threads and the client thread.
64  *
65  * The ps_prochandles themselves are maintained along with a dt_proc_t struct
66  * in a hash table indexed by PID.  This provides basic locking and reference
67  * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
68  * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
69  * the current limit on the number of actively cached entries.
70  *
71  * The control thread for a process establishes breakpoints at the rtld_db
72  * locations of interest, updates mappings and symbol tables at these points,
73  * and handles exec and fork (by always following the parent).  The control
74  * thread automatically exits when the process dies or control is lost.
75  *
76  * A simple notification mechanism is provided for libdtrace clients using
77  * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
78  * such an event occurs, the dt_proc_t itself is enqueued on a notification
79  * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
80  * up using this condition and will then call the client handler as necessary.
81  */
82 
83 #include <sys/syscall.h>
84 #include <sys/wait.h>
85 #include <strings.h>
86 #include <signal.h>
87 #include <assert.h>
88 #include <errno.h>
89 
90 #include <dt_proc.h>
91 #include <dt_pid.h>
92 #include <dt_impl.h>
93 
94 #include <libproc_compat.h>
95 
/*
 * Syscall-number predicates used by the control loop to recognize stops on
 * exec and fork system calls.  The argument is fully parenthesized so that
 * an arbitrary expression can be passed safely.  Note that IS_SYS_FORK()
 * evaluates its argument twice, so it must be free of side effects.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_execve)
#define	IS_SYS_FORK(w)	((w) == SYS_vfork || (w) == SYS_fork)
98 
99 static dt_bkpt_t *
100 dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
101 {
102 	struct ps_prochandle *P = dpr->dpr_proc;
103 	dt_bkpt_t *dbp;
104 
105 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
106 
107 	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
108 		dbp->dbp_func = func;
109 		dbp->dbp_data = data;
110 		dbp->dbp_addr = addr;
111 
112 		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
113 			dbp->dbp_active = B_TRUE;
114 
115 		dt_list_append(&dpr->dpr_bps, dbp);
116 	}
117 
118 	return (dbp);
119 }
120 
121 static void
122 dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
123 {
124 	int state = Pstate(dpr->dpr_proc);
125 	dt_bkpt_t *dbp, *nbp;
126 
127 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
128 
129 	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
130 		if (delbkpts && dbp->dbp_active &&
131 		    state != PS_LOST && state != PS_UNDEAD) {
132 			(void) Pdelbkpt(dpr->dpr_proc,
133 			    dbp->dbp_addr, dbp->dbp_instr);
134 		}
135 		nbp = dt_list_next(dbp);
136 		dt_list_delete(&dpr->dpr_bps, dbp);
137 		dt_free(dpr->dpr_hdl, dbp);
138 	}
139 }
140 
141 static void
142 dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
143 {
144 	unsigned long pc;
145 	dt_bkpt_t *dbp;
146 
147 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
148 
149 	proc_regget(dpr->dpr_proc, REG_PC, &pc);
150 	proc_bkptregadj(&pc);
151 
152 	for (dbp = dt_list_next(&dpr->dpr_bps);
153 	    dbp != NULL; dbp = dt_list_next(dbp)) {
154 		if (pc == dbp->dbp_addr)
155 			break;
156 	}
157 
158 	if (dbp == NULL) {
159 		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
160 		    (int)dpr->dpr_pid, pc);
161 		return;
162 	}
163 
164 	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
165 	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
166 
167 	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
168 	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
169 }
170 
171 static void
172 dt_proc_bpenable(dt_proc_t *dpr)
173 {
174 	dt_bkpt_t *dbp;
175 
176 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
177 
178 	for (dbp = dt_list_next(&dpr->dpr_bps);
179 	    dbp != NULL; dbp = dt_list_next(dbp)) {
180 		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
181 		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
182 			dbp->dbp_active = B_TRUE;
183 	}
184 
185 	dt_dprintf("breakpoints enabled\n");
186 }
187 
188 static void
189 dt_proc_bpdisable(dt_proc_t *dpr)
190 {
191 	dt_bkpt_t *dbp;
192 
193 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
194 
195 	for (dbp = dt_list_next(&dpr->dpr_bps);
196 	    dbp != NULL; dbp = dt_list_next(dbp)) {
197 		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
198 		    dbp->dbp_addr, dbp->dbp_instr) == 0)
199 			dbp->dbp_active = B_FALSE;
200 	}
201 
202 	dt_dprintf("breakpoints disabled\n");
203 }
204 
205 static void
206 dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
207     const char *msg)
208 {
209 	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
210 
211 	if (dprn == NULL) {
212 		dt_dprintf("failed to allocate notification for %d %s\n",
213 		    (int)dpr->dpr_pid, msg);
214 	} else {
215 		dprn->dprn_dpr = dpr;
216 		if (msg == NULL)
217 			dprn->dprn_errmsg[0] = '\0';
218 		else
219 			(void) strlcpy(dprn->dprn_errmsg, msg,
220 			    sizeof (dprn->dprn_errmsg));
221 
222 		(void) pthread_mutex_lock(&dph->dph_lock);
223 
224 		dprn->dprn_next = dph->dph_notify;
225 		dph->dph_notify = dprn;
226 
227 		(void) pthread_cond_broadcast(&dph->dph_cv);
228 		(void) pthread_mutex_unlock(&dph->dph_lock);
229 	}
230 }
231 
232 /*
233  * Check to see if the control thread was requested to stop when the victim
234  * process reached a particular event (why) rather than continuing the victim.
235  * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
236  * If 'why' is not set, this function returns immediately and does nothing.
237  */
238 static void
239 dt_proc_stop(dt_proc_t *dpr, uint8_t why)
240 {
241 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
242 	assert(why != DT_PROC_STOP_IDLE);
243 
244 	if (dpr->dpr_stop & why) {
245 		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
246 		dpr->dpr_stop &= ~why;
247 
248 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
249 
250 		/*
251 		 * We disable breakpoints while stopped to preserve the
252 		 * integrity of the program text for both our own disassembly
253 		 * and that of the kernel.
254 		 */
255 		dt_proc_bpdisable(dpr);
256 
257 		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
258 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
259 
260 		dt_proc_bpenable(dpr);
261 	}
262 }
263 
264 /*ARGSUSED*/
265 static void
266 dt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
267 {
268 	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
269 	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
270 }
271 
272 static void
273 dt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
274 {
275 	rd_event_msg_t rdm;
276 	rd_err_e err;
277 
278 	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
279 		dt_dprintf("pid %d: failed to get %s event message: %s\n",
280 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
281 		return;
282 	}
283 
284 	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
285 	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
286 
287 	switch (rdm.type) {
288 	case RD_DLACTIVITY:
289 		if (rdm.u.state != RD_CONSISTENT)
290 			break;
291 
292 		Pupdate_syms(dpr->dpr_proc);
293 		if (dt_pid_create_probes_module(dtp, dpr) != 0)
294 			dt_proc_notify(dtp, dtp->dt_procs, dpr,
295 			    dpr->dpr_errmsg);
296 
297 		break;
298 	case RD_PREINIT:
299 		Pupdate_syms(dpr->dpr_proc);
300 		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
301 		break;
302 	case RD_POSTINIT:
303 		Pupdate_syms(dpr->dpr_proc);
304 		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
305 		break;
306 	}
307 }
308 
309 static void
310 dt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
311 {
312 	rd_notify_t rdn;
313 	rd_err_e err;
314 
315 	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
316 		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
317 		    (int)dpr->dpr_pid, evname, rd_errstr(err));
318 		return;
319 	}
320 
321 	if (rdn.type != RD_NOTIFY_BPT) {
322 		dt_dprintf("pid %d: event %s has unexpected type %d\n",
323 		    (int)dpr->dpr_pid, evname, rdn.type);
324 		return;
325 	}
326 
327 	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
328 	    /* XXX ugly */
329 	    (dt_bkpt_f *)dt_proc_rdevent, __DECONST(void *, evname));
330 }
331 
332 /*
333  * Common code for enabling events associated with the run-time linker after
334  * attaching to a process or after a victim process completes an exec(2).
335  */
336 static void
337 dt_proc_attach(dt_proc_t *dpr, int exec)
338 {
339 	rd_err_e err;
340 	GElf_Sym sym;
341 
342 	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
343 
344 	if (exec) {
345 
346 		dt_proc_bpdestroy(dpr, B_FALSE);
347 	}
348 	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
349 	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
350 		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
351 	} else {
352 		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
353 		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
354 		    "rtld_db agent initialization failed");
355 	}
356 
357 	Pupdate_maps(dpr->dpr_proc);
358 
359 	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
360 	    "a.out", "main", &sym, NULL) == 0) {
361 		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
362 		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
363 	} else {
364 		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
365 		    (int)dpr->dpr_pid, strerror(errno));
366 	}
367 }
368 
369 typedef struct dt_proc_control_data {
370 	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
371 	dt_proc_t *dpcd_proc;			/* proccess to control */
372 } dt_proc_control_data_t;
373 
374 /*
375  * Main loop for all victim process control threads.  We initialize all the
376  * appropriate /proc control mechanisms, and then enter a loop waiting for
377  * the process to stop on an event or die.  We process any events by calling
378  * appropriate subroutines, and exit when the victim dies or we lose control.
379  *
380  * The control thread synchronizes the use of dpr_proc with other libdtrace
381  * threads using dpr_lock.  We hold the lock for all of our operations except
382  * waiting while the process is running: this is accomplished by writing a
383  * PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.  If the
384  * libdtrace client wishes to exit or abort our wait, SIGCANCEL can be used.
385  */
386 static void *
387 dt_proc_control(void *arg)
388 {
389 	dt_proc_control_data_t *datap = arg;
390 	dtrace_hdl_t *dtp = datap->dpcd_hdl;
391 	dt_proc_t *dpr = datap->dpcd_proc;
392 	dt_proc_hash_t *dph = dtp->dt_procs;
393 	struct ps_prochandle *P = dpr->dpr_proc;
394 	int pid = dpr->dpr_pid;
395 	int notify = B_FALSE;
396 
397 	/*
398 	 * We disable the POSIX thread cancellation mechanism so that the
399 	 * client program using libdtrace can't accidentally cancel our thread.
400 	 * dt_proc_destroy() uses SIGCANCEL explicitly to simply poke us out
401 	 * of PCWSTOP with EINTR, at which point we will see dpr_quit and exit.
402 	 */
403 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
404 
405 	/*
406 	 * Set up the corresponding process for tracing by libdtrace.  We want
407 	 * to be able to catch breakpoints and efficiently single-step over
408 	 * them, and we need to enable librtld_db to watch libdl activity.
409 	 */
410 	(void) pthread_mutex_lock(&dpr->dpr_lock);
411 
412 	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */
413 
414 	/*
415 	 * If DT_CLOSE_KILL is set, we created the process; otherwise we
416 	 * grabbed it.  Check for an appropriate stop request and wait for
417 	 * dt_proc_continue.
418 	 */
419 	if (dpr->dpr_close == DT_CLOSE_KILL)
420 		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
421 	else
422 		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);
423 
424 	if (Psetrun(P, 0, 0) == -1) {
425 		dt_dprintf("pid %d: failed to set running: %s\n",
426 		    (int)dpr->dpr_pid, strerror(errno));
427 	}
428 
429 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
430 
431 	/*
432 	 * Wait for the process corresponding to this control thread to stop,
433 	 * process the event, and then set it running again.  We want to sleep
434 	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
435 	 * ps_prochandle in the meantime (e.g. ustack()).  To do this, we write
436 	 * a PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.
437 	 * Once the process stops, we wake up, grab dpr_lock, and then call
438 	 * Pwait() (which will return immediately) and do our processing.
439 	 */
440 	while (!dpr->dpr_quit) {
441 		const lwpstatus_t *psp;
442 
443 		/* Wait for the process to report status. */
444 		proc_wstatus(P);
445 		if (errno == EINTR)
446 			continue; /* check dpr_quit and continue waiting */
447 
448 		(void) pthread_mutex_lock(&dpr->dpr_lock);
449 
450 		switch (Pstate(P)) {
451 		case PS_STOP:
452 			psp = proc_getlwpstatus(P);
453 
454 			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
455 			    pid, psp->pr_why, psp->pr_what);
456 
457 			/*
458 			 * If the process stops showing one of the events that
459 			 * we are tracing, perform the appropriate response.
460 			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
461 			 * PR_JOBCONTROL by design: if one of these conditions
462 			 * occurs, we will fall through to Psetrun() but the
463 			 * process will remain stopped in the kernel by the
464 			 * corresponding mechanism (e.g. job control stop).
465 			 */
466 			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
467 				dt_proc_bpmatch(dtp, dpr);
468 			else if (psp->pr_why == PR_SYSENTRY &&
469 			    IS_SYS_FORK(psp->pr_what))
470 				dt_proc_bpdisable(dpr);
471 			else if (psp->pr_why == PR_SYSEXIT &&
472 			    IS_SYS_FORK(psp->pr_what))
473 				dt_proc_bpenable(dpr);
474 			else if (psp->pr_why == PR_SYSEXIT &&
475 			    IS_SYS_EXEC(psp->pr_what))
476 				dt_proc_attach(dpr, B_TRUE);
477 			break;
478 
479 		case PS_LOST:
480 			dt_dprintf("pid %d: proc lost: %s\n",
481 			    pid, strerror(errno));
482 
483 			dpr->dpr_quit = B_TRUE;
484 			notify = B_TRUE;
485 			break;
486 
487 		case PS_UNDEAD:
488 			dt_dprintf("pid %d: proc died\n", pid);
489 			dpr->dpr_quit = B_TRUE;
490 			notify = B_TRUE;
491 			break;
492 		}
493 
494 		if (Pstate(P) != PS_UNDEAD) {
495 			if (dpr->dpr_quit && dpr->dpr_close == DT_CLOSE_KILL) {
496 				/*
497 				 * We're about to kill the child, so don't
498 				 * bother resuming it.  In some cases, such as
499 				 * an initialization error, we shouldn't have
500 				 * started it in the first place, so letting it
501 				 * run could be harmful.
502 				 */
503 			} else if (Psetrun(P, 0, 0) == -1) {
504 				dt_dprintf("pid %d: failed to set running: "
505 				    "%s\n", (int)dpr->dpr_pid, strerror(errno));
506 			}
507 		}
508 
509 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
510 	}
511 
512 	/*
513 	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
514 	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
515 	 */
516 	if (notify)
517 		dt_proc_notify(dtp, dph, dpr, NULL);
518 
519 	/*
520 	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
521 	 * dpr_tid to indicate the control thread has exited, and notify any
522 	 * waiting thread in dt_proc_destroy() that we have succesfully exited.
523 	 */
524 	(void) pthread_mutex_lock(&dpr->dpr_lock);
525 
526 	dt_proc_bpdestroy(dpr, B_TRUE);
527 	dpr->dpr_done = B_TRUE;
528 	dpr->dpr_tid = 0;
529 
530 	(void) pthread_cond_broadcast(&dpr->dpr_cv);
531 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
532 
533 	return (NULL);
534 }
535 
536 /*PRINTFLIKE3*/
537 static struct ps_prochandle *
538 dt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
539 {
540 	va_list ap;
541 
542 	va_start(ap, format);
543 	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
544 	va_end(ap);
545 
546 	if (dpr->dpr_proc != NULL)
547 		Prelease(dpr->dpr_proc, 0);
548 
549 	dt_free(dtp, dpr);
550 	(void) dt_set_errno(dtp, EDT_COMPILER);
551 	return (NULL);
552 }
553 
554 dt_proc_t *
555 dt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
556 {
557 	dt_proc_hash_t *dph = dtp->dt_procs;
558 	pid_t pid = proc_getpid(P);
559 	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
560 
561 	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
562 		if (dpr->dpr_pid == pid)
563 			break;
564 		else
565 			dpp = &dpr->dpr_hash;
566 	}
567 
568 	assert(dpr != NULL);
569 	assert(dpr->dpr_proc == P);
570 
571 	if (remove)
572 		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
573 
574 	return (dpr);
575 }
576 
577 static void
578 dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
579 {
580 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
581 	dt_proc_hash_t *dph = dtp->dt_procs;
582 	dt_proc_notify_t *npr, **npp;
583 	int rflag;
584 
585 	assert(dpr != NULL);
586 
587 	switch (dpr->dpr_close) {
588 	case DT_CLOSE_KILL:
589 		dt_dprintf("killing pid %d\n", (int)dpr->dpr_pid);
590 		rflag = PRELEASE_KILL;
591 		break;
592 	case DT_CLOSE_RUN:
593 		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
594 		rflag = 0;
595 		break;
596 	}
597 
598 	if (dpr->dpr_tid) {
599 		/*
600 		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
601 		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
602 		 * long-term /proc system call.  Our daemon threads have POSIX
603 		 * cancellation disabled, so EINTR will be the only effect.  We
604 		 * then wait for dpr_done to indicate the thread has exited.
605 		 *
606 		 * We can't use pthread_kill() to send SIGCANCEL because the
607 		 * interface forbids it and we can't use pthread_cancel()
608 		 * because with cancellation disabled it won't actually
609 		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
610 		 * to do the job.  This is all built on evil knowledge of
611 		 * the details of the cancellation mechanism in libc.
612 		 */
613 		(void) pthread_mutex_lock(&dpr->dpr_lock);
614 		dpr->dpr_quit = B_TRUE;
615 		pthread_kill(dpr->dpr_tid, SIGTHR);
616 
617 		/*
618 		 * If the process is currently idling in dt_proc_stop(), re-
619 		 * enable breakpoints and poke it into running again.
620 		 */
621 		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
622 			dt_proc_bpenable(dpr);
623 			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
624 			(void) pthread_cond_broadcast(&dpr->dpr_cv);
625 		}
626 
627 		while (!dpr->dpr_done)
628 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
629 
630 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
631 	}
632 
633 	/*
634 	 * Before we free the process structure, remove this dt_proc_t from the
635 	 * lookup hash, and then walk the dt_proc_hash_t's notification list
636 	 * and remove this dt_proc_t if it is enqueued.
637 	 */
638 	(void) pthread_mutex_lock(&dph->dph_lock);
639 	(void) dt_proc_lookup(dtp, P, B_TRUE);
640 	npp = &dph->dph_notify;
641 
642 	while ((npr = *npp) != NULL) {
643 		if (npr->dprn_dpr == dpr) {
644 			*npp = npr->dprn_next;
645 			dt_free(dtp, npr);
646 		} else {
647 			npp = &npr->dprn_next;
648 		}
649 	}
650 
651 	(void) pthread_mutex_unlock(&dph->dph_lock);
652 
653 	/*
654 	 * Remove the dt_proc_list from the LRU list, release the underlying
655 	 * libproc handle, and free our dt_proc_t data structure.
656 	 */
657 	if (dpr->dpr_cacheable) {
658 		assert(dph->dph_lrucnt != 0);
659 		dph->dph_lrucnt--;
660 	}
661 
662 	dt_list_delete(&dph->dph_lrulist, dpr);
663 	Prelease(dpr->dpr_proc, rflag);
664 	dt_free(dtp, dpr);
665 }
666 
667 static int
668 dt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
669 {
670 	dt_proc_control_data_t data;
671 	sigset_t nset, oset;
672 	pthread_attr_t a;
673 	int err;
674 
675 	(void) pthread_mutex_lock(&dpr->dpr_lock);
676 	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */
677 
678 	(void) pthread_attr_init(&a);
679 	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
680 
681 	(void) sigfillset(&nset);
682 	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
683 	(void) sigdelset(&nset, SIGUSR1);	/* see dt_proc_destroy() */
684 
685 	data.dpcd_hdl = dtp;
686 	data.dpcd_proc = dpr;
687 
688 	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
689 	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
690 	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
691 
692 	/*
693 	 * If the control thread was created, then wait on dpr_cv for either
694 	 * dpr_done to be set (the victim died or the control thread failed)
695 	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
696 	 * stopped by /proc and the control thread is at the rendezvous event.
697 	 * On success, we return with the process and control thread stopped:
698 	 * the caller can then apply dt_proc_continue() to resume both.
699 	 */
700 	if (err == 0) {
701 		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
702 			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
703 
704 		/*
705 		 * If dpr_done is set, the control thread aborted before it
706 		 * reached the rendezvous event.  This is either due to PS_LOST
707 		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
708 		 * small amount of useful information to help figure it out.
709 		 */
710 		if (dpr->dpr_done) {
711 			int stat = proc_getwstat(dpr->dpr_proc);
712 			int pid = proc_getpid(dpr->dpr_proc);
713 			if (proc_state(dpr->dpr_proc) == PS_LOST) {
714 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
715 				    "failed to control pid %d: process exec'd "
716 				    "set-id or unobservable program\n", pid);
717 			} else if (WIFSIGNALED(stat)) {
718 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
719 				    "failed to control pid %d: process died "
720 				    "from signal %d\n", pid, WTERMSIG(stat));
721 			} else {
722 				(void) dt_proc_error(dpr->dpr_hdl, dpr,
723 				    "failed to control pid %d: process exited "
724 				    "with status %d\n", pid, WEXITSTATUS(stat));
725 			}
726 
727 			err = ESRCH; /* cause grab() or create() to fail */
728 		}
729 	} else {
730 		(void) dt_proc_error(dpr->dpr_hdl, dpr,
731 		    "failed to create control thread for process-id %d: %s\n",
732 		    (int)dpr->dpr_pid, strerror(err));
733 	}
734 
735 	if (err == 0)
736 		(void) pthread_mutex_unlock(&dpr->dpr_lock);
737 	(void) pthread_attr_destroy(&a);
738 
739 	return (err);
740 }
741 
742 struct ps_prochandle *
743 dt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
744     proc_child_func *pcf, void *child_arg)
745 {
746 	dt_proc_hash_t *dph = dtp->dt_procs;
747 	dt_proc_t *dpr;
748 	int err;
749 
750 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
751 		return (NULL); /* errno is set for us */
752 
753 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
754 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
755 
756 	if ((err = proc_create(file, argv, dtp->dt_proc_env, pcf, child_arg,
757 	    &dpr->dpr_proc)) != 0) {
758 		return (dt_proc_error(dtp, dpr,
759 		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
760 	}
761 
762 	dpr->dpr_hdl = dtp;
763 	dpr->dpr_pid = proc_getpid(dpr->dpr_proc);
764 	dpr->dpr_close = DT_CLOSE_KILL;
765 
766 	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
767 		return (NULL); /* dt_proc_error() has been called for us */
768 
769 	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
770 	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
771 	dt_list_prepend(&dph->dph_lrulist, dpr);
772 
773 	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
774 	dpr->dpr_refs++;
775 
776 	return (dpr->dpr_proc);
777 }
778 
779 struct ps_prochandle *
780 dt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
781 {
782 	dt_proc_hash_t *dph = dtp->dt_procs;
783 	uint_t h = pid & (dph->dph_hashlen - 1);
784 	dt_proc_t *dpr, *opr;
785 	int err;
786 
787 	/*
788 	 * Search the hash table for the pid.  If it is already grabbed or
789 	 * created, move the handle to the front of the lrulist, increment
790 	 * the reference count, and return the existing ps_prochandle.
791 	 */
792 	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
793 		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
794 			/*
795 			 * If the cached handle was opened read-only and
796 			 * this request is for a writeable handle, mark
797 			 * the cached handle as stale and open a new handle.
798 			 * Since it's stale, unmark it as cacheable.
799 			 */
800 			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
801 				dt_dprintf("upgrading pid %d\n", (int)pid);
802 				dpr->dpr_stale = B_TRUE;
803 				dpr->dpr_cacheable = B_FALSE;
804 				dph->dph_lrucnt--;
805 				break;
806 			}
807 
808 			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
809 			dt_list_delete(&dph->dph_lrulist, dpr);
810 			dt_list_prepend(&dph->dph_lrulist, dpr);
811 			dpr->dpr_refs++;
812 			return (dpr->dpr_proc);
813 		}
814 	}
815 
816 	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
817 		return (NULL); /* errno is set for us */
818 
819 	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
820 	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
821 
822 	if ((err = proc_attach(pid, flags, &dpr->dpr_proc)) != 0) {
823 		return (dt_proc_error(dtp, dpr,
824 		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
825 	}
826 
827 	dpr->dpr_hdl = dtp;
828 	dpr->dpr_pid = pid;
829 	dpr->dpr_close = DT_CLOSE_RUN;
830 
831 	/*
832 	 * If we are attempting to grab the process without a monitor
833 	 * thread, then mark the process cacheable only if it's being
834 	 * grabbed read-only.  If we're currently caching more process
835 	 * handles than dph_lrulim permits, attempt to find the
836 	 * least-recently-used handle that is currently unreferenced and
837 	 * release it from the cache.  Otherwise we are grabbing the process
838 	 * for control: create a control thread for this process and store
839 	 * its ID in dpr->dpr_tid.
840 	 */
841 	if (nomonitor || (flags & PGRAB_RDONLY)) {
842 		if (dph->dph_lrucnt >= dph->dph_lrulim) {
843 			for (opr = dt_list_prev(&dph->dph_lrulist);
844 			    opr != NULL; opr = dt_list_prev(opr)) {
845 				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
846 					dt_proc_destroy(dtp, opr->dpr_proc);
847 					break;
848 				}
849 			}
850 		}
851 
852 		if (flags & PGRAB_RDONLY) {
853 			dpr->dpr_cacheable = B_TRUE;
854 			dpr->dpr_rdonly = B_TRUE;
855 			dph->dph_lrucnt++;
856 		}
857 
858 	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
859 		return (NULL); /* dt_proc_error() has been called for us */
860 
861 	dpr->dpr_hash = dph->dph_hash[h];
862 	dph->dph_hash[h] = dpr;
863 	dt_list_prepend(&dph->dph_lrulist, dpr);
864 
865 	dt_dprintf("grabbed pid %d\n", (int)pid);
866 	dpr->dpr_refs++;
867 
868 	return (dpr->dpr_proc);
869 }
870 
871 void
872 dt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
873 {
874 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
875 	dt_proc_hash_t *dph = dtp->dt_procs;
876 
877 	assert(dpr != NULL);
878 	assert(dpr->dpr_refs != 0);
879 
880 	if (--dpr->dpr_refs == 0 &&
881 	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
882 		dt_proc_destroy(dtp, P);
883 }
884 
885 void
886 dt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
887 {
888 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
889 
890 	(void) pthread_mutex_lock(&dpr->dpr_lock);
891 
892 	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
893 		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
894 		(void) pthread_cond_broadcast(&dpr->dpr_cv);
895 	}
896 
897 	(void) pthread_mutex_unlock(&dpr->dpr_lock);
898 }
899 
900 void
901 dt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
902 {
903 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
904 	int err = pthread_mutex_lock(&dpr->dpr_lock);
905 	assert(err == 0); /* check for recursion */
906 }
907 
908 void
909 dt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
910 {
911 	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
912 	int err = pthread_mutex_unlock(&dpr->dpr_lock);
913 	assert(err == 0); /* check for unheld lock */
914 }
915 
916 void
917 dt_proc_init(dtrace_hdl_t *dtp)
918 {
919 	extern char **environ;
920 	static char *envdef[] = {
921 		"LD_NOLAZYLOAD=1",	/* linker lazy loading hides funcs */
922 		NULL
923 	};
924 	char **p;
925 	int i;
926 
927 	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
928 	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) == NULL)
929 		return;
930 
931 	(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
932 	(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
933 
934 	dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
935 	dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
936 
937 	/*
938 	 * Count how big our environment needs to be.
939 	 */
940 	for (i = 1, p = environ; *p != NULL; i++, p++)
941 		continue;
942 	for (p = envdef; *p != NULL; i++, p++)
943 		continue;
944 
945 	if ((dtp->dt_proc_env = dt_zalloc(dtp, sizeof (char *) * i)) == NULL)
946 		return;
947 
948 	for (i = 0, p = environ; *p != NULL; i++, p++) {
949 		if ((dtp->dt_proc_env[i] = strdup(*p)) == NULL)
950 			goto err;
951 	}
952 	for (p = envdef; *p != NULL; i++, p++) {
953 		if ((dtp->dt_proc_env[i] = strdup(*p)) == NULL)
954 			goto err;
955 	}
956 
957 	return;
958 
959 err:
960 	while (--i != 0) {
961 		dt_free(dtp, dtp->dt_proc_env[i]);
962 	}
963 	dt_free(dtp, dtp->dt_proc_env);
964 	dtp->dt_proc_env = NULL;
965 }
966 
967 void
968 dt_proc_fini(dtrace_hdl_t *dtp)
969 {
970 	dt_proc_hash_t *dph = dtp->dt_procs;
971 	dt_proc_t *dpr;
972 	char **p;
973 
974 	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
975 		dt_proc_destroy(dtp, dpr->dpr_proc);
976 
977 	dtp->dt_procs = NULL;
978 	dt_free(dtp, dph);
979 
980 	for (p = dtp->dt_proc_env; *p != NULL; p++)
981 		dt_free(dtp, *p);
982 
983 	dt_free(dtp, dtp->dt_proc_env);
984 	dtp->dt_proc_env = NULL;
985 }
986 
987 struct ps_prochandle *
988 dtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
989     proc_child_func *pcf, void *child_arg)
990 {
991 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
992 	struct ps_prochandle *P = dt_proc_create(dtp, file, argv, pcf, child_arg);
993 
994 	if (P != NULL && idp != NULL && idp->di_id == 0) {
995 		idp->di_id = proc_getpid(P); /* $target = created pid */
996 	}
997 
998 	return (P);
999 }
1000 
1001 struct ps_prochandle *
1002 dtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1003 {
1004 	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1005 	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1006 
1007 	if (P != NULL && idp != NULL && idp->di_id == 0)
1008 		idp->di_id = pid; /* $target = grabbed pid */
1009 
1010 	return (P);
1011 }
1012 
1013 void
1014 dtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1015 {
1016 	dt_proc_release(dtp, P);
1017 }
1018 
1019 void
1020 dtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1021 {
1022 	dt_proc_continue(dtp, P);
1023 }
1024