/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/types.h>
#ifndef illumos
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/endian.h>
#ifdef illumos
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#ifdef illumos
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#ifdef illumos
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#ifdef illumos
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#ifdef illumos
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#ifdef illumos
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"

/* FreeBSD includes: */
#ifndef illumos
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/kdb.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ptrace.h>
#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>


#include <sys/mount.h>
#undef AT_UID
#undef AT_GID
#include <sys/vnode.h>
#include <sys/cred.h>

#include <sys/dtrace_bsd.h>

#include <netinet/in.h>

#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

#include "dtrace_xoroshiro128_plus.h"

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *     set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int dtrace_destructive_disallow = 0;
#ifndef illumos
/* Positive logic version of dtrace_destructive_disallow for loader tunable */
int dtrace_allow_destructive = 1;
#endif
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t dtrace_statvar_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 128;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
#ifndef illumos
int dtrace_memstr_max = 4096;
int dtrace_bufsize_max_frac = 128;
#endif

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#ifdef illumos
static dev_info_t *dtrace_devi; /* device info */
#endif
#ifdef illumos
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
#else
static taskq_t *dtrace_taskq; /* task queue */
static struct unrhdr *dtrace_arena; /* Probe ID number. */
#endif
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static int dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
static int dtrace_getf; /* number of unpriv getf()s */
#ifdef illumos
static void *dtrace_softstate; /* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t *dtrace_panicked; /* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
static int dtrace_dynvar_failclean; /* dynvars failed to clean */
#ifndef illumos
static struct mtx dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
static eventhandler_tag dtrace_kld_load_tag;
static eventhandler_tag dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
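
/*
 * Illustrative sketch only (no such path exists in this file): a caller
 * that needed all three framework locks at once would have to take them
 * in the documented order, e.g.:
 *
 *     mutex_enter(&dtrace_meta_lock);
 *     mutex_enter(&dtrace_provider_lock);
 *     mutex_enter(&dtrace_lock);
 *     ...
 *     mutex_exit(&dtrace_lock);
 *     mutex_exit(&dtrace_provider_lock);
 *     mutex_exit(&dtrace_meta_lock);
 */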
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */

#ifndef illumos
/* XXX FreeBSD hacks. */
#define cr_suid cr_svuid
#define cr_sgid cr_svgid
#define ipaddr_t in_addr_t
#define mod_modname pathname
#define vuprintf vprintf
#ifndef crgetzoneid
#define crgetzoneid(_a) 0
#endif
#define ttoproc(_a) ((_a)->td_proc)
#define SNOCD 0
#define CPU_ON_INTR(_a) 0

#define PRIV_EFFECTIVE (1 << 0)
#define PRIV_DTRACE_KERNEL (1 << 1)
#define PRIV_DTRACE_PROC (1 << 2)
#define PRIV_DTRACE_USER (1 << 3)
#define PRIV_PROC_OWNER (1 << 4)
#define PRIV_PROC_ZONE (1 << 5)
#define PRIV_ALL ~0

SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#ifdef illumos
#define curcpu CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t dtrace_provider_ops = {
    .dtps_provide = (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
    .dtps_provide_module = (void (*)(void *, modctl_t *))dtrace_nullop,
    .dtps_enable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    .dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    .dtps_getargdesc = NULL,
    .dtps_getargval = NULL,
    .dtps_usermode = NULL,
    .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
};

static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 *
 * These variables should be set dynamically to enable helper tracing.  The
 * only variables that should be set are dtrace_helptrace_enable (which should
 * be set to a non-zero value to allocate helper tracing buffers on the next
 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
 * non-zero value to deallocate helper tracing buffers on the next close of
 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
 * buffer size may also be set via dtrace_helptrace_bufsize.
 */
int dtrace_helptrace_enable = 0;
int dtrace_helptrace_disable = 0;
int dtrace_helptrace_bufsize = 16 * 1024 * 1024;
uint32_t dtrace_helptrace_nlocals;
static dtrace_helptrace_t *dtrace_helptrace_buffer;
static uint32_t dtrace_helptrace_next = 0;
static int dtrace_helptrace_wrapped = 0;

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define DTRACE_HASHSTR(hash, probe) \
    dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define DTRACE_HASHNEXT(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs) \
    (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define DTRACE_AGGHASHSIZE_SLEW 17

#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#ifdef illumos
#define DTRACE_TLS_THRKEY(where) { \
    uint_t intr = 0; \
    uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
    for (; actv; actv >>= 1) \
        intr++; \
    ASSERT(intr < (1 << 3)); \
    (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
        (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) { \
    solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
    uint_t intr = 0; \
    uint_t actv = _c->cpu_intr_actv; \
    for (; actv; actv >>= 1) \
        intr++; \
    ASSERT(intr < (1 << 3)); \
    (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
        (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif

#define DT_BSWAP_8(x) ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
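
/*
 * Worked examples: DT_BSWAP_16(0x1234) evaluates to 0x3412, and
 * DT_BSWAP_32(0x12345678) evaluates to 0x78563412.
 */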

#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
    *((type *)((uintptr_t)(tomax) + (size_t)offset)) = (type)(what);

#if !defined(__x86) && !defined(__aarch64__)
#define DTRACE_ALIGNCHECK(addr, size, flags) \
    if (addr & (size - 1)) { \
        *flags |= CPU_DTRACE_BADALIGN; \
        cpu_core[curcpu].cpuc_dtrace_illval = addr; \
        return (0); \
    }
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
    ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
    (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
    (testaddr) + (testsz) >= (testaddr))
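
/*
 * Worked example: with baseaddr 0x1000 and basesz 0x100, the range
 * (testaddr 0x1080, testsz 0x80) satisfies all three clauses, while
 * (0x1080, 0x81) fails the second clause; any range for which
 * testaddr + testsz wraps past zero fails the third (overflow) clause.
 */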

#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
    do { \
        if ((remp) != NULL) { \
            *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
        } \
    } while (0)


/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
    ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
    (mstate)->dtms_scratch_ptr >= (alloc_sz))
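
/*
 * Worked example: if dtms_scratch_base is 0x1000, dtms_scratch_size is
 * 0x100 and dtms_scratch_ptr has advanced to 0x1080, then 0x80 bytes of
 * scratch remain: DTRACE_INSCRATCH() succeeds for alloc_sz up to 0x80
 * and fails beyond that.
 */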

#define DTRACE_INSCRATCHPTR(mstate, ptr, howmany) \
    ((ptr) >= (mstate)->dtms_scratch_base && \
    (ptr) <= \
    ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - (howmany)))

#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
    size_t size = bits / NBBY; \
    /*CSTYLED*/ \
    uint##bits##_t rval; \
    int i; \
    volatile uint16_t *flags = (volatile uint16_t *) \
        &cpu_core[curcpu].cpuc_dtrace_flags; \
\
    DTRACE_ALIGNCHECK(addr, size, flags); \
\
    for (i = 0; i < dtrace_toxranges; i++) { \
        if (addr >= dtrace_toxrange[i].dtt_limit) \
            continue; \
\
        if (addr + size <= dtrace_toxrange[i].dtt_base) \
            continue; \
\
        /* \
         * This address falls within a toxic region; return 0. \
         */ \
        *flags |= CPU_DTRACE_BADADDR; \
        cpu_core[curcpu].cpuc_dtrace_illval = addr; \
        return (0); \
    } \
\
    __compiler_membar(); \
    *flags |= CPU_DTRACE_NOFAULT; \
    /*CSTYLED*/ \
    rval = *((volatile uint##bits##_t *)addr); \
    *flags &= ~CPU_DTRACE_NOFAULT; \
    __compiler_membar(); \
\
    return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}

#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2

#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64

#define DTRACE_FLAGS2FLT(flags) \
    (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
    ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
    ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
    ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
    ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
    ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
    ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
    ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
    ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
    DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act) \
    ((act)->dta_kind == DTRACEACT_DIFEXPR && \
    (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_matchall_task(void *);
static void dtrace_enabling_reap(void *);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static ssize_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_priv_proc(dtrace_state_t *);
static void dtrace_getf_barrier(void);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.)  If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
    va_list alist;

    va_start(alist, format);
#ifdef __FreeBSD__
    vpanic(format, alist);
#else
    dtrace_vpanic(format, alist);
#endif
    va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
    dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

    /*
     * We just need something here that even the most clever compiler
     * cannot optimize away.
     */
    return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
    /*
     * Most counters stored to in probe context are per-CPU counters.
     * However, there are some error conditions that are sufficiently
     * arcane that they don't merit per-CPU storage.  If these counters
     * are incremented concurrently on different CPUs, scalability will be
     * adversely affected -- but we don't expect them to be white-hot in a
     * correctly constructed enabling...
     */
    uint32_t oval, nval;

    do {
        oval = *counter;

        if ((nval = oval + 1) == 0) {
            /*
             * If the counter would wrap, set it to 1 -- assuring
             * that the counter is never zero when we have seen
             * errors.  (The counter must be 32-bits because we
             * aren't guaranteed a 64-bit compare&swap operation.)
             * To save this code both the infamy of being fingered
             * by a priggish news story and the indignity of being
             * the target of a neo-puritan witch trial, we're
             * carefully avoiding any colorful description of the
             * likelihood of this condition -- but suffice it to
             * say that it is only slightly more likely than the
             * overflow of predicate cache IDs, as discussed in
             * dtrace_predicate_create().
             */
            nval = 1;
        }
    } while (dtrace_cas32(counter, oval, nval) != oval);
}

void
dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
{
    cpuset_t cpus;

    if (cpu == DTRACE_CPUALL)
        cpus = all_cpus;
    else
        CPU_SETOF(cpu, &cpus);

    smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func,
        smp_no_rendezvous_barrier, arg);
}

static void
dtrace_sync_func(void)
{
}

void
dtrace_sync(void)
{
    dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
/* BEGIN CSTYLED */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
/* END CSTYLED */

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
    if (dest < mstate->dtms_scratch_base)
        return (0);

    if (dest + size < dest)
        return (0);

    if (dest + size > mstate->dtms_scratch_ptr)
        return (0);

    return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
    dtrace_statvar_t **svars, int nsvars)
{
    int i;
    size_t maxglobalsize, maxlocalsize;

    if (nsvars == 0)
        return (0);

    maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
    maxlocalsize = maxglobalsize * (mp_maxid + 1);

    for (i = 0; i < nsvars; i++) {
        dtrace_statvar_t *svar = svars[i];
        uint8_t scope;
        size_t size;

        if (svar == NULL || (size = svar->dtsv_size) == 0)
            continue;

        scope = svar->dtsv_var.dtdv_scope;

        /*
         * We verify that our size is valid in the spirit of providing
         * defense in depth: we want to prevent attackers from using
         * DTrace to escalate an orthogonal kernel heap corruption bug
         * into the ability to store to arbitrary locations in memory.
         */
        VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
            (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));

        if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
            svar->dtsv_size)) {
            DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
                svar->dtsv_size);
            return (1);
        }
    }

    return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canstore which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    /*
     * First, check to see if the address is in scratch space...
     */
    if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
        mstate->dtms_scratch_size)) {
        DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
            mstate->dtms_scratch_size);
        return (1);
    }

    /*
     * Now check to see if it's a dynamic variable.  This check will pick
     * up both thread-local variables and any global dynamically-allocated
     * variables.
     */
    if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
        vstate->dtvs_dynvars.dtds_size)) {
        dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
        uintptr_t base = (uintptr_t)dstate->dtds_base +
            (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
        uintptr_t chunkoffs;
        dtrace_dynvar_t *dvar;

        /*
         * Before we assume that we can store here, we need to make
         * sure that it isn't in our metadata -- storing to our
         * dynamic variable metadata would corrupt our state.  For
         * the range to not include any dynamic variable metadata,
         * it must:
         *
         *	(1) Start above the hash table that is at the base of
         *	the dynamic variable space
         *
         *	(2) Have a starting chunk offset that is beyond the
         *	dtrace_dynvar_t that is at the base of every chunk
         *
         *	(3) Not span a chunk boundary
         *
         *	(4) Not be in the tuple space of a dynamic variable
         *
         */
        if (addr < base)
            return (0);

        chunkoffs = (addr - base) % dstate->dtds_chunksize;

        if (chunkoffs < sizeof (dtrace_dynvar_t))
            return (0);

        if (chunkoffs + sz > dstate->dtds_chunksize)
            return (0);

        dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);

        if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
            return (0);

        if (chunkoffs < sizeof (dtrace_dynvar_t) +
            ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
            return (0);

        DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
        return (1);
    }

    /*
     * Finally, check the static local and global variables.  These checks
     * take the longest, so we perform them last.
     */
    if (dtrace_canstore_statvar(addr, sz, remain,
        vstate->dtvs_locals, vstate->dtvs_nlocals))
        return (1);

    if (dtrace_canstore_statvar(addr, sz, remain,
        vstate->dtvs_globals, vstate->dtvs_nglobals))
        return (1);

    return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
    file_t *fp;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
        DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
        return (1);
    }

    /*
     * You can obviously read that which you can store.
     */
    if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
        return (1);

    /*
     * We're allowed to read from our own string table.
     */
    if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
        mstate->dtms_difo->dtdo_strlen)) {
        DTRACE_RANGE_REMAIN(remain, addr,
            mstate->dtms_difo->dtdo_strtab,
            mstate->dtms_difo->dtdo_strlen);
        return (1);
    }

    if (vstate->dtvs_state != NULL &&
        dtrace_priv_proc(vstate->dtvs_state)) {
        proc_t *p;

        /*
         * When we have privileges to the current process, there are
         * several context-related kernel structures that are safe to
         * read, even absent the privilege to read from kernel memory.
         * These reads are safe because these structures contain only
         * state that (1) we're permitted to read, (2) is harmless or
         * (3) contains pointers to additional kernel state that we're
         * not permitted to read (and as such, do not present an
         * opportunity for privilege escalation).  Finally (and
         * critically), because of the nature of their relation with
         * the current thread context, the memory associated with these
         * structures cannot change over the duration of probe context,
         * and it is therefore impossible for this memory to be
         * deallocated and reallocated as something else while it's
         * being operated upon.
         */
        if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
            DTRACE_RANGE_REMAIN(remain, addr, curthread,
                sizeof (kthread_t));
            return (1);
        }

        if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
            sz, curthread->t_procp, sizeof (proc_t))) {
            DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
                sizeof (proc_t));
            return (1);
        }

        if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
            curthread->t_cred, sizeof (cred_t))) {
            DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
                sizeof (cred_t));
            return (1);
        }

#ifdef illumos
        if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
            &(p->p_pidp->pid_id), sizeof (pid_t))) {
            DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
                sizeof (pid_t));
            return (1);
        }

        if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
            curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
            DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
                offsetof(cpu_t, cpu_pause_thread));
            return (1);
        }
#endif
    }

    if ((fp = mstate->dtms_getf) != NULL) {
        uintptr_t psz = sizeof (void *);
        vnode_t *vp;
        vnodeops_t *op;

        /*
         * When getf() returns a file_t, the enabling is implicitly
         * granted the (transient) right to read the returned file_t
         * as well as the v_path and v_op->vnop_name of the underlying
         * vnode.  These accesses are allowed after a successful
         * getf() because the members that they refer to cannot change
         * once set -- and the barrier logic in the kernel's closef()
         * path assures that the file_t and its referenced vnode_t
         * cannot themselves be stale (that is, it is impossible for
         * either dtms_getf itself or its f_vnode member to reference
         * freed memory).
         */
        if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
            DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
            return (1);
        }

        if ((vp = fp->f_vnode) != NULL) {
            size_t slen;
#ifdef illumos
            if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
                DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
                    psz);
                return (1);
            }
            slen = strlen(vp->v_path) + 1;
            if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
                DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
                    slen);
                return (1);
            }
#endif

            if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
                DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
                    psz);
                return (1);
            }

#ifdef illumos
            if ((op = vp->v_op) != NULL &&
                DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
                DTRACE_RANGE_REMAIN(remain, addr,
                    &op->vnop_name, psz);
                return (1);
            }

            if (op != NULL && op->vnop_name != NULL &&
                DTRACE_INRANGE(addr, sz, op->vnop_name,
                (slen = strlen(op->vnop_name) + 1))) {
                DTRACE_RANGE_REMAIN(remain, addr,
                    op->vnop_name, slen);
                return (1);
            }
#endif
        }
    }

    DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
    *illval = addr;
    return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    size_t rsize;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
        DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
        return (1);
    }

    /*
     * Even if the caller is uninterested in querying the remaining valid
     * range, it is required to ensure that the access is allowed.
     */
    if (remain == NULL) {
        remain = &rsize;
    }
    if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
        size_t strsz;
        /*
         * Perform the strlen after determining the length of the
         * memory region which is accessible.  This prevents timing
         * information from being used to find NULs in memory which is
         * not accessible to the caller.
         */
        strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
            MIN(sz, *remain));
        if (strsz <= *remain) {
            return (1);
        }
    }

    return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    size_t sz;
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    /*
     * Calculate the max size before performing any checks since even
     * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
     * return the max length via 'remain'.
     */
    if (type->dtdt_kind == DIF_TYPE_STRING) {
        dtrace_state_t *state = vstate->dtvs_state;

        if (state != NULL) {
            sz = state->dts_options[DTRACEOPT_STRSIZE];
        } else {
            /*
             * In helper context, we have a NULL state; fall back
             * to using the system-wide default for the string size
             * in this case.
             */
            sz = dtrace_strsize_default;
        }
    } else {
        sz = type->dtdt_size;
    }

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
        DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
        return (1);
    }

    if (type->dtdt_kind == DIF_TYPE_STRING) {
        return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
            vstate));
    }
    return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
        vstate));
}

/*
 * Convert a string to a signed integer using safe loads.
 *
 * NOTE: This function uses various macros from strtolctype.h to manipulate
 * digit values, etc -- these have all been checked to ensure they make
 * no additional function calls.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
    uintptr_t pos = (uintptr_t)input;
    int64_t val = 0;
    int x;
    boolean_t neg = B_FALSE;
    char c, cc, ccc;
    uintptr_t end = pos + limit;

    /*
     * Consume any whitespace preceding digits.
     */
    while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
        pos++;

    /*
     * Handle an explicit sign if one is present.
     */
    if (c == '-' || c == '+') {
        if (c == '-')
            neg = B_TRUE;
        c = dtrace_load8(++pos);
    }

    /*
     * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
     * if present.
     */
    if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
        cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
        pos += 2;
        c = ccc;
    }

    /*
     * Read in contiguous digits until the first non-digit character.
     */
    for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
        c = dtrace_load8(++pos))
        val = val * base + x;

    return (neg ? -val : val);
}
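
/*
 * Worked example: dtrace_strtoll("-0x1a", 16, 10) consumes the sign and
 * the "0x" prefix and returns -26; parsing stops at the first character
 * that is not a valid digit in the given base.
 */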

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
    uint8_t c1, c2;
    volatile uint16_t *flags;

    if (s1 == s2 || limit == 0)
        return (0);

    flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

    do {
        if (s1 == NULL) {
            c1 = '\0';
        } else {
            c1 = dtrace_load8((uintptr_t)s1++);
        }

        if (s2 == NULL) {
            c2 = '\0';
        } else {
            c2 = dtrace_load8((uintptr_t)s2++);
        }

        if (c1 != c2)
            return (c1 - c2);
    } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

    return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
    uint_t len;

    for (len = 0; len != lim; len++) {
        if (dtrace_load8((uintptr_t)s++) == '\0')
            break;
    }

    return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
    uintptr_t taddr, tsize;
    int i;

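    /*
     * Each comparison below relies on unsigned wraparound: subtracting
     * one address from another and comparing against a size tests
     * membership in a half-open interval without an overflow-prone
     * addition.
     */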
    for (i = 0; i < dtrace_toxranges; i++) {
        taddr = dtrace_toxrange[i].dtt_base;
        tsize = dtrace_toxrange[i].dtt_limit - taddr;

        if (kaddr - taddr < tsize) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
            return (1);
        }

        if (taddr - kaddr < size) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[curcpu].cpuc_dtrace_illval = taddr;
            return (1);
        }
    }

    return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst;
        const uint8_t *s2 = src;

        if (s1 <= s2) {
            do {
                *s1++ = dtrace_load8((uintptr_t)s2++);
            } while (--len != 0);
        } else {
            s2 += len;
            s1 += len;

            do {
                *--s1 = dtrace_load8((uintptr_t)--s2);
            } while (--len != 0);
        }
    }
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst, c;
        const uint8_t *s2 = src;

        do {
            *s1++ = c = dtrace_load8((uintptr_t)s2++);
        } while (--len != 0 && c != '\0');
    }
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    if (type->dtdt_kind == DIF_TYPE_STRING) {
        dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
    } else {
        dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
    }
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
    volatile uint16_t *flags;

    flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

    if (s1 == s2)
        return (0);

    if (s1 == NULL || s2 == NULL)
        return (1);

    if (s1 != s2 && len != 0) {
        const uint8_t *ps1 = s1;
        const uint8_t *ps2 = s2;

        do {
            if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
                return (1);
        } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
    }
    return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
    uchar_t *cp;

    for (cp = dst; len != 0; len--)
        *cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
    uint64_t result[2];

    result[0] = addend1[0] + addend2[0];
    result[1] = addend1[1] + addend2[1] +
        (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

    sum[0] = result[0];
    sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
    uint64_t mask;

    if (b == 0)
        return;

    if (b < 0) {
        b = -b;
        if (b >= 64) {
            a[0] = a[1] >> (b - 64);
            a[1] = 0;
        } else {
            a[0] >>= b;
            mask = 1LL << (64 - b);
            mask -= 1;
            a[0] |= ((a[1] & mask) << (64 - b));
            a[1] >>= b;
        }
    } else {
        if (b >= 64) {
            a[1] = a[0] << (b - 64);
            a[0] = 0;
        } else {
            a[1] <<= b;
            mask = a[0] >> (64 - b);
            a[1] |= mask;
            a[0] <<= b;
        }
    }
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
    uint64_t hi1, hi2, lo1, lo2;
    uint64_t tmp[2];

    hi1 = factor1 >> 32;
    hi2 = factor2 >> 32;

    lo1 = factor1 & DT_MASK_LO;
    lo2 = factor2 & DT_MASK_LO;

    product[0] = lo1 * lo2;
    product[1] = hi1 * hi2;

    tmp[0] = hi1 * lo2;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);

    tmp[0] = hi2 * lo1;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);
}
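
/*
 * Quick sanity example: with factor1 = factor2 = (1ULL << 32), hi1 and
 * hi2 are both 1 and lo1 and lo2 are both 0, so both partial products in
 * tmp are zero and the result is product[1] = 1, product[0] = 0, i.e.
 * 2^64, as expected.
 */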

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL &&
        s_cr->cr_uid == cr->cr_uid &&
        s_cr->cr_uid == cr->cr_ruid &&
        s_cr->cr_uid == cr->cr_suid &&
        s_cr->cr_gid == cr->cr_gid &&
        s_cr->cr_gid == cr->cr_rgid &&
        s_cr->cr_gid == cr->cr_sgid)
        return (1);

    return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
#ifdef illumos
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
        return (1);

    return (0);
#else
    return (1);
#endif
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
    proc_t *proc;

    if ((proc = ttoproc(curthread)) != NULL &&
        !(proc->p_flag & SNOCD))
        return (1);

    return (0);
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
    int action = state->dts_cred.dcr_action;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
        dtrace_priv_proc_common_zone(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
        dtrace_priv_proc_common_user(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
        dtrace_priv_proc_common_nocd() == 0)
        goto bad;

    return (1);

bad:
    cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
        return (1);

    if (dtrace_priv_proc_common_zone(state) &&
        dtrace_priv_proc_common_user(state) &&
        dtrace_priv_proc_common_nocd())
        return (1);

    cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
        return (1);

    cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
        return (1);

    cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
        return (1);

    cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

/*
 * Determine if the dte_cond of the specified ECB allows for processing of
 * the current probe to continue.  Note that this routine may allow continued
 * processing, but with access(es) stripped from the mstate's dtms_access
 * field.
 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
    dtrace_probe_t *probe = ecb->dte_probe;
    dtrace_provider_t *prov = probe->dtpr_provider;
    dtrace_pops_t *pops = &prov->dtpv_pops;
    int mode = DTRACE_MODE_NOPRIV_DROP;

    ASSERT(ecb->dte_cond);

#ifdef illumos
    if (pops->dtps_mode != NULL) {
        mode = pops->dtps_mode(prov->dtpv_arg,
            probe->dtpr_id, probe->dtpr_arg);

        ASSERT((mode & DTRACE_MODE_USER) ||
            (mode & DTRACE_MODE_KERNEL));
        ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
            (mode & DTRACE_MODE_NOPRIV_DROP));
    }

    /*
     * If the dte_cond bits indicate that this consumer is only allowed to
     * see user-mode firings of this probe, call the provider's dtps_mode()
     * entry point to check that the probe was fired while in a user
     * context.  If that's not the case, use the policy specified by the
     * provider to determine if we drop the probe or merely restrict
     * operation.
     */
    if (ecb->dte_cond & DTRACE_COND_USERMODE) {
        ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

        if (!(mode & DTRACE_MODE_USER)) {
            if (mode & DTRACE_MODE_NOPRIV_DROP)
                return (0);

            mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
        }
    }
#endif

1700 /*
1701 * This is more subtle than it looks. We have to be absolutely certain
1702 * that CRED() isn't going to change out from under us so it's only
1703 * legit to examine that structure if we're in constrained situations.
1704 * Currently, the only time we'll pass this check is if a non-super-user
1705 * has enabled the profile or syscall providers -- providers that
1706 * allow visibility of all processes. For the profile case, the check
1707 * above will ensure that we're examining a user context.
1708 */
1709 if (ecb->dte_cond & DTRACE_COND_OWNER) {
1710 cred_t *cr;
1711 cred_t *s_cr = state->dts_cred.dcr_cred;
1712 proc_t *proc;
1713
1714 ASSERT(s_cr != NULL);
1715
1716 if ((cr = CRED()) == NULL ||
1717 s_cr->cr_uid != cr->cr_uid ||
1718 s_cr->cr_uid != cr->cr_ruid ||
1719 s_cr->cr_uid != cr->cr_suid ||
1720 s_cr->cr_gid != cr->cr_gid ||
1721 s_cr->cr_gid != cr->cr_rgid ||
1722 s_cr->cr_gid != cr->cr_sgid ||
1723 (proc = ttoproc(curthread)) == NULL ||
1724 (proc->p_flag & SNOCD)) {
1725 if (mode & DTRACE_MODE_NOPRIV_DROP)
1726 return (0);
1727
1728 #ifdef illumos
1729 mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1730 #endif
1731 }
1732 }
1733
1734 #ifdef illumos
1735 /*
1736 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1737 * in our zone, check to see if our mode policy is to restrict rather
1738 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1739 * and DTRACE_ACCESS_ARGS.
1740 */
1741 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1742 cred_t *cr;
1743 cred_t *s_cr = state->dts_cred.dcr_cred;
1744
1745 ASSERT(s_cr != NULL);
1746
1747 if ((cr = CRED()) == NULL ||
1748 s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1749 if (mode & DTRACE_MODE_NOPRIV_DROP)
1750 return (0);
1751
1752 mstate->dtms_access &=
1753 ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1754 }
1755 }
1756 #endif
1757
1758 return (1);
1759 }
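
/*
 * To recap dtrace_priv_probe(): it enforces up to three independent
 * conditions.  DTRACE_COND_USERMODE restricts the consumer to user-mode
 * firings, DTRACE_COND_OWNER requires the consumer's credentials to match
 * those of the traced process, and DTRACE_COND_ZONEOWNER requires the
 * consumer to share the traced process's zone.  A failed condition either
 * drops the probe outright or merely strips bits from dtms_access,
 * according to the provider's DTRACE_MODE_NOPRIV_{DROP,RESTRICT} policy.
 */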
1760
1761 /*
1762 * Note: not called from probe context. This function is called
1763 * asynchronously (and at a regular interval) from outside of probe context to
1764 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1765 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1766 */
1767 void
1768 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1769 {
1770 dtrace_dynvar_t *dirty;
1771 dtrace_dstate_percpu_t *dcpu;
1772 dtrace_dynvar_t **rinsep;
1773 int i, j, work = 0;
1774
1775 CPU_FOREACH(i) {
1776 dcpu = &dstate->dtds_percpu[i];
1777 rinsep = &dcpu->dtdsc_rinsing;
1778
1779 /*
1780 * If the dirty list is NULL, there is no dirty work to do.
1781 */
1782 if (dcpu->dtdsc_dirty == NULL)
1783 continue;
1784
1785 if (dcpu->dtdsc_rinsing != NULL) {
1786 /*
1787 * If the rinsing list is non-NULL, then it is because
1788 * this CPU was selected to accept another CPU's
1789 * dirty list -- and since that time, dirty buffers
1790 * have accumulated. This is a highly unlikely
1791 * condition, but we choose to ignore the dirty
1792 * buffers -- they'll be picked up by a future cleanse.
1793 */
1794 continue;
1795 }
1796
1797 if (dcpu->dtdsc_clean != NULL) {
1798 /*
1799 * If the clean list is non-NULL, then we're in a
1800 * situation where a CPU has done deallocations (we
1801 * have a non-NULL dirty list) but no allocations (we
1802 * also have a non-NULL clean list). We can't simply
1803 * move the dirty list into the clean list on this
1804 * CPU, yet we also don't want to allow this condition
1805 * to persist, lest a short clean list prevent a
1806 * massive dirty list from being cleaned (which in
1807 * turn could lead to otherwise avoidable dynamic
1808 * drops). To deal with this, we look for some CPU
1809 * with a NULL clean list, NULL dirty list, and NULL
1810 * rinsing list -- and then we borrow this CPU to
1811 * rinse our dirty list.
1812 */
1813 CPU_FOREACH(j) {
1814 dtrace_dstate_percpu_t *rinser;
1815
1816 rinser = &dstate->dtds_percpu[j];
1817
1818 if (rinser->dtdsc_rinsing != NULL)
1819 continue;
1820
1821 if (rinser->dtdsc_dirty != NULL)
1822 continue;
1823
1824 if (rinser->dtdsc_clean != NULL)
1825 continue;
1826
1827 rinsep = &rinser->dtdsc_rinsing;
1828 break;
1829 }
1830
1831 if (j > mp_maxid) {
1832 /*
1833 * We were unable to find another CPU that
1834 * could accept this dirty list -- we are
1835 * therefore unable to clean it now.
1836 */
1837 dtrace_dynvar_failclean++;
1838 continue;
1839 }
1840 }
1841
1842 work = 1;
1843
1844 /*
1845 * Atomically move the dirty list aside.
1846 */
1847 do {
1848 dirty = dcpu->dtdsc_dirty;
1849
1850 /*
1851 * Before we zap the dirty list, set the rinsing list.
1852 * (This allows for a potential assertion in
1853 * dtrace_dynvar(): if a free dynamic variable appears
1854 * on a hash chain, either the dirty list or the
1855 * rinsing list for some CPU must be non-NULL.)
1856 */
1857 *rinsep = dirty;
1858 dtrace_membar_producer();
1859 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1860 dirty, NULL) != dirty);
1861 }
1862
1863 if (!work) {
1864 /*
1865 * We have no work to do; we can simply return.
1866 */
1867 return;
1868 }
1869
1870 dtrace_sync();
1871
1872 CPU_FOREACH(i) {
1873 dcpu = &dstate->dtds_percpu[i];
1874
1875 if (dcpu->dtdsc_rinsing == NULL)
1876 continue;
1877
1878 /*
1879 * We are now guaranteed that no hash chain contains a pointer
1880 * into this dirty list; we can make it clean.
1881 */
1882 ASSERT(dcpu->dtdsc_clean == NULL);
1883 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1884 dcpu->dtdsc_rinsing = NULL;
1885 }
1886
1887 /*
1888 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1889 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1890 * This prevents a race whereby a CPU incorrectly decides that
1891 * the state should be something other than DTRACE_DSTATE_CLEAN
1892 * after dtrace_dynvar_clean() has completed.
1893 */
1894 dtrace_sync();
1895
1896 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1897 }
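
/*
 * Taken together with the allocator in dtrace_dynvar() below, the
 * resulting lifecycle of a freed chunk is:
 *
 *	dirty -> rinsing -> clean -> free -> (allocated) -> dirty ...
 *
 * where the dirty-to-rinsing step is performed here, the dtrace_sync()
 * above guarantees that no CPU still holds a hash-chain pointer into a
 * rinsing list before that list is declared clean, and the clean-to-free
 * step is performed (via dtrace_casptr()) by the allocating CPU itself.
 */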
1898
1899 /*
1900  * Depending on the value of the op parameter, this function looks up,
1901  * allocates, or deallocates an arbitrarily-keyed dynamic variable. If an
1902 * allocation is requested, this function will return a pointer to a
1903 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1904 * variable can be allocated. If NULL is returned, the appropriate counter
1905 * will be incremented.
1906 */
1907 dtrace_dynvar_t *
1908 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1909 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1910 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1911 {
1912 uint64_t hashval = DTRACE_DYNHASH_VALID;
1913 dtrace_dynhash_t *hash = dstate->dtds_hash;
1914 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1915 processorid_t me = curcpu, cpu = me;
1916 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1917 size_t bucket, ksize;
1918 size_t chunksize = dstate->dtds_chunksize;
1919 uintptr_t kdata, lock, nstate;
1920 uint_t i;
1921
1922 ASSERT(nkeys != 0);
1923
1924 /*
1925 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1926 * algorithm. For the by-value portions, we perform the algorithm in
1927 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1928 * bit, and seems to have only a minute effect on distribution. For
1929 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1930 * over each referenced byte. It's painful to do this, but it's much
1931 * better than pathological hash distribution. The efficacy of the
1932 * hashing algorithm (and a comparison with other algorithms) may be
1933 * found by running the ::dtrace_dynstat MDB dcmd.
1934 */
1935 for (i = 0; i < nkeys; i++) {
1936 if (key[i].dttk_size == 0) {
1937 uint64_t val = key[i].dttk_value;
1938
1939 hashval += (val >> 48) & 0xffff;
1940 hashval += (hashval << 10);
1941 hashval ^= (hashval >> 6);
1942
1943 hashval += (val >> 32) & 0xffff;
1944 hashval += (hashval << 10);
1945 hashval ^= (hashval >> 6);
1946
1947 hashval += (val >> 16) & 0xffff;
1948 hashval += (hashval << 10);
1949 hashval ^= (hashval >> 6);
1950
1951 hashval += val & 0xffff;
1952 hashval += (hashval << 10);
1953 hashval ^= (hashval >> 6);
1954 } else {
1955 /*
1956 * This is incredibly painful, but it beats the hell
1957 * out of the alternative.
1958 */
1959 uint64_t j, size = key[i].dttk_size;
1960 uintptr_t base = (uintptr_t)key[i].dttk_value;
1961
1962 if (!dtrace_canload(base, size, mstate, vstate))
1963 break;
1964
1965 for (j = 0; j < size; j++) {
1966 hashval += dtrace_load8(base + j);
1967 hashval += (hashval << 10);
1968 hashval ^= (hashval >> 6);
1969 }
1970 }
1971 }
1972
1973 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1974 return (NULL);
1975
1976 hashval += (hashval << 3);
1977 hashval ^= (hashval >> 11);
1978 hashval += (hashval << 15);
1979
1980 /*
1981 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1982 * comes out to be one of our two sentinel hash values. If this
1983 * actually happens, we set the hashval to be a value known to be a
1984 * non-sentinel value.
1985 */
1986 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1987 hashval = DTRACE_DYNHASH_VALID;
1988
1989 /*
1990 * Yes, it's painful to do a divide here. If the cycle count becomes
1991 * important here, tricks can be pulled to reduce it. (However, it's
1992 * critical that hash collisions be kept to an absolute minimum;
1993 * they're much more painful than a divide.) It's better to have a
1994 * solution that generates few collisions and still keeps things
1995 * relatively simple.
1996 */
1997 bucket = hashval % dstate->dtds_hashsize;
1998
1999 if (op == DTRACE_DYNVAR_DEALLOC) {
2000 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
2001
2002 for (;;) {
2003 while ((lock = *lockp) & 1)
2004 continue;
2005
2006 if (dtrace_casptr((volatile void *)lockp,
2007 (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
2008 break;
2009 }
2010
2011 dtrace_membar_producer();
2012 }
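
	/*
	 * (The dtdh_lock word acts as a small sequence lock: an odd value
	 * means that a deallocation owns this bucket, and the unlock in
	 * the dealloc paths is the increment back to an even value.
	 * Lookups, by contrast, run locklessly: they snapshot the lock
	 * word at "top" below and retry if it has changed by the time
	 * they finish.)
	 */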
2013
2014 top:
2015 prev = NULL;
2016 lock = hash[bucket].dtdh_lock;
2017
2018 dtrace_membar_consumer();
2019
2020 start = hash[bucket].dtdh_chain;
2021 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
2022 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
2023 op != DTRACE_DYNVAR_DEALLOC));
2024
2025 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
2026 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
2027 dtrace_key_t *dkey = &dtuple->dtt_key[0];
2028
2029 if (dvar->dtdv_hashval != hashval) {
2030 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
2031 /*
2032 * We've reached the sink, and therefore the
2033 * end of the hash chain; we can kick out of
2034 * the loop knowing that we have seen a valid
2035 * snapshot of state.
2036 */
2037 ASSERT(dvar->dtdv_next == NULL);
2038 ASSERT(dvar == &dtrace_dynhash_sink);
2039 break;
2040 }
2041
2042 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2043 /*
2044 * We've gone off the rails: somewhere along
2045 * the line, one of the members of this hash
2046 * chain was deleted. Note that we could also
2047 * detect this by simply letting this loop run
2048 * to completion, as we would eventually hit
2049 * the end of the dirty list. However, we
2050 * want to avoid running the length of the
2051 * dirty list unnecessarily (it might be quite
2052 * long), so we catch this as early as
2053 * possible by detecting the hash marker. In
2054 * this case, we simply set dvar to NULL and
2055 * break; the conditional after the loop will
2056 * send us back to top.
2057 */
2058 dvar = NULL;
2059 break;
2060 }
2061
2062 goto next;
2063 }
2064
2065 if (dtuple->dtt_nkeys != nkeys)
2066 goto next;
2067
2068 for (i = 0; i < nkeys; i++, dkey++) {
2069 if (dkey->dttk_size != key[i].dttk_size)
2070 goto next; /* size or type mismatch */
2071
2072 if (dkey->dttk_size != 0) {
2073 if (dtrace_bcmp(
2074 (void *)(uintptr_t)key[i].dttk_value,
2075 (void *)(uintptr_t)dkey->dttk_value,
2076 dkey->dttk_size))
2077 goto next;
2078 } else {
2079 if (dkey->dttk_value != key[i].dttk_value)
2080 goto next;
2081 }
2082 }
2083
2084 if (op != DTRACE_DYNVAR_DEALLOC)
2085 return (dvar);
2086
2087 ASSERT(dvar->dtdv_next == NULL ||
2088 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2089
2090 if (prev != NULL) {
2091 ASSERT(hash[bucket].dtdh_chain != dvar);
2092 ASSERT(start != dvar);
2093 ASSERT(prev->dtdv_next == dvar);
2094 prev->dtdv_next = dvar->dtdv_next;
2095 } else {
2096 if (dtrace_casptr(&hash[bucket].dtdh_chain,
2097 start, dvar->dtdv_next) != start) {
2098 /*
2099 * We have failed to atomically swing the
2100 * hash table head pointer, presumably because
2101 * of a conflicting allocation on another CPU.
2102 * We need to reread the hash chain and try
2103 * again.
2104 */
2105 goto top;
2106 }
2107 }
2108
2109 dtrace_membar_producer();
2110
2111 /*
2112 * Now set the hash value to indicate that it's free.
2113 */
2114 ASSERT(hash[bucket].dtdh_chain != dvar);
2115 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2116
2117 dtrace_membar_producer();
2118
2119 /*
2120 * Set the next pointer to point at the dirty list, and
2121 * atomically swing the dirty pointer to the newly freed dvar.
2122 */
2123 do {
2124 next = dcpu->dtdsc_dirty;
2125 dvar->dtdv_next = next;
2126 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2127
2128 /*
2129 * Finally, unlock this hash bucket.
2130 */
2131 ASSERT(hash[bucket].dtdh_lock == lock);
2132 ASSERT(lock & 1);
2133 hash[bucket].dtdh_lock++;
2134
2135 return (NULL);
2136 next:
2137 prev = dvar;
2138 continue;
2139 }
2140
2141 if (dvar == NULL) {
2142 /*
2143 * If dvar is NULL, it is because we went off the rails:
2144 * one of the elements that we traversed in the hash chain
2145 * was deleted while we were traversing it. In this case,
2146 * we assert that we aren't doing a dealloc (deallocs lock
2147 * the hash bucket to prevent themselves from racing with
2148 * one another), and retry the hash chain traversal.
2149 */
2150 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2151 goto top;
2152 }
2153
2154 if (op != DTRACE_DYNVAR_ALLOC) {
2155 /*
2156 * If we are not to allocate a new variable, we want to
2157 * return NULL now. Before we return, check that the value
2158 * of the lock word hasn't changed. If it has, we may have
2159 * seen an inconsistent snapshot.
2160 */
2161 if (op == DTRACE_DYNVAR_NOALLOC) {
2162 if (hash[bucket].dtdh_lock != lock)
2163 goto top;
2164 } else {
2165 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2166 ASSERT(hash[bucket].dtdh_lock == lock);
2167 ASSERT(lock & 1);
2168 hash[bucket].dtdh_lock++;
2169 }
2170
2171 return (NULL);
2172 }
2173
2174 /*
2175 * We need to allocate a new dynamic variable. The size we need is the
2176 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2177 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2178 * the size of any referred-to data (dsize). We then round the final
2179 * size up to the chunksize for allocation.
2180 */
2181 for (ksize = 0, i = 0; i < nkeys; i++)
2182 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2183
2184 /*
2185 * This should be pretty much impossible, but could happen if, say,
2186 * strange DIF specified the tuple. Ideally, this should be an
2187 * assertion and not an error condition -- but that requires that the
2188 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2189 * bullet-proof. (That is, it must not be able to be fooled by
2190 * malicious DIF.) Given the lack of backwards branches in DIF,
2191 * solving this would presumably not amount to solving the Halting
2192 * Problem -- but it still seems awfully hard.
2193 */
2194 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2195 ksize + dsize > chunksize) {
2196 dcpu->dtdsc_drops++;
2197 return (NULL);
2198 }
2199
2200 nstate = DTRACE_DSTATE_EMPTY;
2201
2202 do {
2203 retry:
2204 free = dcpu->dtdsc_free;
2205
2206 if (free == NULL) {
2207 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2208 void *rval;
2209
2210 if (clean == NULL) {
2211 /*
2212 * We're out of dynamic variable space on
2213 * this CPU. Unless we have tried all CPUs,
2214 * we'll try to allocate from a different
2215 * CPU.
2216 */
2217 switch (dstate->dtds_state) {
2218 case DTRACE_DSTATE_CLEAN: {
2219 void *sp = &dstate->dtds_state;
2220
2221 if (++cpu > mp_maxid)
2222 cpu = 0;
2223
2224 if (dcpu->dtdsc_dirty != NULL &&
2225 nstate == DTRACE_DSTATE_EMPTY)
2226 nstate = DTRACE_DSTATE_DIRTY;
2227
2228 if (dcpu->dtdsc_rinsing != NULL)
2229 nstate = DTRACE_DSTATE_RINSING;
2230
2231 dcpu = &dstate->dtds_percpu[cpu];
2232
2233 if (cpu != me)
2234 goto retry;
2235
2236 (void) dtrace_cas32(sp,
2237 DTRACE_DSTATE_CLEAN, nstate);
2238
2239 /*
2240 * To increment the correct bean
2241 * counter, take another lap.
2242 */
2243 goto retry;
2244 }
2245
2246 case DTRACE_DSTATE_DIRTY:
2247 dcpu->dtdsc_dirty_drops++;
2248 break;
2249
2250 case DTRACE_DSTATE_RINSING:
2251 dcpu->dtdsc_rinsing_drops++;
2252 break;
2253
2254 case DTRACE_DSTATE_EMPTY:
2255 dcpu->dtdsc_drops++;
2256 break;
2257 }
2258
2259 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2260 return (NULL);
2261 }
2262
2263 /*
2264 * The clean list appears to be non-empty. We want to
2265 * move the clean list to the free list; we start by
2266 * moving the clean pointer aside.
2267 */
2268 if (dtrace_casptr(&dcpu->dtdsc_clean,
2269 clean, NULL) != clean) {
2270 /*
2271 * We are in one of two situations:
2272 *
2273 * (a) The clean list was switched to the
2274 * free list by another CPU.
2275 *
2276 * (b) The clean list was added to by the
2277 * cleansing cyclic.
2278 *
2279 * In either of these situations, we can
2280 * just reattempt the free list allocation.
2281 */
2282 goto retry;
2283 }
2284
2285 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2286
2287 /*
2288 * Now we'll move the clean list to our free list.
2289 * It's impossible for this to fail: the only way
2290 * the free list can be updated is through this
2291 * code path, and only one CPU can own the clean list.
2292 * Thus, it would only be possible for this to fail if
2293 * this code were racing with dtrace_dynvar_clean().
2294 * (That is, if dtrace_dynvar_clean() updated the clean
2295 * list, and we ended up racing to update the free
2296 * list.) This race is prevented by the dtrace_sync()
2297 * in dtrace_dynvar_clean() -- which flushes the
2298 * owners of the clean lists out before resetting
2299 * the clean lists.
2300 */
2301 dcpu = &dstate->dtds_percpu[me];
2302 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2303 ASSERT(rval == NULL);
2304 goto retry;
2305 }
2306
2307 dvar = free;
2308 new_free = dvar->dtdv_next;
2309 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2310
2311 /*
2312 * We have now allocated a new chunk. We copy the tuple keys into the
2313 * tuple array and copy any referenced key data into the data space
2314 * following the tuple array. As we do this, we relocate dttk_value
2315 * in the final tuple to point to the key data address in the chunk.
2316 */
2317 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2318 dvar->dtdv_data = (void *)(kdata + ksize);
2319 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2320
2321 for (i = 0; i < nkeys; i++) {
2322 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2323 size_t kesize = key[i].dttk_size;
2324
2325 if (kesize != 0) {
2326 dtrace_bcopy(
2327 (const void *)(uintptr_t)key[i].dttk_value,
2328 (void *)kdata, kesize);
2329 dkey->dttk_value = kdata;
2330 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2331 } else {
2332 dkey->dttk_value = key[i].dttk_value;
2333 }
2334
2335 dkey->dttk_size = kesize;
2336 }
2337
2338 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2339 dvar->dtdv_hashval = hashval;
2340 dvar->dtdv_next = start;
2341
2342 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2343 return (dvar);
2344
2345 /*
2346 * The cas has failed. Either another CPU is adding an element to
2347 * this hash chain, or another CPU is deleting an element from this
2348 * hash chain. The simplest way to deal with both of these cases
2349 * (though not necessarily the most efficient) is to free our
2350 * allocated block and re-attempt it all. Note that the free is
2351 * to the dirty list and _not_ to the free list. This is to prevent
2352 * races with allocators, above.
2353 */
2354 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2355
2356 dtrace_membar_producer();
2357
2358 do {
2359 free = dcpu->dtdsc_dirty;
2360 dvar->dtdv_next = free;
2361 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2362
2363 goto top;
2364 }
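
/*
 * Illustrative sketch (hypothetical, not part of the framework): the tuple
 * hash at the top of dtrace_dynvar() is Jenkins' "One-at-a-time" hash, fed
 * sixteen bits at a time for by-value keys and finished with the usual
 * avalanche.  For a single 64-bit by-value key, it reduces to:
 */
static uint64_t
dtrace_example_dynhash(uint64_t val)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	int shift;

	for (shift = 48; shift >= 0; shift -= 16) {
		hashval += (val >> shift) & 0xffff;
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);	/* final avalanche */
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * As above, remap the two sentinel values onto a known-valid one.
	 */
	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
		hashval = DTRACE_DYNHASH_VALID;

	return (hashval);
}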
2365
2366 /*ARGSUSED*/
2367 static void
2368 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2369 {
2370 if ((int64_t)nval < (int64_t)*oval)
2371 *oval = nval;
2372 }
2373
2374 /*ARGSUSED*/
2375 static void
2376 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2377 {
2378 if ((int64_t)nval > (int64_t)*oval)
2379 *oval = nval;
2380 }
2381
2382 static void
2383 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2384 {
2385 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2386 int64_t val = (int64_t)nval;
2387
2388 if (val < 0) {
2389 for (i = 0; i < zero; i++) {
2390 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2391 quanta[i] += incr;
2392 return;
2393 }
2394 }
2395 } else {
2396 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2397 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2398 quanta[i - 1] += incr;
2399 return;
2400 }
2401 }
2402
2403 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2404 return;
2405 }
2406
2407 ASSERT(0);
2408 }
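
/*
 * For example: quantize() bucket values are powers of two on either side of
 * zero, so for a positive value such as 5 the second loop above finds the
 * first bucket value that exceeds it (8) and increments the preceding
 * bucket (the one labelled 4) -- that is, the row counting values in the
 * interval [4, 8).  Negative values are handled symmetrically by the first
 * loop, and anything beyond the largest bucket value falls into the top
 * bucket.
 */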
2409
2410 static void
2411 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2412 {
2413 uint64_t arg = *lquanta++;
2414 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2415 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2416 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2417 int32_t val = (int32_t)nval, level;
2418
2419 ASSERT(step != 0);
2420 ASSERT(levels != 0);
2421
2422 if (val < base) {
2423 /*
2424 * This is an underflow.
2425 */
2426 lquanta[0] += incr;
2427 return;
2428 }
2429
2430 level = (val - base) / step;
2431
2432 if (level < levels) {
2433 lquanta[level + 1] += incr;
2434 return;
2435 }
2436
2437 /*
2438 * This is an overflow.
2439 */
2440 lquanta[levels + 1] += incr;
2441 }
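
/*
 * Worked example: with base 0, step 10 and 10 levels, a value of -5 falls
 * below base and lands in the underflow bucket lquanta[0]; a value of 37
 * yields level (37 - 0) / 10 = 3 and lands in lquanta[4]; and a value of
 * 150 yields level 15 >= 10 and lands in the overflow bucket lquanta[11].
 */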
2442
2443 static int
2444 dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2445 uint16_t high, uint16_t nsteps, int64_t value)
2446 {
2447 int64_t this = 1, last, next;
2448 int base = 1, order;
2449
2450 ASSERT(factor <= nsteps);
2451 ASSERT(nsteps % factor == 0);
2452
2453 for (order = 0; order < low; order++)
2454 this *= factor;
2455
2456 /*
2457 * If our value is less than our factor taken to the power of the
2458 * low order of magnitude, it goes into the zeroth bucket.
2459 */
2460 if (value < (last = this))
2461 return (0);
2462
2463 for (this *= factor; order <= high; order++) {
2464 int nbuckets = this > nsteps ? nsteps : this;
2465
2466 if ((next = this * factor) < this) {
2467 /*
2468 * We should not generally get log/linear quantizations
2469 * with a high magnitude that allows 64-bits to
2470 * overflow, but we nonetheless protect against this
2471 * by explicitly checking for overflow, and clamping
2472 * our value accordingly.
2473 */
2474 value = this - 1;
2475 }
2476
2477 if (value < this) {
2478 /*
2479 * If our value lies within this order of magnitude,
2480 * determine its position by taking the offset within
2481 * the order of magnitude, dividing by the bucket
2482 * width, and adding to our (accumulated) base.
2483 */
2484 return (base + (value - last) / (this / nbuckets));
2485 }
2486
2487 base += nbuckets - (nbuckets / factor);
2488 last = this;
2489 this = next;
2490 }
2491
2492 /*
2493 * Our value is greater than or equal to our factor taken to the
2494 * power of one plus the high magnitude -- return the top bucket.
2495 */
2496 return (base);
2497 }
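
/*
 * Worked example: with factor 10, low 1, high 3 and 10 steps per order of
 * magnitude, values below 10^1 land in bucket 0.  A value of 4237 passes
 * over the decades [10, 100) and [100, 1000) -- each contributing
 * nbuckets - nbuckets / factor = 9 buckets -- bringing base to 19; it then
 * falls within [1000, 10000), whose bucket width is 10000 / 10 = 1000, so
 * it lands in bucket 19 + (4237 - 1000) / 1000 = 22.  Values of 10^4 and
 * above land in the top bucket, 28.
 */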
2498
2499 static void
2500 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2501 {
2502 uint64_t arg = *llquanta++;
2503 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2504 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2505 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2506 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2507
2508 llquanta[dtrace_aggregate_llquantize_bucket(factor,
2509 low, high, nsteps, nval)] += incr;
2510 }
2511
2512 /*ARGSUSED*/
2513 static void
2514 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2515 {
2516 data[0]++;
2517 data[1] += nval;
2518 }
2519
2520 /*ARGSUSED*/
2521 static void
2522 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2523 {
2524 int64_t snval = (int64_t)nval;
2525 uint64_t tmp[2];
2526
2527 data[0]++;
2528 data[1] += nval;
2529
2530 /*
2531 * What we want to say here is:
2532 *
2533 * data[2] += nval * nval;
2534 *
2535 * But given that nval is 64-bit, we could easily overflow, so
2536 * we do this as 128-bit arithmetic.
2537 */
2538 if (snval < 0)
2539 snval = -snval;
2540
2541 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2542 dtrace_add_128(data + 2, tmp, data + 2);
2543 }
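
/*
 * Given these accumulators -- data[0] = n, data[1] = sum(x), and
 * data[2..3] = a 128-bit sum(x^2) -- a consumer can recover the standard
 * deviation as sqrt(sum(x^2) / n - (sum(x) / n)^2), the usual "mean of
 * the squares minus the square of the mean" formulation.
 */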
2544
2545 /*ARGSUSED*/
2546 static void
2547 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2548 {
2549 *oval = *oval + 1;
2550 }
2551
2552 /*ARGSUSED*/
2553 static void
2554 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2555 {
2556 *oval += nval;
2557 }
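
/*
 * To recap the accumulator layouts used above: min, max, count and sum
 * keep a single 64-bit value; avg keeps the pair [count, total]; stddev
 * keeps [count, total, 128-bit sum of squares]; and the quantize family
 * keeps an array of bucket counters, preceded (for lquantize and
 * llquantize) by an argument word encoding the bucket parameters.
 */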
2558
2559 /*
2560 * Aggregate given the tuple in the principal data buffer, and the aggregating
2561 * action denoted by the specified dtrace_aggregation_t. The aggregation
2562 * buffer is specified as the buf parameter. This routine does not return
2563 * failure; if there is no space in the aggregation buffer, the data will be
2564 * dropped, and a corresponding counter incremented.
2565 */
2566 static void
2567 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2568 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2569 {
2570 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2571 uint32_t i, ndx, size, fsize;
2572 uint32_t align = sizeof (uint64_t) - 1;
2573 dtrace_aggbuffer_t *agb;
2574 dtrace_aggkey_t *key;
2575 uint32_t hashval = 0, limit, isstr;
2576 caddr_t tomax, data, kdata;
2577 dtrace_actkind_t action;
2578 dtrace_action_t *act;
2579 size_t offs;
2580
2581 if (buf == NULL)
2582 return;
2583
2584 if (!agg->dtag_hasarg) {
2585 /*
2586 * Currently, only quantize() and lquantize() take additional
2587 * arguments, and they have the same semantics: an increment
2588 * value that defaults to 1 when not present. If additional
2589 * aggregating actions take arguments, the setting of the
2590 * default argument value will presumably have to become more
2591 * sophisticated...
2592 */
2593 arg = 1;
2594 }
2595
2596 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2597 size = rec->dtrd_offset - agg->dtag_base;
2598 fsize = size + rec->dtrd_size;
2599
2600 ASSERT(dbuf->dtb_tomax != NULL);
2601 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2602
2603 if ((tomax = buf->dtb_tomax) == NULL) {
2604 dtrace_buffer_drop(buf);
2605 return;
2606 }
2607
2608 /*
2609 * The metastructure is always at the bottom of the buffer.
2610 */
2611 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2612 sizeof (dtrace_aggbuffer_t));
2613
2614 if (buf->dtb_offset == 0) {
2615 /*
2616 * We just kludge up approximately 1/8th of the size to be
2617 * buckets. If this guess ends up being routinely
2618 * off-the-mark, we may need to dynamically readjust this
2619 * based on past performance.
2620 */
2621 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2622
2623 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2624 (uintptr_t)tomax || hashsize == 0) {
2625 /*
2626 * We've been given a ludicrously small buffer;
2627 * increment our drop count and leave.
2628 */
2629 dtrace_buffer_drop(buf);
2630 return;
2631 }
2632
2633 /*
2634 * And now, a pathetic attempt to try to get an odd (or
2635 * perchance, a prime) hash size for better hash distribution.
2636 */
2637 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2638 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2639
2640 agb->dtagb_hashsize = hashsize;
2641 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2642 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2643 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2644
2645 for (i = 0; i < agb->dtagb_hashsize; i++)
2646 agb->dtagb_hash[i] = NULL;
2647 }
2648
2649 ASSERT(agg->dtag_first != NULL);
2650 ASSERT(agg->dtag_first->dta_intuple);
2651
2652 /*
2653 * Calculate the hash value based on the key. Note that we _don't_
2654 * include the aggid in the hashing (but we will store it as part of
2655 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2656 * algorithm: a simple, quick algorithm that has no known funnels, and
2657 * gets good distribution in practice. The efficacy of the hashing
2658 * algorithm (and a comparison with other algorithms) may be found by
2659 * running the ::dtrace_aggstat MDB dcmd.
2660 */
2661 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2662 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2663 limit = i + act->dta_rec.dtrd_size;
2664 ASSERT(limit <= size);
2665 isstr = DTRACEACT_ISSTRING(act);
2666
2667 for (; i < limit; i++) {
2668 hashval += data[i];
2669 hashval += (hashval << 10);
2670 hashval ^= (hashval >> 6);
2671
2672 if (isstr && data[i] == '\0')
2673 break;
2674 }
2675 }
2676
2677 hashval += (hashval << 3);
2678 hashval ^= (hashval >> 11);
2679 hashval += (hashval << 15);
2680
2681 /*
2682 * Yes, the divide here is expensive -- but it's generally the least
2683 * of the performance issues given the amount of data that we iterate
2684 * over to compute hash values, compare data, etc.
2685 */
2686 ndx = hashval % agb->dtagb_hashsize;
2687
2688 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2689 ASSERT((caddr_t)key >= tomax);
2690 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2691
2692 if (hashval != key->dtak_hashval || key->dtak_size != size)
2693 continue;
2694
2695 kdata = key->dtak_data;
2696 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2697
2698 for (act = agg->dtag_first; act->dta_intuple;
2699 act = act->dta_next) {
2700 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2701 limit = i + act->dta_rec.dtrd_size;
2702 ASSERT(limit <= size);
2703 isstr = DTRACEACT_ISSTRING(act);
2704
2705 for (; i < limit; i++) {
2706 if (kdata[i] != data[i])
2707 goto next;
2708
2709 if (isstr && data[i] == '\0')
2710 break;
2711 }
2712 }
2713
2714 if (action != key->dtak_action) {
2715 /*
2716 * We are aggregating on the same value in the same
2717 * aggregation with two different aggregating actions.
2718 * (This should have been picked up in the compiler,
2719 * so we may be dealing with errant or devious DIF.)
2720 * This is an error condition; we indicate as much,
2721 * and return.
2722 */
2723 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2724 return;
2725 }
2726
2727 /*
2728 * This is a hit: we need to apply the aggregator to
2729 * the value at this key.
2730 */
2731 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2732 return;
2733 next:
2734 continue;
2735 }
2736
2737 /*
2738 * We didn't find it. We need to allocate some zero-filled space,
2739 * link it into the hash table appropriately, and apply the aggregator
2740 * to the (zero-filled) value.
2741 */
2742 offs = buf->dtb_offset;
2743 while (offs & (align - 1))
2744 offs += sizeof (uint32_t);
2745
2746 /*
2747 * If we don't have enough room to both allocate a new key _and_
2748 * its associated data, increment the drop count and return.
2749 */
2750 if ((uintptr_t)tomax + offs + fsize >
2751 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2752 dtrace_buffer_drop(buf);
2753 return;
2754 }
2755
2756 /*CONSTCOND*/
2757 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2758 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2759 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2760
2761 key->dtak_data = kdata = tomax + offs;
2762 buf->dtb_offset = offs + fsize;
2763
2764 /*
2765 * Now copy the data across.
2766 */
2767 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2768
2769 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2770 kdata[i] = data[i];
2771
2772 /*
2773 * Because strings are not zeroed out by default, we need to iterate
2774 * looking for actions that store strings, and we need to explicitly
2775 * pad these strings out with zeroes.
2776 */
2777 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2778 int nul;
2779
2780 if (!DTRACEACT_ISSTRING(act))
2781 continue;
2782
2783 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2784 limit = i + act->dta_rec.dtrd_size;
2785 ASSERT(limit <= size);
2786
2787 for (nul = 0; i < limit; i++) {
2788 if (nul) {
2789 kdata[i] = '\0';
2790 continue;
2791 }
2792
2793 if (data[i] != '\0')
2794 continue;
2795
2796 nul = 1;
2797 }
2798 }
2799
2800 for (i = size; i < fsize; i++)
2801 kdata[i] = 0;
2802
2803 key->dtak_hashval = hashval;
2804 key->dtak_size = size;
2805 key->dtak_action = action;
2806 key->dtak_next = agb->dtagb_hash[ndx];
2807 agb->dtagb_hash[ndx] = key;
2808
2809 /*
2810 * Finally, apply the aggregator.
2811 */
2812 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2813 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2814 }
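
/*
 * Illustrative layout of an aggregation buffer (addresses increase to the
 * right): variable-length key data grows up from dtb_tomax as dtb_offset
 * advances, while fixed-size dtrace_aggkey_t structures are carved
 * downward from dtagb_free, which begins just below the hash chains and
 * the dtrace_aggbuffer_t metastructure at the buffer's far end:
 *
 *	dtb_tomax                     dtagb_free
 *	   |                               |
 *	   v                               v
 *	   +----------------+- - - - - - -+----------+--------+--------+
 *	   | key data and   |    free     | aggkey   |  hash  | agg-   |
 *	   | aggregated     |             | structs  | chains | buffer |
 *	   | values         |             |  (<--)   |        |        |
 *	   +----------------+- - - - - - -+----------+--------+--------+
 */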
2815
2816 /*
2817 * Given consumer state, this routine finds a speculation in the INACTIVE
2818 * state and transitions it into the ACTIVE state. If there is no speculation
2819 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2820 * incremented -- it is up to the caller to take appropriate action.
2821 */
2822 static int
2823 dtrace_speculation(dtrace_state_t *state)
2824 {
2825 int i = 0;
2826 dtrace_speculation_state_t curstate;
2827 uint32_t *stat = &state->dts_speculations_unavail, count;
2828
2829 while (i < state->dts_nspeculations) {
2830 dtrace_speculation_t *spec = &state->dts_speculations[i];
2831
2832 curstate = spec->dtsp_state;
2833
2834 if (curstate != DTRACESPEC_INACTIVE) {
2835 if (curstate == DTRACESPEC_COMMITTINGMANY ||
2836 curstate == DTRACESPEC_COMMITTING ||
2837 curstate == DTRACESPEC_DISCARDING)
2838 stat = &state->dts_speculations_busy;
2839 i++;
2840 continue;
2841 }
2842
2843 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2844 curstate, DTRACESPEC_ACTIVE) == curstate)
2845 return (i + 1);
2846 }
2847
2848 /*
2849 * We couldn't find a speculation. If we found as much as a single
2850 * busy speculation buffer, we'll attribute this failure as "busy"
2851 * instead of "unavail".
2852 */
2853 do {
2854 count = *stat;
2855 } while (dtrace_cas32(stat, count, count + 1) != count);
2856
2857 return (0);
2858 }
2859
2860 /*
2861 * This routine commits an active speculation. If the specified speculation
2862 * is not in a valid state to perform a commit(), this routine will silently do
2863 * nothing. The state of the specified speculation is transitioned according
2864 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2865 */
2866 static void
2867 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2868 dtrace_specid_t which)
2869 {
2870 dtrace_speculation_t *spec;
2871 dtrace_buffer_t *src, *dest;
2872 uintptr_t daddr, saddr, dlimit, slimit;
2873 dtrace_speculation_state_t curstate, new = 0;
2874 ssize_t offs;
2875 uint64_t timestamp;
2876
2877 if (which == 0)
2878 return;
2879
2880 if (which > state->dts_nspeculations) {
2881 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2882 return;
2883 }
2884
2885 spec = &state->dts_speculations[which - 1];
2886 src = &spec->dtsp_buffer[cpu];
2887 dest = &state->dts_buffer[cpu];
2888
2889 do {
2890 curstate = spec->dtsp_state;
2891
2892 if (curstate == DTRACESPEC_COMMITTINGMANY)
2893 break;
2894
2895 switch (curstate) {
2896 case DTRACESPEC_INACTIVE:
2897 case DTRACESPEC_DISCARDING:
2898 return;
2899
2900 case DTRACESPEC_COMMITTING:
2901 /*
2902 * This is only possible if we are (a) commit()'ing
2903 * without having done a prior speculate() on this CPU
2904 * and (b) racing with another commit() on a different
2905 * CPU. There's nothing to do -- we just assert that
2906 * our offset is 0.
2907 */
2908 ASSERT(src->dtb_offset == 0);
2909 return;
2910
2911 case DTRACESPEC_ACTIVE:
2912 new = DTRACESPEC_COMMITTING;
2913 break;
2914
2915 case DTRACESPEC_ACTIVEONE:
2916 /*
2917 * This speculation is active on one CPU. If our
2918 * buffer offset is non-zero, we know that the one CPU
2919 * must be us. Otherwise, we are committing on a
2920 * different CPU from the speculate(), and we must
2921 * rely on being asynchronously cleaned.
2922 */
2923 if (src->dtb_offset != 0) {
2924 new = DTRACESPEC_COMMITTING;
2925 break;
2926 }
2927 /*FALLTHROUGH*/
2928
2929 case DTRACESPEC_ACTIVEMANY:
2930 new = DTRACESPEC_COMMITTINGMANY;
2931 break;
2932
2933 default:
2934 ASSERT(0);
2935 }
2936 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2937 curstate, new) != curstate);
2938
2939 /*
2940 * We have set the state to indicate that we are committing this
2941 * speculation. Now reserve the necessary space in the destination
2942 * buffer.
2943 */
2944 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2945 sizeof (uint64_t), state, NULL)) < 0) {
2946 dtrace_buffer_drop(dest);
2947 goto out;
2948 }
2949
2950 /*
2951 * We have sufficient space to copy the speculative buffer into the
2952 * primary buffer. First, modify the speculative buffer, filling
2953 * in the timestamp of all entries with the current time. The data
2954 * must have the commit() time rather than the time it was traced,
2955 * so that all entries in the primary buffer are in timestamp order.
2956 */
2957 timestamp = dtrace_gethrtime();
2958 saddr = (uintptr_t)src->dtb_tomax;
2959 slimit = saddr + src->dtb_offset;
2960 while (saddr < slimit) {
2961 size_t size;
2962 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2963
2964 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2965 saddr += sizeof (dtrace_epid_t);
2966 continue;
2967 }
2968 ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2969 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2970
2971 ASSERT3U(saddr + size, <=, slimit);
2972 ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2973 ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2974
2975 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2976
2977 saddr += size;
2978 }
2979
2980 /*
2981 * Copy the buffer across. (Note that this is a
2982 * highly suboptimal bcopy(); in the unlikely event that this becomes
2983 * a serious performance issue, a high-performance DTrace-specific
2984 * bcopy() should obviously be invented.)
2985 */
2986 daddr = (uintptr_t)dest->dtb_tomax + offs;
2987 dlimit = daddr + src->dtb_offset;
2988 saddr = (uintptr_t)src->dtb_tomax;
2989
2990 /*
2991 * First, the aligned portion.
2992 */
2993 while (dlimit - daddr >= sizeof (uint64_t)) {
2994 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2995
2996 daddr += sizeof (uint64_t);
2997 saddr += sizeof (uint64_t);
2998 }
2999
3000 /*
3001 * Now any left-over bit...
3002 */
3003 while (dlimit - daddr)
3004 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
3005
3006 /*
3007 * Finally, commit the reserved space in the destination buffer.
3008 */
3009 dest->dtb_offset = offs + src->dtb_offset;
3010
3011 out:
3012 /*
3013 * If we're lucky enough to be the only active CPU on this speculation
3014 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
3015 */
3016 if (curstate == DTRACESPEC_ACTIVE ||
3017 (curstate == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
3018 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
3019 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
3020
3021 ASSERT(rval == DTRACESPEC_COMMITTING);
3022 }
3023
3024 src->dtb_offset = 0;
3025 src->dtb_xamot_drops += src->dtb_drops;
3026 src->dtb_drops = 0;
3027 }
3028
3029 /*
3030 * This routine discards an active speculation. If the specified speculation
3031 * is not in a valid state to perform a discard(), this routine will silently
3032 * do nothing. The state of the specified speculation is transitioned
3033 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
3034 */
3035 static void
3036 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3037 dtrace_specid_t which)
3038 {
3039 dtrace_speculation_t *spec;
3040 dtrace_speculation_state_t curstate, new = 0;
3041 dtrace_buffer_t *buf;
3042
3043 if (which == 0)
3044 return;
3045
3046 if (which > state->dts_nspeculations) {
3047 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3048 return;
3049 }
3050
3051 spec = &state->dts_speculations[which - 1];
3052 buf = &spec->dtsp_buffer[cpu];
3053
3054 do {
3055 curstate = spec->dtsp_state;
3056
3057 switch (curstate) {
3058 case DTRACESPEC_INACTIVE:
3059 case DTRACESPEC_COMMITTINGMANY:
3060 case DTRACESPEC_COMMITTING:
3061 case DTRACESPEC_DISCARDING:
3062 return;
3063
3064 case DTRACESPEC_ACTIVE:
3065 case DTRACESPEC_ACTIVEMANY:
3066 new = DTRACESPEC_DISCARDING;
3067 break;
3068
3069 case DTRACESPEC_ACTIVEONE:
3070 if (buf->dtb_offset != 0) {
3071 new = DTRACESPEC_INACTIVE;
3072 } else {
3073 new = DTRACESPEC_DISCARDING;
3074 }
3075 break;
3076
3077 default:
3078 ASSERT(0);
3079 }
3080 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3081 curstate, new) != curstate);
3082
3083 buf->dtb_offset = 0;
3084 buf->dtb_drops = 0;
3085 }
3086
3087 /*
3088 * Note: not called from probe context. This function is called
3089 * asynchronously from cross call context to clean any speculations that are
3090 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3091 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3092 * speculation.
3093 */
3094 static void
3095 dtrace_speculation_clean_here(dtrace_state_t *state)
3096 {
3097 dtrace_icookie_t cookie;
3098 processorid_t cpu = curcpu;
3099 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3100 dtrace_specid_t i;
3101
3102 cookie = dtrace_interrupt_disable();
3103
3104 if (dest->dtb_tomax == NULL) {
3105 dtrace_interrupt_enable(cookie);
3106 return;
3107 }
3108
3109 for (i = 0; i < state->dts_nspeculations; i++) {
3110 dtrace_speculation_t *spec = &state->dts_speculations[i];
3111 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3112
3113 if (src->dtb_tomax == NULL)
3114 continue;
3115
3116 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3117 src->dtb_offset = 0;
3118 continue;
3119 }
3120
3121 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3122 continue;
3123
3124 if (src->dtb_offset == 0)
3125 continue;
3126
3127 dtrace_speculation_commit(state, cpu, i + 1);
3128 }
3129
3130 dtrace_interrupt_enable(cookie);
3131 }
3132
3133 /*
3134 * Note: not called from probe context. This function is called
3135 * asynchronously (and at a regular interval) to clean any speculations that
3136 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3137 * is work to be done, it cross calls all CPUs to perform that work;
3138 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
3139 * INACTIVE state until they have been cleaned by all CPUs.
3140 */
3141 static void
3142 dtrace_speculation_clean(dtrace_state_t *state)
3143 {
3144 int work = 0, rv;
3145 dtrace_specid_t i;
3146
3147 for (i = 0; i < state->dts_nspeculations; i++) {
3148 dtrace_speculation_t *spec = &state->dts_speculations[i];
3149
3150 ASSERT(!spec->dtsp_cleaning);
3151
3152 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3153 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3154 continue;
3155
3156 work++;
3157 spec->dtsp_cleaning = 1;
3158 }
3159
3160 if (!work)
3161 return;
3162
3163 dtrace_xcall(DTRACE_CPUALL,
3164 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3165
3166 /*
3167 * We now know that all CPUs have committed or discarded their
3168 * speculation buffers, as appropriate. We can now set the state
3169 * to inactive.
3170 */
3171 for (i = 0; i < state->dts_nspeculations; i++) {
3172 dtrace_speculation_t *spec = &state->dts_speculations[i];
3173 dtrace_speculation_state_t curstate, new;
3174
3175 if (!spec->dtsp_cleaning)
3176 continue;
3177
3178 curstate = spec->dtsp_state;
3179 ASSERT(curstate == DTRACESPEC_DISCARDING ||
3180 curstate == DTRACESPEC_COMMITTINGMANY);
3181
3182 new = DTRACESPEC_INACTIVE;
3183
3184 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, curstate, new);
3185 ASSERT(rv == curstate);
3186 spec->dtsp_cleaning = 0;
3187 }
3188 }
3189
3190 /*
3191 * Called as part of a speculate() to get the speculative buffer associated
3192 * with a given speculation. Returns NULL if the specified speculation is not
3193 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3194 * the active CPU is not the specified CPU -- the speculation will be
3195 * atomically transitioned into the ACTIVEMANY state.
3196 */
3197 static dtrace_buffer_t *
3198 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3199 dtrace_specid_t which)
3200 {
3201 dtrace_speculation_t *spec;
3202 dtrace_speculation_state_t curstate, new = 0;
3203 dtrace_buffer_t *buf;
3204
3205 if (which == 0)
3206 return (NULL);
3207
3208 if (which > state->dts_nspeculations) {
3209 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3210 return (NULL);
3211 }
3212
3213 spec = &state->dts_speculations[which - 1];
3214 buf = &spec->dtsp_buffer[cpuid];
3215
3216 do {
3217 curstate = spec->dtsp_state;
3218
3219 switch (curstate) {
3220 case DTRACESPEC_INACTIVE:
3221 case DTRACESPEC_COMMITTINGMANY:
3222 case DTRACESPEC_DISCARDING:
3223 return (NULL);
3224
3225 case DTRACESPEC_COMMITTING:
3226 ASSERT(buf->dtb_offset == 0);
3227 return (NULL);
3228
3229 case DTRACESPEC_ACTIVEONE:
3230 /*
3231 * This speculation is currently active on one CPU.
3232 * Check the offset in the buffer; if it's non-zero,
3233 * that CPU must be us (and we leave the state alone).
3234 * If it's zero, assume that we're starting on a new
3235 * CPU -- and change the state to indicate that the
3236 * speculation is active on more than one CPU.
3237 */
3238 if (buf->dtb_offset != 0)
3239 return (buf);
3240
3241 new = DTRACESPEC_ACTIVEMANY;
3242 break;
3243
3244 case DTRACESPEC_ACTIVEMANY:
3245 return (buf);
3246
3247 case DTRACESPEC_ACTIVE:
3248 new = DTRACESPEC_ACTIVEONE;
3249 break;
3250
3251 default:
3252 ASSERT(0);
3253 }
3254 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3255 curstate, new) != curstate);
3256
3257 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3258 return (buf);
3259 }
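
/*
 * Pieced together from the routines above (the full diagram lives in
 * <sys/dtrace_impl.h>), the speculation states move, in abridged form:
 *
 *	INACTIVE -> ACTIVE			dtrace_speculation()
 *	ACTIVE -> ACTIVEONE			first speculative buffer use
 *	ACTIVEONE -> ACTIVEMANY			buffer used on a second CPU
 *	ACTIVE/ACTIVEONE -> COMMITTING		commit() on the active CPU
 *	ACTIVEONE/ACTIVEMANY -> COMMITTINGMANY	commit() on another CPU
 *	ACTIVE/ACTIVEMANY -> DISCARDING		discard()
 *	ACTIVEONE -> INACTIVE or DISCARDING	discard(), depending on CPU
 *	COMMITTING -> INACTIVE			single-CPU commit completes
 *	COMMITTINGMANY/DISCARDING -> INACTIVE	after all CPUs have cleaned
 */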
3260
3261 /*
3262 * Return a string. In the event that the user lacks the privilege to access
3263 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3264 * don't fail access checking.
3265 *
3266 * dtrace_dif_variable() uses this routine as a helper for various
3267 * builtin values such as 'execname' and 'probefunc.'
3268 */
3269 uintptr_t
3270 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3271 dtrace_mstate_t *mstate)
3272 {
3273 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3274 uintptr_t ret;
3275 size_t strsz;
3276
3277 /*
3278 * The easy case: this probe is allowed to read all of memory, so
3279 * we can just return this as a vanilla pointer.
3280 */
3281 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3282 return (addr);
3283
3284 /*
3285 * This is the tougher case: we copy the string in question from
3286 * kernel memory into scratch memory and return it that way: this
3287 * ensures that we won't trip up when access checking tests the
3288 * BYREF return value.
3289 */
3290 strsz = dtrace_strlen((char *)addr, size) + 1;
3291
3292 if (mstate->dtms_scratch_ptr + strsz >
3293 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3294 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3295 return (0);
3296 }
3297
3298 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3299 strsz);
3300 ret = mstate->dtms_scratch_ptr;
3301 mstate->dtms_scratch_ptr += strsz;
3302 return (ret);
3303 }
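
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * scratch discipline used above -- and again in dtrace_dif_varstrz()
 * below -- is a simple bump allocator over the per-probe scratch region;
 * exhaustion raises the NOSCRATCH fault and yields 0:
 */
static uintptr_t
dtrace_example_scratch_alloc(dtrace_mstate_t *mstate, size_t size)
{
	uintptr_t ptr = mstate->dtms_scratch_ptr;

	if (ptr + size >
	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
		return (0);
	}

	/*
	 * Bump the scratch pointer past this allocation and hand the old
	 * pointer back to the caller.
	 */
	mstate->dtms_scratch_ptr = ptr + size;
	return (ptr);
}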
3304
3305 /*
3306  * Return a string from a memory address which is known to have one or
3307  * more concatenated, individually zero-terminated, sub-strings.
3308 * In the event that the user lacks the privilege to access
3309 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3310 * don't fail access checking.
3311 *
3312 * dtrace_dif_variable() uses this routine as a helper for various
3313 * builtin values such as 'execargs'.
3314 */
3315 static uintptr_t
3316 dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3317 dtrace_mstate_t *mstate)
3318 {
3319 char *p;
3320 size_t i;
3321 uintptr_t ret;
3322
3323 if (mstate->dtms_scratch_ptr + strsz >
3324 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3325 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3326 return (0);
3327 }
3328
3329 dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3330 strsz);
3331
3332 /* Replace sub-string termination characters with a space. */
3333 for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3334 p++, i++)
3335 if (*p == '\0')
3336 *p = ' ';
3337
3338 ret = mstate->dtms_scratch_ptr;
3339 mstate->dtms_scratch_ptr += strsz;
3340 return (ret);
3341 }
3342
3343 /*
3344 * This function implements the DIF emulator's variable lookups. The emulator
3345 * passes a reserved variable identifier and optional built-in array index.
3346 */
3347 static uint64_t
3348 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3349 uint64_t ndx)
3350 {
3351 /*
3352 * If we're accessing one of the uncached arguments, we'll turn this
3353 * into a reference in the args array.
3354 */
3355 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3356 ndx = v - DIF_VAR_ARG0;
3357 v = DIF_VAR_ARGS;
3358 }
3359
3360 switch (v) {
3361 case DIF_VAR_ARGS:
3362 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3363 if (ndx >= sizeof (mstate->dtms_arg) /
3364 sizeof (mstate->dtms_arg[0])) {
3365 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3366 dtrace_provider_t *pv;
3367 uint64_t val;
3368
3369 pv = mstate->dtms_probe->dtpr_provider;
3370 if (pv->dtpv_pops.dtps_getargval != NULL)
3371 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3372 mstate->dtms_probe->dtpr_id,
3373 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3374 else
3375 val = dtrace_getarg(ndx, aframes);
3376
3377 /*
3378 * This is regrettably required to keep the compiler
3379 * from tail-optimizing the call to dtrace_getarg().
3380 * The condition always evaluates to true, but the
3381 * compiler has no way of figuring that out a priori.
3382 * (None of this would be necessary if the compiler
3383 * could be relied upon to _always_ tail-optimize
3384 * the call to dtrace_getarg() -- but it can't.)
3385 */
3386 if (mstate->dtms_probe != NULL)
3387 return (val);
3388
3389 ASSERT(0);
3390 }
3391
3392 return (mstate->dtms_arg[ndx]);
3393
3394 case DIF_VAR_REGS:
3395 case DIF_VAR_UREGS: {
3396 struct trapframe *tframe;
3397
3398 if (!dtrace_priv_proc(state))
3399 return (0);
3400
3401 if (v == DIF_VAR_REGS)
3402 tframe = curthread->t_dtrace_trapframe;
3403 else
3404 tframe = curthread->td_frame;
3405
3406 if (tframe == NULL) {
3407 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3408 cpu_core[curcpu].cpuc_dtrace_illval = 0;
3409 return (0);
3410 }
3411
3412 return (dtrace_getreg(tframe, ndx));
3413 }
3414
3415 case DIF_VAR_CURTHREAD:
3416 if (!dtrace_priv_proc(state))
3417 return (0);
3418 return ((uint64_t)(uintptr_t)curthread);
3419
3420 case DIF_VAR_TIMESTAMP:
3421 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3422 mstate->dtms_timestamp = dtrace_gethrtime();
3423 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3424 }
3425 return (mstate->dtms_timestamp);
3426
3427 case DIF_VAR_VTIMESTAMP:
3428 ASSERT(dtrace_vtime_references != 0);
3429 return (curthread->t_dtrace_vtime);
3430
3431 case DIF_VAR_WALLTIMESTAMP:
3432 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3433 mstate->dtms_walltimestamp = dtrace_gethrestime();
3434 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3435 }
3436 return (mstate->dtms_walltimestamp);
3437
3438 #ifdef illumos
3439 case DIF_VAR_IPL:
3440 if (!dtrace_priv_kernel(state))
3441 return (0);
3442 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3443 mstate->dtms_ipl = dtrace_getipl();
3444 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3445 }
3446 return (mstate->dtms_ipl);
3447 #endif
3448
3449 case DIF_VAR_EPID:
3450 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3451 return (mstate->dtms_epid);
3452
3453 case DIF_VAR_ID:
3454 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3455 return (mstate->dtms_probe->dtpr_id);
3456
3457 case DIF_VAR_STACKDEPTH:
3458 if (!dtrace_priv_kernel(state))
3459 return (0);
3460 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3461 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3462
3463 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3464 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3465 }
3466 return (mstate->dtms_stackdepth);
3467
3468 case DIF_VAR_USTACKDEPTH:
3469 if (!dtrace_priv_proc(state))
3470 return (0);
3471 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3472 /*
3473 * See comment in DIF_VAR_PID.
3474 */
3475 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3476 CPU_ON_INTR(CPU)) {
3477 mstate->dtms_ustackdepth = 0;
3478 } else {
3479 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3480 mstate->dtms_ustackdepth =
3481 dtrace_getustackdepth();
3482 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3483 }
3484 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3485 }
3486 return (mstate->dtms_ustackdepth);
3487
3488 case DIF_VAR_CALLER:
3489 if (!dtrace_priv_kernel(state))
3490 return (0);
3491 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3492 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3493
3494 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3495 /*
3496 * If this is an unanchored probe, we are
3497 * required to go through the slow path:
3498 * dtrace_caller() only guarantees correct
3499 * results for anchored probes.
3500 */
3501 pc_t caller[2] = {0, 0};
3502
3503 dtrace_getpcstack(caller, 2, aframes,
3504 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3505 mstate->dtms_caller = caller[1];
3506 } else if ((mstate->dtms_caller =
3507 dtrace_caller(aframes)) == -1) {
3508 /*
3509 * We have failed to do this the quick way;
3510 * we must resort to the slower approach of
3511 * calling dtrace_getpcstack().
3512 */
3513 pc_t caller = 0;
3514
3515 dtrace_getpcstack(&caller, 1, aframes, NULL);
3516 mstate->dtms_caller = caller;
3517 }
3518
3519 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3520 }
3521 return (mstate->dtms_caller);
3522
3523 case DIF_VAR_UCALLER:
3524 if (!dtrace_priv_proc(state))
3525 return (0);
3526
3527 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3528 uint64_t ustack[3];
3529
3530 /*
3531 * dtrace_getupcstack() fills in the first uint64_t
3532 * with the current PID. The second uint64_t will
3533 * be the program counter at user-level. The third
3534 * uint64_t will contain the caller, which is what
3535 * we're after.
3536 */
3537 ustack[2] = 0;
3538 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3539 dtrace_getupcstack(ustack, 3);
3540 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3541 mstate->dtms_ucaller = ustack[2];
3542 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3543 }
3544
3545 return (mstate->dtms_ucaller);
3546
3547 case DIF_VAR_PROBEPROV:
3548 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3549 return (dtrace_dif_varstr(
3550 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3551 state, mstate));
3552
3553 case DIF_VAR_PROBEMOD:
3554 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3555 return (dtrace_dif_varstr(
3556 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3557 state, mstate));
3558
3559 case DIF_VAR_PROBEFUNC:
3560 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3561 return (dtrace_dif_varstr(
3562 (uintptr_t)mstate->dtms_probe->dtpr_func,
3563 state, mstate));
3564
3565 case DIF_VAR_PROBENAME:
3566 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3567 return (dtrace_dif_varstr(
3568 (uintptr_t)mstate->dtms_probe->dtpr_name,
3569 state, mstate));
3570
3571 case DIF_VAR_PID:
3572 if (!dtrace_priv_proc(state))
3573 return (0);
3574
3575 #ifdef illumos
3576 /*
3577 * Note that we are assuming that an unanchored probe is
3578 * always due to a high-level interrupt. (And we're assuming
3579 	 * that there is only a single high-level interrupt.)
3580 */
3581 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3582 return (pid0.pid_id);
3583
3584 /*
3585 * It is always safe to dereference one's own t_procp pointer:
3586 * it always points to a valid, allocated proc structure.
3587 * Further, it is always safe to dereference the p_pidp member
3588 	 * of one's own proc structure. (These are truisms because
3589 * threads and processes don't clean up their own state --
3590 * they leave that task to whomever reaps them.)
3591 */
3592 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3593 #else
3594 return ((uint64_t)curproc->p_pid);
3595 #endif
3596
3597 case DIF_VAR_PPID:
3598 if (!dtrace_priv_proc(state))
3599 return (0);
3600
3601 #ifdef illumos
3602 /*
3603 * See comment in DIF_VAR_PID.
3604 */
3605 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3606 return (pid0.pid_id);
3607
3608 /*
3609 * It is always safe to dereference one's own t_procp pointer:
3610 * it always points to a valid, allocated proc structure.
3611 * (This is true because threads don't clean up their own
3612 * state -- they leave that task to whomever reaps them.)
3613 */
3614 return ((uint64_t)curthread->t_procp->p_ppid);
3615 #else
3616 if (curproc->p_pid == proc0.p_pid)
3617 return (curproc->p_pid);
3618 else
3619 return (curproc->p_pptr->p_pid);
3620 #endif
3621
3622 case DIF_VAR_TID:
3623 #ifdef illumos
3624 /*
3625 * See comment in DIF_VAR_PID.
3626 */
3627 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3628 return (0);
3629 #endif
3630
3631 return ((uint64_t)curthread->t_tid);
3632
3633 case DIF_VAR_EXECARGS: {
3634 struct pargs *p_args = curthread->td_proc->p_args;
3635
3636 if (p_args == NULL)
3637 			return (0);
3638
3639 return (dtrace_dif_varstrz(
3640 (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3641 }
3642
3643 case DIF_VAR_EXECNAME:
3644 #ifdef illumos
3645 if (!dtrace_priv_proc(state))
3646 return (0);
3647
3648 /*
3649 * See comment in DIF_VAR_PID.
3650 */
3651 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3652 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3653
3654 /*
3655 * It is always safe to dereference one's own t_procp pointer:
3656 * it always points to a valid, allocated proc structure.
3657 * (This is true because threads don't clean up their own
3658 * state -- they leave that task to whomever reaps them.)
3659 */
3660 return (dtrace_dif_varstr(
3661 (uintptr_t)curthread->t_procp->p_user.u_comm,
3662 state, mstate));
3663 #else
3664 return (dtrace_dif_varstr(
3665 (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3666 #endif
3667
3668 case DIF_VAR_ZONENAME:
3669 #ifdef illumos
3670 if (!dtrace_priv_proc(state))
3671 return (0);
3672
3673 /*
3674 * See comment in DIF_VAR_PID.
3675 */
3676 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3677 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3678
3679 /*
3680 * It is always safe to dereference one's own t_procp pointer:
3681 * it always points to a valid, allocated proc structure.
3682 * (This is true because threads don't clean up their own
3683 * state -- they leave that task to whomever reaps them.)
3684 */
3685 return (dtrace_dif_varstr(
3686 (uintptr_t)curthread->t_procp->p_zone->zone_name,
3687 state, mstate));
3688 #elif defined(__FreeBSD__)
3689 /*
3690 	 * On FreeBSD, we provide zonename compatibility by falling through
3691 	 * into the jailname case.
3692 */
3693 case DIF_VAR_JAILNAME:
3694 if (!dtrace_priv_kernel(state))
3695 return (0);
3696
3697 return (dtrace_dif_varstr(
3698 (uintptr_t)curthread->td_ucred->cr_prison->pr_name,
3699 state, mstate));
3700
3701 case DIF_VAR_JID:
3702 if (!dtrace_priv_kernel(state))
3703 return (0);
3704
3705 return ((uint64_t)curthread->td_ucred->cr_prison->pr_id);
3706 #else
3707 return (0);
3708 #endif
3709
3710 case DIF_VAR_UID:
3711 if (!dtrace_priv_proc(state))
3712 return (0);
3713
3714 #ifdef illumos
3715 /*
3716 * See comment in DIF_VAR_PID.
3717 */
3718 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3719 return ((uint64_t)p0.p_cred->cr_uid);
3720
3721 /*
3722 * It is always safe to dereference one's own t_procp pointer:
3723 * it always points to a valid, allocated proc structure.
3724 * (This is true because threads don't clean up their own
3725 * state -- they leave that task to whomever reaps them.)
3726 *
3727 * Additionally, it is safe to dereference one's own process
3728 * credential, since this is never NULL after process birth.
3729 */
3730 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3731 #else
3732 return ((uint64_t)curthread->td_ucred->cr_uid);
3733 #endif
3734
3735 case DIF_VAR_GID:
3736 if (!dtrace_priv_proc(state))
3737 return (0);
3738
3739 #ifdef illumos
3740 /*
3741 * See comment in DIF_VAR_PID.
3742 */
3743 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3744 return ((uint64_t)p0.p_cred->cr_gid);
3745
3746 /*
3747 * It is always safe to dereference one's own t_procp pointer:
3748 * it always points to a valid, allocated proc structure.
3749 * (This is true because threads don't clean up their own
3750 * state -- they leave that task to whomever reaps them.)
3751 *
3752 * Additionally, it is safe to dereference one's own process
3753 * credential, since this is never NULL after process birth.
3754 */
3755 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3756 #else
3757 return ((uint64_t)curthread->td_ucred->cr_gid);
3758 #endif
3759
3760 case DIF_VAR_ERRNO: {
3761 #ifdef illumos
3762 klwp_t *lwp;
3763 if (!dtrace_priv_proc(state))
3764 return (0);
3765
3766 /*
3767 * See comment in DIF_VAR_PID.
3768 */
3769 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3770 return (0);
3771
3772 /*
3773 * It is always safe to dereference one's own t_lwp pointer in
3774 * the event that this pointer is non-NULL. (This is true
3775 * because threads and lwps don't clean up their own state --
3776 * they leave that task to whomever reaps them.)
3777 */
3778 if ((lwp = curthread->t_lwp) == NULL)
3779 return (0);
3780
3781 return ((uint64_t)lwp->lwp_errno);
3782 #else
3783 return (curthread->td_errno);
3784 #endif
3785 }
3786 #ifndef illumos
3787 case DIF_VAR_CPU: {
3788 return curcpu;
3789 }
3790 #endif
3791 default:
3792 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3793 return (0);
3794 }
3795 }
3796
3797
3798 typedef enum dtrace_json_state {
3799 DTRACE_JSON_REST = 1,
3800 DTRACE_JSON_OBJECT,
3801 DTRACE_JSON_STRING,
3802 DTRACE_JSON_STRING_ESCAPE,
3803 DTRACE_JSON_STRING_ESCAPE_UNICODE,
3804 DTRACE_JSON_COLON,
3805 DTRACE_JSON_COMMA,
3806 DTRACE_JSON_VALUE,
3807 DTRACE_JSON_IDENTIFIER,
3808 DTRACE_JSON_NUMBER,
3809 DTRACE_JSON_NUMBER_FRAC,
3810 DTRACE_JSON_NUMBER_EXP,
3811 DTRACE_JSON_COLLECT_OBJECT
3812 } dtrace_json_state_t;
3813
3814 /*
3815 * This function possesses just enough knowledge about JSON to extract a single
3816 * value from a JSON string and store it in the scratch buffer. It is able
3817 * to extract nested object values, and members of arrays by index.
3818 *
3819 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3820 * be looked up as we descend into the object tree. e.g.
3821 *
3822 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3823 * with nelems = 5.
3824 *
3825 * The run time of this function must be bounded above by strsize to limit the
3826 * amount of work done in probe context. As such, it is implemented as a
3827 * simple state machine, reading one character at a time using safe loads
3828 * until we find the requested element, hit a parsing error or run off the
3829 * end of the object or string.
3830 *
3831 * As there is no way for a subroutine to return an error without interrupting
3832 * clause execution, we simply return NULL in the event of a missing key or any
3833 * other error condition. Each NULL return in this function is commented with
3834 * the error condition it represents -- parsing or otherwise.
3835 *
3836 * The set of states for the state machine closely matches the JSON
3837 * specification (http://json.org/). Briefly:
3838 *
3839 * DTRACE_JSON_REST:
3840 * Skip whitespace until we find either a top-level Object, moving
3841 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3842 *
3843 * DTRACE_JSON_OBJECT:
3844 * Locate the next key String in an Object. Sets a flag to denote
3845 * the next String as a key string and moves to DTRACE_JSON_STRING.
3846 *
3847 * DTRACE_JSON_COLON:
3848 * Skip whitespace until we find the colon that separates key Strings
3849 * from their values. Once found, move to DTRACE_JSON_VALUE.
3850 *
3851 * DTRACE_JSON_VALUE:
3852 * Detects the type of the next value (String, Number, Identifier, Object
3853 * or Array) and routes to the states that process that type. Here we also
3854 * deal with the element selector list if we are requested to traverse down
3855 * into the object tree.
3856 *
3857 * DTRACE_JSON_COMMA:
3858 * Skip whitespace until we find the comma that separates key-value pairs
3859 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3860 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3861 * states return to this state at the end of their value, unless otherwise
3862 * noted.
3863 *
3864 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3865 * Processes a Number literal from the JSON, including any exponent
3866 * component that may be present. Numbers are returned as strings, which
3867 * may be passed to strtoll() if an integer is required.
3868 *
3869 * DTRACE_JSON_IDENTIFIER:
3870 * Processes a "true", "false" or "null" literal in the JSON.
3871 *
3872 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3873 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3874 * Processes a String literal from the JSON, whether the String denotes
3875 * a key, a value or part of a larger Object. Handles all escape sequences
3876 * present in the specification, including four-digit unicode characters,
3877 * but merely includes the escape sequence without converting it to the
3878 * actual escaped character. If the String is flagged as a key, we
3879 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3880 *
3881 * DTRACE_JSON_COLLECT_OBJECT:
3882 * This state collects an entire Object (or Array), correctly handling
3883 * embedded strings. If the full element selector list matches this nested
3884 * object, we return the Object in full as a string. If not, we use this
3885 * state to skip to the next value at this level and continue processing.
3886 *
3887 * NOTE: This function uses various macros from strtolctype.h to manipulate
3888 * digit values, etc -- these have all been checked to ensure they make
3889 * no additional function calls.
3890 */
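/*
 * As an illustrative example (not part of this file's interfaces), a D
 * clause such as
 *
 *	this->v = json(this->buf, "data.values[1]");
 *
 * reaches dtrace_json() with elemlist packed as "data" NUL "values" NUL
 * "1" NUL and nelems = 3; the state machine below then walks the input one
 * safe-loaded byte at a time, returning the selected value as a string (or
 * NULL on any error).
 */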
3891 static char *
3892 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3893 char *dest)
3894 {
3895 dtrace_json_state_t state = DTRACE_JSON_REST;
3896 int64_t array_elem = INT64_MIN;
3897 int64_t array_pos = 0;
3898 uint8_t escape_unicount = 0;
3899 boolean_t string_is_key = B_FALSE;
3900 boolean_t collect_object = B_FALSE;
3901 boolean_t found_key = B_FALSE;
3902 boolean_t in_array = B_FALSE;
3903 uint32_t braces = 0, brackets = 0;
3904 char *elem = elemlist;
3905 char *dd = dest;
3906 uintptr_t cur;
3907
3908 for (cur = json; cur < json + size; cur++) {
3909 char cc = dtrace_load8(cur);
3910 if (cc == '\0')
3911 return (NULL);
3912
3913 switch (state) {
3914 case DTRACE_JSON_REST:
3915 if (isspace(cc))
3916 break;
3917
3918 if (cc == '{') {
3919 state = DTRACE_JSON_OBJECT;
3920 break;
3921 }
3922
3923 if (cc == '[') {
3924 in_array = B_TRUE;
3925 array_pos = 0;
3926 array_elem = dtrace_strtoll(elem, 10, size);
3927 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3928 state = DTRACE_JSON_VALUE;
3929 break;
3930 }
3931
3932 /*
3933 * ERROR: expected to find a top-level object or array.
3934 */
3935 return (NULL);
3936 case DTRACE_JSON_OBJECT:
3937 if (isspace(cc))
3938 break;
3939
3940 if (cc == '"') {
3941 state = DTRACE_JSON_STRING;
3942 string_is_key = B_TRUE;
3943 break;
3944 }
3945
3946 /*
3947 * ERROR: either the object did not start with a key
3948 * string, or we've run off the end of the object
3949 * without finding the requested key.
3950 */
3951 return (NULL);
3952 case DTRACE_JSON_STRING:
3953 if (cc == '\\') {
3954 *dd++ = '\\';
3955 state = DTRACE_JSON_STRING_ESCAPE;
3956 break;
3957 }
3958
3959 if (cc == '"') {
3960 if (collect_object) {
3961 /*
3962 * We don't reset the dest here, as
3963 * the string is part of a larger
3964 * object being collected.
3965 */
3966 *dd++ = cc;
3967 collect_object = B_FALSE;
3968 state = DTRACE_JSON_COLLECT_OBJECT;
3969 break;
3970 }
3971 *dd = '\0';
3972 dd = dest; /* reset string buffer */
3973 if (string_is_key) {
3974 if (dtrace_strncmp(dest, elem,
3975 size) == 0)
3976 found_key = B_TRUE;
3977 } else if (found_key) {
3978 if (nelems > 1) {
3979 /*
3980 * We expected an object, not
3981 * this string.
3982 */
3983 return (NULL);
3984 }
3985 return (dest);
3986 }
3987 state = string_is_key ? DTRACE_JSON_COLON :
3988 DTRACE_JSON_COMMA;
3989 string_is_key = B_FALSE;
3990 break;
3991 }
3992
3993 *dd++ = cc;
3994 break;
3995 case DTRACE_JSON_STRING_ESCAPE:
3996 *dd++ = cc;
3997 if (cc == 'u') {
3998 escape_unicount = 0;
3999 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
4000 } else {
4001 state = DTRACE_JSON_STRING;
4002 }
4003 break;
4004 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
4005 if (!isxdigit(cc)) {
4006 /*
4007 * ERROR: invalid unicode escape, expected
4008 				 * four valid hexadecimal digits.
4009 */
4010 return (NULL);
4011 }
4012
4013 *dd++ = cc;
4014 if (++escape_unicount == 4)
4015 state = DTRACE_JSON_STRING;
4016 break;
4017 case DTRACE_JSON_COLON:
4018 if (isspace(cc))
4019 break;
4020
4021 if (cc == ':') {
4022 state = DTRACE_JSON_VALUE;
4023 break;
4024 }
4025
4026 /*
4027 * ERROR: expected a colon.
4028 */
4029 return (NULL);
4030 case DTRACE_JSON_COMMA:
4031 if (isspace(cc))
4032 break;
4033
4034 if (cc == ',') {
4035 if (in_array) {
4036 state = DTRACE_JSON_VALUE;
4037 if (++array_pos == array_elem)
4038 found_key = B_TRUE;
4039 } else {
4040 state = DTRACE_JSON_OBJECT;
4041 }
4042 break;
4043 }
4044
4045 /*
4046 * ERROR: either we hit an unexpected character, or
4047 * we reached the end of the object or array without
4048 * finding the requested key.
4049 */
4050 return (NULL);
4051 case DTRACE_JSON_IDENTIFIER:
4052 if (islower(cc)) {
4053 *dd++ = cc;
4054 break;
4055 }
4056
4057 *dd = '\0';
4058 dd = dest; /* reset string buffer */
4059
4060 if (dtrace_strncmp(dest, "true", 5) == 0 ||
4061 dtrace_strncmp(dest, "false", 6) == 0 ||
4062 dtrace_strncmp(dest, "null", 5) == 0) {
4063 if (found_key) {
4064 if (nelems > 1) {
4065 /*
4066 * ERROR: We expected an object,
4067 * not this identifier.
4068 */
4069 return (NULL);
4070 }
4071 return (dest);
4072 } else {
4073 cur--;
4074 state = DTRACE_JSON_COMMA;
4075 break;
4076 }
4077 }
4078
4079 /*
4080 * ERROR: we did not recognise the identifier as one
4081 * of those in the JSON specification.
4082 */
4083 return (NULL);
4084 case DTRACE_JSON_NUMBER:
4085 if (cc == '.') {
4086 *dd++ = cc;
4087 state = DTRACE_JSON_NUMBER_FRAC;
4088 break;
4089 }
4090
4091 if (cc == 'x' || cc == 'X') {
4092 /*
4093 * ERROR: specification explicitly excludes
4094 				 * hexadecimal or octal numbers.
4095 */
4096 return (NULL);
4097 }
4098
4099 /* FALLTHRU */
4100 case DTRACE_JSON_NUMBER_FRAC:
4101 if (cc == 'e' || cc == 'E') {
4102 *dd++ = cc;
4103 state = DTRACE_JSON_NUMBER_EXP;
4104 break;
4105 }
4106
4107 if (cc == '+' || cc == '-') {
4108 /*
4109 * ERROR: expect sign as part of exponent only.
4110 */
4111 return (NULL);
4112 }
4113 /* FALLTHRU */
4114 case DTRACE_JSON_NUMBER_EXP:
4115 if (isdigit(cc) || cc == '+' || cc == '-') {
4116 *dd++ = cc;
4117 break;
4118 }
4119
4120 *dd = '\0';
4121 dd = dest; /* reset string buffer */
4122 if (found_key) {
4123 if (nelems > 1) {
4124 /*
4125 * ERROR: We expected an object, not
4126 * this number.
4127 */
4128 return (NULL);
4129 }
4130 return (dest);
4131 }
4132
4133 cur--;
4134 state = DTRACE_JSON_COMMA;
4135 break;
4136 case DTRACE_JSON_VALUE:
4137 if (isspace(cc))
4138 break;
4139
4140 if (cc == '{' || cc == '[') {
4141 if (nelems > 1 && found_key) {
4142 in_array = cc == '[' ? B_TRUE : B_FALSE;
4143 /*
4144 * If our element selector directs us
4145 * to descend into this nested object,
4146 * then move to the next selector
4147 * element in the list and restart the
4148 * state machine.
4149 */
4150 while (*elem != '\0')
4151 elem++;
4152 elem++; /* skip the inter-element NUL */
4153 nelems--;
4154 dd = dest;
4155 if (in_array) {
4156 state = DTRACE_JSON_VALUE;
4157 array_pos = 0;
4158 array_elem = dtrace_strtoll(
4159 elem, 10, size);
4160 found_key = array_elem == 0 ?
4161 B_TRUE : B_FALSE;
4162 } else {
4163 found_key = B_FALSE;
4164 state = DTRACE_JSON_OBJECT;
4165 }
4166 break;
4167 }
4168
4169 /*
4170 * Otherwise, we wish to either skip this
4171 * nested object or return it in full.
4172 */
4173 if (cc == '[')
4174 brackets = 1;
4175 else
4176 braces = 1;
4177 *dd++ = cc;
4178 state = DTRACE_JSON_COLLECT_OBJECT;
4179 break;
4180 }
4181
4182 if (cc == '"') {
4183 state = DTRACE_JSON_STRING;
4184 break;
4185 }
4186
4187 if (islower(cc)) {
4188 /*
4189 * Here we deal with true, false and null.
4190 */
4191 *dd++ = cc;
4192 state = DTRACE_JSON_IDENTIFIER;
4193 break;
4194 }
4195
4196 if (cc == '-' || isdigit(cc)) {
4197 *dd++ = cc;
4198 state = DTRACE_JSON_NUMBER;
4199 break;
4200 }
4201
4202 /*
4203 * ERROR: unexpected character at start of value.
4204 */
4205 return (NULL);
4206 case DTRACE_JSON_COLLECT_OBJECT:
4207 if (cc == '\0')
4208 /*
4209 * ERROR: unexpected end of input.
4210 */
4211 return (NULL);
4212
4213 *dd++ = cc;
4214 if (cc == '"') {
4215 collect_object = B_TRUE;
4216 state = DTRACE_JSON_STRING;
4217 break;
4218 }
4219
4220 if (cc == ']') {
4221 if (brackets-- == 0) {
4222 /*
4223 * ERROR: unbalanced brackets.
4224 */
4225 return (NULL);
4226 }
4227 } else if (cc == '}') {
4228 if (braces-- == 0) {
4229 /*
4230 * ERROR: unbalanced braces.
4231 */
4232 return (NULL);
4233 }
4234 } else if (cc == '{') {
4235 braces++;
4236 } else if (cc == '[') {
4237 brackets++;
4238 }
4239
4240 if (brackets == 0 && braces == 0) {
4241 if (found_key) {
4242 *dd = '\0';
4243 return (dest);
4244 }
4245 dd = dest; /* reset string buffer */
4246 state = DTRACE_JSON_COMMA;
4247 }
4248 break;
4249 }
4250 }
4251 return (NULL);
4252 }
4253
4254 /*
4255 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4256 * Notice that we don't bother validating the proper number of arguments or
4257 * their types in the tuple stack. This isn't needed because all argument
4258 * interpretation is safe because of our load safety -- the worst that can
4259 * happen is that a bogus program can obtain bogus results.
4260 */
4261 static void
4262 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4263 dtrace_key_t *tupregs, int nargs,
4264 dtrace_mstate_t *mstate, dtrace_state_t *state)
4265 {
4266 volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4267 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4268 dtrace_vstate_t *vstate = &state->dts_vstate;
4269
4270 #ifdef illumos
4271 union {
4272 mutex_impl_t mi;
4273 uint64_t mx;
4274 } m;
4275
4276 union {
4277 krwlock_t ri;
4278 uintptr_t rw;
4279 } r;
4280 #else
4281 struct thread *lowner;
4282 union {
4283 struct lock_object *li;
4284 uintptr_t lx;
4285 } l;
4286 #endif
4287
4288 switch (subr) {
4289 case DIF_SUBR_RAND:
4290 regs[rd] = dtrace_xoroshiro128_plus_next(
4291 state->dts_rstate[curcpu]);
4292 break;
4293
4294 #ifdef illumos
4295 case DIF_SUBR_MUTEX_OWNED:
4296 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4297 mstate, vstate)) {
4298 regs[rd] = 0;
4299 break;
4300 }
4301
4302 m.mx = dtrace_load64(tupregs[0].dttk_value);
4303 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4304 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4305 else
4306 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4307 break;
4308
4309 case DIF_SUBR_MUTEX_OWNER:
4310 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4311 mstate, vstate)) {
4312 regs[rd] = 0;
4313 break;
4314 }
4315
4316 m.mx = dtrace_load64(tupregs[0].dttk_value);
4317 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4318 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4319 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4320 else
4321 regs[rd] = 0;
4322 break;
4323
4324 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4325 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4326 mstate, vstate)) {
4327 regs[rd] = 0;
4328 break;
4329 }
4330
4331 m.mx = dtrace_load64(tupregs[0].dttk_value);
4332 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4333 break;
4334
4335 case DIF_SUBR_MUTEX_TYPE_SPIN:
4336 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4337 mstate, vstate)) {
4338 regs[rd] = 0;
4339 break;
4340 }
4341
4342 m.mx = dtrace_load64(tupregs[0].dttk_value);
4343 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4344 break;
4345
4346 case DIF_SUBR_RW_READ_HELD: {
4347 uintptr_t tmp;
4348
4349 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4350 mstate, vstate)) {
4351 regs[rd] = 0;
4352 break;
4353 }
4354
4355 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4356 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4357 break;
4358 }
4359
4360 case DIF_SUBR_RW_WRITE_HELD:
4361 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4362 mstate, vstate)) {
4363 regs[rd] = 0;
4364 break;
4365 }
4366
4367 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4368 regs[rd] = _RW_WRITE_HELD(&r.ri);
4369 break;
4370
4371 case DIF_SUBR_RW_ISWRITER:
4372 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4373 mstate, vstate)) {
4374 regs[rd] = 0;
4375 break;
4376 }
4377
4378 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4379 regs[rd] = _RW_ISWRITER(&r.ri);
4380 break;
4381
4382 #else /* !illumos */
4383 case DIF_SUBR_MUTEX_OWNED:
4384 if (!dtrace_canload(tupregs[0].dttk_value,
4385 sizeof (struct lock_object), mstate, vstate)) {
4386 regs[rd] = 0;
4387 break;
4388 }
4389 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4390 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4391 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4392 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4393 break;
4394
4395 case DIF_SUBR_MUTEX_OWNER:
4396 if (!dtrace_canload(tupregs[0].dttk_value,
4397 sizeof (struct lock_object), mstate, vstate)) {
4398 regs[rd] = 0;
4399 break;
4400 }
4401 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4402 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4403 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4404 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4405 regs[rd] = (uintptr_t)lowner;
4406 break;
4407
4408 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4409 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4410 mstate, vstate)) {
4411 regs[rd] = 0;
4412 break;
4413 }
4414 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4415 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4416 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0;
4417 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4418 break;
4419
4420 case DIF_SUBR_MUTEX_TYPE_SPIN:
4421 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4422 mstate, vstate)) {
4423 regs[rd] = 0;
4424 break;
4425 }
4426 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4427 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4428 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4429 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4430 break;
4431
4432 case DIF_SUBR_RW_READ_HELD:
4433 case DIF_SUBR_SX_SHARED_HELD:
4434 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4435 mstate, vstate)) {
4436 regs[rd] = 0;
4437 break;
4438 }
4439 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4440 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4441 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4442 lowner == NULL;
4443 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4444 break;
4445
4446 case DIF_SUBR_RW_WRITE_HELD:
4447 case DIF_SUBR_SX_EXCLUSIVE_HELD:
4448 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4449 mstate, vstate)) {
4450 regs[rd] = 0;
4451 break;
4452 }
4453 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4454 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4455 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4456 lowner != NULL;
4457 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4458 break;
4459
4460 case DIF_SUBR_RW_ISWRITER:
4461 case DIF_SUBR_SX_ISEXCLUSIVE:
4462 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4463 mstate, vstate)) {
4464 regs[rd] = 0;
4465 break;
4466 }
4467 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4468 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4469 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4470 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4471 regs[rd] = (lowner == curthread);
4472 break;
4473 #endif /* illumos */
4474
4475 case DIF_SUBR_BCOPY: {
4476 /*
4477 * We need to be sure that the destination is in the scratch
4478 * region -- no other region is allowed.
4479 */
4480 uintptr_t src = tupregs[0].dttk_value;
4481 uintptr_t dest = tupregs[1].dttk_value;
4482 size_t size = tupregs[2].dttk_value;
4483
4484 if (!dtrace_inscratch(dest, size, mstate)) {
4485 *flags |= CPU_DTRACE_BADADDR;
4486 *illval = regs[rd];
4487 break;
4488 }
4489
4490 if (!dtrace_canload(src, size, mstate, vstate)) {
4491 regs[rd] = 0;
4492 break;
4493 }
4494
4495 dtrace_bcopy((void *)src, (void *)dest, size);
4496 break;
4497 }
4498
4499 case DIF_SUBR_ALLOCA:
4500 case DIF_SUBR_COPYIN: {
4501 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4502 uint64_t size =
4503 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4504 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4505
4506 /*
4507 * This action doesn't require any credential checks since
4508 * probes will not activate in user contexts to which the
4509 * enabling user does not have permissions.
4510 */
4511
4512 /*
4513 * Rounding up the user allocation size could have overflowed
4514 * a large, bogus allocation (like -1ULL) to 0.
4515 */
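		/*
		 * (Concretely: dest exceeds dtms_scratch_ptr by at most
		 * seven bytes, so a size of -1ULL makes the addition above
		 * wrap scratch_size around to a small value -- exactly what
		 * the "scratch_size < size" test below catches.)
		 */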
4516 if (scratch_size < size ||
4517 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4518 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4519 regs[rd] = 0;
4520 break;
4521 }
4522
4523 if (subr == DIF_SUBR_COPYIN) {
4524 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4525 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4526 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4527 }
4528
4529 mstate->dtms_scratch_ptr += scratch_size;
4530 regs[rd] = dest;
4531 break;
4532 }
4533
4534 case DIF_SUBR_COPYINTO: {
4535 uint64_t size = tupregs[1].dttk_value;
4536 uintptr_t dest = tupregs[2].dttk_value;
4537
4538 /*
4539 * This action doesn't require any credential checks since
4540 * probes will not activate in user contexts to which the
4541 * enabling user does not have permissions.
4542 */
4543 if (!dtrace_inscratch(dest, size, mstate)) {
4544 *flags |= CPU_DTRACE_BADADDR;
4545 *illval = regs[rd];
4546 break;
4547 }
4548
4549 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4550 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4551 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4552 break;
4553 }
4554
4555 case DIF_SUBR_COPYINSTR: {
4556 uintptr_t dest = mstate->dtms_scratch_ptr;
4557 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4558
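		/*
		 * An explicit length argument, when smaller than the
		 * strsize limit, caps the copy at that many bytes plus one
		 * for the terminating NUL (which is forced below in any
		 * case).
		 */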
4559 if (nargs > 1 && tupregs[1].dttk_value < size)
4560 size = tupregs[1].dttk_value + 1;
4561
4562 /*
4563 * This action doesn't require any credential checks since
4564 * probes will not activate in user contexts to which the
4565 * enabling user does not have permissions.
4566 */
4567 if (!DTRACE_INSCRATCH(mstate, size)) {
4568 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4569 regs[rd] = 0;
4570 break;
4571 }
4572
4573 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4574 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4575 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4576
4577 ((char *)dest)[size - 1] = '\0';
4578 mstate->dtms_scratch_ptr += size;
4579 regs[rd] = dest;
4580 break;
4581 }
4582
4583 #ifdef illumos
4584 case DIF_SUBR_MSGSIZE:
4585 case DIF_SUBR_MSGDSIZE: {
4586 uintptr_t baddr = tupregs[0].dttk_value, daddr;
4587 uintptr_t wptr, rptr;
4588 size_t count = 0;
4589 int cont = 0;
4590
4591 while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4592
4593 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4594 vstate)) {
4595 regs[rd] = 0;
4596 break;
4597 }
4598
4599 wptr = dtrace_loadptr(baddr +
4600 offsetof(mblk_t, b_wptr));
4601
4602 rptr = dtrace_loadptr(baddr +
4603 offsetof(mblk_t, b_rptr));
4604
4605 if (wptr < rptr) {
4606 *flags |= CPU_DTRACE_BADADDR;
4607 *illval = tupregs[0].dttk_value;
4608 break;
4609 }
4610
4611 daddr = dtrace_loadptr(baddr +
4612 offsetof(mblk_t, b_datap));
4613
4614 baddr = dtrace_loadptr(baddr +
4615 offsetof(mblk_t, b_cont));
4616
4617 /*
4618 			 * We want to guard against denial-of-service here,
4619 * so we're only going to search the list for
4620 * dtrace_msgdsize_max mblks.
4621 */
4622 if (cont++ > dtrace_msgdsize_max) {
4623 *flags |= CPU_DTRACE_ILLOP;
4624 break;
4625 }
4626
4627 if (subr == DIF_SUBR_MSGDSIZE) {
4628 if (dtrace_load8(daddr +
4629 offsetof(dblk_t, db_type)) != M_DATA)
4630 continue;
4631 }
4632
4633 count += wptr - rptr;
4634 }
4635
4636 if (!(*flags & CPU_DTRACE_FAULT))
4637 regs[rd] = count;
4638
4639 break;
4640 }
4641 #endif
4642
4643 case DIF_SUBR_PROGENYOF: {
4644 pid_t pid = tupregs[0].dttk_value;
4645 proc_t *p;
4646 int rval = 0;
4647
4648 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4649
4650 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4651 #ifdef illumos
4652 if (p->p_pidp->pid_id == pid) {
4653 #else
4654 if (p->p_pid == pid) {
4655 #endif
4656 rval = 1;
4657 break;
4658 }
4659 }
4660
4661 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4662
4663 regs[rd] = rval;
4664 break;
4665 }
4666
4667 case DIF_SUBR_SPECULATION:
4668 regs[rd] = dtrace_speculation(state);
4669 break;
4670
4671 case DIF_SUBR_COPYOUT: {
4672 uintptr_t kaddr = tupregs[0].dttk_value;
4673 uintptr_t uaddr = tupregs[1].dttk_value;
4674 uint64_t size = tupregs[2].dttk_value;
4675
4676 if (!dtrace_destructive_disallow &&
4677 dtrace_priv_proc_control(state) &&
4678 !dtrace_istoxic(kaddr, size) &&
4679 dtrace_canload(kaddr, size, mstate, vstate)) {
4680 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4681 dtrace_copyout(kaddr, uaddr, size, flags);
4682 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4683 }
4684 break;
4685 }
4686
4687 case DIF_SUBR_COPYOUTSTR: {
4688 uintptr_t kaddr = tupregs[0].dttk_value;
4689 uintptr_t uaddr = tupregs[1].dttk_value;
4690 uint64_t size = tupregs[2].dttk_value;
4691 size_t lim;
4692
4693 if (!dtrace_destructive_disallow &&
4694 dtrace_priv_proc_control(state) &&
4695 !dtrace_istoxic(kaddr, size) &&
4696 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4697 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4698 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4699 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4700 }
4701 break;
4702 }
4703
4704 case DIF_SUBR_STRLEN: {
4705 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4706 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4707 size_t lim;
4708
4709 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4710 regs[rd] = 0;
4711 break;
4712 }
4713
4714 regs[rd] = dtrace_strlen((char *)addr, lim);
4715 break;
4716 }
4717
4718 case DIF_SUBR_STRCHR:
4719 case DIF_SUBR_STRRCHR: {
4720 /*
4721 * We're going to iterate over the string looking for the
4722 * specified character. We will iterate until we have reached
4723 * the string length or we have found the character. If this
4724 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4725 * of the specified character instead of the first.
4726 */
4727 uintptr_t addr = tupregs[0].dttk_value;
4728 uintptr_t addr_limit;
4729 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4730 size_t lim;
4731 char c, target = (char)tupregs[1].dttk_value;
4732
4733 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4734 regs[rd] = 0;
4735 break;
4736 }
4737 addr_limit = addr + lim;
4738
4739 for (regs[rd] = 0; addr < addr_limit; addr++) {
4740 if ((c = dtrace_load8(addr)) == target) {
4741 regs[rd] = addr;
4742
4743 if (subr == DIF_SUBR_STRCHR)
4744 break;
4745 }
4746
4747 if (c == '\0')
4748 break;
4749 }
4750 break;
4751 }
4752
4753 case DIF_SUBR_STRSTR:
4754 case DIF_SUBR_INDEX:
4755 case DIF_SUBR_RINDEX: {
4756 /*
4757 * We're going to iterate over the string looking for the
4758 * specified string. We will iterate until we have reached
4759 * the string length or we have found the string. (Yes, this
4760 * is done in the most naive way possible -- but considering
4761 * that the string we're searching for is likely to be
4762 * relatively short, the complexity of Rabin-Karp or similar
4763 * hardly seems merited.)
4764 */
4765 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4766 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4767 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4768 size_t len = dtrace_strlen(addr, size);
4769 size_t sublen = dtrace_strlen(substr, size);
4770 char *limit = addr + len, *orig = addr;
4771 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4772 int inc = 1;
4773
4774 regs[rd] = notfound;
4775
4776 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4777 regs[rd] = 0;
4778 break;
4779 }
4780
4781 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4782 vstate)) {
4783 regs[rd] = 0;
4784 break;
4785 }
4786
4787 /*
4788 * strstr() and index()/rindex() have similar semantics if
4789 * both strings are the empty string: strstr() returns a
4790 * pointer to the (empty) string, and index() and rindex()
4791 * both return index 0 (regardless of any position argument).
4792 */
4793 if (sublen == 0 && len == 0) {
4794 if (subr == DIF_SUBR_STRSTR)
4795 regs[rd] = (uintptr_t)addr;
4796 else
4797 regs[rd] = 0;
4798 break;
4799 }
4800
4801 if (subr != DIF_SUBR_STRSTR) {
4802 if (subr == DIF_SUBR_RINDEX) {
4803 limit = orig - 1;
4804 addr += len;
4805 inc = -1;
4806 }
4807
4808 /*
4809 * Both index() and rindex() take an optional position
4810 * argument that denotes the starting position.
4811 */
4812 if (nargs == 3) {
4813 int64_t pos = (int64_t)tupregs[2].dttk_value;
4814
4815 /*
4816 * If the position argument to index() is
4817 * negative, Perl implicitly clamps it at
4818 * zero. This semantic is a little surprising
4819 * given the special meaning of negative
4820 * positions to similar Perl functions like
4821 * substr(), but it appears to reflect a
4822 * notion that index() can start from a
4823 * negative index and increment its way up to
4824 * the string. Given this notion, Perl's
4825 * rindex() is at least self-consistent in
4826 * that it implicitly clamps positions greater
4827 * than the string length to be the string
4828 * length. Where Perl completely loses
4829 * coherence, however, is when the specified
4830 * substring is the empty string (""). In
4831 * this case, even if the position is
4832 * negative, rindex() returns 0 -- and even if
4833 * the position is greater than the length,
4834 * index() returns the string length. These
4835 * semantics violate the notion that index()
4836 * should never return a value less than the
4837 * specified position and that rindex() should
4838 * never return a value greater than the
4839 * specified position. (One assumes that
4840 * these semantics are artifacts of Perl's
4841 * implementation and not the results of
4842 * deliberate design -- it beggars belief that
4843 * even Larry Wall could desire such oddness.)
4844 * While in the abstract one would wish for
4845 * consistent position semantics across
4846 * substr(), index() and rindex() -- or at the
4847 * very least self-consistent position
4848 * semantics for index() and rindex() -- we
4849 * instead opt to keep with the extant Perl
4850 * semantics, in all their broken glory. (Do
4851 * we have more desire to maintain Perl's
4852 * semantics than Perl does? Probably.)
4853 */
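				/*
				 * To make the inherited behavior concrete:
				 *
				 *	index("coherence", "", 100)  => 9
				 *	rindex("coherence", "", -1)  => 0
				 *
				 * (9 being the length of the string; both
				 * results follow from the empty-substring
				 * cases handled below.)
				 */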
4854 if (subr == DIF_SUBR_RINDEX) {
4855 if (pos < 0) {
4856 if (sublen == 0)
4857 regs[rd] = 0;
4858 break;
4859 }
4860
4861 if (pos > len)
4862 pos = len;
4863 } else {
4864 if (pos < 0)
4865 pos = 0;
4866
4867 if (pos >= len) {
4868 if (sublen == 0)
4869 regs[rd] = len;
4870 break;
4871 }
4872 }
4873
4874 addr = orig + pos;
4875 }
4876 }
4877
4878 for (regs[rd] = notfound; addr != limit; addr += inc) {
4879 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4880 if (subr != DIF_SUBR_STRSTR) {
4881 /*
4882 * As D index() and rindex() are
4883 * modeled on Perl (and not on awk),
4884 * we return a zero-based (and not a
4885 * one-based) index. (For you Perl
4886 * weenies: no, we're not going to add
4887 * $[ -- and shouldn't you be at a con
4888 * or something?)
4889 */
4890 regs[rd] = (uintptr_t)(addr - orig);
4891 break;
4892 }
4893
4894 ASSERT(subr == DIF_SUBR_STRSTR);
4895 regs[rd] = (uintptr_t)addr;
4896 break;
4897 }
4898 }
4899
4900 break;
4901 }
4902
4903 case DIF_SUBR_STRTOK: {
4904 uintptr_t addr = tupregs[0].dttk_value;
4905 uintptr_t tokaddr = tupregs[1].dttk_value;
4906 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4907 uintptr_t limit, toklimit;
4908 size_t clim;
4909 uint8_t c = 0, tokmap[32]; /* 256 / 8 */
4910 char *dest = (char *)mstate->dtms_scratch_ptr;
4911 int i;
4912
4913 /*
4914 * Check both the token buffer and (later) the input buffer,
4915 * since both could be non-scratch addresses.
4916 */
4917 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4918 regs[rd] = 0;
4919 break;
4920 }
4921 toklimit = tokaddr + clim;
4922
4923 if (!DTRACE_INSCRATCH(mstate, size)) {
4924 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4925 regs[rd] = 0;
4926 break;
4927 }
4928
4929 if (addr == 0) {
4930 /*
4931 * If the address specified is NULL, we use our saved
4932 * strtok pointer from the mstate. Note that this
4933 * means that the saved strtok pointer is _only_
4934 * valid within multiple enablings of the same probe --
4935 * it behaves like an implicit clause-local variable.
4936 */
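			/*
			 * In D terms, the idiomatic tokenizing sequence
			 *
			 *	this->t = strtok(this->str, "/");
			 *	this->t = strtok(NULL, "/");
			 *
			 * depends on this saved pointer for its second call.
			 */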
4937 addr = mstate->dtms_strtok;
4938 limit = mstate->dtms_strtok_limit;
4939 } else {
4940 /*
4941 * If the user-specified address is non-NULL we must
4942 * access check it. This is the only time we have
4943 * a chance to do so, since this address may reside
4944 			 * in the string table of this clause -- future calls
4945 * (when we fetch addr from mstate->dtms_strtok)
4946 * would fail this access check.
4947 */
4948 if (!dtrace_strcanload(addr, size, &clim, mstate,
4949 vstate)) {
4950 regs[rd] = 0;
4951 break;
4952 }
4953 limit = addr + clim;
4954 }
4955
4956 /*
4957 * First, zero the token map, and then process the token
4958 * string -- setting a bit in the map for every character
4959 * found in the token string.
4960 */
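		/*
		 * (Each of the 256 possible byte values maps to a single
		 * bit; e.g. for c == '/' (0x2f), this sets bit 7 of
		 * tokmap[5].)
		 */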
4961 for (i = 0; i < sizeof (tokmap); i++)
4962 tokmap[i] = 0;
4963
4964 for (; tokaddr < toklimit; tokaddr++) {
4965 if ((c = dtrace_load8(tokaddr)) == '\0')
4966 break;
4967
4968 ASSERT((c >> 3) < sizeof (tokmap));
4969 tokmap[c >> 3] |= (1 << (c & 0x7));
4970 }
4971
4972 for (; addr < limit; addr++) {
4973 /*
4974 * We're looking for a character that is _not_
4975 * contained in the token string.
4976 */
4977 if ((c = dtrace_load8(addr)) == '\0')
4978 break;
4979
4980 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4981 break;
4982 }
4983
4984 if (c == '\0') {
4985 /*
4986 * We reached the end of the string without finding
4987 * any character that was not in the token string.
4988 * We return NULL in this case, and we set the saved
4989 * address to NULL as well.
4990 */
4991 regs[rd] = 0;
4992 mstate->dtms_strtok = 0;
4993 mstate->dtms_strtok_limit = 0;
4994 break;
4995 }
4996
4997 /*
4998 * From here on, we're copying into the destination string.
4999 */
5000 for (i = 0; addr < limit && i < size - 1; addr++) {
5001 if ((c = dtrace_load8(addr)) == '\0')
5002 break;
5003
5004 if (tokmap[c >> 3] & (1 << (c & 0x7)))
5005 break;
5006
5007 ASSERT(i < size);
5008 dest[i++] = c;
5009 }
5010
5011 ASSERT(i < size);
5012 dest[i] = '\0';
5013 regs[rd] = (uintptr_t)dest;
5014 mstate->dtms_scratch_ptr += size;
5015 mstate->dtms_strtok = addr;
5016 mstate->dtms_strtok_limit = limit;
5017 break;
5018 }
5019
5020 case DIF_SUBR_SUBSTR: {
5021 uintptr_t s = tupregs[0].dttk_value;
5022 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5023 char *d = (char *)mstate->dtms_scratch_ptr;
5024 int64_t index = (int64_t)tupregs[1].dttk_value;
5025 int64_t remaining = (int64_t)tupregs[2].dttk_value;
5026 size_t len = dtrace_strlen((char *)s, size);
5027 int64_t i;
5028
5029 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
5030 regs[rd] = 0;
5031 break;
5032 }
5033
5034 if (!DTRACE_INSCRATCH(mstate, size)) {
5035 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5036 regs[rd] = 0;
5037 break;
5038 }
5039
5040 if (nargs <= 2)
5041 remaining = (int64_t)size;
5042
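		/*
		 * Normalize a possibly-negative index against the string
		 * length; e.g. substr("hello", -3, 2) rebases the index to
		 * len - 3 == 2 and yields "ll".
		 */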
5043 if (index < 0) {
5044 index += len;
5045
5046 if (index < 0 && index + remaining > 0) {
5047 remaining += index;
5048 index = 0;
5049 }
5050 }
5051
5052 if (index >= len || index < 0) {
5053 remaining = 0;
5054 } else if (remaining < 0) {
5055 remaining += len - index;
5056 } else if (index + remaining > size) {
5057 remaining = size - index;
5058 }
5059
5060 for (i = 0; i < remaining; i++) {
5061 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
5062 break;
5063 }
5064
5065 d[i] = '\0';
5066
5067 mstate->dtms_scratch_ptr += size;
5068 regs[rd] = (uintptr_t)d;
5069 break;
5070 }
5071
5072 case DIF_SUBR_JSON: {
5073 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5074 uintptr_t json = tupregs[0].dttk_value;
5075 size_t jsonlen = dtrace_strlen((char *)json, size);
5076 uintptr_t elem = tupregs[1].dttk_value;
5077 size_t elemlen = dtrace_strlen((char *)elem, size);
5078
5079 char *dest = (char *)mstate->dtms_scratch_ptr;
5080 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5081 char *ee = elemlist;
5082 int nelems = 1;
5083 uintptr_t cur;
5084
5085 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5086 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5087 regs[rd] = 0;
5088 break;
5089 }
5090
5091 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5092 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5093 regs[rd] = 0;
5094 break;
5095 }
5096
5097 /*
5098 * Read the element selector and split it up into a packed list
5099 * of strings.
5100 */
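		/*
		 * For example, the selector "foo[0].bar" is packed as
		 * "foo" NUL "0" NUL "bar" NUL with nelems == 3, matching the
		 * format that dtrace_json() expects.
		 */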
5101 for (cur = elem; cur < elem + elemlen; cur++) {
5102 char cc = dtrace_load8(cur);
5103
5104 if (cur == elem && cc == '[') {
5105 /*
5106 * If the first element selector key is
5107 * actually an array index then ignore the
5108 * bracket.
5109 */
5110 continue;
5111 }
5112
5113 if (cc == ']')
5114 continue;
5115
5116 if (cc == '.' || cc == '[') {
5117 nelems++;
5118 cc = '\0';
5119 }
5120
5121 *ee++ = cc;
5122 }
5123 *ee++ = '\0';
5124
5125 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5126 nelems, dest)) != 0)
5127 mstate->dtms_scratch_ptr += jsonlen + 1;
5128 break;
5129 }
5130
5131 case DIF_SUBR_TOUPPER:
5132 case DIF_SUBR_TOLOWER: {
5133 uintptr_t s = tupregs[0].dttk_value;
5134 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5135 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5136 size_t len = dtrace_strlen((char *)s, size);
5137 char lower, upper, convert;
5138 int64_t i;
5139
5140 if (subr == DIF_SUBR_TOUPPER) {
5141 lower = 'a';
5142 upper = 'z';
5143 convert = 'A';
5144 } else {
5145 lower = 'A';
5146 upper = 'Z';
5147 convert = 'a';
5148 }
5149
5150 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
5151 regs[rd] = 0;
5152 break;
5153 }
5154
5155 if (!DTRACE_INSCRATCH(mstate, size)) {
5156 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5157 regs[rd] = 0;
5158 break;
5159 }
5160
5161 for (i = 0; i < size - 1; i++) {
5162 if ((c = dtrace_load8(s + i)) == '\0')
5163 break;
5164
5165 if (c >= lower && c <= upper)
5166 c = convert + (c - lower);
5167
5168 dest[i] = c;
5169 }
5170
5171 ASSERT(i < size);
5172 dest[i] = '\0';
5173 regs[rd] = (uintptr_t)dest;
5174 mstate->dtms_scratch_ptr += size;
5175 break;
5176 }
5177
5178 #ifdef illumos
5179 case DIF_SUBR_GETMAJOR:
5180 #ifdef _LP64
5181 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
5182 #else
5183 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
5184 #endif
5185 break;
5186
5187 case DIF_SUBR_GETMINOR:
5188 #ifdef _LP64
5189 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
5190 #else
5191 regs[rd] = tupregs[0].dttk_value & MAXMIN;
5192 #endif
5193 break;
5194
5195 case DIF_SUBR_DDI_PATHNAME: {
5196 /*
5197 * This one is a galactic mess. We are going to roughly
5198 * emulate ddi_pathname(), but it's made more complicated
5199 * by the fact that we (a) want to include the minor name and
5200 * (b) must proceed iteratively instead of recursively.
5201 */
5202 uintptr_t dest = mstate->dtms_scratch_ptr;
5203 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5204 char *start = (char *)dest, *end = start + size - 1;
5205 uintptr_t daddr = tupregs[0].dttk_value;
5206 int64_t minor = (int64_t)tupregs[1].dttk_value;
5207 char *s;
5208 int i, len, depth = 0;
5209
5210 /*
5211 * Due to all the pointer jumping we do and context we must
5212 * rely upon, we just mandate that the user must have kernel
5213 * read privileges to use this routine.
5214 */
5215 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
5216 *flags |= CPU_DTRACE_KPRIV;
5217 *illval = daddr;
5218 regs[rd] = 0;
5219 }
5220
5221 if (!DTRACE_INSCRATCH(mstate, size)) {
5222 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5223 regs[rd] = 0;
5224 break;
5225 }
5226
5227 *end = '\0';
5228
5229 /*
5230 * We want to have a name for the minor. In order to do this,
5231 * we need to walk the minor list from the devinfo. We want
5232 * to be sure that we don't infinitely walk a circular list,
5233 * so we check for circularity by sending a scout pointer
5234 * ahead two elements for every element that we iterate over;
5235 * if the list is circular, these will ultimately point to the
5236 * same element. You may recognize this little trick as the
5237 * answer to a stupid interview question -- one that always
5238 * seems to be asked by those who had to have it laboriously
5239 * explained to them, and who can't even concisely describe
5240 * the conditions under which one would be forced to resort to
5241 * this technique. Needless to say, those conditions are
5242 * found here -- and probably only here. Is this the only use
5243 * of this infamous trick in shipping, production code? If it
5244 * isn't, it probably should be...
5245 */
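		/*
		 * (The scout advances two links for every link that maddr
		 * advances, so on a circular list the two must eventually
		 * coincide -- the classic tortoise-and-hare construction.)
		 */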
5246 if (minor != -1) {
5247 uintptr_t maddr = dtrace_loadptr(daddr +
5248 offsetof(struct dev_info, devi_minor));
5249
5250 uintptr_t next = offsetof(struct ddi_minor_data, next);
5251 uintptr_t name = offsetof(struct ddi_minor_data,
5252 d_minor) + offsetof(struct ddi_minor, name);
5253 uintptr_t dev = offsetof(struct ddi_minor_data,
5254 d_minor) + offsetof(struct ddi_minor, dev);
5255 uintptr_t scout;
5256
5257 if (maddr != NULL)
5258 scout = dtrace_loadptr(maddr + next);
5259
5260 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5261 uint64_t m;
5262 #ifdef _LP64
5263 m = dtrace_load64(maddr + dev) & MAXMIN64;
5264 #else
5265 m = dtrace_load32(maddr + dev) & MAXMIN;
5266 #endif
5267 if (m != minor) {
5268 maddr = dtrace_loadptr(maddr + next);
5269
5270 if (scout == NULL)
5271 continue;
5272
5273 scout = dtrace_loadptr(scout + next);
5274
5275 if (scout == NULL)
5276 continue;
5277
5278 scout = dtrace_loadptr(scout + next);
5279
5280 if (scout == NULL)
5281 continue;
5282
5283 if (scout == maddr) {
5284 *flags |= CPU_DTRACE_ILLOP;
5285 break;
5286 }
5287
5288 continue;
5289 }
5290
5291 /*
5292 * We have the minor data. Now we need to
5293 * copy the minor's name into the end of the
5294 * pathname.
5295 */
5296 s = (char *)dtrace_loadptr(maddr + name);
5297 len = dtrace_strlen(s, size);
5298
5299 if (*flags & CPU_DTRACE_FAULT)
5300 break;
5301
5302 if (len != 0) {
5303 if ((end -= (len + 1)) < start)
5304 break;
5305
5306 *end = ':';
5307 }
5308
5309 for (i = 1; i <= len; i++)
5310 end[i] = dtrace_load8((uintptr_t)s++);
5311 break;
5312 }
5313 }
5314
5315 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5316 ddi_node_state_t devi_state;
5317
5318 devi_state = dtrace_load32(daddr +
5319 offsetof(struct dev_info, devi_node_state));
5320
5321 if (*flags & CPU_DTRACE_FAULT)
5322 break;
5323
5324 if (devi_state >= DS_INITIALIZED) {
5325 s = (char *)dtrace_loadptr(daddr +
5326 offsetof(struct dev_info, devi_addr));
5327 len = dtrace_strlen(s, size);
5328
5329 if (*flags & CPU_DTRACE_FAULT)
5330 break;
5331
5332 if (len != 0) {
5333 if ((end -= (len + 1)) < start)
5334 break;
5335
5336 *end = '@';
5337 }
5338
5339 for (i = 1; i <= len; i++)
5340 end[i] = dtrace_load8((uintptr_t)s++);
5341 }
5342
5343 /*
5344 * Now for the node name...
5345 */
5346 s = (char *)dtrace_loadptr(daddr +
5347 offsetof(struct dev_info, devi_node_name));
5348
5349 daddr = dtrace_loadptr(daddr +
5350 offsetof(struct dev_info, devi_parent));
5351
5352 /*
5353 * If our parent is NULL (that is, if we're the root
5354 * node), we're going to use the special path
5355 * "devices".
5356 */
5357 if (daddr == 0)
5358 s = "devices";
5359
5360 len = dtrace_strlen(s, size);
5361 if (*flags & CPU_DTRACE_FAULT)
5362 break;
5363
5364 if ((end -= (len + 1)) < start)
5365 break;
5366
5367 for (i = 1; i <= len; i++)
5368 end[i] = dtrace_load8((uintptr_t)s++);
5369 *end = '/';
5370
5371 if (depth++ > dtrace_devdepth_max) {
5372 *flags |= CPU_DTRACE_ILLOP;
5373 break;
5374 }
5375 }
5376
5377 if (end < start)
5378 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5379
5380 if (daddr == 0) {
5381 regs[rd] = (uintptr_t)end;
5382 mstate->dtms_scratch_ptr += size;
5383 }
5384
5385 break;
5386 }
5387 #endif
5388
5389 case DIF_SUBR_STRJOIN: {
5390 char *d = (char *)mstate->dtms_scratch_ptr;
5391 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5392 uintptr_t s1 = tupregs[0].dttk_value;
5393 uintptr_t s2 = tupregs[1].dttk_value;
5394 int i = 0, j = 0;
5395 size_t lim1, lim2;
5396 char c;
5397
5398 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
5399 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
5400 regs[rd] = 0;
5401 break;
5402 }
5403
5404 if (!DTRACE_INSCRATCH(mstate, size)) {
5405 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5406 regs[rd] = 0;
5407 break;
5408 }
5409
5410 for (;;) {
5411 if (i >= size) {
5412 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5413 regs[rd] = 0;
5414 break;
5415 }
5416 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
5417 if ((d[i++] = c) == '\0') {
5418 i--;
5419 break;
5420 }
5421 }
5422
5423 for (;;) {
5424 if (i >= size) {
5425 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5426 regs[rd] = 0;
5427 break;
5428 }
5429
5430 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
5431 if ((d[i++] = c) == '\0')
5432 break;
5433 }
5434
5435 if (i < size) {
5436 mstate->dtms_scratch_ptr += i;
5437 regs[rd] = (uintptr_t)d;
5438 }
5439
5440 break;
5441 }
5442
5443 case DIF_SUBR_STRTOLL: {
5444 uintptr_t s = tupregs[0].dttk_value;
5445 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5446 size_t lim;
5447 int base = 10;
5448
5449 if (nargs > 1) {
5450 if ((base = tupregs[1].dttk_value) <= 1 ||
5451 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5452 *flags |= CPU_DTRACE_ILLOP;
5453 break;
5454 }
5455 }
5456
5457 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
5458 regs[rd] = INT64_MIN;
5459 break;
5460 }
5461
5462 regs[rd] = dtrace_strtoll((char *)s, base, lim);
5463 break;
5464 }
5465
5466 case DIF_SUBR_LLTOSTR: {
5467 int64_t i = (int64_t)tupregs[0].dttk_value;
5468 uint64_t val, digit;
5469 uint64_t size = 65; /* enough room for 2^64 in binary */
5470 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5471 int base = 10;
5472
5473 if (nargs > 1) {
5474 if ((base = tupregs[1].dttk_value) <= 1 ||
5475 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5476 *flags |= CPU_DTRACE_ILLOP;
5477 break;
5478 }
5479 }
5480
5481 val = (base == 10 && i < 0) ? i * -1 : i;
5482
5483 if (!DTRACE_INSCRATCH(mstate, size)) {
5484 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5485 regs[rd] = 0;
5486 break;
5487 }
5488
5489 for (*end-- = '\0'; val; val /= base) {
5490 if ((digit = val % base) <= '9' - '0') {
5491 *end-- = '0' + digit;
5492 } else {
5493 *end-- = 'a' + (digit - ('9' - '0') - 1);
5494 }
5495 }
5496
5497 if (i == 0 && base == 16)
5498 *end-- = '0';
5499
5500 if (base == 16)
5501 *end-- = 'x';
5502
5503 if (i == 0 || base == 8 || base == 16)
5504 *end-- = '0';
5505
5506 if (i < 0 && base == 10)
5507 *end-- = '-';
5508
5509 regs[rd] = (uintptr_t)end + 1;
5510 mstate->dtms_scratch_ptr += size;
5511 break;
5512 }
5513
5514 case DIF_SUBR_HTONS:
5515 case DIF_SUBR_NTOHS:
5516 #if BYTE_ORDER == BIG_ENDIAN
5517 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5518 #else
5519 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5520 #endif
5521 break;
5522
5523
5524 case DIF_SUBR_HTONL:
5525 case DIF_SUBR_NTOHL:
5526 #if BYTE_ORDER == BIG_ENDIAN
5527 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5528 #else
5529 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5530 #endif
5531 break;
5532
5533
5534 case DIF_SUBR_HTONLL:
5535 case DIF_SUBR_NTOHLL:
5536 #if BYTE_ORDER == BIG_ENDIAN
5537 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5538 #else
5539 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5540 #endif
5541 break;
5542
5543
5544 case DIF_SUBR_DIRNAME:
5545 case DIF_SUBR_BASENAME: {
5546 char *dest = (char *)mstate->dtms_scratch_ptr;
5547 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5548 uintptr_t src = tupregs[0].dttk_value;
5549 int i, j, len = dtrace_strlen((char *)src, size);
5550 int lastbase = -1, firstbase = -1, lastdir = -1;
5551 int start, end;
5552
5553 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5554 regs[rd] = 0;
5555 break;
5556 }
5557
5558 if (!DTRACE_INSCRATCH(mstate, size)) {
5559 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5560 regs[rd] = 0;
5561 break;
5562 }
5563
5564 /*
5565 		 * The basename and dirname of a zero-length string are
5566 		 * defined to be "."
5567 */
5568 if (len == 0) {
5569 len = 1;
5570 src = (uintptr_t)".";
5571 }
5572
5573 /*
5574 * Start from the back of the string, moving back toward the
5575 * front until we see a character that isn't a slash. That
5576 * character is the last character in the basename.
5577 */
5578 for (i = len - 1; i >= 0; i--) {
5579 if (dtrace_load8(src + i) != '/')
5580 break;
5581 }
5582
5583 if (i >= 0)
5584 lastbase = i;
5585
5586 /*
5587 * Starting from the last character in the basename, move
5588 * towards the front until we find a slash. The character
5589 * that we processed immediately before that is the first
5590 * character in the basename.
5591 */
5592 for (; i >= 0; i--) {
5593 if (dtrace_load8(src + i) == '/')
5594 break;
5595 }
5596
5597 if (i >= 0)
5598 firstbase = i + 1;
5599
5600 /*
5601 * Now keep going until we find a non-slash character. That
5602 * character is the last character in the dirname.
5603 */
5604 for (; i >= 0; i--) {
5605 if (dtrace_load8(src + i) != '/')
5606 break;
5607 }
5608
5609 if (i >= 0)
5610 lastdir = i;
5611
5612 ASSERT(!(lastbase == -1 && firstbase != -1));
5613 ASSERT(!(firstbase == -1 && lastdir != -1));
5614
5615 if (lastbase == -1) {
5616 /*
5617 * We didn't find a non-slash character. We know that
5618 * the length is non-zero, so the whole string must be
5619 * slashes. In either the dirname or the basename
5620 * case, we return '/'.
5621 */
5622 ASSERT(firstbase == -1);
5623 firstbase = lastbase = lastdir = 0;
5624 }
5625
5626 if (firstbase == -1) {
5627 /*
5628 * The entire string consists only of a basename
5629 * component. If we're looking for dirname, we need
5630 * to change our string to be just "."; if we're
5631 * looking for a basename, we'll just set the first
5632 * character of the basename to be 0.
5633 */
5634 if (subr == DIF_SUBR_DIRNAME) {
5635 ASSERT(lastdir == -1);
5636 src = (uintptr_t)".";
5637 lastdir = 0;
5638 } else {
5639 firstbase = 0;
5640 }
5641 }
5642
5643 if (subr == DIF_SUBR_DIRNAME) {
5644 if (lastdir == -1) {
5645 /*
5646 * We know that we have a slash in the name --
5647 * or lastdir would be set to 0, above. And
5648 * because lastdir is -1, we know that this
5649 * slash must be the first character. (That
5650 * is, the full string must be of the form
5651 * "/basename".) In this case, the last
5652 * character of the directory name is 0.
5653 */
5654 lastdir = 0;
5655 }
5656
5657 start = 0;
5658 end = lastdir;
5659 } else {
5660 ASSERT(subr == DIF_SUBR_BASENAME);
5661 ASSERT(firstbase != -1 && lastbase != -1);
5662 start = firstbase;
5663 end = lastbase;
5664 }
5665
5666 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5667 dest[j] = dtrace_load8(src + i);
5668
5669 dest[j] = '\0';
5670 regs[rd] = (uintptr_t)dest;
5671 mstate->dtms_scratch_ptr += size;
5672 break;
5673 }
5674
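	/*
	 * getf(fd) resolves a file descriptor in the current process to its
	 * file_t; it is only permitted with process-level privileges.
	 */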
5675 case DIF_SUBR_GETF: {
5676 uintptr_t fd = tupregs[0].dttk_value;
5677 struct filedesc *fdp;
5678 file_t *fp;
5679
5680 if (!dtrace_priv_proc(state)) {
5681 regs[rd] = 0;
5682 break;
5683 }
5684 fdp = curproc->p_fd;
5685 FILEDESC_SLOCK(fdp);
5686 /*
	 * XXXMJG this looks broken, as no reference is taken on fp.
5688 */
5689 fp = fget_noref(fdp, fd);
5690 mstate->dtms_getf = fp;
5691 regs[rd] = (uintptr_t)fp;
5692 FILEDESC_SUNLOCK(fdp);
5693 break;
5694 }
5695
5696 case DIF_SUBR_CLEANPATH: {
5697 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5698 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5699 uintptr_t src = tupregs[0].dttk_value;
5700 size_t lim;
5701 int i = 0, j = 0;
5702 #ifdef illumos
5703 zone_t *z;
5704 #endif
5705
5706 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5707 regs[rd] = 0;
5708 break;
5709 }
5710
5711 if (!DTRACE_INSCRATCH(mstate, size)) {
5712 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5713 regs[rd] = 0;
5714 break;
5715 }
5716
5717 /*
5718 * Move forward, loading each character.
5719 */
5720 do {
5721 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5722 next:
			if (j + 5 >= size)	/* 5 = sizeof ("/..c") */
5724 break;
5725
5726 if (c != '/') {
5727 dest[j++] = c;
5728 continue;
5729 }
5730
5731 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5732
5733 if (c == '/') {
5734 /*
5735 * We have two slashes -- we can just advance
5736 * to the next character.
5737 */
5738 goto next;
5739 }
5740
5741 if (c != '.') {
5742 /*
5743 * This is not "." and it's not ".." -- we can
5744 * just store the "/" and this character and
5745 * drive on.
5746 */
5747 dest[j++] = '/';
5748 dest[j++] = c;
5749 continue;
5750 }
5751
5752 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5753
5754 if (c == '/') {
5755 /*
5756 * This is a "/./" component. We're not going
5757 * to store anything in the destination buffer;
5758 * we're just going to go to the next component.
5759 */
5760 goto next;
5761 }
5762
5763 if (c != '.') {
5764 /*
5765 * This is not ".." -- we can just store the
5766 * "/." and this character and continue
5767 * processing.
5768 */
5769 dest[j++] = '/';
5770 dest[j++] = '.';
5771 dest[j++] = c;
5772 continue;
5773 }
5774
5775 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5776
5777 if (c != '/' && c != '\0') {
5778 /*
5779 * This is not ".." -- it's "..[mumble]".
5780 * We'll store the "/.." and this character
5781 * and continue processing.
5782 */
5783 dest[j++] = '/';
5784 dest[j++] = '.';
5785 dest[j++] = '.';
5786 dest[j++] = c;
5787 continue;
5788 }
5789
5790 /*
5791 * This is "/../" or "/..\0". We need to back up
5792 * our destination pointer until we find a "/".
5793 */
5794 i--;
5795 while (j != 0 && dest[--j] != '/')
5796 continue;
5797
5798 if (c == '\0')
5799 dest[++j] = '/';
5800 } while (c != '\0');
5801
5802 dest[j] = '\0';
5803
5804 #ifdef illumos
5805 if (mstate->dtms_getf != NULL &&
5806 !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5807 (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5808 /*
5809 * If we've done a getf() as a part of this ECB and we
5810 * don't have kernel access (and we're not in the global
5811 * zone), check if the path we cleaned up begins with
5812 * the zone's root path, and trim it off if so. Note
5813 * that this is an output cleanliness issue, not a
5814 * security issue: knowing one's zone root path does
5815 * not enable privilege escalation.
5816 */
5817 if (strstr(dest, z->zone_rootpath) == dest)
5818 dest += strlen(z->zone_rootpath) - 1;
5819 }
5820 #endif
5821
5822 regs[rd] = (uintptr_t)dest;
5823 mstate->dtms_scratch_ptr += size;
5824 break;
5825 }
5826
5827 case DIF_SUBR_INET_NTOA:
5828 case DIF_SUBR_INET_NTOA6:
5829 case DIF_SUBR_INET_NTOP: {
5830 size_t size;
5831 int af, argi, i;
5832 char *base, *end;
5833
5834 if (subr == DIF_SUBR_INET_NTOP) {
5835 af = (int)tupregs[0].dttk_value;
5836 argi = 1;
5837 } else {
			af = subr == DIF_SUBR_INET_NTOA ? AF_INET : AF_INET6;
5839 argi = 0;
5840 }
5841
5842 if (af == AF_INET) {
5843 ipaddr_t ip4;
5844 uint8_t *ptr8, val;
5845
5846 if (!dtrace_canload(tupregs[argi].dttk_value,
5847 sizeof (ipaddr_t), mstate, vstate)) {
5848 regs[rd] = 0;
5849 break;
5850 }
5851
5852 /*
5853 * Safely load the IPv4 address.
5854 */
5855 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5856
5857 /*
			 * Check that an IPv4 string will fit in scratch.
5859 */
5860 size = INET_ADDRSTRLEN;
5861 if (!DTRACE_INSCRATCH(mstate, size)) {
5862 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5863 regs[rd] = 0;
5864 break;
5865 }
5866 base = (char *)mstate->dtms_scratch_ptr;
5867 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5868
5869 /*
5870 * Stringify as a dotted decimal quad.
5871 */
5872 *end-- = '\0';
5873 ptr8 = (uint8_t *)&ip4;
5874 for (i = 3; i >= 0; i--) {
5875 val = ptr8[i];
5876
5877 if (val == 0) {
5878 *end-- = '0';
5879 } else {
5880 for (; val; val /= 10) {
5881 *end-- = '0' + (val % 10);
5882 }
5883 }
5884
5885 if (i > 0)
5886 *end-- = '.';
5887 }
5888 ASSERT(end + 1 >= base);
5889
5890 } else if (af == AF_INET6) {
5891 struct in6_addr ip6;
5892 int firstzero, tryzero, numzero, v6end;
5893 uint16_t val;
5894 const char digits[] = "0123456789abcdef";
5895
5896 /*
			 * Stringify using RFC 1884 convention 2 -- 16-bit
			 * hexadecimal values with a zero-run compression,
			 * using lower-case hexadecimal digits; e.g.,
			 * fe80::214:4fff:fe0b:76c8.  The IPv4-embedded form
			 * is returned for inet_ntop(); just the IPv4 string
			 * is returned for inet_ntoa6().
5903 */
5904
5905 if (!dtrace_canload(tupregs[argi].dttk_value,
5906 sizeof (struct in6_addr), mstate, vstate)) {
5907 regs[rd] = 0;
5908 break;
5909 }
5910
5911 /*
5912 * Safely load the IPv6 address.
5913 */
5914 dtrace_bcopy(
5915 (void *)(uintptr_t)tupregs[argi].dttk_value,
5916 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5917
5918 /*
			 * Check that an IPv6 string will fit in scratch.
5920 */
5921 size = INET6_ADDRSTRLEN;
5922 if (!DTRACE_INSCRATCH(mstate, size)) {
5923 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5924 regs[rd] = 0;
5925 break;
5926 }
5927 base = (char *)mstate->dtms_scratch_ptr;
5928 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5929 *end-- = '\0';
5930
5931 /*
5932 * Find the longest run of 16 bit zero values
5933 * for the single allowed zero compression - "::".
5934 */
5935 firstzero = -1;
5936 tryzero = -1;
5937 numzero = 1;
5938 for (i = 0; i < sizeof (struct in6_addr); i++) {
5939 #ifdef illumos
5940 if (ip6._S6_un._S6_u8[i] == 0 &&
5941 #else
5942 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5943 #endif
5944 tryzero == -1 && i % 2 == 0) {
5945 tryzero = i;
5946 continue;
5947 }
5948
5949 if (tryzero != -1 &&
5950 #ifdef illumos
5951 (ip6._S6_un._S6_u8[i] != 0 ||
5952 #else
5953 (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5954 #endif
5955 i == sizeof (struct in6_addr) - 1)) {
5956
5957 if (i - tryzero <= numzero) {
5958 tryzero = -1;
5959 continue;
5960 }
5961
5962 firstzero = tryzero;
5963 numzero = i - i % 2 - tryzero;
5964 tryzero = -1;
5965
5966 #ifdef illumos
5967 if (ip6._S6_un._S6_u8[i] == 0 &&
5968 #else
5969 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5970 #endif
5971 i == sizeof (struct in6_addr) - 1)
5972 numzero += 2;
5973 }
5974 }
5975 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5976
5977 /*
5978 * Check for an IPv4 embedded address.
5979 */
5980 v6end = sizeof (struct in6_addr) - 2;
5981 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5982 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5983 for (i = sizeof (struct in6_addr) - 1;
5984 i >= DTRACE_V4MAPPED_OFFSET; i--) {
5985 ASSERT(end >= base);
5986
5987 #ifdef illumos
5988 val = ip6._S6_un._S6_u8[i];
5989 #else
5990 val = ip6.__u6_addr.__u6_addr8[i];
5991 #endif
5992
5993 if (val == 0) {
5994 *end-- = '0';
5995 } else {
5996 for (; val; val /= 10) {
5997 *end-- = '0' + val % 10;
5998 }
5999 }
6000
6001 if (i > DTRACE_V4MAPPED_OFFSET)
6002 *end-- = '.';
6003 }
6004
6005 if (subr == DIF_SUBR_INET_NTOA6)
6006 goto inetout;
6007
6008 /*
6009 * Set v6end to skip the IPv4 address that
6010 * we have already stringified.
6011 */
6012 v6end = 10;
6013 }
6014
6015 /*
6016 * Build the IPv6 string by working through the
6017 * address in reverse.
6018 */
6019 for (i = v6end; i >= 0; i -= 2) {
6020 ASSERT(end >= base);
6021
6022 if (i == firstzero + numzero - 2) {
6023 *end-- = ':';
6024 *end-- = ':';
6025 i -= numzero - 2;
6026 continue;
6027 }
6028
6029 if (i < 14 && i != firstzero - 2)
6030 *end-- = ':';
6031
6032 #ifdef illumos
6033 val = (ip6._S6_un._S6_u8[i] << 8) +
6034 ip6._S6_un._S6_u8[i + 1];
6035 #else
6036 val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
6037 ip6.__u6_addr.__u6_addr8[i + 1];
6038 #endif
6039
6040 if (val == 0) {
6041 *end-- = '0';
6042 } else {
6043 for (; val; val /= 16) {
6044 *end-- = digits[val % 16];
6045 }
6046 }
6047 }
6048 ASSERT(end + 1 >= base);
6049
6050 } else {
6051 /*
			 * The user didn't use AF_INET or AF_INET6.
6053 */
6054 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6055 regs[rd] = 0;
6056 break;
6057 }
6058
6059 inetout: regs[rd] = (uintptr_t)end + 1;
6060 mstate->dtms_scratch_ptr += size;
6061 break;
6062 }
6063
	case DIF_SUBR_MEMREF: {
		uintptr_t size = 2 * sizeof (uintptr_t);
		uintptr_t *memref = (uintptr_t *)P2ROUNDUP(
		    mstate->dtms_scratch_ptr, sizeof (uintptr_t));
		size_t scratch_size = ((uintptr_t)memref -
		    mstate->dtms_scratch_ptr) + size;

		/*
		 * Check that the memref pair fits in scratch before
		 * writing it.
		 */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/* address and length */
		memref[0] = tupregs[0].dttk_value;
		memref[1] = tupregs[1].dttk_value;

		regs[rd] = (uintptr_t)memref;
		mstate->dtms_scratch_ptr += scratch_size;
		break;
	}
6077
6078 #ifndef illumos
6079 case DIF_SUBR_MEMSTR: {
6080 char *str = (char *)mstate->dtms_scratch_ptr;
6081 uintptr_t mem = tupregs[0].dttk_value;
6082 char c = tupregs[1].dttk_value;
6083 size_t size = tupregs[2].dttk_value;
6084 uint8_t n;
6085 int i;
6086
6087 regs[rd] = 0;
6088
6089 if (size == 0)
6090 break;
6091
6092 if (!dtrace_canload(mem, size - 1, mstate, vstate))
6093 break;
6094
6095 if (!DTRACE_INSCRATCH(mstate, size)) {
6096 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6097 break;
6098 }
6099
6100 if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
6101 *flags |= CPU_DTRACE_ILLOP;
6102 break;
6103 }
6104
6105 for (i = 0; i < size - 1; i++) {
6106 n = dtrace_load8(mem++);
6107 str[i] = (n == 0) ? c : n;
6108 }
6109 str[size - 1] = 0;
6110
6111 regs[rd] = (uintptr_t)str;
6112 mstate->dtms_scratch_ptr += size;
6113 break;
6114 }
6115 #endif
6116 }
6117 }
6118
6119 /*
6120 * Emulate the execution of DTrace IR instructions specified by the given
 * DIF object.  This function is deliberately devoid of assertions, as all of
6122 * the necessary checks are handled by a call to dtrace_difo_validate().
6123 */
6124 static uint64_t
6125 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
6126 dtrace_vstate_t *vstate, dtrace_state_t *state)
6127 {
6128 const dif_instr_t *text = difo->dtdo_buf;
6129 const uint_t textlen = difo->dtdo_len;
6130 const char *strtab = difo->dtdo_strtab;
6131 const uint64_t *inttab = difo->dtdo_inttab;
6132
6133 uint64_t rval = 0;
6134 dtrace_statvar_t *svar;
6135 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
6136 dtrace_difv_t *v;
6137 volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6138 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
6139
6140 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
6141 uint64_t regs[DIF_DIR_NREGS];
6142 uint64_t *tmp;
6143
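	/*
	 * The emulated condition codes mimic a CPU's N (negative), Z (zero),
	 * V (overflow) and C (carry) flags: cmp, tst and scmp set them, and
	 * the conditional branches consume them.
	 */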
6144 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
6145 int64_t cc_r;
6146 uint_t pc = 0, id, opc = 0;
6147 uint8_t ttop = 0;
6148 dif_instr_t instr;
6149 uint_t r1, r2, rd;
6150
6151 /*
6152 * We stash the current DIF object into the machine state: we need it
6153 * for subsequent access checking.
6154 */
6155 mstate->dtms_difo = difo;
6156
6157 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
6158
6159 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
6160 opc = pc;
6161
6162 instr = text[pc++];
6163 r1 = DIF_INSTR_R1(instr);
6164 r2 = DIF_INSTR_R2(instr);
6165 rd = DIF_INSTR_RD(instr);
6166
6167 switch (DIF_INSTR_OP(instr)) {
6168 case DIF_OP_OR:
6169 regs[rd] = regs[r1] | regs[r2];
6170 break;
6171 case DIF_OP_XOR:
6172 regs[rd] = regs[r1] ^ regs[r2];
6173 break;
6174 case DIF_OP_AND:
6175 regs[rd] = regs[r1] & regs[r2];
6176 break;
6177 case DIF_OP_SLL:
6178 regs[rd] = regs[r1] << regs[r2];
6179 break;
6180 case DIF_OP_SRL:
6181 regs[rd] = regs[r1] >> regs[r2];
6182 break;
6183 case DIF_OP_SUB:
6184 regs[rd] = regs[r1] - regs[r2];
6185 break;
6186 case DIF_OP_ADD:
6187 regs[rd] = regs[r1] + regs[r2];
6188 break;
6189 case DIF_OP_MUL:
6190 regs[rd] = regs[r1] * regs[r2];
6191 break;
6192 case DIF_OP_SDIV:
6193 if (regs[r2] == 0) {
6194 regs[rd] = 0;
6195 *flags |= CPU_DTRACE_DIVZERO;
6196 } else {
6197 regs[rd] = (int64_t)regs[r1] /
6198 (int64_t)regs[r2];
6199 }
6200 break;
6201
6202 case DIF_OP_UDIV:
6203 if (regs[r2] == 0) {
6204 regs[rd] = 0;
6205 *flags |= CPU_DTRACE_DIVZERO;
6206 } else {
6207 regs[rd] = regs[r1] / regs[r2];
6208 }
6209 break;
6210
6211 case DIF_OP_SREM:
6212 if (regs[r2] == 0) {
6213 regs[rd] = 0;
6214 *flags |= CPU_DTRACE_DIVZERO;
6215 } else {
6216 regs[rd] = (int64_t)regs[r1] %
6217 (int64_t)regs[r2];
6218 }
6219 break;
6220
6221 case DIF_OP_UREM:
6222 if (regs[r2] == 0) {
6223 regs[rd] = 0;
6224 *flags |= CPU_DTRACE_DIVZERO;
6225 } else {
6226 regs[rd] = regs[r1] % regs[r2];
6227 }
6228 break;
6229
6230 case DIF_OP_NOT:
6231 regs[rd] = ~regs[r1];
6232 break;
6233 case DIF_OP_MOV:
6234 regs[rd] = regs[r1];
6235 break;
6236 case DIF_OP_CMP:
6237 cc_r = regs[r1] - regs[r2];
6238 cc_n = cc_r < 0;
6239 cc_z = cc_r == 0;
6240 cc_v = 0;
6241 cc_c = regs[r1] < regs[r2];
6242 break;
6243 case DIF_OP_TST:
6244 cc_n = cc_v = cc_c = 0;
6245 cc_z = regs[r1] == 0;
6246 break;
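		/*
		 * ba branches unconditionally; the conditional branches
		 * below follow the usual condition-code encodings: equality
		 * tests use Z, signed comparisons combine N and V (N ^ V
		 * means "less than"), and unsigned comparisons use C.
		 */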
6247 case DIF_OP_BA:
6248 pc = DIF_INSTR_LABEL(instr);
6249 break;
6250 case DIF_OP_BE:
6251 if (cc_z)
6252 pc = DIF_INSTR_LABEL(instr);
6253 break;
6254 case DIF_OP_BNE:
6255 if (cc_z == 0)
6256 pc = DIF_INSTR_LABEL(instr);
6257 break;
6258 case DIF_OP_BG:
6259 if ((cc_z | (cc_n ^ cc_v)) == 0)
6260 pc = DIF_INSTR_LABEL(instr);
6261 break;
6262 case DIF_OP_BGU:
6263 if ((cc_c | cc_z) == 0)
6264 pc = DIF_INSTR_LABEL(instr);
6265 break;
6266 case DIF_OP_BGE:
6267 if ((cc_n ^ cc_v) == 0)
6268 pc = DIF_INSTR_LABEL(instr);
6269 break;
6270 case DIF_OP_BGEU:
6271 if (cc_c == 0)
6272 pc = DIF_INSTR_LABEL(instr);
6273 break;
6274 case DIF_OP_BL:
6275 if (cc_n ^ cc_v)
6276 pc = DIF_INSTR_LABEL(instr);
6277 break;
6278 case DIF_OP_BLU:
6279 if (cc_c)
6280 pc = DIF_INSTR_LABEL(instr);
6281 break;
6282 case DIF_OP_BLE:
6283 if (cc_z | (cc_n ^ cc_v))
6284 pc = DIF_INSTR_LABEL(instr);
6285 break;
6286 case DIF_OP_BLEU:
6287 if (cc_c | cc_z)
6288 pc = DIF_INSTR_LABEL(instr);
6289 break;
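		/*
		 * The rld* variants validate at run time that the source
		 * address may be loaded before falling through to the
		 * corresponding ld* load.
		 */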
6290 case DIF_OP_RLDSB:
6291 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6292 break;
6293 /*FALLTHROUGH*/
6294 case DIF_OP_LDSB:
6295 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6296 break;
6297 case DIF_OP_RLDSH:
6298 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6299 break;
6300 /*FALLTHROUGH*/
6301 case DIF_OP_LDSH:
6302 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6303 break;
6304 case DIF_OP_RLDSW:
6305 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6306 break;
6307 /*FALLTHROUGH*/
6308 case DIF_OP_LDSW:
6309 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6310 break;
6311 case DIF_OP_RLDUB:
6312 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6313 break;
6314 /*FALLTHROUGH*/
6315 case DIF_OP_LDUB:
6316 regs[rd] = dtrace_load8(regs[r1]);
6317 break;
6318 case DIF_OP_RLDUH:
6319 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6320 break;
6321 /*FALLTHROUGH*/
6322 case DIF_OP_LDUH:
6323 regs[rd] = dtrace_load16(regs[r1]);
6324 break;
6325 case DIF_OP_RLDUW:
6326 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6327 break;
6328 /*FALLTHROUGH*/
6329 case DIF_OP_LDUW:
6330 regs[rd] = dtrace_load32(regs[r1]);
6331 break;
6332 case DIF_OP_RLDX:
6333 if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6334 break;
6335 /*FALLTHROUGH*/
6336 case DIF_OP_LDX:
6337 regs[rd] = dtrace_load64(regs[r1]);
6338 break;
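		/*
		 * The uld* variants load from user addresses; each sets
		 * CPU_DTRACE_NOFAULT around the fetch so that a fault is
		 * recorded in the CPU flags instead of being taken.
		 */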
6339 case DIF_OP_ULDSB:
6340 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6341 regs[rd] = (int8_t)
6342 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6343 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6344 break;
6345 case DIF_OP_ULDSH:
6346 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6347 regs[rd] = (int16_t)
6348 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6349 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6350 break;
6351 case DIF_OP_ULDSW:
6352 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6353 regs[rd] = (int32_t)
6354 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6355 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6356 break;
6357 case DIF_OP_ULDUB:
6358 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6359 regs[rd] =
6360 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6361 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6362 break;
6363 case DIF_OP_ULDUH:
6364 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6365 regs[rd] =
6366 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6367 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6368 break;
6369 case DIF_OP_ULDUW:
6370 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6371 regs[rd] =
6372 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6373 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6374 break;
6375 case DIF_OP_ULDX:
6376 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6377 regs[rd] =
6378 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6379 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6380 break;
6381 case DIF_OP_RET:
6382 rval = regs[rd];
6383 pc = textlen;
6384 break;
6385 case DIF_OP_NOP:
6386 break;
6387 case DIF_OP_SETX:
6388 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6389 break;
6390 case DIF_OP_SETS:
6391 regs[rd] = (uint64_t)(uintptr_t)
6392 (strtab + DIF_INSTR_STRING(instr));
6393 break;
6394 case DIF_OP_SCMP: {
6395 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6396 uintptr_t s1 = regs[r1];
6397 uintptr_t s2 = regs[r2];
6398 size_t lim1, lim2;
6399
			/*
			 * If one of the strings is NULL, its limit must not
			 * be 0: dtrace_strncmp() would then compare zero
			 * characters and report a spurious match.  We instead
			 * default each limit to 1, so that dtrace_strncmp()
			 * treats a NULL pointer as an empty one-character
			 * string.
			 */
6406 lim1 = lim2 = 1;
6407
6408 if (s1 != 0 &&
6409 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
6410 break;
6411 if (s2 != 0 &&
6412 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
6413 break;
6414
6415 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
6416 MIN(lim1, lim2));
6417
6418 cc_n = cc_r < 0;
6419 cc_z = cc_r == 0;
6420 cc_v = cc_c = 0;
6421 break;
6422 }
6423 case DIF_OP_LDGA:
6424 regs[rd] = dtrace_dif_variable(mstate, state,
6425 r1, regs[r2]);
6426 break;
6427 case DIF_OP_LDGS:
6428 id = DIF_INSTR_VAR(instr);
6429
6430 if (id >= DIF_VAR_OTHER_UBASE) {
6431 uintptr_t a;
6432
6433 id -= DIF_VAR_OTHER_UBASE;
6434 svar = vstate->dtvs_globals[id];
6435 ASSERT(svar != NULL);
6436 v = &svar->dtsv_var;
6437
6438 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6439 regs[rd] = svar->dtsv_data;
6440 break;
6441 }
6442
6443 a = (uintptr_t)svar->dtsv_data;
6444
6445 if (*(uint8_t *)a == UINT8_MAX) {
6446 /*
6447 * If the 0th byte is set to UINT8_MAX
6448 * then this is to be treated as a
6449 * reference to a NULL variable.
6450 */
6451 regs[rd] = 0;
6452 } else {
6453 regs[rd] = a + sizeof (uint64_t);
6454 }
6455
6456 break;
6457 }
6458
6459 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6460 break;
6461
6462 case DIF_OP_STGS:
6463 id = DIF_INSTR_VAR(instr);
6464
6465 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6466 id -= DIF_VAR_OTHER_UBASE;
6467
6468 VERIFY(id < vstate->dtvs_nglobals);
6469 svar = vstate->dtvs_globals[id];
6470 ASSERT(svar != NULL);
6471 v = &svar->dtsv_var;
6472
6473 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6474 uintptr_t a = (uintptr_t)svar->dtsv_data;
6475 size_t lim;
6476
6477 ASSERT(a != 0);
6478 ASSERT(svar->dtsv_size != 0);
6479
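				/*
				 * Storing NULL sets the sentinel byte to
				 * UINT8_MAX to mark the variable as NULL;
				 * otherwise the sentinel is cleared and the
				 * value is copied in past it.
				 */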
6480 if (regs[rd] == 0) {
6481 *(uint8_t *)a = UINT8_MAX;
6482 break;
6483 } else {
6484 *(uint8_t *)a = 0;
6485 a += sizeof (uint64_t);
6486 }
6487 if (!dtrace_vcanload(
6488 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6489 &lim, mstate, vstate))
6490 break;
6491
6492 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6493 (void *)a, &v->dtdv_type, lim);
6494 break;
6495 }
6496
6497 svar->dtsv_data = regs[rd];
6498 break;
6499
6500 case DIF_OP_LDTA:
6501 /*
6502 * There are no DTrace built-in thread-local arrays at
6503 * present. This opcode is saved for future work.
6504 */
6505 *flags |= CPU_DTRACE_ILLOP;
6506 regs[rd] = 0;
6507 break;
6508
6509 case DIF_OP_LDLS:
6510 id = DIF_INSTR_VAR(instr);
6511
6512 if (id < DIF_VAR_OTHER_UBASE) {
6513 /*
6514 * For now, this has no meaning.
6515 */
6516 regs[rd] = 0;
6517 break;
6518 }
6519
6520 id -= DIF_VAR_OTHER_UBASE;
6521
6522 ASSERT(id < vstate->dtvs_nlocals);
6523 ASSERT(vstate->dtvs_locals != NULL);
6524
6525 svar = vstate->dtvs_locals[id];
6526 ASSERT(svar != NULL);
6527 v = &svar->dtsv_var;
6528
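			/*
			 * Local variables are allocated one slot per CPU,
			 * each slot prefixed by a 64-bit sentinel word, so
			 * index the allocation by curcpu.
			 */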
6529 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6530 uintptr_t a = (uintptr_t)svar->dtsv_data;
6531 size_t sz = v->dtdv_type.dtdt_size;
6532 size_t lim;
6533
6534 sz += sizeof (uint64_t);
6535 ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);
6536 a += curcpu * sz;
6537
6538 if (*(uint8_t *)a == UINT8_MAX) {
6539 /*
6540 * If the 0th byte is set to UINT8_MAX
6541 * then this is to be treated as a
6542 * reference to a NULL variable.
6543 */
6544 regs[rd] = 0;
6545 } else {
6546 regs[rd] = a + sizeof (uint64_t);
6547 }
6548
6549 break;
6550 }
6551
6552 ASSERT(svar->dtsv_size ==
6553 (mp_maxid + 1) * sizeof (uint64_t));
6554 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6555 regs[rd] = tmp[curcpu];
6556 break;
6557
6558 case DIF_OP_STLS:
6559 id = DIF_INSTR_VAR(instr);
6560
6561 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6562 id -= DIF_VAR_OTHER_UBASE;
6563 VERIFY(id < vstate->dtvs_nlocals);
6564
6565 ASSERT(vstate->dtvs_locals != NULL);
6566 svar = vstate->dtvs_locals[id];
6567 ASSERT(svar != NULL);
6568 v = &svar->dtsv_var;
6569
6570 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6571 uintptr_t a = (uintptr_t)svar->dtsv_data;
6572 size_t sz = v->dtdv_type.dtdt_size;
6573 size_t lim;
6574
6575 sz += sizeof (uint64_t);
6576 ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);
6577 a += curcpu * sz;
6578
6579 if (regs[rd] == 0) {
6580 *(uint8_t *)a = UINT8_MAX;
6581 break;
6582 } else {
6583 *(uint8_t *)a = 0;
6584 a += sizeof (uint64_t);
6585 }
6586
6587 if (!dtrace_vcanload(
6588 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6589 &lim, mstate, vstate))
6590 break;
6591
6592 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6593 (void *)a, &v->dtdv_type, lim);
6594 break;
6595 }
6596
6597 ASSERT(svar->dtsv_size ==
6598 (mp_maxid + 1) * sizeof (uint64_t));
6599 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6600 tmp[curcpu] = regs[rd];
6601 break;
6602
6603 case DIF_OP_LDTS: {
6604 dtrace_dynvar_t *dvar;
6605 dtrace_key_t *key;
6606
6607 id = DIF_INSTR_VAR(instr);
6608 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6609 id -= DIF_VAR_OTHER_UBASE;
6610 v = &vstate->dtvs_tlocals[id];
6611
6612 key = &tupregs[DIF_DTR_NREGS];
6613 key[0].dttk_value = (uint64_t)id;
6614 key[0].dttk_size = 0;
6615 DTRACE_TLS_THRKEY(key[1].dttk_value);
6616 key[1].dttk_size = 0;
6617
6618 dvar = dtrace_dynvar(dstate, 2, key,
6619 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6620 mstate, vstate);
6621
6622 if (dvar == NULL) {
6623 regs[rd] = 0;
6624 break;
6625 }
6626
6627 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6628 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6629 } else {
6630 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6631 }
6632
6633 break;
6634 }
6635
6636 case DIF_OP_STTS: {
6637 dtrace_dynvar_t *dvar;
6638 dtrace_key_t *key;
6639
6640 id = DIF_INSTR_VAR(instr);
6641 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6642 id -= DIF_VAR_OTHER_UBASE;
6643 VERIFY(id < vstate->dtvs_ntlocals);
6644
6645 key = &tupregs[DIF_DTR_NREGS];
6646 key[0].dttk_value = (uint64_t)id;
6647 key[0].dttk_size = 0;
6648 DTRACE_TLS_THRKEY(key[1].dttk_value);
6649 key[1].dttk_size = 0;
6650 v = &vstate->dtvs_tlocals[id];
6651
6652 dvar = dtrace_dynvar(dstate, 2, key,
6653 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6654 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6655 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6656 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6657
6658 /*
6659 * Given that we're storing to thread-local data,
6660 * we need to flush our predicate cache.
6661 */
6662 curthread->t_predcache = 0;
6663
6664 if (dvar == NULL)
6665 break;
6666
6667 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6668 size_t lim;
6669
6670 if (!dtrace_vcanload(
6671 (void *)(uintptr_t)regs[rd],
6672 &v->dtdv_type, &lim, mstate, vstate))
6673 break;
6674
6675 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6676 dvar->dtdv_data, &v->dtdv_type, lim);
6677 } else {
6678 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6679 }
6680
6681 break;
6682 }
6683
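		/*
		 * sra is an arithmetic (sign-extending) right shift, in
		 * contrast to the logical srl above.
		 */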
6684 case DIF_OP_SRA:
6685 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6686 break;
6687
6688 case DIF_OP_CALL:
6689 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6690 regs, tupregs, ttop, mstate, state);
6691 break;
6692
6693 case DIF_OP_PUSHTR:
6694 if (ttop == DIF_DTR_NREGS) {
6695 *flags |= CPU_DTRACE_TUPOFLOW;
6696 break;
6697 }
6698
6699 if (r1 == DIF_TYPE_STRING) {
6700 /*
6701 * If this is a string type and the size is 0,
6702 * we'll use the system-wide default string
6703 * size. Note that we are _not_ looking at
6704 * the value of the DTRACEOPT_STRSIZE option;
6705 * had this been set, we would expect to have
6706 * a non-zero size value in the "pushtr".
6707 */
6708 tupregs[ttop].dttk_size =
6709 dtrace_strlen((char *)(uintptr_t)regs[rd],
6710 regs[r2] ? regs[r2] :
6711 dtrace_strsize_default) + 1;
6712 } else {
6713 if (regs[r2] > LONG_MAX) {
6714 *flags |= CPU_DTRACE_ILLOP;
6715 break;
6716 }
6717
6718 tupregs[ttop].dttk_size = regs[r2];
6719 }
6720
6721 tupregs[ttop++].dttk_value = regs[rd];
6722 break;
6723
6724 case DIF_OP_PUSHTV:
6725 if (ttop == DIF_DTR_NREGS) {
6726 *flags |= CPU_DTRACE_TUPOFLOW;
6727 break;
6728 }
6729
6730 tupregs[ttop].dttk_value = regs[rd];
6731 tupregs[ttop++].dttk_size = 0;
6732 break;
6733
6734 case DIF_OP_POPTS:
6735 if (ttop != 0)
6736 ttop--;
6737 break;
6738
6739 case DIF_OP_FLUSHTS:
6740 ttop = 0;
6741 break;
6742
6743 case DIF_OP_LDGAA:
6744 case DIF_OP_LDTAA: {
6745 dtrace_dynvar_t *dvar;
6746 dtrace_key_t *key = tupregs;
6747 uint_t nkeys = ttop;
6748
6749 id = DIF_INSTR_VAR(instr);
6750 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6751 id -= DIF_VAR_OTHER_UBASE;
6752
6753 key[nkeys].dttk_value = (uint64_t)id;
6754 key[nkeys++].dttk_size = 0;
6755
6756 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6757 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6758 key[nkeys++].dttk_size = 0;
6759 VERIFY(id < vstate->dtvs_ntlocals);
6760 v = &vstate->dtvs_tlocals[id];
6761 } else {
6762 VERIFY(id < vstate->dtvs_nglobals);
6763 v = &vstate->dtvs_globals[id]->dtsv_var;
6764 }
6765
6766 dvar = dtrace_dynvar(dstate, nkeys, key,
6767 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6768 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6769 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6770
6771 if (dvar == NULL) {
6772 regs[rd] = 0;
6773 break;
6774 }
6775
6776 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6777 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6778 } else {
6779 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6780 }
6781
6782 break;
6783 }
6784
6785 case DIF_OP_STGAA:
6786 case DIF_OP_STTAA: {
6787 dtrace_dynvar_t *dvar;
6788 dtrace_key_t *key = tupregs;
6789 uint_t nkeys = ttop;
6790
6791 id = DIF_INSTR_VAR(instr);
6792 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6793 id -= DIF_VAR_OTHER_UBASE;
6794
6795 key[nkeys].dttk_value = (uint64_t)id;
6796 key[nkeys++].dttk_size = 0;
6797
6798 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6799 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6800 key[nkeys++].dttk_size = 0;
6801 VERIFY(id < vstate->dtvs_ntlocals);
6802 v = &vstate->dtvs_tlocals[id];
6803 } else {
6804 VERIFY(id < vstate->dtvs_nglobals);
6805 v = &vstate->dtvs_globals[id]->dtsv_var;
6806 }
6807
6808 dvar = dtrace_dynvar(dstate, nkeys, key,
6809 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6810 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6811 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6812 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6813
6814 if (dvar == NULL)
6815 break;
6816
6817 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6818 size_t lim;
6819
6820 if (!dtrace_vcanload(
6821 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6822 &lim, mstate, vstate))
6823 break;
6824
6825 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6826 dvar->dtdv_data, &v->dtdv_type, lim);
6827 } else {
6828 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6829 }
6830
6831 break;
6832 }
6833
6834 case DIF_OP_ALLOCS: {
6835 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6836 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6837
6838 /*
6839 * Rounding up the user allocation size could have
6840 * overflowed large, bogus allocations (like -1ULL) to
6841 * 0.
6842 */
6843 if (size < regs[r1] ||
6844 !DTRACE_INSCRATCH(mstate, size)) {
6845 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6846 regs[rd] = 0;
6847 break;
6848 }
6849
6850 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6851 mstate->dtms_scratch_ptr += size;
6852 regs[rd] = ptr;
6853 break;
6854 }
6855
6856 case DIF_OP_COPYS:
6857 if (!dtrace_canstore(regs[rd], regs[r2],
6858 mstate, vstate)) {
6859 *flags |= CPU_DTRACE_BADADDR;
6860 *illval = regs[rd];
6861 break;
6862 }
6863
6864 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6865 break;
6866
6867 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6868 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6869 break;
6870
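		/*
		 * The st* operations validate the destination address with
		 * dtrace_canstore() and, for multi-byte stores, check its
		 * alignment before writing.
		 */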
6871 case DIF_OP_STB:
6872 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6873 *flags |= CPU_DTRACE_BADADDR;
6874 *illval = regs[rd];
6875 break;
6876 }
6877 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6878 break;
6879
6880 case DIF_OP_STH:
6881 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6882 *flags |= CPU_DTRACE_BADADDR;
6883 *illval = regs[rd];
6884 break;
6885 }
6886 if (regs[rd] & 1) {
6887 *flags |= CPU_DTRACE_BADALIGN;
6888 *illval = regs[rd];
6889 break;
6890 }
6891 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6892 break;
6893
6894 case DIF_OP_STW:
6895 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6896 *flags |= CPU_DTRACE_BADADDR;
6897 *illval = regs[rd];
6898 break;
6899 }
6900 if (regs[rd] & 3) {
6901 *flags |= CPU_DTRACE_BADALIGN;
6902 *illval = regs[rd];
6903 break;
6904 }
6905 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6906 break;
6907
6908 case DIF_OP_STX:
6909 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6910 *flags |= CPU_DTRACE_BADADDR;
6911 *illval = regs[rd];
6912 break;
6913 }
6914 if (regs[rd] & 7) {
6915 *flags |= CPU_DTRACE_BADALIGN;
6916 *illval = regs[rd];
6917 break;
6918 }
6919 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6920 break;
6921 }
6922 }
6923
6924 if (!(*flags & CPU_DTRACE_FAULT))
6925 return (rval);
6926
6927 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6928 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6929
6930 return (0);
6931 }
6932
6933 static void
6934 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6935 {
6936 dtrace_probe_t *probe = ecb->dte_probe;
6937 dtrace_provider_t *prov = probe->dtpr_provider;
6938 char c[DTRACE_FULLNAMELEN + 80], *str;
6939 char *msg = "dtrace: breakpoint action at probe ";
6940 char *ecbmsg = " (ecb ";
6941 uintptr_t val = (uintptr_t)ecb;
6942 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6943
6944 if (dtrace_destructive_disallow)
6945 return;
6946
6947 /*
6948 * It's impossible to be taking action on the NULL probe.
6949 */
6950 ASSERT(probe != NULL);
6951
6952 /*
6953 * This is a poor man's (destitute man's?) sprintf(): we want to
6954 * print the provider name, module name, function name and name of
6955 * the probe, along with the hex address of the ECB with the breakpoint
6956 * action -- all of which we must place in the character buffer by
6957 * hand.
6958 */
6959 while (*msg != '\0')
6960 c[i++] = *msg++;
6961
6962 for (str = prov->dtpv_name; *str != '\0'; str++)
6963 c[i++] = *str;
6964 c[i++] = ':';
6965
6966 for (str = probe->dtpr_mod; *str != '\0'; str++)
6967 c[i++] = *str;
6968 c[i++] = ':';
6969
6970 for (str = probe->dtpr_func; *str != '\0'; str++)
6971 c[i++] = *str;
6972 c[i++] = ':';
6973
6974 for (str = probe->dtpr_name; *str != '\0'; str++)
6975 c[i++] = *str;
6976
6977 while (*ecbmsg != '\0')
6978 c[i++] = *ecbmsg++;
6979
6980 while (shift >= 0) {
6981 size_t mask = (size_t)0xf << shift;
6982
6983 if (val >= ((size_t)1 << shift))
6984 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6985 shift -= 4;
6986 }
6987
6988 c[i++] = ')';
6989 c[i] = '\0';
6990
6991 #ifdef illumos
6992 debug_enter(c);
6993 #else
6994 kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6995 #endif
6996 }
6997
6998 static void
6999 dtrace_action_panic(dtrace_ecb_t *ecb)
7000 {
7001 dtrace_probe_t *probe = ecb->dte_probe;
7002
7003 /*
7004 * It's impossible to be taking action on the NULL probe.
7005 */
7006 ASSERT(probe != NULL);
7007
7008 if (dtrace_destructive_disallow)
7009 return;
7010
7011 if (dtrace_panicked != NULL)
7012 return;
7013
7014 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
7015 return;
7016
7017 /*
7018 * We won the right to panic. (We want to be sure that only one
7019 * thread calls panic() from dtrace_probe(), and that panic() is
7020 * called exactly once.)
7021 */
7022 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
7023 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
7024 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
7025 }
7026
7027 static void
7028 dtrace_action_raise(uint64_t sig)
7029 {
7030 if (dtrace_destructive_disallow)
7031 return;
7032
7033 if (sig >= NSIG) {
7034 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
7035 return;
7036 }
7037
7038 #ifdef illumos
7039 /*
7040 * raise() has a queue depth of 1 -- we ignore all subsequent
7041 * invocations of the raise() action.
7042 */
7043 if (curthread->t_dtrace_sig == 0)
7044 curthread->t_dtrace_sig = (uint8_t)sig;
7045
7046 curthread->t_sig_check = 1;
7047 aston(curthread);
7048 #else
7049 struct proc *p = curproc;
7050 PROC_LOCK(p);
7051 kern_psignal(p, sig);
7052 PROC_UNLOCK(p);
7053 #endif
7054 }
7055
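/*
 * Stop the current process: on illumos by posting a stop request to the
 * current thread, on FreeBSD by sending SIGSTOP.
 */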
7056 static void
7057 dtrace_action_stop(void)
7058 {
7059 if (dtrace_destructive_disallow)
7060 return;
7061
7062 #ifdef illumos
7063 if (!curthread->t_dtrace_stop) {
7064 curthread->t_dtrace_stop = 1;
7065 curthread->t_sig_check = 1;
7066 aston(curthread);
7067 }
7068 #else
7069 struct proc *p = curproc;
7070 PROC_LOCK(p);
7071 kern_psignal(p, SIGSTOP);
7072 PROC_UNLOCK(p);
7073 #endif
7074 }
7075
7076 static void
7077 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
7078 {
7079 hrtime_t now;
7080 volatile uint16_t *flags;
7081 #ifdef illumos
7082 cpu_t *cpu = CPU;
7083 #else
7084 cpu_t *cpu = &solaris_cpu[curcpu];
7085 #endif
7086
7087 if (dtrace_destructive_disallow)
7088 return;
7089
7090 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
7091
7092 now = dtrace_gethrtime();
7093
7094 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
7095 /*
7096 * We need to advance the mark to the current time.
7097 */
7098 cpu->cpu_dtrace_chillmark = now;
7099 cpu->cpu_dtrace_chilled = 0;
7100 }
7101
7102 /*
7103 * Now check to see if the requested chill time would take us over
7104 * the maximum amount of time allowed in the chill interval. (Or
7105 * worse, if the calculation itself induces overflow.)
7106 */
7107 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
7108 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
7109 *flags |= CPU_DTRACE_ILLOP;
7110 return;
7111 }
7112
7113 while (dtrace_gethrtime() - now < val)
7114 continue;
7115
7116 /*
7117 * Normally, we assure that the value of the variable "timestamp" does
7118 * not change within an ECB. The presence of chill() represents an
7119 * exception to this rule, however.
7120 */
7121 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
7122 cpu->cpu_dtrace_chilled += val;
7123 }
7124
7125 static void
7126 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
7127 uint64_t *buf, uint64_t arg)
7128 {
7129 int nframes = DTRACE_USTACK_NFRAMES(arg);
7130 int strsize = DTRACE_USTACK_STRSIZE(arg);
7131 uint64_t *pcs = &buf[1], *fps;
7132 char *str = (char *)&pcs[nframes];
7133 int size, offs = 0, i, j;
7134 size_t rem;
7135 uintptr_t old = mstate->dtms_scratch_ptr, saved;
7136 uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
7137 char *sym;
7138
7139 /*
	 * The caller should have taken a faster path if string space has not
	 * been allocated.
7142 */
7143 ASSERT(strsize != 0);
7144
7145 /*
7146 * We will first allocate some temporary space for the frame pointers.
7147 */
7148 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
7149 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
7150 (nframes * sizeof (uint64_t));
7151
7152 if (!DTRACE_INSCRATCH(mstate, size)) {
7153 /*
7154 * Not enough room for our frame pointers -- need to indicate
7155 * that we ran out of scratch space.
7156 */
7157 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
7158 return;
7159 }
7160
7161 mstate->dtms_scratch_ptr += size;
7162 saved = mstate->dtms_scratch_ptr;
7163
7164 /*
7165 * Now get a stack with both program counters and frame pointers.
7166 */
7167 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7168 dtrace_getufpstack(buf, fps, nframes + 1);
7169 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7170
7171 /*
7172 * If that faulted, we're cooked.
7173 */
7174 if (*flags & CPU_DTRACE_FAULT)
7175 goto out;
7176
7177 /*
7178 * Now we want to walk up the stack, calling the USTACK helper. For
7179 * each iteration, we restore the scratch pointer.
7180 */
7181 for (i = 0; i < nframes; i++) {
7182 mstate->dtms_scratch_ptr = saved;
7183
7184 if (offs >= strsize)
7185 break;
7186
7187 sym = (char *)(uintptr_t)dtrace_helper(
7188 DTRACE_HELPER_ACTION_USTACK,
7189 mstate, state, pcs[i], fps[i]);
7190
7191 /*
7192 * If we faulted while running the helper, we're going to
7193 * clear the fault and null out the corresponding string.
7194 */
7195 if (*flags & CPU_DTRACE_FAULT) {
7196 *flags &= ~CPU_DTRACE_FAULT;
7197 str[offs++] = '\0';
7198 continue;
7199 }
7200
7201 if (sym == NULL) {
7202 str[offs++] = '\0';
7203 continue;
7204 }
7205
7206 if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate,
7207 &(state->dts_vstate))) {
7208 str[offs++] = '\0';
7209 continue;
7210 }
7211
7212 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7213
7214 /*
7215 * Now copy in the string that the helper returned to us.
7216 */
7217 for (j = 0; offs + j < strsize && j < rem; j++) {
7218 if ((str[offs + j] = sym[j]) == '\0')
7219 break;
7220 }
7221
7222 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7223
7224 offs += j + 1;
7225 }
7226
7227 if (offs >= strsize) {
7228 /*
7229 * If we didn't have room for all of the strings, we don't
7230 * abort processing -- this needn't be a fatal error -- but we
7231 * still want to increment a counter (dts_stkstroverflows) to
7232 * allow this condition to be warned about. (If this is from
7233 * a jstack() action, it is easily tuned via jstackstrsize.)
7234 */
7235 dtrace_error(&state->dts_stkstroverflows);
7236 }
7237
7238 while (offs < strsize)
7239 str[offs++] = '\0';
7240
7241 out:
7242 mstate->dtms_scratch_ptr = old;
7243 }
7244
7245 static void
7246 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
7247 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
7248 {
7249 volatile uint16_t *flags;
7250 uint64_t val = *valp;
7251 size_t valoffs = *valoffsp;
7252
7253 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
7254 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
7255
7256 /*
7257 * If this is a string, we're going to only load until we find the zero
7258 * byte -- after which we'll store zero bytes.
7259 */
7260 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
7261 char c = '\0' + 1;
7262 size_t s;
7263
7264 for (s = 0; s < size; s++) {
7265 if (c != '\0' && dtkind == DIF_TF_BYREF) {
7266 c = dtrace_load8(val++);
7267 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
7268 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7269 c = dtrace_fuword8((void *)(uintptr_t)val++);
7270 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7271 if (*flags & CPU_DTRACE_FAULT)
7272 break;
7273 }
7274
7275 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
7276
7277 if (c == '\0' && intuple)
7278 break;
7279 }
7280 } else {
7281 uint8_t c;
7282 while (valoffs < end) {
7283 if (dtkind == DIF_TF_BYREF) {
7284 c = dtrace_load8(val++);
7285 } else if (dtkind == DIF_TF_BYUREF) {
7286 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7287 c = dtrace_fuword8((void *)(uintptr_t)val++);
7288 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7289 if (*flags & CPU_DTRACE_FAULT)
7290 break;
7291 }
7292
7293 DTRACE_STORE(uint8_t, tomax,
7294 valoffs++, c);
7295 }
7296 }
7297
7298 *valp = val;
7299 *valoffsp = valoffs;
7300 }
7301
7302 /*
7303 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
7304 * defined, we also assert that we are not recursing unless the probe ID is an
7305 * error probe.
7306 */
7307 static dtrace_icookie_t
7308 dtrace_probe_enter(dtrace_id_t id)
7309 {
7310 dtrace_icookie_t cookie;
7311
7312 cookie = dtrace_interrupt_disable();
7313
7314 /*
 * Unless this is an ERROR probe, we are not allowed to recurse in
 * dtrace_probe().  Recursing into a DTrace probe usually means that a
 * function is instrumented that should not have been instrumented, or
 * that the ordering guarantee of the records will be violated,
 * resulting in unexpected output.  If there is an exception to this
 * assertion, a new case should be added.
7321 */
7322 ASSERT(curthread->t_dtrace_inprobe == 0 ||
7323 id == dtrace_probeid_error);
7324 curthread->t_dtrace_inprobe = 1;
7325
7326 return (cookie);
7327 }
7328
7329 /*
7330 * Clears the per-thread inprobe flag and enables interrupts.
7331 */
7332 static void
7333 dtrace_probe_exit(dtrace_icookie_t cookie)
7334 {
7335
7336 curthread->t_dtrace_inprobe = 0;
7337 dtrace_interrupt_enable(cookie);
7338 }
7339
7340 /*
7341 * If you're looking for the epicenter of DTrace, you just found it. This
7342 * is the function called by the provider to fire a probe -- from which all
7343 * subsequent probe-context DTrace activity emanates.
7344 */
7345 void
7346 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7347 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7348 {
7349 processorid_t cpuid;
7350 dtrace_icookie_t cookie;
7351 dtrace_probe_t *probe;
7352 dtrace_mstate_t mstate;
7353 dtrace_ecb_t *ecb;
7354 dtrace_action_t *act;
7355 intptr_t offs;
7356 size_t size;
7357 int vtime, onintr;
7358 volatile uint16_t *flags;
7359 hrtime_t now;
7360
7361 if (KERNEL_PANICKED())
7362 return;
7363
7364 #ifdef illumos
7365 /*
7366 * Kick out immediately if this CPU is still being born (in which case
7367 * curthread will be set to -1) or the current thread can't allow
7368 * probes in its current context.
7369 */
7370 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7371 return;
7372 #endif
7373
7374 cookie = dtrace_probe_enter(id);
7375 probe = dtrace_probes[id - 1];
7376 cpuid = curcpu;
7377 onintr = CPU_ON_INTR(CPU);
7378
7379 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7380 probe->dtpr_predcache == curthread->t_predcache) {
7381 /*
7382 * We have hit in the predicate cache; we know that
7383 * this predicate would evaluate to be false.
7384 */
7385 dtrace_probe_exit(cookie);
7386 return;
7387 }
7388
7389 #ifdef illumos
7390 if (panic_quiesce) {
7391 #else
7392 if (KERNEL_PANICKED()) {
7393 #endif
7394 /*
7395 * We don't trace anything if we're panicking.
7396 */
7397 dtrace_probe_exit(cookie);
7398 return;
7399 }
7400
7401 now = mstate.dtms_timestamp = dtrace_gethrtime();
7402 mstate.dtms_present = DTRACE_MSTATE_TIMESTAMP;
7403 vtime = dtrace_vtime_references != 0;
7404
7405 if (vtime && curthread->t_dtrace_start)
7406 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7407
7408 mstate.dtms_difo = NULL;
7409 mstate.dtms_probe = probe;
7410 mstate.dtms_strtok = 0;
7411 mstate.dtms_arg[0] = arg0;
7412 mstate.dtms_arg[1] = arg1;
7413 mstate.dtms_arg[2] = arg2;
7414 mstate.dtms_arg[3] = arg3;
7415 mstate.dtms_arg[4] = arg4;
7416
7417 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7418
7419 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7420 dtrace_predicate_t *pred = ecb->dte_predicate;
7421 dtrace_state_t *state = ecb->dte_state;
7422 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7423 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7424 dtrace_vstate_t *vstate = &state->dts_vstate;
7425 dtrace_provider_t *prov = probe->dtpr_provider;
7426 uint64_t tracememsize = 0;
7427 int committed = 0;
7428 caddr_t tomax;
7429
7430 /*
7431 * A little subtlety with the following (seemingly innocuous)
7432 * declaration of the automatic 'val': by looking at the
7433 * code, you might think that it could be declared in the
7434 * action processing loop, below. (That is, it's only used in
7435 * the action processing loop.) However, it must be declared
7436 * out of that scope because in the case of DIF expression
7437 * arguments to aggregating actions, one iteration of the
7438 * action loop will use the last iteration's value.
7439 */
7440 uint64_t val = 0;
7441
7442 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7443 mstate.dtms_getf = NULL;
7444
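		/*
		 * Clear any error flag left behind by a previous ECB.
		 */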
7445 *flags &= ~CPU_DTRACE_ERROR;
7446
7447 if (prov == dtrace_provider) {
7448 /*
7449 * If dtrace itself is the provider of this probe,
7450 * we're only going to continue processing the ECB if
7451 * arg0 (the dtrace_state_t) is equal to the ECB's
7452 * creating state. (This prevents disjoint consumers
7453 * from seeing one another's metaprobes.)
7454 */
7455 if (arg0 != (uint64_t)(uintptr_t)state)
7456 continue;
7457 }
7458
7459 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7460 /*
7461 * We're not currently active. If our provider isn't
7462 * the dtrace pseudo provider, we're not interested.
7463 */
7464 if (prov != dtrace_provider)
7465 continue;
7466
7467 /*
7468 * Now we must further check if we are in the BEGIN
7469 * probe. If we are, we will only continue processing
7470 * if we're still in WARMUP -- if one BEGIN enabling
7471 * has invoked the exit() action, we don't want to
7472 * evaluate subsequent BEGIN enablings.
7473 */
7474 if (probe->dtpr_id == dtrace_probeid_begin &&
7475 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7476 ASSERT(state->dts_activity ==
7477 DTRACE_ACTIVITY_DRAINING);
7478 continue;
7479 }
7480 }
7481
7482 if (ecb->dte_cond) {
7483 /*
7484 * If the dte_cond bits indicate that this
7485 * consumer is only allowed to see user-mode firings
7486 * of this probe, call the provider's dtps_usermode()
7487 * entry point to check that the probe was fired
7488 * while in a user context. Skip this ECB if that's
7489 * not the case.
7490 */
7491 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7492 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7493 probe->dtpr_id, probe->dtpr_arg) == 0)
7494 continue;
7495
7496 #ifdef illumos
7497 /*
			 * This is more subtle than it looks.  We have to be
			 * absolutely certain that CRED() isn't going to
			 * change out from under us, so it's only legitimate
			 * to examine that structure if we're in constrained
			 * situations.  Currently, the only time we'll perform
			 * this check is when a non-super-user has enabled the
			 * profile or syscall providers -- providers that
			 * allow visibility of all processes.  For the
			 * profile case, the check above will ensure that
			 * we're examining a user context.
7508 */
7509 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7510 cred_t *cr;
7511 cred_t *s_cr =
7512 ecb->dte_state->dts_cred.dcr_cred;
7513 proc_t *proc;
7514
7515 ASSERT(s_cr != NULL);
7516
7517 if ((cr = CRED()) == NULL ||
7518 s_cr->cr_uid != cr->cr_uid ||
7519 s_cr->cr_uid != cr->cr_ruid ||
7520 s_cr->cr_uid != cr->cr_suid ||
7521 s_cr->cr_gid != cr->cr_gid ||
7522 s_cr->cr_gid != cr->cr_rgid ||
7523 s_cr->cr_gid != cr->cr_sgid ||
7524 (proc = ttoproc(curthread)) == NULL ||
7525 (proc->p_flag & SNOCD))
7526 continue;
7527 }
7528
7529 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7530 cred_t *cr;
7531 cred_t *s_cr =
7532 ecb->dte_state->dts_cred.dcr_cred;
7533
7534 ASSERT(s_cr != NULL);
7535
7536 if ((cr = CRED()) == NULL ||
7537 s_cr->cr_zone->zone_id !=
7538 cr->cr_zone->zone_id)
7539 continue;
7540 }
7541 #endif
7542 }
7543
7544 if (now - state->dts_alive > dtrace_deadman_timeout) {
7545 /*
			 * We seem to be dead.  Unless we (a) have kernel
			 * destructive permissions, (b) have explicitly
			 * enabled destructive actions, and (c) destructive
			 * actions have not been disabled, we're going to
			 * transition into the KILLED state, from which no
			 * further processing on this state will be performed.
7552 */
7553 if (!dtrace_priv_kernel_destructive(state) ||
7554 !state->dts_cred.dcr_destructive ||
7555 dtrace_destructive_disallow) {
7556 void *activity = &state->dts_activity;
7557 dtrace_activity_t curstate;
7558
7559 do {
7560 curstate = state->dts_activity;
7561 } while (dtrace_cas32(activity, curstate,
7562 DTRACE_ACTIVITY_KILLED) != curstate);
7563
7564 continue;
7565 }
7566 }
7567
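		/*
		 * Reserve space in the per-CPU buffer for this ECB's record;
		 * if the reservation fails, skip the ECB entirely.
		 */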
7568 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7569 ecb->dte_alignment, state, &mstate)) < 0)
7570 continue;
7571
7572 tomax = buf->dtb_tomax;
7573 ASSERT(tomax != NULL);
7574
7575 if (ecb->dte_size != 0) {
7576 dtrace_rechdr_t dtrh;
7577 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7578 mstate.dtms_timestamp = dtrace_gethrtime();
7579 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7580 }
7581 ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7582 dtrh.dtrh_epid = ecb->dte_epid;
7583 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7584 mstate.dtms_timestamp);
7585 *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7586 }
7587
7588 mstate.dtms_epid = ecb->dte_epid;
7589 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7590
7591 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7592 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7593 else
7594 mstate.dtms_access = 0;
7595
7596 if (pred != NULL) {
7597 dtrace_difo_t *dp = pred->dtp_difo;
7598 uint64_t rval;
7599
7600 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7601
7602 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7603 dtrace_cacheid_t cid = probe->dtpr_predcache;
7604
7605 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7606 /*
7607 * Update the predicate cache...
7608 */
7609 ASSERT(cid == pred->dtp_cacheid);
7610 curthread->t_predcache = cid;
7611 }
7612
7613 continue;
7614 }
7615 }
7616
7617 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7618 act != NULL; act = act->dta_next) {
7619 size_t valoffs;
7620 dtrace_difo_t *dp;
7621 dtrace_recdesc_t *rec = &act->dta_rec;
7622
7623 size = rec->dtrd_size;
7624 valoffs = offs + rec->dtrd_offset;
7625
7626 if (DTRACEACT_ISAGG(act->dta_kind)) {
7627 uint64_t v = 0xbad;
7628 dtrace_aggregation_t *agg;
7629
7630 agg = (dtrace_aggregation_t *)act;
7631
7632 if ((dp = act->dta_difo) != NULL)
7633 v = dtrace_dif_emulate(dp,
7634 &mstate, vstate, state);
7635
7636 if (*flags & CPU_DTRACE_ERROR)
7637 continue;
7638
7639 /*
7640 * Note that we always pass the expression
7641 * value from the previous iteration of the
7642 * action loop. This value will only be used
7643 * if there is an expression argument to the
7644 * aggregating action, denoted by the
7645 * dtag_hasarg field.
7646 */
7647 dtrace_aggregate(agg, buf,
7648 offs, aggbuf, v, val);
7649 continue;
7650 }
7651
7652 switch (act->dta_kind) {
7653 case DTRACEACT_STOP:
7654 if (dtrace_priv_proc_destructive(state))
7655 dtrace_action_stop();
7656 continue;
7657
7658 case DTRACEACT_BREAKPOINT:
7659 if (dtrace_priv_kernel_destructive(state))
7660 dtrace_action_breakpoint(ecb);
7661 continue;
7662
7663 case DTRACEACT_PANIC:
7664 if (dtrace_priv_kernel_destructive(state))
7665 dtrace_action_panic(ecb);
7666 continue;
7667
7668 case DTRACEACT_STACK:
7669 if (!dtrace_priv_kernel(state))
7670 continue;
7671
7672 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7673 size / sizeof (pc_t), probe->dtpr_aframes,
7674 DTRACE_ANCHORED(probe) ? NULL :
7675 (uint32_t *)arg0);
7676 continue;
7677
7678 case DTRACEACT_JSTACK:
7679 case DTRACEACT_USTACK:
7680 if (!dtrace_priv_proc(state))
7681 continue;
7682
7683 /*
7684 * See comment in DIF_VAR_PID.
7685 */
7686 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7687 CPU_ON_INTR(CPU)) {
7688 int depth = DTRACE_USTACK_NFRAMES(
7689 rec->dtrd_arg) + 1;
7690
7691 dtrace_bzero((void *)(tomax + valoffs),
7692 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7693 + depth * sizeof (uint64_t));
7694
7695 continue;
7696 }
7697
7698 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7699 curproc->p_dtrace_helpers != NULL) {
7700 /*
7701 * This is the slow path -- we have
7702 * allocated string space, and we're
7703 * getting the stack of a process that
7704 * has helpers. Call into a separate
7705 * routine to perform this processing.
7706 */
7707 dtrace_action_ustack(&mstate, state,
7708 (uint64_t *)(tomax + valoffs),
7709 rec->dtrd_arg);
7710 continue;
7711 }
7712
7713 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7714 dtrace_getupcstack((uint64_t *)
7715 (tomax + valoffs),
7716 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7717 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7718 continue;
7719
7720 default:
7721 break;
7722 }
7723
7724 dp = act->dta_difo;
7725 ASSERT(dp != NULL);
7726
7727 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7728
7729 if (*flags & CPU_DTRACE_ERROR)
7730 continue;
7731
7732 switch (act->dta_kind) {
7733 case DTRACEACT_SPECULATE: {
7734 dtrace_rechdr_t *dtrh;
7735
7736 ASSERT(buf == &state->dts_buffer[cpuid]);
7737 buf = dtrace_speculation_buffer(state,
7738 cpuid, val);
7739
7740 if (buf == NULL) {
7741 *flags |= CPU_DTRACE_DROP;
7742 continue;
7743 }
7744
7745 offs = dtrace_buffer_reserve(buf,
7746 ecb->dte_needed, ecb->dte_alignment,
7747 state, NULL);
7748
7749 if (offs < 0) {
7750 *flags |= CPU_DTRACE_DROP;
7751 continue;
7752 }
7753
7754 tomax = buf->dtb_tomax;
7755 ASSERT(tomax != NULL);
7756
7757 if (ecb->dte_size == 0)
7758 continue;
7759
7760 ASSERT3U(ecb->dte_size, >=,
7761 sizeof (dtrace_rechdr_t));
7762 dtrh = ((void *)(tomax + offs));
7763 dtrh->dtrh_epid = ecb->dte_epid;
7764 /*
7765 * When the speculation is committed, all of
7766 * the records in the speculative buffer will
7767 * have their timestamps set to the commit
7768 * time. Until then, it is set to a sentinel
7769  * value, for debuggability.
7770 */
7771 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7772 continue;
7773 }
7774
7775 case DTRACEACT_PRINTM: {
7776 /*
7777  * printm() assumes that the DIF object returns a
7778  * pointer produced by memref(). memref() is a
7779  * subroutine used to work around the single-valued
7780  * returns of DIF, and its result is assumed to
7781  * always be allocated in the scratch space.
7782 * Therefore, we need to validate that the
7783 * pointer given to printm() is in the scratch
7784 * space in order to avoid a potential panic.
7785 */
7786 uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7787
7788 if (!DTRACE_INSCRATCHPTR(&mstate,
7789 (uintptr_t) memref,
7790 sizeof (uintptr_t) + sizeof (size_t))) {
7791 *flags |= CPU_DTRACE_BADADDR;
7792 continue;
7793 }
7794
7795 /* Get the size from the memref. */
7796 size = memref[1];
7797
7798 /*
7799 * Check if the size exceeds the allocated
7800 * buffer size.
7801 */
7802 if (size + sizeof (size_t) >
7803 dp->dtdo_rtype.dtdt_size) {
7804 /* Flag a drop! */
7805 *flags |= CPU_DTRACE_DROP;
7806 continue;
7807 }
7808
7809 /* Store the size in the buffer first. */
7810 DTRACE_STORE(size_t, tomax, valoffs, size);
7811
7812 /*
7813 * Offset the buffer address to the start
7814 * of the data.
7815 */
7816 valoffs += sizeof (size_t);
7817
7818 /*
7819 * Reset to the memory address rather than
7820 * the memref array, then let the BYREF
7821 * code below do the work to store the
7822 * memory data in the buffer.
7823 */
7824 val = memref[0];
7825 break;
7826 }
7827
7828 case DTRACEACT_CHILL:
7829 if (dtrace_priv_kernel_destructive(state))
7830 dtrace_action_chill(&mstate, val);
7831 continue;
7832
7833 case DTRACEACT_RAISE:
7834 if (dtrace_priv_proc_destructive(state))
7835 dtrace_action_raise(val);
7836 continue;
7837
7838 case DTRACEACT_COMMIT:
7839 ASSERT(!committed);
7840
7841 /*
7842 * We need to commit our buffer state.
7843 */
7844 if (ecb->dte_size)
7845 buf->dtb_offset = offs + ecb->dte_size;
7846 buf = &state->dts_buffer[cpuid];
7847 dtrace_speculation_commit(state, cpuid, val);
7848 committed = 1;
7849 continue;
7850
7851 case DTRACEACT_DISCARD:
7852 dtrace_speculation_discard(state, cpuid, val);
7853 continue;
7854
7855 case DTRACEACT_DIFEXPR:
7856 case DTRACEACT_LIBACT:
7857 case DTRACEACT_PRINTF:
7858 case DTRACEACT_PRINTA:
7859 case DTRACEACT_SYSTEM:
7860 case DTRACEACT_FREOPEN:
7861 case DTRACEACT_TRACEMEM:
7862 break;
7863
7864 case DTRACEACT_TRACEMEM_DYNSIZE:
7865 tracememsize = val;
7866 break;
7867
7868 case DTRACEACT_SYM:
7869 case DTRACEACT_MOD:
7870 if (!dtrace_priv_kernel(state))
7871 continue;
7872 break;
7873
7874 case DTRACEACT_USYM:
7875 case DTRACEACT_UMOD:
7876 case DTRACEACT_UADDR: {
7877 #ifdef illumos
7878 struct pid *pid = curthread->t_procp->p_pidp;
7879 #endif
7880
7881 if (!dtrace_priv_proc(state))
7882 continue;
7883
7884 DTRACE_STORE(uint64_t, tomax,
7885 #ifdef illumos
7886 valoffs, (uint64_t)pid->pid_id);
7887 #else
7888 valoffs, (uint64_t) curproc->p_pid);
7889 #endif
7890 DTRACE_STORE(uint64_t, tomax,
7891 valoffs + sizeof (uint64_t), val);
7892
7893 continue;
7894 }
7895
7896 case DTRACEACT_EXIT: {
7897 /*
7898 * For the exit action, we are going to attempt
7899 * to atomically set our activity to be
7900 * draining. If this fails (either because
7901 * another CPU has beat us to the exit action,
7902 * or because our current activity is something
7903 * other than ACTIVE or WARMUP), we will
7904 * continue. This assures that the exit action
7905 * can be successfully recorded at most once
7906 * when we're in the ACTIVE state. If we're
7907 * encountering the exit() action while in
7908 * COOLDOWN, however, we want to honor the new
7909 * status code. (We know that we're the only
7910 * thread in COOLDOWN, so there is no race.)
7911 */
7912 void *activity = &state->dts_activity;
7913 dtrace_activity_t curstate = state->dts_activity;
7914
7915 if (curstate == DTRACE_ACTIVITY_COOLDOWN)
7916 break;
7917
7918 if (curstate != DTRACE_ACTIVITY_WARMUP)
7919 curstate = DTRACE_ACTIVITY_ACTIVE;
7920
7921 if (dtrace_cas32(activity, curstate,
7922 DTRACE_ACTIVITY_DRAINING) != curstate) {
7923 *flags |= CPU_DTRACE_DROP;
7924 continue;
7925 }
7926
7927 break;
7928 }
7929
7930 default:
7931 ASSERT(0);
7932 }
7933
7934 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7935 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7936 uintptr_t end = valoffs + size;
7937
7938 if (tracememsize != 0 &&
7939 valoffs + tracememsize < end) {
7940 end = valoffs + tracememsize;
7941 tracememsize = 0;
7942 }
7943
7944 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7945 !dtrace_vcanload((void *)(uintptr_t)val,
7946 &dp->dtdo_rtype, NULL, &mstate, vstate))
7947 continue;
7948
7949 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7950 &val, end, act->dta_intuple,
7951 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7952 DIF_TF_BYREF: DIF_TF_BYUREF);
7953 continue;
7954 }
7955
7956 switch (size) {
7957 case 0:
7958 break;
7959
7960 case sizeof (uint8_t):
7961 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7962 break;
7963 case sizeof (uint16_t):
7964 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7965 break;
7966 case sizeof (uint32_t):
7967 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7968 break;
7969 case sizeof (uint64_t):
7970 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7971 break;
7972 default:
7973 /*
7974 * Any other size should have been returned by
7975 * reference, not by value.
7976 */
7977 ASSERT(0);
7978 break;
7979 }
7980 }
7981
7982 if (*flags & CPU_DTRACE_DROP)
7983 continue;
7984
7985 if (*flags & CPU_DTRACE_FAULT) {
7986 int ndx;
7987 dtrace_action_t *err;
7988
7989 buf->dtb_errors++;
7990
7991 if (probe->dtpr_id == dtrace_probeid_error) {
7992 /*
7993 * There's nothing we can do -- we had an
7994 * error on the error probe. We bump an
7995 * error counter to at least indicate that
7996 * this condition happened.
7997 */
7998 dtrace_error(&state->dts_dblerrors);
7999 continue;
8000 }
8001
8002 if (vtime) {
8003 /*
8004 * Before recursing on dtrace_probe(), we
8005 * need to explicitly clear out our start
8006 * time to prevent it from being accumulated
8007 * into t_dtrace_vtime.
8008 */
8009 curthread->t_dtrace_start = 0;
8010 }
8011
8012 /*
8013 * Iterate over the actions to figure out which action
8014 * we were processing when we experienced the error.
8015 * Note that act points _past_ the faulting action; if
8016 * act is ecb->dte_action, the fault was in the
8017 * predicate, if it's ecb->dte_action->dta_next it's
8018 * in action #1, and so on.
8019 */
8020 for (err = ecb->dte_action, ndx = 0;
8021 err != act; err = err->dta_next, ndx++)
8022 continue;
8023
8024 dtrace_probe_error(state, ecb->dte_epid, ndx,
8025 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
8026 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
8027 cpu_core[cpuid].cpuc_dtrace_illval);
8028
8029 continue;
8030 }
8031
8032 if (!committed)
8033 buf->dtb_offset = offs + ecb->dte_size;
8034 }
8035
8036 if (vtime)
8037 curthread->t_dtrace_start = dtrace_gethrtime();
8038
8039 dtrace_probe_exit(cookie);
8040 }
8041
8042 /*
8043 * DTrace Probe Hashing Functions
8044 *
8045 * The functions in this section (and indeed, the functions in remaining
8046 * sections) are not _called_ from probe context. (Any exceptions to this are
8047 * marked with a "Note:".) Rather, they are called from elsewhere in the
8048  * DTrace framework to look up probes in, add probes to, and remove probes from
8049 * the DTrace probe hashes. (Each probe is hashed by each element of the
8050 * probe tuple -- allowing for fast lookups, regardless of what was
8051 * specified.)
8052 */
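/*
 * Hash a string using a variant of the classic ELF/PJW string hash: each
 * character is folded into the running value four bits at a time, and any
 * bits that overflow into the top nibble are XORed back into the low-order
 * bits that are ultimately used for bucket selection.
 */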
8053 static uint_t
8054 dtrace_hash_str(const char *p)
8055 {
8056 unsigned int g;
8057 uint_t hval = 0;
8058
8059 while (*p) {
8060 hval = (hval << 4) + *p++;
8061 if ((g = (hval & 0xf0000000)) != 0)
8062 hval ^= g >> 24;
8063 hval &= ~g;
8064 }
8065 return (hval);
8066 }
8067
8068 static dtrace_hash_t *
8069 dtrace_hash_create(size_t stroffs, size_t nextoffs, size_t prevoffs)
8070 {
8071 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
8072
8073 hash->dth_stroffs = stroffs;
8074 hash->dth_nextoffs = nextoffs;
8075 hash->dth_prevoffs = prevoffs;
8076
8077 hash->dth_size = 1;
8078 hash->dth_mask = hash->dth_size - 1;
8079
8080 hash->dth_tab = kmem_zalloc(hash->dth_size *
8081 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
8082
8083 return (hash);
8084 }
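
/*
 * A usage sketch (illustrative; the actual calls are made from the driver
 * attach path): each probe hash is created by passing the offsets of the
 * relevant string and chain-link members of dtrace_probe_t, e.g.:
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */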
8085
8086 static void
8087 dtrace_hash_destroy(dtrace_hash_t *hash)
8088 {
8089 #ifdef DEBUG
8090 int i;
8091
8092 for (i = 0; i < hash->dth_size; i++)
8093 ASSERT(hash->dth_tab[i] == NULL);
8094 #endif
8095
8096 kmem_free(hash->dth_tab,
8097 hash->dth_size * sizeof (dtrace_hashbucket_t *));
8098 kmem_free(hash, sizeof (dtrace_hash_t));
8099 }
8100
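/*
 * Double the size of the hash table, rechaining every existing bucket into
 * the new table. Sizes are kept at powers of two so that a hash value can
 * be reduced to a bucket index with a simple mask.
 */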
8101 static void
8102 dtrace_hash_resize(dtrace_hash_t *hash)
8103 {
8104 int size = hash->dth_size, i, ndx;
8105 int new_size = hash->dth_size << 1;
8106 int new_mask = new_size - 1;
8107 dtrace_hashbucket_t **new_tab, *bucket, *next;
8108
8109 ASSERT((new_size & new_mask) == 0);
8110
8111 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
8112
8113 for (i = 0; i < size; i++) {
8114 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
8115 dtrace_probe_t *probe = bucket->dthb_chain;
8116
8117 ASSERT(probe != NULL);
8118 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
8119
8120 next = bucket->dthb_next;
8121 bucket->dthb_next = new_tab[ndx];
8122 new_tab[ndx] = bucket;
8123 }
8124 }
8125
8126 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
8127 hash->dth_tab = new_tab;
8128 hash->dth_size = new_size;
8129 hash->dth_mask = new_mask;
8130 }
8131
8132 static void
8133 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
8134 {
8135 int hashval = DTRACE_HASHSTR(hash, new);
8136 int ndx = hashval & hash->dth_mask;
8137 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
8138 dtrace_probe_t **nextp, **prevp;
8139
8140 for (; bucket != NULL; bucket = bucket->dthb_next) {
8141 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
8142 goto add;
8143 }
8144
8145 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
8146 dtrace_hash_resize(hash);
8147 dtrace_hash_add(hash, new);
8148 return;
8149 }
8150
8151 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
8152 bucket->dthb_next = hash->dth_tab[ndx];
8153 hash->dth_tab[ndx] = bucket;
8154 hash->dth_nbuckets++;
8155
8156 add:
8157 nextp = DTRACE_HASHNEXT(hash, new);
8158 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
8159 *nextp = bucket->dthb_chain;
8160
8161 if (bucket->dthb_chain != NULL) {
8162 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
8163 ASSERT(*prevp == NULL);
8164 *prevp = new;
8165 }
8166
8167 bucket->dthb_chain = new;
8168 bucket->dthb_len++;
8169 }
8170
8171 static dtrace_probe_t *
8172 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
8173 {
8174 int hashval = DTRACE_HASHSTR(hash, template);
8175 int ndx = hashval & hash->dth_mask;
8176 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
8177
8178 for (; bucket != NULL; bucket = bucket->dthb_next) {
8179 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
8180 return (bucket->dthb_chain);
8181 }
8182
8183 return (NULL);
8184 }
8185
8186 static int
8187 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
8188 {
8189 int hashval = DTRACE_HASHSTR(hash, template);
8190 int ndx = hashval & hash->dth_mask;
8191 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
8192
8193 for (; bucket != NULL; bucket = bucket->dthb_next) {
8194 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
8195 return (bucket->dthb_len);
8196 }
8197
8198 return (0);
8199 }
8200
8201 static void
8202 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
8203 {
8204 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
8205 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
8206
8207 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
8208 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
8209
8210 /*
8211 * Find the bucket that we're removing this probe from.
8212 */
8213 for (; bucket != NULL; bucket = bucket->dthb_next) {
8214 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
8215 break;
8216 }
8217
8218 ASSERT(bucket != NULL);
8219
8220 if (*prevp == NULL) {
8221 if (*nextp == NULL) {
8222 /*
8223 * The removed probe was the only probe on this
8224 * bucket; we need to remove the bucket.
8225 */
8226 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
8227
8228 ASSERT(bucket->dthb_chain == probe);
8229 ASSERT(b != NULL);
8230
8231 if (b == bucket) {
8232 hash->dth_tab[ndx] = bucket->dthb_next;
8233 } else {
8234 while (b->dthb_next != bucket)
8235 b = b->dthb_next;
8236 b->dthb_next = bucket->dthb_next;
8237 }
8238
8239 ASSERT(hash->dth_nbuckets > 0);
8240 hash->dth_nbuckets--;
8241 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
8242 return;
8243 }
8244
8245 bucket->dthb_chain = *nextp;
8246 } else {
8247 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
8248 }
8249
8250 if (*nextp != NULL)
8251 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
8252 }
8253
8254 /*
8255 * DTrace Utility Functions
8256 *
8257 * These are random utility functions that are _not_ called from probe context.
8258 */
8259 static int
8260 dtrace_badattr(const dtrace_attribute_t *a)
8261 {
8262 return (a->dtat_name > DTRACE_STABILITY_MAX ||
8263 a->dtat_data > DTRACE_STABILITY_MAX ||
8264 a->dtat_class > DTRACE_CLASS_MAX);
8265 }
8266
8267 /*
8268  * Return a duplicate of the specified string. If the string is NULL,
8269 * this function returns a zero-length string.
8270 */
8271 static char *
8272 dtrace_strdup(const char *str)
8273 {
8274 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8275
8276 if (str != NULL)
8277 (void) strcpy(new, str);
8278
8279 return (new);
8280 }
8281
8282 #define DTRACE_ISALPHA(c) \
8283 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
8284
8285 static int
8286 dtrace_badname(const char *s)
8287 {
8288 char c;
8289
8290 if (s == NULL || (c = *s++) == '\0')
8291 return (0);
8292
8293 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
8294 return (1);
8295
8296 while ((c = *s++) != '\0') {
8297 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
8298 c != '-' && c != '_' && c != '.' && c != '`')
8299 return (1);
8300 }
8301
8302 return (0);
8303 }
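
/*
 * By way of illustration (hypothetical inputs, not from this file):
 * dtrace_badname() accepts names such as "fbt" or "unix`genunix", but
 * rejects "9fs" (leading digit) and "foo$bar" ('$' is not a permitted
 * character).
 */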
8304
8305 static void
8306 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8307 {
8308 uint32_t priv;
8309
8310 #ifdef illumos
8311 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8312 /*
8313 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
8314 */
8315 priv = DTRACE_PRIV_ALL;
8316 } else {
8317 *uidp = crgetuid(cr);
8318 *zoneidp = crgetzoneid(cr);
8319
8320 priv = 0;
8321 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8322 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8323 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8324 priv |= DTRACE_PRIV_USER;
8325 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8326 priv |= DTRACE_PRIV_PROC;
8327 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8328 priv |= DTRACE_PRIV_OWNER;
8329 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8330 priv |= DTRACE_PRIV_ZONEOWNER;
8331 }
8332 #else
8333 priv = DTRACE_PRIV_ALL;
8334 #endif
8335
8336 *privp = priv;
8337 }
8338
8339 #ifdef DTRACE_ERRDEBUG
8340 static void
8341 dtrace_errdebug(const char *str)
8342 {
8343 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8344 int occupied = 0;
8345
8346 mutex_enter(&dtrace_errlock);
8347 dtrace_errlast = str;
8348 dtrace_errthread = curthread;
8349
8350 while (occupied++ < DTRACE_ERRHASHSZ) {
8351 if (dtrace_errhash[hval].dter_msg == str) {
8352 dtrace_errhash[hval].dter_count++;
8353 goto out;
8354 }
8355
8356 if (dtrace_errhash[hval].dter_msg != NULL) {
8357 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8358 continue;
8359 }
8360
8361 dtrace_errhash[hval].dter_msg = str;
8362 dtrace_errhash[hval].dter_count = 1;
8363 goto out;
8364 }
8365
8366 panic("dtrace: undersized error hash");
8367 out:
8368 mutex_exit(&dtrace_errlock);
8369 }
8370 #endif
8371
8372 /*
8373 * DTrace Matching Functions
8374 *
8375 * These functions are used to match groups of probes, given some elements of
8376 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8377 */
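/*
 * Check whether a consumer with the given privileges, uid and zoneid is
 * permitted to observe the specified probe; returns 1 if so and 0 if not.
 */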
8378 static int
8379 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8380 zoneid_t zoneid)
8381 {
8382 if (priv != DTRACE_PRIV_ALL) {
8383 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8384 uint32_t match = priv & ppriv;
8385
8386 /*
8387 * No PRIV_DTRACE_* privileges...
8388 */
8389 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8390 DTRACE_PRIV_KERNEL)) == 0)
8391 return (0);
8392
8393 /*
8394 * No matching bits, but there were bits to match...
8395 */
8396 if (match == 0 && ppriv != 0)
8397 return (0);
8398
8399 /*
8400 * Need to have permissions to the process, but don't...
8401 */
8402 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8403 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8404 return (0);
8405 }
8406
8407 /*
8408 * Need to be in the same zone unless we possess the
8409 * privilege to examine all zones.
8410 */
8411 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8412 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8413 return (0);
8414 }
8415 }
8416
8417 return (1);
8418 }
8419
8420 /*
8421 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8422 * consists of input pattern strings and an ops-vector to evaluate them.
8423 * This function returns >0 for match, 0 for no match, and <0 for error.
8424 */
8425 static int
8426 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8427 uint32_t priv, uid_t uid, zoneid_t zoneid)
8428 {
8429 dtrace_provider_t *pvp = prp->dtpr_provider;
8430 int rv;
8431
8432 if (pvp->dtpv_defunct)
8433 return (0);
8434
8435 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8436 return (rv);
8437
8438 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8439 return (rv);
8440
8441 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8442 return (rv);
8443
8444 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8445 return (rv);
8446
8447 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8448 return (0);
8449
8450 return (rv);
8451 }
8452
8453 /*
8454 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8455 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
8456 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8457 * In addition, all of the recursion cases except for '*' matching have been
8458 * unwound. For '*', we still implement recursive evaluation, but a depth
8459 * counter is maintained and matching is aborted if we recurse too deep.
8460 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8461 */
8462 static int
8463 dtrace_match_glob(const char *s, const char *p, int depth)
8464 {
8465 const char *olds;
8466 char s1, c;
8467 int gs;
8468
8469 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8470 return (-1);
8471
8472 if (s == NULL)
8473 s = ""; /* treat NULL as empty string */
8474
8475 top:
8476 olds = s;
8477 s1 = *s++;
8478
8479 if (p == NULL)
8480 return (0);
8481
8482 if ((c = *p++) == '\0')
8483 return (s1 == '\0');
8484
8485 switch (c) {
8486 case '[': {
8487 int ok = 0, notflag = 0;
8488 char lc = '\0';
8489
8490 if (s1 == '\0')
8491 return (0);
8492
8493 if (*p == '!') {
8494 notflag = 1;
8495 p++;
8496 }
8497
8498 if ((c = *p++) == '\0')
8499 return (0);
8500
8501 do {
8502 if (c == '-' && lc != '\0' && *p != ']') {
8503 if ((c = *p++) == '\0')
8504 return (0);
8505 if (c == '\\' && (c = *p++) == '\0')
8506 return (0);
8507
8508 if (notflag) {
8509 if (s1 < lc || s1 > c)
8510 ok++;
8511 else
8512 return (0);
8513 } else if (lc <= s1 && s1 <= c)
8514 ok++;
8515
8516 } else if (c == '\\' && (c = *p++) == '\0')
8517 return (0);
8518
8519 lc = c; /* save left-hand 'c' for next iteration */
8520
8521 if (notflag) {
8522 if (s1 != c)
8523 ok++;
8524 else
8525 return (0);
8526 } else if (s1 == c)
8527 ok++;
8528
8529 if ((c = *p++) == '\0')
8530 return (0);
8531
8532 } while (c != ']');
8533
8534 if (ok)
8535 goto top;
8536
8537 return (0);
8538 }
8539
8540 case '\\':
8541 if ((c = *p++) == '\0')
8542 return (0);
8543 /*FALLTHRU*/
8544
8545 default:
8546 if (c != s1)
8547 return (0);
8548 /*FALLTHRU*/
8549
8550 case '?':
8551 if (s1 != '\0')
8552 goto top;
8553 return (0);
8554
8555 case '*':
8556 while (*p == '*')
8557 p++; /* consecutive *'s are identical to a single one */
8558
8559 if (*p == '\0')
8560 return (1);
8561
8562 for (s = olds; *s != '\0'; s++) {
8563 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8564 return (gs);
8565 }
8566
8567 return (0);
8568 }
8569 }
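
/*
 * Some illustrative (hypothetical) invocations: dtrace_match_glob() returns
 * nonzero for ("entry", "ent*", 0) and ("read", "[a-z]ead", 0), but zero
 * for ("entry", "ret*", 0).
 */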
8570
8571 /*ARGSUSED*/
8572 static int
8573 dtrace_match_string(const char *s, const char *p, int depth)
8574 {
8575 return (s != NULL && strcmp(s, p) == 0);
8576 }
8577
8578 /*ARGSUSED*/
8579 static int
8580 dtrace_match_nul(const char *s, const char *p, int depth)
8581 {
8582 return (1); /* always match the empty pattern */
8583 }
8584
8585 /*ARGSUSED*/
8586 static int
8587 dtrace_match_nonzero(const char *s, const char *p, int depth)
8588 {
8589 return (s != NULL && s[0] != '\0');
8590 }
8591
8592 static int
8593 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8594 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8595 {
8596 dtrace_probe_t template, *probe;
8597 dtrace_hash_t *hash = NULL;
8598 int len, best = INT_MAX, nmatched = 0;
8599 dtrace_id_t i;
8600
8601 ASSERT(MUTEX_HELD(&dtrace_lock));
8602
8603 /*
8604 * If the probe ID is specified in the key, just lookup by ID and
8605 * invoke the match callback once if a matching probe is found.
8606 */
8607 if (pkp->dtpk_id != DTRACE_IDNONE) {
8608 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8609 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8610 (void) (*matched)(probe, arg);
8611 nmatched++;
8612 }
8613 return (nmatched);
8614 }
8615
8616 template.dtpr_mod = (char *)pkp->dtpk_mod;
8617 template.dtpr_func = (char *)pkp->dtpk_func;
8618 template.dtpr_name = (char *)pkp->dtpk_name;
8619
8620 /*
8621 * We want to find the most distinct of the module name, function
8622 * name, and name. So for each one that is not a glob pattern or
8623 * empty string, we perform a lookup in the corresponding hash and
8624 * use the hash table with the fewest collisions to do our search.
8625 */
8626 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8627 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8628 best = len;
8629 hash = dtrace_bymod;
8630 }
8631
8632 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8633 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8634 best = len;
8635 hash = dtrace_byfunc;
8636 }
8637
8638 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8639 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8640 best = len;
8641 hash = dtrace_byname;
8642 }
8643
8644 /*
8645 * If we did not select a hash table, iterate over every probe and
8646 * invoke our callback for each one that matches our input probe key.
8647 */
8648 if (hash == NULL) {
8649 for (i = 0; i < dtrace_nprobes; i++) {
8650 if ((probe = dtrace_probes[i]) == NULL ||
8651 dtrace_match_probe(probe, pkp, priv, uid,
8652 zoneid) <= 0)
8653 continue;
8654
8655 nmatched++;
8656
8657 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8658 break;
8659 }
8660
8661 return (nmatched);
8662 }
8663
8664 /*
8665 * If we selected a hash table, iterate over each probe of the same key
8666 * name and invoke the callback for every probe that matches the other
8667 * attributes of our input probe key.
8668 */
8669 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8670 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8671
8672 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8673 continue;
8674
8675 nmatched++;
8676
8677 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8678 break;
8679 }
8680
8681 return (nmatched);
8682 }
8683
8684 /*
8685  * Return the matching function that dtrace_match_probe() should use to compare the
8686 * specified pattern with a string. For NULL or empty patterns, we select
8687 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8688 * For non-empty non-glob strings, we use dtrace_match_string().
8689 */
8690 static dtrace_probekey_f *
8691 dtrace_probekey_func(const char *p)
8692 {
8693 char c;
8694
8695 if (p == NULL || *p == '\0')
8696 return (&dtrace_match_nul);
8697
8698 while ((c = *p++) != '\0') {
8699 if (c == '[' || c == '?' || c == '*' || c == '\\')
8700 return (&dtrace_match_glob);
8701 }
8702
8703 return (&dtrace_match_string);
8704 }
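
/*
 * For example (hypothetical patterns): "syscall" selects
 * dtrace_match_string(), "sys*" selects dtrace_match_glob(), and NULL or
 * "" selects dtrace_match_nul().
 */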
8705
8706 /*
8707 * Build a probe comparison key for use with dtrace_match_probe() from the
8708 * given probe description. By convention, a null key only matches anchored
8709 * probes: if each field is the empty string, reset dtpk_fmatch to
8710 * dtrace_match_nonzero().
8711 */
8712 static void
8713 dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8714 {
8715 pkp->dtpk_prov = pdp->dtpd_provider;
8716 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8717
8718 pkp->dtpk_mod = pdp->dtpd_mod;
8719 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8720
8721 pkp->dtpk_func = pdp->dtpd_func;
8722 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8723
8724 pkp->dtpk_name = pdp->dtpd_name;
8725 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8726
8727 pkp->dtpk_id = pdp->dtpd_id;
8728
8729 if (pkp->dtpk_id == DTRACE_IDNONE &&
8730 pkp->dtpk_pmatch == &dtrace_match_nul &&
8731 pkp->dtpk_mmatch == &dtrace_match_nul &&
8732 pkp->dtpk_fmatch == &dtrace_match_nul &&
8733 pkp->dtpk_nmatch == &dtrace_match_nul)
8734 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8735 }
8736
8737 /*
8738 * DTrace Provider-to-Framework API Functions
8739 *
8740 * These functions implement much of the Provider-to-Framework API, as
8741 * described in <sys/dtrace.h>. The parts of the API not in this section are
8742 * the functions in the API for probe management (found below), and
8743 * dtrace_probe() itself (found above).
8744 */
8745
8746 /*
8747 * Register the calling provider with the DTrace framework. This should
8748 * generally be called by DTrace providers in their attach(9E) entry point.
8749 */
8750 int
8751 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8752 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8753 {
8754 dtrace_provider_t *provider;
8755
8756 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8757 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8758 "arguments", name ? name : "<NULL>");
8759 return (EINVAL);
8760 }
8761
8762 if (name[0] == '\0' || dtrace_badname(name)) {
8763 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8764 "provider name", name);
8765 return (EINVAL);
8766 }
8767
8768 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8769 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8770 pops->dtps_destroy == NULL ||
8771 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8772 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8773 "provider ops", name);
8774 return (EINVAL);
8775 }
8776
8777 if (dtrace_badattr(&pap->dtpa_provider) ||
8778 dtrace_badattr(&pap->dtpa_mod) ||
8779 dtrace_badattr(&pap->dtpa_func) ||
8780 dtrace_badattr(&pap->dtpa_name) ||
8781 dtrace_badattr(&pap->dtpa_args)) {
8782 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8783 "provider attributes", name);
8784 return (EINVAL);
8785 }
8786
8787 if (priv & ~DTRACE_PRIV_ALL) {
8788 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8789 "privilege attributes", name);
8790 return (EINVAL);
8791 }
8792
8793 if ((priv & DTRACE_PRIV_KERNEL) &&
8794 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8795 pops->dtps_usermode == NULL) {
8796 cmn_err(CE_WARN, "failed to register provider '%s': need "
8797 "dtps_usermode() op for given privilege attributes", name);
8798 return (EINVAL);
8799 }
8800
8801 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8802 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8803 (void) strcpy(provider->dtpv_name, name);
8804
8805 provider->dtpv_attr = *pap;
8806 provider->dtpv_priv.dtpp_flags = priv;
8807 if (cr != NULL) {
8808 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8809 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8810 }
8811 provider->dtpv_pops = *pops;
8812
8813 if (pops->dtps_provide == NULL) {
8814 ASSERT(pops->dtps_provide_module != NULL);
8815 provider->dtpv_pops.dtps_provide =
8816 (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8817 }
8818
8819 if (pops->dtps_provide_module == NULL) {
8820 ASSERT(pops->dtps_provide != NULL);
8821 provider->dtpv_pops.dtps_provide_module =
8822 (void (*)(void *, modctl_t *))dtrace_nullop;
8823 }
8824
8825 if (pops->dtps_suspend == NULL) {
8826 ASSERT(pops->dtps_resume == NULL);
8827 provider->dtpv_pops.dtps_suspend =
8828 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8829 provider->dtpv_pops.dtps_resume =
8830 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8831 }
8832
8833 provider->dtpv_arg = arg;
8834 *idp = (dtrace_provider_id_t)provider;
8835
8836 if (pops == &dtrace_provider_ops) {
8837 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8838 ASSERT(MUTEX_HELD(&dtrace_lock));
8839 ASSERT(dtrace_anon.dta_enabling == NULL);
8840
8841 /*
8842 * We make sure that the DTrace provider is at the head of
8843 * the provider chain.
8844 */
8845 provider->dtpv_next = dtrace_provider;
8846 dtrace_provider = provider;
8847 return (0);
8848 }
8849
8850 mutex_enter(&dtrace_provider_lock);
8851 mutex_enter(&dtrace_lock);
8852
8853 /*
8854 * If there is at least one provider registered, we'll add this
8855 * provider after the first provider.
8856 */
8857 if (dtrace_provider != NULL) {
8858 provider->dtpv_next = dtrace_provider->dtpv_next;
8859 dtrace_provider->dtpv_next = provider;
8860 } else {
8861 dtrace_provider = provider;
8862 }
8863
8864 if (dtrace_retained != NULL) {
8865 dtrace_enabling_provide(provider);
8866
8867 /*
8868 * Now we need to call dtrace_enabling_matchall() -- which
8869 * will acquire cpu_lock and dtrace_lock. We therefore need
8870 * to drop all of our locks before calling into it...
8871 */
8872 mutex_exit(&dtrace_lock);
8873 mutex_exit(&dtrace_provider_lock);
8874 dtrace_enabling_matchall();
8875
8876 return (0);
8877 }
8878
8879 mutex_exit(&dtrace_lock);
8880 mutex_exit(&dtrace_provider_lock);
8881
8882 return (0);
8883 }
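
/*
 * A minimal sketch of a provider's attach path (all "foo" identifiers are
 * illustrative; see <sys/dtrace.h> for the authoritative definitions of
 * dtrace_pops_t and dtrace_pattr_t):
 *
 *	static dtrace_provider_id_t foo_id;
 *
 *	if (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &foo_pops, NULL, &foo_id) != 0)
 *		return (DDI_FAILURE);
 */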
8884
8885 /*
8886 * Unregister the specified provider from the DTrace framework. This should
8887 * generally be called by DTrace providers in their detach(9E) entry point.
8888 */
8889 int
8890 dtrace_unregister(dtrace_provider_id_t id)
8891 {
8892 dtrace_provider_t *old = (dtrace_provider_t *)id;
8893 dtrace_provider_t *prev = NULL;
8894 int i, self = 0, noreap = 0;
8895 dtrace_probe_t *probe, *first = NULL;
8896
8897 if (old->dtpv_pops.dtps_enable ==
8898 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8899 /*
8900 * If DTrace itself is the provider, we're called with locks
8901 * already held.
8902 */
8903 ASSERT(old == dtrace_provider);
8904 #ifdef illumos
8905 ASSERT(dtrace_devi != NULL);
8906 #endif
8907 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8908 ASSERT(MUTEX_HELD(&dtrace_lock));
8909 self = 1;
8910
8911 if (dtrace_provider->dtpv_next != NULL) {
8912 /*
8913 * There's another provider here; return failure.
8914 */
8915 return (EBUSY);
8916 }
8917 } else {
8918 mutex_enter(&dtrace_provider_lock);
8919 #ifdef illumos
8920 mutex_enter(&mod_lock);
8921 #endif
8922 mutex_enter(&dtrace_lock);
8923 }
8924
8925 /*
8926 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8927 * probes, we refuse to let providers slither away, unless this
8928 * provider has already been explicitly invalidated.
8929 */
8930 if (!old->dtpv_defunct &&
8931 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8932 dtrace_anon.dta_state->dts_necbs > 0))) {
8933 if (!self) {
8934 mutex_exit(&dtrace_lock);
8935 #ifdef illumos
8936 mutex_exit(&mod_lock);
8937 #endif
8938 mutex_exit(&dtrace_provider_lock);
8939 }
8940 return (EBUSY);
8941 }
8942
8943 /*
8944 * Attempt to destroy the probes associated with this provider.
8945 */
8946 for (i = 0; i < dtrace_nprobes; i++) {
8947 if ((probe = dtrace_probes[i]) == NULL)
8948 continue;
8949
8950 if (probe->dtpr_provider != old)
8951 continue;
8952
8953 if (probe->dtpr_ecb == NULL)
8954 continue;
8955
8956 /*
8957 * If we are trying to unregister a defunct provider, and the
8958 * provider was made defunct within the interval dictated by
8959 * dtrace_unregister_defunct_reap, we'll (asynchronously)
8960 * attempt to reap our enablings. To denote that the provider
8961 * should reattempt to unregister itself at some point in the
8962  * future, we will return a distinguishable error code (EAGAIN
8963 * instead of EBUSY) in this case.
8964 */
8965 if (dtrace_gethrtime() - old->dtpv_defunct >
8966 dtrace_unregister_defunct_reap)
8967 noreap = 1;
8968
8969 if (!self) {
8970 mutex_exit(&dtrace_lock);
8971 #ifdef illumos
8972 mutex_exit(&mod_lock);
8973 #endif
8974 mutex_exit(&dtrace_provider_lock);
8975 }
8976
8977 if (noreap)
8978 return (EBUSY);
8979
8980 (void) taskq_dispatch(dtrace_taskq,
8981 (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8982
8983 return (EAGAIN);
8984 }
8985
8986 /*
8987 * All of the probes for this provider are disabled; we can safely
8988 * remove all of them from their hash chains and from the probe array.
8989 */
8990 for (i = 0; i < dtrace_nprobes; i++) {
8991 if ((probe = dtrace_probes[i]) == NULL)
8992 continue;
8993
8994 if (probe->dtpr_provider != old)
8995 continue;
8996
8997 dtrace_probes[i] = NULL;
8998
8999 dtrace_hash_remove(dtrace_bymod, probe);
9000 dtrace_hash_remove(dtrace_byfunc, probe);
9001 dtrace_hash_remove(dtrace_byname, probe);
9002
9003 if (first == NULL) {
9004 first = probe;
9005 probe->dtpr_nextmod = NULL;
9006 } else {
9007 probe->dtpr_nextmod = first;
9008 first = probe;
9009 }
9010 }
9011
9012 /*
9013 * The provider's probes have been removed from the hash chains and
9014 * from the probe array. Now issue a dtrace_sync() to be sure that
9015 * everyone has cleared out from any probe array processing.
9016 */
9017 dtrace_sync();
9018
9019 for (probe = first; probe != NULL; probe = first) {
9020 first = probe->dtpr_nextmod;
9021
9022 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
9023 probe->dtpr_arg);
9024 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
9025 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
9026 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
9027 #ifdef illumos
9028 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
9029 #else
9030 free_unr(dtrace_arena, probe->dtpr_id);
9031 #endif
9032 kmem_free(probe, sizeof (dtrace_probe_t));
9033 }
9034
9035 if ((prev = dtrace_provider) == old) {
9036 #ifdef illumos
9037 ASSERT(self || dtrace_devi == NULL);
9038 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
9039 #endif
9040 dtrace_provider = old->dtpv_next;
9041 } else {
9042 while (prev != NULL && prev->dtpv_next != old)
9043 prev = prev->dtpv_next;
9044
9045 if (prev == NULL) {
9046 panic("attempt to unregister non-existent "
9047 "dtrace provider %p\n", (void *)id);
9048 }
9049
9050 prev->dtpv_next = old->dtpv_next;
9051 }
9052
9053 if (!self) {
9054 mutex_exit(&dtrace_lock);
9055 #ifdef illumos
9056 mutex_exit(&mod_lock);
9057 #endif
9058 mutex_exit(&dtrace_provider_lock);
9059 }
9060
9061 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
9062 kmem_free(old, sizeof (dtrace_provider_t));
9063
9064 return (0);
9065 }
9066
9067 /*
9068 * Invalidate the specified provider. All subsequent probe lookups for the
9069 * specified provider will fail, but its probes will not be removed.
9070 */
9071 void
9072 dtrace_invalidate(dtrace_provider_id_t id)
9073 {
9074 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
9075
9076 ASSERT(pvp->dtpv_pops.dtps_enable !=
9077 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
9078
9079 mutex_enter(&dtrace_provider_lock);
9080 mutex_enter(&dtrace_lock);
9081
9082 pvp->dtpv_defunct = dtrace_gethrtime();
9083
9084 mutex_exit(&dtrace_lock);
9085 mutex_exit(&dtrace_provider_lock);
9086 }
9087
9088 /*
9089 * Indicate whether or not DTrace has attached.
9090 */
9091 int
9092 dtrace_attached(void)
9093 {
9094 /*
9095 * dtrace_provider will be non-NULL iff the DTrace driver has
9096 * attached. (It's non-NULL because DTrace is always itself a
9097 * provider.)
9098 */
9099 return (dtrace_provider != NULL);
9100 }
9101
9102 /*
9103 * Remove all the unenabled probes for the given provider. This function is
9104 * not unlike dtrace_unregister(), except that it doesn't remove the provider
9105 * -- just as many of its associated probes as it can.
9106 */
9107 int
9108 dtrace_condense(dtrace_provider_id_t id)
9109 {
9110 dtrace_provider_t *prov = (dtrace_provider_t *)id;
9111 int i;
9112 dtrace_probe_t *probe;
9113
9114 /*
9115 * Make sure this isn't the dtrace provider itself.
9116 */
9117 ASSERT(prov->dtpv_pops.dtps_enable !=
9118 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
9119
9120 mutex_enter(&dtrace_provider_lock);
9121 mutex_enter(&dtrace_lock);
9122
9123 /*
9124 * Attempt to destroy the probes associated with this provider.
9125 */
9126 for (i = 0; i < dtrace_nprobes; i++) {
9127 if ((probe = dtrace_probes[i]) == NULL)
9128 continue;
9129
9130 if (probe->dtpr_provider != prov)
9131 continue;
9132
9133 if (probe->dtpr_ecb != NULL)
9134 continue;
9135
9136 dtrace_probes[i] = NULL;
9137
9138 dtrace_hash_remove(dtrace_bymod, probe);
9139 dtrace_hash_remove(dtrace_byfunc, probe);
9140 dtrace_hash_remove(dtrace_byname, probe);
9141
9142 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
9143 probe->dtpr_arg);
9144 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
9145 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
9146 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
9147 kmem_free(probe, sizeof (dtrace_probe_t));
9148 #ifdef illumos
9149 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
9150 #else
9151 free_unr(dtrace_arena, i + 1);
9152 #endif
9153 }
9154
9155 mutex_exit(&dtrace_lock);
9156 mutex_exit(&dtrace_provider_lock);
9157
9158 return (0);
9159 }
9160
9161 /*
9162 * DTrace Probe Management Functions
9163 *
9164 * The functions in this section perform the DTrace probe management,
9165  * including functions to create probes, look up probes, and call into the
9166 * providers to request that probes be provided. Some of these functions are
9167 * in the Provider-to-Framework API; these functions can be identified by the
9168 * fact that they are not declared "static".
9169 */
9170
9171 /*
9172 * Create a probe with the specified module name, function name, and name.
9173 */
9174 dtrace_id_t
9175 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
9176 const char *func, const char *name, int aframes, void *arg)
9177 {
9178 dtrace_probe_t *probe, **probes;
9179 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
9180 dtrace_id_t id;
9181
9182 if (provider == dtrace_provider) {
9183 ASSERT(MUTEX_HELD(&dtrace_lock));
9184 } else {
9185 mutex_enter(&dtrace_lock);
9186 }
9187
9188 #ifdef illumos
9189 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
9190 VM_BESTFIT | VM_SLEEP);
9191 #else
9192 id = alloc_unr(dtrace_arena);
9193 #endif
9194 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
9195
9196 probe->dtpr_id = id;
9197 probe->dtpr_gen = dtrace_probegen++;
9198 probe->dtpr_mod = dtrace_strdup(mod);
9199 probe->dtpr_func = dtrace_strdup(func);
9200 probe->dtpr_name = dtrace_strdup(name);
9201 probe->dtpr_arg = arg;
9202 probe->dtpr_aframes = aframes;
9203 probe->dtpr_provider = provider;
9204
9205 dtrace_hash_add(dtrace_bymod, probe);
9206 dtrace_hash_add(dtrace_byfunc, probe);
9207 dtrace_hash_add(dtrace_byname, probe);
9208
9209 if (id - 1 >= dtrace_nprobes) {
9210 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
9211 size_t nsize = osize << 1;
9212
9213 if (nsize == 0) {
9214 ASSERT(osize == 0);
9215 ASSERT(dtrace_probes == NULL);
9216 nsize = sizeof (dtrace_probe_t *);
9217 }
9218
9219 probes = kmem_zalloc(nsize, KM_SLEEP);
9220
9221 if (dtrace_probes == NULL) {
9222 ASSERT(osize == 0);
9223 dtrace_probes = probes;
9224 dtrace_nprobes = 1;
9225 } else {
9226 dtrace_probe_t **oprobes = dtrace_probes;
9227
9228 bcopy(oprobes, probes, osize);
9229 dtrace_membar_producer();
9230 dtrace_probes = probes;
9231
9232 dtrace_sync();
9233
9234 /*
9235 * All CPUs are now seeing the new probes array; we can
9236 * safely free the old array.
9237 */
9238 kmem_free(oprobes, osize);
9239 dtrace_nprobes <<= 1;
9240 }
9241
9242 ASSERT(id - 1 < dtrace_nprobes);
9243 }
9244
9245 ASSERT(dtrace_probes[id - 1] == NULL);
9246 dtrace_probes[id - 1] = probe;
9247
9248 if (provider != dtrace_provider)
9249 mutex_exit(&dtrace_lock);
9250
9251 return (id);
9252 }
9253
9254 static dtrace_probe_t *
9255 dtrace_probe_lookup_id(dtrace_id_t id)
9256 {
9257 ASSERT(MUTEX_HELD(&dtrace_lock));
9258
9259 if (id == 0 || id > dtrace_nprobes)
9260 return (NULL);
9261
9262 return (dtrace_probes[id - 1]);
9263 }
9264
9265 static int
9266 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
9267 {
9268 *((dtrace_id_t *)arg) = probe->dtpr_id;
9269
9270 return (DTRACE_MATCH_DONE);
9271 }
9272
9273 /*
9274 * Look up a probe based on provider and one or more of module name, function
9275 * name and probe name.
9276 */
9277 dtrace_id_t
9278 dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9279 char *func, char *name)
9280 {
9281 dtrace_probekey_t pkey;
9282 dtrace_id_t id;
9283 int match;
9284
9285 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9286 pkey.dtpk_pmatch = &dtrace_match_string;
9287 pkey.dtpk_mod = mod;
9288 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9289 pkey.dtpk_func = func;
9290 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9291 pkey.dtpk_name = name;
9292 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9293 pkey.dtpk_id = DTRACE_IDNONE;
9294
9295 mutex_enter(&dtrace_lock);
9296 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9297 dtrace_probe_lookup_match, &id);
9298 mutex_exit(&dtrace_lock);
9299
9300 ASSERT(match == 1 || match == 0);
9301 return (match ? id : 0);
9302 }
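
/*
 * Providers typically pair this lookup with dtrace_probe_create() to keep
 * their dtps_provide() entry points idempotent; a sketch with illustrative
 * identifiers:
 *
 *	if (dtrace_probe_lookup(foo_id, "foomod", "foofunc", "entry") == 0)
 *		(void) dtrace_probe_create(foo_id, "foomod", "foofunc",
 *		    "entry", 0, NULL);
 */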
9303
9304 /*
9305 * Returns the probe argument associated with the specified probe.
9306 */
9307 void *
9308 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9309 {
9310 dtrace_probe_t *probe;
9311 void *rval = NULL;
9312
9313 mutex_enter(&dtrace_lock);
9314
9315 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9316 probe->dtpr_provider == (dtrace_provider_t *)id)
9317 rval = probe->dtpr_arg;
9318
9319 mutex_exit(&dtrace_lock);
9320
9321 return (rval);
9322 }
9323
9324 /*
9325 * Copy a probe into a probe description.
9326 */
9327 static void
9328 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9329 {
9330 bzero(pdp, sizeof (dtrace_probedesc_t));
9331 pdp->dtpd_id = prp->dtpr_id;
9332
9333 (void) strncpy(pdp->dtpd_provider,
9334 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
9335
9336 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
9337 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
9338 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
9339 }
9340
9341 /*
9342 * Called to indicate that a probe -- or probes -- should be provided by a
9343  * specified provider. If the specified description is NULL, the provider will
9344 * be told to provide all of its probes. (This is done whenever a new
9345 * consumer comes along, or whenever a retained enabling is to be matched.) If
9346 * the specified description is non-NULL, the provider is given the
9347 * opportunity to dynamically provide the specified probe, allowing providers
9348 * to support the creation of probes on-the-fly. (So-called _autocreated_
9349 * probes.) If the provider is NULL, the operations will be applied to all
9350 * providers; if the provider is non-NULL the operations will only be applied
9351 * to the specified provider. The dtrace_provider_lock must be held, and the
9352 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9353 * will need to grab the dtrace_lock when it reenters the framework through
9354 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9355 */
9356 static void
9357 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9358 {
9359 #ifdef illumos
9360 modctl_t *ctl;
9361 #endif
9362 int all = 0;
9363
9364 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
9365
9366 if (prv == NULL) {
9367 all = 1;
9368 prv = dtrace_provider;
9369 }
9370
9371 do {
9372 /*
9373 * First, call the blanket provide operation.
9374 */
9375 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9376
9377 #ifdef illumos
9378 /*
9379 * Now call the per-module provide operation. We will grab
9380 * mod_lock to prevent the list from being modified. Note
9381 * that this also prevents the mod_busy bits from changing.
9382 * (mod_busy can only be changed with mod_lock held.)
9383 */
9384 mutex_enter(&mod_lock);
9385
9386 ctl = &modules;
9387 do {
9388 if (ctl->mod_busy || ctl->mod_mp == NULL)
9389 continue;
9390
9391 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9392
9393 } while ((ctl = ctl->mod_next) != &modules);
9394
9395 mutex_exit(&mod_lock);
9396 #endif
9397 } while (all && (prv = prv->dtpv_next) != NULL);
9398 }
9399
9400 #ifdef illumos
9401 /*
9402 * Iterate over each probe, and call the Framework-to-Provider API function
9403 * denoted by offs.
9404 */
9405 static void
9406 dtrace_probe_foreach(uintptr_t offs)
9407 {
9408 dtrace_provider_t *prov;
9409 void (*func)(void *, dtrace_id_t, void *);
9410 dtrace_probe_t *probe;
9411 dtrace_icookie_t cookie;
9412 int i;
9413
9414 /*
9415 * We disable interrupts to walk through the probe array. This is
9416 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9417 * won't see stale data.
9418 */
9419 cookie = dtrace_interrupt_disable();
9420
9421 for (i = 0; i < dtrace_nprobes; i++) {
9422 if ((probe = dtrace_probes[i]) == NULL)
9423 continue;
9424
9425 if (probe->dtpr_ecb == NULL) {
9426 /*
9427 * This probe isn't enabled -- don't call the function.
9428 */
9429 continue;
9430 }
9431
9432 prov = probe->dtpr_provider;
9433 func = *((void(**)(void *, dtrace_id_t, void *))
9434 ((uintptr_t)&prov->dtpv_pops + offs));
9435
9436 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9437 }
9438
9439 dtrace_interrupt_enable(cookie);
9440 }
9441 #endif
9442
9443 static int
9444 dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9445 {
9446 dtrace_probekey_t pkey;
9447 uint32_t priv;
9448 uid_t uid;
9449 zoneid_t zoneid;
9450
9451 ASSERT(MUTEX_HELD(&dtrace_lock));
9452 dtrace_ecb_create_cache = NULL;
9453
9454 if (desc == NULL) {
9455 /*
9456 * If we're passed a NULL description, we're being asked to
9457 * create an ECB with a NULL probe.
9458 */
9459 (void) dtrace_ecb_create_enable(NULL, enab);
9460 return (0);
9461 }
9462
9463 dtrace_probekey(desc, &pkey);
9464 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9465 &priv, &uid, &zoneid);
9466
9467 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9468 enab));
9469 }
9470
9471 /*
9472 * DTrace Helper Provider Functions
9473 */
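/*
 * A dof_attr_t packs the name, data and class stability components of an
 * attribute into a single 32-bit word; the DOF_ATTR_NAME(), DOF_ATTR_DATA()
 * and DOF_ATTR_CLASS() macros used below extract the individual fields.
 */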
9474 static void
9475 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9476 {
9477 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9478 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9479 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9480 }
9481
9482 static void
9483 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9484 const dof_provider_t *dofprov, char *strtab)
9485 {
9486 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9487 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9488 dofprov->dofpv_provattr);
9489 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9490 dofprov->dofpv_modattr);
9491 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9492 dofprov->dofpv_funcattr);
9493 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9494 dofprov->dofpv_nameattr);
9495 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9496 dofprov->dofpv_argsattr);
9497 }
9498
9499 static void
9500 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9501 {
9502 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9503 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9504 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9505 dof_provider_t *provider;
9506 dof_probe_t *probe;
9507 uint32_t *off, *enoff;
9508 uint8_t *arg;
9509 char *strtab;
9510 uint_t i, nprobes;
9511 dtrace_helper_provdesc_t dhpv;
9512 dtrace_helper_probedesc_t dhpb;
9513 dtrace_meta_t *meta = dtrace_meta_pid;
9514 dtrace_mops_t *mops = &meta->dtm_mops;
9515 void *parg;
9516
9517 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9518 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9519 provider->dofpv_strtab * dof->dofh_secsize);
9520 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9521 provider->dofpv_probes * dof->dofh_secsize);
9522 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9523 provider->dofpv_prargs * dof->dofh_secsize);
9524 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9525 provider->dofpv_proffs * dof->dofh_secsize);
9526
9527 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9528 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9529 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9530 enoff = NULL;
9531
9532 /*
9533 * See dtrace_helper_provider_validate().
9534 */
9535 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9536 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9537 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9538 provider->dofpv_prenoffs * dof->dofh_secsize);
9539 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9540 }
9541
9542 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9543
9544 /*
9545 * Create the provider.
9546 */
9547 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9548
9549 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9550 return;
9551
9552 meta->dtm_count++;
9553
9554 /*
9555 * Create the probes.
9556 */
9557 for (i = 0; i < nprobes; i++) {
9558 probe = (dof_probe_t *)(uintptr_t)(daddr +
9559 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9560
9561 /* See the check in dtrace_helper_provider_validate(). */
9562 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
9563 continue;
9564
9565 dhpb.dthpb_mod = dhp->dofhp_mod;
9566 dhpb.dthpb_func = strtab + probe->dofpr_func;
9567 dhpb.dthpb_name = strtab + probe->dofpr_name;
9568 dhpb.dthpb_base = probe->dofpr_addr;
9569 dhpb.dthpb_offs = off + probe->dofpr_offidx;
9570 dhpb.dthpb_noffs = probe->dofpr_noffs;
9571 if (enoff != NULL) {
9572 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9573 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9574 } else {
9575 dhpb.dthpb_enoffs = NULL;
9576 dhpb.dthpb_nenoffs = 0;
9577 }
9578 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9579 dhpb.dthpb_nargc = probe->dofpr_nargc;
9580 dhpb.dthpb_xargc = probe->dofpr_xargc;
9581 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9582 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9583
9584 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9585 }
9586 }
9587
9588 static void
9589 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9590 {
9591 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9592 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9593 int i;
9594
9595 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9596
9597 for (i = 0; i < dof->dofh_secnum; i++) {
9598 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9599 dof->dofh_secoff + i * dof->dofh_secsize);
9600
9601 if (sec->dofs_type != DOF_SECT_PROVIDER)
9602 continue;
9603
9604 dtrace_helper_provide_one(dhp, sec, pid);
9605 }
9606
9607 /*
9608 * We may have just created probes, so we must now rematch against
9609 * any retained enablings. Note that this call will acquire both
9610 * cpu_lock and dtrace_lock; the fact that we are holding
9611 * dtrace_meta_lock now is what defines the ordering with respect to
9612 * these three locks.
9613 */
9614 dtrace_enabling_matchall();
9615 }
9616
9617 static void
9618 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9619 {
9620 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9621 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9622 dof_sec_t *str_sec;
9623 dof_provider_t *provider;
9624 char *strtab;
9625 dtrace_helper_provdesc_t dhpv;
9626 dtrace_meta_t *meta = dtrace_meta_pid;
9627 dtrace_mops_t *mops = &meta->dtm_mops;
9628
9629 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9630 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9631 provider->dofpv_strtab * dof->dofh_secsize);
9632
9633 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9634
9635 /*
9636  * Reconstruct the provider description to pass to the remove entry point.
9637 */
9638 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9639
9640 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9641
9642 meta->dtm_count--;
9643 }
9644
9645 static void
9646 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9647 {
9648 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9649 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9650 int i;
9651
9652 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9653
9654 for (i = 0; i < dof->dofh_secnum; i++) {
9655 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9656 dof->dofh_secoff + i * dof->dofh_secsize);
9657
9658 if (sec->dofs_type != DOF_SECT_PROVIDER)
9659 continue;
9660
9661 dtrace_helper_provider_remove_one(dhp, sec, pid);
9662 }
9663 }
9664
9665 /*
9666 * DTrace Meta Provider-to-Framework API Functions
9667 *
9668 * These functions implement the Meta Provider-to-Framework API, as described
9669 * in <sys/dtrace.h>.
9670 */
9671 int
9672 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9673 dtrace_meta_provider_id_t *idp)
9674 {
9675 dtrace_meta_t *meta;
9676 dtrace_helpers_t *help, *next;
9677 int i;
9678
9679 *idp = DTRACE_METAPROVNONE;
9680
9681 /*
9682 * We strictly don't need the name, but we hold onto it for
9683 * debuggability. All hail error queues!
9684 */
9685 if (name == NULL) {
9686 cmn_err(CE_WARN, "failed to register meta-provider: "
9687 "invalid name");
9688 return (EINVAL);
9689 }
9690
9691 if (mops == NULL ||
9692 mops->dtms_create_probe == NULL ||
9693 mops->dtms_provide_pid == NULL ||
9694 mops->dtms_remove_pid == NULL) {
9695 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9696 "invalid ops", name);
9697 return (EINVAL);
9698 }
9699
9700 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9701 meta->dtm_mops = *mops;
9702 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9703 (void) strcpy(meta->dtm_name, name);
9704 meta->dtm_arg = arg;
9705
9706 mutex_enter(&dtrace_meta_lock);
9707 mutex_enter(&dtrace_lock);
9708
9709 if (dtrace_meta_pid != NULL) {
9710 mutex_exit(&dtrace_lock);
9711 mutex_exit(&dtrace_meta_lock);
9712 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9713 "user-land meta-provider exists", name);
9714 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9715 kmem_free(meta, sizeof (dtrace_meta_t));
9716 return (EINVAL);
9717 }
9718
9719 dtrace_meta_pid = meta;
9720 *idp = (dtrace_meta_provider_id_t)meta;
9721
9722 /*
9723 * If there are providers and probes ready to go, pass them
9724 * off to the new meta provider now.
9725 */
9726
9727 help = dtrace_deferred_pid;
9728 dtrace_deferred_pid = NULL;
9729
9730 mutex_exit(&dtrace_lock);
9731
9732 while (help != NULL) {
9733 for (i = 0; i < help->dthps_nprovs; i++) {
9734 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9735 help->dthps_pid);
9736 }
9737
9738 next = help->dthps_next;
9739 help->dthps_next = NULL;
9740 help->dthps_prev = NULL;
9741 help->dthps_deferred = 0;
9742 help = next;
9743 }
9744
9745 mutex_exit(&dtrace_meta_lock);
9746
9747 return (0);
9748 }
9749
9750 int
9751 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9752 {
9753 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9754
9755 mutex_enter(&dtrace_meta_lock);
9756 mutex_enter(&dtrace_lock);
9757
9758 if (old == dtrace_meta_pid) {
9759 pp = &dtrace_meta_pid;
9760 } else {
9761 panic("attempt to unregister non-existent "
9762 "dtrace meta-provider %p\n", (void *)old);
9763 }
9764
9765 if (old->dtm_count != 0) {
9766 mutex_exit(&dtrace_lock);
9767 mutex_exit(&dtrace_meta_lock);
9768 return (EBUSY);
9769 }
9770
9771 *pp = NULL;
9772
9773 mutex_exit(&dtrace_lock);
9774 mutex_exit(&dtrace_meta_lock);
9775
9776 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9777 kmem_free(old, sizeof (dtrace_meta_t));
9778
9779 return (0);
9780 }
9781
9783 /*
9784 * DTrace DIF Object Functions
9785 */
9786 static int
9787 dtrace_difo_err(uint_t pc, const char *format, ...)
9788 {
9789 if (dtrace_err_verbose) {
9790 va_list alist;
9791
9792 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9793 va_start(alist, format);
9794 (void) vuprintf(format, alist);
9795 va_end(alist);
9796 }
9797
9798 #ifdef DTRACE_ERRDEBUG
9799 dtrace_errdebug(format);
9800 #endif
9801 return (1);
9802 }
9803
9804 /*
9805 * Validate a DTrace DIF object by checking the IR instructions. The following
9806 * rules are currently enforced by dtrace_difo_validate():
9807 *
9808 * 1. Each instruction must have a valid opcode
9809 * 2. Each register, string, variable, or subroutine reference must be valid
9810 * 3. No instruction can modify register %r0 (must be zero)
9811 * 4. All instruction reserved bits must be set to zero
9812 * 5. The last instruction must be a "ret" instruction
9813 * 6. All branch targets must reference a valid instruction _after_ the branch
9814 */
9815 static int
9816 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9817 cred_t *cr)
9818 {
9819 int err = 0, i;
9820 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9821 int kcheckload;
9822 uint_t pc;
9823 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9824
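/*
 * If the consumer cannot read kernel memory directly, every load must be
 * checked at run-time; the pass below rewrites such loads into their
 * checked (RLD) counterparts.
 */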
9825 kcheckload = cr == NULL ||
9826 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9827
9828 dp->dtdo_destructive = 0;
9829
9830 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9831 dif_instr_t instr = dp->dtdo_buf[pc];
9832
9833 uint_t r1 = DIF_INSTR_R1(instr);
9834 uint_t r2 = DIF_INSTR_R2(instr);
9835 uint_t rd = DIF_INSTR_RD(instr);
9836 uint_t rs = DIF_INSTR_RS(instr);
9837 uint_t label = DIF_INSTR_LABEL(instr);
9838 uint_t v = DIF_INSTR_VAR(instr);
9839 uint_t subr = DIF_INSTR_SUBR(instr);
9840 uint_t type = DIF_INSTR_TYPE(instr);
9841 uint_t op = DIF_INSTR_OP(instr);
9842
9843 switch (op) {
9844 case DIF_OP_OR:
9845 case DIF_OP_XOR:
9846 case DIF_OP_AND:
9847 case DIF_OP_SLL:
9848 case DIF_OP_SRL:
9849 case DIF_OP_SRA:
9850 case DIF_OP_SUB:
9851 case DIF_OP_ADD:
9852 case DIF_OP_MUL:
9853 case DIF_OP_SDIV:
9854 case DIF_OP_UDIV:
9855 case DIF_OP_SREM:
9856 case DIF_OP_UREM:
9857 case DIF_OP_COPYS:
9858 if (r1 >= nregs)
9859 err += efunc(pc, "invalid register %u\n", r1);
9860 if (r2 >= nregs)
9861 err += efunc(pc, "invalid register %u\n", r2);
9862 if (rd >= nregs)
9863 err += efunc(pc, "invalid register %u\n", rd);
9864 if (rd == 0)
9865 err += efunc(pc, "cannot write to %%r0\n");
9866 break;
9867 case DIF_OP_NOT:
9868 case DIF_OP_MOV:
9869 case DIF_OP_ALLOCS:
9870 if (r1 >= nregs)
9871 err += efunc(pc, "invalid register %u\n", r1);
9872 if (r2 != 0)
9873 err += efunc(pc, "non-zero reserved bits\n");
9874 if (rd >= nregs)
9875 err += efunc(pc, "invalid register %u\n", rd);
9876 if (rd == 0)
9877 err += efunc(pc, "cannot write to %%r0\n");
9878 break;
9879 case DIF_OP_LDSB:
9880 case DIF_OP_LDSH:
9881 case DIF_OP_LDSW:
9882 case DIF_OP_LDUB:
9883 case DIF_OP_LDUH:
9884 case DIF_OP_LDUW:
9885 case DIF_OP_LDX:
9886 if (r1 >= nregs)
9887 err += efunc(pc, "invalid register %u\n", r1);
9888 if (r2 != 0)
9889 err += efunc(pc, "non-zero reserved bits\n");
9890 if (rd >= nregs)
9891 err += efunc(pc, "invalid register %u\n", rd);
9892 if (rd == 0)
9893 err += efunc(pc, "cannot write to %%r0\n");
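/*
 * Rewrite the unchecked load into its run-time checked (RLD)
 * equivalent if required.
 */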
9894 if (kcheckload)
9895 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9896 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9897 break;
9898 case DIF_OP_RLDSB:
9899 case DIF_OP_RLDSH:
9900 case DIF_OP_RLDSW:
9901 case DIF_OP_RLDUB:
9902 case DIF_OP_RLDUH:
9903 case DIF_OP_RLDUW:
9904 case DIF_OP_RLDX:
9905 if (r1 >= nregs)
9906 err += efunc(pc, "invalid register %u\n", r1);
9907 if (r2 != 0)
9908 err += efunc(pc, "non-zero reserved bits\n");
9909 if (rd >= nregs)
9910 err += efunc(pc, "invalid register %u\n", rd);
9911 if (rd == 0)
9912 err += efunc(pc, "cannot write to %%r0\n");
9913 break;
9914 case DIF_OP_ULDSB:
9915 case DIF_OP_ULDSH:
9916 case DIF_OP_ULDSW:
9917 case DIF_OP_ULDUB:
9918 case DIF_OP_ULDUH:
9919 case DIF_OP_ULDUW:
9920 case DIF_OP_ULDX:
9921 if (r1 >= nregs)
9922 err += efunc(pc, "invalid register %u\n", r1);
9923 if (r2 != 0)
9924 err += efunc(pc, "non-zero reserved bits\n");
9925 if (rd >= nregs)
9926 err += efunc(pc, "invalid register %u\n", rd);
9927 if (rd == 0)
9928 err += efunc(pc, "cannot write to %%r0\n");
9929 break;
9930 case DIF_OP_STB:
9931 case DIF_OP_STH:
9932 case DIF_OP_STW:
9933 case DIF_OP_STX:
9934 if (r1 >= nregs)
9935 err += efunc(pc, "invalid register %u\n", r1);
9936 if (r2 != 0)
9937 err += efunc(pc, "non-zero reserved bits\n");
9938 if (rd >= nregs)
9939 err += efunc(pc, "invalid register %u\n", rd);
9940 if (rd == 0)
9941 err += efunc(pc, "cannot write to 0 address\n");
9942 break;
9943 case DIF_OP_CMP:
9944 case DIF_OP_SCMP:
9945 if (r1 >= nregs)
9946 err += efunc(pc, "invalid register %u\n", r1);
9947 if (r2 >= nregs)
9948 err += efunc(pc, "invalid register %u\n", r2);
9949 if (rd != 0)
9950 err += efunc(pc, "non-zero reserved bits\n");
9951 break;
9952 case DIF_OP_TST:
9953 if (r1 >= nregs)
9954 err += efunc(pc, "invalid register %u\n", r1);
9955 if (r2 != 0 || rd != 0)
9956 err += efunc(pc, "non-zero reserved bits\n");
9957 break;
9958 case DIF_OP_BA:
9959 case DIF_OP_BE:
9960 case DIF_OP_BNE:
9961 case DIF_OP_BG:
9962 case DIF_OP_BGU:
9963 case DIF_OP_BGE:
9964 case DIF_OP_BGEU:
9965 case DIF_OP_BL:
9966 case DIF_OP_BLU:
9967 case DIF_OP_BLE:
9968 case DIF_OP_BLEU:
9969 if (label >= dp->dtdo_len) {
9970 err += efunc(pc, "invalid branch target %u\n",
9971 label);
9972 }
9973 if (label <= pc) {
9974 err += efunc(pc, "backward branch to %u\n",
9975 label);
9976 }
9977 break;
9978 case DIF_OP_RET:
9979 if (r1 != 0 || r2 != 0)
9980 err += efunc(pc, "non-zero reserved bits\n");
9981 if (rd >= nregs)
9982 err += efunc(pc, "invalid register %u\n", rd);
9983 break;
9984 case DIF_OP_NOP:
9985 case DIF_OP_POPTS:
9986 case DIF_OP_FLUSHTS:
9987 if (r1 != 0 || r2 != 0 || rd != 0)
9988 err += efunc(pc, "non-zero reserved bits\n");
9989 break;
9990 case DIF_OP_SETX:
9991 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9992 err += efunc(pc, "invalid integer ref %u\n",
9993 DIF_INSTR_INTEGER(instr));
9994 }
9995 if (rd >= nregs)
9996 err += efunc(pc, "invalid register %u\n", rd);
9997 if (rd == 0)
9998 err += efunc(pc, "cannot write to %%r0\n");
9999 break;
10000 case DIF_OP_SETS:
10001 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
10002 err += efunc(pc, "invalid string ref %u\n",
10003 DIF_INSTR_STRING(instr));
10004 }
10005 if (rd >= nregs)
10006 err += efunc(pc, "invalid register %u\n", rd);
10007 if (rd == 0)
10008 err += efunc(pc, "cannot write to %%r0\n");
10009 break;
10010 case DIF_OP_LDGA:
10011 case DIF_OP_LDTA:
10012 if (r1 > DIF_VAR_ARRAY_MAX)
10013 err += efunc(pc, "invalid array %u\n", r1);
10014 if (r2 >= nregs)
10015 err += efunc(pc, "invalid register %u\n", r2);
10016 if (rd >= nregs)
10017 err += efunc(pc, "invalid register %u\n", rd);
10018 if (rd == 0)
10019 err += efunc(pc, "cannot write to %%r0\n");
10020 break;
10021 case DIF_OP_LDGS:
10022 case DIF_OP_LDTS:
10023 case DIF_OP_LDLS:
10024 case DIF_OP_LDGAA:
10025 case DIF_OP_LDTAA:
10026 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
10027 err += efunc(pc, "invalid variable %u\n", v);
10028 if (rd >= nregs)
10029 err += efunc(pc, "invalid register %u\n", rd);
10030 if (rd == 0)
10031 err += efunc(pc, "cannot write to %%r0\n");
10032 break;
10033 case DIF_OP_STGS:
10034 case DIF_OP_STTS:
10035 case DIF_OP_STLS:
10036 case DIF_OP_STGAA:
10037 case DIF_OP_STTAA:
10038 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
10039 err += efunc(pc, "invalid variable %u\n", v);
10040 if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rs);
10042 break;
10043 case DIF_OP_CALL:
10044 if (subr > DIF_SUBR_MAX)
10045 err += efunc(pc, "invalid subr %u\n", subr);
10046 if (rd >= nregs)
10047 err += efunc(pc, "invalid register %u\n", rd);
10048 if (rd == 0)
10049 err += efunc(pc, "cannot write to %%r0\n");
10050
10051 if (subr == DIF_SUBR_COPYOUT ||
10052 subr == DIF_SUBR_COPYOUTSTR) {
10053 dp->dtdo_destructive = 1;
10054 }
10055
10056 if (subr == DIF_SUBR_GETF) {
10057 #ifdef __FreeBSD__
10058 err += efunc(pc, "getf() not supported");
10059 #else
10060 /*
10061 * If we have a getf() we need to record that
10062 * in our state. Note that our state can be
10063 * NULL if this is a helper -- but in that
10064 * case, the call to getf() is itself illegal,
10065 * and will be caught (slightly later) when
10066 * the helper is validated.
10067 */
10068 if (vstate->dtvs_state != NULL)
10069 vstate->dtvs_state->dts_getf++;
10070 #endif
10071 }
10072
10073 break;
10074 case DIF_OP_PUSHTR:
10075 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
10076 err += efunc(pc, "invalid ref type %u\n", type);
10077 if (r2 >= nregs)
10078 err += efunc(pc, "invalid register %u\n", r2);
10079 if (rs >= nregs)
10080 err += efunc(pc, "invalid register %u\n", rs);
10081 break;
10082 case DIF_OP_PUSHTV:
10083 if (type != DIF_TYPE_CTF)
10084 err += efunc(pc, "invalid val type %u\n", type);
10085 if (r2 >= nregs)
10086 err += efunc(pc, "invalid register %u\n", r2);
10087 if (rs >= nregs)
10088 err += efunc(pc, "invalid register %u\n", rs);
10089 break;
10090 default:
10091 err += efunc(pc, "invalid opcode %u\n",
10092 DIF_INSTR_OP(instr));
10093 }
10094 }
10095
10096 if (dp->dtdo_len != 0 &&
10097 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
10098 err += efunc(dp->dtdo_len - 1,
10099 "expected 'ret' as last DIF instruction\n");
10100 }
10101
10102 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
10103 /*
10104 * If we're not returning by reference, the size must be either
10105 * 0 or the size of one of the base types.
10106 */
10107 switch (dp->dtdo_rtype.dtdt_size) {
10108 case 0:
10109 case sizeof (uint8_t):
10110 case sizeof (uint16_t):
10111 case sizeof (uint32_t):
10112 case sizeof (uint64_t):
10113 break;
10114
10115 default:
10116 err += efunc(dp->dtdo_len - 1, "bad return size\n");
10117 }
10118 }
10119
10120 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
10121 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
10122 dtrace_diftype_t *vt, *et;
10123 uint_t id, ndx;
10124
10125 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
10126 v->dtdv_scope != DIFV_SCOPE_THREAD &&
10127 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
10128 err += efunc(i, "unrecognized variable scope %d\n",
10129 v->dtdv_scope);
10130 break;
10131 }
10132
10133 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
10134 v->dtdv_kind != DIFV_KIND_SCALAR) {
10135 err += efunc(i, "unrecognized variable type %d\n",
10136 v->dtdv_kind);
10137 break;
10138 }
10139
10140 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
10141 err += efunc(i, "%d exceeds variable id limit\n", id);
10142 break;
10143 }
10144
10145 if (id < DIF_VAR_OTHER_UBASE)
10146 continue;
10147
10148 /*
10149 * For user-defined variables, we need to check that this
10150 * definition is identical to any previous definition that we
10151 * encountered.
10152 */
10153 ndx = id - DIF_VAR_OTHER_UBASE;
10154
10155 switch (v->dtdv_scope) {
10156 case DIFV_SCOPE_GLOBAL:
10157 if (maxglobal == -1 || ndx > maxglobal)
10158 maxglobal = ndx;
10159
10160 if (ndx < vstate->dtvs_nglobals) {
10161 dtrace_statvar_t *svar;
10162
10163 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10164 existing = &svar->dtsv_var;
10165 }
10166
10167 break;
10168
10169 case DIFV_SCOPE_THREAD:
10170 if (maxtlocal == -1 || ndx > maxtlocal)
10171 maxtlocal = ndx;
10172
10173 if (ndx < vstate->dtvs_ntlocals)
10174 existing = &vstate->dtvs_tlocals[ndx];
10175 break;
10176
10177 case DIFV_SCOPE_LOCAL:
10178 if (maxlocal == -1 || ndx > maxlocal)
10179 maxlocal = ndx;
10180
10181 if (ndx < vstate->dtvs_nlocals) {
10182 dtrace_statvar_t *svar;
10183
10184 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10185 existing = &svar->dtsv_var;
10186 }
10187
10188 break;
10189 }
10190
10191 vt = &v->dtdv_type;
10192
10193 if (vt->dtdt_flags & DIF_TF_BYREF) {
10194 if (vt->dtdt_size == 0) {
10195 err += efunc(i, "zero-sized variable\n");
10196 break;
10197 }
10198
10199 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
10200 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10201 vt->dtdt_size > dtrace_statvar_maxsize) {
10202 err += efunc(i, "oversized by-ref static\n");
10203 break;
10204 }
10205 }
10206
10207 if (existing == NULL || existing->dtdv_id == 0)
10208 continue;
10209
10210 ASSERT(existing->dtdv_id == v->dtdv_id);
10211 ASSERT(existing->dtdv_scope == v->dtdv_scope);
10212
10213 if (existing->dtdv_kind != v->dtdv_kind)
10214 err += efunc(i, "%d changed variable kind\n", id);
10215
10216 et = &existing->dtdv_type;
10217
10218 if (vt->dtdt_flags != et->dtdt_flags) {
10219 err += efunc(i, "%d changed variable type flags\n", id);
10220 break;
10221 }
10222
10223 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
10224 err += efunc(i, "%d changed variable type size\n", id);
10225 break;
10226 }
10227 }
10228
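/*
 * Make a second pass over the instructions, checking that each variable
 * reference falls at or below the highest index seen for its scope in
 * the variable table above.
 */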
10229 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
10230 dif_instr_t instr = dp->dtdo_buf[pc];
10231
10232 uint_t v = DIF_INSTR_VAR(instr);
10233 uint_t op = DIF_INSTR_OP(instr);
10234
10235 switch (op) {
10236 case DIF_OP_LDGS:
10237 case DIF_OP_LDGAA:
10238 case DIF_OP_STGS:
10239 case DIF_OP_STGAA:
10240 if (v > DIF_VAR_OTHER_UBASE + maxglobal)
10241 err += efunc(pc, "invalid variable %u\n", v);
10242 break;
10243 case DIF_OP_LDTS:
10244 case DIF_OP_LDTAA:
10245 case DIF_OP_STTS:
10246 case DIF_OP_STTAA:
10247 if (v > DIF_VAR_OTHER_UBASE + maxtlocal)
10248 err += efunc(pc, "invalid variable %u\n", v);
10249 break;
10250 case DIF_OP_LDLS:
10251 case DIF_OP_STLS:
10252 if (v > DIF_VAR_OTHER_UBASE + maxlocal)
10253 err += efunc(pc, "invalid variable %u\n", v);
10254 break;
10255 default:
10256 break;
10257 }
10258 }
10259
10260 return (err);
10261 }
10262
10263 /*
 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
10265 * are much more constrained than normal DIFOs. Specifically, they may
10266 * not:
10267 *
 * 1. Make calls to subroutines other than copyin(), copyinstr() or
 *    miscellaneous string routines.
10270 * 2. Access DTrace variables other than the args[] array, and the
10271 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10272 * 3. Have thread-local variables.
10273 * 4. Have dynamic variables.
10274 */
10275 static int
10276 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10277 {
10278 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10279 int err = 0;
10280 uint_t pc;
10281
10282 for (pc = 0; pc < dp->dtdo_len; pc++) {
10283 dif_instr_t instr = dp->dtdo_buf[pc];
10284
10285 uint_t v = DIF_INSTR_VAR(instr);
10286 uint_t subr = DIF_INSTR_SUBR(instr);
10287 uint_t op = DIF_INSTR_OP(instr);
10288
10289 switch (op) {
10290 case DIF_OP_OR:
10291 case DIF_OP_XOR:
10292 case DIF_OP_AND:
10293 case DIF_OP_SLL:
10294 case DIF_OP_SRL:
10295 case DIF_OP_SRA:
10296 case DIF_OP_SUB:
10297 case DIF_OP_ADD:
10298 case DIF_OP_MUL:
10299 case DIF_OP_SDIV:
10300 case DIF_OP_UDIV:
10301 case DIF_OP_SREM:
10302 case DIF_OP_UREM:
10303 case DIF_OP_COPYS:
10304 case DIF_OP_NOT:
10305 case DIF_OP_MOV:
10306 case DIF_OP_RLDSB:
10307 case DIF_OP_RLDSH:
10308 case DIF_OP_RLDSW:
10309 case DIF_OP_RLDUB:
10310 case DIF_OP_RLDUH:
10311 case DIF_OP_RLDUW:
10312 case DIF_OP_RLDX:
10313 case DIF_OP_ULDSB:
10314 case DIF_OP_ULDSH:
10315 case DIF_OP_ULDSW:
10316 case DIF_OP_ULDUB:
10317 case DIF_OP_ULDUH:
10318 case DIF_OP_ULDUW:
10319 case DIF_OP_ULDX:
10320 case DIF_OP_STB:
10321 case DIF_OP_STH:
10322 case DIF_OP_STW:
10323 case DIF_OP_STX:
10324 case DIF_OP_ALLOCS:
10325 case DIF_OP_CMP:
10326 case DIF_OP_SCMP:
10327 case DIF_OP_TST:
10328 case DIF_OP_BA:
10329 case DIF_OP_BE:
10330 case DIF_OP_BNE:
10331 case DIF_OP_BG:
10332 case DIF_OP_BGU:
10333 case DIF_OP_BGE:
10334 case DIF_OP_BGEU:
10335 case DIF_OP_BL:
10336 case DIF_OP_BLU:
10337 case DIF_OP_BLE:
10338 case DIF_OP_BLEU:
10339 case DIF_OP_RET:
10340 case DIF_OP_NOP:
10341 case DIF_OP_POPTS:
10342 case DIF_OP_FLUSHTS:
10343 case DIF_OP_SETX:
10344 case DIF_OP_SETS:
10345 case DIF_OP_LDGA:
10346 case DIF_OP_LDLS:
10347 case DIF_OP_STGS:
10348 case DIF_OP_STLS:
10349 case DIF_OP_PUSHTR:
10350 case DIF_OP_PUSHTV:
10351 break;
10352
10353 case DIF_OP_LDGS:
10354 if (v >= DIF_VAR_OTHER_UBASE)
10355 break;
10356
10357 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10358 break;
10359
10360 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10361 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10362 v == DIF_VAR_EXECARGS ||
10363 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10364 v == DIF_VAR_UID || v == DIF_VAR_GID)
10365 break;
10366
10367 err += efunc(pc, "illegal variable %u\n", v);
10368 break;
10369
10370 case DIF_OP_LDTA:
10371 case DIF_OP_LDTS:
10372 case DIF_OP_LDGAA:
10373 case DIF_OP_LDTAA:
10374 err += efunc(pc, "illegal dynamic variable load\n");
10375 break;
10376
10377 case DIF_OP_STTS:
10378 case DIF_OP_STGAA:
10379 case DIF_OP_STTAA:
10380 err += efunc(pc, "illegal dynamic variable store\n");
10381 break;
10382
10383 case DIF_OP_CALL:
10384 if (subr == DIF_SUBR_ALLOCA ||
10385 subr == DIF_SUBR_BCOPY ||
10386 subr == DIF_SUBR_COPYIN ||
10387 subr == DIF_SUBR_COPYINTO ||
10388 subr == DIF_SUBR_COPYINSTR ||
10389 subr == DIF_SUBR_INDEX ||
10390 subr == DIF_SUBR_INET_NTOA ||
10391 subr == DIF_SUBR_INET_NTOA6 ||
10392 subr == DIF_SUBR_INET_NTOP ||
10393 subr == DIF_SUBR_JSON ||
10394 subr == DIF_SUBR_LLTOSTR ||
10395 subr == DIF_SUBR_STRTOLL ||
10396 subr == DIF_SUBR_RINDEX ||
10397 subr == DIF_SUBR_STRCHR ||
10398 subr == DIF_SUBR_STRJOIN ||
10399 subr == DIF_SUBR_STRRCHR ||
10400 subr == DIF_SUBR_STRSTR ||
10401 subr == DIF_SUBR_HTONS ||
10402 subr == DIF_SUBR_HTONL ||
10403 subr == DIF_SUBR_HTONLL ||
10404 subr == DIF_SUBR_NTOHS ||
10405 subr == DIF_SUBR_NTOHL ||
10406 subr == DIF_SUBR_NTOHLL ||
10407 subr == DIF_SUBR_MEMREF)
10408 break;
10409 #ifdef __FreeBSD__
10410 if (subr == DIF_SUBR_MEMSTR)
10411 break;
10412 #endif
10413
10414 err += efunc(pc, "invalid subr %u\n", subr);
10415 break;
10416
10417 default:
10418 err += efunc(pc, "invalid opcode %u\n",
10419 DIF_INSTR_OP(instr));
10420 }
10421 }
10422
10423 return (err);
10424 }
10425
10426 /*
10427 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10428 * basis; 0 if not.
10429 */
10430 static int
10431 dtrace_difo_cacheable(dtrace_difo_t *dp)
10432 {
10433 int i;
10434
10435 if (dp == NULL)
10436 return (0);
10437
10438 for (i = 0; i < dp->dtdo_varlen; i++) {
10439 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10440
10441 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10442 continue;
10443
10444 switch (v->dtdv_id) {
10445 case DIF_VAR_CURTHREAD:
10446 case DIF_VAR_PID:
10447 case DIF_VAR_TID:
10448 case DIF_VAR_EXECARGS:
10449 case DIF_VAR_EXECNAME:
10450 case DIF_VAR_ZONENAME:
10451 break;
10452
10453 default:
10454 return (0);
10455 }
10456 }
10457
10458 /*
10459 * This DIF object may be cacheable. Now we need to look for any
10460 * array loading instructions, any memory loading instructions, or
10461 * any stores to thread-local variables.
10462 */
10463 for (i = 0; i < dp->dtdo_len; i++) {
10464 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10465
10466 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10467 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10468 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10469 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10470 return (0);
10471 }
10472
10473 return (1);
10474 }
10475
10476 static void
10477 dtrace_difo_hold(dtrace_difo_t *dp)
10478 {
10479 int i;
10480
10481 ASSERT(MUTEX_HELD(&dtrace_lock));
10482
10483 dp->dtdo_refcnt++;
10484 ASSERT(dp->dtdo_refcnt != 0);
10485
/*
 * Check this DIF object for references to the variable
 * DIF_VAR_VTIMESTAMP; the first such reference system-wide enables
 * virtual time accounting.
 */
10490 for (i = 0; i < dp->dtdo_varlen; i++) {
10491 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10492
10493 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10494 continue;
10495
10496 if (dtrace_vtime_references++ == 0)
10497 dtrace_vtime_enable();
10498 }
10499 }
10500
10501 /*
10502 * This routine calculates the dynamic variable chunksize for a given DIF
10503 * object. The calculation is not fool-proof, and can probably be tricked by
10504 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10505 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10506 * if a dynamic variable size exceeds the chunksize.
10507 */
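/*
 * As an illustration (not an exhaustive description of compiler output):
 * a store to a thread-local such as self->x = 1 (an "stts") implies two
 * zero-sized keys (the thread and the variable id), while each tuple key
 * pushed via "pushtr"/"pushtv" contributes either its setx-supplied size
 * or, for strings of unknown size, the system-wide default string size.
 * The chunk must then cover the dtrace_dynvar_t header, one dtrace_key_t
 * per key, the rounded-up key data, and the stored value itself.
 */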
10508 static void
10509 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10510 {
10511 uint64_t sval = 0;
10512 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10513 const dif_instr_t *text = dp->dtdo_buf;
10514 uint_t pc, srd = 0;
10515 uint_t ttop = 0;
10516 size_t size, ksize;
10517 uint_t id, i;
10518
10519 for (pc = 0; pc < dp->dtdo_len; pc++) {
10520 dif_instr_t instr = text[pc];
10521 uint_t op = DIF_INSTR_OP(instr);
10522 uint_t rd = DIF_INSTR_RD(instr);
10523 uint_t r1 = DIF_INSTR_R1(instr);
10524 uint_t nkeys = 0;
10525 uchar_t scope = 0;
10526
10527 dtrace_key_t *key = tupregs;
10528
10529 switch (op) {
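/*
 * Track the most recent "setx": its constant (sval) and destination
 * register (srd) may supply the size operand of a subsequent "pushtr".
 */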
10530 case DIF_OP_SETX:
10531 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10532 srd = rd;
10533 continue;
10534
10535 case DIF_OP_STTS:
10536 key = &tupregs[DIF_DTR_NREGS];
10537 key[0].dttk_size = 0;
10538 key[1].dttk_size = 0;
10539 nkeys = 2;
10540 scope = DIFV_SCOPE_THREAD;
10541 break;
10542
10543 case DIF_OP_STGAA:
10544 case DIF_OP_STTAA:
10545 nkeys = ttop;
10546
if (op == DIF_OP_STTAA)
10548 key[nkeys++].dttk_size = 0;
10549
10550 key[nkeys++].dttk_size = 0;
10551
10552 if (op == DIF_OP_STTAA) {
10553 scope = DIFV_SCOPE_THREAD;
10554 } else {
10555 scope = DIFV_SCOPE_GLOBAL;
10556 }
10557
10558 break;
10559
10560 case DIF_OP_PUSHTR:
10561 if (ttop == DIF_DTR_NREGS)
10562 return;
10563
10564 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10565 /*
10566 * If the register for the size of the "pushtr"
10567 * is %r0 (or the value is 0) and the type is
10568 * a string, we'll use the system-wide default
10569 * string size.
10570 */
10571 tupregs[ttop++].dttk_size =
10572 dtrace_strsize_default;
10573 } else {
10574 if (srd == 0)
10575 return;
10576
10577 if (sval > LONG_MAX)
10578 return;
10579
10580 tupregs[ttop++].dttk_size = sval;
10581 }
10582
10583 break;
10584
10585 case DIF_OP_PUSHTV:
10586 if (ttop == DIF_DTR_NREGS)
10587 return;
10588
10589 tupregs[ttop++].dttk_size = 0;
10590 break;
10591
10592 case DIF_OP_FLUSHTS:
10593 ttop = 0;
10594 break;
10595
10596 case DIF_OP_POPTS:
10597 if (ttop != 0)
10598 ttop--;
10599 break;
10600 }
10601
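/*
 * Any instruction other than a "setx" (which continues above)
 * invalidates the tracked size constant.
 */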
10602 sval = 0;
10603 srd = 0;
10604
10605 if (nkeys == 0)
10606 continue;
10607
10608 /*
10609 * We have a dynamic variable allocation; calculate its size.
10610 */
10611 for (ksize = 0, i = 0; i < nkeys; i++)
10612 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10613
10614 size = sizeof (dtrace_dynvar_t);
10615 size += sizeof (dtrace_key_t) * (nkeys - 1);
10616 size += ksize;
10617
10618 /*
10619 * Now we need to determine the size of the stored data.
10620 */
10621 id = DIF_INSTR_VAR(instr);
10622
10623 for (i = 0; i < dp->dtdo_varlen; i++) {
10624 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10625
10626 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10627 size += v->dtdv_type.dtdt_size;
10628 break;
10629 }
10630 }
10631
10632 if (i == dp->dtdo_varlen)
10633 return;
10634
10635 /*
10636 * We have the size. If this is larger than the chunk size
10637 * for our dynamic variable state, reset the chunk size.
10638 */
10639 size = P2ROUNDUP(size, sizeof (uint64_t));
10640
10641 /*
10642 * Before setting the chunk size, check that we're not going
10643 * to set it to a negative value...
10644 */
10645 if (size > LONG_MAX)
10646 return;
10647
10648 /*
10649 * ...and make certain that we didn't badly overflow.
10650 */
10651 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10652 return;
10653
10654 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10655 vstate->dtvs_dynvars.dtds_chunksize = size;
10656 }
10657 }
10658
10659 static void
10660 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10661 {
10662 int i, oldsvars, osz, nsz, otlocals, ntlocals;
10663 uint_t id;
10664
10665 ASSERT(MUTEX_HELD(&dtrace_lock));
10666 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10667
10668 for (i = 0; i < dp->dtdo_varlen; i++) {
10669 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10670 dtrace_statvar_t *svar, ***svarp = NULL;
10671 size_t dsize = 0;
10672 uint8_t scope = v->dtdv_scope;
10673 int *np = NULL;
10674
10675 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10676 continue;
10677
10678 id -= DIF_VAR_OTHER_UBASE;
10679
10680 switch (scope) {
10681 case DIFV_SCOPE_THREAD:
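/*
 * Grow the thread-local variable table by doubling until it can
 * index this variable.
 */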
10682 while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10683 dtrace_difv_t *tlocals;
10684
10685 if ((ntlocals = (otlocals << 1)) == 0)
10686 ntlocals = 1;
10687
10688 osz = otlocals * sizeof (dtrace_difv_t);
10689 nsz = ntlocals * sizeof (dtrace_difv_t);
10690
10691 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10692
10693 if (osz != 0) {
10694 bcopy(vstate->dtvs_tlocals,
10695 tlocals, osz);
10696 kmem_free(vstate->dtvs_tlocals, osz);
10697 }
10698
10699 vstate->dtvs_tlocals = tlocals;
10700 vstate->dtvs_ntlocals = ntlocals;
10701 }
10702
10703 vstate->dtvs_tlocals[id] = *v;
10704 continue;
10705
10706 case DIFV_SCOPE_LOCAL:
10707 np = &vstate->dtvs_nlocals;
10708 svarp = &vstate->dtvs_locals;
10709
10710 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10711 dsize = (mp_maxid + 1) *
10712 (v->dtdv_type.dtdt_size +
10713 sizeof (uint64_t));
10714 else
10715 dsize = (mp_maxid + 1) * sizeof (uint64_t);
10716
10717 break;
10718
10719 case DIFV_SCOPE_GLOBAL:
10720 np = &vstate->dtvs_nglobals;
10721 svarp = &vstate->dtvs_globals;
10722
10723 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10724 dsize = v->dtdv_type.dtdt_size +
10725 sizeof (uint64_t);
10726
10727 break;
10728
10729 default:
10730 ASSERT(0);
10731 }
10732
10733 while (id >= (oldsvars = *np)) {
10734 dtrace_statvar_t **statics;
10735 int newsvars, oldsize, newsize;
10736
10737 if ((newsvars = (oldsvars << 1)) == 0)
10738 newsvars = 1;
10739
10740 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10741 newsize = newsvars * sizeof (dtrace_statvar_t *);
10742
10743 statics = kmem_zalloc(newsize, KM_SLEEP);
10744
10745 if (oldsize != 0) {
10746 bcopy(*svarp, statics, oldsize);
10747 kmem_free(*svarp, oldsize);
10748 }
10749
10750 *svarp = statics;
10751 *np = newsvars;
10752 }
10753
10754 if ((svar = (*svarp)[id]) == NULL) {
10755 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10756 svar->dtsv_var = *v;
10757
10758 if ((svar->dtsv_size = dsize) != 0) {
10759 svar->dtsv_data = (uint64_t)(uintptr_t)
10760 kmem_zalloc(dsize, KM_SLEEP);
10761 }
10762
10763 (*svarp)[id] = svar;
10764 }
10765
10766 svar->dtsv_refcnt++;
10767 }
10768
10769 dtrace_difo_chunksize(dp, vstate);
10770 dtrace_difo_hold(dp);
10771 }
10772
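/*
 * Deep-copy a DIF object -- its instruction buffer and its string,
 * integer and variable tables -- and initialize the copy against the
 * given vstate.
 */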
10773 static dtrace_difo_t *
10774 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10775 {
10776 dtrace_difo_t *new;
10777 size_t sz;
10778
10779 ASSERT(dp->dtdo_buf != NULL);
10780 ASSERT(dp->dtdo_refcnt != 0);
10781
10782 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10783
10784 ASSERT(dp->dtdo_buf != NULL);
10785 sz = dp->dtdo_len * sizeof (dif_instr_t);
10786 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10787 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10788 new->dtdo_len = dp->dtdo_len;
10789
10790 if (dp->dtdo_strtab != NULL) {
10791 ASSERT(dp->dtdo_strlen != 0);
10792 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10793 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10794 new->dtdo_strlen = dp->dtdo_strlen;
10795 }
10796
10797 if (dp->dtdo_inttab != NULL) {
10798 ASSERT(dp->dtdo_intlen != 0);
10799 sz = dp->dtdo_intlen * sizeof (uint64_t);
10800 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10801 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10802 new->dtdo_intlen = dp->dtdo_intlen;
10803 }
10804
10805 if (dp->dtdo_vartab != NULL) {
10806 ASSERT(dp->dtdo_varlen != 0);
10807 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10808 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10809 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10810 new->dtdo_varlen = dp->dtdo_varlen;
10811 }
10812
10813 dtrace_difo_init(new, vstate);
10814 return (new);
10815 }
10816
10817 static void
10818 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10819 {
10820 int i;
10821
10822 ASSERT(dp->dtdo_refcnt == 0);
10823
10824 for (i = 0; i < dp->dtdo_varlen; i++) {
10825 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10826 dtrace_statvar_t *svar, **svarp = NULL;
10827 uint_t id;
10828 uint8_t scope = v->dtdv_scope;
10829 int *np = NULL;
10830
10831 switch (scope) {
10832 case DIFV_SCOPE_THREAD:
10833 continue;
10834
10835 case DIFV_SCOPE_LOCAL:
10836 np = &vstate->dtvs_nlocals;
10837 svarp = vstate->dtvs_locals;
10838 break;
10839
10840 case DIFV_SCOPE_GLOBAL:
10841 np = &vstate->dtvs_nglobals;
10842 svarp = vstate->dtvs_globals;
10843 break;
10844
10845 default:
10846 ASSERT(0);
10847 }
10848
10849 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10850 continue;
10851
10852 id -= DIF_VAR_OTHER_UBASE;
10853 ASSERT(id < *np);
10854
10855 svar = svarp[id];
10856 ASSERT(svar != NULL);
10857 ASSERT(svar->dtsv_refcnt > 0);
10858
10859 if (--svar->dtsv_refcnt > 0)
10860 continue;
10861
10862 if (svar->dtsv_size != 0) {
10863 ASSERT(svar->dtsv_data != 0);
10864 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10865 svar->dtsv_size);
10866 }
10867
10868 kmem_free(svar, sizeof (dtrace_statvar_t));
10869 svarp[id] = NULL;
10870 }
10871
10872 if (dp->dtdo_buf != NULL)
10873 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10874 if (dp->dtdo_inttab != NULL)
10875 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10876 if (dp->dtdo_strtab != NULL)
10877 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10878 if (dp->dtdo_vartab != NULL)
10879 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10880
10881 kmem_free(dp, sizeof (dtrace_difo_t));
10882 }
10883
10884 static void
10885 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10886 {
10887 int i;
10888
10889 ASSERT(MUTEX_HELD(&dtrace_lock));
10890 ASSERT(dp->dtdo_refcnt != 0);
10891
10892 for (i = 0; i < dp->dtdo_varlen; i++) {
10893 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10894
10895 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10896 continue;
10897
10898 ASSERT(dtrace_vtime_references > 0);
10899 if (--dtrace_vtime_references == 0)
10900 dtrace_vtime_disable();
10901 }
10902
10903 if (--dp->dtdo_refcnt == 0)
10904 dtrace_difo_destroy(dp, vstate);
10905 }
10906
10907 /*
10908 * DTrace Format Functions
10909 */
10910 static uint16_t
10911 dtrace_format_add(dtrace_state_t *state, char *str)
10912 {
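/*
 * Format handles are 1-based: 0 is reserved to mean "no format" and
 * doubles as the failure return.
 */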
10913 char *fmt, **new;
10914 uint16_t ndx, len = strlen(str) + 1;
10915
10916 fmt = kmem_zalloc(len, KM_SLEEP);
10917 bcopy(str, fmt, len);
10918
10919 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10920 if (state->dts_formats[ndx] == NULL) {
10921 state->dts_formats[ndx] = fmt;
10922 return (ndx + 1);
10923 }
10924 }
10925
10926 if (state->dts_nformats == USHRT_MAX) {
10927 /*
10928 * This is only likely if a denial-of-service attack is being
10929 * attempted. As such, it's okay to fail silently here.
10930 */
10931 kmem_free(fmt, len);
10932 return (0);
10933 }
10934
10935 /*
10936 * For simplicity, we always resize the formats array to be exactly the
10937 * number of formats.
10938 */
10939 ndx = state->dts_nformats++;
10940 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10941
10942 if (state->dts_formats != NULL) {
10943 ASSERT(ndx != 0);
10944 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10945 kmem_free(state->dts_formats, ndx * sizeof (char *));
10946 }
10947
10948 state->dts_formats = new;
10949 state->dts_formats[ndx] = fmt;
10950
10951 return (ndx + 1);
10952 }
10953
10954 static void
10955 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10956 {
10957 char *fmt;
10958
10959 ASSERT(state->dts_formats != NULL);
10960 ASSERT(format <= state->dts_nformats);
10961 ASSERT(state->dts_formats[format - 1] != NULL);
10962
10963 fmt = state->dts_formats[format - 1];
10964 kmem_free(fmt, strlen(fmt) + 1);
10965 state->dts_formats[format - 1] = NULL;
10966 }
10967
10968 static void
10969 dtrace_format_destroy(dtrace_state_t *state)
10970 {
10971 int i;
10972
10973 if (state->dts_nformats == 0) {
10974 ASSERT(state->dts_formats == NULL);
10975 return;
10976 }
10977
10978 ASSERT(state->dts_formats != NULL);
10979
10980 for (i = 0; i < state->dts_nformats; i++) {
10981 char *fmt = state->dts_formats[i];
10982
10983 if (fmt == NULL)
10984 continue;
10985
10986 kmem_free(fmt, strlen(fmt) + 1);
10987 }
10988
10989 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10990 state->dts_nformats = 0;
10991 state->dts_formats = NULL;
10992 }
10993
10994 /*
10995 * DTrace Predicate Functions
10996 */
10997 static dtrace_predicate_t *
10998 dtrace_predicate_create(dtrace_difo_t *dp)
10999 {
11000 dtrace_predicate_t *pred;
11001
11002 ASSERT(MUTEX_HELD(&dtrace_lock));
11003 ASSERT(dp->dtdo_refcnt != 0);
11004
11005 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
11006 pred->dtp_difo = dp;
11007 pred->dtp_refcnt = 1;
11008
11009 if (!dtrace_difo_cacheable(dp))
11010 return (pred);
11011
11012 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
11013 /*
11014 * This is only theoretically possible -- we have had 2^32
11015 * cacheable predicates on this machine. We cannot allow any
11016 * more predicates to become cacheable: as unlikely as it is,
11017 * there may be a thread caching a (now stale) predicate cache
11018 * ID. (N.B.: the temptation is being successfully resisted to
11019 * have this cmn_err() "Holy shit -- we executed this code!")
11020 */
11021 return (pred);
11022 }
11023
11024 pred->dtp_cacheid = dtrace_predcache_id++;
11025
11026 return (pred);
11027 }
11028
11029 static void
11030 dtrace_predicate_hold(dtrace_predicate_t *pred)
11031 {
11032 ASSERT(MUTEX_HELD(&dtrace_lock));
11033 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
11034 ASSERT(pred->dtp_refcnt > 0);
11035
11036 pred->dtp_refcnt++;
11037 }
11038
11039 static void
11040 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
11041 {
11042 dtrace_difo_t *dp = pred->dtp_difo;
11043
11044 ASSERT(MUTEX_HELD(&dtrace_lock));
11045 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
11046 ASSERT(pred->dtp_refcnt > 0);
11047
11048 if (--pred->dtp_refcnt == 0) {
11049 dtrace_difo_release(pred->dtp_difo, vstate);
11050 kmem_free(pred, sizeof (dtrace_predicate_t));
11051 }
11052 }
11053
11054 /*
11055 * DTrace Action Description Functions
11056 */
11057 static dtrace_actdesc_t *
11058 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
11059 uint64_t uarg, uint64_t arg)
11060 {
11061 dtrace_actdesc_t *act;
11062
11063 #ifdef illumos
11064 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
11065 arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
11066 #endif
11067
11068 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
11069 act->dtad_kind = kind;
11070 act->dtad_ntuple = ntuple;
11071 act->dtad_uarg = uarg;
11072 act->dtad_arg = arg;
11073 act->dtad_refcnt = 1;
11074
11075 return (act);
11076 }
11077
11078 static void
11079 dtrace_actdesc_hold(dtrace_actdesc_t *act)
11080 {
11081 ASSERT(act->dtad_refcnt >= 1);
11082 act->dtad_refcnt++;
11083 }
11084
11085 static void
11086 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
11087 {
11088 dtrace_actkind_t kind = act->dtad_kind;
11089 dtrace_difo_t *dp;
11090
11091 ASSERT(act->dtad_refcnt >= 1);
11092
11093 if (--act->dtad_refcnt != 0)
11094 return;
11095
11096 if ((dp = act->dtad_difo) != NULL)
11097 dtrace_difo_release(dp, vstate);
11098
11099 if (DTRACEACT_ISPRINTFLIKE(kind)) {
11100 char *str = (char *)(uintptr_t)act->dtad_arg;
11101
11102 #ifdef illumos
11103 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
11104 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
11105 #endif
11106
11107 if (str != NULL)
11108 kmem_free(str, strlen(str) + 1);
11109 }
11110
11111 kmem_free(act, sizeof (dtrace_actdesc_t));
11112 }
11113
11114 /*
11115 * DTrace ECB Functions
11116 */
11117 static dtrace_ecb_t *
11118 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
11119 {
11120 dtrace_ecb_t *ecb;
11121 dtrace_epid_t epid;
11122
11123 ASSERT(MUTEX_HELD(&dtrace_lock));
11124
11125 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
11126 ecb->dte_predicate = NULL;
11127 ecb->dte_probe = probe;
11128
11129 /*
11130 * The default size is the size of the default action: recording
11131 * the header.
11132 */
11133 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
11134 ecb->dte_alignment = sizeof (dtrace_epid_t);
11135
11136 epid = state->dts_epid++;
11137
11138 if (epid - 1 >= state->dts_necbs) {
11139 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
11140 int necbs = state->dts_necbs << 1;
11141
11142 ASSERT(epid == state->dts_necbs + 1);
11143
11144 if (necbs == 0) {
11145 ASSERT(oecbs == NULL);
11146 necbs = 1;
11147 }
11148
11149 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
11150
11151 if (oecbs != NULL)
11152 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
11153
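/*
 * Publish the new array only after its contents are initialized; the
 * producer barrier orders these stores for any CPU concurrently
 * walking dts_ecbs.
 */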
11154 dtrace_membar_producer();
11155 state->dts_ecbs = ecbs;
11156
11157 if (oecbs != NULL) {
11158 /*
11159 * If this state is active, we must dtrace_sync()
11160 * before we can free the old dts_ecbs array: we're
11161 * coming in hot, and there may be active ring
11162 * buffer processing (which indexes into the dts_ecbs
11163 * array) on another CPU.
11164 */
11165 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
11166 dtrace_sync();
11167
11168 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
11169 }
11170
11171 dtrace_membar_producer();
11172 state->dts_necbs = necbs;
11173 }
11174
11175 ecb->dte_state = state;
11176
11177 ASSERT(state->dts_ecbs[epid - 1] == NULL);
11178 dtrace_membar_producer();
11179 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
11180
11181 return (ecb);
11182 }
11183
11184 static void
11185 dtrace_ecb_enable(dtrace_ecb_t *ecb)
11186 {
11187 dtrace_probe_t *probe = ecb->dte_probe;
11188
11189 ASSERT(MUTEX_HELD(&cpu_lock));
11190 ASSERT(MUTEX_HELD(&dtrace_lock));
11191 ASSERT(ecb->dte_next == NULL);
11192
11193 if (probe == NULL) {
11194 /*
11195 * This is the NULL probe -- there's nothing to do.
11196 */
11197 return;
11198 }
11199
11200 if (probe->dtpr_ecb == NULL) {
11201 dtrace_provider_t *prov = probe->dtpr_provider;
11202
11203 /*
11204 * We're the first ECB on this probe.
11205 */
11206 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11207
11208 if (ecb->dte_predicate != NULL)
11209 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11210
11211 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11212 probe->dtpr_id, probe->dtpr_arg);
11213 } else {
11214 /*
11215 * This probe is already active. Swing the last pointer to
11216 * point to the new ECB, and issue a dtrace_sync() to assure
11217 * that all CPUs have seen the change.
11218 */
11219 ASSERT(probe->dtpr_ecb_last != NULL);
11220 probe->dtpr_ecb_last->dte_next = ecb;
11221 probe->dtpr_ecb_last = ecb;
11222 probe->dtpr_predcache = 0;
11223
11224 dtrace_sync();
11225 }
11226 }
11227
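/*
 * Recompute the ECB's record offsets, alignment and buffer space
 * requirements from its action list. Returns EINVAL if a record size
 * computation overflows.
 */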
11228 static int
11229 dtrace_ecb_resize(dtrace_ecb_t *ecb)
11230 {
11231 dtrace_action_t *act;
11232 uint32_t curneeded = UINT32_MAX;
11233 uint32_t aggbase = UINT32_MAX;
11234
11235 /*
11236 * If we record anything, we always record the dtrace_rechdr_t. (And
11237 * we always record it first.)
11238 */
11239 ecb->dte_size = sizeof (dtrace_rechdr_t);
11240 ecb->dte_alignment = sizeof (dtrace_epid_t);
11241
11242 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11243 dtrace_recdesc_t *rec = &act->dta_rec;
11244 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
11245
11246 ecb->dte_alignment = MAX(ecb->dte_alignment,
11247 rec->dtrd_alignment);
11248
11249 if (DTRACEACT_ISAGG(act->dta_kind)) {
11250 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11251
11252 ASSERT(rec->dtrd_size != 0);
11253 ASSERT(agg->dtag_first != NULL);
11254 ASSERT(act->dta_prev->dta_intuple);
11255 ASSERT(aggbase != UINT32_MAX);
11256 ASSERT(curneeded != UINT32_MAX);
11257
11258 agg->dtag_base = aggbase;
11259
11260 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11261 rec->dtrd_offset = curneeded;
11262 if (curneeded + rec->dtrd_size < curneeded)
11263 return (EINVAL);
11264 curneeded += rec->dtrd_size;
11265 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11266
11267 aggbase = UINT32_MAX;
11268 curneeded = UINT32_MAX;
11269 } else if (act->dta_intuple) {
11270 if (curneeded == UINT32_MAX) {
11271 /*
11272 * This is the first record in a tuple. Align
11273 * curneeded to be at offset 4 in an 8-byte
11274 * aligned block.
11275 */
11276 ASSERT(act->dta_prev == NULL ||
11277 !act->dta_prev->dta_intuple);
11278 ASSERT3U(aggbase, ==, UINT32_MAX);
11279 curneeded = P2PHASEUP(ecb->dte_size,
11280 sizeof (uint64_t), sizeof (dtrace_aggid_t));
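/*
 * P2PHASEUP(sz, 8, 4) is the smallest value >= sz that is
 * congruent to 4 mod 8 (e.g., 12 for sz in (4, 12]), so the
 * 4-byte aggregation ID lands immediately before an
 * 8-byte-aligned data block.
 */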
11281
11282 aggbase = curneeded - sizeof (dtrace_aggid_t);
11283 ASSERT(IS_P2ALIGNED(aggbase,
11284 sizeof (uint64_t)));
11285 }
11286 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11287 rec->dtrd_offset = curneeded;
11288 if (curneeded + rec->dtrd_size < curneeded)
11289 return (EINVAL);
11290 curneeded += rec->dtrd_size;
11291 } else {
11292 /* tuples must be followed by an aggregation */
11293 ASSERT(act->dta_prev == NULL ||
11294 !act->dta_prev->dta_intuple);
11295
11296 ecb->dte_size = P2ROUNDUP(ecb->dte_size,
11297 rec->dtrd_alignment);
11298 rec->dtrd_offset = ecb->dte_size;
11299 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11300 return (EINVAL);
11301 ecb->dte_size += rec->dtrd_size;
11302 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11303 }
11304 }
11305
11306 if ((act = ecb->dte_action) != NULL &&
11307 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11308 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11309 /*
11310 * If the size is still sizeof (dtrace_rechdr_t), then all
11311 * actions store no data; set the size to 0.
11312 */
11313 ecb->dte_size = 0;
11314 }
11315
11316 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11317 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11318 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
11319 ecb->dte_needed);
11320 return (0);
11321 }
11322
11323 static dtrace_action_t *
11324 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11325 {
11326 dtrace_aggregation_t *agg;
11327 size_t size = sizeof (uint64_t);
11328 int ntuple = desc->dtad_ntuple;
11329 dtrace_action_t *act;
11330 dtrace_recdesc_t *frec;
11331 dtrace_aggid_t aggid;
11332 dtrace_state_t *state = ecb->dte_state;
11333
11334 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11335 agg->dtag_ecb = ecb;
11336
11337 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11338
11339 switch (desc->dtad_kind) {
11340 case DTRACEAGG_MIN:
11341 agg->dtag_initial = INT64_MAX;
11342 agg->dtag_aggregate = dtrace_aggregate_min;
11343 break;
11344
11345 case DTRACEAGG_MAX:
11346 agg->dtag_initial = INT64_MIN;
11347 agg->dtag_aggregate = dtrace_aggregate_max;
11348 break;
11349
11350 case DTRACEAGG_COUNT:
11351 agg->dtag_aggregate = dtrace_aggregate_count;
11352 break;
11353
11354 case DTRACEAGG_QUANTIZE:
11355 agg->dtag_aggregate = dtrace_aggregate_quantize;
11356 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11357 sizeof (uint64_t);
11358 break;
11359
11360 case DTRACEAGG_LQUANTIZE: {
11361 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11362 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11363
11364 agg->dtag_initial = desc->dtad_arg;
11365 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11366
11367 if (step == 0 || levels == 0)
11368 goto err;
11369
11370 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11371 break;
11372 }
11373
11374 case DTRACEAGG_LLQUANTIZE: {
11375 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11376 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11377 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11378 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11379 int64_t v;
11380
11381 agg->dtag_initial = desc->dtad_arg;
11382 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11383
11384 if (factor < 2 || low >= high || nsteps < factor)
11385 goto err;
11386
11387 /*
11388 * Now check that the number of steps evenly divides a power
11389 * of the factor. (This assures both integer bucket size and
11390 * linearity within each magnitude.)
11391 */
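/*
 * E.g., a factor of 10 with nsteps 20 passes (v reaches 100;
 * 100 % 20 == 0 and 20 % 10 == 0), whereas nsteps 30 fails
 * (100 % 30 != 0).
 */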
11392 for (v = factor; v < nsteps; v *= factor)
11393 continue;
11394
11395 if ((v % nsteps) || (nsteps % factor))
11396 goto err;
11397
11398 size = (dtrace_aggregate_llquantize_bucket(factor,
11399 low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11400 break;
11401 }
11402
11403 case DTRACEAGG_AVG:
11404 agg->dtag_aggregate = dtrace_aggregate_avg;
11405 size = sizeof (uint64_t) * 2;
11406 break;
11407
11408 case DTRACEAGG_STDDEV:
11409 agg->dtag_aggregate = dtrace_aggregate_stddev;
11410 size = sizeof (uint64_t) * 4;
11411 break;
11412
11413 case DTRACEAGG_SUM:
11414 agg->dtag_aggregate = dtrace_aggregate_sum;
11415 break;
11416
11417 default:
11418 goto err;
11419 }
11420
11421 agg->dtag_action.dta_rec.dtrd_size = size;
11422
11423 if (ntuple == 0)
11424 goto err;
11425
11426 /*
11427 * We must make sure that we have enough actions for the n-tuple.
11428 */
11429 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11430 if (DTRACEACT_ISAGG(act->dta_kind))
11431 break;
11432
11433 if (--ntuple == 0) {
11434 /*
11435 * This is the action with which our n-tuple begins.
11436 */
11437 agg->dtag_first = act;
11438 goto success;
11439 }
11440 }
11441
11442 /*
11443 * This n-tuple is short by ntuple elements. Return failure.
11444 */
11445 ASSERT(ntuple != 0);
11446 err:
11447 kmem_free(agg, sizeof (dtrace_aggregation_t));
11448 return (NULL);
11449
11450 success:
11451 /*
11452 * If the last action in the tuple has a size of zero, it's actually
11453 * an expression argument for the aggregating action.
11454 */
11455 ASSERT(ecb->dte_action_last != NULL);
11456 act = ecb->dte_action_last;
11457
11458 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11459 ASSERT(act->dta_difo != NULL);
11460
11461 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11462 agg->dtag_hasarg = 1;
11463 }
11464
11465 /*
11466 * We need to allocate an id for this aggregation.
11467 */
11468 #ifdef illumos
11469 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11470 VM_BESTFIT | VM_SLEEP);
11471 #else
11472 aggid = alloc_unr(state->dts_aggid_arena);
11473 #endif
11474
11475 if (aggid - 1 >= state->dts_naggregations) {
11476 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11477 dtrace_aggregation_t **aggs;
11478 int naggs = state->dts_naggregations << 1;
11479 int onaggs = state->dts_naggregations;
11480
11481 ASSERT(aggid == state->dts_naggregations + 1);
11482
11483 if (naggs == 0) {
11484 ASSERT(oaggs == NULL);
11485 naggs = 1;
11486 }
11487
11488 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11489
11490 if (oaggs != NULL) {
11491 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11492 kmem_free(oaggs, onaggs * sizeof (*aggs));
11493 }
11494
11495 state->dts_aggregations = aggs;
11496 state->dts_naggregations = naggs;
11497 }
11498
11499 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11500 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11501
11502 frec = &agg->dtag_first->dta_rec;
11503 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11504 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11505
11506 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11507 ASSERT(!act->dta_intuple);
11508 act->dta_intuple = 1;
11509 }
11510
11511 return (&agg->dtag_action);
11512 }
11513
11514 static void
11515 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11516 {
11517 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11518 dtrace_state_t *state = ecb->dte_state;
11519 dtrace_aggid_t aggid = agg->dtag_id;
11520
11521 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11522 #ifdef illumos
11523 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11524 #else
11525 free_unr(state->dts_aggid_arena, aggid);
11526 #endif
11527
11528 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11529 state->dts_aggregations[aggid - 1] = NULL;
11530
11531 kmem_free(agg, sizeof (dtrace_aggregation_t));
11532 }
11533
11534 static int
11535 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11536 {
11537 dtrace_action_t *action, *last;
11538 dtrace_difo_t *dp = desc->dtad_difo;
11539 uint32_t size = 0, align = sizeof (uint8_t), mask;
11540 uint16_t format = 0;
11541 dtrace_recdesc_t *rec;
11542 dtrace_state_t *state = ecb->dte_state;
11543 dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11544 uint64_t arg = desc->dtad_arg;
11545
11546 ASSERT(MUTEX_HELD(&dtrace_lock));
11547 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11548
11549 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11550 /*
11551 * If this is an aggregating action, there must be neither
11552 * a speculate nor a commit on the action chain.
11553 */
11554 dtrace_action_t *act;
11555
11556 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11557 if (act->dta_kind == DTRACEACT_COMMIT)
11558 return (EINVAL);
11559
11560 if (act->dta_kind == DTRACEACT_SPECULATE)
11561 return (EINVAL);
11562 }
11563
11564 action = dtrace_ecb_aggregation_create(ecb, desc);
11565
11566 if (action == NULL)
11567 return (EINVAL);
11568 } else {
11569 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11570 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11571 dp != NULL && dp->dtdo_destructive)) {
11572 state->dts_destructive = 1;
11573 }
11574
11575 switch (desc->dtad_kind) {
11576 case DTRACEACT_PRINTF:
11577 case DTRACEACT_PRINTA:
11578 case DTRACEACT_SYSTEM:
11579 case DTRACEACT_FREOPEN:
11580 case DTRACEACT_DIFEXPR:
11581 /*
11582 * We know that our arg is a string -- turn it into a
11583 * format.
11584 */
11585 if (arg == 0) {
11586 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11587 desc->dtad_kind == DTRACEACT_DIFEXPR);
11588 format = 0;
11589 } else {
11590 ASSERT(arg != 0);
11591 #ifdef illumos
11592 ASSERT(arg > KERNELBASE);
11593 #endif
11594 format = dtrace_format_add(state,
11595 (char *)(uintptr_t)arg);
11596 }
11597
11598 /*FALLTHROUGH*/
11599 case DTRACEACT_LIBACT:
11600 case DTRACEACT_TRACEMEM:
11601 case DTRACEACT_TRACEMEM_DYNSIZE:
11602 if (dp == NULL)
11603 return (EINVAL);
11604
11605 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11606 break;
11607
11608 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11609 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11610 return (EINVAL);
11611
11612 size = opt[DTRACEOPT_STRSIZE];
11613 }
11614
11615 break;
11616
11617 case DTRACEACT_STACK:
11618 if ((nframes = arg) == 0) {
11619 nframes = opt[DTRACEOPT_STACKFRAMES];
11620 ASSERT(nframes > 0);
11621 arg = nframes;
11622 }
11623
11624 size = nframes * sizeof (pc_t);
11625 break;
11626
11627 case DTRACEACT_JSTACK:
11628 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11629 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11630
11631 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11632 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11633
11634 arg = DTRACE_USTACK_ARG(nframes, strsize);
11635
11636 /*FALLTHROUGH*/
11637 case DTRACEACT_USTACK:
11638 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11639 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11640 strsize = DTRACE_USTACK_STRSIZE(arg);
11641 nframes = opt[DTRACEOPT_USTACKFRAMES];
11642 ASSERT(nframes > 0);
11643 arg = DTRACE_USTACK_ARG(nframes, strsize);
11644 }
11645
11646 /*
11647 * Save a slot for the pid.
11648 */
11649 size = (nframes + 1) * sizeof (uint64_t);
11650 size += DTRACE_USTACK_STRSIZE(arg);
11651 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11652
11653 break;
11654
11655 case DTRACEACT_SYM:
11656 case DTRACEACT_MOD:
11657 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11658 sizeof (uint64_t)) ||
11659 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11660 return (EINVAL);
11661 break;
11662
11663 case DTRACEACT_USYM:
11664 case DTRACEACT_UMOD:
11665 case DTRACEACT_UADDR:
11666 if (dp == NULL ||
11667 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11668 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11669 return (EINVAL);
11670
11671 /*
11672 * We have a slot for the pid, plus a slot for the
11673 * argument. To keep things simple (aligned with
11674 * bitness-neutral sizing), we store each as a 64-bit
11675 * quantity.
11676 */
11677 size = 2 * sizeof (uint64_t);
11678 break;
11679
11680 case DTRACEACT_STOP:
11681 case DTRACEACT_BREAKPOINT:
11682 case DTRACEACT_PANIC:
11683 break;
11684
11685 case DTRACEACT_CHILL:
11686 case DTRACEACT_DISCARD:
11687 case DTRACEACT_RAISE:
11688 if (dp == NULL)
11689 return (EINVAL);
11690 break;
11691
11692 case DTRACEACT_EXIT:
11693 if (dp == NULL ||
11694 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11695 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11696 return (EINVAL);
11697 break;
11698
11699 case DTRACEACT_SPECULATE:
11700 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11701 return (EINVAL);
11702
11703 if (dp == NULL)
11704 return (EINVAL);
11705
11706 state->dts_speculates = 1;
11707 break;
11708
case DTRACEACT_PRINTM:
	if (dp == NULL)
		return (EINVAL);

	size = dp->dtdo_rtype.dtdt_size;
	break;
11712
11713 case DTRACEACT_COMMIT: {
11714 dtrace_action_t *act = ecb->dte_action;
11715
11716 for (; act != NULL; act = act->dta_next) {
11717 if (act->dta_kind == DTRACEACT_COMMIT)
11718 return (EINVAL);
11719 }
11720
11721 if (dp == NULL)
11722 return (EINVAL);
11723 break;
11724 }
11725
11726 default:
11727 return (EINVAL);
11728 }
11729
11730 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11731 /*
11732 * If this is a data-storing action or a speculate,
11733 * we must be sure that there isn't a commit on the
11734 * action chain.
11735 */
11736 dtrace_action_t *act = ecb->dte_action;
11737
11738 for (; act != NULL; act = act->dta_next) {
11739 if (act->dta_kind == DTRACEACT_COMMIT)
11740 return (EINVAL);
11741 }
11742 }
11743
11744 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11745 action->dta_rec.dtrd_size = size;
11746 }
11747
11748 action->dta_refcnt = 1;
11749 rec = &action->dta_rec;
11750 size = rec->dtrd_size;
11751
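/*
 * Derive the record alignment from its size: the largest power of
 * two (up to 8) that evenly divides the size.
 */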
11752 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11753 if (!(size & mask)) {
11754 align = mask + 1;
11755 break;
11756 }
11757 }
11758
11759 action->dta_kind = desc->dtad_kind;
11760
11761 if ((action->dta_difo = dp) != NULL)
11762 dtrace_difo_hold(dp);
11763
11764 rec->dtrd_action = action->dta_kind;
11765 rec->dtrd_arg = arg;
11766 rec->dtrd_uarg = desc->dtad_uarg;
11767 rec->dtrd_alignment = (uint16_t)align;
11768 rec->dtrd_format = format;
11769
11770 if ((last = ecb->dte_action_last) != NULL) {
11771 ASSERT(ecb->dte_action != NULL);
11772 action->dta_prev = last;
11773 last->dta_next = action;
11774 } else {
11775 ASSERT(ecb->dte_action == NULL);
11776 ecb->dte_action = action;
11777 }
11778
11779 ecb->dte_action_last = action;
11780
11781 return (0);
11782 }
11783
11784 static void
11785 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11786 {
11787 dtrace_action_t *act = ecb->dte_action, *next;
11788 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11789 dtrace_difo_t *dp;
11790 uint16_t format;
11791
11792 if (act != NULL && act->dta_refcnt > 1) {
11793 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11794 act->dta_refcnt--;
11795 } else {
11796 for (; act != NULL; act = next) {
11797 next = act->dta_next;
11798 ASSERT(next != NULL || act == ecb->dte_action_last);
11799 ASSERT(act->dta_refcnt == 1);
11800
11801 if ((format = act->dta_rec.dtrd_format) != 0)
11802 dtrace_format_remove(ecb->dte_state, format);
11803
11804 if ((dp = act->dta_difo) != NULL)
11805 dtrace_difo_release(dp, vstate);
11806
11807 if (DTRACEACT_ISAGG(act->dta_kind)) {
11808 dtrace_ecb_aggregation_destroy(ecb, act);
11809 } else {
11810 kmem_free(act, sizeof (dtrace_action_t));
11811 }
11812 }
11813 }
11814
11815 ecb->dte_action = NULL;
11816 ecb->dte_action_last = NULL;
11817 ecb->dte_size = 0;
11818 }
11819
11820 static void
11821 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11822 {
11823 /*
11824 * We disable the ECB by removing it from its probe.
11825 */
11826 dtrace_ecb_t *pecb, *prev = NULL;
11827 dtrace_probe_t *probe = ecb->dte_probe;
11828
11829 ASSERT(MUTEX_HELD(&dtrace_lock));
11830
11831 if (probe == NULL) {
11832 /*
11833 * This is the NULL probe; there is nothing to disable.
11834 */
11835 return;
11836 }
11837
11838 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11839 if (pecb == ecb)
11840 break;
11841 prev = pecb;
11842 }
11843
11844 ASSERT(pecb != NULL);
11845
11846 if (prev == NULL) {
11847 probe->dtpr_ecb = ecb->dte_next;
11848 } else {
11849 prev->dte_next = ecb->dte_next;
11850 }
11851
11852 if (ecb == probe->dtpr_ecb_last) {
11853 ASSERT(ecb->dte_next == NULL);
11854 probe->dtpr_ecb_last = prev;
11855 }
11856
11857 /*
11858 * The ECB has been disconnected from the probe; now sync to assure
11859 * that all CPUs have seen the change before returning.
11860 */
11861 dtrace_sync();
11862
11863 if (probe->dtpr_ecb == NULL) {
11864 /*
11865 * That was the last ECB on the probe; clear the predicate
11866 * cache ID for the probe, disable it and sync one more time
11867 * to assure that we'll never hit it again.
11868 */
11869 dtrace_provider_t *prov = probe->dtpr_provider;
11870
11871 ASSERT(ecb->dte_next == NULL);
11872 ASSERT(probe->dtpr_ecb_last == NULL);
11873 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11874 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11875 probe->dtpr_id, probe->dtpr_arg);
11876 dtrace_sync();
11877 } else {
11878 /*
11879 * There is at least one ECB remaining on the probe. If there
11880 * is _exactly_ one, set the probe's predicate cache ID to be
11881 * the predicate cache ID of the remaining ECB.
11882 */
11883 ASSERT(probe->dtpr_ecb_last != NULL);
11884 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11885
11886 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11887 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11888
11889 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11890
11891 if (p != NULL)
11892 probe->dtpr_predcache = p->dtp_cacheid;
11893 }
11894
11895 ecb->dte_next = NULL;
11896 }
11897 }
11898
11899 static void
11900 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11901 {
11902 dtrace_state_t *state = ecb->dte_state;
11903 dtrace_vstate_t *vstate = &state->dts_vstate;
11904 dtrace_predicate_t *pred;
11905 dtrace_epid_t epid = ecb->dte_epid;
11906
11907 ASSERT(MUTEX_HELD(&dtrace_lock));
11908 ASSERT(ecb->dte_next == NULL);
11909 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11910
11911 if ((pred = ecb->dte_predicate) != NULL)
11912 dtrace_predicate_release(pred, vstate);
11913
11914 dtrace_ecb_action_remove(ecb);
11915
11916 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11917 state->dts_ecbs[epid - 1] = NULL;
11918
11919 kmem_free(ecb, sizeof (dtrace_ecb_t));
11920 }
11921
11922 static dtrace_ecb_t *
11923 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11924 dtrace_enabling_t *enab)
11925 {
11926 dtrace_ecb_t *ecb;
11927 dtrace_predicate_t *pred;
11928 dtrace_actdesc_t *act;
11929 dtrace_provider_t *prov;
11930 dtrace_ecbdesc_t *desc = enab->dten_current;
11931
11932 ASSERT(MUTEX_HELD(&dtrace_lock));
11933 ASSERT(state != NULL);
11934
11935 ecb = dtrace_ecb_add(state, probe);
11936 ecb->dte_uarg = desc->dted_uarg;
11937
11938 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11939 dtrace_predicate_hold(pred);
11940 ecb->dte_predicate = pred;
11941 }
11942
11943 if (probe != NULL) {
11944 /*
11945 * If the provider shows more leg than the consumer is old
11946 * enough to see, we need to enable the appropriate implicit
11947 * predicate bits to prevent the ecb from activating at
11948 * revealing times.
11949 *
11950 * Providers specifying DTRACE_PRIV_USER at register time
11951 * are stating that they need the /proc-style privilege
11952 * model to be enforced, and this is what DTRACE_COND_OWNER
11953 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11954 */
11955 prov = probe->dtpr_provider;
11956 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11957 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11958 ecb->dte_cond |= DTRACE_COND_OWNER;
11959
11960 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11961 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11962 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11963
11964 /*
11965 * If the provider shows us kernel innards and the user
11966 * is lacking sufficient privilege, enable the
11967 * DTRACE_COND_USERMODE implicit predicate.
11968 */
11969 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11970 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11971 ecb->dte_cond |= DTRACE_COND_USERMODE;
11972 }
11973
11974 if (dtrace_ecb_create_cache != NULL) {
11975 /*
11976 * If we have a cached ecb, we'll use its action list instead
11977 * of creating our own (saving both time and space).
11978 */
11979 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11980 dtrace_action_t *act = cached->dte_action;
11981
11982 if (act != NULL) {
11983 ASSERT(act->dta_refcnt > 0);
11984 act->dta_refcnt++;
11985 ecb->dte_action = act;
11986 ecb->dte_action_last = cached->dte_action_last;
11987 ecb->dte_needed = cached->dte_needed;
11988 ecb->dte_size = cached->dte_size;
11989 ecb->dte_alignment = cached->dte_alignment;
11990 }
11991
11992 return (ecb);
11993 }
11994
11995 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11996 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11997 dtrace_ecb_destroy(ecb);
11998 return (NULL);
11999 }
12000 }
12001
12002 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
12003 dtrace_ecb_destroy(ecb);
12004 return (NULL);
12005 }
12006
12007 return (dtrace_ecb_create_cache = ecb);
12008 }
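/*
 * A note on the cache above: when one enabling matches many probes, each
 * ECB after the first shares the first ECB's action list via dta_refcnt
 * rather than rebuilding it.  The cache is reset before each new match
 * pass (see dtrace_probe_enable()), so action lists are only shared within
 * a single enabling operation.
 */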
12009
12010 static int
12011 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
12012 {
12013 dtrace_ecb_t *ecb;
12014 dtrace_enabling_t *enab = arg;
12015 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
12016
12017 ASSERT(state != NULL);
12018
12019 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
12020 /*
12021 * This probe was created in a generation for which this
12022 * enabling has previously created ECBs; we don't want to
12023 * enable it again, so just kick out.
12024 */
12025 return (DTRACE_MATCH_NEXT);
12026 }
12027
12028 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
12029 return (DTRACE_MATCH_DONE);
12030
12031 dtrace_ecb_enable(ecb);
12032 return (DTRACE_MATCH_NEXT);
12033 }
12034
12035 static dtrace_ecb_t *
12036 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
12037 {
12038 dtrace_ecb_t *ecb;
12039
12040 ASSERT(MUTEX_HELD(&dtrace_lock));
12041
12042 if (id == 0 || id > state->dts_necbs)
12043 return (NULL);
12044
12045 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
12046 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
12047
12048 return (state->dts_ecbs[id - 1]);
12049 }
12050
12051 static dtrace_aggregation_t *
12052 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
12053 {
12054 dtrace_aggregation_t *agg;
12055
12056 ASSERT(MUTEX_HELD(&dtrace_lock));
12057
12058 if (id == 0 || id > state->dts_naggregations)
12059 return (NULL);
12060
12061 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
12062 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
12063 agg->dtag_id == id);
12064
12065 return (state->dts_aggregations[id - 1]);
12066 }
12067
12068 /*
12069 * DTrace Buffer Functions
12070 *
12071 * The following functions manipulate DTrace buffers. Most of these functions
12072 * are called in the context of establishing or processing consumer state;
12073 * exceptions are explicitly noted.
12074 */
12075
12076 /*
12077 * Note: called from cross call context. This function switches the two
12078 * buffers on a given CPU. The atomicity of this operation is assured by
12079 * disabling interrupts while the actual switch takes place; the disabling of
12080 * interrupts serializes the execution with any execution of dtrace_probe() on
12081 * the same CPU.
12082 */
12083 static void
12084 dtrace_buffer_switch(dtrace_buffer_t *buf)
12085 {
12086 caddr_t tomax = buf->dtb_tomax;
12087 caddr_t xamot = buf->dtb_xamot;
12088 dtrace_icookie_t cookie;
12089 hrtime_t now;
12090
12091 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12092 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
12093
12094 cookie = dtrace_interrupt_disable();
12095 now = dtrace_gethrtime();
12096 buf->dtb_tomax = xamot;
12097 buf->dtb_xamot = tomax;
12098 buf->dtb_xamot_drops = buf->dtb_drops;
12099 buf->dtb_xamot_offset = buf->dtb_offset;
12100 buf->dtb_xamot_errors = buf->dtb_errors;
12101 buf->dtb_xamot_flags = buf->dtb_flags;
12102 buf->dtb_offset = 0;
12103 buf->dtb_drops = 0;
12104 buf->dtb_errors = 0;
12105 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
12106 buf->dtb_interval = now - buf->dtb_switched;
12107 buf->dtb_switched = now;
12108 dtrace_interrupt_enable(cookie);
12109 }
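/*
 * As an illustrative sketch (the authoritative callers live in the
 * consumer-state and ioctl code), a buffer snapshot typically drives the
 * switch by cross-calling this function on the CPU that owns the buffer:
 *
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * where "cpu" names the buffer's CPU.  Afterward, the now-quiescent
 * dtb_xamot half can be copied out while new records continue to flow
 * into dtb_tomax.
 */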
12110
12111 /*
12112 * Note: called from cross call context. This function activates a buffer
12113 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
12114 * is guaranteed by the disabling of interrupts.
12115 */
12116 static void
12117 dtrace_buffer_activate(dtrace_state_t *state)
12118 {
12119 dtrace_buffer_t *buf;
12120 dtrace_icookie_t cookie = dtrace_interrupt_disable();
12121
12122 buf = &state->dts_buffer[curcpu];
12123
12124 if (buf->dtb_tomax != NULL) {
12125 /*
12126 * We might like to assert that the buffer is marked inactive,
12127 * but this isn't necessarily true: the buffer for the CPU
		 * that processes the BEGIN probe is activated manually.
		 * In this case, we take the (harmless) action of
		 * re-clearing the INACTIVE bit.
12131 */
12132 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
12133 }
12134
12135 dtrace_interrupt_enable(cookie);
12136 }
12137
12138 #ifdef __FreeBSD__
12139 /*
12140 * Activate the specified per-CPU buffer. This is used instead of
12141 * dtrace_buffer_activate() when APs have not yet started, i.e. when
12142 * activating anonymous state.
12143 */
12144 static void
12145 dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
12146 {
12147
12148 if (state->dts_buffer[cpu].dtb_tomax != NULL)
12149 state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
12150 }
12151 #endif
12152
12153 static int
12154 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
12155 processorid_t cpu, int *factor)
12156 {
12157 #ifdef illumos
12158 cpu_t *cp;
12159 #endif
12160 dtrace_buffer_t *buf;
12161 int allocated = 0, desired = 0;
12162
12163 #ifdef illumos
12164 ASSERT(MUTEX_HELD(&cpu_lock));
12165 ASSERT(MUTEX_HELD(&dtrace_lock));
12166
12167 *factor = 1;
12168
12169 if (size > dtrace_nonroot_maxsize &&
12170 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12171 return (EFBIG);
12172
12173 cp = cpu_list;
12174
12175 do {
12176 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12177 continue;
12178
12179 buf = &bufs[cp->cpu_id];
12180
12181 /*
12182 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
12185 if (buf->dtb_tomax != NULL) {
12186 ASSERT(buf->dtb_size == size);
12187 continue;
12188 }
12189
12190 ASSERT(buf->dtb_xamot == NULL);
12191
12192 if ((buf->dtb_tomax = kmem_zalloc(size,
12193 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12194 goto err;
12195
12196 buf->dtb_size = size;
12197 buf->dtb_flags = flags;
12198 buf->dtb_offset = 0;
12199 buf->dtb_drops = 0;
12200
12201 if (flags & DTRACEBUF_NOSWITCH)
12202 continue;
12203
12204 if ((buf->dtb_xamot = kmem_zalloc(size,
12205 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12206 goto err;
12207 } while ((cp = cp->cpu_next) != cpu_list);
12208
12209 return (0);
12210
12211 err:
12212 cp = cpu_list;
12213
12214 do {
12215 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12216 continue;
12217
12218 buf = &bufs[cp->cpu_id];
12219 desired += 2;
12220
12221 if (buf->dtb_xamot != NULL) {
12222 ASSERT(buf->dtb_tomax != NULL);
12223 ASSERT(buf->dtb_size == size);
12224 kmem_free(buf->dtb_xamot, size);
12225 allocated++;
12226 }
12227
12228 if (buf->dtb_tomax != NULL) {
12229 ASSERT(buf->dtb_size == size);
12230 kmem_free(buf->dtb_tomax, size);
12231 allocated++;
12232 }
12233
12234 buf->dtb_tomax = NULL;
12235 buf->dtb_xamot = NULL;
12236 buf->dtb_size = 0;
12237 } while ((cp = cp->cpu_next) != cpu_list);
12238 #else
12239 int i;
12240
12241 *factor = 1;
12242 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
12243 defined(__mips__) || defined(__powerpc__) || defined(__riscv)
12244 /*
12245 * FreeBSD isn't good at limiting the amount of memory we
12246 * ask to malloc, so let's place a limit here before trying
12247 * to do something that might well end in tears at bedtime.
12248 */
12249 int bufsize_percpu_frac = dtrace_bufsize_max_frac * mp_ncpus;
12250 if (size > physmem * PAGE_SIZE / bufsize_percpu_frac)
12251 return (ENOMEM);
12252 #endif
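	/*
	 * To make the limit concrete (purely hypothetical numbers): with
	 * 4 GiB of physical memory, 4 CPUs and a dtrace_bufsize_max_frac
	 * of 128, bufsize_percpu_frac is 512 and any per-CPU request
	 * larger than 8 MiB fails with ENOMEM.
	 */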
12253
12254 ASSERT(MUTEX_HELD(&dtrace_lock));
12255 CPU_FOREACH(i) {
12256 if (cpu != DTRACE_CPUALL && cpu != i)
12257 continue;
12258
12259 buf = &bufs[i];
12260
12261 /*
12262 * If there is already a buffer allocated for this CPU, it
12263 * is only possible that this is a DR event. In this case,
12264 * the buffer size must match our specified size.
12265 */
12266 if (buf->dtb_tomax != NULL) {
12267 ASSERT(buf->dtb_size == size);
12268 continue;
12269 }
12270
12271 ASSERT(buf->dtb_xamot == NULL);
12272
12273 if ((buf->dtb_tomax = kmem_zalloc(size,
12274 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12275 goto err;
12276
12277 buf->dtb_size = size;
12278 buf->dtb_flags = flags;
12279 buf->dtb_offset = 0;
12280 buf->dtb_drops = 0;
12281
12282 if (flags & DTRACEBUF_NOSWITCH)
12283 continue;
12284
12285 if ((buf->dtb_xamot = kmem_zalloc(size,
12286 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12287 goto err;
12288 }
12289
12290 return (0);
12291
12292 err:
12293 /*
12294 * Error allocating memory, so free the buffers that were
12295 * allocated before the failed allocation.
12296 */
12297 CPU_FOREACH(i) {
12298 if (cpu != DTRACE_CPUALL && cpu != i)
12299 continue;
12300
12301 buf = &bufs[i];
12302 desired += 2;
12303
12304 if (buf->dtb_xamot != NULL) {
12305 ASSERT(buf->dtb_tomax != NULL);
12306 ASSERT(buf->dtb_size == size);
12307 kmem_free(buf->dtb_xamot, size);
12308 allocated++;
12309 }
12310
12311 if (buf->dtb_tomax != NULL) {
12312 ASSERT(buf->dtb_size == size);
12313 kmem_free(buf->dtb_tomax, size);
12314 allocated++;
12315 }
12316
12317 buf->dtb_tomax = NULL;
12318 buf->dtb_xamot = NULL;
12319 buf->dtb_size = 0;
12320
12321 }
12322 #endif
12323 *factor = desired / (allocated > 0 ? allocated : 1);
12324
12325 return (ENOMEM);
12326 }
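/*
 * A note on *factor for callers (a sketch of the intended use; the retry
 * policy itself lives with the caller): if eight buffer halves were
 * desired -- two per CPU on a four-CPU system -- but only two had been
 * allocated when memory ran out, *factor is four, and the caller can
 * divide its requested size by that factor before retrying.
 */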
12327
12328 /*
12329 * Note: called from probe context. This function just increments the drop
12330 * count on a buffer. It has been made a function to allow for the
12331 * possibility of understanding the source of mysterious drop counts. (A
12332 * problem for which one may be particularly disappointed that DTrace cannot
12333 * be used to understand DTrace.)
12334 */
12335 static void
12336 dtrace_buffer_drop(dtrace_buffer_t *buf)
12337 {
12338 buf->dtb_drops++;
12339 }
12340
12341 /*
12342 * Note: called from probe context. This function is called to reserve space
12343 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12344 * mstate. Returns the new offset in the buffer, or a negative value if an
12345 * error has occurred.
12346 */
12347 static ssize_t
12348 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12349 dtrace_state_t *state, dtrace_mstate_t *mstate)
12350 {
12351 ssize_t offs = buf->dtb_offset, soffs;
12352 intptr_t woffs;
12353 caddr_t tomax;
12354 size_t total;
12355
12356 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12357 return (-1);
12358
12359 if ((tomax = buf->dtb_tomax) == NULL) {
12360 dtrace_buffer_drop(buf);
12361 return (-1);
12362 }
12363
12364 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12365 while (offs & (align - 1)) {
12366 /*
12367 * Assert that our alignment is off by a number which
12368 * is itself sizeof (uint32_t) aligned.
12369 */
12370 ASSERT(!((align - (offs & (align - 1))) &
12371 (sizeof (uint32_t) - 1)));
12372 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12373 offs += sizeof (uint32_t);
12374 }
12375
12376 if ((soffs = offs + needed) > buf->dtb_size) {
12377 dtrace_buffer_drop(buf);
12378 return (-1);
12379 }
12380
12381 if (mstate == NULL)
12382 return (offs);
12383
12384 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12385 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12386 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12387
12388 return (offs);
12389 }
12390
12391 if (buf->dtb_flags & DTRACEBUF_FILL) {
12392 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12393 (buf->dtb_flags & DTRACEBUF_FULL))
12394 return (-1);
12395 goto out;
12396 }
12397
12398 total = needed + (offs & (align - 1));
12399
12400 /*
12401 * For a ring buffer, life is quite a bit more complicated. Before
12402 * we can store any padding, we need to adjust our wrapping offset.
12403 * (If we've never before wrapped or we're not about to, no adjustment
12404 * is required.)
12405 */
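	/*
	 * A worked example with hypothetical numbers: take a 64-byte
	 * wrapped buffer with offs == 40, woffs == 48, needed == 16 and
	 * align == 8.  Then total == 16 + (40 & 7) == 16, and offs +
	 * total == 56 still fits below dtb_size, so no new wrap occurs;
	 * the reclaim loop below advances woffs past the old record at
	 * offset 48 (assuming an 8-byte record there) so that the bytes
	 * at offsets 40..55 may be overwritten.
	 */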
12406 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12407 offs + total > buf->dtb_size) {
12408 woffs = buf->dtb_xamot_offset;
12409
12410 if (offs + total > buf->dtb_size) {
12411 /*
12412 * We can't fit in the end of the buffer. First, a
12413 * sanity check that we can fit in the buffer at all.
12414 */
12415 if (total > buf->dtb_size) {
12416 dtrace_buffer_drop(buf);
12417 return (-1);
12418 }
12419
12420 /*
12421 * We're going to be storing at the top of the buffer,
12422 * so now we need to deal with the wrapped offset. We
12423 * only reset our wrapped offset to 0 if it is
			 * currently at or beyond the current offset.  If it
12425 * is less than the current offset, it is because a
12426 * previous allocation induced a wrap -- but the
12427 * allocation didn't subsequently take the space due
12428 * to an error or false predicate evaluation. In this
12429 * case, we'll just leave the wrapped offset alone: if
12430 * the wrapped offset hasn't been advanced far enough
12431 * for this allocation, it will be adjusted in the
12432 * lower loop.
12433 */
12434 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12435 if (woffs >= offs)
12436 woffs = 0;
12437 } else {
12438 woffs = 0;
12439 }
12440
12441 /*
12442 * Now we know that we're going to be storing to the
12443 * top of the buffer and that there is room for us
12444 * there. We need to clear the buffer from the current
12445 * offset to the end (there may be old gunk there).
12446 */
12447 while (offs < buf->dtb_size)
12448 tomax[offs++] = 0;
12449
12450 /*
12451 * We need to set our offset to zero. And because we
12452 * are wrapping, we need to set the bit indicating as
12453 * much. We can also adjust our needed space back
12454 * down to the space required by the ECB -- we know
12455 * that the top of the buffer is aligned.
12456 */
12457 offs = 0;
12458 total = needed;
12459 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12460 } else {
12461 /*
12462 * There is room for us in the buffer, so we simply
12463 * need to check the wrapped offset.
12464 */
12465 if (woffs < offs) {
12466 /*
12467 * The wrapped offset is less than the offset.
12468 * This can happen if we allocated buffer space
12469 * that induced a wrap, but then we didn't
12470 * subsequently take the space due to an error
12471 * or false predicate evaluation. This is
12472 * okay; we know that _this_ allocation isn't
12473 * going to induce a wrap. We still can't
12474 * reset the wrapped offset to be zero,
12475 * however: the space may have been trashed in
12476 * the previous failed probe attempt. But at
12477 * least the wrapped offset doesn't need to
12478 * be adjusted at all...
12479 */
12480 goto out;
12481 }
12482 }
12483
12484 while (offs + total > woffs) {
12485 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12486 size_t size;
12487
12488 if (epid == DTRACE_EPIDNONE) {
12489 size = sizeof (uint32_t);
12490 } else {
12491 ASSERT3U(epid, <=, state->dts_necbs);
12492 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12493
12494 size = state->dts_ecbs[epid - 1]->dte_size;
12495 }
12496
12497 ASSERT(woffs + size <= buf->dtb_size);
12498 ASSERT(size != 0);
12499
12500 if (woffs + size == buf->dtb_size) {
12501 /*
12502 * We've reached the end of the buffer; we want
12503 * to set the wrapped offset to 0 and break
12504 * out. However, if the offs is 0, then we're
12505 * in a strange edge-condition: the amount of
12506 * space that we want to reserve plus the size
12507 * of the record that we're overwriting is
12508 * greater than the size of the buffer. This
12509 * is problematic because if we reserve the
12510 * space but subsequently don't consume it (due
12511 * to a failed predicate or error) the wrapped
12512 * offset will be 0 -- yet the EPID at offset 0
12513 * will not be committed. This situation is
12514 * relatively easy to deal with: if we're in
12515 * this case, the buffer is indistinguishable
12516 * from one that hasn't wrapped; we need only
12517 * finish the job by clearing the wrapped bit,
12518 * explicitly setting the offset to be 0, and
12519 * zero'ing out the old data in the buffer.
12520 */
12521 if (offs == 0) {
12522 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12523 buf->dtb_offset = 0;
12524 woffs = total;
12525
12526 while (woffs < buf->dtb_size)
12527 tomax[woffs++] = 0;
12528 }
12529
12530 woffs = 0;
12531 break;
12532 }
12533
12534 woffs += size;
12535 }
12536
12537 /*
12538 * We have a wrapped offset. It may be that the wrapped offset
12539 * has become zero -- that's okay.
12540 */
12541 buf->dtb_xamot_offset = woffs;
12542 }
12543
12544 out:
12545 /*
12546 * Now we can plow the buffer with any necessary padding.
12547 */
12548 while (offs & (align - 1)) {
12549 /*
12550 * Assert that our alignment is off by a number which
12551 * is itself sizeof (uint32_t) aligned.
12552 */
12553 ASSERT(!((align - (offs & (align - 1))) &
12554 (sizeof (uint32_t) - 1)));
12555 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12556 offs += sizeof (uint32_t);
12557 }
12558
12559 if (buf->dtb_flags & DTRACEBUF_FILL) {
12560 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12561 buf->dtb_flags |= DTRACEBUF_FULL;
12562 return (-1);
12563 }
12564 }
12565
12566 if (mstate == NULL)
12567 return (offs);
12568
12569 /*
12570 * For ring buffers and fill buffers, the scratch space is always
12571 * the inactive buffer.
12572 */
12573 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12574 mstate->dtms_scratch_size = buf->dtb_size;
12575 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12576
12577 return (offs);
12578 }
12579
12580 static void
12581 dtrace_buffer_polish(dtrace_buffer_t *buf)
12582 {
12583 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12584 ASSERT(MUTEX_HELD(&dtrace_lock));
12585
12586 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12587 return;
12588
12589 /*
12590 * We need to polish the ring buffer. There are three cases:
12591 *
12592 * - The first (and presumably most common) is that there is no gap
12593 * between the buffer offset and the wrapped offset. In this case,
12594 * there is nothing in the buffer that isn't valid data; we can
12595 * mark the buffer as polished and return.
12596 *
12597 * - The second (less common than the first but still more common
12598 * than the third) is that there is a gap between the buffer offset
12599 * and the wrapped offset, and the wrapped offset is larger than the
12600 * buffer offset. This can happen because of an alignment issue, or
12601 * can happen because of a call to dtrace_buffer_reserve() that
12602 * didn't subsequently consume the buffer space. In this case,
12603 * we need to zero the data from the buffer offset to the wrapped
12604 * offset.
12605 *
12606 * - The third (and least common) is that there is a gap between the
12607 * buffer offset and the wrapped offset, but the wrapped offset is
12608 * _less_ than the buffer offset. This can only happen because a
12609 * call to dtrace_buffer_reserve() induced a wrap, but the space
12610 * was not subsequently consumed. In this case, we need to zero the
12611 * space from the offset to the end of the buffer _and_ from the
12612 * top of the buffer to the wrapped offset.
12613 */
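	/*
	 * To illustrate the third case with hypothetical offsets: with
	 * dtb_offset == 10 and dtb_xamot_offset == 4 in a 64-byte buffer,
	 * bytes 10 through 63 and bytes 0 through 3 are zeroed, leaving
	 * only the committed data between offsets 4 and 10.
	 */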
12614 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12615 bzero(buf->dtb_tomax + buf->dtb_offset,
12616 buf->dtb_xamot_offset - buf->dtb_offset);
12617 }
12618
12619 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12620 bzero(buf->dtb_tomax + buf->dtb_offset,
12621 buf->dtb_size - buf->dtb_offset);
12622 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12623 }
12624 }
12625
12626 /*
12627 * This routine determines if data generated at the specified time has likely
12628 * been entirely consumed at user-level. This routine is called to determine
12629 * if an ECB on a defunct probe (but for an active enabling) can be safely
12630 * disabled and destroyed.
12631 */
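/*
 * A note on the checks below: dtb_switched - dtb_interval is the time of
 * the next-to-last buffer switch, so requiring it to be no earlier than
 * "when" means the buffer pair has been switched twice since the data was
 * generated -- both halves have passed through the consumer.
 */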
12632 static int
12633 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12634 {
12635 int i;
12636
12637 CPU_FOREACH(i) {
12638 dtrace_buffer_t *buf = &bufs[i];
12639
12640 if (buf->dtb_size == 0)
12641 continue;
12642
12643 if (buf->dtb_flags & DTRACEBUF_RING)
12644 return (0);
12645
12646 if (!buf->dtb_switched && buf->dtb_offset != 0)
12647 return (0);
12648
12649 if (buf->dtb_switched - buf->dtb_interval < when)
12650 return (0);
12651 }
12652
12653 return (1);
12654 }
12655
12656 static void
12657 dtrace_buffer_free(dtrace_buffer_t *bufs)
12658 {
12659 int i;
12660
12661 CPU_FOREACH(i) {
12662 dtrace_buffer_t *buf = &bufs[i];
12663
12664 if (buf->dtb_tomax == NULL) {
12665 ASSERT(buf->dtb_xamot == NULL);
12666 ASSERT(buf->dtb_size == 0);
12667 continue;
12668 }
12669
12670 if (buf->dtb_xamot != NULL) {
12671 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12672 kmem_free(buf->dtb_xamot, buf->dtb_size);
12673 }
12674
12675 kmem_free(buf->dtb_tomax, buf->dtb_size);
12676 buf->dtb_size = 0;
12677 buf->dtb_tomax = NULL;
12678 buf->dtb_xamot = NULL;
12679 }
12680 }
12681
12682 /*
12683 * DTrace Enabling Functions
12684 */
12685 static dtrace_enabling_t *
12686 dtrace_enabling_create(dtrace_vstate_t *vstate)
12687 {
12688 dtrace_enabling_t *enab;
12689
12690 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12691 enab->dten_vstate = vstate;
12692
12693 return (enab);
12694 }
12695
12696 static void
12697 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12698 {
12699 dtrace_ecbdesc_t **ndesc;
12700 size_t osize, nsize;
12701
12702 /*
12703 * We can't add to enablings after we've enabled them, or after we've
12704 * retained them.
12705 */
12706 ASSERT(enab->dten_probegen == 0);
12707 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12708
12709 if (enab->dten_ndesc < enab->dten_maxdesc) {
12710 enab->dten_desc[enab->dten_ndesc++] = ecb;
12711 return;
12712 }
12713
	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12715
12716 if (enab->dten_maxdesc == 0) {
12717 enab->dten_maxdesc = 1;
12718 } else {
12719 enab->dten_maxdesc <<= 1;
12720 }
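	/*
	 * dten_maxdesc thus grows geometrically (1, 2, 4, 8, ...); the
	 * old array -- osize bytes, as computed from the previous
	 * maximum -- is copied into the new allocation below and freed.
	 */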
12721
12722 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12723
	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12725 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12726 bcopy(enab->dten_desc, ndesc, osize);
12727 if (enab->dten_desc != NULL)
12728 kmem_free(enab->dten_desc, osize);
12729
12730 enab->dten_desc = ndesc;
12731 enab->dten_desc[enab->dten_ndesc++] = ecb;
12732 }
12733
12734 static void
12735 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12736 dtrace_probedesc_t *pd)
12737 {
12738 dtrace_ecbdesc_t *new;
12739 dtrace_predicate_t *pred;
12740 dtrace_actdesc_t *act;
12741
12742 /*
12743 * We're going to create a new ECB description that matches the
12744 * specified ECB in every way, but has the specified probe description.
12745 */
12746 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12747
12748 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12749 dtrace_predicate_hold(pred);
12750
12751 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12752 dtrace_actdesc_hold(act);
12753
12754 new->dted_action = ecb->dted_action;
12755 new->dted_pred = ecb->dted_pred;
12756 new->dted_probe = *pd;
12757 new->dted_uarg = ecb->dted_uarg;
12758
12759 dtrace_enabling_add(enab, new);
12760 }
12761
12762 static void
12763 dtrace_enabling_dump(dtrace_enabling_t *enab)
12764 {
12765 int i;
12766
12767 for (i = 0; i < enab->dten_ndesc; i++) {
12768 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12769
12770 #ifdef __FreeBSD__
12771 printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i,
12772 desc->dtpd_provider, desc->dtpd_mod,
12773 desc->dtpd_func, desc->dtpd_name);
12774 #else
12775 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12776 desc->dtpd_provider, desc->dtpd_mod,
12777 desc->dtpd_func, desc->dtpd_name);
12778 #endif
12779 }
12780 }
12781
12782 static void
12783 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12784 {
12785 int i;
12786 dtrace_ecbdesc_t *ep;
12787 dtrace_vstate_t *vstate = enab->dten_vstate;
12788
12789 ASSERT(MUTEX_HELD(&dtrace_lock));
12790
12791 for (i = 0; i < enab->dten_ndesc; i++) {
12792 dtrace_actdesc_t *act, *next;
12793 dtrace_predicate_t *pred;
12794
12795 ep = enab->dten_desc[i];
12796
12797 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12798 dtrace_predicate_release(pred, vstate);
12799
12800 for (act = ep->dted_action; act != NULL; act = next) {
12801 next = act->dtad_next;
12802 dtrace_actdesc_release(act, vstate);
12803 }
12804
12805 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12806 }
12807
12808 if (enab->dten_desc != NULL)
12809 kmem_free(enab->dten_desc,
		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12811
12812 /*
12813 * If this was a retained enabling, decrement the dts_nretained count
12814 * and take it off of the dtrace_retained list.
12815 */
12816 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12817 dtrace_retained == enab) {
12818 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12819 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12820 enab->dten_vstate->dtvs_state->dts_nretained--;
12821 dtrace_retained_gen++;
12822 }
12823
12824 if (enab->dten_prev == NULL) {
12825 if (dtrace_retained == enab) {
12826 dtrace_retained = enab->dten_next;
12827
12828 if (dtrace_retained != NULL)
12829 dtrace_retained->dten_prev = NULL;
12830 }
12831 } else {
12832 ASSERT(enab != dtrace_retained);
12833 ASSERT(dtrace_retained != NULL);
12834 enab->dten_prev->dten_next = enab->dten_next;
12835 }
12836
12837 if (enab->dten_next != NULL) {
12838 ASSERT(dtrace_retained != NULL);
12839 enab->dten_next->dten_prev = enab->dten_prev;
12840 }
12841
12842 kmem_free(enab, sizeof (dtrace_enabling_t));
12843 }
12844
12845 static int
12846 dtrace_enabling_retain(dtrace_enabling_t *enab)
12847 {
12848 dtrace_state_t *state;
12849
12850 ASSERT(MUTEX_HELD(&dtrace_lock));
12851 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12852 ASSERT(enab->dten_vstate != NULL);
12853
12854 state = enab->dten_vstate->dtvs_state;
12855 ASSERT(state != NULL);
12856
12857 /*
12858 * We only allow each state to retain dtrace_retain_max enablings.
12859 */
12860 if (state->dts_nretained >= dtrace_retain_max)
12861 return (ENOSPC);
12862
12863 state->dts_nretained++;
12864 dtrace_retained_gen++;
12865
12866 if (dtrace_retained == NULL) {
12867 dtrace_retained = enab;
12868 return (0);
12869 }
12870
12871 enab->dten_next = dtrace_retained;
12872 dtrace_retained->dten_prev = enab;
12873 dtrace_retained = enab;
12874
12875 return (0);
12876 }
12877
12878 static int
12879 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12880 dtrace_probedesc_t *create)
12881 {
12882 dtrace_enabling_t *new, *enab;
12883 int found = 0, err = ENOENT;
12884
12885 ASSERT(MUTEX_HELD(&dtrace_lock));
12886 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12887 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12888 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12889 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12890
12891 new = dtrace_enabling_create(&state->dts_vstate);
12892
12893 /*
12894 * Iterate over all retained enablings, looking for enablings that
12895 * match the specified state.
12896 */
12897 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12898 int i;
12899
12900 /*
12901 * dtvs_state can only be NULL for helper enablings -- and
12902 * helper enablings can't be retained.
12903 */
12904 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12905
12906 if (enab->dten_vstate->dtvs_state != state)
12907 continue;
12908
12909 /*
12910 * Now iterate over each probe description; we're looking for
12911 * an exact match to the specified probe description.
12912 */
12913 for (i = 0; i < enab->dten_ndesc; i++) {
12914 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12915 dtrace_probedesc_t *pd = &ep->dted_probe;
12916
12917 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12918 continue;
12919
12920 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12921 continue;
12922
12923 if (strcmp(pd->dtpd_func, match->dtpd_func))
12924 continue;
12925
12926 if (strcmp(pd->dtpd_name, match->dtpd_name))
12927 continue;
12928
12929 /*
12930 * We have a winning probe! Add it to our growing
12931 * enabling.
12932 */
12933 found = 1;
12934 dtrace_enabling_addlike(new, ep, create);
12935 }
12936 }
12937
12938 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12939 dtrace_enabling_destroy(new);
12940 return (err);
12941 }
12942
12943 return (0);
12944 }
12945
12946 static void
12947 dtrace_enabling_retract(dtrace_state_t *state)
12948 {
12949 dtrace_enabling_t *enab, *next;
12950
12951 ASSERT(MUTEX_HELD(&dtrace_lock));
12952
12953 /*
	 * Iterate over all retained enablings, destroying those retained
	 * for the specified state.
12956 */
12957 for (enab = dtrace_retained; enab != NULL; enab = next) {
12958 next = enab->dten_next;
12959
12960 /*
12961 * dtvs_state can only be NULL for helper enablings -- and
12962 * helper enablings can't be retained.
12963 */
12964 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12965
12966 if (enab->dten_vstate->dtvs_state == state) {
12967 ASSERT(state->dts_nretained > 0);
12968 dtrace_enabling_destroy(enab);
12969 }
12970 }
12971
12972 ASSERT(state->dts_nretained == 0);
12973 }
12974
12975 static int
12976 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12977 {
12978 int i = 0;
12979 int matched = 0;
12980
12981 ASSERT(MUTEX_HELD(&cpu_lock));
12982 ASSERT(MUTEX_HELD(&dtrace_lock));
12983
12984 for (i = 0; i < enab->dten_ndesc; i++) {
12985 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12986
12987 enab->dten_current = ep;
12988 enab->dten_error = 0;
12989
12990 matched += dtrace_probe_enable(&ep->dted_probe, enab);
12991
12992 if (enab->dten_error != 0) {
12993 /*
12994 * If we get an error half-way through enabling the
12995 * probes, we kick out -- perhaps with some number of
12996 * them enabled. Leaving enabled probes enabled may
12997 * be slightly confusing for user-level, but we expect
12998 * that no one will attempt to actually drive on in
12999 * the face of such errors. If this is an anonymous
13000 * enabling (indicated with a NULL nmatched pointer),
13001 * we cmn_err() a message. We aren't expecting to
			 * get such an error -- to the extent that one can
			 * exist at all, it would be the result of corrupted
			 * DOF in the driver properties.
13005 */
13006 if (nmatched == NULL) {
13007 cmn_err(CE_WARN, "dtrace_enabling_match() "
13008 "error on %p: %d", (void *)ep,
13009 enab->dten_error);
13010 }
13011
13012 return (enab->dten_error);
13013 }
13014 }
13015
13016 enab->dten_probegen = dtrace_probegen;
13017 if (nmatched != NULL)
13018 *nmatched = matched;
13019
13020 return (0);
13021 }
13022
13023 static void
13024 dtrace_enabling_matchall_task(void *args __unused)
13025 {
13026 dtrace_enabling_matchall();
13027 }
13028
13029 static void
13030 dtrace_enabling_matchall(void)
13031 {
13032 dtrace_enabling_t *enab;
13033
13034 mutex_enter(&cpu_lock);
13035 mutex_enter(&dtrace_lock);
13036
13037 /*
13038 * Iterate over all retained enablings to see if any probes match
13039 * against them. We only perform this operation on enablings for which
13040 * we have sufficient permissions by virtue of being in the global zone
13041 * or in the same zone as the DTrace client. Because we can be called
13042 * after dtrace_detach() has been called, we cannot assert that there
13043 * are retained enablings. We can safely load from dtrace_retained,
13044 * however: the taskq_destroy() at the end of dtrace_detach() will
13045 * block pending our completion.
13046 */
13047 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
13048 #ifdef illumos
13049 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
13050
13051 if (INGLOBALZONE(curproc) ||
13052 cr != NULL && getzoneid() == crgetzoneid(cr))
13053 #endif
13054 (void) dtrace_enabling_match(enab, NULL);
13055 }
13056
13057 mutex_exit(&dtrace_lock);
13058 mutex_exit(&cpu_lock);
13059 }
13060
13061 /*
13062 * If an enabling is to be enabled without having matched probes (that is, if
13063 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
13064 * enabling must be _primed_ by creating an ECB for every ECB description.
13065 * This must be done to assure that we know the number of speculations, the
13066 * number of aggregations, the minimum buffer size needed, etc. before we
13067 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
13069 * NULL probe -- which is exactly what this function does.
13070 */
13071 static void
13072 dtrace_enabling_prime(dtrace_state_t *state)
13073 {
13074 dtrace_enabling_t *enab;
13075 int i;
13076
13077 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
13078 ASSERT(enab->dten_vstate->dtvs_state != NULL);
13079
13080 if (enab->dten_vstate->dtvs_state != state)
13081 continue;
13082
13083 /*
13084 * We don't want to prime an enabling more than once, lest
13085 * we allow a malicious user to induce resource exhaustion.
13086 * (The ECBs that result from priming an enabling aren't
13087 * leaked -- but they also aren't deallocated until the
13088 * consumer state is destroyed.)
13089 */
13090 if (enab->dten_primed)
13091 continue;
13092
13093 for (i = 0; i < enab->dten_ndesc; i++) {
13094 enab->dten_current = enab->dten_desc[i];
13095 (void) dtrace_probe_enable(NULL, enab);
13096 }
13097
13098 enab->dten_primed = 1;
13099 }
13100 }
13101
13102 /*
13103 * Called to indicate that probes should be provided due to retained
13104 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
 * must take an initial lap through the retained enablings, calling the
 * dtps_provide() entry point explicitly to allow for autocreated probes.
13107 */
13108 static void
13109 dtrace_enabling_provide(dtrace_provider_t *prv)
13110 {
13111 int i, all = 0;
13112 dtrace_probedesc_t desc;
13113 dtrace_genid_t gen;
13114
13115 ASSERT(MUTEX_HELD(&dtrace_lock));
13116 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
13117
13118 if (prv == NULL) {
13119 all = 1;
13120 prv = dtrace_provider;
13121 }
13122
13123 do {
13124 dtrace_enabling_t *enab;
13125 void *parg = prv->dtpv_arg;
13126
13127 retry:
13128 gen = dtrace_retained_gen;
13129 for (enab = dtrace_retained; enab != NULL;
13130 enab = enab->dten_next) {
13131 for (i = 0; i < enab->dten_ndesc; i++) {
13132 desc = enab->dten_desc[i]->dted_probe;
13133 mutex_exit(&dtrace_lock);
13134 prv->dtpv_pops.dtps_provide(parg, &desc);
13135 mutex_enter(&dtrace_lock);
13136 /*
13137 * Process the retained enablings again if
13138 * they have changed while we weren't holding
13139 * dtrace_lock.
13140 */
13141 if (gen != dtrace_retained_gen)
13142 goto retry;
13143 }
13144 }
13145 } while (all && (prv = prv->dtpv_next) != NULL);
13146
13147 mutex_exit(&dtrace_lock);
13148 dtrace_probe_provide(NULL, all ? NULL : prv);
13149 mutex_enter(&dtrace_lock);
13150 }
13151
13152 /*
13153 * Called to reap ECBs that are attached to probes from defunct providers.
13154 */
13155 static void
13156 dtrace_enabling_reap(void *args __unused)
13157 {
13158 dtrace_provider_t *prov;
13159 dtrace_probe_t *probe;
13160 dtrace_ecb_t *ecb;
13161 hrtime_t when;
13162 int i;
13163
13164 mutex_enter(&cpu_lock);
13165 mutex_enter(&dtrace_lock);
13166
13167 for (i = 0; i < dtrace_nprobes; i++) {
13168 if ((probe = dtrace_probes[i]) == NULL)
13169 continue;
13170
13171 if (probe->dtpr_ecb == NULL)
13172 continue;
13173
13174 prov = probe->dtpr_provider;
13175
13176 if ((when = prov->dtpv_defunct) == 0)
13177 continue;
13178
13179 /*
13180 * We have ECBs on a defunct provider: we want to reap these
13181 * ECBs to allow the provider to unregister. The destruction
13182 * of these ECBs must be done carefully: if we destroy the ECB
13183 * and the consumer later wishes to consume an EPID that
13184 * corresponds to the destroyed ECB (and if the EPID metadata
13185 * has not been previously consumed), the consumer will abort
13186 * processing on the unknown EPID. To reduce (but not, sadly,
13187 * eliminate) the possibility of this, we will only destroy an
13188 * ECB for a defunct provider if, for the state that
13189 * corresponds to the ECB:
13190 *
13191 * (a) There is no speculative tracing (which can effectively
13192 * cache an EPID for an arbitrary amount of time).
13193 *
13194 * (b) The principal buffers have been switched twice since the
13195 * provider became defunct.
13196 *
13197 * (c) The aggregation buffers are of zero size or have been
13198 * switched twice since the provider became defunct.
13199 *
13200 * We use dts_speculates to determine (a) and call a function
13201 * (dtrace_buffer_consumed()) to determine (b) and (c). Note
13202 * that as soon as we've been unable to destroy one of the ECBs
13203 * associated with the probe, we quit trying -- reaping is only
13204 * fruitful in as much as we can destroy all ECBs associated
13205 * with the defunct provider's probes.
13206 */
13207 while ((ecb = probe->dtpr_ecb) != NULL) {
13208 dtrace_state_t *state = ecb->dte_state;
13209 dtrace_buffer_t *buf = state->dts_buffer;
13210 dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
13211
13212 if (state->dts_speculates)
13213 break;
13214
13215 if (!dtrace_buffer_consumed(buf, when))
13216 break;
13217
13218 if (!dtrace_buffer_consumed(aggbuf, when))
13219 break;
13220
13221 dtrace_ecb_disable(ecb);
13222 ASSERT(probe->dtpr_ecb != ecb);
13223 dtrace_ecb_destroy(ecb);
13224 }
13225 }
13226
13227 mutex_exit(&dtrace_lock);
13228 mutex_exit(&cpu_lock);
13229 }
13230
13231 /*
13232 * DTrace DOF Functions
13233 */
13234 /*ARGSUSED*/
13235 static void
13236 dtrace_dof_error(dof_hdr_t *dof, const char *str)
13237 {
13238 if (dtrace_err_verbose)
13239 cmn_err(CE_WARN, "failed to process DOF: %s", str);
13240
13241 #ifdef DTRACE_ERRDEBUG
13242 dtrace_errdebug(str);
13243 #endif
13244 }
13245
13246 /*
13247 * Create DOF out of a currently enabled state. Right now, we only create
13248 * DOF containing the run-time options -- but this could be expanded to create
13249 * complete DOF representing the enabled state.
13250 */
13251 static dof_hdr_t *
13252 dtrace_dof_create(dtrace_state_t *state)
13253 {
13254 dof_hdr_t *dof;
13255 dof_sec_t *sec;
13256 dof_optdesc_t *opt;
13257 int i, len = sizeof (dof_hdr_t) +
13258 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
13259 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13260
13261 ASSERT(MUTEX_HELD(&dtrace_lock));
13262
13263 dof = kmem_zalloc(len, KM_SLEEP);
13264 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
13265 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
13266 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
13267 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
13268
13269 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
13270 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
13271 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
13272 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
13273 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
13274 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
13275
13276 dof->dofh_flags = 0;
13277 dof->dofh_hdrsize = sizeof (dof_hdr_t);
13278 dof->dofh_secsize = sizeof (dof_sec_t);
13279 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
13280 dof->dofh_secoff = sizeof (dof_hdr_t);
13281 dof->dofh_loadsz = len;
13282 dof->dofh_filesz = len;
13283 dof->dofh_pad = 0;
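	/*
	 * The resulting single-section DOF image is laid out as follows
	 * (an illustrative sketch of the sizes computed above):
	 *
	 *	+-------------------------------+ offset 0
	 *	| dof_hdr_t			|
	 *	+-------------------------------+ sizeof (dof_hdr_t)
	 *	| dof_sec_t (DOF_SECT_OPTDESC)	|
	 *	+-------------------------------+ (rounded up to uint64_t)
	 *	| dof_optdesc_t[DTRACEOPT_MAX]	|
	 *	+-------------------------------+ len
	 */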
13284
13285 /*
13286 * Fill in the option section header...
13287 */
13288 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
13289 sec->dofs_type = DOF_SECT_OPTDESC;
13290 sec->dofs_align = sizeof (uint64_t);
13291 sec->dofs_flags = DOF_SECF_LOAD;
13292 sec->dofs_entsize = sizeof (dof_optdesc_t);
13293
13294 opt = (dof_optdesc_t *)((uintptr_t)sec +
13295 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
13296
13297 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
13298 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13299
13300 for (i = 0; i < DTRACEOPT_MAX; i++) {
13301 opt[i].dofo_option = i;
13302 opt[i].dofo_strtab = DOF_SECIDX_NONE;
13303 opt[i].dofo_value = state->dts_options[i];
13304 }
13305
13306 return (dof);
13307 }
13308
13309 static dof_hdr_t *
13310 dtrace_dof_copyin(uintptr_t uarg, int *errp)
13311 {
13312 dof_hdr_t hdr, *dof;
13313
13314 ASSERT(!MUTEX_HELD(&dtrace_lock));
13315
13316 /*
13317 * First, we're going to copyin() the sizeof (dof_hdr_t).
13318 */
13319 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
13320 dtrace_dof_error(NULL, "failed to copyin DOF header");
13321 *errp = EFAULT;
13322 return (NULL);
13323 }
13324
13325 /*
13326 * Now we'll allocate the entire DOF and copy it in -- provided
13327 * that the length isn't outrageous.
13328 */
13329 if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
13330 dtrace_dof_error(&hdr, "load size exceeds maximum");
13331 *errp = E2BIG;
13332 return (NULL);
13333 }
13334
13335 if (hdr.dofh_loadsz < sizeof (hdr)) {
13336 dtrace_dof_error(&hdr, "invalid load size");
13337 *errp = EINVAL;
13338 return (NULL);
13339 }
13340
13341 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
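	/*
	 * Copy in the entire DOF, then re-check the embedded load size: a
	 * racing writer could have changed it between the two copyin()
	 * calls, and any mismatch with the size we validated above is
	 * rejected.
	 */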
13342
13343 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
13344 dof->dofh_loadsz != hdr.dofh_loadsz) {
13345 kmem_free(dof, hdr.dofh_loadsz);
13346 *errp = EFAULT;
13347 return (NULL);
13348 }
13349
13350 return (dof);
13351 }
13352
13353 #ifdef __FreeBSD__
13354 static dof_hdr_t *
13355 dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp)
13356 {
13357 dof_hdr_t hdr, *dof;
13358 struct thread *td;
13359 size_t loadsz;
13360
13361 ASSERT(!MUTEX_HELD(&dtrace_lock));
13362
13363 td = curthread;
13364
13365 /*
	 * First, we're going to read in the sizeof (dof_hdr_t).
13367 */
13368 if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) {
13369 dtrace_dof_error(NULL, "failed to copyin DOF header");
13370 *errp = EFAULT;
13371 return (NULL);
13372 }
13373
13374 /*
13375 * Now we'll allocate the entire DOF and copy it in -- provided
13376 * that the length isn't outrageous.
13377 */
13378 if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
13379 dtrace_dof_error(&hdr, "load size exceeds maximum");
13380 *errp = E2BIG;
13381 return (NULL);
13382 }
13383 loadsz = (size_t)hdr.dofh_loadsz;
13384
13385 if (loadsz < sizeof (hdr)) {
13386 dtrace_dof_error(&hdr, "invalid load size");
13387 *errp = EINVAL;
13388 return (NULL);
13389 }
13390
13391 dof = kmem_alloc(loadsz, KM_SLEEP);
13392
13393 if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz ||
13394 dof->dofh_loadsz != loadsz) {
13395 kmem_free(dof, hdr.dofh_loadsz);
13396 *errp = EFAULT;
13397 return (NULL);
13398 }
13399
13400 return (dof);
13401 }
13402
13403 static __inline uchar_t
13404 dtrace_dof_char(char c)
13405 {
13406
13407 switch (c) {
13408 case '0':
13409 case '1':
13410 case '2':
13411 case '3':
13412 case '4':
13413 case '5':
13414 case '6':
13415 case '7':
13416 case '8':
13417 case '9':
13418 return (c - '0');
13419 case 'A':
13420 case 'B':
13421 case 'C':
13422 case 'D':
13423 case 'E':
13424 case 'F':
13425 return (c - 'A' + 10);
13426 case 'a':
13427 case 'b':
13428 case 'c':
13429 case 'd':
13430 case 'e':
13431 case 'f':
13432 return (c - 'a' + 10);
13433 }
	/* Not a hex digit; callers treat UCHAR_MAX as an error. */
13435 return (UCHAR_MAX);
13436 }
13437 #endif /* __FreeBSD__ */
13438
13439 static dof_hdr_t *
13440 dtrace_dof_property(const char *name)
13441 {
13442 #ifdef __FreeBSD__
13443 uint8_t *dofbuf;
13444 u_char *data, *eol;
13445 caddr_t doffile;
13446 size_t bytes, len, i;
13447 dof_hdr_t *dof;
13448 u_char c1, c2;
13449
13450 dof = NULL;
13451
13452 doffile = preload_search_by_type("dtrace_dof");
13453 if (doffile == NULL)
13454 return (NULL);
13455
13456 data = preload_fetch_addr(doffile);
13457 len = preload_fetch_size(doffile);
13458 for (;;) {
13459 /* Look for the end of the line. All lines end in a newline. */
13460 eol = memchr(data, '\n', len);
13461 if (eol == NULL)
13462 return (NULL);
13463
13464 if (strncmp(name, data, strlen(name)) == 0)
13465 break;
13466
13467 eol++; /* skip past the newline */
13468 len -= eol - data;
13469 data = eol;
13470 }
13471
13472 /* We've found the data corresponding to the specified key. */
13473
13474 data += strlen(name) + 1; /* skip past the '=' */
13475 len = eol - data;
13476 if (len % 2 != 0) {
13477 dtrace_dof_error(NULL, "invalid DOF encoding length");
13478 goto doferr;
13479 }
13480 bytes = len / 2;
13481 if (bytes < sizeof(dof_hdr_t)) {
13482 dtrace_dof_error(NULL, "truncated header");
13483 goto doferr;
13484 }
13485
13486 /*
13487 * Each byte is represented by the two ASCII characters in its hex
13488 * representation.
13489 */
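	/*
	 * For example, the character pair "7f" decodes as c1 == 7 and
	 * c2 == 15, yielding the byte 0x7f; any non-hex character maps to
	 * UCHAR_MAX and aborts the decode below.
	 */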
	dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);
	dof = (dof_hdr_t *)dofbuf;	/* let the doferr path free dofbuf */
13491 for (i = 0; i < bytes; i++) {
13492 c1 = dtrace_dof_char(data[i * 2]);
13493 c2 = dtrace_dof_char(data[i * 2 + 1]);
13494 if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {
13495 dtrace_dof_error(NULL, "invalid hex char in DOF");
13496 goto doferr;
13497 }
13498 dofbuf[i] = c1 * 16 + c2;
13499 }
13500
13502 if (bytes < dof->dofh_loadsz) {
13503 dtrace_dof_error(NULL, "truncated DOF");
13504 goto doferr;
13505 }
13506
13507 if (dof->dofh_loadsz >= dtrace_dof_maxsize) {
13508 dtrace_dof_error(NULL, "oversized DOF");
13509 goto doferr;
13510 }
13511
13512 return (dof);
13513
13514 doferr:
13515 free(dof, M_SOLARIS);
13516 return (NULL);
13517 #else /* __FreeBSD__ */
13518 uchar_t *buf;
13519 uint64_t loadsz;
13520 unsigned int len, i;
13521 dof_hdr_t *dof;
13522
13523 /*
	 * Unfortunately, arrays of values in .conf files are always (and
13525 * only) interpreted to be integer arrays. We must read our DOF
13526 * as an integer array, and then squeeze it into a byte array.
13527 */
13528 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13529 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13530 return (NULL);
13531
13532 for (i = 0; i < len; i++)
13533 buf[i] = (uchar_t)(((int *)buf)[i]);
13534
13535 if (len < sizeof (dof_hdr_t)) {
13536 ddi_prop_free(buf);
13537 dtrace_dof_error(NULL, "truncated header");
13538 return (NULL);
13539 }
13540
13541 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13542 ddi_prop_free(buf);
13543 dtrace_dof_error(NULL, "truncated DOF");
13544 return (NULL);
13545 }
13546
13547 if (loadsz >= dtrace_dof_maxsize) {
13548 ddi_prop_free(buf);
13549 dtrace_dof_error(NULL, "oversized DOF");
13550 return (NULL);
13551 }
13552
13553 dof = kmem_alloc(loadsz, KM_SLEEP);
13554 bcopy(buf, dof, loadsz);
13555 ddi_prop_free(buf);
13556
13557 return (dof);
13558 #endif /* !__FreeBSD__ */
13559 }
13560
13561 static void
13562 dtrace_dof_destroy(dof_hdr_t *dof)
13563 {
13564 kmem_free(dof, dof->dofh_loadsz);
13565 }
13566
13567 /*
13568 * Return the dof_sec_t pointer corresponding to a given section index. If the
13569 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13570 * a type other than DOF_SECT_NONE is specified, the header is checked against
13571 * this type and NULL is returned if the types do not match.
13572 */
13573 static dof_sec_t *
13574 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13575 {
	dof_sec_t *sec;

	if (i >= dof->dofh_secnum) {
		dtrace_dof_error(dof, "referenced section index is invalid");
		return (NULL);
	}

	sec = (dof_sec_t *)(uintptr_t)((uintptr_t)dof +
	    dof->dofh_secoff + i * dof->dofh_secsize);
13583
13584 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13585 dtrace_dof_error(dof, "referenced section is not loadable");
13586 return (NULL);
13587 }
13588
13589 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13590 dtrace_dof_error(dof, "referenced section is the wrong type");
13591 return (NULL);
13592 }
13593
13594 return (sec);
13595 }
13596
13597 static dtrace_probedesc_t *
13598 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13599 {
13600 dof_probedesc_t *probe;
13601 dof_sec_t *strtab;
13602 uintptr_t daddr = (uintptr_t)dof;
13603 uintptr_t str;
13604 size_t size;
13605
13606 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13607 dtrace_dof_error(dof, "invalid probe section");
13608 return (NULL);
13609 }
13610
13611 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13612 dtrace_dof_error(dof, "bad alignment in probe description");
13613 return (NULL);
13614 }
13615
13616 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13617 dtrace_dof_error(dof, "truncated probe description");
13618 return (NULL);
13619 }
13620
13621 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13622 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13623
13624 if (strtab == NULL)
13625 return (NULL);
13626
13627 str = daddr + strtab->dofs_offset;
13628 size = strtab->dofs_size;
13629
13630 if (probe->dofp_provider >= strtab->dofs_size) {
13631 dtrace_dof_error(dof, "corrupt probe provider");
13632 return (NULL);
13633 }
13634
13635 (void) strncpy(desc->dtpd_provider,
13636 (char *)(str + probe->dofp_provider),
13637 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13638
13639 if (probe->dofp_mod >= strtab->dofs_size) {
13640 dtrace_dof_error(dof, "corrupt probe module");
13641 return (NULL);
13642 }
13643
13644 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13645 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13646
13647 if (probe->dofp_func >= strtab->dofs_size) {
13648 dtrace_dof_error(dof, "corrupt probe function");
13649 return (NULL);
13650 }
13651
13652 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13653 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13654
13655 if (probe->dofp_name >= strtab->dofs_size) {
13656 dtrace_dof_error(dof, "corrupt probe name");
13657 return (NULL);
13658 }
13659
13660 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13661 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13662
13663 return (desc);
13664 }
13665
13666 static dtrace_difo_t *
13667 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13668 cred_t *cr)
13669 {
13670 dtrace_difo_t *dp;
13671 size_t ttl = 0;
13672 dof_difohdr_t *dofd;
13673 uintptr_t daddr = (uintptr_t)dof;
13674 size_t max = dtrace_difo_maxsize;
13675 int i, l, n;
13676
13677 static const struct {
13678 int section;
13679 int bufoffs;
13680 int lenoffs;
13681 int entsize;
13682 int align;
13683 const char *msg;
13684 } difo[] = {
13685 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13686 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13687 sizeof (dif_instr_t), "multiple DIF sections" },
13688
13689 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13690 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13691 sizeof (uint64_t), "multiple integer tables" },
13692
13693 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13694 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13695 sizeof (char), "multiple string tables" },
13696
13697 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13698 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13699 sizeof (uint_t), "multiple variable tables" },
13700
13701 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13702 };
13703
13704 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13705 dtrace_dof_error(dof, "invalid DIFO header section");
13706 return (NULL);
13707 }
13708
13709 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13710 dtrace_dof_error(dof, "bad alignment in DIFO header");
13711 return (NULL);
13712 }
13713
13714 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13715 sec->dofs_size % sizeof (dof_secidx_t)) {
13716 dtrace_dof_error(dof, "bad size in DIFO header");
13717 return (NULL);
13718 }
13719
13720 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13721 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13722
13723 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13724 dp->dtdo_rtype = dofd->dofd_rtype;
13725
13726 for (l = 0; l < n; l++) {
13727 dof_sec_t *subsec;
13728 void **bufp;
13729 uint32_t *lenp;
13730
13731 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13732 dofd->dofd_links[l])) == NULL)
13733 goto err; /* invalid section link */
13734
13735 if (ttl + subsec->dofs_size > max) {
13736 dtrace_dof_error(dof, "exceeds maximum size");
13737 goto err;
13738 }
13739
13740 ttl += subsec->dofs_size;
13741
13742 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13743 if (subsec->dofs_type != difo[i].section)
13744 continue;
13745
13746 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13747 dtrace_dof_error(dof, "section not loaded");
13748 goto err;
13749 }
13750
13751 if (subsec->dofs_align != difo[i].align) {
13752 dtrace_dof_error(dof, "bad alignment");
13753 goto err;
13754 }
13755
13756 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13757 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13758
13759 if (*bufp != NULL) {
13760 dtrace_dof_error(dof, difo[i].msg);
13761 goto err;
13762 }
13763
13764 if (difo[i].entsize != subsec->dofs_entsize) {
13765 dtrace_dof_error(dof, "entry size mismatch");
13766 goto err;
13767 }
13768
13769 if (subsec->dofs_entsize != 0 &&
13770 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13771 dtrace_dof_error(dof, "corrupt entry size");
13772 goto err;
13773 }
13774
13775 *lenp = subsec->dofs_size;
13776 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13777 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13778 *bufp, subsec->dofs_size);
13779
13780 if (subsec->dofs_entsize != 0)
13781 *lenp /= subsec->dofs_entsize;
13782
13783 break;
13784 }
13785
13786 /*
13787 * If we encounter a loadable DIFO sub-section that is not
13788 * known to us, assume this is a broken program and fail.
13789 */
13790 if (difo[i].section == DOF_SECT_NONE &&
13791 (subsec->dofs_flags & DOF_SECF_LOAD)) {
13792 dtrace_dof_error(dof, "unrecognized DIFO subsection");
13793 goto err;
13794 }
13795 }
13796
13797 if (dp->dtdo_buf == NULL) {
13798 /*
13799 * We can't have a DIF object without DIF text.
13800 */
13801 dtrace_dof_error(dof, "missing DIF text");
13802 goto err;
13803 }
13804
13805 /*
13806 * Before we validate the DIF object, run through the variable table
13807 * looking for string variables -- if any of their sizes are zero, we
13808 * set them to the system-wide default string size.  Note that this
13809 * should _not_ happen if the "strsize" option has been set -- in that
13810 * case, the compiler should have set the size to reflect the setting
13811 * of the option.
13812 */
13813 for (i = 0; i < dp->dtdo_varlen; i++) {
13814 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13815 dtrace_diftype_t *t = &v->dtdv_type;
13816
13817 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13818 continue;
13819
13820 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13821 t->dtdt_size = dtrace_strsize_default;
13822 }
13823
13824 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13825 goto err;
13826
13827 dtrace_difo_init(dp, vstate);
13828 return (dp);
13829
13830 err:
13831 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13832 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13833 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13834 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13835
13836 kmem_free(dp, sizeof (dtrace_difo_t));
13837 return (NULL);
13838 }
13839
13840 static dtrace_predicate_t *
13841 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13842 cred_t *cr)
13843 {
13844 dtrace_difo_t *dp;
13845
13846 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13847 return (NULL);
13848
13849 return (dtrace_predicate_create(dp));
13850 }
13851
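/*
 * Translate the given DOF_SECT_ACTDESC section into a chain of action
 * descriptions.  For printf()-like actions (printa() may legitimately omit
 * its format) and for print() -- a DIFEXPR action carrying a string table
 * reference -- the argument is resolved through the DOF string table and
 * copied into kernel memory.  An action may additionally reference a DIFO
 * section holding its expression.  On error, any partially constructed
 * chain is released.
 */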
13852 static dtrace_actdesc_t *
13853 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13854 cred_t *cr)
13855 {
13856 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13857 dof_actdesc_t *desc;
13858 dof_sec_t *difosec;
13859 size_t offs;
13860 uintptr_t daddr = (uintptr_t)dof;
13861 uint64_t arg;
13862 dtrace_actkind_t kind;
13863
13864 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13865 dtrace_dof_error(dof, "invalid action section");
13866 return (NULL);
13867 }
13868
13869 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13870 dtrace_dof_error(dof, "truncated action description");
13871 return (NULL);
13872 }
13873
13874 if (sec->dofs_align != sizeof (uint64_t)) {
13875 dtrace_dof_error(dof, "bad alignment in action description");
13876 return (NULL);
13877 }
13878
13879 if (sec->dofs_size < sec->dofs_entsize) {
13880 dtrace_dof_error(dof, "section entry size exceeds total size");
13881 return (NULL);
13882 }
13883
13884 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13885 dtrace_dof_error(dof, "bad entry size in action description");
13886 return (NULL);
13887 }
13888
13889 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13890 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13891 return (NULL);
13892 }
13893
13894 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13895 desc = (dof_actdesc_t *)(daddr +
13896 (uintptr_t)sec->dofs_offset + offs);
13897 kind = (dtrace_actkind_t)desc->dofa_kind;
13898
13899 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13900 (kind != DTRACEACT_PRINTA ||
13901 desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13902 (kind == DTRACEACT_DIFEXPR &&
13903 desc->dofa_strtab != DOF_SECIDX_NONE)) {
13904 dof_sec_t *strtab;
13905 char *str, *fmt;
13906 uint64_t i;
13907
13908 /*
13909 * The argument to these actions is an index into the
13910 * DOF string table. For printf()-like actions, this
13911 * is the format string. For print(), this is the
13912 * CTF type of the expression result.
13913 */
13914 if ((strtab = dtrace_dof_sect(dof,
13915 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13916 goto err;
13917
13918 str = (char *)((uintptr_t)dof +
13919 (uintptr_t)strtab->dofs_offset);
13920
13921 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13922 if (str[i] == '\0')
13923 break;
13924 }
13925
13926 if (i >= strtab->dofs_size) {
13927 dtrace_dof_error(dof, "bogus format string");
13928 goto err;
13929 }
13930
13931 if (i == desc->dofa_arg) {
13932 dtrace_dof_error(dof, "empty format string");
13933 goto err;
13934 }
13935
13936 i -= desc->dofa_arg;
13937 fmt = kmem_alloc(i + 1, KM_SLEEP);
13938 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13939 arg = (uint64_t)(uintptr_t)fmt;
13940 } else {
13941 if (kind == DTRACEACT_PRINTA) {
13942 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13943 arg = 0;
13944 } else {
13945 arg = desc->dofa_arg;
13946 }
13947 }
13948
13949 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13950 desc->dofa_uarg, arg);
13951
13952 if (last != NULL) {
13953 last->dtad_next = act;
13954 } else {
13955 first = act;
13956 }
13957
13958 last = act;
13959
13960 if (desc->dofa_difo == DOF_SECIDX_NONE)
13961 continue;
13962
13963 if ((difosec = dtrace_dof_sect(dof,
13964 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13965 goto err;
13966
13967 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13968
13969 if (act->dtad_difo == NULL)
13970 goto err;
13971 }
13972
13973 ASSERT(first != NULL);
13974 return (first);
13975
13976 err:
13977 for (act = first; act != NULL; act = next) {
13978 next = act->dtad_next;
13979 dtrace_actdesc_release(act, vstate);
13980 }
13981
13982 return (NULL);
13983 }
13984
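/*
 * Translate the given DOF_SECT_ECBDESC section into an ECB description:
 * resolve the probe description that it names, then (optionally) its
 * predicate and its chain of action descriptions.
 */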
13985 static dtrace_ecbdesc_t *
13986 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13987 cred_t *cr)
13988 {
13989 dtrace_ecbdesc_t *ep;
13990 dof_ecbdesc_t *ecb;
13991 dtrace_probedesc_t *desc;
13992 dtrace_predicate_t *pred = NULL;
13993
13994 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13995 dtrace_dof_error(dof, "truncated ECB description");
13996 return (NULL);
13997 }
13998
13999 if (sec->dofs_align != sizeof (uint64_t)) {
14000 dtrace_dof_error(dof, "bad alignment in ECB description");
14001 return (NULL);
14002 }
14003
14004 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
14005 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
14006
14007 if (sec == NULL)
14008 return (NULL);
14009
14010 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
14011 ep->dted_uarg = ecb->dofe_uarg;
14012 desc = &ep->dted_probe;
14013
14014 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
14015 goto err;
14016
14017 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
14018 if ((sec = dtrace_dof_sect(dof,
14019 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
14020 goto err;
14021
14022 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
14023 goto err;
14024
14025 ep->dted_pred.dtpdd_predicate = pred;
14026 }
14027
14028 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
14029 if ((sec = dtrace_dof_sect(dof,
14030 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
14031 goto err;
14032
14033 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
14034
14035 if (ep->dted_action == NULL)
14036 goto err;
14037 }
14038
14039 return (ep);
14040
14041 err:
14042 if (pred != NULL)
14043 dtrace_predicate_release(pred, vstate);
14044 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
14045 return (NULL);
14046 }
14047
14048 /*
14049 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
14050 * specified DOF. SETX relocations are computed using 'ubase', the base load
14051 * address of the object containing the DOF; DOFREL relocations are computed
14052 * from 'udaddr', the user address of the DOF, plus the target offset.
14053 */
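/*
 * For example (with purely illustrative values): given ubase 0x800000000 and
 * udaddr 0x800001000, a SETX relocation at offset 0x10 of the target section
 * adds 0x800000000 to the 64-bit value stored there, while a DOFREL
 * relocation at the same spot adds 0x800001000 + dofs_offset + 0x10 -- that
 * is, the user address of the relocated word itself.
 */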
14054 static int
14055 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase,
14056 uint64_t udaddr)
14057 {
14058 uintptr_t daddr = (uintptr_t)dof;
14059 uintptr_t ts_end;
14060 dof_relohdr_t *dofr =
14061 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
14062 dof_sec_t *ss, *rs, *ts;
14063 dof_relodesc_t *r;
14064 uint_t i, n;
14065
14066 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
14067 sec->dofs_align != sizeof (dof_secidx_t)) {
14068 dtrace_dof_error(dof, "invalid relocation header");
14069 return (-1);
14070 }
14071
14072 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
14073 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
14074 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
14075 ts_end = (uintptr_t)ts + sizeof (dof_sec_t);
14076
14077 if (ss == NULL || rs == NULL || ts == NULL)
14078 return (-1); /* dtrace_dof_error() has been called already */
14079
14080 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
14081 rs->dofs_align != sizeof (uint64_t)) {
14082 dtrace_dof_error(dof, "invalid relocation section");
14083 return (-1);
14084 }
14085
14086 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
14087 n = rs->dofs_size / rs->dofs_entsize;
14088
14089 for (i = 0; i < n; i++) {
14090 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
14091
14092 switch (r->dofr_type) {
14093 case DOF_RELO_NONE:
14094 break;
14095 case DOF_RELO_SETX:
14096 case DOF_RELO_DOFREL:
14097 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
14098 sizeof (uint64_t) > ts->dofs_size) {
14099 dtrace_dof_error(dof, "bad relocation offset");
14100 return (-1);
14101 }
14102
14103 if (taddr >= (uintptr_t)ts && taddr < ts_end) {
14104 dtrace_dof_error(dof, "bad relocation offset");
14105 return (-1);
14106 }
14107
14108 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
14109 dtrace_dof_error(dof, "misaligned setx relo");
14110 return (-1);
14111 }
14112
14113 if (r->dofr_type == DOF_RELO_SETX)
14114 *(uint64_t *)taddr += ubase;
14115 else
14116 *(uint64_t *)taddr +=
14117 udaddr + ts->dofs_offset + r->dofr_offset;
14118 break;
14119 default:
14120 dtrace_dof_error(dof, "invalid relocation type");
14121 return (-1);
14122 }
14123
14124 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
14125 }
14126
14127 return (0);
14128 }
14129
14130 /*
14131 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
14132 * header: it should be at the front of a memory region that is at least
14133 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
14134 * size. It need not be validated in any other way.
14135 */
14136 static int
14137 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
14138 dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes)
14139 {
14140 uint64_t len = dof->dofh_loadsz, seclen;
14141 uintptr_t daddr = (uintptr_t)dof;
14142 dtrace_ecbdesc_t *ep;
14143 dtrace_enabling_t *enab;
14144 uint_t i;
14145
14146 ASSERT(MUTEX_HELD(&dtrace_lock));
14147 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
14148
14149 /*
14150 * Check the DOF header identification bytes. In addition to checking
14151 * valid settings, we also verify that unused bits/bytes are zeroed so
14152 * we can use them later without fear of regressing existing binaries.
14153 */
14154 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
14155 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
14156 dtrace_dof_error(dof, "DOF magic string mismatch");
14157 return (-1);
14158 }
14159
14160 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
14161 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
14162 dtrace_dof_error(dof, "DOF has invalid data model");
14163 return (-1);
14164 }
14165
14166 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
14167 dtrace_dof_error(dof, "DOF encoding mismatch");
14168 return (-1);
14169 }
14170
14171 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14172 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
14173 dtrace_dof_error(dof, "DOF version mismatch");
14174 return (-1);
14175 }
14176
14177 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
14178 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
14179 return (-1);
14180 }
14181
14182 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
14183 dtrace_dof_error(dof, "DOF uses too many integer registers");
14184 return (-1);
14185 }
14186
14187 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
14188 dtrace_dof_error(dof, "DOF uses too many tuple registers");
14189 return (-1);
14190 }
14191
14192 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
14193 if (dof->dofh_ident[i] != 0) {
14194 dtrace_dof_error(dof, "DOF has invalid ident byte set");
14195 return (-1);
14196 }
14197 }
14198
14199 if (dof->dofh_flags & ~DOF_FL_VALID) {
14200 dtrace_dof_error(dof, "DOF has invalid flag bits set");
14201 return (-1);
14202 }
14203
14204 if (dof->dofh_secsize == 0) {
14205 dtrace_dof_error(dof, "zero section header size");
14206 return (-1);
14207 }
14208
14209 /*
14210 * Check that the section headers don't exceed the amount of DOF
14211 * data. Note that we cast the section size and number of sections
14212 * to uint64_t's to prevent possible overflow in the multiplication.
14213 */
14214 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
14215
14216 if (dof->dofh_secoff > len || seclen > len ||
14217 dof->dofh_secoff + seclen > len) {
14218 dtrace_dof_error(dof, "truncated section headers");
14219 return (-1);
14220 }
14221
14222 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
14223 dtrace_dof_error(dof, "misaligned section headers");
14224 return (-1);
14225 }
14226
14227 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
14228 dtrace_dof_error(dof, "misaligned section size");
14229 return (-1);
14230 }
14231
14232 /*
14233 * Take an initial pass through the section headers to be sure that
14234 * the headers don't have stray offsets. If the 'noprobes' flag is
14235 * set, do not permit provider, probe, argument or offset sections.
14236 */
14237 for (i = 0; i < dof->dofh_secnum; i++) {
14238 dof_sec_t *sec = (dof_sec_t *)(daddr +
14239 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14240
14241 if (noprobes) {
14242 switch (sec->dofs_type) {
14243 case DOF_SECT_PROVIDER:
14244 case DOF_SECT_PROBES:
14245 case DOF_SECT_PRARGS:
14246 case DOF_SECT_PROFFS:
14247 dtrace_dof_error(dof, "illegal sections "
14248 "for enabling");
14249 return (-1);
14250 }
14251 }
14252
14253 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
14254 !(sec->dofs_flags & DOF_SECF_LOAD)) {
14255 dtrace_dof_error(dof, "loadable section with load "
14256 "flag unset");
14257 return (-1);
14258 }
14259
14260 if (!(sec->dofs_flags & DOF_SECF_LOAD))
14261 continue; /* just ignore non-loadable sections */
14262
14263 if (!ISP2(sec->dofs_align)) {
14264 dtrace_dof_error(dof, "bad section alignment");
14265 return (-1);
14266 }
14267
14268 if (sec->dofs_offset & (sec->dofs_align - 1)) {
14269 dtrace_dof_error(dof, "misaligned section");
14270 return (-1);
14271 }
14272
14273 if (sec->dofs_offset > len || sec->dofs_size > len ||
14274 sec->dofs_offset + sec->dofs_size > len) {
14275 dtrace_dof_error(dof, "corrupt section header");
14276 return (-1);
14277 }
14278
14279 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
14280 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
14281 dtrace_dof_error(dof, "non-terminating string table");
14282 return (-1);
14283 }
14284 }
14285
14286 /*
14287 * Take a second pass through the sections and locate and perform any
14288 * relocations that are present. We do this after the first pass to
14289 * be sure that all sections have had their headers validated.
14290 */
14291 for (i = 0; i < dof->dofh_secnum; i++) {
14292 dof_sec_t *sec = (dof_sec_t *)(daddr +
14293 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14294
14295 if (!(sec->dofs_flags & DOF_SECF_LOAD))
14296 continue; /* skip sections that are not loadable */
14297
14298 switch (sec->dofs_type) {
14299 case DOF_SECT_URELHDR:
14300 if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0)
14301 return (-1);
14302 break;
14303 }
14304 }
14305
14306 if ((enab = *enabp) == NULL)
14307 enab = *enabp = dtrace_enabling_create(vstate);
14308
14309 for (i = 0; i < dof->dofh_secnum; i++) {
14310 dof_sec_t *sec = (dof_sec_t *)(daddr +
14311 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14312
14313 if (sec->dofs_type != DOF_SECT_ECBDESC)
14314 continue;
14315
14316 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
14317 dtrace_enabling_destroy(enab);
14318 *enabp = NULL;
14319 return (-1);
14320 }
14321
14322 dtrace_enabling_add(enab, ep);
14323 }
14324
14325 return (0);
14326 }
14327
14328 /*
14329 * Process DOF for any options. This routine assumes that the DOF has been
14330 * at least processed by dtrace_dof_slurp().
14331 */
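/*
 * For example (illustrative only): a consumer requesting a 4MB principal
 * buffer would present an option description equivalent to:
 *
 *	dof_optdesc_t opt = {
 *		.dofo_option = DTRACEOPT_BUFSIZE,
 *		.dofo_strtab = DOF_SECIDX_NONE,
 *		.dofo_value = 4 * 1024 * 1024
 *	};
 */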
14332 static int
14333 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
14334 {
14335 int i, rval;
14336 uint32_t entsize;
14337 size_t offs;
14338 dof_optdesc_t *desc;
14339
14340 for (i = 0; i < dof->dofh_secnum; i++) {
14341 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
14342 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14343
14344 if (sec->dofs_type != DOF_SECT_OPTDESC)
14345 continue;
14346
14347 if (sec->dofs_align != sizeof (uint64_t)) {
14348 dtrace_dof_error(dof, "bad alignment in "
14349 "option description");
14350 return (EINVAL);
14351 }
14352
14353 if ((entsize = sec->dofs_entsize) == 0) {
14354 dtrace_dof_error(dof, "zeroed option entry size");
14355 return (EINVAL);
14356 }
14357
14358 if (entsize < sizeof (dof_optdesc_t)) {
14359 dtrace_dof_error(dof, "bad option entry size");
14360 return (EINVAL);
14361 }
14362
14363 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
14364 desc = (dof_optdesc_t *)((uintptr_t)dof +
14365 (uintptr_t)sec->dofs_offset + offs);
14366
14367 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
14368 dtrace_dof_error(dof, "non-zero option string");
14369 return (EINVAL);
14370 }
14371
14372 if (desc->dofo_value == DTRACEOPT_UNSET) {
14373 dtrace_dof_error(dof, "unset option");
14374 return (EINVAL);
14375 }
14376
14377 if ((rval = dtrace_state_option(state,
14378 desc->dofo_option, desc->dofo_value)) != 0) {
14379 dtrace_dof_error(dof, "rejected option");
14380 return (rval);
14381 }
14382 }
14383 }
14384
14385 return (0);
14386 }
14387
14388 /*
14389 * DTrace Consumer State Functions
14390 */
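/*
 * Initialize the dynamic variable state:  the dynamic variable space is a
 * single allocation with an array of hash buckets at its front, and the
 * remainder carved into chunksize-sized dynamic variables that are strung
 * into per-CPU free lists.
 */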
14391 static int
14392 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
14393 {
14394 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
14395 void *base;
14396 uintptr_t limit;
14397 dtrace_dynvar_t *dvar, *next, *start;
14398 int i;
14399
14400 ASSERT(MUTEX_HELD(&dtrace_lock));
14401 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
14402
14403 bzero(dstate, sizeof (dtrace_dstate_t));
14404
14405 if ((dstate->dtds_chunksize = chunksize) == 0)
14406 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
14407
14408 VERIFY(dstate->dtds_chunksize < LONG_MAX);
14409
14410 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
14411 size = min;
14412
14413 if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
14414 return (ENOMEM);
14415
14416 dstate->dtds_size = size;
14417 dstate->dtds_base = base;
14418 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
14419 bzero(dstate->dtds_percpu,
14420 (mp_maxid + 1) * sizeof (dtrace_dstate_percpu_t));
14421
14422 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
14423
14424 if (hashsize != 1 && (hashsize & 1))
14425 hashsize--;
14426
14427 dstate->dtds_hashsize = hashsize;
14428 dstate->dtds_hash = dstate->dtds_base;
14429
14430 /*
14431 * Set all of our hash buckets to point to the single sink, and (if
14432 * it hasn't already been set), set the sink's hash value to be the
14433 * sink sentinel value. The sink is needed for dynamic variable
14434 * lookups to know that they have iterated over an entire, valid hash
14435 * chain.
14436 */
14437 for (i = 0; i < hashsize; i++)
14438 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
14439
14440 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
14441 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
14442
14443 /*
14444 * Determine number of active CPUs. Divide free list evenly among
14445 * active CPUs.
14446 */
14447 start = (dtrace_dynvar_t *)
14448 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
14449 limit = (uintptr_t)base + size;
14450
14451 VERIFY((uintptr_t)start < limit);
14452 VERIFY((uintptr_t)start >= (uintptr_t)base);
14453
14454 maxper = (limit - (uintptr_t)start) / (mp_maxid + 1);
14455 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
14456
14457 CPU_FOREACH(i) {
14458 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
14459
14460 /*
14461 * If we don't even have enough chunks to make it once through
14462 * NCPUs, we're just going to allocate everything to the first
14463 * CPU. And if we're on the last CPU, we're going to allocate
14464 * whatever is left over. In either case, we set the limit to
14465 * be the limit of the dynamic variable space.
14466 */
14467 if (maxper == 0 || i == mp_maxid) {
14468 limit = (uintptr_t)base + size;
14469 start = NULL;
14470 } else {
14471 limit = (uintptr_t)start + maxper;
14472 start = (dtrace_dynvar_t *)limit;
14473 }
14474
14475 VERIFY(limit <= (uintptr_t)base + size);
14476
14477 for (;;) {
14478 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14479 dstate->dtds_chunksize);
14480
14481 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14482 break;
14483
14484 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
14485 (uintptr_t)dvar <= (uintptr_t)base + size);
14486 dvar->dtdv_next = next;
14487 dvar = next;
14488 }
14489
14490 if (maxper == 0)
14491 break;
14492 }
14493
14494 return (0);
14495 }
14496
14497 static void
14498 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14499 {
14500 ASSERT(MUTEX_HELD(&cpu_lock));
14501
14502 if (dstate->dtds_base == NULL)
14503 return;
14504
14505 kmem_free(dstate->dtds_base, dstate->dtds_size);
14506 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14507 }
14508
14509 static void
14510 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14511 {
14512 /*
14513 * Logical XOR, where are you?
14514 */
14515 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14516
14517 if (vstate->dtvs_nglobals > 0) {
14518 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14519 sizeof (dtrace_statvar_t *));
14520 }
14521
14522 if (vstate->dtvs_ntlocals > 0) {
14523 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14524 sizeof (dtrace_difv_t));
14525 }
14526
14527 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14528
14529 if (vstate->dtvs_nlocals > 0) {
14530 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14531 sizeof (dtrace_statvar_t *));
14532 }
14533 }
14534
14535 #ifdef illumos
14536 static void
14537 dtrace_state_clean(dtrace_state_t *state)
14538 {
14539 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14540 return;
14541
14542 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14543 dtrace_speculation_clean(state);
14544 }
14545
14546 static void
14547 dtrace_state_deadman(dtrace_state_t *state)
14548 {
14549 hrtime_t now;
14550
14551 dtrace_sync();
14552
14553 now = dtrace_gethrtime();
14554
14555 if (state != dtrace_anon.dta_state &&
14556 now - state->dts_laststatus >= dtrace_deadman_user)
14557 return;
14558
14559 /*
14560 * We must be sure that dts_alive never appears to be less than the
14561 * value upon entry to dtrace_state_deadman(), and because we lack a
14562 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14563 * store INT64_MAX to it, followed by a memory barrier, followed by
14564 * the new value. This assures that dts_alive never appears to be
14565 * less than its true value, regardless of the order in which the
14566 * stores to the underlying storage are issued.
14567 */
14568 state->dts_alive = INT64_MAX;
14569 dtrace_membar_producer();
14570 state->dts_alive = now;
14571 }
14572 #else /* !illumos */
14573 static void
14574 dtrace_state_clean(void *arg)
14575 {
14576 dtrace_state_t *state = arg;
14577 dtrace_optval_t *opt = state->dts_options;
14578
14579 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14580 return;
14581
14582 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14583 dtrace_speculation_clean(state);
14584
14585 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14586 dtrace_state_clean, state);
14587 }
14588
14589 static void
14590 dtrace_state_deadman(void *arg)
14591 {
14592 dtrace_state_t *state = arg;
14593 hrtime_t now;
14594
14595 dtrace_sync();
14596
14597 dtrace_debug_output();
14598
14599 now = dtrace_gethrtime();
14600
14601 if (state != dtrace_anon.dta_state &&
14602 now - state->dts_laststatus >= dtrace_deadman_user)
14603 return;
14604
14605 /*
14606 * We must be sure that dts_alive never appears to be less than the
14607 * value upon entry to dtrace_state_deadman(), and because we lack a
14608 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14609 * store INT64_MAX to it, followed by a memory barrier, followed by
14610 * the new value. This assures that dts_alive never appears to be
14611 * less than its true value, regardless of the order in which the
14612 * stores to the underlying storage are issued.
14613 */
14614 state->dts_alive = INT64_MAX;
14615 dtrace_membar_producer();
14616 state->dts_alive = now;
14617
14618 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14619 dtrace_state_deadman, state);
14620 }
14621 #endif /* illumos */
14622
14623 static dtrace_state_t *
14624 #ifdef illumos
14625 dtrace_state_create(dev_t *devp, cred_t *cr)
14626 #else
14627 dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
14628 #endif
14629 {
14630 #ifdef illumos
14631 minor_t minor;
14632 major_t major;
14633 #else
14634 cred_t *cr = NULL;
14635 int m = 0;
14636 #endif
14637 char c[30];
14638 dtrace_state_t *state;
14639 dtrace_optval_t *opt;
14640 int bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t), i;
14641 int cpu_it;
14642
14643 ASSERT(MUTEX_HELD(&dtrace_lock));
14644 ASSERT(MUTEX_HELD(&cpu_lock));
14645
14646 #ifdef illumos
14647 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14648 VM_BESTFIT | VM_SLEEP);
14649
14650 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14651 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14652 return (NULL);
14653 }
14654
14655 state = ddi_get_soft_state(dtrace_softstate, minor);
14656 #else
14657 if (dev != NULL) {
14658 cr = dev->si_cred;
14659 m = dev2unit(dev);
14660 }
14661
14662 /* Allocate memory for the state. */
14663 state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14664 #endif
14665
14666 state->dts_epid = DTRACE_EPIDNONE + 1;
14667
14668 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14669 #ifdef illumos
14670 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14671 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14672
14673 if (devp != NULL) {
14674 major = getemajor(*devp);
14675 } else {
14676 major = ddi_driver_major(dtrace_devi);
14677 }
14678
14679 state->dts_dev = makedevice(major, minor);
14680
14681 if (devp != NULL)
14682 *devp = state->dts_dev;
14683 #else
14684 state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14685 state->dts_dev = dev;
14686 #endif
14687
14688 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14689 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14690
14691 /*
14692 * Allocate and initialise the per-state per-CPU random state.
14693 * Since SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, the entropy device is
14694 * assumed to be seeded at this point (if from a Fortuna seed file).
14695 */
14696 arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t));
14697 for (cpu_it = 1; cpu_it <= mp_maxid; cpu_it++) {
14698 /*
14699 * Each CPU is assigned a 2^64 period, non-overlapping
14700 * subsequence.
14701 */
14702 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it - 1],
14703 state->dts_rstate[cpu_it]);
14704 }
14705
14706 #ifdef illumos
14707 state->dts_cleaner = CYCLIC_NONE;
14708 state->dts_deadman = CYCLIC_NONE;
14709 #else
14710 callout_init(&state->dts_cleaner, 1);
14711 callout_init(&state->dts_deadman, 1);
14712 #endif
14713 state->dts_vstate.dtvs_state = state;
14714
14715 for (i = 0; i < DTRACEOPT_MAX; i++)
14716 state->dts_options[i] = DTRACEOPT_UNSET;
14717
14718 /*
14719 * Set the default options.
14720 */
14721 opt = state->dts_options;
14722 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14723 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14724 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14725 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14726 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14727 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14728 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14729 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14730 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14731 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14732 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14733 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14734 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14735 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14736
14737 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14738
14739 /*
14740 * Depending on the user credentials, we set flag bits which alter probe
14741 * visibility or the amount of destructiveness allowed. In the case of
14742 * actual anonymous tracing, or the possession of all privileges, all of
14743 * the normal checks are bypassed.
14744 */
14745 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14746 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14747 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14748 } else {
14749 /*
14750 * Set up the credentials for this instantiation. We take a
14751 * hold on the credential to prevent it from disappearing on
14752 * us; this in turn prevents the zone_t referenced by this
14753 * credential from disappearing. This means that we can
14754 * examine the credential and the zone from probe context.
14755 */
14756 crhold(cr);
14757 state->dts_cred.dcr_cred = cr;
14758
14759 /*
14760 * CRA_PROC means "we have *some* privilege for dtrace" and
14761 * unlocks the use of variables like pid, zonename, etc.
14762 */
14763 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14764 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14765 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14766 }
14767
14768 /*
14769 * dtrace_user allows use of syscall and profile providers.
14770 * If the user also has proc_owner and/or proc_zone, we
14771 * extend the scope to include additional visibility and
14772 * destructive power.
14773 */
14774 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14775 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14776 state->dts_cred.dcr_visible |=
14777 DTRACE_CRV_ALLPROC;
14778
14779 state->dts_cred.dcr_action |=
14780 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14781 }
14782
14783 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14784 state->dts_cred.dcr_visible |=
14785 DTRACE_CRV_ALLZONE;
14786
14787 state->dts_cred.dcr_action |=
14788 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14789 }
14790
14791 /*
14792 * If we have all privs in whatever zone this is,
14793 * we can do destructive things to processes which
14794 * have altered credentials.
14795 */
14796 #ifdef illumos
14797 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14798 cr->cr_zone->zone_privset)) {
14799 state->dts_cred.dcr_action |=
14800 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14801 }
14802 #endif
14803 }
14804
14805 /*
14806 * Holding the dtrace_kernel privilege also implies that
14807 * the user has the dtrace_user privilege from a visibility
14808 * perspective. But without further privileges, some
14809 * destructive actions are not available.
14810 */
14811 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14812 /*
14813 * Make all probes in all zones visible. However,
14814 * this doesn't mean that all actions become available
14815 * to all zones.
14816 */
14817 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14818 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14819
14820 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14821 DTRACE_CRA_PROC;
14822 /*
14823 * Holding proc_owner means that destructive actions
14824 * for *this* zone are allowed.
14825 */
14826 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14827 state->dts_cred.dcr_action |=
14828 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14829
14830 /*
14831 * Holding proc_zone means that destructive actions
14832 * for this user/group ID are allowed in all zones.
14833 */
14834 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14835 state->dts_cred.dcr_action |=
14836 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14837
14838 #ifdef illumos
14839 /*
14840 * If we have all privs in whatever zone this is,
14841 * we can do destructive things to processes which
14842 * have altered credentials.
14843 */
14844 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14845 cr->cr_zone->zone_privset)) {
14846 state->dts_cred.dcr_action |=
14847 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14848 }
14849 #endif
14850 }
14851
14852 /*
14853 * Holding the dtrace_proc privilege gives control over fasttrap
14854 * and pid providers. We need to grant wider destructive
14855 * privileges in the event that the user has proc_owner and/or
14856 * proc_zone.
14857 */
14858 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14859 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14860 state->dts_cred.dcr_action |=
14861 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14862
14863 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14864 state->dts_cred.dcr_action |=
14865 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14866 }
14867 }
14868
14869 return (state);
14870 }
14871
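/*
 * Allocate the per-CPU buffers corresponding to the option 'which'.  If an
 * allocation fails and the buffer resize policy is "auto", the request is
 * retried at successively smaller sizes, shrinking by the smallest power of
 * two (at least two) that is no less than the 'factor' hinted by
 * dtrace_buffer_alloc().  E2BIG (reserve not met) is deliberately distinct
 * from ENOMEM so that user level can tell the two failures apart.
 */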
14872 static int
14873 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14874 {
14875 dtrace_optval_t *opt = state->dts_options, size;
14876 processorid_t cpu = 0;
14877 int flags = 0, rval, factor, divisor = 1;
14878
14879 ASSERT(MUTEX_HELD(&dtrace_lock));
14880 ASSERT(MUTEX_HELD(&cpu_lock));
14881 ASSERT(which < DTRACEOPT_MAX);
14882 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14883 (state == dtrace_anon.dta_state &&
14884 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14885
14886 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14887 return (0);
14888
14889 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14890 cpu = opt[DTRACEOPT_CPU];
14891
14892 if (which == DTRACEOPT_SPECSIZE)
14893 flags |= DTRACEBUF_NOSWITCH;
14894
14895 if (which == DTRACEOPT_BUFSIZE) {
14896 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14897 flags |= DTRACEBUF_RING;
14898
14899 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14900 flags |= DTRACEBUF_FILL;
14901
14902 if (state != dtrace_anon.dta_state ||
14903 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14904 flags |= DTRACEBUF_INACTIVE;
14905 }
14906
14907 for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14908 /*
14909 * The size must be 8-byte aligned. If the size is not 8-byte
14910 * aligned, drop it down by the difference.
14911 */
14912 if (size & (sizeof (uint64_t) - 1))
14913 size -= size & (sizeof (uint64_t) - 1);
14914
14915 if (size < state->dts_reserve) {
14916 /*
14917 * Buffers must always be large enough to accommodate
14918 * their prereserved space.  We return E2BIG instead
14919 * of ENOMEM in this case to allow user-level
14920 * software to differentiate the cases.
14921 */
14922 return (E2BIG);
14923 }
14924
14925 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14926
14927 if (rval != ENOMEM) {
14928 opt[which] = size;
14929 return (rval);
14930 }
14931
14932 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14933 return (rval);
14934
14935 for (divisor = 2; divisor < factor; divisor <<= 1)
14936 continue;
14937 }
14938
14939 return (ENOMEM);
14940 }
14941
14942 static int
14943 dtrace_state_buffers(dtrace_state_t *state)
14944 {
14945 dtrace_speculation_t *spec = state->dts_speculations;
14946 int rval, i;
14947
14948 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14949 DTRACEOPT_BUFSIZE)) != 0)
14950 return (rval);
14951
14952 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14953 DTRACEOPT_AGGSIZE)) != 0)
14954 return (rval);
14955
14956 for (i = 0; i < state->dts_nspeculations; i++) {
14957 if ((rval = dtrace_state_buffer(state,
14958 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14959 return (rval);
14960 }
14961
14962 return (0);
14963 }
14964
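/*
 * Compute the buffer space that must be held in reserve for END probe
 * records when the "fill" buffer policy is in effect; with any other
 * policy the reserve is zero.
 */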
14965 static void
14966 dtrace_state_prereserve(dtrace_state_t *state)
14967 {
14968 dtrace_ecb_t *ecb;
14969 dtrace_probe_t *probe;
14970
14971 state->dts_reserve = 0;
14972
14973 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14974 return;
14975
14976 /*
14977 * If our buffer policy is a "fill" buffer policy, we need to set the
14978 * prereserved space to be the space required by the END probes.
14979 */
14980 probe = dtrace_probes[dtrace_probeid_end - 1];
14981 ASSERT(probe != NULL);
14982
14983 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14984 if (ecb->dte_state != state)
14985 continue;
14986
14987 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14988 }
14989 }
14990
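/*
 * Transition the specified state from INACTIVE to ACTIVE:  prime any
 * retained enablings, allocate the speculation, principal, aggregation and
 * dynamic variable buffers, start the cleaner and deadman timers, fire the
 * BEGIN probe with interrupts disabled (recording the CPU so that user
 * level processes its buffer first), and finally transition every CPU's
 * principal buffer out of the INACTIVE state.
 */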
14991 static int
14992 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14993 {
14994 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14995 dtrace_speculation_t *spec;
14996 dtrace_buffer_t *buf;
14997 #ifdef illumos
14998 cyc_handler_t hdlr;
14999 cyc_time_t when;
15000 #endif
15001 int rval = 0, i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
15002 dtrace_icookie_t cookie;
15003
15004 mutex_enter(&cpu_lock);
15005 mutex_enter(&dtrace_lock);
15006
15007 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15008 rval = EBUSY;
15009 goto out;
15010 }
15011
15012 /*
15013 * Before we can perform any checks, we must prime all of the
15014 * retained enablings that correspond to this state.
15015 */
15016 dtrace_enabling_prime(state);
15017
15018 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
15019 rval = EACCES;
15020 goto out;
15021 }
15022
15023 dtrace_state_prereserve(state);
15024
15025 /*
15026 * Now we try to allocate our speculations.  We do not automatically
15027 * resize the number of speculations; if this allocation fails, we
15028 * fail the operation.
15029 */
15030 nspec = opt[DTRACEOPT_NSPEC];
15031 ASSERT(nspec != DTRACEOPT_UNSET);
15032
15033 if (nspec > INT_MAX) {
15034 rval = ENOMEM;
15035 goto out;
15036 }
15037
15038 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
15039 KM_NOSLEEP | KM_NORMALPRI);
15040
15041 if (spec == NULL) {
15042 rval = ENOMEM;
15043 goto out;
15044 }
15045
15046 state->dts_speculations = spec;
15047 state->dts_nspeculations = (int)nspec;
15048
15049 for (i = 0; i < nspec; i++) {
15050 if ((buf = kmem_zalloc(bufsize,
15051 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
15052 rval = ENOMEM;
15053 goto err;
15054 }
15055
15056 spec[i].dtsp_buffer = buf;
15057 }
15058
15059 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
15060 if (dtrace_anon.dta_state == NULL) {
15061 rval = ENOENT;
15062 goto out;
15063 }
15064
15065 if (state->dts_necbs != 0) {
15066 rval = EALREADY;
15067 goto out;
15068 }
15069
15070 state->dts_anon = dtrace_anon_grab();
15071 ASSERT(state->dts_anon != NULL);
15072 state = state->dts_anon;
15073
15074 /*
15075 * We want "grabanon" to be set in the grabbed state, so we'll
15076 * copy that option value from the grabbing state into the
15077 * grabbed state.
15078 */
15079 state->dts_options[DTRACEOPT_GRABANON] =
15080 opt[DTRACEOPT_GRABANON];
15081
15082 *cpu = dtrace_anon.dta_beganon;
15083
15084 /*
15085 * If the anonymous state is active (as it almost certainly
15086 * is if the anonymous enabling ultimately matched anything),
15087 * we don't allow any further option processing -- but we
15088 * don't return failure.
15089 */
15090 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
15091 goto out;
15092 }
15093
15094 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
15095 opt[DTRACEOPT_AGGSIZE] != 0) {
15096 if (state->dts_aggregations == NULL) {
15097 /*
15098 * We're not going to create an aggregation buffer
15099 * because we don't have any ECBs that contain
15100 * aggregations -- set this option to 0.
15101 */
15102 opt[DTRACEOPT_AGGSIZE] = 0;
15103 } else {
15104 /*
15105 * If we have an aggregation buffer, we must also have
15106 * a buffer to use as scratch.
15107 */
15108 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
15109 opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
15110 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
15111 }
15112 }
15113 }
15114
15115 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
15116 opt[DTRACEOPT_SPECSIZE] != 0) {
15117 if (!state->dts_speculates) {
15118 /*
15119 * We're not going to create speculation buffers
15120 * because we don't have any ECBs that actually
15121 * speculate -- set the speculation size to 0.
15122 */
15123 opt[DTRACEOPT_SPECSIZE] = 0;
15124 }
15125 }
15126
15127 /*
15128 * The bare minimum size for any buffer that we're actually going to
15129 * do anything to is sizeof (uint64_t).
15130 */
15131 sz = sizeof (uint64_t);
15132
15133 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
15134 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
15135 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
15136 /*
15137 * A buffer size has been explicitly set to 0 (or to a size
15138 * that will be adjusted to 0) and we need the space -- we
15139 * need to return failure. We return ENOSPC to differentiate
15140 * it from failing to allocate a buffer due to failure to meet
15141 * the reserve (for which we return E2BIG).
15142 */
15143 rval = ENOSPC;
15144 goto out;
15145 }
15146
15147 if ((rval = dtrace_state_buffers(state)) != 0)
15148 goto err;
15149
15150 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
15151 sz = dtrace_dstate_defsize;
15152
15153 do {
15154 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
15155
15156 if (rval == 0)
15157 break;
15158
15159 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
15160 goto err;
15161 } while (sz >>= 1);
15162
15163 opt[DTRACEOPT_DYNVARSIZE] = sz;
15164
15165 if (rval != 0)
15166 goto err;
15167
15168 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
15169 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
15170
15171 if (opt[DTRACEOPT_CLEANRATE] == 0)
15172 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
15173
15174 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
15175 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
15176
15177 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
15178 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
15179
15180 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
15181 #ifdef illumos
15182 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
15183 hdlr.cyh_arg = state;
15184 hdlr.cyh_level = CY_LOW_LEVEL;
15185
15186 when.cyt_when = 0;
15187 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
15188
15189 state->dts_cleaner = cyclic_add(&hdlr, &when);
15190
15191 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
15192 hdlr.cyh_arg = state;
15193 hdlr.cyh_level = CY_LOW_LEVEL;
15194
15195 when.cyt_when = 0;
15196 when.cyt_interval = dtrace_deadman_interval;
15197
15198 state->dts_deadman = cyclic_add(&hdlr, &when);
15199 #else
15200 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
15201 dtrace_state_clean, state);
15202 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
15203 dtrace_state_deadman, state);
15204 #endif
15205
15206 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
15207
15208 #ifdef illumos
15209 if (state->dts_getf != 0 &&
15210 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
15211 /*
15212 * We don't have kernel privs but we have at least one call
15213 * to getf(); we need to bump our zone's count, and (if
15214 * this is the first enabling to have an unprivileged call
15215 * to getf()) we need to hook into closef().
15216 */
15217 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
15218
15219 if (dtrace_getf++ == 0) {
15220 ASSERT(dtrace_closef == NULL);
15221 dtrace_closef = dtrace_getf_barrier;
15222 }
15223 }
15224 #endif
15225
15226 /*
15227 * Now it's time to actually fire the BEGIN probe. We need to disable
15228 * interrupts here both to record the CPU on which we fired the BEGIN
15229 * probe (the data from this CPU will be processed first at user
15230 * level) and to manually activate the buffer for this CPU.
15231 */
15232 cookie = dtrace_interrupt_disable();
15233 *cpu = curcpu;
15234 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
15235 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
15236
15237 dtrace_probe(dtrace_probeid_begin,
15238 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
15239 dtrace_interrupt_enable(cookie);
15240 /*
15241 * We may have had an exit action from a BEGIN probe; only change our
15242 * state to ACTIVE if we're still in WARMUP.
15243 */
15244 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
15245 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
15246
15247 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
15248 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
15249
15250 #ifdef __FreeBSD__
15251 /*
15252 * We enable anonymous tracing before APs are started, so we must
15253 * activate buffers using the current CPU.
15254 */
15255 if (state == dtrace_anon.dta_state) {
15256 CPU_FOREACH(i)
15257 dtrace_buffer_activate_cpu(state, i);
15258 } else
15259 dtrace_xcall(DTRACE_CPUALL,
15260 (dtrace_xcall_t)dtrace_buffer_activate, state);
15261 #else
15262 /*
15263 * Regardless of whether we're now in ACTIVE or DRAINING, we
15264 * want each CPU to transition its principal buffer out of the
15265 * INACTIVE state. Doing this assures that no CPU will suddenly begin
15266 * processing an ECB halfway down a probe's ECB chain; all CPUs will
15267 * atomically transition from processing none of a state's ECBs to
15268 * processing all of them.
15269 */
15270 dtrace_xcall(DTRACE_CPUALL,
15271 (dtrace_xcall_t)dtrace_buffer_activate, state);
15272 #endif
15273 goto out;
15274
15275 err:
15276 dtrace_buffer_free(state->dts_buffer);
15277 dtrace_buffer_free(state->dts_aggbuffer);
15278
15279 if ((nspec = state->dts_nspeculations) == 0) {
15280 ASSERT(state->dts_speculations == NULL);
15281 goto out;
15282 }
15283
15284 spec = state->dts_speculations;
15285 ASSERT(spec != NULL);
15286
15287 for (i = 0; i < state->dts_nspeculations; i++) {
15288 if ((buf = spec[i].dtsp_buffer) == NULL)
15289 break;
15290
15291 dtrace_buffer_free(buf);
15292 kmem_free(buf, bufsize);
15293 }
15294
15295 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15296 state->dts_nspeculations = 0;
15297 state->dts_speculations = NULL;
15298
15299 out:
15300 mutex_exit(&dtrace_lock);
15301 mutex_exit(&cpu_lock);
15302
15303 return (rval);
15304 }
15305
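/*
 * Stop the specified state.  Stopping is a staged handshake:  the activity
 * is set to DRAINING and a dtrace_sync() issued so that no CPU can still be
 * processing under ACTIVE; the activity is then set to COOLDOWN, under
 * which only the END probe may fire, before the END probe is called with
 * interrupts disabled and the state is marked STOPPED.
 */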
15306 static int
15307 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
15308 {
15309 dtrace_icookie_t cookie;
15310
15311 ASSERT(MUTEX_HELD(&dtrace_lock));
15312
15313 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
15314 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
15315 return (EINVAL);
15316
15317 /*
15318 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
15319 * to be sure that every CPU has seen it. See below for the details
15320 * on why this is done.
15321 */
15322 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
15323 dtrace_sync();
15324
15325 /*
15326 * By this point, it is impossible for any CPU to be still processing
15327 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
15328 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
15329 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
15330 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
15331 * iff we're in the END probe.
15332 */
15333 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
15334 dtrace_sync();
15335 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
15336
15337 /*
15338 * Finally, we can release the reserve and call the END probe. We
15339 * disable interrupts across calling the END probe to allow us to
15340 * return the CPU on which we actually called the END probe. This
15341 * allows user-land to be sure that this CPU's principal buffer is
15342 * processed last.
15343 */
15344 state->dts_reserve = 0;
15345
15346 cookie = dtrace_interrupt_disable();
15347 *cpu = curcpu;
15348 dtrace_probe(dtrace_probeid_end,
15349 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
15350 dtrace_interrupt_enable(cookie);
15351
15352 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
15353 dtrace_sync();
15354
15355 #ifdef illumos
15356 if (state->dts_getf != 0 &&
15357 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
15358 /*
15359 * We don't have kernel privs but we have at least one call
15360 * to getf(); we need to lower our zone's count, and (if
15361 * this is the last enabling to have an unprivileged call
15362 * to getf()) we need to clear the closef() hook.
15363 */
15364 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
15365 ASSERT(dtrace_closef == dtrace_getf_barrier);
15366 ASSERT(dtrace_getf > 0);
15367
15368 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
15369
15370 if (--dtrace_getf == 0)
15371 dtrace_closef = NULL;
15372 }
15373 #endif
15374
15375 return (0);
15376 }
15377
15378 static int
15379 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
15380 dtrace_optval_t val)
15381 {
15382 ASSERT(MUTEX_HELD(&dtrace_lock));
15383
15384 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
15385 return (EBUSY);
15386
15387 if (option >= DTRACEOPT_MAX)
15388 return (EINVAL);
15389
15390 if (option != DTRACEOPT_CPU && val < 0)
15391 return (EINVAL);
15392
15393 switch (option) {
15394 case DTRACEOPT_DESTRUCTIVE:
15395 if (dtrace_destructive_disallow)
15396 return (EACCES);
15397
15398 state->dts_cred.dcr_destructive = 1;
15399 break;
15400
15401 case DTRACEOPT_BUFSIZE:
15402 case DTRACEOPT_DYNVARSIZE:
15403 case DTRACEOPT_AGGSIZE:
15404 case DTRACEOPT_SPECSIZE:
15405 case DTRACEOPT_STRSIZE:
15406 if (val < 0)
15407 return (EINVAL);
15408
15409 if (val >= LONG_MAX) {
15410 /*
15411 * If this is an otherwise negative value, set it to
15412 * the highest multiple of 128m less than LONG_MAX.
15413 * Technically, we're adjusting the size without
15414 * regard to the buffer resizing policy, but in fact,
15415 * this has no effect -- if we set the buffer size to
15416 * ~LONG_MAX and the buffer policy is ultimately set to
15417 * be "manual", the buffer allocation is guaranteed to
15418 * fail, if only because the allocation requires two
15419 * buffers.  (We set the size to the highest
15420 * multiple of 128m because it ensures that the size
15421 * will remain a multiple of a megabyte when
15422 * repeatedly halved -- all the way down to 15m.)
15423 */
15424 val = LONG_MAX - (1 << 27) + 1;
15425 }
15426 }
15427
15428 state->dts_options[option] = val;
15429
15430 return (0);
15431 }
15432
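/*
 * Destroy the specified state:  retract any retained enablings, sync all
 * CPUs out of probe context, disable and destroy the state's ECBs (taking
 * two passes so that DTRACE_PRIV_KERNEL probes are disabled first), and
 * then release the buffers, timers, dynamic variable state and remaining
 * allocations.
 */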
15433 static void
15434 dtrace_state_destroy(dtrace_state_t *state)
15435 {
15436 dtrace_ecb_t *ecb;
15437 dtrace_vstate_t *vstate = &state->dts_vstate;
15438 #ifdef illumos
15439 minor_t minor = getminor(state->dts_dev);
15440 #endif
15441 int i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
15442 dtrace_speculation_t *spec = state->dts_speculations;
15443 int nspec = state->dts_nspeculations;
15444 uint32_t match;
15445
15446 ASSERT(MUTEX_HELD(&dtrace_lock));
15447 ASSERT(MUTEX_HELD(&cpu_lock));
15448
15449 /*
15450 * First, retract any retained enablings for this state.
15451 */
15452 dtrace_enabling_retract(state);
15453 ASSERT(state->dts_nretained == 0);
15454
15455 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
15456 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
15457 /*
15458 * We have managed to come into dtrace_state_destroy() on a
15459 * hot enabling -- almost certainly because of a disorderly
15460 * shutdown of a consumer. (That is, a consumer that is
15461 * exiting without having called dtrace_stop().) In this case,
15462 * we're going to set our activity to be KILLED, and then
15463 * issue a sync to be sure that everyone is out of probe
15464 * context before we start blowing away ECBs.
15465 */
15466 state->dts_activity = DTRACE_ACTIVITY_KILLED;
15467 dtrace_sync();
15468 }
15469
15470 /*
15471 * Release the credential hold we took in dtrace_state_create().
15472 */
15473 if (state->dts_cred.dcr_cred != NULL)
15474 crfree(state->dts_cred.dcr_cred);
15475
15476 /*
15477 * Now we can safely disable and destroy any enabled probes. Because
15478 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
15479 * (especially if they're all enabled), we take two passes through the
15480 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
15481 * in the second we disable whatever is left over.
15482 */
15483 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
15484 for (i = 0; i < state->dts_necbs; i++) {
15485 if ((ecb = state->dts_ecbs[i]) == NULL)
15486 continue;
15487
15488 if (match && ecb->dte_probe != NULL) {
15489 dtrace_probe_t *probe = ecb->dte_probe;
15490 dtrace_provider_t *prov = probe->dtpr_provider;
15491
15492 if (!(prov->dtpv_priv.dtpp_flags & match))
15493 continue;
15494 }
15495
15496 dtrace_ecb_disable(ecb);
15497 dtrace_ecb_destroy(ecb);
15498 }
15499
15500 if (!match)
15501 break;
15502 }
15503
15504 /*
15505 * Before we free the buffers, perform one more sync to assure that
15506 * every CPU is out of probe context.
15507 */
15508 dtrace_sync();
15509
15510 dtrace_buffer_free(state->dts_buffer);
15511 dtrace_buffer_free(state->dts_aggbuffer);
15512
15513 for (i = 0; i < nspec; i++)
15514 dtrace_buffer_free(spec[i].dtsp_buffer);
15515
15516 #ifdef illumos
15517 if (state->dts_cleaner != CYCLIC_NONE)
15518 cyclic_remove(state->dts_cleaner);
15519
15520 if (state->dts_deadman != CYCLIC_NONE)
15521 cyclic_remove(state->dts_deadman);
15522 #else
15523 callout_stop(&state->dts_cleaner);
15524 callout_drain(&state->dts_cleaner);
15525 callout_stop(&state->dts_deadman);
15526 callout_drain(&state->dts_deadman);
15527 #endif
15528
15529 dtrace_dstate_fini(&vstate->dtvs_dynvars);
15530 dtrace_vstate_fini(vstate);
15531 if (state->dts_ecbs != NULL)
15532 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
15533
15534 if (state->dts_aggregations != NULL) {
15535 #ifdef DEBUG
15536 for (i = 0; i < state->dts_naggregations; i++)
15537 ASSERT(state->dts_aggregations[i] == NULL);
15538 #endif
15539 ASSERT(state->dts_naggregations > 0);
15540 kmem_free(state->dts_aggregations,
15541 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
15542 }
15543
15544 kmem_free(state->dts_buffer, bufsize);
15545 kmem_free(state->dts_aggbuffer, bufsize);
15546
15547 for (i = 0; i < nspec; i++)
15548 kmem_free(spec[i].dtsp_buffer, bufsize);
15549
15550 if (spec != NULL)
15551 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15552
15553 dtrace_format_destroy(state);
15554
15555 if (state->dts_aggid_arena != NULL) {
15556 #ifdef illumos
15557 vmem_destroy(state->dts_aggid_arena);
15558 #else
15559 delete_unrhdr(state->dts_aggid_arena);
15560 #endif
15561 state->dts_aggid_arena = NULL;
15562 }
15563 #ifdef illumos
15564 ddi_soft_state_free(dtrace_softstate, minor);
15565 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
15566 #endif
15567 }
15568
15569 /*
15570 * DTrace Anonymous Enabling Functions
15571 */
15572 static dtrace_state_t *
15573 dtrace_anon_grab(void)
15574 {
15575 dtrace_state_t *state;
15576
15577 ASSERT(MUTEX_HELD(&dtrace_lock));
15578
15579 if ((state = dtrace_anon.dta_state) == NULL) {
15580 ASSERT(dtrace_anon.dta_enabling == NULL);
15581 return (NULL);
15582 }
15583
15584 ASSERT(dtrace_anon.dta_enabling != NULL);
15585 ASSERT(dtrace_retained != NULL);
15586
15587 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15588 dtrace_anon.dta_enabling = NULL;
15589 dtrace_anon.dta_state = NULL;
15590
15591 return (state);
15592 }
15593
15594 static void
15595 dtrace_anon_property(void)
15596 {
15597 int i, rv;
15598 dtrace_state_t *state;
15599 dof_hdr_t *dof;
15600 char c[32]; /* enough for "dof-data-" + digits */
15601
15602 ASSERT(MUTEX_HELD(&dtrace_lock));
15603 ASSERT(MUTEX_HELD(&cpu_lock));
15604
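/*
 * For illustration: anonymous DOF is typically stashed by a
 * "dtrace -A" invocation as successive "dof-data-0", "dof-data-1",
 * ... driver properties (in dtrace.conf on illumos); we consume
 * them in order until a lookup fails.
 */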
15605 for (i = 0; ; i++) {
15606 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15607
15608 dtrace_err_verbose = 1;
15609
15610 if ((dof = dtrace_dof_property(c)) == NULL) {
15611 dtrace_err_verbose = 0;
15612 break;
15613 }
15614
15615 #ifdef illumos
15616 /*
15617 * We want to create anonymous state, so we need to transition
15618 * the kernel debugger to indicate that DTrace is active. If
15619 * this fails (e.g. because the debugger has modified text in
15620 * some way), we won't continue with the processing.
15621 */
15622 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15623 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15624 "enabling ignored.");
15625 dtrace_dof_destroy(dof);
15626 break;
15627 }
15628 #endif
15629
15630 /*
15631 * If we haven't allocated an anonymous state, we'll do so now.
15632 */
15633 if ((state = dtrace_anon.dta_state) == NULL) {
15634 state = dtrace_state_create(NULL, NULL);
15635 dtrace_anon.dta_state = state;
15636
15637 if (state == NULL) {
15638 /*
15639 * This basically shouldn't happen: the only
15640 * failure mode from dtrace_state_create() is a
15641 * failure of ddi_soft_state_zalloc() that
15642 * itself should never happen. Still, the
15643 * interface allows for a failure mode, and
15644 * we want to fail as gracefully as possible:
15645 * we'll emit an error message and cease
15646 * processing anonymous state in this case.
15647 */
15648 cmn_err(CE_WARN, "failed to create "
15649 "anonymous state");
15650 dtrace_dof_destroy(dof);
15651 break;
15652 }
15653 }
15654
15655 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15656 &dtrace_anon.dta_enabling, 0, 0, B_TRUE);
15657
15658 if (rv == 0)
15659 rv = dtrace_dof_options(dof, state);
15660
15661 dtrace_err_verbose = 0;
15662 dtrace_dof_destroy(dof);
15663
15664 if (rv != 0) {
15665 /*
15666 * This is malformed DOF; chuck any anonymous state
15667 * that we created.
15668 */
15669 ASSERT(dtrace_anon.dta_enabling == NULL);
15670 dtrace_state_destroy(state);
15671 dtrace_anon.dta_state = NULL;
15672 break;
15673 }
15674
15675 ASSERT(dtrace_anon.dta_enabling != NULL);
15676 }
15677
15678 if (dtrace_anon.dta_enabling != NULL) {
15679 int rval;
15680
15681 /*
15682 * dtrace_enabling_retain() can only fail because we are
15683 * trying to retain more enablings than are allowed -- but
15684 * we only have one anonymous enabling, and we are guaranteed
15685 * to be allowed at least one retained enabling; we assert
15686 * that dtrace_enabling_retain() returns success.
15687 */
15688 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15689 ASSERT(rval == 0);
15690
15691 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15692 }
15693 }
15694
15695 /*
15696 * DTrace Helper Functions
15697 */
15698 static void
15699 dtrace_helper_trace(dtrace_helper_action_t *helper,
15700 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15701 {
15702 uint32_t size, next, nnext, i;
15703 dtrace_helptrace_t *ent, *buffer;
15704 uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15705
15706 if ((buffer = dtrace_helptrace_buffer) == NULL)
15707 return;
15708
15709 ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15710
15711 /*
15712 * What would a tracing framework be without its own tracing
15713 * framework? (Well, a hell of a lot simpler, for starters...)
15714 */
15715 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15716 sizeof (uint64_t) - sizeof (uint64_t);
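/*
 * (dtrace_helptrace_t declares dtht_locals as a one-element array,
 * so one uint64_t is subtracted back out of the record size above.)
 */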
15717
15718 /*
15719 * Iterate until we can allocate a slot in the trace buffer.
15720 */
15721 do {
15722 next = dtrace_helptrace_next;
15723
15724 if (next + size < dtrace_helptrace_bufsize) {
15725 nnext = next + size;
15726 } else {
15727 nnext = size;
15728 }
15729 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
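/*
 * A sketch of the protocol above: dtrace_helptrace_next is advanced
 * with a compare-and-swap, so concurrent tracers each claim a
 * disjoint [next, next + size) slot without locking; when a record
 * would run past the end of the buffer, nnext is reset to "size",
 * i.e. the slot wraps to offset 0 (handled just below).
 */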
15730
15731 /*
15732 * We have our slot; fill it in.
15733 */
15734 if (nnext == size) {
15735 dtrace_helptrace_wrapped++;
15736 next = 0;
15737 }
15738
15739 ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
15740 ent->dtht_helper = helper;
15741 ent->dtht_where = where;
15742 ent->dtht_nlocals = vstate->dtvs_nlocals;
15743
15744 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15745 mstate->dtms_fltoffs : -1;
15746 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15747 ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15748
15749 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15750 dtrace_statvar_t *svar;
15751
15752 if ((svar = vstate->dtvs_locals[i]) == NULL)
15753 continue;
15754
15755 ASSERT(svar->dtsv_size >= (mp_maxid + 1) * sizeof (uint64_t));
15756 ent->dtht_locals[i] =
15757 ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15758 }
15759 }
15760
15761 static uint64_t
15762 dtrace_helper(int which, dtrace_mstate_t *mstate,
15763 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15764 {
15765 uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15766 uint64_t sarg0 = mstate->dtms_arg[0];
15767 uint64_t sarg1 = mstate->dtms_arg[1];
15768 uint64_t rval = 0;
15769 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15770 dtrace_helper_action_t *helper;
15771 dtrace_vstate_t *vstate;
15772 dtrace_difo_t *pred;
15773 int i, trace = dtrace_helptrace_buffer != NULL;
15774
15775 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15776
15777 if (helpers == NULL)
15778 return (0);
15779
15780 if ((helper = helpers->dthps_actions[which]) == NULL)
15781 return (0);
15782
15783 vstate = &helpers->dthps_vstate;
15784 mstate->dtms_arg[0] = arg0;
15785 mstate->dtms_arg[1] = arg1;
15786
15787 /*
15788 * Now iterate over each helper. If its predicate evaluates to 'true',
15789 * we'll call the corresponding actions. Note that the below calls
15790 * to dtrace_dif_emulate() may set faults in machine state. This is
15791 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15792 * the stored DIF offset with its own (which is the desired behavior).
15793 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15794 * from machine state; this is okay, too.
15795 */
15796 for (; helper != NULL; helper = helper->dtha_next) {
15797 if ((pred = helper->dtha_predicate) != NULL) {
15798 if (trace)
15799 dtrace_helper_trace(helper, mstate, vstate, 0);
15800
15801 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15802 goto next;
15803
15804 if (*flags & CPU_DTRACE_FAULT)
15805 goto err;
15806 }
15807
15808 for (i = 0; i < helper->dtha_nactions; i++) {
15809 if (trace)
15810 dtrace_helper_trace(helper,
15811 mstate, vstate, i + 1);
15812
15813 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15814 mstate, vstate, state);
15815
15816 if (*flags & CPU_DTRACE_FAULT)
15817 goto err;
15818 }
15819
15820 next:
15821 if (trace)
15822 dtrace_helper_trace(helper, mstate, vstate,
15823 DTRACE_HELPTRACE_NEXT);
15824 }
15825
15826 if (trace)
15827 dtrace_helper_trace(helper, mstate, vstate,
15828 DTRACE_HELPTRACE_DONE);
15829
15830 /*
15831 * Restore the arguments that we saved upon entry.
15832 */
15833 mstate->dtms_arg[0] = sarg0;
15834 mstate->dtms_arg[1] = sarg1;
15835
15836 return (rval);
15837
15838 err:
15839 if (trace)
15840 dtrace_helper_trace(helper, mstate, vstate,
15841 DTRACE_HELPTRACE_ERR);
15842
15843 /*
15844 * Restore the arguments that we saved upon entry.
15845 */
15846 mstate->dtms_arg[0] = sarg0;
15847 mstate->dtms_arg[1] = sarg1;
15848
15849 return (0);
15850 }
15851
15852 static void
15853 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15854 dtrace_vstate_t *vstate)
15855 {
15856 int i;
15857
15858 if (helper->dtha_predicate != NULL)
15859 dtrace_difo_release(helper->dtha_predicate, vstate);
15860
15861 for (i = 0; i < helper->dtha_nactions; i++) {
15862 ASSERT(helper->dtha_actions[i] != NULL);
15863 dtrace_difo_release(helper->dtha_actions[i], vstate);
15864 }
15865
15866 kmem_free(helper->dtha_actions,
15867 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15868 kmem_free(helper, sizeof (dtrace_helper_action_t));
15869 }
15870
15871 static int
15872 dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)
15873 {
15874 proc_t *p = curproc;
15875 dtrace_vstate_t *vstate;
15876 int i;
15877
15878 if (help == NULL)
15879 help = p->p_dtrace_helpers;
15880
15881 ASSERT(MUTEX_HELD(&dtrace_lock));
15882
15883 if (help == NULL || gen > help->dthps_generation)
15884 return (EINVAL);
15885
15886 vstate = &help->dthps_vstate;
15887
15888 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15889 dtrace_helper_action_t *last = NULL, *h, *next;
15890
15891 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15892 next = h->dtha_next;
15893
15894 if (h->dtha_generation == gen) {
15895 if (last != NULL) {
15896 last->dtha_next = next;
15897 } else {
15898 help->dthps_actions[i] = next;
15899 }
15900
15901 dtrace_helper_action_destroy(h, vstate);
15902 } else {
15903 last = h;
15904 }
15905 }
15906 }
15907
15908 /*
15909 * Iterate until we've cleared out all helper providers with the
15910 * given generation number.
15911 */
15912 for (;;) {
15913 dtrace_helper_provider_t *prov;
15914
15915 /*
15916 * Look for a helper provider with the right generation. We
15917 * have to start back at the beginning of the list each time
15918 * because we drop dtrace_lock. It's unlikely that we'll make
15919 * more than two passes.
15920 */
15921 for (i = 0; i < help->dthps_nprovs; i++) {
15922 prov = help->dthps_provs[i];
15923
15924 if (prov->dthp_generation == gen)
15925 break;
15926 }
15927
15928 /*
15929 * If there were no matches, we're done.
15930 */
15931 if (i == help->dthps_nprovs)
15932 break;
15933
15934 /*
15935 * Move the last helper provider into this slot.
15936 */
15937 help->dthps_nprovs--;
15938 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15939 help->dthps_provs[help->dthps_nprovs] = NULL;
15940
15941 mutex_exit(&dtrace_lock);
15942
15943 /*
15944 * If we have a meta provider, remove this helper provider.
15945 */
15946 mutex_enter(&dtrace_meta_lock);
15947 if (dtrace_meta_pid != NULL) {
15948 ASSERT(dtrace_deferred_pid == NULL);
15949 dtrace_helper_provider_remove(&prov->dthp_prov,
15950 p->p_pid);
15951 }
15952 mutex_exit(&dtrace_meta_lock);
15953
15954 dtrace_helper_provider_destroy(prov);
15955
15956 mutex_enter(&dtrace_lock);
15957 }
15958
15959 return (0);
15960 }
15961
15962 static int
15963 dtrace_helper_validate(dtrace_helper_action_t *helper)
15964 {
15965 int err = 0, i;
15966 dtrace_difo_t *dp;
15967
15968 if ((dp = helper->dtha_predicate) != NULL)
15969 err += dtrace_difo_validate_helper(dp);
15970
15971 for (i = 0; i < helper->dtha_nactions; i++)
15972 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15973
15974 return (err == 0);
15975 }
15976
15977 static int
15978 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,
15979 dtrace_helpers_t *help)
15980 {
15981 dtrace_helper_action_t *helper, *last;
15982 dtrace_actdesc_t *act;
15983 dtrace_vstate_t *vstate;
15984 dtrace_predicate_t *pred;
15985 int count = 0, nactions = 0, i;
15986
15987 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15988 return (EINVAL);
15989
15990 last = help->dthps_actions[which];
15991 vstate = &help->dthps_vstate;
15992
15993 for (count = 0; last != NULL; last = last->dtha_next) {
15994 count++;
15995 if (last->dtha_next == NULL)
15996 break;
15997 }
15998
15999 /*
16000 * If we already have dtrace_helper_actions_max helper actions for this
16001 * helper action type, we'll refuse to add a new one.
16002 */
16003 if (count >= dtrace_helper_actions_max)
16004 return (ENOSPC);
16005
16006 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
16007 helper->dtha_generation = help->dthps_generation;
16008
16009 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
16010 ASSERT(pred->dtp_difo != NULL);
16011 dtrace_difo_hold(pred->dtp_difo);
16012 helper->dtha_predicate = pred->dtp_difo;
16013 }
16014
16015 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
16016 if (act->dtad_kind != DTRACEACT_DIFEXPR)
16017 goto err;
16018
16019 if (act->dtad_difo == NULL)
16020 goto err;
16021
16022 nactions++;
16023 }
16024
16025 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
16026 (helper->dtha_nactions = nactions), KM_SLEEP);
16027
16028 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
16029 dtrace_difo_hold(act->dtad_difo);
16030 helper->dtha_actions[i++] = act->dtad_difo;
16031 }
16032
16033 if (!dtrace_helper_validate(helper))
16034 goto err;
16035
16036 if (last == NULL) {
16037 help->dthps_actions[which] = helper;
16038 } else {
16039 last->dtha_next = helper;
16040 }
16041
16042 if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
16043 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
16044 dtrace_helptrace_next = 0;
16045 }
16046
16047 return (0);
16048 err:
16049 dtrace_helper_action_destroy(helper, vstate);
16050 return (EINVAL);
16051 }
16052
16053 static void
16054 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
16055 dof_helper_t *dofhp)
16056 {
16057 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
16058
16059 mutex_enter(&dtrace_meta_lock);
16060 mutex_enter(&dtrace_lock);
16061
16062 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
16063 /*
16064 * If the dtrace module is loaded but not attached, or if
16065 * there isn't a meta provider registered to deal with
16066 * these provider descriptions, we need to postpone creating
16067 * the actual providers until later.
16068 */
16069
16070 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
16071 dtrace_deferred_pid != help) {
16072 help->dthps_deferred = 1;
16073 help->dthps_pid = p->p_pid;
16074 help->dthps_next = dtrace_deferred_pid;
16075 help->dthps_prev = NULL;
16076 if (dtrace_deferred_pid != NULL)
16077 dtrace_deferred_pid->dthps_prev = help;
16078 dtrace_deferred_pid = help;
16079 }
16080
16081 mutex_exit(&dtrace_lock);
16082
16083 } else if (dofhp != NULL) {
16084 /*
16085 * If the dtrace module is loaded and we have a particular
16086 * helper provider description, pass that off to the
16087 * meta provider.
16088 */
16089
16090 mutex_exit(&dtrace_lock);
16091
16092 dtrace_helper_provide(dofhp, p->p_pid);
16093
16094 } else {
16095 /*
16096 * Otherwise, just pass all the helper provider descriptions
16097 * off to the meta provider.
16098 */
16099
16100 int i;
16101 mutex_exit(&dtrace_lock);
16102
16103 for (i = 0; i < help->dthps_nprovs; i++) {
16104 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
16105 p->p_pid);
16106 }
16107 }
16108
16109 mutex_exit(&dtrace_meta_lock);
16110 }
16111
16112 static int
16113 dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)
16114 {
16115 dtrace_helper_provider_t *hprov, **tmp_provs;
16116 uint_t tmp_maxprovs, i;
16117
16118 ASSERT(MUTEX_HELD(&dtrace_lock));
16119 ASSERT(help != NULL);
16120
16121 /*
16122 * If we already have dtrace_helper_providers_max helper providers,
16123 * we refuse to add a new one.
16124 */
16125 if (help->dthps_nprovs >= dtrace_helper_providers_max)
16126 return (ENOSPC);
16127
16128 /*
16129 * Check to make sure this isn't a duplicate.
16130 */
16131 for (i = 0; i < help->dthps_nprovs; i++) {
16132 if (dofhp->dofhp_addr ==
16133 help->dthps_provs[i]->dthp_prov.dofhp_addr)
16134 return (EALREADY);
16135 }
16136
16137 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
16138 hprov->dthp_prov = *dofhp;
16139 hprov->dthp_ref = 1;
16140 hprov->dthp_generation = gen;
16141
16142 /*
16143 * Allocate a bigger table for helper providers if it's already full.
16144 */
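/*
 * (Capacity grows by doubling -- 2, 4, 8, ... -- clamped at
 * dtrace_helper_providers_max, so the allocate-and-copy below runs
 * at most O(log n) times for a given process.)
 */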
16145 if (help->dthps_maxprovs == help->dthps_nprovs) {
16146 tmp_maxprovs = help->dthps_maxprovs;
16147 tmp_provs = help->dthps_provs;
16148
16149 if (help->dthps_maxprovs == 0)
16150 help->dthps_maxprovs = 2;
16151 else
16152 help->dthps_maxprovs *= 2;
16153 if (help->dthps_maxprovs > dtrace_helper_providers_max)
16154 help->dthps_maxprovs = dtrace_helper_providers_max;
16155
16156 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
16157
16158 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
16159 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16160
16161 if (tmp_provs != NULL) {
16162 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
16163 sizeof (dtrace_helper_provider_t *));
16164 kmem_free(tmp_provs, tmp_maxprovs *
16165 sizeof (dtrace_helper_provider_t *));
16166 }
16167 }
16168
16169 help->dthps_provs[help->dthps_nprovs] = hprov;
16170 help->dthps_nprovs++;
16171
16172 return (0);
16173 }
16174
16175 static void
16176 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
16177 {
16178 mutex_enter(&dtrace_lock);
16179
16180 if (--hprov->dthp_ref == 0) {
16181 dof_hdr_t *dof;
16182 mutex_exit(&dtrace_lock);
16183 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
16184 dtrace_dof_destroy(dof);
16185 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
16186 } else {
16187 mutex_exit(&dtrace_lock);
16188 }
16189 }
16190
16191 static int
16192 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
16193 {
16194 uintptr_t daddr = (uintptr_t)dof;
16195 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
16196 dof_provider_t *provider;
16197 dof_probe_t *probe;
16198 uint8_t *arg;
16199 char *strtab, *typestr;
16200 dof_stridx_t typeidx;
16201 size_t typesz;
16202 uint_t nprobes, j, k;
16203
16204 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
16205
16206 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
16207 dtrace_dof_error(dof, "misaligned section offset");
16208 return (-1);
16209 }
16210
16211 /*
16212 * The section needs to be large enough to contain the DOF provider
16213 * structure appropriate for the given version.
16214 */
16215 if (sec->dofs_size <
16216 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
16217 offsetof(dof_provider_t, dofpv_prenoffs) :
16218 sizeof (dof_provider_t))) {
16219 dtrace_dof_error(dof, "provider section too small");
16220 return (-1);
16221 }
16222
16223 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
16224 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
16225 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
16226 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
16227 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
16228
16229 if (str_sec == NULL || prb_sec == NULL ||
16230 arg_sec == NULL || off_sec == NULL)
16231 return (-1);
16232
16233 enoff_sec = NULL;
16234
16235 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
16236 provider->dofpv_prenoffs != DOF_SECT_NONE &&
16237 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
16238 provider->dofpv_prenoffs)) == NULL)
16239 return (-1);
16240
16241 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
16242
16243 if (provider->dofpv_name >= str_sec->dofs_size ||
16244 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
16245 dtrace_dof_error(dof, "invalid provider name");
16246 return (-1);
16247 }
16248
16249 if (prb_sec->dofs_entsize == 0 ||
16250 prb_sec->dofs_entsize > prb_sec->dofs_size) {
16251 dtrace_dof_error(dof, "invalid entry size");
16252 return (-1);
16253 }
16254
16255 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
16256 dtrace_dof_error(dof, "misaligned entry size");
16257 return (-1);
16258 }
16259
16260 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
16261 dtrace_dof_error(dof, "invalid entry size");
16262 return (-1);
16263 }
16264
16265 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
16266 dtrace_dof_error(dof, "misaligned section offset");
16267 return (-1);
16268 }
16269
16270 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
16271 dtrace_dof_error(dof, "invalid entry size");
16272 return (-1);
16273 }
16274
16275 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
16276
16277 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
16278
16279 /*
16280 * Take a pass through the probes to check for errors.
16281 */
16282 for (j = 0; j < nprobes; j++) {
16283 probe = (dof_probe_t *)(uintptr_t)(daddr +
16284 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
16285
16286 if (probe->dofpr_func >= str_sec->dofs_size) {
16287 dtrace_dof_error(dof, "invalid function name");
16288 return (-1);
16289 }
16290
16291 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
16292 dtrace_dof_error(dof, "function name too long");
16293 /*
16294 * Keep going if the function name is too long.
16295 * Unlike provider and probe names, we cannot reasonably
16296 * impose restrictions on function names, since they're
16297 * a property of the code being instrumented. We will
16298 * skip this probe in dtrace_helper_provide_one().
16299 */
16300 }
16301
16302 if (probe->dofpr_name >= str_sec->dofs_size ||
16303 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
16304 dtrace_dof_error(dof, "invalid probe name");
16305 return (-1);
16306 }
16307
16308 /*
16309 * The offset count must not wrap the index, and the offsets
16310 * must also not overflow the section's data.
16311 */
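/*
 * (The first comparison relies on unsigned wrap-around: if, say,
 * dofpr_offidx were 0xfffffff0 and dofpr_noffs were 0x20, the
 * 32-bit sum would wrap to 0x10 -- less than dofpr_offidx -- and
 * the probe would be rejected before the bogus bound could pass.)
 */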
16312 if (probe->dofpr_offidx + probe->dofpr_noffs <
16313 probe->dofpr_offidx ||
16314 (probe->dofpr_offidx + probe->dofpr_noffs) *
16315 off_sec->dofs_entsize > off_sec->dofs_size) {
16316 dtrace_dof_error(dof, "invalid probe offset");
16317 return (-1);
16318 }
16319
16320 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
16321 /*
16322 * If there's no is-enabled offset section, make sure
16323 * there aren't any is-enabled offsets. Otherwise
16324 * perform the same checks as for probe offsets
16325 * (immediately above).
16326 */
16327 if (enoff_sec == NULL) {
16328 if (probe->dofpr_enoffidx != 0 ||
16329 probe->dofpr_nenoffs != 0) {
16330 dtrace_dof_error(dof, "is-enabled "
16331 "offsets with null section");
16332 return (-1);
16333 }
16334 } else if (probe->dofpr_enoffidx +
16335 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
16336 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
16337 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
16338 dtrace_dof_error(dof, "invalid is-enabled "
16339 "offset");
16340 return (-1);
16341 }
16342
16343 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
16344 dtrace_dof_error(dof, "zero probe and "
16345 "is-enabled offsets");
16346 return (-1);
16347 }
16348 } else if (probe->dofpr_noffs == 0) {
16349 dtrace_dof_error(dof, "zero probe offsets");
16350 return (-1);
16351 }
16352
16353 if (probe->dofpr_argidx + probe->dofpr_xargc <
16354 probe->dofpr_argidx ||
16355 (probe->dofpr_argidx + probe->dofpr_xargc) *
16356 arg_sec->dofs_entsize > arg_sec->dofs_size) {
16357 dtrace_dof_error(dof, "invalid args");
16358 return (-1);
16359 }
16360
16361 typeidx = probe->dofpr_nargv;
16362 typestr = strtab + probe->dofpr_nargv;
16363 for (k = 0; k < probe->dofpr_nargc; k++) {
16364 if (typeidx >= str_sec->dofs_size) {
16365 dtrace_dof_error(dof, "bad "
16366 "native argument type");
16367 return (-1);
16368 }
16369
16370 typesz = strlen(typestr) + 1;
16371 if (typesz > DTRACE_ARGTYPELEN) {
16372 dtrace_dof_error(dof, "native "
16373 "argument type too long");
16374 return (-1);
16375 }
16376 typeidx += typesz;
16377 typestr += typesz;
16378 }
16379
16380 typeidx = probe->dofpr_xargv;
16381 typestr = strtab + probe->dofpr_xargv;
16382 for (k = 0; k < probe->dofpr_xargc; k++) {
16383 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
16384 dtrace_dof_error(dof, "bad "
16385 "native argument index");
16386 return (-1);
16387 }
16388
16389 if (typeidx >= str_sec->dofs_size) {
16390 dtrace_dof_error(dof, "bad "
16391 "translated argument type");
16392 return (-1);
16393 }
16394
16395 typesz = strlen(typestr) + 1;
16396 if (typesz > DTRACE_ARGTYPELEN) {
16397 dtrace_dof_error(dof, "translated argument "
16398 "type too long");
16399 return (-1);
16400 }
16401
16402 typeidx += typesz;
16403 typestr += typesz;
16404 }
16405 }
16406
16407 return (0);
16408 }
16409
16410 static int
16411 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p)
16412 {
16413 dtrace_helpers_t *help;
16414 dtrace_vstate_t *vstate;
16415 dtrace_enabling_t *enab = NULL;
16416 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
16417 uintptr_t daddr = (uintptr_t)dof;
16418
16419 ASSERT(MUTEX_HELD(&dtrace_lock));
16420
16421 if ((help = p->p_dtrace_helpers) == NULL)
16422 help = dtrace_helpers_create(p);
16423
16424 vstate = &help->dthps_vstate;
16425
16426 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp != NULL ?
16427 dhp->dofhp_addr : 0, dhp != NULL ? dhp->dofhp_dof : 0, B_FALSE)) != 0) {
16428 dtrace_dof_destroy(dof);
16429 return (rv);
16430 }
16431
16432 /*
16433 * Look for helper providers and validate their descriptions.
16434 */
16435 for (i = 0; i < dof->dofh_secnum; i++) {
16436 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
16437 dof->dofh_secoff + i * dof->dofh_secsize);
16438
16439 if (sec->dofs_type != DOF_SECT_PROVIDER)
16440 continue;
16441
16442 if (dtrace_helper_provider_validate(dof, sec) != 0) {
16443 dtrace_enabling_destroy(enab);
16444 dtrace_dof_destroy(dof);
16445 return (-1);
16446 }
16447
16448 nprovs++;
16449 }
16450
16451 /*
16452 * Now we need to walk through the ECB descriptions in the enabling.
16453 */
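/*
 * For illustration: ECB descriptions of the shape matched below are
 * typically produced by a D helper clause such as
 *
 *	dtrace:helper:ustack:
 *	{
 *		...
 *	}
 *
 * compiled into helper DOF and loaded through the helper device when
 * the instrumented process starts.
 */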
16454 for (i = 0; i < enab->dten_ndesc; i++) {
16455 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
16456 dtrace_probedesc_t *desc = &ep->dted_probe;
16457
16458 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
16459 continue;
16460
16461 if (strcmp(desc->dtpd_mod, "helper") != 0)
16462 continue;
16463
16464 if (strcmp(desc->dtpd_func, "ustack") != 0)
16465 continue;
16466
16467 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
16468 ep, help)) != 0) {
16469 /*
16470 * Adding this helper action failed -- we are now going
16471 * to rip out the entire generation and return failure.
16472 */
16473 (void) dtrace_helper_destroygen(help,
16474 help->dthps_generation);
16475 dtrace_enabling_destroy(enab);
16476 dtrace_dof_destroy(dof);
16477 return (-1);
16478 }
16479
16480 nhelpers++;
16481 }
16482
16483 if (nhelpers < enab->dten_ndesc)
16484 dtrace_dof_error(dof, "unmatched helpers");
16485
16486 gen = help->dthps_generation++;
16487 dtrace_enabling_destroy(enab);
16488
16489 if (nprovs > 0 && dhp != NULL) {
16490 /*
16491 * Now that this is in-kernel, we change the sense of the
16492 * members: dofhp_dof denotes the in-kernel copy of the DOF
16493 * and dofhp_addr denotes the address at user-level.
16494 */
16495 dhp->dofhp_addr = dhp->dofhp_dof;
16496 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
16497
16498 if (dtrace_helper_provider_add(dhp, help, gen) == 0) {
16499 mutex_exit(&dtrace_lock);
16500 dtrace_helper_provider_register(p, help, dhp);
16501 mutex_enter(&dtrace_lock);
16502
16503 destroy = 0;
16504 }
16505 }
16506
16507 if (destroy)
16508 dtrace_dof_destroy(dof);
16509
16510 return (gen);
16511 }
16512
16513 static dtrace_helpers_t *
16514 dtrace_helpers_create(proc_t *p)
16515 {
16516 dtrace_helpers_t *help;
16517
16518 ASSERT(MUTEX_HELD(&dtrace_lock));
16519 ASSERT(p->p_dtrace_helpers == NULL);
16520
16521 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16522 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16523 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16524
16525 p->p_dtrace_helpers = help;
16526 dtrace_helpers++;
16527
16528 return (help);
16529 }
16530
16531 #ifdef illumos
16532 static
16533 #endif
16534 void
16535 dtrace_helpers_destroy(proc_t *p)
16536 {
16537 dtrace_helpers_t *help;
16538 dtrace_vstate_t *vstate;
16542 int i;
16543
16544 mutex_enter(&dtrace_lock);
16545
16546 ASSERT(p->p_dtrace_helpers != NULL);
16547 ASSERT(dtrace_helpers > 0);
16548
16549 help = p->p_dtrace_helpers;
16550 vstate = &help->dthps_vstate;
16551
16552 /*
16553 * We're now going to lose the help from this process.
16554 */
16555 p->p_dtrace_helpers = NULL;
16556 dtrace_sync();
16557
16558 /*
16559 * Destroy the helper actions.
16560 */
16561 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16562 dtrace_helper_action_t *h, *next;
16563
16564 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16565 next = h->dtha_next;
16566 dtrace_helper_action_destroy(h, vstate);
16568 }
16569 }
16570
16571 mutex_exit(&dtrace_lock);
16572
16573 /*
16574 * Destroy the helper providers.
16575 */
16576 if (help->dthps_maxprovs > 0) {
16577 mutex_enter(&dtrace_meta_lock);
16578 if (dtrace_meta_pid != NULL) {
16579 ASSERT(dtrace_deferred_pid == NULL);
16580
16581 for (i = 0; i < help->dthps_nprovs; i++) {
16582 dtrace_helper_provider_remove(
16583 &help->dthps_provs[i]->dthp_prov, p->p_pid);
16584 }
16585 } else {
16586 mutex_enter(&dtrace_lock);
16587 ASSERT(help->dthps_deferred == 0 ||
16588 help->dthps_next != NULL ||
16589 help->dthps_prev != NULL ||
16590 help == dtrace_deferred_pid);
16591
16592 /*
16593 * Remove the helper from the deferred list.
16594 */
16595 if (help->dthps_next != NULL)
16596 help->dthps_next->dthps_prev = help->dthps_prev;
16597 if (help->dthps_prev != NULL)
16598 help->dthps_prev->dthps_next = help->dthps_next;
16599 if (dtrace_deferred_pid == help) {
16600 dtrace_deferred_pid = help->dthps_next;
16601 ASSERT(help->dthps_prev == NULL);
16602 }
16603
16604 mutex_exit(&dtrace_lock);
16605 }
16606
16607 mutex_exit(&dtrace_meta_lock);
16608
16609 for (i = 0; i < help->dthps_nprovs; i++) {
16610 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16611 }
16612
16613 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16614 sizeof (dtrace_helper_provider_t *));
16615 }
16616
16617 mutex_enter(&dtrace_lock);
16618
16619 dtrace_vstate_fini(&help->dthps_vstate);
16620 kmem_free(help->dthps_actions,
16621 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16622 kmem_free(help, sizeof (dtrace_helpers_t));
16623
16624 --dtrace_helpers;
16625 mutex_exit(&dtrace_lock);
16626 }
16627
16628 #ifdef illumos
16629 static
16630 #endif
16631 void
16632 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16633 {
16634 dtrace_helpers_t *help, *newhelp;
16635 dtrace_helper_action_t *helper, *new, *last;
16636 dtrace_difo_t *dp;
16637 dtrace_vstate_t *vstate;
16638 int i, j, sz, hasprovs = 0;
16639
16640 mutex_enter(&dtrace_lock);
16641 ASSERT(from->p_dtrace_helpers != NULL);
16642 ASSERT(dtrace_helpers > 0);
16643
16644 help = from->p_dtrace_helpers;
16645 newhelp = dtrace_helpers_create(to);
16646 ASSERT(to->p_dtrace_helpers != NULL);
16647
16648 newhelp->dthps_generation = help->dthps_generation;
16649 vstate = &newhelp->dthps_vstate;
16650
16651 /*
16652 * Duplicate the helper actions.
16653 */
16654 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16655 if ((helper = help->dthps_actions[i]) == NULL)
16656 continue;
16657
16658 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16659 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16660 KM_SLEEP);
16661 new->dtha_generation = helper->dtha_generation;
16662
16663 if ((dp = helper->dtha_predicate) != NULL) {
16664 dp = dtrace_difo_duplicate(dp, vstate);
16665 new->dtha_predicate = dp;
16666 }
16667
16668 new->dtha_nactions = helper->dtha_nactions;
16669 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16670 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16671
16672 for (j = 0; j < new->dtha_nactions; j++) {
16673 dtrace_difo_t *dp = helper->dtha_actions[j];
16674
16675 ASSERT(dp != NULL);
16676 dp = dtrace_difo_duplicate(dp, vstate);
16677 new->dtha_actions[j] = dp;
16678 }
16679
16680 if (last != NULL) {
16681 last->dtha_next = new;
16682 } else {
16683 newhelp->dthps_actions[i] = new;
16684 }
16685
16686 last = new;
16687 }
16688 }
16689
16690 /*
16691 * Duplicate the helper providers and register them with the
16692 * DTrace framework.
16693 */
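/*
 * (Note the asymmetry: actions were deep-copied above via
 * dtrace_difo_duplicate(), but provider descriptions are shared
 * between parent and child by bumping dthp_ref; the final
 * reference is dropped in dtrace_helper_provider_destroy().)
 */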
16694 if (help->dthps_nprovs > 0) {
16695 newhelp->dthps_nprovs = help->dthps_nprovs;
16696 newhelp->dthps_maxprovs = help->dthps_nprovs;
16697 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16698 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16699 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16700 newhelp->dthps_provs[i] = help->dthps_provs[i];
16701 newhelp->dthps_provs[i]->dthp_ref++;
16702 }
16703
16704 hasprovs = 1;
16705 }
16706
16707 mutex_exit(&dtrace_lock);
16708
16709 if (hasprovs)
16710 dtrace_helper_provider_register(to, newhelp, NULL);
16711 }
16712
16713 /*
16714 * DTrace Hook Functions
16715 */
16716 static void
16717 dtrace_module_loaded(modctl_t *ctl)
16718 {
16719 dtrace_provider_t *prv;
16720
16721 mutex_enter(&dtrace_provider_lock);
16722 #ifdef illumos
16723 mutex_enter(&mod_lock);
16724 #endif
16725
16726 #ifdef illumos
16727 ASSERT(ctl->mod_busy);
16728 #endif
16729
16730 /*
16731 * We're going to call each provider's per-module provide operation
16732 * specifying only this module.
16733 */
16734 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16735 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16736
16737 #ifdef illumos
16738 mutex_exit(&mod_lock);
16739 #endif
16740 mutex_exit(&dtrace_provider_lock);
16741
16742 /*
16743 * If we have any retained enablings, we need to match against them.
16744 * Enabling probes requires that cpu_lock be held, and we cannot hold
16745 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16746 * module. (In particular, this happens when loading scheduling
16747 * classes.) So if we have any retained enablings, we need to dispatch
16748 * our task queue to do the match for us.
16749 */
16750 mutex_enter(&dtrace_lock);
16751
16752 if (dtrace_retained == NULL) {
16753 mutex_exit(&dtrace_lock);
16754 return;
16755 }
16756
16757 (void) taskq_dispatch(dtrace_taskq,
16758 (task_func_t *)dtrace_enabling_matchall_task, NULL, TQ_SLEEP);
16759
16760 mutex_exit(&dtrace_lock);
16761
16762 /*
16763 * And now, for a little heuristic sleaze: in general, we want to
16764 * match modules as soon as they load. However, we cannot guarantee
16765 * this, because it would lead us to the lock ordering violation
16766 * outlined above. The common case, of course, is that cpu_lock is
16767 * _not_ held -- so we delay here for a clock tick, hoping that that's
16768 * long enough for the task queue to do its work. If it's not, it's
16769 * not a serious problem -- it just means that the module that we
16770 * just loaded may not be immediately instrumentable.
16771 */
16772 delay(1);
16773 }
16774
16775 static void
16776 #ifdef illumos
16777 dtrace_module_unloaded(modctl_t *ctl)
16778 #else
16779 dtrace_module_unloaded(modctl_t *ctl, int *error)
16780 #endif
16781 {
16782 dtrace_probe_t template, *probe, *first, *next;
16783 dtrace_provider_t *prov;
16784 #ifndef illumos
16785 char modname[DTRACE_MODNAMELEN];
16786 size_t len;
16787 #endif
16788
16789 #ifdef illumos
16790 template.dtpr_mod = ctl->mod_modname;
16791 #else
16792 /* Handle the fact that ctl->filename may end in ".ko". */
16793 strlcpy(modname, ctl->filename, sizeof(modname));
16794 len = strlen(ctl->filename);
16795 if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16796 modname[len - 3] = '\0';
16797 template.dtpr_mod = modname;
16798 #endif
16799
16800 mutex_enter(&dtrace_provider_lock);
16801 #ifdef illumos
16802 mutex_enter(&mod_lock);
16803 #endif
16804 mutex_enter(&dtrace_lock);
16805
16806 #ifndef illumos
16807 if (ctl->nenabled > 0) {
16808 /* Don't allow unloads if a probe is enabled. */
16809 mutex_exit(&dtrace_provider_lock);
16810 mutex_exit(&dtrace_lock);
16811 *error = -1;
16812 printf(
16813 "kldunload: attempt to unload module that has DTrace probes enabled\n");
16814 return;
16815 }
16816 #endif
16817
16818 if (dtrace_bymod == NULL) {
16819 /*
16820 * The DTrace module is loaded (obviously) but not attached;
16821 * we don't have any work to do.
16822 */
16823 mutex_exit(&dtrace_provider_lock);
16824 #ifdef illumos
16825 mutex_exit(&mod_lock);
16826 #endif
16827 mutex_exit(&dtrace_lock);
16828 return;
16829 }
16830
16831 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16832 probe != NULL; probe = probe->dtpr_nextmod) {
16833 if (probe->dtpr_ecb != NULL) {
16834 mutex_exit(&dtrace_provider_lock);
16835 #ifdef illumos
16836 mutex_exit(&mod_lock);
16837 #endif
16838 mutex_exit(&dtrace_lock);
16839
16840 /*
16841 * This shouldn't _actually_ be possible -- we're
16842 * unloading a module that has an enabled probe in it.
16843 * (It's normally up to the provider to make sure that
16844 * this can't happen.) However, because dtps_enable()
16845 * doesn't have a failure mode, there can be an
16846 * enable/unload race. Upshot: we don't want to
16847 * assert, but we're not going to disable the
16848 * probe, either.
16849 */
16850 if (dtrace_err_verbose) {
16851 #ifdef illumos
16852 cmn_err(CE_WARN, "unloaded module '%s' had "
16853 "enabled probes", ctl->mod_modname);
16854 #else
16855 cmn_err(CE_WARN, "unloaded module '%s' had "
16856 "enabled probes", modname);
16857 #endif
16858 }
16859
16860 return;
16861 }
16862 }
16863
16864 probe = first;
16865
16866 for (first = NULL; probe != NULL; probe = next) {
16867 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16868
16869 dtrace_probes[probe->dtpr_id - 1] = NULL;
16870
16871 next = probe->dtpr_nextmod;
16872 dtrace_hash_remove(dtrace_bymod, probe);
16873 dtrace_hash_remove(dtrace_byfunc, probe);
16874 dtrace_hash_remove(dtrace_byname, probe);
16875
16876 if (first == NULL) {
16877 first = probe;
16878 probe->dtpr_nextmod = NULL;
16879 } else {
16880 probe->dtpr_nextmod = first;
16881 first = probe;
16882 }
16883 }
16884
16885 /*
16886 * We've removed all of the module's probes from the hash chains and
16887 * from the probe array. Now issue a dtrace_sync() to be sure that
16888 * everyone has cleared out from any probe array processing.
16889 */
16890 dtrace_sync();
16891
16892 for (probe = first; probe != NULL; probe = first) {
16893 first = probe->dtpr_nextmod;
16894 prov = probe->dtpr_provider;
16895 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16896 probe->dtpr_arg);
16897 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16898 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16899 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16900 #ifdef illumos
16901 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16902 #else
16903 free_unr(dtrace_arena, probe->dtpr_id);
16904 #endif
16905 kmem_free(probe, sizeof (dtrace_probe_t));
16906 }
16907
16908 mutex_exit(&dtrace_lock);
16909 #ifdef illumos
16910 mutex_exit(&mod_lock);
16911 #endif
16912 mutex_exit(&dtrace_provider_lock);
16913 }
16914
16915 #ifndef illumos
16916 static void
16917 dtrace_kld_load(void *arg __unused, linker_file_t lf)
16918 {
16919
16920 dtrace_module_loaded(lf);
16921 }
16922
16923 static void
16924 dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16925 {
16926
16927 if (*error != 0)
16928 /* We already have an error, so don't do anything. */
16929 return;
16930 dtrace_module_unloaded(lf, error);
16931 }
16932 #endif
16933
16934 #ifdef illumos
16935 static void
16936 dtrace_suspend(void)
16937 {
16938 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16939 }
16940
16941 static void
16942 dtrace_resume(void)
16943 {
16944 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16945 }
16946 #endif
16947
16948 static int
16949 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16950 {
16951 ASSERT(MUTEX_HELD(&cpu_lock));
16952 mutex_enter(&dtrace_lock);
16953
16954 switch (what) {
16955 case CPU_CONFIG: {
16956 dtrace_state_t *state;
16957 dtrace_optval_t *opt, rs, c;
16958
16959 /*
16960 * For now, we only allocate a new buffer for anonymous state.
16961 */
16962 if ((state = dtrace_anon.dta_state) == NULL)
16963 break;
16964
16965 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16966 break;
16967
16968 opt = state->dts_options;
16969 c = opt[DTRACEOPT_CPU];
16970
16971 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16972 break;
16973
16974 /*
16975 * Regardless of what the actual policy is, we're going to
16976 * temporarily set our resize policy to be manual. We're
16977 * also going to temporarily set our CPU option to denote
16978 * the newly configured CPU.
16979 */
16980 rs = opt[DTRACEOPT_BUFRESIZE];
16981 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16982 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16983
16984 (void) dtrace_state_buffers(state);
16985
16986 opt[DTRACEOPT_BUFRESIZE] = rs;
16987 opt[DTRACEOPT_CPU] = c;
16988
16989 break;
16990 }
16991
16992 case CPU_UNCONFIG:
16993 /*
16994 * We don't free the buffer in the CPU_UNCONFIG case. (The
16995 * buffer will be freed when the consumer exits.)
16996 */
16997 break;
16998
16999 default:
17000 break;
17001 }
17002
17003 mutex_exit(&dtrace_lock);
17004 return (0);
17005 }
17006
17007 #ifdef illumos
17008 static void
17009 dtrace_cpu_setup_initial(processorid_t cpu)
17010 {
17011 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17012 }
17013 #endif
17014
17015 static void
17016 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17017 {
17018 if (dtrace_toxranges >= dtrace_toxranges_max) {
17019 int osize, nsize;
17020 dtrace_toxrange_t *range;
17021
17022 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17023
17024 if (osize == 0) {
17025 ASSERT(dtrace_toxrange == NULL);
17026 ASSERT(dtrace_toxranges_max == 0);
17027 dtrace_toxranges_max = 1;
17028 } else {
17029 dtrace_toxranges_max <<= 1;
17030 }
17031
17032 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17033 range = kmem_zalloc(nsize, KM_SLEEP);
17034
17035 if (dtrace_toxrange != NULL) {
17036 ASSERT(osize != 0);
17037 bcopy(dtrace_toxrange, range, osize);
17038 kmem_free(dtrace_toxrange, osize);
17039 }
17040
17041 dtrace_toxrange = range;
17042 }
17043
17044 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17045 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17046
17047 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17048 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17049 dtrace_toxranges++;
17050 }
17051
17052 static void
17053 dtrace_getf_barrier(void)
17054 {
17055 #ifdef illumos
17056 /*
17057 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
17058 * that contain calls to getf(), this routine will be called on every
17059 * closef() before either the underlying vnode is released or the
17060 * file_t itself is freed. By the time we are here, it is essential
17061 * that the file_t can no longer be accessed from a call to getf()
17062 * in probe context -- that assures that a dtrace_sync() can be used
17063 * to clear out any enablings referring to the old structures.
17064 */
17065 if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
17066 kcred->cr_zone->zone_dtrace_getf != 0)
17067 dtrace_sync();
17068 #endif
17069 }
17070
17071 /*
17072 * DTrace Driver Cookbook Functions
17073 */
17074 #ifdef illumos
17075 /*ARGSUSED*/
17076 static int
17077 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
17078 {
17079 dtrace_provider_id_t id;
17080 dtrace_state_t *state = NULL;
17081 dtrace_enabling_t *enab;
17082
17083 mutex_enter(&cpu_lock);
17084 mutex_enter(&dtrace_provider_lock);
17085 mutex_enter(&dtrace_lock);
17086
17087 if (ddi_soft_state_init(&dtrace_softstate,
17088 sizeof (dtrace_state_t), 0) != 0) {
17089 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
17090 mutex_exit(&cpu_lock);
17091 mutex_exit(&dtrace_provider_lock);
17092 mutex_exit(&dtrace_lock);
17093 return (DDI_FAILURE);
17094 }
17095
17096 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
17097 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
17098 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
17099 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
17100 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
17101 ddi_remove_minor_node(devi, NULL);
17102 ddi_soft_state_fini(&dtrace_softstate);
17103 mutex_exit(&cpu_lock);
17104 mutex_exit(&dtrace_provider_lock);
17105 mutex_exit(&dtrace_lock);
17106 return (DDI_FAILURE);
17107 }
17108
17109 ddi_report_dev(devi);
17110 dtrace_devi = devi;
17111
17112 dtrace_modload = dtrace_module_loaded;
17113 dtrace_modunload = dtrace_module_unloaded;
17114 dtrace_cpu_init = dtrace_cpu_setup_initial;
17115 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17116 dtrace_helpers_fork = dtrace_helpers_duplicate;
17117 dtrace_cpustart_init = dtrace_suspend;
17118 dtrace_cpustart_fini = dtrace_resume;
17119 dtrace_debugger_init = dtrace_suspend;
17120 dtrace_debugger_fini = dtrace_resume;
17121
17122 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17123
17124 ASSERT(MUTEX_HELD(&cpu_lock));
17125
17126 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
17127 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17128 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
17129 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
17130 VM_SLEEP | VMC_IDENTIFIER);
17131 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
17132 1, INT_MAX, 0);
17133
17134 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
17135 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
17136 NULL, NULL, NULL, NULL, NULL, 0);
17137
17138 ASSERT(MUTEX_HELD(&cpu_lock));
17139 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
17140 offsetof(dtrace_probe_t, dtpr_nextmod),
17141 offsetof(dtrace_probe_t, dtpr_prevmod));
17142
17143 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
17144 offsetof(dtrace_probe_t, dtpr_nextfunc),
17145 offsetof(dtrace_probe_t, dtpr_prevfunc));
17146
17147 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
17148 offsetof(dtrace_probe_t, dtpr_nextname),
17149 offsetof(dtrace_probe_t, dtpr_prevname));
17150
17151 if (dtrace_retain_max < 1) {
17152 cmn_err(CE_WARN, "illegal value (%zu) for dtrace_retain_max; "
17153 "setting to 1", dtrace_retain_max);
17154 dtrace_retain_max = 1;
17155 }
17156
17157 /*
17158 * Now discover our toxic ranges.
17159 */
17160 dtrace_toxic_ranges(dtrace_toxrange_add);
17161
17162 /*
17163 * Before we register ourselves as a provider to our own framework,
17164 * we would like to assert that dtrace_provider is NULL -- but that's
17165 * not true if we were loaded as a dependency of a DTrace provider.
17166 * Once we've registered, we can assert that dtrace_provider is our
17167 * pseudo provider.
17168 */
17169 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17170 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17171
17172 ASSERT(dtrace_provider != NULL);
17173 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17174
17175 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17176 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
17177 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17178 dtrace_provider, NULL, NULL, "END", 0, NULL);
17179 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17180 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
17181
17182 dtrace_anon_property();
17183 mutex_exit(&cpu_lock);
17184
17185 /*
17186 * If there are already providers, we must ask them to provide their
17187 * probes, and then match any anonymous enabling against them. Note
17188 * that there should be no other retained enablings at this time:
17189 * the only retained enablings at this time should be the anonymous
17190 * enabling.
17191 */
17192 if (dtrace_anon.dta_enabling != NULL) {
17193 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17194
17195 dtrace_enabling_provide(NULL);
17196 state = dtrace_anon.dta_state;
17197
17198 /*
17199 * We couldn't hold cpu_lock across the above call to
17200 * dtrace_enabling_provide(), but we must hold it to actually
17201 * enable the probes. We have to drop all of our locks, pick
17202 * up cpu_lock, and regain our locks before matching the
17203 * retained anonymous enabling.
17204 */
17205 mutex_exit(&dtrace_lock);
17206 mutex_exit(&dtrace_provider_lock);
17207
17208 mutex_enter(&cpu_lock);
17209 mutex_enter(&dtrace_provider_lock);
17210 mutex_enter(&dtrace_lock);
17211
17212 if ((enab = dtrace_anon.dta_enabling) != NULL)
17213 (void) dtrace_enabling_match(enab, NULL);
17214
17215 mutex_exit(&cpu_lock);
17216 }
17217
17218 mutex_exit(&dtrace_lock);
17219 mutex_exit(&dtrace_provider_lock);
17220
17221 if (state != NULL) {
17222 /*
17223 * If we created any anonymous state, set it going now.
17224 */
17225 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17226 }
17227
17228 return (DDI_SUCCESS);
17229 }
17230 #endif /* illumos */
17231
17232 #ifndef illumos
17233 static void dtrace_dtr(void *);
17234 #endif
17235
17236 /*ARGSUSED*/
17237 static int
17238 #ifdef illumos
17239 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17240 #else
17241 dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
17242 #endif
17243 {
17244 dtrace_state_t *state;
17245 uint32_t priv;
17246 uid_t uid;
17247 zoneid_t zoneid;
17248
17249 #ifdef illumos
17250 if (getminor(*devp) == DTRACEMNRN_HELPER)
17251 return (0);
17252
17253 /*
17254 * If this wasn't an open with the "helper" minor, then it must be
17255 * the "dtrace" minor.
17256 */
17257 if (getminor(*devp) != DTRACEMNRN_DTRACE)
17258 return (ENXIO);
17259 #else
17260 cred_t *cred_p = dev->si_cred;
17261 #endif
17262
17263 /*
17264 * If no DTRACE_PRIV_* bits are set in the credential, then the
17265 * caller lacks sufficient permission to do anything with DTrace.
17266 */
17267 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17268 if (priv == DTRACE_PRIV_NONE)
17269 return (EACCES);
17273
17274 /*
17275 * Ask all providers to provide all their probes.
17276 */
17277 mutex_enter(&dtrace_provider_lock);
17278 dtrace_probe_provide(NULL, NULL);
17279 mutex_exit(&dtrace_provider_lock);
17280
17281 mutex_enter(&cpu_lock);
17282 mutex_enter(&dtrace_lock);
17283 dtrace_opens++;
17284 dtrace_membar_producer();
17285
17286 #ifdef illumos
17287 /*
17288 * If the kernel debugger is active (that is, if the kernel debugger
17289 * modified text in some way), we won't allow the open.
17290 */
17291 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17292 dtrace_opens--;
17293 mutex_exit(&cpu_lock);
17294 mutex_exit(&dtrace_lock);
17295 return (EBUSY);
17296 }
17297
17298 if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
17299 /*
17300 * If DTrace helper tracing is enabled, we need to allocate the
17301 * trace buffer and initialize the values.
17302 */
17303 dtrace_helptrace_buffer =
17304 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17305 dtrace_helptrace_next = 0;
17306 dtrace_helptrace_wrapped = 0;
17307 dtrace_helptrace_enable = 0;
17308 }
17309
17310 state = dtrace_state_create(devp, cred_p);
17311 #else
17312 state = dtrace_state_create(dev, NULL);
17313 devfs_set_cdevpriv(state, dtrace_dtr);
17314 #endif
17315
17316 mutex_exit(&cpu_lock);
17317
17318 if (state == NULL) {
17319 #ifdef illumos
17320 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17321 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17322 #else
17323 --dtrace_opens;
17324 #endif
17325 mutex_exit(&dtrace_lock);
17326 return (EAGAIN);
17327 }
17328
17329 mutex_exit(&dtrace_lock);
17330
17331 return (0);
17332 }
17333
17334 /*ARGSUSED*/
17335 #ifdef illumos
17336 static int
17337 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17338 #else
17339 static void
17340 dtrace_dtr(void *data)
17341 #endif
17342 {
17343 #ifdef illumos
17344 minor_t minor = getminor(dev);
17345 dtrace_state_t *state;
17346 #endif
17347 dtrace_helptrace_t *buf = NULL;
17348
17349 #ifdef illumos
17350 if (minor == DTRACEMNRN_HELPER)
17351 return (0);
17352
17353 state = ddi_get_soft_state(dtrace_softstate, minor);
17354 #else
17355 dtrace_state_t *state = data;
17356 #endif
17357
17358 mutex_enter(&cpu_lock);
17359 mutex_enter(&dtrace_lock);
17360
17361 #ifdef illumos
17362 if (state->dts_anon)
17363 #else
17364 if (state != NULL && state->dts_anon)
17365 #endif
17366 {
17367 /*
17368 * There is anonymous state. Destroy that first.
17369 */
17370 ASSERT(dtrace_anon.dta_state == NULL);
17371 dtrace_state_destroy(state->dts_anon);
17372 }
17373
17374 if (dtrace_helptrace_disable) {
17375 /*
17376 * If we have been told to disable helper tracing, set the
17377 * buffer to NULL before calling into dtrace_state_destroy();
17378 * we take advantage of its dtrace_sync() to know that no
17379 * CPU is in probe context with enabled helper tracing
17380 * after it returns.
17381 */
17382 buf = dtrace_helptrace_buffer;
17383 dtrace_helptrace_buffer = NULL;
17384 }
17385
17386 #ifdef illumos
17387 dtrace_state_destroy(state);
17388 #else
17389 if (state != NULL) {
17390 dtrace_state_destroy(state);
17391 kmem_free(state, 0);
17392 }
17393 #endif
17394 ASSERT(dtrace_opens > 0);
17395
17396 #ifdef illumos
17397 /*
17398 * Only relinquish control of the kernel debugger interface when there
17399 * are no consumers and no anonymous enablings.
17400 */
17401 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17402 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17403 #else
17404 --dtrace_opens;
17405 #endif
17406
17407 if (buf != NULL) {
17408 kmem_free(buf, dtrace_helptrace_bufsize);
17409 dtrace_helptrace_disable = 0;
17410 }
17411
17412 mutex_exit(&dtrace_lock);
17413 mutex_exit(&cpu_lock);
17414
17415 #ifdef illumos
17416 return (0);
17417 #endif
17418 }
17419
17420 #ifdef illumos
17421 /*ARGSUSED*/
17422 static int
17423 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
17424 {
17425 int rval;
17426 dof_helper_t help, *dhp = NULL;
17427
17428 switch (cmd) {
17429 case DTRACEHIOC_ADDDOF:
17430 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
17431 dtrace_dof_error(NULL, "failed to copyin DOF helper");
17432 return (EFAULT);
17433 }
17434
17435 dhp = &help;
17436 arg = (intptr_t)help.dofhp_dof;
17437 /*FALLTHROUGH*/
17438
17439 case DTRACEHIOC_ADD: {
17440 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
17441
17442 if (dof == NULL)
17443 return (rval);
17444
17445 mutex_enter(&dtrace_lock);
17446
17447 /*
17448 * dtrace_helper_slurp() takes responsibility for the dof --
17449 * it may free it now or it may save it and free it later.
17450 */
17451 if ((rval = dtrace_helper_slurp(dof, dhp, curproc)) != -1) {
17452 *rv = rval;
17453 rval = 0;
17454 } else {
17455 rval = EINVAL;
17456 }
17457
17458 mutex_exit(&dtrace_lock);
17459 return (rval);
17460 }
17461
17462 case DTRACEHIOC_REMOVE: {
17463 mutex_enter(&dtrace_lock);
17464 rval = dtrace_helper_destroygen(NULL, arg);
17465 mutex_exit(&dtrace_lock);
17466
17467 return (rval);
17468 }
17469
17470 default:
17471 break;
17472 }
17473
17474 return (ENOTTY);
17475 }
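
/*
 * Illustrative sketch (not part of the driver): user-level registration
 * of helper DOF, roughly as the USDT startup glue (drti.o) does.  The
 * device path and the module name are assumptions for the sake of the
 * example:
 *
 *	dof_helper_t dh;
 *	int fd = open("/dev/dtrace/helper", O_RDWR);
 *	int gen;
 *
 *	dh.dofhp_dof = (uint64_t)(uintptr_t)dof;  // DOF linked into a.out
 *	dh.dofhp_addr = (uint64_t)(uintptr_t)dof;
 *	(void) strlcpy(dh.dofhp_mod, "a.out", sizeof (dh.dofhp_mod));
 *
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);  // *rv: generation number
 *	...
 *	(void) ioctl(fd, DTRACEHIOC_REMOVE, gen); // discard that generation
 */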

/*ARGSUSED*/
static int
dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
	minor_t minor = getminor(dev);
	dtrace_state_t *state;
	int rval;

	if (minor == DTRACEMNRN_HELPER)
		return (dtrace_ioctl_helper(cmd, arg, rv));

	state = ddi_get_soft_state(dtrace_softstate, minor);

	if (state->dts_anon) {
		ASSERT(dtrace_anon.dta_state == NULL);
		state = state->dts_anon;
	}

	switch (cmd) {
	case DTRACEIOC_PROVIDER: {
		dtrace_providerdesc_t pvd;
		dtrace_provider_t *pvp;

		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
			return (EFAULT);

		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
		mutex_enter(&dtrace_provider_lock);

		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
				break;
		}

		mutex_exit(&dtrace_provider_lock);

		if (pvp == NULL)
			return (ESRCH);

		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));

		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
			return (EFAULT);

		return (0);
	}
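
	/*
	 * Illustrative sketch (not part of the driver): querying a
	 * provider's stability attributes and privileges from user level;
	 * the provider name here is only an example:
	 *
	 *	dtrace_providerdesc_t pvd;
	 *
	 *	bzero(&pvd, sizeof (pvd));
	 *	(void) strlcpy(pvd.dtvd_name, "syscall", sizeof (pvd.dtvd_name));
	 *	if (ioctl(fd, DTRACEIOC_PROVIDER, &pvd) == 0)
	 *		...	// pvd.dtvd_attr and pvd.dtvd_priv filled in
	 */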

	case DTRACEIOC_EPROBE: {
		dtrace_eprobedesc_t epdesc;
		dtrace_ecb_t *ecb;
		dtrace_action_t *act;
		void *buf;
		size_t size;
		uintptr_t dest;
		int nrecs;

		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		if (ecb->dte_probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
		epdesc.dtepd_uarg = ecb->dte_uarg;
		epdesc.dtepd_size = ecb->dte_size;

		nrecs = epdesc.dtepd_nrecs;
		epdesc.dtepd_nrecs = 0;
		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			epdesc.dtepd_nrecs++;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock across
		 * the copyout(), below.
		 */
		size = sizeof (dtrace_eprobedesc_t) +
		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			if (nrecs-- == 0)
				break;

			bcopy(&act->dta_rec, (void *)dest,
			    sizeof (dtrace_recdesc_t));
			dest += sizeof (dtrace_recdesc_t);
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}
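
	/*
	 * Illustrative sketch (not part of the driver): fetching an enabled
	 * probe description from user level.  The consumer guesses a record
	 * count and, if the kernel reports more records than it offered
	 * room for, grows the buffer and reissues the ioctl;
	 * DTRACEIOC_AGGDESC below follows the same protocol.  The initial
	 * one-record guess is an assumption:
	 *
	 *	dtrace_eprobedesc_t *epd;
	 *
	 *	epd = malloc(sizeof (*epd));	// room for dtepd_rec[1]
	 *	epd->dtepd_epid = epid;		// EPID seen in a buffer record
	 *	epd->dtepd_nrecs = 1;
	 *	if (ioctl(fd, DTRACEIOC_EPROBE, epd) == 0 &&
	 *	    epd->dtepd_nrecs > 1)
	 *		...			// realloc() and reissue
	 */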

	case DTRACEIOC_AGGDESC: {
		dtrace_aggdesc_t aggdesc;
		dtrace_action_t *act;
		dtrace_aggregation_t *agg;
		int nrecs;
		uint32_t offs;
		dtrace_recdesc_t *lrec;
		void *buf;
		size_t size;
		uintptr_t dest;

		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;

		nrecs = aggdesc.dtagd_nrecs;
		aggdesc.dtagd_nrecs = 0;

		offs = agg->dtag_base;
		lrec = &agg->dtag_action.dta_rec;
		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;

		for (act = agg->dtag_first; ; act = act->dta_next) {
			ASSERT(act->dta_intuple ||
			    DTRACEACT_ISAGG(act->dta_kind));

			/*
			 * If this action has a record size of zero, it
			 * denotes an argument to the aggregating action.
			 * Because the presence of this record doesn't (or
			 * shouldn't) affect the way the data is interpreted,
			 * we don't copy it out, sparing user level the
			 * confusion of dealing with a zero-length record.
			 */
			if (act->dta_rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			aggdesc.dtagd_nrecs++;

			if (act == &agg->dtag_action)
				break;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock across
		 * the copyout(), below.
		 */
		size = sizeof (dtrace_aggdesc_t) +
		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);

		for (act = agg->dtag_first; ; act = act->dta_next) {
			dtrace_recdesc_t rec = act->dta_rec;

			/*
			 * See the comment in the above loop for why we pass
			 * over zero-length records.
			 */
			if (rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			if (nrecs-- == 0)
				break;

			rec.dtrd_offset -= offs;
			bcopy(&rec, (void *)dest, sizeof (rec));
			dest += sizeof (dtrace_recdesc_t);

			if (act == &agg->dtag_action)
				break;
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	case DTRACEIOC_ENABLE: {
		dof_hdr_t *dof;
		dtrace_enabling_t *enab = NULL;
		dtrace_vstate_t *vstate;
		int err = 0;

		*rv = 0;

		/*
		 * If a NULL argument has been passed, we take this as our
		 * cue to reevaluate our enablings.
		 */
		if (arg == 0) {
			dtrace_enabling_matchall();

			return (0);
		}

		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
			return (rval);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_lock);
		vstate = &state->dts_vstate;

		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EBUSY);
		}

		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EINVAL);
		}

		if ((rval = dtrace_dof_options(dof, state)) != 0) {
			dtrace_enabling_destroy(enab);
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (rval);
		}

		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
			err = dtrace_enabling_retain(enab);
		} else {
			dtrace_enabling_destroy(enab);
		}

		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		dtrace_dof_destroy(dof);

		return (err);
	}
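
	/*
	 * Illustrative sketch (not part of the driver): a consumer hands
	 * the kernel a DOF image describing its enablings (libdtrace builds
	 * one from compiled D) and learns how many probes matched; the
	 * NULL-argument form only reevaluates retained enablings:
	 *
	 *	int matched;
	 *
	 *	matched = ioctl(fd, DTRACEIOC_ENABLE, dof);  // dof_hdr_t *
	 *	...
	 *	(void) ioctl(fd, DTRACEIOC_ENABLE, NULL);    // rematch only
	 */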

	case DTRACEIOC_REPLICATE: {
		dtrace_repldesc_t desc;
		dtrace_probedesc_t *match = &desc.dtrpd_match;
		dtrace_probedesc_t *create = &desc.dtrpd_create;
		int err;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		mutex_enter(&dtrace_lock);
		err = dtrace_enabling_replicate(state, match, create);
		mutex_exit(&dtrace_lock);

		return (err);
	}

	case DTRACEIOC_PROBEMATCH:
	case DTRACEIOC_PROBES: {
		dtrace_probe_t *probe = NULL;
		dtrace_probedesc_t desc;
		dtrace_probekey_t pkey;
		dtrace_id_t i;
		int m = 0;
		uint32_t priv;
		uid_t uid;
		zoneid_t zoneid;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		/*
		 * Before we attempt to match this probe, we want to give
		 * all providers the opportunity to provide it.
		 */
		if (desc.dtpd_id == DTRACE_IDNONE) {
			mutex_enter(&dtrace_provider_lock);
			dtrace_probe_provide(&desc, NULL);
			mutex_exit(&dtrace_provider_lock);
			desc.dtpd_id++;
		}

		if (cmd == DTRACEIOC_PROBEMATCH) {
			dtrace_probekey(&desc, &pkey);
			pkey.dtpk_id = DTRACE_IDNONE;
		}

		dtrace_cred2priv(cr, &priv, &uid, &zoneid);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_PROBEMATCH) {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    (m = dtrace_match_probe(probe, &pkey,
				    priv, uid, zoneid)) != 0)
					break;
			}

			if (m < 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}

		} else {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    dtrace_match_priv(probe, priv, uid, zoneid))
					break;
			}
		}

		if (probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (ESRCH);
		}

		dtrace_probe_description(probe, &desc);
		mutex_exit(&dtrace_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}
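
	/*
	 * Illustrative sketch (not part of the driver): listing probes the
	 * way "dtrace -l" does -- walk the probe IDs upward, letting the
	 * kernel return the next visible probe at or above dtpd_id:
	 *
	 *	dtrace_probedesc_t desc;
	 *
	 *	bzero(&desc, sizeof (desc));
	 *	desc.dtpd_id = DTRACE_IDNONE;
	 *	while (ioctl(fd, DTRACEIOC_PROBES, &desc) == 0) {
	 *		...			// desc describes one probe
	 *		desc.dtpd_id++;		// resume past it
	 *	}
	 *	...				// ESRCH: no more probes
	 */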

	case DTRACEIOC_PROBEARG: {
		dtrace_argdesc_t desc;
		dtrace_probe_t *probe;
		dtrace_provider_t *prov;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtargd_id == DTRACE_IDNONE)
			return (EINVAL);

		if (desc.dtargd_ndx == DTRACE_ARGNONE)
			return (EINVAL);

		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);

		if (desc.dtargd_id > dtrace_nprobes) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		mutex_exit(&dtrace_lock);

		prov = probe->dtpr_provider;

		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
			/*
			 * There isn't any typed information for this probe.
			 * Set the argument number to DTRACE_ARGNONE.
			 */
			desc.dtargd_ndx = DTRACE_ARGNONE;
		} else {
			desc.dtargd_native[0] = '\0';
			desc.dtargd_xlate[0] = '\0';
			desc.dtargd_mapping = desc.dtargd_ndx;

			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
			    probe->dtpr_id, probe->dtpr_arg, &desc);
		}

		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}
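
	/*
	 * Illustrative sketch (not part of the driver): asking for the type
	 * of a single probe argument; the probe ID and argument index are
	 * assumptions:
	 *
	 *	dtrace_argdesc_t adesc;
	 *
	 *	bzero(&adesc, sizeof (adesc));
	 *	adesc.dtargd_id = id;		// e.g. from DTRACEIOC_PROBES
	 *	adesc.dtargd_ndx = 0;		// args[0]
	 *	if (ioctl(fd, DTRACEIOC_PROBEARG, &adesc) == 0 &&
	 *	    adesc.dtargd_ndx != DTRACE_ARGNONE)
	 *		...		// adesc.dtargd_native names the type
	 */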

	case DTRACEIOC_GO: {
		processorid_t cpuid;
		rval = dtrace_state_go(state, &cpuid);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STOP: {
		processorid_t cpuid;

		mutex_enter(&dtrace_lock);
		rval = dtrace_state_stop(state, &cpuid);
		mutex_exit(&dtrace_lock);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}
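
	/*
	 * Illustrative sketch (not part of the driver): the usual session
	 * lifecycle stitches the ioctls above together:
	 *
	 *	processorid_t cpu;
	 *
	 *	(void) ioctl(fd, DTRACEIOC_ENABLE, dof);  // install enablings
	 *	(void) ioctl(fd, DTRACEIOC_GO, &cpu);	  // start tracing
	 *	...					  // consume buffers
	 *	(void) ioctl(fd, DTRACEIOC_STOP, &cpu);	  // end the session
	 */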

	case DTRACEIOC_DOFGET: {
		dof_hdr_t hdr, *dof;
		uint64_t len;

		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);
		dof = dtrace_dof_create(state);
		mutex_exit(&dtrace_lock);

		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;
			desc.dtbd_timestamp = dtrace_gethrtime();

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set.  If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;
		desc.dtbd_timestamp = buf->dtb_switched;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}
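
	/*
	 * Illustrative sketch (not part of the driver): snapshotting the
	 * principal buffer for one CPU.  The consumer must supply a data
	 * buffer at least as large as the buffer size it configured; cpu
	 * and bufsize are assumptions here:
	 *
	 *	dtrace_bufdesc_t bd;
	 *
	 *	bzero(&bd, sizeof (bd));
	 *	bd.dtbd_cpu = cpu;
	 *	bd.dtbd_data = malloc(bufsize);
	 *	if (ioctl(fd, DTRACEIOC_BUFSNAP, &bd) == 0)
	 *		...	// bd.dtbd_size bytes of records to consume
	 *	...		// ENOENT: no buffer, or the CPU didn't switch
	 */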

	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}
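
	/*
	 * Illustrative sketch (not part of the driver): consumers poll
	 * status periodically (libdtrace does so from dtrace_status()) to
	 * notice drops and to detect exit() or a deadman kill:
	 *
	 *	dtrace_status_t st;
	 *
	 *	if (ioctl(fd, DTRACEIOC_STATUS, &st) == 0) {
	 *		if (st.dtst_exiting)
	 *			...	// exit() was called; tracing is draining
	 *		if (st.dtst_killed)
	 *			...	// the deadman killed this session
	 *	}
	 */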

	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is valid (nonzero and no
		 * greater than the number of formats), we can assert that
		 * the format map is non-NULL and that the format for the
		 * specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = strlen(str) + 1;

		if (len > fmt.dtfd_length) {
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}
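
	/*
	 * Illustrative sketch (not part of the driver): the two-call
	 * protocol implied above -- the first call learns the length, the
	 * second fetches the string.  rec is assumed to be a record
	 * description whose dtrd_format is nonzero:
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = rec->dtrd_format;
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);  // sets dtfd_length
	 *	fmt.dtfd_string = malloc(fmt.dtfd_length);
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);  // copies the string
	 */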

	default:
		break;
	}

	return (ENOTTY);
}

/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	ASSERT(dtrace_getf == 0);
	ASSERT(dtrace_closef == NULL);

	mutex_exit(&cpu_lock);

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;

	return (DDI_SUCCESS);
}
#endif

#ifdef illumos
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}
#endif

#ifdef illumos
static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
#else

static d_ioctl_t dtrace_ioctl;
static d_ioctl_t dtrace_ioctl_helper;
static void dtrace_load(void *);
static int dtrace_unload(void);
static struct cdev *dtrace_dev;
static struct cdev *helper_dev;

void dtrace_invop_init(void);
void dtrace_invop_uninit(void);

static struct cdevsw dtrace_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl,
	.d_open		= dtrace_open,
	.d_name		= "dtrace",
};

static struct cdevsw helper_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl_helper,
	.d_name		= "helper",
};

#include <dtrace_anon.c>
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
#include <dtrace_sysctl.c>
#include <dtrace_unload.c>
#include <dtrace_vtime.c>
#include <dtrace_hacks.c>

SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);

DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif