xref: /illumos-gate/usr/src/uts/common/xen/dtrace/xdt.c (revision 48215d30bccaf4a9d58050835b3eb6ed630a2fde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Xen event provider for DTrace
29  *
30  * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
31  * may disappear or be re-implemented at anytime.
32  *
33  * This provider isn't suitable as a general-purpose solution for a number of
34  * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
35  * have any way to gather data other than that collected by the Xen trace
36  * buffers. Further, it does not fit into the DTrace model (see "Interacting
37  * with DTrace" below.)
38  *
39  *
40  * Tracing in Xen
41  * --------------
42  *
43  * Xen implements a tracing facility for generating and collecting execution
44  * event traces from the hypervisor. When tracing is enabled, compiled in
45  * probes record events in contiguous per-CPU trace buffers.
46  *
47  *               +---------+
48  * +------+      |         |
49  * | CPUn |----> | BUFFERn |
50  * +------+      |         |
51  *               +---------+- tbuf.va + (tbuf.size * n)
52  *               :         :
53  *               +---------+
54  * +------+      |         |
55  * | CPU1 |----> | BUFFER1 |
56  * +------+      |         |
57  *               +---------+- tbuf.va + tbuf.size
58  * +------+      |         |
59  * | CPU0 |----> | BUFFER0 |
60  * +------+      |         |
61  *               +---------+- tbuf.va
62  *
63  * Each CPU buffer consists of a metadata header followed by the trace records.
64  * The metadata consists of a producer/consumer pair of pointers into the buffer
65  * that point to the next record to be written and the next record to be read
66  * respectively. The trace record format is as follows:
67  *
68  * +--------------------------------------------------------------------------+
69  * | CPUID(uint_t) | TSC(uint64_t) | EVENTID(uint32_t) |     DATA FIELDS      |
70  * +--------------------------------------------------------------------------+
71  *
72  * DATA FIELDS:
73  * +--------------------------------------------------------------------------+
74  * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | D4(uint32_t) | D5(uint32_t) |
75  * +--------------------------------------------------------------------------+
76  *
77  *
78  * Interacting with DTrace
79  * -----------------------
80  *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the event.
 * As a result of this periodic collection implementation, probe firings are
 * asynchronous. This is the only sensible way to implement this form of
85  * provider, but because of its asynchronous nature asking things like
86  * "current CPU" and, more importantly, arbitrary questions about the context
87  * surrounding the probe firing are not meaningful. So, consumers should not
88  * attempt to infer anything beyond what is supplied via the probe arguments.
89  */
90 
91 #include <sys/types.h>
92 #include <sys/sysmacros.h>
93 #include <sys/modctl.h>
94 #include <sys/sunddi.h>
95 #include <sys/ddi.h>
96 #include <sys/conf.h>
97 #include <sys/devops.h>
98 #include <sys/stat.h>
99 #include <sys/cmn_err.h>
100 #include <sys/dtrace.h>
101 #include <sys/sdt.h>
102 #include <sys/cyclic.h>
103 #include <vm/seg_kmem.h>
104 #include <vm/hat_i86.h>
105 #include <sys/hypervisor.h>
106 #include <xen/public/trace.h>
107 #include <xen/public/sched.h>
108 
#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 *
 * The argument is parenthesized so that compound expressions (e.g. a | b)
 * are compared as a whole rather than being split by operator precedence.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)

/*
 * Macros to extract the domid and cpuid from a HVM trace data field.
 * The domid lives in the upper 16 bits and the vcpuid in the lower 16 bits.
 * Arguments are parenthesized so compound expressions expand correctly.
 */
#define	HVM_DOMID(d)		((d) >> 16)
#define	HVM_VCPUID(d)		((d) & 0xFFFF)
122 
/*
 * Fire the DTrace probe that is currently enabled for `event', if any.
 *
 * xdt_probemap[event] holds the probe ID while the probe is enabled and 0
 * otherwise, so disabled events cost one array lookup. The physical cpuid
 * is always passed as the first probe argument, followed by up to four
 * event-specific arguments.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in unbraced if/else), and every argument is parenthesized
 * and evaluated exactly once.
 */
#define	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, arg3)		\
do {									\
	dtrace_id_t xdt_fire_id = xdt_probemap[(event)];		\
	if (xdt_fire_id != 0)						\
		dtrace_probe(xdt_fire_id, (cpuid), (arg0), (arg1),	\
		    (arg2), (arg3));					\
} while (0)

/* Convenience wrappers that pad the unused trailing arguments with 0. */
#define	XDT_PROBE3(event, cpuid, arg0, arg1, arg2) \
	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, 0)

#define	XDT_PROBE2(event, cpuid, arg0, arg1) \
	XDT_PROBE4(event, cpuid, arg0, arg1, 0, 0)

#define	XDT_PROBE1(event, cpuid, arg0) \
	XDT_PROBE4(event, cpuid, arg0, 0, 0, 0)

#define	XDT_PROBE0(event, cpuid) \
	XDT_PROBE4(event, cpuid, 0, 0, 0, 0)
140 
/*
 * Probe classes. Each class indexes xdt_classinfo[] and corresponds to one
 * hypervisor trace mask.
 */
enum {
	XDT_SCHED = 0,
	XDT_MEM = 1,
	XDT_HVM = 2,
	XDT_NCLASSES = 3
};

/*
 * Probe events. Each valid event indexes xdt_probemap[] and xdt_prid[];
 * XDT_NEVENTS is the number of valid events.
 */
enum {
	XDT_EVT_INVALID = -(int)1,
	XDT_SCHED_OFF_CPU = 0,
	XDT_SCHED_ON_CPU = 1,
	XDT_SCHED_IDLE_OFF_CPU = 2,
	XDT_SCHED_IDLE_ON_CPU = 3,
	XDT_SCHED_BLOCK = 4,
	XDT_SCHED_SLEEP = 5,
	XDT_SCHED_WAKE = 6,
	XDT_SCHED_YIELD = 7,
	XDT_SCHED_SHUTDOWN_POWEROFF = 8,
	XDT_SCHED_SHUTDOWN_REBOOT = 9,
	XDT_SCHED_SHUTDOWN_SUSPEND = 10,
	XDT_SCHED_SHUTDOWN_CRASH = 11,
	XDT_MEM_PAGE_GRANT_MAP = 12,
	XDT_MEM_PAGE_GRANT_UNMAP = 13,
	XDT_MEM_PAGE_GRANT_TRANSFER = 14,
	XDT_HVM_VMENTRY = 15,
	XDT_HVM_VMEXIT = 16,
	XDT_NEVENTS = 17
};
167 
/*
 * Static description of one probe the provider can publish. Entries of this
 * type make up the xdt_probe[] table and are handed to DTrace as the
 * per-probe argument (parg) when the probe is created.
 */
typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id (index into xdt_prid[]) */
	uint_t		class;		/* probe class (XDT_SCHED, ...) */
} xdt_probe_t;
174 
/*
 * Per-class bookkeeping: the hypervisor trace mask that turns the class on
 * and the number of probes of that class that are currently enabled.
 */
typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;
179 
/*
 * Per-physical-CPU scheduling state. It is filled in piecemeal from the
 * TRC_SCHED_SWITCH_INFPREV, TRC_SCHED_SWITCH_INFNEXT and TRC_SCHED_SWITCH
 * records and consumed when the off-cpu/on-cpu probes are fired.
 */
typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
} xdt_schedinfo_t;
189 
/*
 * Global state describing the hypervisor trace buffers as mapped into the
 * kernel: geometry of the mapping, per-CPU pointers into it, and counters
 * for records that could not be turned into probe firings.
 */
static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers (arrays of tbuf.cnt pointers) */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;
207 
/*
 * Names of the named kstats exported by xdt_kstat_init(). The order must
 * match the order in which xdt_kstat_update() assigns the values.
 */
static char *xdt_stats[] = {
	"dropped_recs",
};
211 
/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *     set xdt:xdt_tbuf_pages = 40
 *
 * The value is only handed to the hypervisor when it reports that no trace
 * buffers have been allocated yet (see xdt_attach_trace_buffers()).
 */
uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *     xdt_poll_nsec = 200000000;
 *
 * xdt_attach() uses XDT_POLL_DEFAULT when the property is absent and never
 * uses an interval shorter than XDT_POLL_MIN.
 */
static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */
229 
/*
 * Internal variables
 */
static dev_info_t *xdt_devi;			/* devinfo node set at attach */
static dtrace_provider_id_t xdt_id;		/* id from dtrace_register() */
static uint_t xdt_ncpus;			/* total number of phys CPUs */
static uint32_t cur_trace_mask;			/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;	/* buffer polling cyclic */
static kstat_t *xdt_kstats;			/* NULL if kstat_create failed */
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES]; /* per-class state */
243 
/*
 * Table of every probe this provider can publish, as
 * { module, name, event id, class }. The table is terminated by an entry
 * whose pr_mod is NULL; xdt_provide() relies on that sentinel when walking
 * the table.
 */
static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
		XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },

	/* end-of-table sentinel (pr_mod == NULL) */
	{ NULL }
};
271 
272 extern uint_t xen_get_nphyscpus(void);
273 
274 static inline uint32_t
275 xdt_nr_active_probes()
276 {
277 	int i;
278 	uint32_t tot = 0;
279 
280 	for (i = 0; i < XDT_NCLASSES; i++)
281 		tot += xdt_classinfo[i].cnt;
282 
283 	return (tot);
284 }
285 
286 static void
287 xdt_init_trace_masks(void)
288 {
289 	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
290 	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
291 	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
292 }
293 
294 static int
295 xdt_kstat_update(kstat_t *ksp, int flag)
296 {
297 	kstat_named_t *knp;
298 
299 	if (flag != KSTAT_READ)
300 		return (EACCES);
301 
302 	knp = ksp->ks_data;
303 
304 	/*
305 	 * Assignment order should match that of the names in
306 	 * xdt_stats.
307 	 */
308 	(knp++)->value.ui64 = tbuf.stat_dropped_recs;
309 
310 	return (0);
311 }
312 
313 static void
314 xdt_kstat_init(void)
315 {
316 	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
317 	char **cp = xdt_stats;
318 	kstat_named_t *knp;
319 
320 	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
321 	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
322 		return;
323 
324 	xdt_kstats->ks_update = xdt_kstat_update;
325 
326 	knp = xdt_kstats->ks_data;
327 	while (nstats > 0) {
328 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
329 		knp++;
330 		cp++;
331 		nstats--;
332 	}
333 
334 	kstat_install(xdt_kstats);
335 }
336 
337 static int
338 xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
339 {
340 	xen_sysctl_t op;
341 	int xerr;
342 
343 	op.cmd = XEN_SYSCTL_tbuf_op;
344 	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
345 	op.u.tbuf_op = *tbuf_op;
346 
347 	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
348 		return (xen_xlate_errcode(xerr));
349 
350 	*tbuf_op = op.u.tbuf_op;
351 	return (0);
352 }
353 
354 static int
355 xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
356 {
357 	x86pte_t pte;
358 	caddr_t const sva = va;
359 	caddr_t const eva = va + len;
360 	int xerr;
361 
362 	ASSERT(mfn != MFN_INVALID);
363 	ASSERT(va != NULL);
364 	ASSERT(IS_PAGEALIGNED(len));
365 
366 	for (; va < eva; va += MMU_PAGESIZE) {
367 		/*
368 		 * Ask the HAT to load a throwaway mapping to page zero, then
369 		 * overwrite it with the hypervisor mapping. It gets removed
370 		 * later via hat_unload().
371 		 */
372 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
373 		    PROT_READ | HAT_UNORDERED_OK,
374 		    HAT_LOAD_NOCONSIST | HAT_LOAD);
375 
376 		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
377 		    | PT_FOREIGN | PT_WRITABLE;
378 
379 		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
380 		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);
381 
382 		if (xerr != 0) {
383 			/* unmap pages loaded so far */
384 			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
385 			    (uintptr_t)sva;
386 			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
387 			return (xen_xlate_errcode(xerr));
388 		}
389 
390 		mfn++;
391 	}
392 
393 	return (0);
394 }
395 
396 static int
397 xdt_attach_trace_buffers(void)
398 {
399 	xen_sysctl_tbuf_op_t tbuf_op;
400 	size_t len;
401 	int err;
402 	uint_t i;
403 
404 	/*
405 	 * Xen does not support trace buffer re-sizing. If the buffers
406 	 * have already been allocated we just use them as is.
407 	 */
408 	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
409 	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
410 		return (err);
411 
412 	if (tbuf_op.size == 0) {
413 		/* set trace buffer size */
414 		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_size;
415 		tbuf_op.size = xdt_tbuf_pages;
416 		(void) xdt_sysctl_tbuf(&tbuf_op);
417 
418 		/* get trace buffer info */
419 		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
420 		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
421 			return (err);
422 
423 		if (tbuf_op.size == 0) {
424 			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
425 			return (ENOBUFS);
426 		}
427 	}
428 
429 	tbuf.size = tbuf_op.size;
430 	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
431 	tbuf.cnt = xdt_ncpus;
432 
433 	ASSERT(tbuf.start_mfn != MFN_INVALID);
434 	ASSERT(tbuf.cnt > 0);
435 
436 	len = tbuf.size * tbuf.cnt;
437 	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);
438 
439 	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
440 		vmem_free(heap_arena, tbuf.va, len);
441 		tbuf.va = NULL;
442 		return (err);
443 	}
444 
445 	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
446 	    KM_SLEEP);
447 	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
448 	    KM_SLEEP);
449 
450 	for (i = 0; i < tbuf.cnt; i++) {
451 		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
452 		tbuf.meta[i] = cpu_buf;
453 		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
454 		    sizeof (struct t_buf));
455 
456 		/* throw away stale trace records */
457 		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
458 	}
459 
460 	return (0);
461 }
462 
463 static void
464 xdt_detach_trace_buffers(void)
465 {
466 	size_t len = tbuf.size * tbuf.cnt;
467 
468 	ASSERT(tbuf.va != NULL);
469 
470 	hat_unload(kas.a_hat, tbuf.va, len,
471 	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
472 	vmem_free(heap_arena, tbuf.va, len);
473 	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
474 	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
475 }
476 
477 static inline void
478 xdt_process_rec(uint_t cpuid, struct t_rec *rec)
479 {
480 	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
481 	int eid;
482 
483 	ASSERT(rec != NULL);
484 	ASSERT(xdt_ncpus == xen_get_nphyscpus());
485 
486 	if (cpuid >= xdt_ncpus) {
487 		tbuf.stat_spurious_cpu++;
488 		return;
489 	}
490 
491 	switch (rec->event) {
492 
493 	/*
494 	 * Sched probes
495 	 */
496 	case TRC_SCHED_SWITCH_INFPREV:
497 		/*
498 		 * Info on vCPU being de-scheduled
499 		 *
500 		 * rec->data[0] = prev domid
501 		 * rec->data[1] = time spent on pcpu
502 		 */
503 		sp->prev_domid = rec->data[0];
504 		sp->prev_ctime = rec->data[1];
505 		break;
506 
507 	case TRC_SCHED_SWITCH_INFNEXT:
508 		/*
509 		 * Info on next vCPU to be scheduled
510 		 *
511 		 * rec->data[0] = next domid
512 		 * rec->data[1] = time spent waiting to get on cpu
513 		 * rec->data[2] = time slice
514 		 */
515 		sp->next_domid = rec->data[0];
516 		sp->next_wtime = rec->data[1];
517 		sp->next_ts = rec->data[2];
518 		break;
519 
520 	case TRC_SCHED_SWITCH:
521 		/*
522 		 * vCPU switch
523 		 *
524 		 * rec->data[0] = prev domid
525 		 * rec->data[1] = prev vcpuid
526 		 * rec->data[2] = next domid
527 		 * rec->data[3] = next vcpuid
528 		 */
529 		if (rec->data[0] != sp->prev_domid &&
530 		    rec->data[2] != sp->next_domid) {
531 			/* prev and next info don't match doms being sched'd */
532 			tbuf.stat_spurious_switch++;
533 			return;
534 		}
535 
536 		sp->prev_vcpuid = rec->data[1];
537 		sp->next_vcpuid = rec->data[3];
538 
539 		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid)?
540 		    XDT_SCHED_IDLE_OFF_CPU:XDT_SCHED_OFF_CPU,
541 		    cpuid, sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);
542 
543 		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid)?
544 		    XDT_SCHED_IDLE_ON_CPU:XDT_SCHED_ON_CPU,
545 		    cpuid, sp->next_domid, sp->next_vcpuid, sp->next_wtime,
546 		    sp->next_ts);
547 		break;
548 
549 	case TRC_SCHED_BLOCK:
550 		/*
551 		 * vCPU blocked
552 		 *
553 		 * rec->data[0] = domid
554 		 * rec->data[1] = vcpuid
555 		 */
556 		XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, rec->data[0], rec->data[1]);
557 		break;
558 
559 	case TRC_SCHED_SLEEP:
560 		/*
561 		 * Put vCPU to sleep
562 		 *
563 		 * rec->data[0] = domid
564 		 * rec->data[1] = vcpuid
565 		 */
566 		XDT_PROBE2(XDT_SCHED_SLEEP, cpuid, rec->data[0], rec->data[1]);
567 		break;
568 
569 	case TRC_SCHED_WAKE:
570 		/*
571 		 * Wake up vCPU
572 		 *
573 		 * rec->data[0] = domid
574 		 * rec->data[1] = vcpuid
575 		 */
576 		XDT_PROBE2(XDT_SCHED_WAKE, cpuid, rec->data[0], rec->data[1]);
577 		break;
578 
579 	case TRC_SCHED_YIELD:
580 		/*
581 		 * vCPU yielded
582 		 *
583 		 * rec->data[0] = domid
584 		 * rec->data[1] = vcpuid
585 		 */
586 		XDT_PROBE2(XDT_SCHED_YIELD, cpuid, rec->data[0], rec->data[1]);
587 		break;
588 
589 	case TRC_SCHED_SHUTDOWN:
590 		/*
591 		 * Guest shutting down
592 		 *
593 		 * rec->data[0] = domid
594 		 * rec->data[1] = initiating vcpu
595 		 * rec->data[2] = shutdown code
596 		 */
597 		switch (rec->data[2]) {
598 		case SHUTDOWN_poweroff:
599 			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
600 			break;
601 		case SHUTDOWN_reboot:
602 			eid = XDT_SCHED_SHUTDOWN_REBOOT;
603 			break;
604 		case SHUTDOWN_suspend:
605 			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
606 			break;
607 		case SHUTDOWN_crash:
608 			eid = XDT_SCHED_SHUTDOWN_CRASH;
609 			break;
610 		default:
611 			tbuf.stat_unknown_shutdown++;
612 			return;
613 		}
614 
615 		XDT_PROBE1(eid, cpuid, rec->data[0]);
616 		break;
617 
618 	/*
619 	 * Mem probes
620 	 */
621 	case TRC_MEM_PAGE_GRANT_MAP:
622 		/*
623 		 * Guest mapped page grant
624 		 *
625 		 * rec->data[0] = domid
626 		 */
627 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, cpuid, rec->data[0]);
628 		break;
629 
630 	case TRC_MEM_PAGE_GRANT_UNMAP:
631 		/*
632 		 * Guest unmapped page grant
633 		 *
634 		 * rec->data[0] = domid
635 		 */
636 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, cpuid, rec->data[0]);
637 		break;
638 
639 	case TRC_MEM_PAGE_GRANT_TRANSFER:
640 		/*
641 		 * Page grant is being transferred
642 		 *
643 		 * rec->data[0] = target domid
644 		 */
645 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, cpuid, rec->data[0]);
646 		break;
647 
648 	/*
649 	 * HVM probes
650 	 */
651 	case TRC_HVM_VMENTRY:
652 		/*
653 		 * Return to guest via vmx_launch/vmrun
654 		 *
655 		 * rec->data[0] = (domid<<16 + vcpuid)
656 		 */
657 		XDT_PROBE2(XDT_HVM_VMENTRY, cpuid, HVM_DOMID(rec->data[0]),
658 		    HVM_VCPUID(rec->data[0]));
659 		break;
660 
661 	case TRC_HVM_VMEXIT:
662 		/*
663 		 * Entry into VMEXIT handler
664 		 *
665 		 * rec->data[0] = (domid<<16 + vcpuid)
666 		 * rec->data[1] = guest rip
667 		 * rec->data[2] = cpu vendor specific exit code
668 		 */
669 		XDT_PROBE4(XDT_HVM_VMEXIT, cpuid, HVM_DOMID(rec->data[0]),
670 		    HVM_VCPUID(rec->data[0]), rec->data[1], rec->data[2]);
671 		break;
672 
673 	case TRC_LOST_RECORDS:
674 		tbuf.stat_dropped_recs++;
675 		break;
676 
677 	default:
678 		tbuf.stat_unknown_recs++;
679 		break;
680 	}
681 }
682 
683 /*ARGSUSED*/
684 static void
685 xdt_tbuf_scan(void *arg)
686 {
687 	uint_t cpuid;
688 	size_t nrecs;
689 	struct t_rec *rec;
690 	uint32_t prod;
691 
692 	nrecs = (tbuf.size - sizeof (struct t_buf)) / sizeof (struct t_rec);
693 
694 	/* scan all cpu buffers for new records */
695 	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
696 		prod = tbuf.meta[cpuid]->prod;
697 		membar_consumer(); /* read prod /then/ data */
698 		while (tbuf.meta[cpuid]->cons != prod) {
699 			rec = tbuf.data[cpuid] + tbuf.meta[cpuid]->cons % nrecs;
700 			xdt_process_rec(cpuid, rec);
701 			membar_exit(); /* read data /then/ update cons */
702 			tbuf.meta[cpuid]->cons++;
703 		}
704 	}
705 }
706 
707 static void
708 xdt_cyclic_enable(void)
709 {
710 	cyc_handler_t hdlr;
711 	cyc_time_t when;
712 
713 	ASSERT(MUTEX_HELD(&cpu_lock));
714 
715 	hdlr.cyh_func = xdt_tbuf_scan;
716 	hdlr.cyh_arg = NULL;
717 	hdlr.cyh_level = CY_LOW_LEVEL;
718 
719 	when.cyt_interval = xdt_poll_nsec;
720 	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
721 
722 	xdt_cyclic = cyclic_add(&hdlr, &when);
723 }
724 
725 static void
726 xdt_probe_create(xdt_probe_t *p)
727 {
728 	ASSERT(p != NULL && p->pr_mod != NULL);
729 
730 	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
731 		return;
732 
733 	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
734 	    p->pr_name, dtrace_mach_aframes(), p);
735 }
736 
737 /*ARGSUSED*/
738 static void
739 xdt_provide(void *arg, const dtrace_probedesc_t *desc)
740 {
741 	const char *mod, *name;
742 	int i;
743 
744 	if (desc == NULL) {
745 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
746 			xdt_probe_create(&xdt_probe[i]);
747 		}
748 	} else {
749 		mod = desc->dtpd_mod;
750 		name = desc->dtpd_name;
751 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
752 			int l1 = strlen(xdt_probe[i].pr_name);
753 			int l2 = strlen(xdt_probe[i].pr_mod);
754 			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
755 			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
756 				break;
757 		}
758 
759 		if (xdt_probe[i].pr_mod == NULL)
760 			return;
761 		xdt_probe_create(&xdt_probe[i]);
762 	}
763 
764 }
765 
766 /*ARGSUSED*/
767 static void
768 xdt_destroy(void *arg, dtrace_id_t id, void *parg)
769 {
770 	xdt_probe_t *p = parg;
771 	xdt_prid[p->evt_id] = 0;
772 }
773 
774 static void
775 xdt_set_trace_mask(uint32_t mask)
776 {
777 	xen_sysctl_tbuf_op_t tbuf_op;
778 
779 	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_evt_mask;
780 	tbuf_op.evt_mask = mask;
781 	(void) xdt_sysctl_tbuf(&tbuf_op);
782 }
783 
784 /*ARGSUSED*/
785 static void
786 xdt_enable(void *arg, dtrace_id_t id, void *parg)
787 {
788 	xdt_probe_t *p = parg;
789 	xen_sysctl_tbuf_op_t tbuf_op;
790 
791 	ASSERT(MUTEX_HELD(&cpu_lock));
792 	ASSERT(xdt_prid[p->evt_id] != 0);
793 
794 	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
795 	xdt_classinfo[p->class].cnt++;
796 
797 	if (xdt_classinfo[p->class].cnt == 1) {
798 		/* set the trace mask for this class */
799 		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
800 		xdt_set_trace_mask(cur_trace_mask);
801 	}
802 
803 	if (xdt_cyclic == CYCLIC_NONE) {
804 		/*
805 		 * DTrace doesn't have the notion of failing an enabling. It
806 		 * works on the premise that, if you have advertised a probe
807 		 * via the pops->dtps_provide() function, you can enable it.
808 		 * Failure is not an option. In the case where we can't enable
809 		 * Xen tracing the consumer will carry on regardless and
810 		 * think all is OK except the probes will never fire.
811 		 */
812 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
813 		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
814 			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
815 			return;
816 		}
817 
818 		xdt_cyclic_enable();
819 	}
820 }
821 
822 /*ARGSUSED*/
823 static void
824 xdt_disable(void *arg, dtrace_id_t id, void *parg)
825 {
826 	xdt_probe_t *p = parg;
827 	xen_sysctl_tbuf_op_t tbuf_op;
828 	int i, err;
829 
830 	ASSERT(MUTEX_HELD(&cpu_lock));
831 	ASSERT(xdt_probemap[p->evt_id] != 0);
832 	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
833 	ASSERT(xdt_classinfo[p->class].cnt > 0);
834 
835 	/*
836 	 * We could be here in the slight window between the cyclic firing and
837 	 * a call to dtrace_probe() occurring. We need to be careful if we tear
838 	 * down any shared state.
839 	 */
840 
841 	xdt_probemap[p->evt_id] = 0;
842 	xdt_classinfo[p->class].cnt--;
843 
844 	if (xdt_nr_active_probes() == 0) {
845 		cur_trace_mask = 0;
846 
847 		if (xdt_cyclic == CYCLIC_NONE)
848 			return;
849 
850 		/*
851 		 * We will try to disable the trace buffers. If we fail for some
852 		 * reason we will try again, up to a count of XDT_TBUF_RETRY.
853 		 * If we still aren't successful we try to set the trace mask
854 		 * to 0 in order to prevent trace records from being written.
855 		 */
856 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
857 		i = 0;
858 		do {
859 			err = xdt_sysctl_tbuf(&tbuf_op);
860 		} while ((err != 0) && (++i < XDT_TBUF_RETRY));
861 
862 		if (err != 0) {
863 			cmn_err(CE_NOTE,
864 			    "Couldn't disable hypervisor tracing.");
865 			xdt_set_trace_mask(0);
866 		} else {
867 			cyclic_remove(xdt_cyclic);
868 			xdt_cyclic = CYCLIC_NONE;
869 			/*
870 			 * We don't bother making the hypercall to set
871 			 * the trace mask, since it will be reset when
872 			 * tracing is re-enabled.
873 			 */
874 		}
875 	} else if (xdt_classinfo[p->class].cnt == 0) {
876 		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
877 		/* other probes are enabled, so add the sub-class mask back */
878 		cur_trace_mask |= 0xF000;
879 		xdt_set_trace_mask(cur_trace_mask);
880 	}
881 }
882 
/*
 * Stability attributes passed to dtrace_register(). Everything is marked
 * Private, consistent with the provider being private and subject to change.
 * NOTE(review): the rows presumably follow the dtrace_pattr_t field order
 * (provider, module, function, name, args) -- confirm against <sys/dtrace.h>.
 */
static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};
890 
/*
 * Provider operations vector passed to dtrace_register(). Only provide,
 * enable, disable and destroy are implemented; the remaining entry points
 * are left NULL.
 */
static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};
903 
904 static int
905 xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
906 {
907 	int val;
908 
909 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
910 		return (DDI_FAILURE);
911 
912 	switch (cmd) {
913 	case DDI_ATTACH:
914 		break;
915 
916 	case DDI_RESUME:
917 		/*
918 		 * We might support proper suspend/resume in the future, so,
919 		 * return DDI_FAILURE for now.
920 		 */
921 		return (DDI_FAILURE);
922 
923 	default:
924 		return (DDI_FAILURE);
925 	}
926 
927 	xdt_ncpus = xen_get_nphyscpus();
928 	ASSERT(xdt_ncpus > 0);
929 
930 	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
931 	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
932 	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
933 	    &xdt_pops, NULL, &xdt_id) != 0) {
934 		if (tbuf.va != NULL)
935 			xdt_detach_trace_buffers();
936 		ddi_remove_minor_node(devi, NULL);
937 		return (DDI_FAILURE);
938 	}
939 
940 	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
941 	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
942 	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);
943 
944 	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_alloc(xdt_ncpus *
945 	    sizeof (xdt_schedinfo_t), KM_SLEEP);
946 	xdt_init_trace_masks();
947 	xdt_kstat_init();
948 
949 	xdt_devi = devi;
950 	ddi_report_dev(devi);
951 	return (DDI_SUCCESS);
952 }
953 
954 static int
955 xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
956 {
957 	switch (cmd) {
958 	case DDI_DETACH:
959 		break;
960 
961 	case DDI_SUSPEND:
962 		/*
963 		 * We might support proper suspend/resume in the future. So
964 		 * return DDI_FAILURE for now.
965 		 */
966 		return (DDI_FAILURE);
967 
968 	default:
969 		return (DDI_FAILURE);
970 	}
971 
972 	if (dtrace_unregister(xdt_id) != 0)
973 		return (DDI_FAILURE);
974 
975 	xdt_detach_trace_buffers();
976 	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
977 	if (xdt_cyclic != CYCLIC_NONE)
978 		cyclic_remove(xdt_cyclic);
979 	if (xdt_kstats != NULL)
980 		kstat_delete(xdt_kstats);
981 	xdt_devi = (void *)0;
982 	ddi_remove_minor_node(devi, NULL);
983 
984 	return (DDI_SUCCESS);
985 }
986 
987 /*ARGSUSED*/
988 static int
989 xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
990 {
991 	int error;
992 
993 	switch (infocmd) {
994 	case DDI_INFO_DEVT2DEVINFO:
995 		*result = xdt_devi;
996 		error = DDI_SUCCESS;
997 		break;
998 	case DDI_INFO_DEVT2INSTANCE:
999 		*result = (void *)0;
1000 		error = DDI_SUCCESS;
1001 		break;
1002 	default:
1003 		error = DDI_FAILURE;
1004 	}
1005 	return (error);
1006 }
1007 
/*
 * Character/block entry points. The device node exists only so the driver
 * can attach; apart from open (nulldev) and the generic property handler,
 * every entry point is the nodev/nochpoll stub.
 */
static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};
1025 
/*
 * Device operations: getinfo, attach and detach are implemented by this
 * driver; the rest use the generic stubs.
 */
static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* power(9E) */
	ddi_quiesce_not_needed,		/* devo_quiesce */
};
1040 
1041 
/* Module linkage: this module is a device driver using xdt_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};
1047 
/* NULL-terminated list of linkage structures used by _init/_fini/_info. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
1053 
1054 int
1055 _init(void)
1056 {
1057 	return (mod_install(&modlinkage));
1058 }
1059 
1060 int
1061 _fini(void)
1062 {
1063 	return (mod_remove(&modlinkage));
1064 }
1065 
1066 int
1067 _info(struct modinfo *modinfop)
1068 {
1069 	return (mod_info(&modlinkage, modinfop));
1070 }
1071