/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution
 * and may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number
 * of reasons. First and foremost, we rely on the Xen tracing mechanism and
 * don't have any way to gather data other than that collected by the Xen
 * trace buffers. Further, it does not fit into the DTrace model (see
 * "Interacting with DTrace" below).
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *                +---------+
 *   +------+     |         |
 *   | CPUn |---->| BUFFERn |
 *   +------+     |         |
 *                +---------+- tbuf.va + (tbuf.size * n)
 *                :         :
 *                +---------+
 *   +------+     |         |
 *   | CPU1 |---->| BUFFER1 |
 *   +------+     |         |
 *                +---------+- tbuf.va + tbuf.size
 *   +------+     |         |
 *   | CPU0 |---->| BUFFER0 |
 *   +------+     |         |
 *                +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace
 * records. The metadata consists of a producer/consumer pair of pointers
 * into the buffer that point to the next record to be written and the
 * next record to be read, respectively. The trace record format is as
 * follows:
 *
 * +--------------------------------------------------------------------------+
 * | CPUID(uint_t) | TSC(uint64_t) | EVENTID(uint32_t) |     DATA FIELDS      |
 * +--------------------------------------------------------------------------+
 *
 * DATA FIELDS:
 * +--------------------------------------------------------------------------+
 * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | D4(uint32_t) | D5(uint32_t) |
 * +--------------------------------------------------------------------------+
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and
 * feed each entry into dtrace_probe() with the corresponding probe ID for
 * the event. Because collection is periodic, probe firings are
 * asynchronous.
 * This is the only sensible way to implement this form of provider, but
 * because of its asynchronous nature, questions about the "current CPU"
 * and, more generally, arbitrary questions about the context surrounding
 * the probe firing are not meaningful. So consumers should not attempt to
 * infer anything beyond what is supplied via the probe arguments.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)

/*
 * Macros to extract the domid and vcpuid from an HVM trace data field
 * (e.g., a data field of 0x00020001 yields domid 2, vcpuid 1).
 */
#define	HVM_DOMID(d)	((d) >> 16)
#define	HVM_VCPUID(d)	((d) & 0xFFFF)

#define	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, arg3) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, cpuid, arg0, arg1, arg2, arg3);	\
}

#define	XDT_PROBE3(event, cpuid, arg0, arg1, arg2) \
	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, 0)

#define	XDT_PROBE2(event, cpuid, arg0, arg1) \
	XDT_PROBE4(event, cpuid, arg0, arg1, 0, 0)

#define	XDT_PROBE1(event, cpuid, arg0) \
	XDT_PROBE4(event, cpuid, arg0, 0, 0, 0)

#define	XDT_PROBE0(event, cpuid) \
	XDT_PROBE4(event, cpuid, 0, 0, 0, 0)

/* Probe classes */
#define	XDT_SCHED	0
#define	XDT_MEM		1
#define	XDT_HVM		2
#define	XDT_NCLASSES	3

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_NEVENTS			17

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
} xdt_schedinfo_t;
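/*
 * An illustrative sketch (not part of the driver) of consuming these
 * probes from D: per the XDT_PROBE* macros above, arg0 is the physical
 * cpuid and the remaining args carry the event data described in
 * xdt_process_rec() below.
 *
 *	xdt:sched::off-cpu
 *	{
 *		printf("pcpu %d: dom %d vcpu %d, on-cpu time %d",
 *		    arg0, arg1, arg2, arg3);
 *	}
 */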
static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the
 * variable. For example:
 *	set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf. For example:
 *	xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;			/* total number of phys CPUs */
static uint32_t cur_trace_mask;			/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
	    XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },

	{ NULL }
};

extern uint_t xen_get_nphyscpus(void);

static inline uint32_t
xdt_nr_active_probes(void)
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
}
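/*
 * Note (illustrative): TRC_SCHED, TRC_MEM and TRC_HVM above are Xen's
 * trace class masks from <xen/public/trace.h>. When the first probe in a
 * class is enabled, xdt_enable() below ORs the class mask into the event
 * mask handed to the hypervisor, so Xen only generates records for
 * classes with at least one enabled probe.
 */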
static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}
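/*
 * The statistics registered above are visible from userland via
 * kstat(1M); an illustrative invocation (output format will vary):
 *
 *	# kstat -m xdt -n trace_statistics
 */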
static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);

		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}

static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/* set trace buffer size */
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_size;
	tbuf_op.size = xdt_tbuf_pages;
	(void) xdt_sysctl_tbuf(&tbuf_op);

	/* get trace buffer info */
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	if (tbuf.size == 0) {
		cmn_err(CE_NOTE, "No trace buffers allocated!");
		return (ENOBUFS);
	}

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

static inline void
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	int eid;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xen_get_nphyscpus());

	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		return;
	}

	switch (rec->event) {

	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = time spent on pcpu
		 */
		sp->prev_domid = rec->data[0];
		sp->prev_ctime = rec->data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * rec->data[0] = next domid
		 * rec->data[1] = time spent waiting to get on cpu
		 * rec->data[2] = time slice
		 */
		sp->next_domid = rec->data[0];
		sp->next_wtime = rec->data[1];
		sp->next_ts = rec->data[2];
		break;

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = prev vcpuid
		 * rec->data[2] = next domid
		 * rec->data[3] = next vcpuid
		 */
		if (rec->data[0] != sp->prev_domid &&
		    rec->data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			return;
		}

		sp->prev_vcpuid = rec->data[1];
		sp->next_vcpuid = rec->data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid) ?
		    XDT_SCHED_IDLE_OFF_CPU : XDT_SCHED_OFF_CPU,
		    cpuid, sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);
		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid) ?
		    XDT_SCHED_IDLE_ON_CPU : XDT_SCHED_ON_CPU,
		    cpuid, sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
		break;

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = initiating vcpu
		 * rec->data[2] = shutdown code
		 */
		switch (rec->data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			return;
		}

		XDT_PROBE1(eid, cpuid, rec->data[0]);
		break;

	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * rec->data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, cpuid, rec->data[0]);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 *
		 * rec->data[0] = (domid << 16) | vcpuid
		 */
		XDT_PROBE2(XDT_HVM_VMENTRY, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]));
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler
		 *
		 * rec->data[0] = (domid << 16) | vcpuid
		 * rec->data[1] = guest rip
		 * rec->data[2] = CPU-vendor-specific exit code
		 */
		XDT_PROBE4(XDT_HVM_VMEXIT, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]), rec->data[1], rec->data[2]);
		break;

	case TRC_LOST_RECORDS:
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}
}
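/*
 * Scan each per-CPU trace buffer for new records and feed them into
 * dtrace_probe() via xdt_process_rec(). The cons counter is treated as
 * free-running; a record's position in the ring is the counter value
 * modulo the number of records a buffer can hold. For example
 * (illustrative): with nrecs == 100 and cons == 103, the next record
 * consumed is tbuf.data[cpuid][3].
 */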
/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint_t cpuid;
	size_t nrecs;
	struct t_rec *rec;
	uint32_t prod;

	nrecs = (tbuf.size - sizeof (struct t_buf)) / sizeof (struct t_rec);

	/* scan all cpu buffers for new records */
	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer(); /* read prod /then/ data */
		while (tbuf.meta[cpuid]->cons != prod) {
			rec = tbuf.data[cpuid] + tbuf.meta[cpuid]->cons % nrecs;
			xdt_process_rec(cpuid, rec);
			membar_exit(); /* read data /then/ update cons */
			tbuf.meta[cpuid]->cons++;
		}
	}
}

static void
xdt_cyclic_enable(void)
{
	cyc_handler_t hdlr;
	cyc_time_t when;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hdlr.cyh_func = xdt_tbuf_scan;
	hdlr.cyh_arg = NULL;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_interval = xdt_poll_nsec;
	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

	xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
	ASSERT(p != NULL && p->pr_mod != NULL);

	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
		return;

	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
	    p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
	const char *mod, *name;
	int i;

	if (desc == NULL) {
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			xdt_probe_create(&xdt_probe[i]);
		}
	} else {
		mod = desc->dtpd_mod;
		name = desc->dtpd_name;
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			int l1 = strlen(xdt_probe[i].pr_name);
			int l2 = strlen(xdt_probe[i].pr_mod);
			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
				break;
		}

		if (xdt_probe[i].pr_mod == NULL)
			return;
		xdt_probe_create(&xdt_probe[i]);
	}
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
	xen_sysctl_tbuf_op_t tbuf_op;

	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
	tbuf_op.evt_mask = mask;
	(void) xdt_sysctl_tbuf(&tbuf_op);
}

/*ARGSUSED*/
static void
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_prid[p->evt_id] != 0);

	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
	xdt_classinfo[p->class].cnt++;

	if (xdt_classinfo[p->class].cnt == 1) {
		/* set the trace mask for this class */
		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
		xdt_set_trace_mask(cur_trace_mask);
	}

	if (xdt_cyclic == CYCLIC_NONE) {
		/*
		 * DTrace doesn't have the notion of failing an enabling. It
		 * works on the premise that, if you have advertised a probe
		 * via the pops->dtps_provide() function, you can enable it.
		 * Failure is not an option. If we can't enable Xen tracing,
		 * the consumer will carry on regardless and think all is
		 * well, except that the probes will never fire.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
			return;
		}

		xdt_cyclic_enable();
	}
}

/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;
	int i, err;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_probemap[p->evt_id] != 0);
	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
	ASSERT(xdt_classinfo[p->class].cnt > 0);

	/*
	 * We could be here in the slight window between the cyclic firing
	 * and a call to dtrace_probe() occurring. We need to be careful if
	 * we tear down any shared state.
	 */

	xdt_probemap[p->evt_id] = 0;
	xdt_classinfo[p->class].cnt--;

	if (xdt_nr_active_probes() == 0) {
		cur_trace_mask = 0;

		if (xdt_cyclic == CYCLIC_NONE)
			return;

		/*
		 * We will try to disable the trace buffers. If we fail for
		 * some reason, we will try again, up to a count of
		 * XDT_TBUF_RETRY. If we still aren't successful, we try to
		 * set the trace mask to 0 in order to prevent trace records
		 * from being written.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
		i = 0;
		do {
			err = xdt_sysctl_tbuf(&tbuf_op);
		} while ((err != 0) && (++i < XDT_TBUF_RETRY));

		if (err != 0) {
			cmn_err(CE_NOTE,
			    "Couldn't disable hypervisor tracing.");
			xdt_set_trace_mask(0);
		} else {
			cyclic_remove(xdt_cyclic);
			xdt_cyclic = CYCLIC_NONE;
			/*
			 * We don't bother making the hypercall to set
			 * the trace mask, since it will be reset when
			 * tracing is re-enabled.
			 */
		}
	} else if (xdt_classinfo[p->class].cnt == 0) {
		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
		/* other probes are enabled, so add the sub-class mask back */
		cur_trace_mask |= 0xF000;
		xdt_set_trace_mask(cur_trace_mask);
	}
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int val;

	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	xdt_ncpus = xen_get_nphyscpus();
	ASSERT(xdt_ncpus > 0);

	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &xdt_pops, NULL, &xdt_id) != 0) {
		if (tbuf.va != NULL)
			xdt_detach_trace_buffers();
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_alloc(xdt_ncpus *
	    sizeof (xdt_schedinfo_t), KM_SLEEP);
	xdt_init_trace_masks();
	xdt_kstat_init();

	xdt_devi = devi;
	ddi_report_dev(devi);
	return (DDI_SUCCESS);
}

static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	if (dtrace_unregister(xdt_id) != 0)
		return (DDI_FAILURE);

	xdt_detach_trace_buffers();
	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
	if (xdt_cyclic != CYCLIC_NONE)
		cyclic_remove(xdt_cyclic);
	if (xdt_kstats != NULL)
		kstat_delete(xdt_kstats);
	xdt_devi = (void *)0;
	ddi_remove_minor_node(devi, NULL);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xdt_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};

static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL			/* power(9E) */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}