/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution
 * and may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below.)
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *                 +---------+
 * +------+        |         |
 * | CPUn |------> | BUFFERn |
 * +------+        |         |
 *                 +---------+- tbuf.va + (tbuf.size * n)
 *                 :         :
 *                 +---------+
 * +------+        |         |
 * | CPU1 |------> | BUFFER1 |
 * +------+        |         |
 *                 +---------+- tbuf.va + tbuf.size
 * +------+        |         |
 * | CPU0 |------> | BUFFER0 |
 * +------+        |         |
 *                 +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the
 * buffer that point to the next record to be written and the next record to
 * be read respectively. The trace record format is as follows:
 *
 * +--------------------------------------------------------------------------+
 * | CPUID(uint_t) | TSC(uint64_t) | EVENTID(uint32_t) |     DATA FIELDS      |
 * +--------------------------------------------------------------------------+
 *
 * DATA FIELDS:
 * +--------------------------------------------------------------------------+
 * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | D4(uint32_t) | D5(uint32_t) |
 * +--------------------------------------------------------------------------+
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the
 * event. As a result of this periodic-collection implementation, probe
 * firings are asynchronous. This is the only sensible way to implement this
 * form of provider, but because of its asynchronous nature, questions about
 * the context surrounding a probe firing (such as "what is the current
 * CPU?") are not meaningful. Consumers should not attempt to infer anything
 * beyond what is supplied via the probe arguments.
 */
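/*
 * For illustration only: a consumer could enable one of the probes listed
 * in xdt_probe[] below with a one-liner such as
 *
 *	# dtrace -n 'xdt:sched::off-cpu { trace(arg0); }'
 *
 * bearing in mind that, per the note above, only the probe arguments carry
 * meaningful context.
 */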
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	(domid == 0x7FFFU)

/* Macros to extract the domid and cpuid from a HVM trace data field */
#define	HVM_DOMID(d)	(d >> 16)
#define	HVM_VCPUID(d)	(d & 0xFFFF)

#define	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, arg3) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, cpuid, arg0, arg1, arg2, arg3);	\
}

#define	XDT_PROBE3(event, cpuid, arg0, arg1, arg2) \
	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, 0)

#define	XDT_PROBE2(event, cpuid, arg0, arg1) \
	XDT_PROBE4(event, cpuid, arg0, arg1, 0, 0)

#define	XDT_PROBE1(event, cpuid, arg0) \
	XDT_PROBE4(event, cpuid, arg0, 0, 0, 0)

#define	XDT_PROBE0(event, cpuid) \
	XDT_PROBE4(event, cpuid, 0, 0, 0, 0)
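/*
 * For example, XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, dom, vcpu) is simply
 * XDT_PROBE4() with the two unused trailing arguments zeroed; the probe
 * fires only if an enabled probe ID is present in xdt_probemap[].
 */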
/* Probe classes */
#define	XDT_SCHED	0
#define	XDT_MEM		1
#define	XDT_HVM		2
#define	XDT_NCLASSES	3

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_NEVENTS			17

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
} xdt_schedinfo_t;

static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *	set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *	xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;			/* total number of phys CPUs */
static uint32_t cur_trace_mask;			/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
	    XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },

	{ NULL }
};

extern uint_t xen_get_nphyscpus(void);

static inline uint32_t
xdt_nr_active_probes(void)
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
}
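/*
 * Note: the TRC_* class masks above come from xen/public/trace.h; they gate
 * which classes of records the hypervisor emits. xdt_enable() ORs a class's
 * mask into cur_trace_mask when the first probe in that class is enabled.
 */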
static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);

		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}
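/*
 * Note: the trace buffer pages are owned by the hypervisor (hence the
 * DOMID_XEN/PT_FOREIGN mapping above), so they must be mapped via the
 * update_va_mapping_otherdomain hypercall rather than through an ordinary
 * HAT mapping alone.
 */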
static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/*
	 * Xen does not support trace buffer re-sizing. If the buffers
	 * have already been allocated we just use them as is.
	 */
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	if (tbuf_op.size == 0) {
		/* set trace buffer size */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_size;
		tbuf_op.size = xdt_tbuf_pages;
		(void) xdt_sysctl_tbuf(&tbuf_op);

		/* get trace buffer info */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
			return (err);

		if (tbuf_op.size == 0) {
			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
			return (ENOBUFS);
		}
	}

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}
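/*
 * Illustrative sizing note: assuming 4K pages and that get_info reports the
 * per-CPU buffer size in bytes (which is how tbuf.size is used throughout),
 * the default xdt_tbuf_pages of 20 yields an 80K buffer per physical CPU,
 * and the mapped region above is 80K times the number of physical CPUs.
 */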
static inline void
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	int eid;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xen_get_nphyscpus());

	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		return;
	}

	switch (rec->event) {

	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = time spent on pcpu
		 */
		sp->prev_domid = rec->data[0];
		sp->prev_ctime = rec->data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * rec->data[0] = next domid
		 * rec->data[1] = time spent waiting to get on cpu
		 * rec->data[2] = time slice
		 */
		sp->next_domid = rec->data[0];
		sp->next_wtime = rec->data[1];
		sp->next_ts = rec->data[2];
		break;

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = prev vcpuid
		 * rec->data[2] = next domid
		 * rec->data[3] = next vcpuid
		 */
		if (rec->data[0] != sp->prev_domid &&
		    rec->data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			return;
		}

		sp->prev_vcpuid = rec->data[1];
		sp->next_vcpuid = rec->data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid)?
		    XDT_SCHED_IDLE_OFF_CPU:XDT_SCHED_OFF_CPU,
		    cpuid, sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid)?
		    XDT_SCHED_IDLE_ON_CPU:XDT_SCHED_ON_CPU,
		    cpuid, sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
		break;

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = initiating vcpu
		 * rec->data[2] = shutdown code
		 */
		switch (rec->data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			return;
		}

		XDT_PROBE1(eid, cpuid, rec->data[0]);
		break;

	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * rec->data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, cpuid, rec->data[0]);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 *
		 * rec->data[0] = (domid << 16) + vcpuid
		 */
		XDT_PROBE2(XDT_HVM_VMENTRY, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]));
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler
		 *
		 * rec->data[0] = (domid << 16) + vcpuid
		 * rec->data[1] = guest rip
		 * rec->data[2] = cpu vendor specific exit code
		 */
		XDT_PROBE4(XDT_HVM_VMEXIT, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]), rec->data[1], rec->data[2]);
		break;

	case TRC_LOST_RECORDS:
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}
}

/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint_t cpuid;
	size_t nrecs;
	struct t_rec *rec;
	uint32_t prod;

	nrecs = (tbuf.size - sizeof (struct t_buf)) / sizeof (struct t_rec);

	/* scan all cpu buffers for new records */
	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer(); /* read prod /then/ data */
		while (tbuf.meta[cpuid]->cons != prod) {
			rec = tbuf.data[cpuid] + tbuf.meta[cpuid]->cons % nrecs;
			xdt_process_rec(cpuid, rec);
			membar_exit(); /* read data /then/ update cons */
			tbuf.meta[cpuid]->cons++;
		}
	}
}
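/*
 * Ring-indexing note: cons and prod are treated here as free-running
 * counters, so the record slot is always derived as (cons % nrecs), where
 * nrecs is the per-CPU record capacity computed at the top of
 * xdt_tbuf_scan(). The membars order our reads of prod and the record data,
 * and our update of cons, against the hypervisor's writes.
 */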
static void
xdt_cyclic_enable(void)
{
	cyc_handler_t hdlr;
	cyc_time_t when;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hdlr.cyh_func = xdt_tbuf_scan;
	hdlr.cyh_arg = NULL;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_interval = xdt_poll_nsec;
	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

	xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
	ASSERT(p != NULL && p->pr_mod != NULL);

	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
		return;

	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
	    p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
	const char *mod, *name;
	int i;

	if (desc == NULL) {
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			xdt_probe_create(&xdt_probe[i]);
		}
	} else {
		mod = desc->dtpd_mod;
		name = desc->dtpd_name;
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			int l1 = strlen(xdt_probe[i].pr_name);
			int l2 = strlen(xdt_probe[i].pr_mod);
			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
				break;
		}

		if (xdt_probe[i].pr_mod == NULL)
			return;
		xdt_probe_create(&xdt_probe[i]);
	}
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
	xen_sysctl_tbuf_op_t tbuf_op;

	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
	tbuf_op.evt_mask = mask;
	(void) xdt_sysctl_tbuf(&tbuf_op);
}

/*ARGSUSED*/
static int
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_prid[p->evt_id] != 0);

	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
	xdt_classinfo[p->class].cnt++;

	if (xdt_classinfo[p->class].cnt == 1) {
		/* set the trace mask for this class */
		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
		xdt_set_trace_mask(cur_trace_mask);
	}

	if (xdt_cyclic == CYCLIC_NONE) {
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
			return (-1);
		}

		xdt_cyclic_enable();
	}
	return (0);
}
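/*
 * Latency sketch (illustrative, not normative): with the default
 * XDT_POLL_DEFAULT interval of 100ms, a record written just after a poll
 * completes can wait in the Xen buffer for up to roughly one full poll
 * interval before xdt_tbuf_scan() turns it into a probe firing.
 */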
/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;
	int i, err;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_probemap[p->evt_id] != 0);
	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
	ASSERT(xdt_classinfo[p->class].cnt > 0);

	/*
	 * We could be here in the slight window between the cyclic firing and
	 * a call to dtrace_probe() occurring. We need to be careful if we tear
	 * down any shared state.
	 */

	xdt_probemap[p->evt_id] = 0;
	xdt_classinfo[p->class].cnt--;

	if (xdt_nr_active_probes() == 0) {
		cur_trace_mask = 0;

		if (xdt_cyclic == CYCLIC_NONE)
			return;

		/*
		 * We will try to disable the trace buffers. If we fail for
		 * some reason we will try again, up to a count of
		 * XDT_TBUF_RETRY. If we still aren't successful we try to set
		 * the trace mask to 0 in order to prevent trace records from
		 * being written.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
		i = 0;
		do {
			err = xdt_sysctl_tbuf(&tbuf_op);
		} while ((err != 0) && (++i < XDT_TBUF_RETRY));

		if (err != 0) {
			cmn_err(CE_NOTE,
			    "Couldn't disable hypervisor tracing.");
			xdt_set_trace_mask(0);
		} else {
			cyclic_remove(xdt_cyclic);
			xdt_cyclic = CYCLIC_NONE;
			/*
			 * We don't bother making the hypercall to set
			 * the trace mask, since it will be reset when
			 * tracing is re-enabled.
			 */
		}
	} else if (xdt_classinfo[p->class].cnt == 0) {
		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
		/* other probes are enabled, so add the sub-class mask back */
		cur_trace_mask |= 0xF000;
		xdt_set_trace_mask(cur_trace_mask);
	}
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int val;

	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	xdt_ncpus = xen_get_nphyscpus();
	ASSERT(xdt_ncpus > 0);

	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &xdt_pops, NULL, &xdt_id) != 0) {
		if (tbuf.va != NULL)
			xdt_detach_trace_buffers();
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_alloc(xdt_ncpus *
	    sizeof (xdt_schedinfo_t), KM_SLEEP);
	xdt_init_trace_masks();
	xdt_kstat_init();

	xdt_devi = devi;
	ddi_report_dev(devi);
	return (DDI_SUCCESS);
}
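/*
 * Tuning note: xdt_poll_nsec is read from the "xdt_poll_nsec" property in
 * xdt.conf above (see the tunables comment near the top of the file);
 * values below XDT_POLL_MIN are clamped up to the minimum by the MAX().
 */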
static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	if (dtrace_unregister(xdt_id) != 0)
		return (DDI_FAILURE);

	xdt_detach_trace_buffers();
	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
	if (xdt_cyclic != CYCLIC_NONE)
		cyclic_remove(xdt_cyclic);
	if (xdt_kstats != NULL)
		kstat_delete(xdt_kstats);
	xdt_devi = (void *)0;
	ddi_remove_minor_node(devi, NULL);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xdt_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};

static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* power(9E) */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};


static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}