1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/kstat.h> 31 #include <sys/param.h> 32 #include <sys/stack.h> 33 #include <sys/regset.h> 34 #include <sys/thread.h> 35 #include <sys/proc.h> 36 #include <sys/procfs_isa.h> 37 #include <sys/kmem.h> 38 #include <sys/cpuvar.h> 39 #include <sys/systm.h> 40 #include <sys/machpcb.h> 41 #include <sys/machasi.h> 42 #include <sys/vis.h> 43 #include <sys/fpu/fpusystm.h> 44 #include <sys/cpu_module.h> 45 #include <sys/privregs.h> 46 #include <sys/archsystm.h> 47 #include <sys/atomic.h> 48 #include <sys/cmn_err.h> 49 #include <sys/time.h> 50 #include <sys/clock.h> 51 #include <sys/chip.h> 52 #include <sys/cmp.h> 53 #include <sys/platform_module.h> 54 #include <sys/bl.h> 55 #include <sys/nvpair.h> 56 #include <sys/kdi_impl.h> 57 #include <sys/machsystm.h> 58 #include <sys/sysmacros.h> 59 #include <sys/promif.h> 60 #include <sys/pool_pset.h> 61 62 int maxphys = MMU_PAGESIZE * 16; /* 128k */ 63 int klustsize = MMU_PAGESIZE * 16; /* 128k */ 64 65 /* 66 * Initialize kernel thread's stack. 67 */ 68 caddr_t 69 thread_stk_init(caddr_t stk) 70 { 71 kfpu_t *fp; 72 ulong_t align; 73 74 /* allocate extra space for floating point state */ 75 stk -= SA(sizeof (kfpu_t) + GSR_SIZE); 76 align = (uintptr_t)stk & 0x3f; 77 stk -= align; /* force v9_fpu to be 16 byte aligned */ 78 fp = (kfpu_t *)stk; 79 fp->fpu_fprs = 0; 80 81 stk -= SA(MINFRAME); 82 return (stk); 83 } 84 85 /* 86 * Initialize lwp's kernel stack. 87 * Note that now that the floating point register save area (kfpu_t) 88 * has been broken out from machpcb and aligned on a 64 byte boundary so that 89 * we can do block load/stores to/from it, there are a couple of potential 90 * optimizations to save stack space. 1. The floating point register save 91 * area could be aligned on a 16 byte boundary, and the floating point code 92 * changed to (a) check the alignment and (b) use different save/restore 93 * macros depending upon the alignment. 2. The lwp_stk_init code below 94 * could be changed to calculate if less space would be wasted if machpcb 95 * was first instead of second. However there is a REGOFF macro used in 96 * locore, syscall_trap, machdep and mlsetup that assumes that the saved 97 * register area is a fixed distance from the %sp, and would have to be 98 * changed to a pointer or something...JJ said later. 99 */ 100 caddr_t 101 lwp_stk_init(klwp_t *lwp, caddr_t stk) 102 { 103 struct machpcb *mpcb; 104 kfpu_t *fp; 105 uintptr_t aln; 106 107 stk -= SA(sizeof (kfpu_t) + GSR_SIZE); 108 aln = (uintptr_t)stk & 0x3F; 109 stk -= aln; 110 fp = (kfpu_t *)stk; 111 stk -= SA(sizeof (struct machpcb)); 112 mpcb = (struct machpcb *)stk; 113 bzero(mpcb, sizeof (struct machpcb)); 114 bzero(fp, sizeof (kfpu_t) + GSR_SIZE); 115 lwp->lwp_regs = (void *)&mpcb->mpcb_regs; 116 lwp->lwp_fpu = (void *)fp; 117 mpcb->mpcb_fpu = fp; 118 mpcb->mpcb_fpu->fpu_q = mpcb->mpcb_fpu_q; 119 mpcb->mpcb_thread = lwp->lwp_thread; 120 mpcb->mpcb_wbcnt = 0; 121 if (lwp->lwp_procp->p_model == DATAMODEL_ILP32) { 122 mpcb->mpcb_wstate = WSTATE_USER32; 123 mpcb->mpcb_wbuf = kmem_alloc(MAXWIN * sizeof (struct rwindow32), 124 KM_SLEEP); 125 } else { 126 mpcb->mpcb_wstate = WSTATE_USER64; 127 mpcb->mpcb_wbuf = kmem_alloc(MAXWIN * sizeof (struct rwindow64), 128 KM_SLEEP); 129 } 130 ASSERT(((uintptr_t)mpcb->mpcb_wbuf & 7) == 0); 131 mpcb->mpcb_wbuf_pa = va_to_pa(mpcb->mpcb_wbuf); 132 mpcb->mpcb_pa = va_to_pa(mpcb); 133 return (stk); 134 } 135 136 void 137 lwp_stk_fini(klwp_t *lwp) 138 { 139 struct machpcb *mpcb = lwptompcb(lwp); 140 141 /* 142 * there might be windows still in the wbuf due to unmapped 143 * stack, misaligned stack pointer, etc. We just free it. 144 */ 145 mpcb->mpcb_wbcnt = 0; 146 if (mpcb->mpcb_wstate == WSTATE_USER32) 147 kmem_free(mpcb->mpcb_wbuf, MAXWIN * sizeof (struct rwindow32)); 148 else 149 kmem_free(mpcb->mpcb_wbuf, MAXWIN * sizeof (struct rwindow64)); 150 mpcb->mpcb_wbuf = NULL; 151 mpcb->mpcb_wbuf_pa = -1; 152 } 153 154 155 /* 156 * Copy regs from parent to child. 157 */ 158 void 159 lwp_forkregs(klwp_t *lwp, klwp_t *clwp) 160 { 161 kthread_t *t, *pt = lwptot(lwp); 162 struct machpcb *mpcb = lwptompcb(clwp); 163 struct machpcb *pmpcb = lwptompcb(lwp); 164 kfpu_t *fp, *pfp = lwptofpu(lwp); 165 caddr_t wbuf; 166 uint_t wstate; 167 168 t = mpcb->mpcb_thread; 169 /* 170 * remember child's fp and wbuf since they will get erased during 171 * the bcopy. 172 */ 173 fp = mpcb->mpcb_fpu; 174 wbuf = mpcb->mpcb_wbuf; 175 wstate = mpcb->mpcb_wstate; 176 /* 177 * Don't copy mpcb_frame since we hand-crafted it 178 * in thread_load(). 179 */ 180 bcopy(lwp->lwp_regs, clwp->lwp_regs, sizeof (struct machpcb) - REGOFF); 181 mpcb->mpcb_thread = t; 182 mpcb->mpcb_fpu = fp; 183 fp->fpu_q = mpcb->mpcb_fpu_q; 184 185 /* 186 * It is theoretically possibly for the lwp's wstate to 187 * be different from its value assigned in lwp_stk_init, 188 * since lwp_stk_init assumed the data model of the process. 189 * Here, we took on the data model of the cloned lwp. 190 */ 191 if (mpcb->mpcb_wstate != wstate) { 192 size_t osize, size; 193 194 if (wstate == WSTATE_USER32) { 195 osize = MAXWIN * sizeof (struct rwindow32); 196 size = MAXWIN * sizeof (struct rwindow64); 197 wstate = WSTATE_USER64; 198 } else { 199 osize = MAXWIN * sizeof (struct rwindow64); 200 size = MAXWIN * sizeof (struct rwindow32); 201 wstate = WSTATE_USER32; 202 } 203 kmem_free(wbuf, osize); 204 wbuf = kmem_alloc(size, KM_SLEEP); 205 } 206 207 mpcb->mpcb_pa = va_to_pa(mpcb); 208 mpcb->mpcb_wbuf = wbuf; 209 mpcb->mpcb_wbuf_pa = va_to_pa(wbuf); 210 211 ASSERT(mpcb->mpcb_wstate == wstate); 212 213 if (mpcb->mpcb_wbcnt != 0) { 214 bcopy(pmpcb->mpcb_wbuf, mpcb->mpcb_wbuf, 215 mpcb->mpcb_wbcnt * ((mpcb->mpcb_wstate == WSTATE_USER32) ? 216 sizeof (struct rwindow32) : sizeof (struct rwindow64))); 217 } 218 219 if (pt == curthread) 220 pfp->fpu_fprs = _fp_read_fprs(); 221 if ((pfp->fpu_en) || (pfp->fpu_fprs & FPRS_FEF)) { 222 if (pt == curthread && fpu_exists) { 223 save_gsr(clwp->lwp_fpu); 224 } else { 225 uint64_t gsr; 226 gsr = get_gsr(lwp->lwp_fpu); 227 set_gsr(gsr, clwp->lwp_fpu); 228 } 229 fp_fork(lwp, clwp); 230 } 231 } 232 233 /* 234 * Free lwp fpu regs. 235 */ 236 void 237 lwp_freeregs(klwp_t *lwp, int isexec) 238 { 239 kfpu_t *fp = lwptofpu(lwp); 240 241 if (lwptot(lwp) == curthread) 242 fp->fpu_fprs = _fp_read_fprs(); 243 if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF)) 244 fp_free(fp, isexec); 245 } 246 247 /* 248 * fill in the extra register state area specified with the 249 * specified lwp's platform-dependent non-floating-point extra 250 * register state information 251 */ 252 /* ARGSUSED */ 253 void 254 xregs_getgfiller(klwp_id_t lwp, caddr_t xrp) 255 { 256 /* for sun4u nothing to do here, added for symmetry */ 257 } 258 259 /* 260 * fill in the extra register state area specified with the specified lwp's 261 * platform-dependent floating-point extra register state information. 262 * NOTE: 'lwp' might not correspond to 'curthread' since this is 263 * called from code in /proc to get the registers of another lwp. 264 */ 265 void 266 xregs_getfpfiller(klwp_id_t lwp, caddr_t xrp) 267 { 268 prxregset_t *xregs = (prxregset_t *)xrp; 269 kfpu_t *fp = lwptofpu(lwp); 270 uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL); 271 uint64_t gsr; 272 273 /* 274 * fp_fksave() does not flush the GSR register into 275 * the lwp area, so do it now 276 */ 277 kpreempt_disable(); 278 if (ttolwp(curthread) == lwp && fpu_exists) { 279 fp->fpu_fprs = _fp_read_fprs(); 280 if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) { 281 _fp_write_fprs(fprs); 282 fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs; 283 } 284 save_gsr(fp); 285 } 286 gsr = get_gsr(fp); 287 kpreempt_enable(); 288 PRXREG_GSR(xregs) = gsr; 289 } 290 291 /* 292 * set the specified lwp's platform-dependent non-floating-point 293 * extra register state based on the specified input 294 */ 295 /* ARGSUSED */ 296 void 297 xregs_setgfiller(klwp_id_t lwp, caddr_t xrp) 298 { 299 /* for sun4u nothing to do here, added for symmetry */ 300 } 301 302 /* 303 * set the specified lwp's platform-dependent floating-point 304 * extra register state based on the specified input 305 */ 306 void 307 xregs_setfpfiller(klwp_id_t lwp, caddr_t xrp) 308 { 309 prxregset_t *xregs = (prxregset_t *)xrp; 310 kfpu_t *fp = lwptofpu(lwp); 311 uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL); 312 uint64_t gsr = PRXREG_GSR(xregs); 313 314 kpreempt_disable(); 315 set_gsr(gsr, lwptofpu(lwp)); 316 317 if ((lwp == ttolwp(curthread)) && fpu_exists) { 318 fp->fpu_fprs = _fp_read_fprs(); 319 if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) { 320 _fp_write_fprs(fprs); 321 fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs; 322 } 323 restore_gsr(lwptofpu(lwp)); 324 } 325 kpreempt_enable(); 326 } 327 328 /* 329 * fill in the sun4u asrs, ie, the lwp's platform-dependent 330 * non-floating-point extra register state information 331 */ 332 /* ARGSUSED */ 333 void 334 getasrs(klwp_t *lwp, asrset_t asr) 335 { 336 /* for sun4u nothing to do here, added for symmetry */ 337 } 338 339 /* 340 * fill in the sun4u asrs, ie, the lwp's platform-dependent 341 * floating-point extra register state information 342 */ 343 void 344 getfpasrs(klwp_t *lwp, asrset_t asr) 345 { 346 kfpu_t *fp = lwptofpu(lwp); 347 uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL); 348 349 kpreempt_disable(); 350 if (ttolwp(curthread) == lwp) 351 fp->fpu_fprs = _fp_read_fprs(); 352 if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF)) { 353 if (fpu_exists && ttolwp(curthread) == lwp) { 354 if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) { 355 _fp_write_fprs(fprs); 356 fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs; 357 } 358 save_gsr(fp); 359 } 360 asr[ASR_GSR] = (int64_t)get_gsr(fp); 361 } 362 kpreempt_enable(); 363 } 364 365 /* 366 * set the sun4u asrs, ie, the lwp's platform-dependent 367 * non-floating-point extra register state information 368 */ 369 /* ARGSUSED */ 370 void 371 setasrs(klwp_t *lwp, asrset_t asr) 372 { 373 /* for sun4u nothing to do here, added for symmetry */ 374 } 375 376 void 377 setfpasrs(klwp_t *lwp, asrset_t asr) 378 { 379 kfpu_t *fp = lwptofpu(lwp); 380 uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL); 381 382 kpreempt_disable(); 383 if (ttolwp(curthread) == lwp) 384 fp->fpu_fprs = _fp_read_fprs(); 385 if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF)) { 386 set_gsr(asr[ASR_GSR], fp); 387 if (fpu_exists && ttolwp(curthread) == lwp) { 388 if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) { 389 _fp_write_fprs(fprs); 390 fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs; 391 } 392 restore_gsr(fp); 393 } 394 } 395 kpreempt_enable(); 396 } 397 398 /* 399 * Create interrupt kstats for this CPU. 400 */ 401 void 402 cpu_create_intrstat(cpu_t *cp) 403 { 404 int i; 405 kstat_t *intr_ksp; 406 kstat_named_t *knp; 407 char name[KSTAT_STRLEN]; 408 zoneid_t zoneid; 409 410 ASSERT(MUTEX_HELD(&cpu_lock)); 411 412 if (pool_pset_enabled()) 413 zoneid = GLOBAL_ZONEID; 414 else 415 zoneid = ALL_ZONES; 416 417 intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc", 418 KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid); 419 420 /* 421 * Initialize each PIL's named kstat 422 */ 423 if (intr_ksp != NULL) { 424 intr_ksp->ks_update = cpu_kstat_intrstat_update; 425 knp = (kstat_named_t *)intr_ksp->ks_data; 426 intr_ksp->ks_private = cp; 427 for (i = 0; i < PIL_MAX; i++) { 428 (void) snprintf(name, KSTAT_STRLEN, "level-%d-time", 429 i + 1); 430 kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64); 431 (void) snprintf(name, KSTAT_STRLEN, "level-%d-count", 432 i + 1); 433 kstat_named_init(&knp[(i * 2) + 1], name, 434 KSTAT_DATA_UINT64); 435 } 436 kstat_install(intr_ksp); 437 } 438 } 439 440 /* 441 * Delete interrupt kstats for this CPU. 442 */ 443 void 444 cpu_delete_intrstat(cpu_t *cp) 445 { 446 kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES); 447 } 448 449 /* 450 * Convert interrupt statistics from CPU ticks to nanoseconds and 451 * update kstat. 452 */ 453 int 454 cpu_kstat_intrstat_update(kstat_t *ksp, int rw) 455 { 456 kstat_named_t *knp = ksp->ks_data; 457 cpu_t *cpup = (cpu_t *)ksp->ks_private; 458 int i; 459 460 if (rw == KSTAT_WRITE) 461 return (EACCES); 462 463 /* 464 * We use separate passes to copy and convert the statistics to 465 * nanoseconds. This assures that the snapshot of the data is as 466 * self-consistent as possible. 467 */ 468 469 for (i = 0; i < PIL_MAX; i++) { 470 knp[i * 2].value.ui64 = cpup->cpu_m.intrstat[i + 1][0]; 471 knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i]; 472 } 473 474 for (i = 0; i < PIL_MAX; i++) { 475 knp[i * 2].value.ui64 = 476 (uint64_t)tick2ns((hrtime_t)knp[i * 2].value.ui64, 477 cpup->cpu_id); 478 } 479 480 return (0); 481 } 482 483 /* 484 * Called by common/os/cpu.c for psrinfo(1m) kstats 485 */ 486 char * 487 cpu_fru_fmri(cpu_t *cp) 488 { 489 return (cpunodes[cp->cpu_id].fru_fmri); 490 } 491 492 /* 493 * An interrupt thread is ending a time slice, so compute the interval it 494 * ran for and update the statistic for its PIL. 495 */ 496 void 497 cpu_intr_swtch_enter(kthread_id_t t) 498 { 499 uint64_t interval; 500 uint64_t start; 501 cpu_t *cpu; 502 503 ASSERT((t->t_flag & T_INTR_THREAD) != 0); 504 ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL); 505 506 /* 507 * We could be here with a zero timestamp. This could happen if: 508 * an interrupt thread which no longer has a pinned thread underneath 509 * it (i.e. it blocked at some point in its past) has finished running 510 * its handler. intr_thread() updated the interrupt statistic for its 511 * PIL and zeroed its timestamp. Since there was no pinned thread to 512 * return to, swtch() gets called and we end up here. 513 * 514 * It can also happen if an interrupt thread in intr_thread() calls 515 * preempt. It will have already taken care of updating stats. In 516 * this event, the interrupt thread will be runnable. 517 */ 518 if (t->t_intr_start) { 519 do { 520 start = t->t_intr_start; 521 interval = gettick_counter() - start; 522 } while (cas64(&t->t_intr_start, start, 0) != start); 523 cpu = CPU; 524 if (cpu->cpu_m.divisor > 1) 525 interval *= cpu->cpu_m.divisor; 526 cpu->cpu_m.intrstat[t->t_pil][0] += interval; 527 528 atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate], 529 interval); 530 } else 531 ASSERT(t->t_intr == NULL || t->t_state == TS_RUN); 532 } 533 534 535 /* 536 * An interrupt thread is returning from swtch(). Place a starting timestamp 537 * in its thread structure. 538 */ 539 void 540 cpu_intr_swtch_exit(kthread_id_t t) 541 { 542 uint64_t ts; 543 544 ASSERT((t->t_flag & T_INTR_THREAD) != 0); 545 ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL); 546 547 do { 548 ts = t->t_intr_start; 549 } while (cas64(&t->t_intr_start, ts, gettick_counter()) != ts); 550 } 551 552 553 int 554 blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class) 555 { 556 if (&plat_blacklist) 557 return (plat_blacklist(cmd, scheme, fmri, class)); 558 559 return (ENOTSUP); 560 } 561 562 int 563 kdi_pread(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp) 564 { 565 extern void kdi_flush_caches(void); 566 size_t nread = 0; 567 uint32_t word; 568 int slop, i; 569 570 kdi_flush_caches(); 571 membar_enter(); 572 573 /* We might not begin on a word boundary. */ 574 if ((slop = addr & 3) != 0) { 575 word = ldphys(addr & ~3); 576 for (i = slop; i < 4 && nbytes > 0; i++, nbytes--, nread++) 577 *buf++ = ((uchar_t *)&word)[i]; 578 addr = roundup(addr, 4); 579 } 580 581 while (nbytes > 0) { 582 word = ldphys(addr); 583 for (i = 0; i < 4 && nbytes > 0; i++, nbytes--, nread++, addr++) 584 *buf++ = ((uchar_t *)&word)[i]; 585 } 586 587 kdi_flush_caches(); 588 589 *ncopiedp = nread; 590 return (0); 591 } 592 593 int 594 kdi_pwrite(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp) 595 { 596 extern void kdi_flush_caches(void); 597 size_t nwritten = 0; 598 uint32_t word; 599 int slop, i; 600 601 kdi_flush_caches(); 602 603 /* We might not begin on a word boundary. */ 604 if ((slop = addr & 3) != 0) { 605 word = ldphys(addr & ~3); 606 for (i = slop; i < 4 && nbytes > 0; i++, nbytes--, nwritten++) 607 ((uchar_t *)&word)[i] = *buf++; 608 stphys(addr & ~3, word); 609 addr = roundup(addr, 4); 610 } 611 612 while (nbytes > 3) { 613 for (word = 0, i = 0; i < 4; i++, nbytes--, nwritten++) 614 ((uchar_t *)&word)[i] = *buf++; 615 stphys(addr, word); 616 addr += 4; 617 } 618 619 /* We might not end with a whole word. */ 620 if (nbytes > 0) { 621 word = ldphys(addr); 622 for (i = 0; nbytes > 0; i++, nbytes--, nwritten++) 623 ((uchar_t *)&word)[i] = *buf++; 624 stphys(addr, word); 625 } 626 627 membar_enter(); 628 kdi_flush_caches(); 629 630 *ncopiedp = nwritten; 631 return (0); 632 } 633 634 static void 635 kdi_kernpanic(struct regs *regs, uint_t tt) 636 { 637 sync_reg_buf = *regs; 638 sync_tt = tt; 639 640 sync_handler(); 641 } 642 643 static void 644 kdi_plat_call(void (*platfn)(void)) 645 { 646 if (platfn != NULL) { 647 prom_suspend_prepost(); 648 platfn(); 649 prom_resume_prepost(); 650 } 651 } 652 653 void 654 mach_kdi_init(kdi_t *kdi) 655 { 656 kdi->kdi_plat_call = kdi_plat_call; 657 kdi->mkdi_cpu_index = kdi_cpu_index; 658 kdi->mkdi_trap_vatotte = kdi_trap_vatotte; 659 kdi->mkdi_kernpanic = kdi_kernpanic; 660 } 661 662 663 /* 664 * get_cpu_mstate() is passed an array of timestamps, NCMSTATES 665 * long, and it fills in the array with the time spent on cpu in 666 * each of the mstates, where time is returned in nsec. 667 * 668 * No guarantee is made that the returned values in times[] will 669 * monotonically increase on sequential calls, although this will 670 * be true in the long run. Any such guarantee must be handled by 671 * the caller, if needed. This can happen if we fail to account 672 * for elapsed time due to a generation counter conflict, yet we 673 * did account for it on a prior call (see below). 674 * 675 * The complication is that the cpu in question may be updating 676 * its microstate at the same time that we are reading it. 677 * Because the microstate is only updated when the CPU's state 678 * changes, the values in cpu_intracct[] can be indefinitely out 679 * of date. To determine true current values, it is necessary to 680 * compare the current time with cpu_mstate_start, and add the 681 * difference to times[cpu_mstate]. 682 * 683 * This can be a problem if those values are changing out from 684 * under us. Because the code path in new_cpu_mstate() is 685 * performance critical, we have not added a lock to it. Instead, 686 * we have added a generation counter. Before beginning 687 * modifications, the counter is set to 0. After modifications, 688 * it is set to the old value plus one. 689 * 690 * get_cpu_mstate() will not consider the values of cpu_mstate 691 * and cpu_mstate_start to be usable unless the value of 692 * cpu_mstate_gen is both non-zero and unchanged, both before and 693 * after reading the mstate information. Note that we must 694 * protect against out-of-order loads around accesses to the 695 * generation counter. Also, this is a best effort approach in 696 * that we do not retry should the counter be found to have 697 * changed. 698 * 699 * cpu_intracct[] is used to identify time spent in each CPU 700 * mstate while handling interrupts. Such time should be reported 701 * against system time, and so is subtracted out from its 702 * corresponding cpu_acct[] time and added to 703 * cpu_acct[CMS_SYSTEM]. Additionally, intracct time is stored in 704 * %ticks, but acct time may be stored as %sticks, thus requiring 705 * different conversions before they can be compared. 706 */ 707 708 void 709 get_cpu_mstate(cpu_t *cpu, hrtime_t *times) 710 { 711 int i; 712 hrtime_t now, start; 713 uint16_t gen; 714 uint16_t state; 715 hrtime_t intracct[NCMSTATES]; 716 717 /* 718 * Load all volatile state under the protection of membar. 719 * cpu_acct[cpu_mstate] must be loaded to avoid double counting 720 * of (now - cpu_mstate_start) by a change in CPU mstate that 721 * arrives after we make our last check of cpu_mstate_gen. 722 */ 723 724 now = gethrtime_unscaled(); 725 gen = cpu->cpu_mstate_gen; 726 727 membar_consumer(); /* guarantee load ordering */ 728 start = cpu->cpu_mstate_start; 729 state = cpu->cpu_mstate; 730 for (i = 0; i < NCMSTATES; i++) { 731 intracct[i] = cpu->cpu_intracct[i]; 732 times[i] = cpu->cpu_acct[i]; 733 } 734 membar_consumer(); /* guarantee load ordering */ 735 736 if (gen != 0 && gen == cpu->cpu_mstate_gen && now > start) 737 times[state] += now - start; 738 739 for (i = 0; i < NCMSTATES; i++) { 740 scalehrtime(×[i]); 741 intracct[i] = tick2ns((hrtime_t)intracct[i], cpu->cpu_id); 742 } 743 744 for (i = 0; i < NCMSTATES; i++) { 745 if (i == CMS_SYSTEM) 746 continue; 747 times[i] -= intracct[i]; 748 if (times[i] < 0) { 749 intracct[i] += times[i]; 750 times[i] = 0; 751 } 752 times[CMS_SYSTEM] += intracct[i]; 753 } 754 } 755