/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/msacct.h>
#include <sys/time.h>
#include <sys/zone.h>

/*
 * Mega-theory block comment:
 *
 * Microstate accounting uses finite states and the transitions between these
 * states to measure timing and accounting information.  The state information
 * is presently tracked for threads (via microstate accounting) and cpus (via
 * cpu microstate accounting).  In each case, these accounting mechanisms use
 * states and transitions to measure time spent in each state instead of
 * clock-based sampling methodologies.
 *
 * For microstate accounting:
 * State transitions are accomplished by calling new_mstate() to switch between
 * states.  Transitions from a sleeping state (LMS_SLEEP and LMS_STOPPED) occur
 * by calling restore_mstate(), which restores a thread to its previously
 * running state.  This code is primarily executed by the dispatcher in disp()
 * before running a process that was put to sleep.  If the thread was not in a
 * sleeping state, this call has little effect other than to update the count
 * of time the thread has spent waiting on run-queues in its lifetime.
 *
 * For cpu microstate accounting:
 * Cpu microstate accounting is similar to the microstate accounting for
 * threads but it tracks user, system, and idle time for cpus.  Cpu microstate
 * accounting does not track interrupt times as there is a pre-existing
 * interrupt accounting mechanism for this purpose.  Cpu microstate accounting
 * tracks time that user threads have spent active, idle, or in the system on a
 * given cpu.  Cpu microstate accounting has fewer states which allows it to
 * have better defined transitions.  The states transition in the following
 * order:
 *
 *	CMS_USER <-> CMS_SYSTEM <-> CMS_IDLE
 *
 * In order to get to the idle state, the cpu microstate must first go through
 * the system state, and vice-versa for the user state from idle.  The
 * switching of the microstates from user to system is done as part of the
 * regular thread microstate accounting code, except for the idle state which
 * is switched by the dispatcher before it runs the idle loop.
 *
 * Cpu percentages:
 * Cpu percentages are now handled by and based upon microstate accounting
 * information (the same is true for load averages).
 * The routines which handle the growing/shrinking and exponentiation of cpu
 * percentages have been moved here as it now makes more sense for them to be
 * generated from the microstate code.  Cpu percentages are generated similarly
 * to the way they were before; however, now they are based upon
 * high-resolution timestamps and the timestamps are modified at various state
 * changes instead of during a clock() interrupt.  This allows us to generate
 * more accurate cpu percentages which are also in-sync with microstate data.
 */

/*
 * Initialize the microstate level and the
 * associated accounting information for an LWP.
 */
void
init_mstate(
	kthread_t	*t,
	int		init_state)
{
	struct mstate *ms;
	klwp_t *lwp;
	hrtime_t curtime;

	ASSERT(init_state != LMS_WAIT_CPU);
	ASSERT((unsigned)init_state < NMSTATES);

	if ((lwp = ttolwp(t)) != NULL) {
		ms = &lwp->lwp_mstate;
		curtime = gethrtime_unscaled();
		ms->ms_prev = LMS_SYSTEM;
		ms->ms_start = curtime;
		ms->ms_term = 0;
		ms->ms_state_start = curtime;
		t->t_mstate = init_state;
		t->t_waitrq = 0;
		t->t_hrtime = curtime;
		if ((t->t_proc_flag & TP_MSACCT) == 0)
			t->t_proc_flag |= TP_MSACCT;
		bzero((caddr_t)&ms->ms_acct[0], sizeof (ms->ms_acct));
	}
}

/*
 * Initialize the microstate level and associated accounting information
 * for the specified cpu.
 */
void
init_cpu_mstate(
	cpu_t *cpu,
	int init_state)
{
	ASSERT(init_state != CMS_DISABLED);

	cpu->cpu_mstate = init_state;
	cpu->cpu_mstate_start = gethrtime_unscaled();
	cpu->cpu_waitrq = 0;
	bzero((caddr_t)&cpu->cpu_acct[0], sizeof (cpu->cpu_acct));
}

/*
 * Sets the cpu state to CMS_DISABLED.  We don't actually track this time,
 * but it serves as a useful placeholder state for when we're not
 * doing anything.
 */
void
term_cpu_mstate(struct cpu *cpu)
{
	ASSERT(cpu->cpu_mstate != CMS_DISABLED);
	cpu->cpu_mstate = CMS_DISABLED;
	cpu->cpu_mstate_start = 0;
}

/* NEW_CPU_MSTATE comments inline in new_cpu_mstate below. */

#define	NEW_CPU_MSTATE(state)						\
	gen = cpu->cpu_mstate_gen;					\
	cpu->cpu_mstate_gen = 0;					\
	/* Need membar_producer() here if stores not ordered / TSO */	\
	cpu->cpu_acct[cpu->cpu_mstate] += curtime - cpu->cpu_mstate_start; \
	cpu->cpu_mstate = state;					\
	cpu->cpu_mstate_start = curtime;				\
	/* Need membar_producer() here if stores not ordered / TSO */	\
	cpu->cpu_mstate_gen = (++gen == 0) ? 1 : gen;

void
new_cpu_mstate(int cmstate, hrtime_t curtime)
{
	cpu_t *cpu = CPU;
	uint16_t gen;

	ASSERT(cpu->cpu_mstate != CMS_DISABLED);
	ASSERT(cmstate < NCMSTATES);
	ASSERT(cmstate != CMS_DISABLED);

	/*
	 * This function cannot be re-entrant on a given CPU.  As such,
	 * we ASSERT and panic if we are called on behalf of an interrupt.
	 * The one exception is for an interrupt which has previously
	 * blocked.  Such an interrupt is being scheduled by the dispatcher
	 * just like a normal thread, and as such cannot arrive here
	 * in a re-entrant manner.
	 */

	ASSERT(!CPU_ON_INTR(cpu) && curthread->t_intr == NULL);
	ASSERT(curthread->t_preempt > 0 || curthread == cpu->cpu_idle_thread);

	/*
	 * LOCKING, or lack thereof:
	 *
	 * Updates to CPU mstate can only be made by the CPU
	 * itself, and the above check to ignore interrupts
	 * should prevent recursion into this function on a given
	 * processor.  i.e. no possible write contention.
	 *
	 * However, reads of CPU mstate can occur at any time
	 * from any CPU.  Any locking added to this code path
	 * would seriously impact syscall performance.  So,
	 * instead we have a best-effort protection for readers.
	 * The reader will want to account for any time between
	 * cpu_mstate_start and the present time.  This requires
	 * some guarantees that the reader is getting coherent
	 * information.
	 *
	 * We use a generation counter, which is set to 0 before
	 * we start making changes, and is set to a new value
	 * after we're done.  Someone reading the CPU mstate
	 * should check for the same non-zero value of this
	 * counter both before and after reading all state.  The
	 * important point is that the reader is not a
	 * performance-critical path, but this function is.
	 *
	 * The ordering of writes is critical.  cpu_mstate_gen must
	 * be visibly zero on all CPUs before we change cpu_mstate
	 * and cpu_mstate_start.  Additionally, cpu_mstate_gen must
	 * not be restored to oldgen+1 until after all of the other
	 * writes have become visible.
	 *
	 * Normally one would use membar_producer() calls to accomplish
	 * this.  Unfortunately this routine is extremely performance
	 * critical (esp. in syscall_mstate below) and we cannot
	 * afford the additional time, particularly on some x86
	 * architectures with extremely slow sfence calls.  On a
	 * CPU which guarantees write ordering (including sparc, x86,
	 * and amd64) this is not a problem.  The compiler could still
	 * reorder the writes, so we make the four cpu fields
	 * volatile to prevent this.
	 *
	 * TSO warning: should we port to a non-TSO (or equivalent)
	 * CPU, this will break.
	 *
	 * The reader still needs the membar_consumer() calls because,
	 * although the volatiles prevent the compiler from reordering
	 * loads, the CPU can still do so.
	 */

	NEW_CPU_MSTATE(cmstate);
}

/*
 * Return an aggregation of user and system CPU time consumed by
 * the specified thread in scaled nanoseconds.
 */
hrtime_t
mstate_thread_onproc_time(kthread_t *t)
{
	hrtime_t aggr_time;
	hrtime_t now;
	hrtime_t waitrq;
	hrtime_t state_start;
	struct mstate *ms;
	klwp_t *lwp;
	int mstate;

	ASSERT(THREAD_LOCK_HELD(t));

	if ((lwp = ttolwp(t)) == NULL)
		return (0);

	mstate = t->t_mstate;
	waitrq = t->t_waitrq;
	ms = &lwp->lwp_mstate;
	state_start = ms->ms_state_start;

	aggr_time = ms->ms_acct[LMS_USER] +
	    ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];

	now = gethrtime_unscaled();

	/*
	 * NOTE: gethrtime_unscaled on X86 taken on different CPUs is
	 * inconsistent, so it is possible that now < state_start.
	 */
	if (mstate == LMS_USER || mstate == LMS_SYSTEM || mstate == LMS_TRAP) {
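		/*
		 * As in mstate_systhread_times() below, a non-zero t_waitrq
		 * marks when the thread last went back onto a run queue;
		 * on-CPU time only accumulates up to that point, and anything
		 * later counts as LMS_WAIT_CPU time instead.
		 */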
		/* if waitrq is zero, count all of the time. */
		if (waitrq == 0) {
			waitrq = now;
		}

		if (waitrq > state_start) {
			aggr_time += waitrq - state_start;
		}
	}

	scalehrtime(&aggr_time);
	return (aggr_time);
}

/*
 * Return the amount of onproc and runnable time this thread has experienced.
 *
 * Because the fields we read are not protected by locks when updated
 * by the thread itself, this is an inherently racy interface.  In
 * particular, the ASSERT(THREAD_LOCK_HELD(t)) doesn't guarantee as much
 * as it might appear to.
 *
 * The implication for users of this interface is that onproc and runnable
 * are *NOT* monotonically increasing; they may temporarily be larger than
 * they should be.
 */
void
mstate_systhread_times(kthread_t *t, hrtime_t *onproc, hrtime_t *runnable)
{
	struct mstate	*const	ms = &ttolwp(t)->lwp_mstate;

	int		mstate;
	hrtime_t	now;
	hrtime_t	state_start;
	hrtime_t	waitrq;
	hrtime_t	aggr_onp;
	hrtime_t	aggr_run;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t->t_procp->p_flag & SSYS);
	ASSERT(ttolwp(t) != NULL);

	/* shouldn't be any non-SYSTEM on-CPU time */
	ASSERT(ms->ms_acct[LMS_USER] == 0);
	ASSERT(ms->ms_acct[LMS_TRAP] == 0);

	mstate = t->t_mstate;
	waitrq = t->t_waitrq;
	state_start = ms->ms_state_start;

	aggr_onp = ms->ms_acct[LMS_SYSTEM];
	aggr_run = ms->ms_acct[LMS_WAIT_CPU];

	now = gethrtime_unscaled();

	/* if waitrq == 0, then there is no time to account to TS_RUN */
	if (waitrq == 0)
		waitrq = now;

	/* If there is system time to accumulate, do so */
	if (mstate == LMS_SYSTEM && state_start < waitrq)
		aggr_onp += waitrq - state_start;

	if (waitrq < now)
		aggr_run += now - waitrq;

	scalehrtime(&aggr_onp);
	scalehrtime(&aggr_run);

	*onproc = aggr_onp;
	*runnable = aggr_run;
}

/*
 * Return an aggregation of microstate times in scaled nanoseconds (high-res
 * time).  This keeps in mind that p_acct is already scaled, and ms_acct is
 * not.
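 *
 * (term_mstate() below folds an exiting lwp's unscaled ms_acct[] times into
 * p_acct after scaling them, while each live lwp's ms_acct[] remains
 * unscaled; hence the scalehrtime() applied to the per-thread values here.)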
 */
hrtime_t
mstate_aggr_state(proc_t *p, int a_state)
{
	struct mstate *ms;
	kthread_t *t;
	klwp_t *lwp;
	hrtime_t aggr_time;
	hrtime_t scaledtime;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT((unsigned)a_state < NMSTATES);

	aggr_time = p->p_acct[a_state];
	if (a_state == LMS_SYSTEM)
		aggr_time += p->p_acct[LMS_TRAP];

	t = p->p_tlist;
	if (t == NULL)
		return (aggr_time);

	do {
		if (t->t_proc_flag & TP_LWPEXIT)
			continue;

		lwp = ttolwp(t);
		ms = &lwp->lwp_mstate;
		scaledtime = ms->ms_acct[a_state];
		scalehrtime(&scaledtime);
		aggr_time += scaledtime;
		if (a_state == LMS_SYSTEM) {
			scaledtime = ms->ms_acct[LMS_TRAP];
			scalehrtime(&scaledtime);
			aggr_time += scaledtime;
		}
	} while ((t = t->t_forw) != p->p_tlist);

	return (aggr_time);
}

void
syscall_mstate(int fromms, int toms)
{
	kthread_t *t = curthread;
	zone_t *z = ttozone(t);
	struct mstate *ms;
	hrtime_t *mstimep;
	hrtime_t curtime;
	klwp_t *lwp;
	hrtime_t newtime;
	cpu_t *cpu;
	uint16_t gen;

	if ((lwp = ttolwp(t)) == NULL)
		return;

	ASSERT(fromms < NMSTATES);
	ASSERT(toms < NMSTATES);

	ms = &lwp->lwp_mstate;
	mstimep = &ms->ms_acct[fromms];
	curtime = gethrtime_unscaled();
	newtime = curtime - ms->ms_state_start;
	while (newtime < 0) {
		curtime = gethrtime_unscaled();
		newtime = curtime - ms->ms_state_start;
	}
	*mstimep += newtime;
	if (fromms == LMS_USER)
		atomic_add_64(&z->zone_utime, newtime);
	else if (fromms == LMS_SYSTEM)
		atomic_add_64(&z->zone_stime, newtime);
	t->t_mstate = toms;
	ms->ms_state_start = curtime;
	ms->ms_prev = fromms;
	kpreempt_disable(); /* don't change CPU while changing CPU's state */
	cpu = CPU;
	ASSERT(cpu == t->t_cpu);
	if ((toms != LMS_USER) && (cpu->cpu_mstate != CMS_SYSTEM)) {
		NEW_CPU_MSTATE(CMS_SYSTEM);
	} else if ((toms == LMS_USER) && (cpu->cpu_mstate != CMS_USER)) {
		NEW_CPU_MSTATE(CMS_USER);
	}
	kpreempt_enable();
}

#undef NEW_CPU_MSTATE

/*
 * The following is for computing the percentage of cpu time used recently
 * by an lwp.  The function cpu_decay() is also called from /proc code.
 *
 * exp_x(x):
 *	Given x as a 64-bit non-negative scaled integer of arbitrary
 *	magnitude, return exp(-x) as a 64-bit scaled integer in the range
 *	[0 .. 1].
 *
 * Scaling for 64-bit scaled integer:
 *	The binary point is to the right of the high-order bit
 *	of the low-order 32-bit word.
 */

#define	LSHIFT	31
#define	LSI_ONE	((uint32_t)1 << LSHIFT)	/* 32-bit scaled integer 1 */

#ifdef DEBUG
uint_t expx_cnt = 0;	/* number of calls to exp_x() */
uint_t expx_mul = 0;	/* number of long multiplies in exp_x() */
#endif

static uint64_t
exp_x(uint64_t x)
{
	int i;
	uint64_t ull;
	uint32_t ui;

#ifdef DEBUG
	expx_cnt++;
#endif
	/*
	 * By the formula:
	 *	exp(-x) = exp(-x/2) * exp(-x/2)
	 * we keep halving x until it becomes small enough for
	 * the following approximation to be accurate enough:
	 *	exp(-x) = 1 - x
	 * We reduce x until it is less than 1/4 (the 2 in LSHIFT-2 below).
	 * Our final error will be smaller than 4%.
	 */

	/*
	 * Use a uint64_t for the initial shift calculation.
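	 *
	 * The shift by (LSHIFT - 2) yields the number of quarter-units in x;
	 * e.g. an x of LSI_ONE (1.0) gives ull == 4, so x is halved three
	 * times (to 0.125, below the 1/4 threshold) and the 1 - x
	 * approximation is then squared three times on the way back out.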
	 */
	ull = x >> (LSHIFT-2);

	/*
	 * Short circuit:
	 * A number this large produces effectively 0 (actually .005).
	 * This way, we will never do more than 5 multiplies.
	 */
	if (ull >= (1 << 5))
		return (0);

	ui = ull;	/* OK.  Now we can use a uint_t. */
	for (i = 0; ui != 0; i++)
		ui >>= 1;

	if (i != 0) {
#ifdef DEBUG
		expx_mul += i;		/* seldom happens */
#endif
		x >>= i;
	}

	/*
	 * Now we compute 1 - x and square it the number of times
	 * that we halved x above to produce the final result:
	 */
	x = LSI_ONE - x;
	while (i--)
		x = (x * x) >> LSHIFT;

	return (x);
}

/*
 * Given the old percent cpu and a time delta in nanoseconds,
 * return the new decayed percent cpu:  pct * exp(-tau),
 * where 'tau' is the time delta multiplied by a decay factor.
 * We have chosen the decay factor (cpu_decay_factor in param.c)
 * to make the decay over five seconds be approximately 20%.
 *
 * 'pct' is a 32-bit scaled integer <= 1
 * The binary point is to the right of the high-order bit
 * of the 32-bit word.
 */
static uint32_t
cpu_decay(uint32_t pct, hrtime_t nsec)
{
	uint64_t delta = (uint64_t)nsec;

	delta /= cpu_decay_factor;
	return ((pct * exp_x(delta)) >> LSHIFT);
}

/*
 * Given the old percent cpu and a time delta in nanoseconds,
 * return the new grown percent cpu:  1 - ( 1 - pct ) * exp(-tau)
 */
static uint32_t
cpu_grow(uint32_t pct, hrtime_t nsec)
{
	return (LSI_ONE - cpu_decay(LSI_ONE - pct, nsec));
}

/*
 * Defined to determine whether an lwp is still on a processor.
 */

#define	T_ONPROC(kt)	\
	((kt)->t_mstate < LMS_SLEEP)
#define	T_OFFPROC(kt)	\
	((kt)->t_mstate >= LMS_SLEEP)

uint_t
cpu_update_pct(kthread_t *t, hrtime_t newtime)
{
	hrtime_t delta;
	hrtime_t hrlb;
	uint_t pctcpu;
	uint_t npctcpu;

	/*
	 * This routine can get called at PIL > 0, so this *has* to be
	 * done atomically.  Holding locks here causes bad things to happen
	 * (read: deadlock).
	 */

	do {
		if (T_ONPROC(t) && t->t_waitrq == 0) {
			hrlb = t->t_hrtime;
			delta = newtime - hrlb;
			if (delta < 0) {
				newtime = gethrtime_unscaled();
				delta = newtime - hrlb;
			}
			t->t_hrtime = newtime;
			scalehrtime(&delta);
			pctcpu = t->t_pctcpu;
			npctcpu = cpu_grow(pctcpu, delta);
		} else {
			hrlb = t->t_hrtime;
			delta = newtime - hrlb;
			if (delta < 0) {
				newtime = gethrtime_unscaled();
				delta = newtime - hrlb;
			}
			t->t_hrtime = newtime;
			scalehrtime(&delta);
			pctcpu = t->t_pctcpu;
			npctcpu = cpu_decay(pctcpu, delta);
		}
	} while (atomic_cas_32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu);

	return (npctcpu);
}

/*
 * Change the microstate level for the LWP and update the
 * associated accounting information.  Return the previous
 * LWP state.
 */
int
new_mstate(kthread_t *t, int new_state)
{
	struct mstate *ms;
	unsigned state;
	hrtime_t *mstimep;
	hrtime_t curtime;
	hrtime_t newtime;
	hrtime_t oldtime;
	hrtime_t ztime;
	hrtime_t origstart;
	klwp_t *lwp;
	zone_t *z;

	ASSERT(new_state != LMS_WAIT_CPU);
	ASSERT((unsigned)new_state < NMSTATES);
	ASSERT(t == curthread || THREAD_LOCK_HELD(t));

	/*
	 * Don't do microstate processing for threads without an lwp (kernel
	 * threads).
	 * Also, if we're an interrupt thread that is pinning another
	 * thread, our t_mstate hasn't been initialized.  We'd be modifying
	 * the microstate of the underlying lwp which doesn't realize that
	 * it's pinned.  In this case, also don't change the microstate.
	 */
	if (((lwp = ttolwp(t)) == NULL) || t->t_intr)
		return (LMS_SYSTEM);

	curtime = gethrtime_unscaled();

	/* adjust cpu percentages before we go any further */
	(void) cpu_update_pct(t, curtime);

	ms = &lwp->lwp_mstate;
	state = t->t_mstate;
	origstart = ms->ms_state_start;
	do {
		switch (state) {
		case LMS_TFAULT:
		case LMS_DFAULT:
		case LMS_KFAULT:
		case LMS_USER_LOCK:
			mstimep = &ms->ms_acct[LMS_SYSTEM];
			break;
		default:
			mstimep = &ms->ms_acct[state];
			break;
		}
		ztime = newtime = curtime - ms->ms_state_start;
		if (newtime < 0) {
			curtime = gethrtime_unscaled();
			oldtime = *mstimep - 1; /* force CAS to fail */
			continue;
		}
		oldtime = *mstimep;
		newtime += oldtime;
		t->t_mstate = new_state;
		ms->ms_state_start = curtime;
	} while (atomic_cas_64((uint64_t *)mstimep, oldtime, newtime) !=
	    oldtime);

	/*
	 * When the system boots the initial startup thread will have a
	 * ms_state_start of 0 which would add a huge system time to the
	 * global zone.  We want to skip aggregating that initial bit of work.
	 */
	if (origstart != 0) {
		z = ttozone(t);
		if (state == LMS_USER)
			atomic_add_64(&z->zone_utime, ztime);
		else if (state == LMS_SYSTEM)
			atomic_add_64(&z->zone_stime, ztime);
	}

	/*
	 * Remember the previous running microstate.
	 */
	if (state != LMS_SLEEP && state != LMS_STOPPED)
		ms->ms_prev = state;

	/*
	 * Switch CPU microstate if appropriate
	 */

	kpreempt_disable(); /* MUST disable kpreempt before touching t->cpu */
	ASSERT(t->t_cpu == CPU);
	if (!CPU_ON_INTR(t->t_cpu) && curthread->t_intr == NULL) {
		if (new_state == LMS_USER && t->t_cpu->cpu_mstate != CMS_USER)
			new_cpu_mstate(CMS_USER, curtime);
		else if (new_state != LMS_USER &&
		    t->t_cpu->cpu_mstate != CMS_SYSTEM)
			new_cpu_mstate(CMS_SYSTEM, curtime);
	}
	kpreempt_enable();

	return (ms->ms_prev);
}

/*
 * Restore the LWP microstate to the previous runnable state.
 * Called from disp() with the newly selected lwp.
 */
void
restore_mstate(kthread_t *t)
{
	struct mstate *ms;
	hrtime_t *mstimep;
	klwp_t *lwp;
	hrtime_t curtime;
	hrtime_t waitrq;
	hrtime_t newtime;
	hrtime_t oldtime;
	hrtime_t waittime;
	zone_t *z;

	/*
	 * Don't call restore_mstate for threads without lwps (kernel
	 * threads).
	 *
	 * Threads with t_intr set shouldn't be in the dispatcher, so assert
	 * that nobody here has t_intr.
	 */
	ASSERT(t->t_intr == NULL);

	if ((lwp = ttolwp(t)) == NULL)
		return;

	curtime = gethrtime_unscaled();
	(void) cpu_update_pct(t, curtime);
	ms = &lwp->lwp_mstate;
	ASSERT((unsigned)t->t_mstate < NMSTATES);
	do {
		switch (t->t_mstate) {
		case LMS_SLEEP:
			/*
			 * Update the timer for the current sleep state.
			 */
			ASSERT((unsigned)ms->ms_prev < NMSTATES);
			switch (ms->ms_prev) {
			case LMS_TFAULT:
			case LMS_DFAULT:
			case LMS_KFAULT:
			case LMS_USER_LOCK:
				mstimep = &ms->ms_acct[ms->ms_prev];
				break;
			default:
				mstimep = &ms->ms_acct[LMS_SLEEP];
				break;
			}
			/*
			 * Return to the previous run state.
			 */
			t->t_mstate = ms->ms_prev;
			break;
		case LMS_STOPPED:
			mstimep = &ms->ms_acct[LMS_STOPPED];
			/*
			 * Return to the previous run state.
			 */
			t->t_mstate = ms->ms_prev;
			break;
		case LMS_TFAULT:
		case LMS_DFAULT:
		case LMS_KFAULT:
		case LMS_USER_LOCK:
			mstimep = &ms->ms_acct[LMS_SYSTEM];
			break;
		default:
			mstimep = &ms->ms_acct[t->t_mstate];
			break;
		}
		waitrq = t->t_waitrq;	/* hopefully atomic */
		if (waitrq == 0) {
			waitrq = curtime;
		}
		t->t_waitrq = 0;
		newtime = waitrq - ms->ms_state_start;
		if (newtime < 0) {
			curtime = gethrtime_unscaled();
			oldtime = *mstimep - 1; /* force CAS to fail */
			continue;
		}
		oldtime = *mstimep;
		newtime += oldtime;
	} while (atomic_cas_64((uint64_t *)mstimep, oldtime, newtime) !=
	    oldtime);

	/*
	 * Update the WAIT_CPU timer and per-cpu waitrq total.
	 */
	z = ttozone(t);
	waittime = curtime - waitrq;
	ms->ms_acct[LMS_WAIT_CPU] += waittime;
	atomic_add_64(&z->zone_wtime, waittime);
	CPU->cpu_waitrq += waittime;
	ms->ms_state_start = curtime;
}

/*
 * Copy lwp microstate accounting and resource usage information
 * to the process.  (lwp is terminating)
 */
void
term_mstate(kthread_t *t)
{
	struct mstate *ms;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	int i;
	hrtime_t tmp;

	ASSERT(MUTEX_HELD(&p->p_lock));

	ms = &lwp->lwp_mstate;
	(void) new_mstate(t, LMS_STOPPED);
	ms->ms_term = ms->ms_state_start;
	tmp = ms->ms_term - ms->ms_start;
	scalehrtime(&tmp);
	p->p_mlreal += tmp;
	for (i = 0; i < NMSTATES; i++) {
		tmp = ms->ms_acct[i];
		scalehrtime(&tmp);
		p->p_acct[i] += tmp;
	}
	p->p_ru.minflt += lwp->lwp_ru.minflt;
	p->p_ru.majflt += lwp->lwp_ru.majflt;
	p->p_ru.nswap += lwp->lwp_ru.nswap;
	p->p_ru.inblock += lwp->lwp_ru.inblock;
	p->p_ru.oublock += lwp->lwp_ru.oublock;
	p->p_ru.msgsnd += lwp->lwp_ru.msgsnd;
	p->p_ru.msgrcv += lwp->lwp_ru.msgrcv;
	p->p_ru.nsignals += lwp->lwp_ru.nsignals;
	p->p_ru.nvcsw += lwp->lwp_ru.nvcsw;
	p->p_ru.nivcsw += lwp->lwp_ru.nivcsw;
	p->p_ru.sysc += lwp->lwp_ru.sysc;
	p->p_ru.ioch += lwp->lwp_ru.ioch;
	p->p_defunct++;
}