/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/cmn_err.h>
#include <sys/class.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/clock_tick.h>
#include <sys/sysmacros.h>
#include <vm/rm.h>

/*
 * This file contains the implementation of clock tick accounting for threads.
 * Every tick, user threads running on various CPUs are located and charged
 * with a tick to account for their use of CPU time.
 *
 * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
 * accounting for all the threads in the system. Tick accounting is done in
 * two phases:
 *
 * Tick scheduling	Done in clock_tick_schedule(). In this phase, cross
 *			calls are scheduled to multiple CPUs to perform
 *			multi-threaded tick accounting. The CPUs are chosen
 *			on a rotational basis so as to distribute the tick
 *			accounting load evenly across all CPUs.
 *
 * Tick execution	Done in clock_tick_execute(). In this phase, tick
 *			accounting is actually performed by softint handlers
 *			on multiple CPUs.
 *
 * This implementation gives us a multi-threaded tick processing facility that
 * is suitable for configurations with a large number of CPUs. On smaller
 * configurations it may be desirable to let the processing be single-threaded
 * and just allow clock() to do it as it has been done traditionally. To
 * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
 * that desire multi-threading should set this variable to something
 * appropriate. A recommended value may be found in clock_tick.h. At boot time,
 * if the number of CPUs is greater than clock_tick_threshold, multi-threading
 * kicks in. Note that this is a decision made at boot time. If more CPUs
 * are dynamically added later on to exceed the threshold, no attempt is made
 * to switch to multi-threaded. Similarly, if CPUs are removed dynamically
 * no attempt is made to switch to single-threaded. This is to keep the
 * implementation simple. Also note that the threshold can be changed for a
 * specific customer configuration via /etc/system.
 *
 * The boot time decision is reflected in clock_tick_single_threaded.
 */

/*
 * clock_tick_threshold
 *	If the number of CPUs at boot time exceeds this threshold,
 *	multi-threaded tick accounting kicks in.
 *
 * clock_tick_ncpus
 *	The number of CPUs in a set.
 *	Each set is scheduled for tick execution on a separate processor.
 *
 * clock_tick_single_threaded
 *	Indicates whether or not tick accounting is single-threaded.
 *
 * clock_tick_total_cpus
 *	Total number of online CPUs.
 *
 * clock_tick_cpus
 *	Array of online CPU pointers.
 *
 * clock_tick_cpu
 *	Per-CPU, cache-aligned data structures to facilitate multi-threading.
 *
 * clock_tick_active
 *	Counter that indicates the number of active tick processing softints
 *	in the system.
 *
 * clock_tick_pending
 *	Number of pending ticks that need to be accounted by the softint
 *	handlers.
 *
 * clock_tick_lock
 *	Mutex to synchronize between clock_tick_schedule() and
 *	CPU online/offline.
 *
 * clock_cpu_id
 *	CPU id of the clock() CPU. Used to detect when the clock CPU
 *	is offlined.
 *
 * clock_tick_online_cpuset
 *	CPU set of all online processors that can be X-called.
 *
 * clock_tick_proc_max
 *	Each process is allowed to accumulate a few ticks before checking
 *	for the task CPU time resource limit. We lower the number of calls
 *	to rctl_test() to make tick accounting more scalable. The tradeoff
 *	is that the limit may not get enforced in a timely manner. This is
 *	typically not a problem.
 *
 * clock_tick_set
 *	Per-set structures. Each structure contains the range of CPUs
 *	to be processed for the set.
 *
 * clock_tick_nsets
 *	Number of sets.
 *
 * clock_tick_scan
 *	Where to begin the scan for single-threaded mode. In multi-threaded,
 *	the clock_tick_set itself contains a field for this.
 */
int			clock_tick_threshold;
int			clock_tick_ncpus;
int			clock_tick_single_threaded;
int			clock_tick_total_cpus;
cpu_t			*clock_tick_cpus[NCPU];
clock_tick_cpu_t	*clock_tick_cpu[NCPU];
ulong_t			clock_tick_active;
int			clock_tick_pending;
kmutex_t		clock_tick_lock;
processorid_t		clock_cpu_id;
cpuset_t		clock_tick_online_cpuset;
clock_t			clock_tick_proc_max;
clock_tick_set_t	*clock_tick_set;
int			clock_tick_nsets;
int			clock_tick_scan;

static uint_t	clock_tick_execute(caddr_t, caddr_t);
static void	clock_tick_execute_common(int, int, int, clock_t, int);

#define	CLOCK_TICK_ALIGN	64	/* cache alignment */

/*
 * Clock tick initialization is done in two phases:
 *
 * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
 *    up single-threading so that clock() can begin to do its job.
 *
 * 2. After the slave CPUs are initialized at boot time, we know the number
 *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
 *    required.
 */
void
clock_tick_init_pre(void)
{
	clock_tick_cpu_t	*ctp;
	int			i, n;
	clock_tick_set_t	*csp;
	uintptr_t		buf;
	size_t			size;

	clock_tick_single_threaded = 1;

	size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
	buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
	buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);

	/*
	 * Perform initialization in case multi-threading is chosen later.
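	 * The per-CPU structures are carved out of a single allocation that
	 * is padded and rounded up to CLOCK_TICK_ALIGN, so each structure
	 * starts on its own cache line and softint handlers running on
	 * different CPUs do not share cache lines.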
	 */
	for (i = 0; i < NCPU; i++, buf += size) {
		ctp = (clock_tick_cpu_t *)buf;
		clock_tick_cpu[i] = ctp;
		mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
		if (&create_softint != NULL) {
			ctp->ct_intr = create_softint(LOCK_LEVEL,
			    clock_tick_execute, (caddr_t)ctp);
		}
		ctp->ct_pending = 0;
	}

	mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Compute clock_tick_ncpus here. We need it to compute the
	 * maximum number of tick sets we need to support.
	 */
	ASSERT(clock_tick_ncpus >= 0);
	if (clock_tick_ncpus == 0)
		clock_tick_ncpus = CLOCK_TICK_NCPUS;
	if (clock_tick_ncpus > max_ncpus)
		clock_tick_ncpus = max_ncpus;

	/*
	 * Allocate and initialize the tick sets.
	 */
	n = (max_ncpus + clock_tick_ncpus - 1) / clock_tick_ncpus;
	clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
	for (i = 0; i < n; i++) {
		csp = &clock_tick_set[i];
		csp->ct_start = i * clock_tick_ncpus;
		csp->ct_scan = csp->ct_start;
		csp->ct_end = csp->ct_start;
	}
}

void
clock_tick_init_post(void)
{
	/*
	 * If a platform does not provide create_softint() and
	 * invoke_softint(), then we assume single-threaded operation.
	 */
	if (&invoke_softint == NULL)
		clock_tick_threshold = 0;

	ASSERT(clock_tick_threshold >= 0);

	if (clock_tick_threshold == 0)
		clock_tick_threshold = max_ncpus;

	/*
	 * If a platform does not specify a threshold or if the number of CPUs
	 * at boot time does not exceed the threshold, tick accounting remains
	 * single-threaded.
	 */
	if (ncpus <= clock_tick_threshold) {
		clock_tick_ncpus = max_ncpus;
		clock_tick_proc_max = 1;
		return;
	}

	/*
	 * OK. Multi-thread tick processing. If a platform has not specified
	 * the CPU set size for multi-threading, then use the default value.
	 * This value has been arrived at through measurements on large
	 * configuration systems.
	 */
	clock_tick_single_threaded = 0;
	if (clock_tick_proc_max == 0) {
		clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
		if (hires_tick)
			clock_tick_proc_max *= 10;
	}
}

static void
clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
{
	clock_tick_cpu_t	*ctp;

	ASSERT(&invoke_softint != NULL);
	/*
	 * Schedule tick accounting for a set of CPUs.
	 */
	ctp = clock_tick_cpu[cid];
	mutex_enter(&ctp->ct_lock);
	ctp->ct_lbolt = lbolt;
	ctp->ct_pending += pending;
	ctp->ct_start = csp->ct_start;
	ctp->ct_end = csp->ct_end;
	ctp->ct_scan = csp->ct_scan;
	mutex_exit(&ctp->ct_lock);

	invoke_softint(cid, ctp->ct_intr);
	/*
	 * Return without waiting for the softint to finish.
	 */
}

static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
	kthread_t	*t;
	kmutex_t	*plockp;
	int		notick, intr;
	klwp_id_t	lwp;

	/*
	 * The locking here is rather tricky. thread_free_prevent()
	 * prevents the thread returned from being freed while we
	 * are looking at it. We can then check if the thread
	 * is exiting and get the appropriate p_lock if it
	 * is not. We have to be careful, though, because
	 * the _process_ can still be freed while we've
	 * prevented thread free. To avoid touching the
	 * proc structure we put a pointer to the p_lock in the
	 * thread structure. The p_lock is persistent so we
	 * can acquire it even if the process is gone.
	 * At that point we can check (again) if the thread is exiting
	 * and either drop the lock or do the tick processing.
	 */
	t = cp->cpu_thread;	/* Current running thread */
	if (CPU == cp) {
		/*
		 * 't' will be the tick processing thread on this
		 * CPU. Use the pinned thread (if any) on this CPU
		 * as the target of the clock tick.
		 */
		if (t->t_intr != NULL)
			t = t->t_intr;
	}

	/*
	 * We use thread_free_prevent to keep the currently running
	 * thread from being freed or recycled while we're
	 * looking at it.
	 */
	thread_free_prevent(t);
	/*
	 * We cannot hold the cpu_lock to prevent the
	 * cpu_active from changing in the clock interrupt.
	 * As long as we don't block (or don't get pre-empted)
	 * the cpu_list will not change (all threads are paused
	 * before list modification).
	 */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		thread_free_allow(t);
		return;
	}

	/*
	 * Make sure the thread is still on the CPU.
	 */
	if ((t != cp->cpu_thread) &&
	    ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
		/*
		 * We could not locate the thread. Skip this CPU. Race
		 * conditions while performing these checks are benign.
		 * These checks are not perfect and they don't need
		 * to be.
		 */
		thread_free_allow(t);
		return;
	}

	intr = t->t_flag & T_INTR_THREAD;
	lwp = ttolwp(t);
	if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
		/*
		 * Thread is exiting (or uninteresting) so don't
		 * do tick processing.
		 */
		thread_free_allow(t);
		return;
	}

	/*
	 * OK, try to grab the process lock. See
	 * comments above for why we're not using
	 * ttoproc(t)->p_lockp here.
	 */
	plockp = t->t_plockp;
	mutex_enter(plockp);
	/* See above comment. */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * The thread may have exited between when we
	 * checked above, and when we got the p_lock.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * Either we have the p_lock for the thread's process,
	 * or we don't care about the thread structure any more.
	 * Either way we can allow thread free.
	 */
	thread_free_allow(t);

	/*
	 * If we haven't done tick processing for this
	 * lwp, then do it now. Since we don't hold the
	 * lwp down on a CPU it can migrate and show up
	 * more than once, hence the lbolt check. mylbolt
	 * is copied at the time of tick scheduling to prevent
	 * lbolt mismatches.
	 *
	 * Also, make sure that it's okay to perform the
	 * tick processing before calling clock_tick.
	 * Setting notick to a TRUE value (i.e. not 0)
	 * results in tick processing not being performed for
	 * that thread.
	 */
	notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
	    (cp->cpu_dispthread == cp->cpu_idle_thread));

	if ((!notick) && (t->t_lbolt < mylbolt)) {
		t->t_lbolt = mylbolt;
		clock_tick(t, pending);
	}

	mutex_exit(plockp);
}

void
clock_tick_schedule(int one_sec)
{
	ulong_t			active;
	int			i, end;
	clock_tick_set_t	*csp;
	cpu_t			*cp;

	if (clock_cpu_id != CPU->cpu_id)
		clock_cpu_id = CPU->cpu_id;

	if (clock_tick_single_threaded) {
		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
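		 * clock_tick_execute_common() scans clock_tick_cpus[] from
		 * clock_tick_scan to the end and then wraps around from the
		 * beginning, so every online CPU is visited once per tick
		 * regardless of where the scan starts.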
		 */
		end = clock_tick_total_cpus;
		clock_tick_scan++;
		if (clock_tick_scan >= end)
			clock_tick_scan = 0;

		clock_tick_execute_common(0, clock_tick_scan, end, lbolt, 1);

		return;
	}

	/*
	 * If the previous invocation of handlers is not yet finished, then
	 * simply increment a pending count and return. Eventually when they
	 * finish, the pending count is passed down to the next set of
	 * handlers to process. This way, ticks that have already elapsed
	 * are handled as quickly as possible to minimize the
	 * chances of threads getting away before their pending ticks are
	 * accounted. The other benefit is that if the pending count is
	 * more than one, it can be handled by a single invocation of
	 * clock_tick(). This is a good optimization for busy systems with
	 * large configurations, where tick accounting can get backed up for
	 * various reasons.
	 */
	clock_tick_pending++;

	/*
	 * atomic_cas_ulong() with identical old and new values is used
	 * here simply to read clock_tick_active atomically.
	 */
	active = clock_tick_active;
	active = atomic_cas_ulong(&clock_tick_active, active, active);
	if (active)
		return;

	/*
	 * We want to handle the clock CPU here. If we
	 * scheduled the accounting for the clock CPU to another
	 * processor, that processor will find only the clock() thread
	 * running and not account for any user thread below it. Also,
	 * we want to handle this before we block on anything and allow
	 * the pinned thread below the current thread to escape.
	 */
	clock_tick_process(CPU, lbolt, clock_tick_pending);

	mutex_enter(&clock_tick_lock);

	/*
	 * Schedule each set on a separate processor.
	 */
	cp = clock_cpu_list;
	for (i = 0; i < clock_tick_nsets; i++) {
		csp = &clock_tick_set[i];

		/*
		 * Pick the next online CPU in list for scheduling tick
		 * accounting. clock_tick_lock is held here, so CPU
		 * online/offline cannot muck with this while
		 * we are picking our CPU to X-call.
		 */
		if (cp == CPU)
			cp = cp->cpu_next_onln;

		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		csp->ct_scan++;
		if (csp->ct_scan >= csp->ct_end)
			csp->ct_scan = csp->ct_start;

		clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);

		cp = cp->cpu_next_onln;
	}

	if (one_sec) {
		/*
		 * Move the CPU pointer around every second. This is so
		 * all the CPUs can be X-called in a round-robin fashion
		 * to evenly distribute the X-calls. We don't do this
		 * at a faster rate because we don't want
		 * to affect cache performance negatively.
		 */
		clock_cpu_list = clock_cpu_list->cpu_next_onln;
	}

	mutex_exit(&clock_tick_lock);

	clock_tick_pending = 0;
}

static void
clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
    int pending)
{
	cpu_t	*cp;
	int	i;

	ASSERT((start <= scan) && (scan <= end));

	/*
	 * Handle the thread on current CPU first. This is to prevent a
	 * pinned thread from escaping if we ever block on something.
	 * Note that in the single-threaded mode, this handles the clock
	 * CPU.
	 */
	clock_tick_process(CPU, mylbolt, pending);

	/*
	 * Perform tick accounting for the threads running on
	 * the scheduled CPUs.
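	 * The range is covered in two passes: first from 'scan' up to
	 * 'end', then wrapping around from 'start' up to 'scan'. The
	 * starting point rotates, but every CPU in the range is visited
	 * exactly once.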
	 */
	for (i = scan; i < end; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}

	for (i = start; i < scan; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}
}

/*ARGSUSED*/
static uint_t
clock_tick_execute(caddr_t arg1, caddr_t arg2)
{
	clock_tick_cpu_t	*ctp;
	int			start, scan, end, pending;
	clock_t			mylbolt;

	/*
	 * We could have raced with cpu offline. We don't want to
	 * process anything on an offlined CPU. If we got blocked
	 * on anything, we may not get scheduled when we wake up
	 * later on.
	 */
	if (!CLOCK_TICK_XCALL_SAFE(CPU))
		return (1);

	atomic_inc_ulong(&clock_tick_active);

	ctp = (clock_tick_cpu_t *)arg1;
	mutex_enter(&ctp->ct_lock);
	pending = ctp->ct_pending;
	if (pending == 0) {
		/*
		 * If a CPU is busy at LOCK_LEVEL, then an invocation
		 * of this softint may be queued for some time. In that case,
		 * clock_tick_active will not be incremented.
		 * clock_tick_schedule() will then assume that the previous
		 * invocation is done and post a new softint. The first one
		 * that gets in will reset the pending count so the
		 * second one is a no-op.
		 */
		mutex_exit(&ctp->ct_lock);
		goto out;
	}
	ctp->ct_pending = 0;
	start = ctp->ct_start;
	end = ctp->ct_end;
	scan = ctp->ct_scan;
	mylbolt = ctp->ct_lbolt;
	mutex_exit(&ctp->ct_lock);

	clock_tick_execute_common(start, scan, end, mylbolt, pending);

out:
	/*
	 * Signal completion to the clock handler.
	 */
	atomic_dec_ulong(&clock_tick_active);

	return (1);
}

/*ARGSUSED*/
static int
clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
{
	cpu_t			*cp, *ncp;
	int			i, set;
	clock_tick_set_t	*csp;

	/*
	 * This function performs some computations at CPU offline/online
	 * time. The computed values are used during tick scheduling and
	 * execution phases. This avoids having to compute things on
	 * an every-tick basis. The other benefit is that we perform the
	 * computations only for onlined CPUs (not offlined ones). As a
	 * result, no tick processing is attempted for offlined CPUs.
	 *
	 * Also, cpu_offline() calls this function before checking for
	 * active interrupt threads. This allows us to avoid posting
	 * cross calls to CPUs that are being offlined.
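	 *
	 * For CPU_OFF, the outgoing CPU is removed from
	 * clock_tick_online_cpuset and clock_tick_cpus[] is rebuilt from
	 * the CPUs remaining on the cpu_active list.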
	 */

	cp = cpu[cid];

	mutex_enter(&clock_tick_lock);

	switch (what) {
	case CPU_ON:
		clock_tick_cpus[clock_tick_total_cpus] = cp;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end++;
		clock_tick_total_cpus++;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
		membar_sync();
		break;

	case CPU_OFF:
		if (&sync_softint != NULL)
			sync_softint(clock_tick_online_cpuset);
		CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
		clock_tick_total_cpus--;
		clock_tick_cpus[clock_tick_total_cpus] = NULL;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end--;

		i = 0;
		ncp = cpu_active;
		do {
			if (cp == ncp)
				continue;
			clock_tick_cpus[i] = ncp;
			i++;
		} while ((ncp = ncp->cpu_next_onln) != cpu_active);
		ASSERT(i == clock_tick_total_cpus);
		membar_sync();
		break;

	default:
		break;
	}

	mutex_exit(&clock_tick_lock);

	return (0);
}

void
clock_tick_mp_init(void)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		(void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	register_cpu_setup_func(clock_tick_cpu_setup, NULL);

	mutex_exit(&cpu_lock);
}