/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/cmn_err.h>
#include <sys/class.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/clock_tick.h>
#include <sys/clock_impl.h>
#include <sys/sysmacros.h>
#include <vm/rm.h>

/*
 * This file contains the implementation of clock tick accounting for threads.
 * Every tick, user threads running on various CPUs are located and charged
 * with a tick to account for their use of CPU time.
 *
 * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
 * accounting for all the threads in the system. Tick accounting is done in
 * two phases:
 *
 * Tick scheduling    Done in clock_tick_schedule(). In this phase, cross
 *                    calls are scheduled to multiple CPUs to perform
 *                    multi-threaded tick accounting. The CPUs are chosen
 *                    on a rotational basis so as to distribute the tick
 *                    accounting load evenly across all CPUs.
 *
 * Tick execution     Done in clock_tick_execute(). In this phase, tick
 *                    accounting is actually performed by softint handlers
 *                    on multiple CPUs.
 *
 * This implementation gives us a multi-threaded tick processing facility that
 * is suitable for configurations with a large number of CPUs. On smaller
 * configurations it may be desirable to let the processing be single-threaded
 * and just allow clock() to do it as it has been done traditionally. To
 * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
 * that desire multi-threading should set this variable to something
 * appropriate. A recommended value may be found in clock_tick.h. At boot time,
 * if the number of CPUs is greater than clock_tick_threshold, multi-threading
 * kicks in. Note that this is a decision made at boot time. If more CPUs
 * are dynamically added later on to exceed the threshold, no attempt is made
 * to switch to multi-threaded mode. Similarly, if CPUs are removed
 * dynamically, no attempt is made to switch to single-threaded mode. This is
 * to keep the implementation simple. Also note that the threshold can be
 * changed for a specific customer configuration via /etc/system.
 *
 * The boot time decision is reflected in clock_tick_single_threaded.
 */

/*
 * clock_tick_threshold
 *      If the number of CPUs at boot time exceeds this threshold,
 *      multi-threaded tick accounting kicks in.
 *
 * clock_tick_ncpus
 *      The number of CPUs in a set. Each set is scheduled for tick execution
 *      on a separate processor.
 *
 * clock_tick_single_threaded
 *      Indicates whether or not tick accounting is single-threaded.
 *
 * clock_tick_total_cpus
 *      Total number of online CPUs.
 *
 * clock_tick_cpus
 *      Array of online CPU pointers.
 *
 * clock_tick_cpu
 *      Per-CPU, cache-aligned data structures to facilitate multi-threading.
 *
 * clock_tick_active
 *      Counter that indicates the number of active tick processing softints
 *      in the system.
 *
 * clock_tick_pending
 *      Number of pending ticks that need to be accounted by the softint
 *      handlers.
 *
 * clock_tick_lock
 *      Mutex to synchronize between clock_tick_schedule() and
 *      CPU online/offline.
 *
 * clock_cpu_id
 *      CPU id of the clock() CPU. Used to detect when the clock CPU
 *      is offlined.
 *
 * clock_tick_online_cpuset
 *      CPU set of all online processors that can be X-called.
 *
 * clock_tick_proc_max
 *      Each process is allowed to accumulate a few ticks before checking
 *      for the task CPU time resource limit. We lower the number of calls
 *      to rctl_test() to make tick accounting more scalable. The tradeoff
 *      is that the limit may not get enforced in a timely manner. This is
 *      typically not a problem.
 *
 * clock_tick_set
 *      Per-set structures. Each structure contains the range of CPUs
 *      to be processed for the set.
 *
 * clock_tick_nsets
 *      Number of sets.
 *
 * clock_tick_scan
 *      Where to begin the scan for single-threaded mode. In multi-threaded
 *      mode, the clock_tick_set itself contains a field for this.
 */
int               clock_tick_threshold;
int               clock_tick_ncpus;
int               clock_tick_single_threaded;
int               clock_tick_total_cpus;
cpu_t             *clock_tick_cpus[NCPU];
clock_tick_cpu_t  *clock_tick_cpu[NCPU];
ulong_t           clock_tick_active;
int               clock_tick_pending;
kmutex_t          clock_tick_lock;
processorid_t     clock_cpu_id;
cpuset_t          clock_tick_online_cpuset;
clock_t           clock_tick_proc_max;
clock_tick_set_t  *clock_tick_set;
int               clock_tick_nsets;
int               clock_tick_scan;
ulong_t           clock_tick_intr;

static uint_t   clock_tick_execute(caddr_t, caddr_t);
static void     clock_tick_execute_common(int, int, int, clock_t, int);

/*
 * Clock tick initialization is done in two phases:
 *
 * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
 *    up single-threading so that clock() can begin to do its job.
 *
 * 2. After the slave CPUs are initialized at boot time, we know the number
 *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
 *    required.
 */
void
clock_tick_init_pre(void)
{
        clock_tick_cpu_t        *ctp;
        int                     i, n;
        clock_tick_set_t        *csp;
        uintptr_t               abuf, buf;
        size_t                  size;

        clock_tick_single_threaded = 1;

        /*
         * We will not free this memory. To avoid false sharing, align it
         * to the cache line size.
         */
        size = P2ROUNDUP(sizeof (clock_tick_cpu_t), _CACHE_LINE_SIZE);
        abuf = (uintptr_t)kmem_zalloc(size * NCPU + _CACHE_LINE_SIZE, KM_SLEEP);
        buf = P2ROUNDUP(abuf, _CACHE_LINE_SIZE);

        /*
         * Perform initialization in case multi-threading is chosen later.
         */
        if (&create_softint != NULL) {
                clock_tick_intr = create_softint(LOCK_LEVEL,
                    clock_tick_execute, (caddr_t)NULL);
        }
        for (i = 0; i < NCPU; i++, buf += size) {
                ctp = (clock_tick_cpu_t *)buf;
                clock_tick_cpu[i] = ctp;
                mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
                if (&create_softint != NULL) {
                        ctp->ct_intr = clock_tick_intr;
                }
                ctp->ct_pending = 0;
        }

        mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Compute clock_tick_ncpus here. We need it to compute the
         * maximum number of tick sets we need to support.
         */
        ASSERT(clock_tick_ncpus >= 0);
        if (clock_tick_ncpus == 0)
                clock_tick_ncpus = CLOCK_TICK_NCPUS;
        if (clock_tick_ncpus > max_ncpus)
                clock_tick_ncpus = max_ncpus;

        /*
         * Allocate and initialize the tick sets.
         */
        n = (max_ncpus + clock_tick_ncpus - 1) / clock_tick_ncpus;
        clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
        for (i = 0; i < n; i++) {
                csp = &clock_tick_set[i];
                csp->ct_start = i * clock_tick_ncpus;
                csp->ct_scan = csp->ct_start;
                csp->ct_end = csp->ct_start;
        }
}

void
clock_tick_init_post(void)
{
        /*
         * If a platform does not provide create_softint() and
         * invoke_softint(), then we assume single-threaded operation.
         */
        if (&invoke_softint == NULL)
                clock_tick_threshold = 0;

        ASSERT(clock_tick_threshold >= 0);

        if (clock_tick_threshold == 0)
                clock_tick_threshold = max_ncpus;

        /*
         * If a platform does not specify a threshold or if the number of CPUs
         * at boot time does not exceed the threshold, tick accounting remains
         * single-threaded.
         */
        if (ncpus <= clock_tick_threshold) {
                clock_tick_ncpus = max_ncpus;
                clock_tick_proc_max = 1;
                return;
        }

        /*
         * OK. Multi-thread tick processing. If a platform has not specified
         * the CPU set size for multi-threading, then use the default value.
         * This value was arrived at through measurements on large
         * configurations.
         */
        clock_tick_single_threaded = 0;
        if (clock_tick_proc_max == 0) {
                clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
                if (hires_tick)
                        clock_tick_proc_max *= 10;
        }
}

static void
clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
{
        clock_tick_cpu_t        *ctp;

        ASSERT(&invoke_softint != NULL);

        atomic_inc_ulong(&clock_tick_active);

        /*
         * Schedule tick accounting for a set of CPUs.
         */
        ctp = clock_tick_cpu[cid];
        mutex_enter(&ctp->ct_lock);
        ctp->ct_lbolt = LBOLT_NO_ACCOUNT;
        ctp->ct_pending += pending;
        ctp->ct_start = csp->ct_start;
        ctp->ct_end = csp->ct_end;
        ctp->ct_scan = csp->ct_scan;
        mutex_exit(&ctp->ct_lock);

        invoke_softint(cid, ctp->ct_intr);
        /*
         * Return without waiting for the softint to finish.
         */
}

static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
        kthread_t       *t;
        kmutex_t        *plockp;
        int             notick, intr;
        klwp_id_t       lwp;

        /*
         * The locking here is rather tricky. thread_free_prevent()
         * prevents the thread returned from being freed while we
         * are looking at it. We can then check if the thread
         * is exiting and get the appropriate p_lock if it
         * is not. We have to be careful, though, because
         * the _process_ can still be freed while we've
         * prevented thread free.  To avoid touching the
         * proc structure we put a pointer to the p_lock in the
         * thread structure. The p_lock is persistent so we
         * can acquire it even if the process is gone. At that
         * point we can check (again) if the thread is exiting
         * and either drop the lock or do the tick processing.
         */
        t = cp->cpu_thread;     /* Current running thread */
        if (CPU == cp) {
                /*
                 * 't' will be the tick processing thread on this
                 * CPU.  Use the pinned thread (if any) on this CPU
                 * as the target of the clock tick.
                 */
                if (t->t_intr != NULL)
                        t = t->t_intr;
        }

        /*
         * We use thread_free_prevent to keep the currently running
         * thread from being freed or recycled while we're
         * looking at it.
         */
        thread_free_prevent(t);
        /*
         * We cannot hold the cpu_lock to prevent the
         * cpu_active from changing in the clock interrupt.
         * As long as we don't block (or don't get pre-empted)
         * the cpu_list will not change (all threads are paused
         * before list modification).
         */
        if (CLOCK_TICK_CPU_OFFLINE(cp)) {
                thread_free_allow(t);
                return;
        }

        /*
         * Make sure the thread is still on the CPU.
         */
        if ((t != cp->cpu_thread) &&
            ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
                /*
                 * We could not locate the thread. Skip this CPU. Race
                 * conditions while performing these checks are benign.
                 * These checks are not perfect and they don't need
                 * to be.
                 */
                thread_free_allow(t);
                return;
        }

        intr = t->t_flag & T_INTR_THREAD;
        lwp = ttolwp(t);
        if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
                /*
                 * Thread is exiting (or uninteresting) so don't
                 * do tick processing.
                 */
                thread_free_allow(t);
                return;
        }

        /*
         * OK, try to grab the process lock. See
         * comments above for why we're not using
         * ttoproc(t)->p_lockp here.
         */
        plockp = t->t_plockp;
        mutex_enter(plockp);
        /* See above comment. */
        if (CLOCK_TICK_CPU_OFFLINE(cp)) {
                mutex_exit(plockp);
                thread_free_allow(t);
                return;
        }

        /*
         * The thread may have exited between when we
         * checked above, and when we got the p_lock.
         */
        if (t->t_proc_flag & TP_LWPEXIT) {
                mutex_exit(plockp);
                thread_free_allow(t);
                return;
        }

        /*
         * Either we have the p_lock for the thread's process,
         * or we don't care about the thread structure any more.
         * Either way we can allow thread free.
         */
        thread_free_allow(t);

        /*
         * If we haven't done tick processing for this
         * lwp, then do it now. Since we don't hold the
         * lwp down on a CPU it can migrate and show up
         * more than once, hence the lbolt check. mylbolt
         * is copied at the time of tick scheduling to prevent
         * lbolt mismatches.
         *
         * Also, make sure that it's okay to perform the
         * tick processing before calling clock_tick.
         * Setting notick to a TRUE value (i.e., not 0)
         * results in tick processing not being performed for
         * that thread.
         */
        notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
            (cp->cpu_dispthread == cp->cpu_idle_thread));

        if ((!notick) && (t->t_lbolt < mylbolt)) {
                t->t_lbolt = mylbolt;
                clock_tick(t, pending);
        }

        mutex_exit(plockp);
}

void
clock_tick_schedule(int one_sec)
{
        ulong_t                 active;
        int                     i, end;
        clock_tick_set_t        *csp;
        cpu_t                   *cp;

        if (clock_cpu_id != CPU->cpu_id)
                clock_cpu_id = CPU->cpu_id;

        if (clock_tick_single_threaded) {
                /*
                 * Each tick cycle, start the scan from a different
                 * CPU for the sake of fairness.
                 */
                end = clock_tick_total_cpus;
                clock_tick_scan++;
                if (clock_tick_scan >= end)
                        clock_tick_scan = 0;

                clock_tick_execute_common(0, clock_tick_scan, end,
                    LBOLT_NO_ACCOUNT, 1);

                return;
        }

        /*
         * If the previous invocation of handlers is not yet finished, then
         * simply increment a pending count and return. Eventually when they
         * finish, the pending count is passed down to the next set of
         * handlers to process. This way, ticks that have already elapsed
         * in the past are handled as quickly as possible to minimize the
         * chances of threads getting away before their pending ticks are
         * accounted. The other benefit is that if the pending count is
         * more than one, it can be handled by a single invocation of
         * clock_tick(). This is a good optimization for large, busy
         * configurations where tick accounting can get backed up for
         * various reasons.
         */
        clock_tick_pending++;

        active = clock_tick_active;
        active = atomic_cas_ulong(&clock_tick_active, active, active);
        if (active)
                return;

        /*
         * We want to handle the clock CPU here. If we
         * scheduled the accounting for the clock CPU to another
         * processor, that processor will find only the clock() thread
         * running and not account for any user thread below it. Also,
         * we want to handle this before we block on anything and allow
         * the pinned thread below the current thread to escape.
         */
        clock_tick_process(CPU, LBOLT_NO_ACCOUNT, clock_tick_pending);

        mutex_enter(&clock_tick_lock);

        /*
         * Schedule each set on a separate processor.
         */
        cp = clock_cpu_list;
        for (i = 0; i < clock_tick_nsets; i++) {
                csp = &clock_tick_set[i];

                /*
                 * Pick the next online CPU in list for scheduling tick
                 * accounting. The clock_tick_lock is held across this
                 * loop, so CPU online/offline cannot muck with this while
                 * we are picking our CPU to X-call.
                 */
                if (cp == CPU)
                        cp = cp->cpu_next_onln;

                /*
                 * Each tick cycle, start the scan from a different
                 * CPU for the sake of fairness.
                 */
                csp->ct_scan++;
                if (csp->ct_scan >= csp->ct_end)
                        csp->ct_scan = csp->ct_start;

                clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);

                cp = cp->cpu_next_onln;
        }

        if (one_sec) {
                /*
                 * Move the CPU pointer around every second. This is so
                 * all the CPUs can be X-called in a round-robin fashion
                 * to evenly distribute the X-calls. We don't do this
                 * at a faster rate because we don't want
                 * to affect cache performance negatively.
                 */
                clock_cpu_list = clock_cpu_list->cpu_next_onln;
        }

        mutex_exit(&clock_tick_lock);

        clock_tick_pending = 0;
}

static void
clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
    int pending)
{
        cpu_t   *cp;
        int     i;

        ASSERT((start <= scan) && (scan <= end));

        /*
         * Handle the thread on current CPU first. This is to prevent a
         * pinned thread from escaping if we ever block on something.
         * Note that in the single-threaded mode, this handles the clock
         * CPU.
         */
        clock_tick_process(CPU, mylbolt, pending);

        /*
         * Perform tick accounting for the threads running on
         * the scheduled CPUs.
         */
        for (i = scan; i < end; i++) {
                cp = clock_tick_cpus[i];
                if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
                        continue;
                clock_tick_process(cp, mylbolt, pending);
        }

        for (i = start; i < scan; i++) {
                cp = clock_tick_cpus[i];
                if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
                        continue;
                clock_tick_process(cp, mylbolt, pending);
        }
}

/*ARGSUSED*/
static uint_t
clock_tick_execute(caddr_t arg1, caddr_t arg2)
{
        clock_tick_cpu_t        *ctp;
        int                     start, scan, end, pending;
        clock_t                 mylbolt;

        /*
         * We could have raced with cpu offline. We don't want to
         * process anything on an offlined CPU. If we got blocked
         * on anything, we may not get scheduled when we wake up
         * later on.
         */
        if (!CLOCK_TICK_XCALL_SAFE(CPU))
                goto out;

        ctp = clock_tick_cpu[CPU->cpu_id];

        mutex_enter(&ctp->ct_lock);
        pending = ctp->ct_pending;
        if (pending == 0) {
                /*
                 * If a CPU is busy at LOCK_LEVEL, then an invocation
                 * of this softint may be queued for some time. In that case,
                 * clock_tick_active will not be incremented.
                 * clock_tick_schedule() will then assume that the previous
                 * invocation is done and post a new softint. The first one
                 * that gets in will reset the pending count so the
                 * second one is a noop.
                 */
                mutex_exit(&ctp->ct_lock);
                goto out;
        }
        ctp->ct_pending = 0;
        start = ctp->ct_start;
        end = ctp->ct_end;
        scan = ctp->ct_scan;
        mylbolt = ctp->ct_lbolt;
        mutex_exit(&ctp->ct_lock);

        clock_tick_execute_common(start, scan, end, mylbolt, pending);

out:
        /*
         * Signal completion to the clock handler.
         */
        atomic_dec_ulong(&clock_tick_active);

        return (1);
}

/*ARGSUSED*/
static int
clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
{
        cpu_t                   *cp, *ncp;
        int                     i, set;
        clock_tick_set_t        *csp;

        /*
         * This function performs some computations at CPU offline/online
         * time. The computed values are used during tick scheduling and
         * execution phases. This avoids having to compute things on
         * every tick. The other benefit is that we perform the
         * computations only for onlined CPUs (not offlined ones). As a
         * result, no tick processing is attempted for offlined CPUs.
         *
         * Also, cpu_offline() calls this function before checking for
         * active interrupt threads. This allows us to avoid posting
         * cross calls to CPUs that are being offlined.
         */

        cp = cpu[cid];

        mutex_enter(&clock_tick_lock);

        switch (what) {
        case CPU_ON:
                clock_tick_cpus[clock_tick_total_cpus] = cp;
                set = clock_tick_total_cpus / clock_tick_ncpus;
                csp = &clock_tick_set[set];
                csp->ct_end++;
                clock_tick_total_cpus++;
                clock_tick_nsets =
                    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
                    clock_tick_ncpus;
                CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
                membar_sync();
                break;

        case CPU_OFF:
                if (&sync_softint != NULL)
                        sync_softint(clock_tick_online_cpuset);
                CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
                clock_tick_total_cpus--;
                clock_tick_cpus[clock_tick_total_cpus] = NULL;
                clock_tick_nsets =
                    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
                    clock_tick_ncpus;
                set = clock_tick_total_cpus / clock_tick_ncpus;
                csp = &clock_tick_set[set];
                csp->ct_end--;

                i = 0;
                ncp = cpu_active;
                do {
                        if (cp == ncp)
                                continue;
                        clock_tick_cpus[i] = ncp;
                        i++;
                } while ((ncp = ncp->cpu_next_onln) != cpu_active);
                ASSERT(i == clock_tick_total_cpus);
                membar_sync();
                break;

        default:
                break;
        }

        mutex_exit(&clock_tick_lock);

        return (0);
}


void
clock_tick_mp_init(void)
{
        cpu_t   *cp;

        mutex_enter(&cpu_lock);

        cp = cpu_active;
        do {
                (void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
        } while ((cp = cp->cpu_next_onln) != cpu_active);

        register_cpu_setup_func(clock_tick_cpu_setup, NULL);

        mutex_exit(&cpu_lock);
}
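
/*
 * Example of the /etc/system tuning mentioned in the block comment at the
 * top of this file. The values below are hypothetical and only illustrate
 * the mechanism; multi-threading takes effect only on platforms that
 * provide create_softint()/invoke_softint(), and appropriate values are
 * platform-specific:
 *
 *      set clock_tick_threshold = 16
 *      set clock_tick_ncpus = 8
 *
 * With these settings, a system that boots with more than 16 CPUs performs
 * multi-threaded tick accounting, grouping the online CPUs into sets of 8.
 * clock_tick_schedule() then hands each set to a separate CPU's softint
 * handler for tick execution.
 */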