/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/cmn_err.h>
#include <sys/class.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/clock_tick.h>
#include <sys/sysmacros.h>
#include <vm/rm.h>

/*
 * This file contains the implementation of clock tick accounting for threads.
 * Every tick, user threads running on various CPUs are located and charged
 * with a tick to account for their use of CPU time.
 *
 * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
 * accounting for all the threads in the system. Tick accounting is done in
 * two phases:
 *
 * Tick scheduling	Done in clock_tick_schedule(). In this phase, cross
 *			calls are scheduled to multiple CPUs to perform
 *			multi-threaded tick accounting. The CPUs are chosen
 *			on a rotational basis so as to distribute the tick
 *			accounting load evenly across all CPUs.
 *
 * Tick execution	Done in clock_tick_execute(). In this phase, tick
 *			accounting is actually performed by softint handlers
 *			on multiple CPUs.
 *
 * This implementation gives us a multi-threaded tick processing facility that
 * is suitable for configurations with a large number of CPUs. On smaller
 * configurations it may be desirable to let the processing be single-threaded
 * and just allow clock() to do it as it has been done traditionally. To
 * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
 * that desire multi-threading should set this variable to something
 * appropriate. A recommended value may be found in clock_tick.h. At boot time,
 * if the number of CPUs is greater than clock_tick_threshold, multi-threading
 * kicks in. Note that this is a decision made at boot time. If more CPUs
 * are dynamically added later on to exceed the threshold, no attempt is made
 * to switch to multi-threaded mode. Similarly, if CPUs are removed
 * dynamically, no attempt is made to switch to single-threaded mode. This is
 * to keep the implementation simple. Also note that the threshold can be
 * changed for a specific customer configuration via /etc/system.
 *
 * The boot time decision is reflected in clock_tick_single_threaded.
 */

/*
 * clock_tick_threshold
 *	If the number of CPUs at boot time exceeds this threshold,
 *	multi-threaded tick accounting kicks in.
 *
 * clock_tick_ncpus
 *	The number of CPUs in a set. Each set is scheduled for tick execution
 *	on a separate processor.
 *
 * clock_tick_single_threaded
 *	Indicates whether or not tick accounting is single-threaded.
 *
 * clock_tick_total_cpus
 *	Total number of online CPUs.
 *
 * clock_tick_cpus
 *	Array of online CPU pointers.
 *
 * clock_tick_cpu
 *	Per-CPU, cache-aligned data structures to facilitate multi-threading.
 *
 * clock_tick_active
 *	Counter that indicates the number of active tick processing softints
 *	in the system.
 *
 * clock_tick_pending
 *	Number of pending ticks that need to be accounted by the softint
 *	handlers.
 *
 * clock_tick_lock
 *	Mutex to synchronize between clock_tick_schedule() and
 *	CPU online/offline.
 *
 * clock_cpu_id
 *	CPU id of the clock() CPU. Used to detect when the clock CPU
 *	is offlined.
 *
 * clock_tick_online_cpuset
 *	CPU set of all online processors that can be X-called.
 *
 * clock_tick_proc_max
 *	Each process is allowed to accumulate a few ticks before checking
 *	for the task CPU time resource limit. We lower the number of calls
 *	to rctl_test() to make tick accounting more scalable. The tradeoff
 *	is that the limit may not get enforced in a timely manner. This is
 *	typically not a problem.
 *
 * clock_tick_set
 *	Per-set structures. Each structure contains the range of CPUs
 *	to be processed for the set.
 *
 * clock_tick_nsets
 *	Number of sets.
 *
 * clock_tick_scan
 *	Where to begin the scan for single-threaded mode. In multi-threaded
 *	mode, the clock_tick_set itself contains a field for this.
 */
int clock_tick_threshold;
int clock_tick_ncpus;
int clock_tick_single_threaded;
int clock_tick_total_cpus;
cpu_t *clock_tick_cpus[NCPU];
clock_tick_cpu_t *clock_tick_cpu[NCPU];
ulong_t clock_tick_active;
int clock_tick_pending;
kmutex_t clock_tick_lock;
processorid_t clock_cpu_id;
cpuset_t clock_tick_online_cpuset;
clock_t clock_tick_proc_max;
clock_tick_set_t *clock_tick_set;
int clock_tick_nsets;
int clock_tick_scan;
ulong_t clock_tick_intr;

static uint_t clock_tick_execute(caddr_t, caddr_t);
static void clock_tick_execute_common(int, int, int, clock_t, int);

#define	CLOCK_TICK_ALIGN	64	/* cache alignment */

/*
 * Clock tick initialization is done in two phases:
 *
 * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
 *    up single-threading so that clock() can begin to do its job.
 *
 * 2. After the slave CPUs are initialized at boot time, we know the number
 *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
 *    required.
 */
void
clock_tick_init_pre(void)
{
	clock_tick_cpu_t *ctp;
	int i, n;
	clock_tick_set_t *csp;
	uintptr_t buf;
	size_t size;

	clock_tick_single_threaded = 1;

	size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
	buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
	buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);

	/*
	 * Perform initialization in case multi-threading is chosen later.
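	 *
	 * Note that create_softint() and invoke_softint() are optional,
	 * platform-provided interfaces; the address checks below (and in
	 * clock_tick_init_post()) skip the softint setup on platforms that
	 * do not supply them. The per-CPU state is set up for all NCPU
	 * slots here so that no further allocation is needed if
	 * multi-threading is enabled later by clock_tick_init_post().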
	 */
	if (&create_softint != NULL) {
		clock_tick_intr = create_softint(LOCK_LEVEL,
		    clock_tick_execute, (caddr_t)NULL);
	}
	for (i = 0; i < NCPU; i++, buf += size) {
		ctp = (clock_tick_cpu_t *)buf;
		clock_tick_cpu[i] = ctp;
		mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
		if (&create_softint != NULL) {
			ctp->ct_intr = clock_tick_intr;
		}
		ctp->ct_pending = 0;
	}

	mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Compute clock_tick_ncpus here. We need it to compute the
	 * maximum number of tick sets we need to support.
	 */
	ASSERT(clock_tick_ncpus >= 0);
	if (clock_tick_ncpus == 0)
		clock_tick_ncpus = CLOCK_TICK_NCPUS;
	if (clock_tick_ncpus > max_ncpus)
		clock_tick_ncpus = max_ncpus;

	/*
	 * Allocate and initialize the tick sets.
	 */
	n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus;
	clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
	for (i = 0; i < n; i++) {
		csp = &clock_tick_set[i];
		csp->ct_start = i * clock_tick_ncpus;
		csp->ct_scan = csp->ct_start;
		csp->ct_end = csp->ct_start;
	}
}

void
clock_tick_init_post(void)
{
	/*
	 * If a platform does not provide create_softint() and
	 * invoke_softint(), then we assume single-threaded.
	 */
	if (&invoke_softint == NULL)
		clock_tick_threshold = 0;

	ASSERT(clock_tick_threshold >= 0);

	if (clock_tick_threshold == 0)
		clock_tick_threshold = max_ncpus;

	/*
	 * If a platform does not specify a threshold or if the number of CPUs
	 * at boot time does not exceed the threshold, tick accounting remains
	 * single-threaded.
	 */
	if (ncpus <= clock_tick_threshold) {
		clock_tick_ncpus = max_ncpus;
		clock_tick_proc_max = 1;
		return;
	}

	/*
	 * OK. Multi-thread tick processing. If a platform has not specified
	 * the CPU set size for multi-threading, then use the default value.
	 * This value has been arrived at through measurements on large
	 * configuration systems.
	 */
	clock_tick_single_threaded = 0;
	if (clock_tick_proc_max == 0) {
		clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
		if (hires_tick)
			clock_tick_proc_max *= 10;
	}
}

static void
clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
{
	clock_tick_cpu_t *ctp;

	ASSERT(&invoke_softint != NULL);

	atomic_inc_ulong(&clock_tick_active);

	/*
	 * Schedule tick accounting for a set of CPUs.
	 */
	ctp = clock_tick_cpu[cid];
	mutex_enter(&ctp->ct_lock);
	ctp->ct_lbolt = lbolt;
	ctp->ct_pending += pending;
	ctp->ct_start = csp->ct_start;
	ctp->ct_end = csp->ct_end;
	ctp->ct_scan = csp->ct_scan;
	mutex_exit(&ctp->ct_lock);

	invoke_softint(cid, ctp->ct_intr);
	/*
	 * Return without waiting for the softint to finish.
	 */
}

static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
	kthread_t *t;
	kmutex_t *plockp;
	int notick, intr;
	klwp_id_t lwp;

	/*
	 * The locking here is rather tricky. thread_free_prevent()
	 * prevents the thread returned from being freed while we
	 * are looking at it. We can then check if the thread
	 * is exiting and get the appropriate p_lock if it
	 * is not. We have to be careful, though, because
	 * the _process_ can still be freed while we've
	 * prevented thread free.
	 * To avoid touching the proc structure we put a pointer to the
	 * p_lock in the thread structure. The p_lock is persistent so we
	 * can acquire it even if the process is gone. At that point we can
	 * check (again) if the thread is exiting and either drop the lock
	 * or do the tick processing.
	 */
	t = cp->cpu_thread;	/* Current running thread */
	if (CPU == cp) {
		/*
		 * 't' will be the tick processing thread on this
		 * CPU. Use the pinned thread (if any) on this CPU
		 * as the target of the clock tick.
		 */
		if (t->t_intr != NULL)
			t = t->t_intr;
	}

	/*
	 * We use thread_free_prevent to keep the currently running
	 * thread from being freed or recycled while we're
	 * looking at it.
	 */
	thread_free_prevent(t);
	/*
	 * We cannot hold the cpu_lock to prevent cpu_active from changing
	 * in the clock interrupt. As long as we don't block (or don't get
	 * preempted) the cpu_list will not change (all threads are paused
	 * before list modification).
	 */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		thread_free_allow(t);
		return;
	}

	/*
	 * Make sure the thread is still on the CPU.
	 */
	if ((t != cp->cpu_thread) &&
	    ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
		/*
		 * We could not locate the thread. Skip this CPU. Race
		 * conditions while performing these checks are benign.
		 * These checks are not perfect and they don't need
		 * to be.
		 */
		thread_free_allow(t);
		return;
	}

	intr = t->t_flag & T_INTR_THREAD;
	lwp = ttolwp(t);
	if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
		/*
		 * Thread is exiting (or uninteresting) so don't
		 * do tick processing.
		 */
		thread_free_allow(t);
		return;
	}

	/*
	 * OK, try to grab the process lock. See
	 * comments above for why we're not using
	 * ttoproc(t)->p_lockp here.
	 */
	plockp = t->t_plockp;
	mutex_enter(plockp);
	/* See above comment. */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * The thread may have exited between when we
	 * checked above, and when we got the p_lock.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * Either we have the p_lock for the thread's process,
	 * or we don't care about the thread structure any more.
	 * Either way we can allow thread free.
	 */
	thread_free_allow(t);

	/*
	 * If we haven't done tick processing for this
	 * lwp, then do it now. Since we don't hold the
	 * lwp down on a CPU it can migrate and show up
	 * more than once, hence the lbolt check. mylbolt
	 * is copied at the time of tick scheduling to prevent
	 * lbolt mismatches.
	 *
	 * Also, make sure that it's okay to perform the
	 * tick processing before calling clock_tick.
	 * Setting notick to a TRUE value (i.e., not 0)
	 * results in tick processing not being performed for
	 * that thread.
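	 *
	 * Tick processing is skipped when the CPU has been quiesced, when
	 * the CPU is handling an interrupt at this point, or when the
	 * dispatched thread is the idle thread.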
	 */
	notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
	    (cp->cpu_dispthread == cp->cpu_idle_thread));

	if ((!notick) && (t->t_lbolt < mylbolt)) {
		t->t_lbolt = mylbolt;
		clock_tick(t, pending);
	}

	mutex_exit(plockp);
}

void
clock_tick_schedule(int one_sec)
{
	ulong_t active;
	int i, end;
	clock_tick_set_t *csp;
	cpu_t *cp;

	if (clock_cpu_id != CPU->cpu_id)
		clock_cpu_id = CPU->cpu_id;

	if (clock_tick_single_threaded) {
		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		end = clock_tick_total_cpus;
		clock_tick_scan++;
		if (clock_tick_scan >= end)
			clock_tick_scan = 0;

		clock_tick_execute_common(0, clock_tick_scan, end, lbolt, 1);

		return;
	}

	/*
	 * If the previous invocation of handlers is not yet finished, then
	 * simply increment a pending count and return. Eventually when they
	 * finish, the pending count is passed down to the next set of
	 * handlers to process. This way, ticks that have already elapsed
	 * in the past are handled as quickly as possible to minimize the
	 * chances of threads getting away before their pending ticks are
	 * accounted. The other benefit is that if the pending count is
	 * more than one, it can be handled by a single invocation of
	 * clock_tick(). This is a good optimization for busy systems with
	 * large configurations, where tick accounting can get backed up for
	 * various reasons.
	 */
	clock_tick_pending++;

	active = clock_tick_active;
	active = atomic_cas_ulong(&clock_tick_active, active, active);
	if (active)
		return;

	/*
	 * We want to handle the clock CPU here. If we
	 * scheduled the accounting for the clock CPU to another
	 * processor, that processor will find only the clock() thread
	 * running and not account for any user thread below it. Also,
	 * we want to handle this before we block on anything and allow
	 * the pinned thread below the current thread to escape.
	 */
	clock_tick_process(CPU, lbolt, clock_tick_pending);

	mutex_enter(&clock_tick_lock);

	/*
	 * Schedule each set on a separate processor.
	 */
	cp = clock_cpu_list;
	for (i = 0; i < clock_tick_nsets; i++) {
		csp = &clock_tick_set[i];

		/*
		 * Pick the next online CPU in the list for scheduling tick
		 * accounting. The clock_tick_lock is held across this loop.
		 * So, CPU online/offline cannot muck with this while
		 * we are picking our CPU to X-call.
		 */
		if (cp == CPU)
			cp = cp->cpu_next_onln;

		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		csp->ct_scan++;
		if (csp->ct_scan >= csp->ct_end)
			csp->ct_scan = csp->ct_start;

		clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);

		cp = cp->cpu_next_onln;
	}

	if (one_sec) {
		/*
		 * Move the CPU pointer around every second. This is so
		 * all the CPUs can be X-called in a round-robin fashion
		 * to evenly distribute the X-calls. We don't do this
		 * at a faster rate because we don't want
		 * to affect cache performance negatively.
		 */
		clock_cpu_list = clock_cpu_list->cpu_next_onln;
	}

	mutex_exit(&clock_tick_lock);

	clock_tick_pending = 0;
}

static void
clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
    int pending)
{
	cpu_t *cp;
	int i;

	ASSERT((start <= scan) && (scan <= end));

	/*
	 * Handle the thread on current CPU first. This is to prevent a
	 * pinned thread from escaping if we ever block on something.
	 * Note that in single-threaded mode, this handles the clock
	 * CPU.
	 */
	clock_tick_process(CPU, mylbolt, pending);

	/*
	 * Perform tick accounting for the threads running on
	 * the scheduled CPUs.
	 */
	for (i = scan; i < end; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}

	for (i = start; i < scan; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}
}

/*ARGSUSED*/
static uint_t
clock_tick_execute(caddr_t arg1, caddr_t arg2)
{
	clock_tick_cpu_t *ctp;
	int start, scan, end, pending;
	clock_t mylbolt;

	/*
	 * We could have raced with cpu offline. We don't want to
	 * process anything on an offlined CPU. If we got blocked
	 * on anything, we may not get scheduled when we wake up
	 * later on.
	 */
	if (!CLOCK_TICK_XCALL_SAFE(CPU))
		goto out;

	ctp = clock_tick_cpu[CPU->cpu_id];

	mutex_enter(&ctp->ct_lock);
	pending = ctp->ct_pending;
	if (pending == 0) {
		/*
		 * If a CPU is busy at LOCK_LEVEL, then an invocation
		 * of this softint may be queued for some time. In that case,
		 * clock_tick_active will not be incremented.
		 * clock_tick_schedule() will then assume that the previous
		 * invocation is done and post a new softint. The first one
		 * that gets in will reset the pending count so the
		 * second one is a noop.
		 */
		mutex_exit(&ctp->ct_lock);
		goto out;
	}
	ctp->ct_pending = 0;
	start = ctp->ct_start;
	end = ctp->ct_end;
	scan = ctp->ct_scan;
	mylbolt = ctp->ct_lbolt;
	mutex_exit(&ctp->ct_lock);

	clock_tick_execute_common(start, scan, end, mylbolt, pending);

out:
	/*
	 * Signal completion to the clock handler.
	 */
	atomic_dec_ulong(&clock_tick_active);

	return (1);
}

/*ARGSUSED*/
static int
clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
{
	cpu_t *cp, *ncp;
	int i, set;
	clock_tick_set_t *csp;

	/*
	 * This function performs some computations at CPU offline/online
	 * time. The computed values are used during tick scheduling and
	 * execution phases. This avoids having to compute things on
	 * every tick. The other benefit is that we perform the
	 * computations only for onlined CPUs (not offlined ones). As a
	 * result, no tick processing is attempted for offlined CPUs.
	 *
	 * Also, cpu_offline() calls this function before checking for
	 * active interrupt threads. This allows us to avoid posting
	 * cross calls to CPUs that are being offlined.
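	 *
	 * For CPU_ON, the new CPU is appended to clock_tick_cpus[] and the
	 * last tick set grows to cover it. For CPU_OFF, in-flight softints
	 * are drained first (via sync_softint(), where available), the CPU
	 * is removed from clock_tick_online_cpuset, and clock_tick_cpus[]
	 * is rebuilt from the remaining active CPUs.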
	 */

	cp = cpu[cid];

	mutex_enter(&clock_tick_lock);

	switch (what) {
	case CPU_ON:
		clock_tick_cpus[clock_tick_total_cpus] = cp;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end++;
		clock_tick_total_cpus++;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
		membar_sync();
		break;

	case CPU_OFF:
		if (&sync_softint != NULL)
			sync_softint(clock_tick_online_cpuset);
		CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
		clock_tick_total_cpus--;
		clock_tick_cpus[clock_tick_total_cpus] = NULL;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end--;

		i = 0;
		ncp = cpu_active;
		do {
			if (cp == ncp)
				continue;
			clock_tick_cpus[i] = ncp;
			i++;
		} while ((ncp = ncp->cpu_next_onln) != cpu_active);
		ASSERT(i == clock_tick_total_cpus);
		membar_sync();
		break;

	default:
		break;
	}

	mutex_exit(&clock_tick_lock);

	return (0);
}


void
clock_tick_mp_init(void)
{
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		(void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	register_cpu_setup_func(clock_tick_cpu_setup, NULL);

	mutex_exit(&cpu_lock);
}