/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/x_call.h>
#include <sys/xc_levels.h>
#include <sys/cpu.h>
#include <sys/psw.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/mutex_impl.h>
#include <sys/stack.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>

/*
 * Implementation for cross-processor calls via interprocessor interrupts
 *
 * This implementation uses a message passing architecture to allow multiple
 * concurrent cross calls to be in flight at any given time. We use the cmpxchg
 * instruction, aka casptr(), to implement simple efficient work queues for
 * message passing between CPUs with almost no need for regular locking.
 * See xc_extract() and xc_insert() below.
 *
 * The general idea is that initiating a cross call means putting a message
 * on a target(s) CPU's work queue. Any synchronization is handled by passing
 * the message back and forth between initiator and target(s).
 *
 * Every CPU has xc_work_cnt, which indicates it has messages to process.
 * This value is incremented as message traffic is initiated and decremented
 * with every message that finishes all processing.
 *
 * The code needs no mfence or other membar_*() calls. The uses of
 * casptr(), cas32() and atomic_dec_32() for the message passing are
 * implemented with LOCK prefix instructions which are equivalent to mfence.
 *
 * One interesting aspect of this implementation is that it allows 2 or more
 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
 * The cross call processing by the CPUs will happen in any order with only
 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
 * from cross calls before all slaves have invoked the function.
 *
 * The reason for this asynchronous approach is to allow for fast global
 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
 * on a different virtual address at the same time, the old code required
 * N squared IPIs. With this method, depending on timing, it could happen
 * with just N IPIs.
 */
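/*
 * Illustrative usage sketch (not part of this file): a typical initiator
 * builds a cpuset_t of the target CPUs and hands it to xc_call() as a bit
 * vector via CPUSET2BV(). The handler and argument names below are
 * hypothetical.
 *
 *	cpuset_t set;
 *
 *	CPUSET_ALL(set);	(target every CPU, including self)
 *	xc_call((xc_arg_t)my_arg, 0, 0, CPUSET2BV(set), my_handler);
 *
 * xc_call() returns only after every CPU_READY CPU in the set has invoked
 * my_handler(); xc_call_nowait() and xc_sync() below provide the weaker
 * and stronger variants.
 */
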
/*
 * The default is to not enable collecting counts of IPI information, since
 * the updating of shared cachelines could cause excess bus traffic.
 */
uint_t xc_collect_enable = 0;
uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
uint64_t xc_multi_cnt = 0;	/* # times we piggy backed on another IPI */

/*
 * Values for message states. Here are the normal transitions. A transition
 * of "->" happens in the slave cpu and "=>" happens in the master cpu as
 * the messages are passed back and forth.
 *
 * FREE => ASYNC -> DONE => FREE
 * FREE => CALL -> DONE => FREE
 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
 *
 * The interesting one above is ASYNC. You might ask, why not go directly
 * to FREE, instead of DONE. If it did that, it might be possible to exhaust
 * the master's xc_free list if a master can generate ASYNC messages faster
 * than the slave can process them. That could be handled with more
 * complicated code. However, since nothing important uses ASYNC, I've not
 * bothered.
 */
#define	XC_MSG_FREE	(0)	/* msg in xc_free queue */
#define	XC_MSG_ASYNC	(1)	/* msg in slave xc_msgbox */
#define	XC_MSG_CALL	(2)	/* msg in slave xc_msgbox */
#define	XC_MSG_SYNC	(3)	/* msg in slave xc_msgbox */
#define	XC_MSG_WAITING	(4)	/* msg in master xc_msgbox or xc_waiters */
#define	XC_MSG_RELEASED	(5)	/* msg in slave xc_msgbox */
#define	XC_MSG_DONE	(6)	/* msg in master xc_msgbox */

/*
 * We allow for one high priority message at a time to happen in the system.
 * This is used for panic, kmdb, etc., so no locking is done.
 */
static volatile cpuset_t xc_priority_set_store;
static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
static xc_data_t xc_priority_data;

/*
 * Wrappers to avoid C compiler warnings due to volatile. The atomic bit
 * operations don't accept volatile bit vectors - which is a bit silly.
 */
#define	XC_BT_SET(vector, b)	BT_ATOMIC_SET((ulong_t *)(vector), (b))
#define	XC_BT_CLEAR(vector, b)	BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))

/*
 * Decrement a CPU's work count
 */
static void
xc_decrement(struct machcpu *mcpu)
{
	atomic_dec_32(&mcpu->xc_work_cnt);
}

/*
 * Increment a CPU's work count and return the old value
 */
static int
xc_increment(struct machcpu *mcpu)
{
	int old;
	do {
		old = mcpu->xc_work_cnt;
	} while (cas32((uint32_t *)&mcpu->xc_work_cnt, old, old + 1) != old);
	return (old);
}

/*
 * Put a message into a queue. The insertion is atomic no matter
 * how many different inserts/extracts to the same queue happen.
 */
static void
xc_insert(void *queue, xc_msg_t *msg)
{
	xc_msg_t *old_head;
	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		msg->xc_next = old_head;
	} while (casptr(queue, old_head, msg) != old_head);
}

/*
 * Extract a message from a queue. The extraction is atomic only
 * when just one thread does extractions from the queue.
 * If the queue is empty, NULL is returned.
 */
static xc_msg_t *
xc_extract(xc_msg_t **queue)
{
	xc_msg_t *old_head;

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		if (old_head == NULL)
			return (old_head);
	} while (casptr(queue, old_head, old_head->xc_next) != old_head);
	old_head->xc_next = NULL;
	return (old_head);
}

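/*
 * Illustrative sketch (not part of this file): xc_insert() and xc_extract()
 * together behave like a lock-free LIFO ("Treiber stack") keyed on the
 * queue head pointer, e.g.
 *
 *	xc_msg_t *head = NULL;		(hypothetical local queue)
 *
 *	xc_insert(&head, msg);		(push; safe from any CPU)
 *	msg = xc_extract(&head);	(pop; safe only for a single extractor)
 *
 * This is why each xc_msgbox and xc_free list is only ever drained by the
 * CPU that owns it, while any CPU may insert into it.
 */
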
/*
 * Initialize the machcpu fields used for cross calls
 */
static uint_t xc_initialized = 0;
void
xc_init_cpu(struct cpu *cpup)
{
	xc_msg_t *msg;
	int c;

	/*
	 * Add a new msg to each existing CPU's free list, as well as one
	 * to my own free list for each of them.
	 */
	for (c = 0; c < ncpus; ++c) {
		if (cpu[c] == NULL)
			continue;
		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
		msg->xc_command = XC_MSG_FREE;
		xc_insert(&cpu[c]->cpu_m.xc_free, msg);

		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
		msg->xc_command = XC_MSG_FREE;
		xc_insert(&cpup->cpu_m.xc_free, msg);
	}

	/*
	 * Add one for self messages
	 */
	msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
	msg->xc_command = XC_MSG_FREE;
	xc_insert(&cpup->cpu_m.xc_free, msg);

	if (!xc_initialized)
		xc_initialized = 1;
}

/*
 * X-call message processing routine. Note that this is used by both
 * senders and recipients of messages.
 *
 * We're protected against changing CPUs by either being in a high-priority
 * interrupt, having preemption disabled or by having a raised SPL.
 */
/*ARGSUSED*/
uint_t
xc_serv(caddr_t arg1, caddr_t arg2)
{
	struct machcpu *mcpup = &(CPU->cpu_m);
	xc_msg_t *msg;
	xc_data_t *data;
	xc_msg_t *xc_waiters = NULL;
	uint32_t num_waiting = 0;
	xc_func_t func;
	xc_arg_t a1;
	xc_arg_t a2;
	xc_arg_t a3;
	uint_t rc = DDI_INTR_UNCLAIMED;

	while (mcpup->xc_work_cnt != 0) {
		rc = DDI_INTR_CLAIMED;

		/*
		 * We may have to wait for a message to arrive.
		 */
		for (;;) {
			/*
			 * Always check for and handle a priority message.
			 */
			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
				func = xc_priority_data.xc_func;
				a1 = xc_priority_data.xc_a1;
				a2 = xc_priority_data.xc_a2;
				a3 = xc_priority_data.xc_a3;
				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
				xc_decrement(mcpup);
				func(a1, a2, a3);
				if (mcpup->xc_work_cnt == 0)
					return (rc);
			}

			/*
			 * extract and handle regular message
			 */
			msg = xc_extract(&mcpup->xc_msgbox);
			if (msg != NULL)
				break;

			/*
			 * wait for a message to arrive
			 */
			if (x86_feature & X86_MWAIT) {
				i86_monitor(
				    (volatile uint32_t *)&mcpup->xc_msgbox,
				    0, 0);
				if (mcpup->xc_msgbox == NULL)
					i86_mwait(0, 0);
			} else {
				SMT_PAUSE();
			}
		}

		/*
		 * process the message
		 */
		switch (msg->xc_command) {

		/*
		 * ASYNC gives back the message immediately, then we do the
		 * function and return with no more waiting.
		 */
		case XC_MSG_ASYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			func = data->xc_func;
			a1 = data->xc_a1;
			a2 = data->xc_a2;
			a3 = data->xc_a3;
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			if (func != NULL)
				(void) (*func)(a1, a2, a3);
			xc_decrement(mcpup);
			break;

		/*
		 * SYNC messages do the call, then send it back to the master
		 * in WAITING mode
		 */
		case XC_MSG_SYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			msg->xc_command = XC_MSG_WAITING;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			break;

		/*
		 * WAITING messages are collected by the master until all
		 * have arrived. Once all arrive, we release them back to
		 * the slaves
		 */
		case XC_MSG_WAITING:
			xc_insert(&xc_waiters, msg);
			if (++num_waiting < mcpup->xc_wait_cnt)
				break;
			while ((msg = xc_extract(&xc_waiters)) != NULL) {
				msg->xc_command = XC_MSG_RELEASED;
				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
				    msg);
				--num_waiting;
			}
			if (num_waiting != 0)
				panic("wrong number waiting");
			mcpup->xc_wait_cnt = 0;
			break;

		/*
		 * CALL messages do the function and then, like RELEASED,
		 * send the message back to the master as DONE.
		 */
		case XC_MSG_CALL:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			/*FALLTHROUGH*/
		case XC_MSG_RELEASED:
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			xc_decrement(mcpup);
			break;

		/*
		 * DONE means a slave has completely finished up.
		 * Once we collect all the DONE messages, we'll exit
		 * processing too.
		 */
		case XC_MSG_DONE:
			msg->xc_command = XC_MSG_FREE;
			xc_insert(&mcpup->xc_free, msg);
			xc_decrement(mcpup);
			break;

		case XC_MSG_FREE:
			panic("free message in msgbox");
			break;

		default:
			panic("bad message in msgbox");
			break;
		}
	}
	return (rc);
}

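/*
 * Worked example (illustrative only): a single XC_MSG_SYNC message from
 * master M to slave S moves through the cases above as
 *
 *	M: FREE -> SYNC, posted to S's msgbox		(xc_common())
 *	S: runs the function, SYNC -> WAITING, back to M
 *	M: holds it in xc_waiters until all WAITINGs arrive,
 *	   then WAITING -> RELEASED, back to S
 *	S: RELEASED -> DONE, back to M, drops its own work count
 *	M: DONE -> FREE, onto its xc_free list, drops its own work count
 *
 * which matches the state diagram near the XC_MSG_* definitions.
 */
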
/*
 * Initiate cross call processing.
 */
static void
xc_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	uint_t command)
{
	int c;
	struct cpu *cpup;
	xc_msg_t *msg;
	xc_data_t *data;
	int cnt;
	int save_spl;

	if (!xc_initialized) {
		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
		    func != NULL)
			(void) (*func)(arg1, arg2, arg3);
		return;
	}

	save_spl = splr(ipltospl(XC_HI_PIL));

	/*
	 * fill in cross call data
	 */
	data = &CPU->cpu_m.xc_data;
	data->xc_func = func;
	data->xc_a1 = arg1;
	data->xc_a2 = arg2;
	data->xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 */
	CPU->cpu_m.xc_wait_cnt = 0;
	for (c = 0; c < ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * Fill out a new message.
		 */
		msg = xc_extract(&CPU->cpu_m.xc_free);
		if (msg == NULL)
			panic("Ran out of free xc_msg_t's");
		msg->xc_command = command;
		msg->xc_master = CPU->cpu_id;
		msg->xc_slave = c;

		/*
		 * Increment my work count for all messages that I'll
		 * transition from DONE to FREE.
		 * Also remember how many XC_MSG_WAITINGs to look for
		 */
		(void) xc_increment(&CPU->cpu_m);
		if (command == XC_MSG_SYNC)
			++CPU->cpu_m.xc_wait_cnt;

		/*
		 * Increment the target CPU work count then insert the message
		 * in the target msgbox. If I post the first bit of work
		 * for the target to do, send an IPI to the target CPU.
		 */
		cnt = xc_increment(&cpup->cpu_m);
		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
		if (cpup != CPU) {
			if (cnt == 0) {
				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
				send_dirint(c, XC_HI_PIL);
				if (xc_collect_enable)
					++xc_total_cnt;
			} else if (xc_collect_enable) {
				++xc_multi_cnt;
			}
		}
	}

	/*
	 * Now drop into the message handler until all work is done
	 */
	(void) xc_serv(NULL, NULL);
	splx(save_spl);
}

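/*
 * Accounting sketch (illustrative only): an xc_sync() from CPU 0 to the set
 * {1, 2} posts two XC_MSG_SYNC messages. xc_common() raises CPU 0's
 * xc_work_cnt by 2 (one per DONE it expects back), sets its xc_wait_cnt to
 * 2, and raises each slave's xc_work_cnt by 1, sending an IPI only if that
 * count was previously 0. Absent other concurrent cross calls, the counts
 * return to their old values as the DONE messages are processed, at which
 * point xc_serv() on each CPU falls out of its work loop.
 */
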
/*
 * Push out a priority cross call.
 */
static void
xc_priority_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set)
{
	int i;
	int c;
	struct cpu *cpup;

	/*
	 * Wait briefly for any previous xc_priority to have finished.
	 */
	for (c = 0; c < ncpus; ++c) {
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * The value of 40000 here is from old kernel code. It
		 * really should be changed to some time based value, since
		 * under a hypervisor, there's no guarantee a remote CPU
		 * is even scheduled.
		 */
		for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
			SMT_PAUSE();

		/*
		 * Some CPU did not respond to a previous priority request.
		 * It's probably deadlocked with interrupts blocked or some
		 * such problem. We'll just erase the previous request - which
		 * was most likely a kmdb_enter that has already expired - and
		 * plow ahead.
		 */
		if (BT_TEST(xc_priority_set, c)) {
			XC_BT_CLEAR(xc_priority_set, c);
			if (cpup->cpu_m.xc_work_cnt > 0)
				xc_decrement(&cpup->cpu_m);
		}
	}

	/*
	 * fill in cross call data
	 */
	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
	 */
	for (c = 0; c < ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
		    cpup == CPU)
			continue;
		(void) xc_increment(&cpup->cpu_m);
		XC_BT_SET(xc_priority_set, c);
		send_dirint(c, XC_HI_PIL);
		for (i = 0; i < 10; ++i) {
			(void) casptr(&cpup->cpu_m.xc_msgbox,
			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
		}
	}
}

/*
 * Do cross call to all other CPUs with absolutely no waiting or handshaking.
 * This should only be used for extraordinary operations, like panic(), which
 * need to work, in some fashion, in a not completely functional system.
 * All other uses that want minimal waiting should use xc_call_nowait().
 */
void
xc_priority(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_spl = splr(ipltospl(XC_HI_PIL));
	int save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;

	IGNORE_KERNEL_PREEMPTION = 1;
	xc_priority_common((xc_func_t)func, arg1, arg2, arg3, set);
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
	splx(save_spl);
}

/*
 * Wrapper for kmdb to capture other CPUs, causing them to enter the debugger.
 */
void
kdi_xc_others(int this_cpu, void (*func)(void))
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_kernel_preemption;
	cpuset_t set;

	if (!xc_initialized)
		return;

	save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
	IGNORE_KERNEL_PREEMPTION = 1;
	CPUSET_ALL_BUT(set, this_cpu);
	xc_priority_common((xc_func_t)func, 0, 0, 0, CPUSET2BV(set));
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
}

/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call_nowait() may return immediately too.
 */
void
xc_call_nowait(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_ASYNC);
}

/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call() returns only after remotes have finished.
 */
void
xc_call(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_CALL);
}

/*
 * Invoke function on specified processors. Remotes wait until all have
 * finished. xc_sync() also waits until all remotes have finished.
 */
void
xc_sync(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_SYNC);
}