/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/x_call.h>
#include <sys/xc_levels.h>
#include <sys/cpu.h>
#include <sys/psw.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/mutex_impl.h>
#include <sys/stack.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>

/*
 * Implementation for cross-processor calls via interprocessor interrupts
 *
 * This implementation uses a message passing architecture to allow multiple
 * concurrent cross calls to be in flight at any given time. We use the
 * cmpxchg instruction, aka casptr(), to implement simple, efficient work
 * queues for message passing between CPUs with almost no need for regular
 * locking. See xc_extract() and xc_insert() below.
 *
 * The general idea is that initiating a cross call means putting a message
 * on each target CPU's work queue. Any synchronization is handled by passing
 * the message back and forth between initiator and target(s).
 *
 * Every CPU has xc_work_cnt, which indicates that it has messages to process.
 * This value is incremented as message traffic is initiated and decremented
 * with every message that finishes all processing.
 *
 * The code needs no mfence or other membar_*() calls. The uses of
 * casptr(), cas32() and atomic_dec_32() for the message passing are
 * implemented with LOCK prefix instructions, which are equivalent to mfence.
 *
 * One interesting aspect of this implementation is that it allows 2 or more
 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
 * The cross call processing by the CPUs will happen in any order with only
 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
 * from cross calls before all slaves have invoked the function.
 *
 * The reason for this asynchronous approach is to allow for fast global
 * TLB shootdowns. Suppose all N CPUs try to do a global TLB invalidation,
 * each on a different virtual address, at the same time. The old code
 * required N squared IPIs. With this method, depending on timing, it can
 * happen with as few as N IPIs.
 */
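/*
 * To make the message flow concrete, here is a rough trace of a single
 * xc_sync() between a hypothetical initiator (master) CPU A and one slave
 * CPU B. This is only a sketch of what xc_common() and xc_serv() below do:
 *
 *	A: msg = xc_extract(&A's xc_free); msg->xc_command = XC_MSG_SYNC
 *	A: xc_increment(A) and xc_increment(B); xc_insert() the msg into
 *	   B's xc_msgbox; send_dirint() to B if B had no pending work
 *	A: drops into xc_serv() to wait
 *	B: takes the IPI, runs the function, marks the msg XC_MSG_WAITING
 *	   and inserts it back into A's xc_msgbox
 *	A: once all expected WAITING msgs have arrived, marks each one
 *	   XC_MSG_RELEASED and returns it to its slave's xc_msgbox
 *	B: marks the msg XC_MSG_DONE, returns it to A, xc_decrement(B)
 *	A: marks the msg XC_MSG_FREE, puts it back on its xc_free list,
 *	   xc_decrement(A), and returns once its xc_work_cnt reaches zero
 */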
/*
 * The default is to not enable collecting counts of IPI information, since
 * the updating of shared cachelines could cause excess bus traffic.
 */
uint_t xc_collect_enable = 0;
uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
uint64_t xc_multi_cnt = 0;	/* # times we piggybacked on another IPI */

/*
 * Values for message states. Here are the normal transitions. A transition
 * of "->" happens in the slave cpu and "=>" happens in the master cpu as
 * the messages are passed back and forth.
 *
 * FREE => ASYNC -> DONE => FREE
 * FREE => CALL -> DONE => FREE
 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
 *
 * The interesting one above is ASYNC. You might ask, why not go directly
 * to FREE, instead of DONE? If it did that, it might be possible to exhaust
 * the master's xc_free list if a master can generate ASYNC messages faster
 * than the slave can process them. That could be handled with more
 * complicated handling. However, since nothing important uses ASYNC, I've
 * not bothered.
 */
#define	XC_MSG_FREE	(0)	/* msg in xc_free queue */
#define	XC_MSG_ASYNC	(1)	/* msg in slave xc_msgbox */
#define	XC_MSG_CALL	(2)	/* msg in slave xc_msgbox */
#define	XC_MSG_SYNC	(3)	/* msg in slave xc_msgbox */
#define	XC_MSG_WAITING	(4)	/* msg in master xc_msgbox or xc_waiters */
#define	XC_MSG_RELEASED	(5)	/* msg in slave xc_msgbox */
#define	XC_MSG_DONE	(6)	/* msg in master xc_msgbox */

/*
 * We allow for one high priority message at a time to happen in the system.
 * This is used for panic, kmdb, etc., so no locking is done.
 */
static volatile cpuset_t xc_priority_set_store;
static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
static xc_data_t xc_priority_data;

/*
 * Wrappers to avoid C compiler warnings due to volatile. The atomic bit
 * operations don't accept volatile bit vectors - which is a bit silly.
 */
#define	XC_BT_SET(vector, b)	BT_ATOMIC_SET((ulong_t *)(vector), (b))
#define	XC_BT_CLEAR(vector, b)	BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))

/*
 * Decrement a CPU's work count
 */
static void
xc_decrement(struct machcpu *mcpu)
{
	atomic_dec_32(&mcpu->xc_work_cnt);
}

/*
 * Increment a CPU's work count and return the old value
 */
static int
xc_increment(struct machcpu *mcpu)
{
	int old;

	do {
		old = mcpu->xc_work_cnt;
	} while (cas32((uint32_t *)&mcpu->xc_work_cnt, old, old + 1) != old);
	return (old);
}

/*
 * Put a message into a queue. The insertion is atomic no matter
 * how many different inserts/extracts to the same queue happen.
 */
static void
xc_insert(void *queue, xc_msg_t *msg)
{
	xc_msg_t *old_head;

	/*
	 * FREE messages should only ever be getting inserted into
	 * the xc_master CPU's xc_free queue.
	 */
	ASSERT(msg->xc_command != XC_MSG_FREE ||
	    cpu[msg->xc_master] == NULL || /* possible only during init */
	    queue == &cpu[msg->xc_master]->cpu_m.xc_free);

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		msg->xc_next = old_head;
	} while (casptr(queue, old_head, msg) != old_head);
}

/*
 * Extract a message from a queue. The extraction is atomic only
 * when just one thread does extractions from the queue.
 * If the queue is empty, NULL is returned.
 */
static xc_msg_t *
xc_extract(xc_msg_t **queue)
{
	xc_msg_t *old_head;

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		if (old_head == NULL)
			return (old_head);
	} while (casptr(queue, old_head, old_head->xc_next) != old_head);
	old_head->xc_next = NULL;
	return (old_head);
}


/*
 * Initialize the machcpu fields used for cross calls
 */
static uint_t xc_initialized = 0;

void
xc_init_cpu(struct cpu *cpup)
{
	xc_msg_t *msg;
	int c;

	/*
	 * Add a new msg to each existing CPU's free list, as well as one for
	 * my list for each of them. ncpus has an inconsistent value when this
	 * function is called, so use cpup->cpu_id.
	 */
	for (c = 0; c < cpup->cpu_id; ++c) {
		if (cpu[c] == NULL)
			continue;
		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
		msg->xc_command = XC_MSG_FREE;
		msg->xc_master = c;
		xc_insert(&cpu[c]->cpu_m.xc_free, msg);

		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
		msg->xc_command = XC_MSG_FREE;
		msg->xc_master = cpup->cpu_id;
		xc_insert(&cpup->cpu_m.xc_free, msg);
	}

	/*
	 * Add one for self messages
	 */
	msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
	msg->xc_command = XC_MSG_FREE;
	msg->xc_master = cpup->cpu_id;
	xc_insert(&cpup->cpu_m.xc_free, msg);

	if (!xc_initialized)
		xc_initialized = 1;
}

/*
 * X-call message processing routine. Note that this is used by both
 * senders and recipients of messages.
 *
 * We're protected against changing CPUs by either being in a high-priority
 * interrupt, having preemption disabled or by having a raised SPL.
 */
/*ARGSUSED*/
uint_t
xc_serv(caddr_t arg1, caddr_t arg2)
{
	struct machcpu *mcpup = &(CPU->cpu_m);
	xc_msg_t *msg;
	xc_data_t *data;
	xc_msg_t *xc_waiters = NULL;
	uint32_t num_waiting = 0;
	xc_func_t func;
	xc_arg_t a1;
	xc_arg_t a2;
	xc_arg_t a3;
	uint_t rc = DDI_INTR_UNCLAIMED;

	while (mcpup->xc_work_cnt != 0) {
		rc = DDI_INTR_CLAIMED;

		/*
		 * We may have to wait for a message to arrive.
		 */
		for (msg = NULL; msg == NULL;
		    msg = xc_extract(&mcpup->xc_msgbox)) {

			/*
			 * Always check for and handle a priority message.
			 */
			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
				func = xc_priority_data.xc_func;
				a1 = xc_priority_data.xc_a1;
				a2 = xc_priority_data.xc_a2;
				a3 = xc_priority_data.xc_a3;
				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
				xc_decrement(mcpup);
				func(a1, a2, a3);
				if (mcpup->xc_work_cnt == 0)
					return (rc);
			}

			/*
			 * wait for a message to arrive
			 */
			SMT_PAUSE();
		}


		/*
		 * process the message
		 */
		switch (msg->xc_command) {

		/*
		 * ASYNC gives back the message immediately, then we do the
		 * function and return with no more waiting.
		 */
		case XC_MSG_ASYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			func = data->xc_func;
			a1 = data->xc_a1;
			a2 = data->xc_a2;
			a3 = data->xc_a3;
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			if (func != NULL)
				(void) (*func)(a1, a2, a3);
			xc_decrement(mcpup);
			break;

		/*
		 * SYNC messages do the call, then send it back to the master
		 * in WAITING mode
		 */
		case XC_MSG_SYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			msg->xc_command = XC_MSG_WAITING;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			break;

		/*
		 * WAITING messages are collected by the master until all
		 * have arrived. Once all arrive, we release them back to
		 * the slaves
		 */
		case XC_MSG_WAITING:
			xc_insert(&xc_waiters, msg);
			if (++num_waiting < mcpup->xc_wait_cnt)
				break;
			while ((msg = xc_extract(&xc_waiters)) != NULL) {
				msg->xc_command = XC_MSG_RELEASED;
				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
				    msg);
				--num_waiting;
			}
			if (num_waiting != 0)
				panic("wrong number waiting");
			mcpup->xc_wait_cnt = 0;
			break;

		/*
		 * CALL messages do the function and then, like RELEASED,
		 * send the message back to the master as DONE.
		 */
		case XC_MSG_CALL:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			/*FALLTHROUGH*/
		case XC_MSG_RELEASED:
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			xc_decrement(mcpup);
			break;

		/*
		 * DONE means a slave has completely finished up.
		 * Once we collect all the DONE messages, we'll exit
		 * processing too.
		 */
		case XC_MSG_DONE:
			msg->xc_command = XC_MSG_FREE;
			xc_insert(&mcpup->xc_free, msg);
			xc_decrement(mcpup);
			break;

		case XC_MSG_FREE:
			panic("free message 0x%p in msgbox", (void *)msg);
			break;

		default:
			panic("bad message 0x%p in msgbox", (void *)msg);
			break;
		}
	}
	return (rc);
}

/*
 * Initiate cross call processing.
 */
static void
xc_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	uint_t command)
{
	int c;
	struct cpu *cpup;
	xc_msg_t *msg;
	xc_data_t *data;
	int cnt;
	int save_spl;

	if (!xc_initialized) {
		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
		    func != NULL)
			(void) (*func)(arg1, arg2, arg3);
		return;
	}

	save_spl = splr(ipltospl(XC_HI_PIL));

	/*
	 * fill in cross call data
	 */
	data = &CPU->cpu_m.xc_data;
	data->xc_func = func;
	data->xc_a1 = arg1;
	data->xc_a2 = arg2;
	data->xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 */
	CPU->cpu_m.xc_wait_cnt = 0;
	for (c = 0; c < ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * Fill out a new message.
		 */
		msg = xc_extract(&CPU->cpu_m.xc_free);
		if (msg == NULL)
			panic("Ran out of free xc_msg_t's");
		msg->xc_command = command;
		if (msg->xc_master != CPU->cpu_id)
			panic("msg %p has wrong xc_master", (void *)msg);
		msg->xc_slave = c;

		/*
		 * Increment my work count for all messages that I'll
		 * transition from DONE to FREE.
		 * Also remember how many XC_MSG_WAITINGs to look for
		 */
		(void) xc_increment(&CPU->cpu_m);
		if (command == XC_MSG_SYNC)
			++CPU->cpu_m.xc_wait_cnt;

		/*
		 * Increment the target CPU work count then insert the message
		 * in the target msgbox. If I post the first bit of work
		 * for the target to do, send an IPI to the target CPU.
		 */
		cnt = xc_increment(&cpup->cpu_m);
		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
		if (cpup != CPU) {
			if (cnt == 0) {
				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
				send_dirint(c, XC_HI_PIL);
				if (xc_collect_enable)
					++xc_total_cnt;
			} else if (xc_collect_enable) {
				++xc_multi_cnt;
			}
		}
	}

	/*
	 * Now drop into the message handler until all work is done
	 */
	(void) xc_serv(NULL, NULL);
	splx(save_spl);
}

/*
 * Push out a priority cross call.
 */
static void
xc_priority_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set)
{
	int i;
	int c;
	struct cpu *cpup;

	/*
	 * Wait briefly for any previous xc_priority to have finished.
	 */
	for (c = 0; c < ncpus; ++c) {
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * The value of 40000 here is from old kernel code. It
		 * really should be changed to some time based value, since
		 * under a hypervisor, there's no guarantee a remote CPU
		 * is even scheduled.
		 */
		for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
			SMT_PAUSE();

		/*
		 * Some CPU did not respond to a previous priority request. It's
		 * probably deadlocked with interrupts blocked or some such
		 * problem. We'll just erase the previous request - which was
		 * most likely a kmdb_enter that has already expired - and plow
		 * ahead.
		 */
		if (BT_TEST(xc_priority_set, c)) {
			XC_BT_CLEAR(xc_priority_set, c);
			if (cpup->cpu_m.xc_work_cnt > 0)
				xc_decrement(&cpup->cpu_m);
		}
	}

	/*
	 * fill in cross call data
	 */
	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
	 */
	for (c = 0; c < ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
		    cpup == CPU)
			continue;
		(void) xc_increment(&cpup->cpu_m);
		XC_BT_SET(xc_priority_set, c);
		send_dirint(c, XC_HI_PIL);
		for (i = 0; i < 10; ++i) {
			(void) casptr(&cpup->cpu_m.xc_msgbox,
			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
		}
	}
}

/*
 * Do cross call to all other CPUs with absolutely no waiting or handshaking.
 * This should only be used for extraordinary operations, like panic(), which
 * need to work, in some fashion, in a not completely functional system.
 * All other uses that want minimal waiting should use xc_call_nowait().
 */
void
xc_priority(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_spl = splr(ipltospl(XC_HI_PIL));
	int save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;

	IGNORE_KERNEL_PREEMPTION = 1;
	xc_priority_common((xc_func_t)func, arg1, arg2, arg3, set);
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
	splx(save_spl);
}

/*
 * Wrapper for kmdb to capture other CPUs, causing them to enter the debugger.
 */
void
kdi_xc_others(int this_cpu, void (*func)(void))
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_kernel_preemption;
	cpuset_t set;

	if (!xc_initialized)
		return;

	save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
	IGNORE_KERNEL_PREEMPTION = 1;
	CPUSET_ALL_BUT(set, this_cpu);
	xc_priority_common((xc_func_t)func, 0, 0, 0, CPUSET2BV(set));
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
}



/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call_nowait() may return immediately too.
 */
void
xc_call_nowait(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_ASYNC);
}

/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call() returns only after remotes have finished.
 */
void
xc_call(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_CALL);
}

/*
 * Invoke function on specified processors. Remotes wait until all have
 * finished. xc_sync() also waits until all remotes have finished.
 */
void
xc_sync(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_SYNC);
}
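
/*
 * For illustration only (the handler, its argument and its name below are
 * hypothetical, not part of this file): a caller that wants every other
 * CPU_READY CPU to run a handler, and that must not proceed until they have
 * all finished servicing it, might do something along these lines:
 *
 *	static int
 *	my_handler(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
 *	{
 *		... per-CPU work, e.g. invalidate a TLB entry ...
 *		return (0);
 *	}
 *
 *	cpuset_t set;
 *
 *	CPUSET_ALL_BUT(set, CPU->cpu_id);
 *	xc_call((xc_arg_t)some_arg, 0, 0, CPUSET2BV(set), my_handler);
 *
 * xc_sync() would additionally make the remote CPUs wait until everyone has
 * finished, while xc_call_nowait() gives no guarantee that the remotes have
 * finished (or even started) the handler by the time it returns.
 */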