/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, the squeue creation and
 * association with the CPU happens at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new CPU
 * is configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each connection
 * is always processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whichever CPU it happens to be
 * scheduled on. The squeue to CPU binding is only relevant for the worker
 * thread.
 *
 * The list of all created squeue sets is kept in the sqset_global_list array.
 * This list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way the hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a callback function with the CPU DR subsystem
 * using register_cpu_setup_func(). The callback function does two things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */
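
/*
 * Illustrative sketch (an addition for documentation purposes only, not part
 * of this file's implementation): a caller that needs an squeue for a new
 * connection typically goes through the IP_SQUEUE_GET() macro described
 * above, roughly as follows:
 *
 *	squeue_t *sqp;
 *
 *	sqp = IP_SQUEUE_GET(lbolt);
 *	squeue_enter(sqp, ...);
 *
 * With ip_squeue_fanout set, IP_SQUEUE_GET() performs the hint-based lookup
 * described in the INTERFACE section above; otherwise the squeue associated
 * with the current CPU is used. The squeue_enter() call is only sketched
 * here since its full argument list is defined in squeue.c, not in this file.
 */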

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_THREAD_PER_CPU squeues per CPU.
 * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_THREAD_PER_CPU	32
#define	MIN_THREAD_PER_CPU	1
uint_t ip_threads_per_cpu = MIN_THREAD_PER_CPU;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
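
/*
 * Note on layout (descriptive only): each squeue_set_t below is allocated
 * together with its sqs_list array in a single kmem_zalloc() of
 * sizeof (squeue_set_t) + sizeof (squeue_t *) * MAX_THREAD_PER_CPU bytes,
 * and sqs_list points just past the structure (&sqs[1]). The set and its
 * (at most MAX_THREAD_PER_CPU) squeue pointers therefore live in one
 * allocation; sqs_size tracks how many of the slots are in use.
 */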

/*
 * Create squeue set containing ip_threads_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
        int i;
        squeue_set_t    *sqs;
        squeue_t        *sqp;
        char sqname[64];
        processorid_t id = cp->cpu_id;

        if (reuse) {
                int i;

                /*
                 * We may already have an squeue created for this CPU. Try to
                 * find one and reuse it if possible.
                 */
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        if (id == sqs->sqs_bind)
                                return (sqs);
                }
        }

        sqs = kmem_zalloc(sizeof (squeue_set_t) +
            (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
        mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
        sqs->sqs_list = (squeue_t **)&sqs[1];
        sqs->sqs_max_size = MAX_THREAD_PER_CPU;
        sqs->sqs_bind = id;

        for (i = 0; i < ip_threads_per_cpu; i++) {
                bzero(sqname, sizeof (sqname));

                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
                    cp->cpu_id, i);

                sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
                    minclsyspri);

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);
        }

        if (ip_squeue_bind && cpu_is_online(cp))
                ip_squeue_set_bind(sqs);

        sqset_global_list[sqset_global_size++] = sqs;
        ASSERT(sqset_global_size <= NCPU);
        return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
        int i;

        ASSERT(sqset_global_list == NULL);

        if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
                ip_threads_per_cpu = MIN_THREAD_PER_CPU;
        else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
                ip_threads_per_cpu = MAX_THREAD_PER_CPU;

        ip_squeue_create_callback = callback;
        squeue_init();
        sqset_global_list =
            kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
        sqset_global_size = 0;
        mutex_enter(&cpu_lock);

        /* Create squeue for each active CPU available */
        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu[i];
                if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
                }
        }

        register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

        mutex_exit(&cpu_lock);

        if (ip_squeue_profile)
                squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
        squeue_set_t *sqs;

        sqs = sqset_global_list[index % sqset_global_size];
        return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
        squeue_t        *sqp = arg2;
        ill_rx_ring_t   *ring = sqp->sq_rx_ring;
        ill_t           *ill;

        ASSERT(sqp != NULL);

        if (ring == NULL) {
                return;
        }

        /*
         * Clean up squeue
         */
        mutex_enter(&sqp->sq_lock);
        sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
        sqp->sq_rx_ring = NULL;
        mutex_exit(&sqp->sq_lock);

        ill = ring->rr_ill;

        /*
         * Cleanup the ring
         */

        ring->rr_blank = NULL;
        ring->rr_handle = NULL;
        ring->rr_sqp = NULL;

        /*
         * Signal ill that cleanup is done
         */
        mutex_enter(&ill->ill_lock);
        ring->rr_ring_state = ILL_RING_FREE;
        cv_signal(&ill->ill_cv);
        mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
        ill_t           *ip_taskq_ill;
        ill_rx_ring_t   *ip_taskq_ill_rx_ring;
        cpu_t           *ip_taskq_cpu;
} ip_taskq_arg_t;
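
/*
 * Descriptive note: the ip_taskq_arg_t above is the hand-off from interrupt
 * context to the system taskq. In outline (this actually happens in
 * ip_squeue_get() below):
 *
 *	taskq_arg = kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
 *	taskq_arg->ip_taskq_ill = ill;
 *	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
 *	taskq_arg->ip_taskq_cpu = CPU;
 *	(void) taskq_dispatch(system_taskq, ip_squeue_extend, taskq_arg,
 *	    TQ_NOSLEEP);
 *
 * ip_squeue_extend() then runs in taskq context, where it can create and
 * bind a new squeue without the restrictions of interrupt context.
 */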

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we end up binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that case, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
        ip_taskq_arg_t  *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t           *ill = sq_arg->ip_taskq_ill;
        ill_rx_ring_t   *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t           *intr_cpu = sq_arg->ip_taskq_cpu;
        squeue_set_t    *sqs;
        squeue_t        *sqp = NULL;
        char            sqname[64];
        int             i;

        ASSERT(ill != NULL);
        ASSERT(ill_rx_ring != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        sqs = intr_cpu->cpu_squeue_set;

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time
         * and in order to ensure that test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);
        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                mutex_enter(&sqs->sqs_list[i]->sq_lock);
                if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
                        sqp = sqs->sqs_list[i];
                        break;
                }
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }

        if (sqp == NULL) {
                /* Need to create a new squeue */
                if (sqs->sqs_size == sqs->sqs_max_size) {
                        /*
                         * Reached the max limit of squeues we can
                         * allocate on this CPU. Leave ill_ring_state set
                         * to ILL_RING_INPROC so that ip_squeue_direct
                         * will just assign the default squeue for this
                         * ring for future connections.
                         */
#ifdef DEBUG
                        cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
                            " threads per CPU for sqp = %p\n", (void *)sqp);
#endif
                        mutex_exit(&sqs->sqs_lock);
                        mutex_exit(&ill->ill_lock);
                        ill_waiter_dcr(ill);
                        return;
                }

                bzero(sqname, sizeof (sqname));
                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
                    CPU->cpu_id, sqs->sqs_size);

                sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
                    minclsyspri);

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);

                if (ip_squeue_bind) {
                        squeue_bind(sqp, -1);
                }
                mutex_enter(&sqp->sq_lock);
        }

        ASSERT(sqp != NULL);

        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
        mutex_exit(&sqp->sq_lock);
        mutex_exit(&sqs->sqs_lock);

        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}
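
/*
 * Ring state summary (descriptive only): ip_squeue_get() marks a Rx ring
 * ILL_RING_INPROC before dispatching ip_squeue_extend() via the system
 * taskq; ip_squeue_extend() moves it to ILL_RING_INUSE once an squeue owns
 * it (or leaves it INPROC if the per-CPU squeue limit was reached);
 * ip_squeue_clean() finally sets ILL_RING_FREE and signals the ill.
 */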

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by an squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
        squeue_t        *sqp;
        ill_t           *ill;
        int             interrupt;
        ip_taskq_arg_t  *taskq_arg;
        boolean_t       refheld;

        if (ill_rx_ring == NULL)
                return (IP_SQUEUE_GET(lbolt));

        sqp = ill_rx_ring->rr_sqp;
        /*
         * Do a quick check. If it's not NULL, we are done.
         * Squeues are never destroyed so at worst we will bind
         * this connection to a suboptimal squeue.
         *
         * This is the fast path case.
         */
        if (sqp != NULL)
                return (sqp);

        ill = ill_rx_ring->rr_ill;
        ASSERT(ill != NULL);

        interrupt = servicing_interrupt();
        taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
            KM_NOSLEEP);

        mutex_enter(&ill->ill_lock);
        if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
            taskq_arg == NULL) {
                /*
                 * Do the ring to squeue binding only if we are in interrupt
                 * context and there is no one else trying the bind already.
                 */
                mutex_exit(&ill->ill_lock);
                if (taskq_arg != NULL)
                        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
                return (IP_SQUEUE_GET(lbolt));
        }

        /*
         * No sqp assigned yet. Can't really do that in interrupt
         * context. Assign the default sqp to this connection and
         * trigger creation of new sqp and binding it to this ring
         * via taskq. Need to make sure ill stays around.
         */
        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
        taskq_arg->ip_taskq_cpu = CPU;
        ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq, ip_squeue_extend,
                    taskq_arg, TQ_NOSLEEP) != NULL) {
                        return (IP_SQUEUE_GET(lbolt));
                }
        }
        /*
         * The ill is closing and we could not get a reference on the ill OR
         * taskq_dispatch failed probably due to memory allocation failure.
         * We will try again next time.
         */
        mutex_enter(&ill->ill_lock);
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        if (refheld)
                ill_waiter_dcr(ill);

        return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */
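
/*
 * Usage sketch (an illustration only, not exercised by this file): with the
 * standard ndd(1M) syntax the tunables behind these hooks would typically be
 * changed with, for example:
 *
 *	ndd -set /dev/ip ip_squeue_bind 0
 *	ndd -set /dev/ip ip_squeue_profile 2
 *
 * which ends up invoking ip_squeue_bind_set() and ip_squeue_profile_set()
 * below.
 */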

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
        int *bind_enabled = (int *)addr;
        long new_value;
        int i;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (ip_squeue_bind == new_value)
                return (0);

        *bind_enabled = new_value;
        mutex_enter(&cpu_lock);
        if (new_value == 0) {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_unbind(sqset_global_list[i]);
        } else {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_bind(sqset_global_list[i]);
        }

        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
        int *profile_enabled = (int *)cp;
        long new_value;
        squeue_set_t *sqs;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (new_value == 0)
                squeue_profile_stop();
        else if (new_value == 1)
                squeue_profile_start();
        else if (new_value == 2) {
                int i, j;

                squeue_profile_stop();
                mutex_enter(&cpu_lock);
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        for (j = 0; j < sqs->sqs_size; j++) {
                                squeue_profile_reset(sqs->sqs_list[j]);
                        }
                }
                mutex_exit(&cpu_lock);

                new_value = 1;
                squeue_profile_start();
        }
        *profile_enabled = new_value;

        return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        cpu_t *cp = cpu[id];

        ASSERT(MUTEX_HELD(&cpu_lock));
        switch (what) {
        case CPU_CONFIG:
                /*
                 * A new CPU is added. Create an squeue for it but do not bind
                 * it yet.
                 */
                if (cp->cpu_squeue_set == NULL)
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                break;
        case CPU_ON:
        case CPU_INIT:
        case CPU_CPUPART_IN:
                if (cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                }
                if (ip_squeue_bind)
                        ip_squeue_set_bind(cp->cpu_squeue_set);
                break;
        case CPU_UNCONFIG:
        case CPU_OFF:
        case CPU_CPUPART_OUT:
                ASSERT((cp->cpu_squeue_set != NULL) ||
                    (cp->cpu_flags & CPU_OFFLINE));

                if (cp->cpu_squeue_set != NULL) {
                        ip_squeue_set_unbind(cp->cpu_squeue_set);
                }
                break;
        default:
                break;
        }
        return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        if (!ip_squeue_bind)
                return;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];
                if (sqp->sq_state & SQS_BOUND)
                        continue;
                squeue_bind(sqp, -1);
        }
        mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];
                if (!(sqp->sq_state & SQS_BOUND))
                        continue;
                squeue_unbind(sqp);
        }
        mutex_exit(&sqs->sqs_lock);
}