/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, squeue creation and association
 * with the CPU happen at MP initialization time. For CPUs added during
 * dynamic reconfiguration, the initialization happens when the new CPU is
 * configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each connection
 * is always processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way the hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1, each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1, squeue profiling is enabled. NOTE: squeue.c
 *	must be compiled with SQUEUE_PROFILE enabled for this variable to have
 *	an impact.
 *
 * ip_squeue_fanout: if set to 1, use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time the squeue code waits before waking up the
 *	worker thread after queuing a request.
 */
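
/*
 * Illustrative sketch (not part of the original code): the connection to
 * squeue association described above roughly amounts to the following,
 * assuming IP_SQUEUE_GET() is defined along these lines in the IP headers
 * (the exact definition lives outside this file):
 *
 *	squeue_t *
 *	conn_pick_squeue(void)			   hypothetical helper
 *	{
 *		if (ip_squeue_fanout)
 *			return (ip_squeue_random(lbolt));   spread the load
 *		else
 *			return (CPU->cpu_squeue);	    stick to this CPU
 *	}
 *
 * conn_pick_squeue() is a made-up name used only to illustrate the two
 * modes; real callers simply use IP_SQUEUE_GET(lbolt).
 */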

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU, but want to preserve a
 * 1 <-> 1 mapping between squeue and NIC (or Rx ring) for performance
 * reasons, so that each squeue can uniquely own a NIC or an Rx ring and do
 * polling (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues
 * per CPU. We start by creating MIN_SQUEUES_PER_CPU squeues per CPU, but
 * more squeues can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t	ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time the squeue code waits before waking up the
 * worker thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
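
/*
 * Illustrative note (not in the original source): ip_squeue_set_create()
 * below carves the per-set squeue pointer array out of the same allocation
 * as the squeue_set_t itself, so the memory layout of one set is
 *
 *	+--------------+---------------------------------------------+
 *	| squeue_set_t | squeue_t *sqs_list[MAX_SQUEUES_PER_CPU]     |
 *	+--------------+---------------------------------------------+
 *
 * i.e. roughly:
 *
 *	sqs = kmem_zalloc(sizeof (squeue_set_t) +
 *	    sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU, KM_SLEEP);
 *	sqs->sqs_list = (squeue_t **)&sqs[1];
 *
 * which keeps a set and its (bounded) list in one allocation and avoids a
 * second kmem_zalloc().
 */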

/*
 * Create an squeue set containing ip_squeues_per_cpu squeues for this CPU
 * and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
    int i;
    squeue_set_t *sqs;
    squeue_t *sqp;
    char sqname[64];
    processorid_t id = cp->cpu_id;

    if (reuse) {
        int i;

        /*
         * We may already have an squeue created for this CPU. Try to
         * find one and reuse it if possible.
         */
        for (i = 0; i < sqset_global_size; i++) {
            sqs = sqset_global_list[i];
            if (id == sqs->sqs_bind)
                return (sqs);
        }
    }

    sqs = kmem_zalloc(sizeof (squeue_set_t) +
        (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
    mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
    sqs->sqs_list = (squeue_t **)&sqs[1];
    sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
    sqs->sqs_bind = id;

    for (i = 0; i < ip_squeues_per_cpu; i++) {
        bzero(sqname, sizeof (sqname));

        (void) snprintf(sqname, sizeof (sqname),
            "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
            cp->cpu_id, i);

        sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
            minclsyspri);
        ASSERT(sqp != NULL);

        /*
         * The first squeue in each squeue_set is the DEFAULT
         * squeue.
         */
        sqp->sq_state |= SQS_DEFAULT;

        squeue_profile_enable(sqp);
        sqs->sqs_list[sqs->sqs_size++] = sqp;

        if (ip_squeue_create_callback != NULL)
            ip_squeue_create_callback(sqp);
    }

    if (ip_squeue_bind && cpu_is_online(cp))
        ip_squeue_set_bind(sqs);

    sqset_global_list[sqset_global_size++] = sqs;
    ASSERT(sqset_global_size <= NCPU);
    return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
    int i;

    ASSERT(sqset_global_list == NULL);

    if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
        ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
    else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
        ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

    ip_squeue_create_callback = callback;
    squeue_init();
    sqset_global_list =
        kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
    sqset_global_size = 0;
    mutex_enter(&cpu_lock);

    /* Create an squeue set for each active CPU available */
    for (i = 0; i < NCPU; i++) {
        cpu_t *cp = cpu[i];
        if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
        }
    }

    register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

    mutex_exit(&cpu_lock);

    if (ip_squeue_profile)
        squeue_profile_start();
}
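
/*
 * Illustrative sketch (not in the original source): the 'callback' passed
 * to ip_squeue_init() is invoked once for every squeue this file creates,
 * both here at init time and when squeues are added later (DR, Rx ring
 * fanout). A hypothetical client could use it to attach per-squeue state:
 *
 *	static void
 *	my_squeue_create_hook(squeue_t *sqp)
 *	{
 *		allocate and hang private per-squeue data off sqp here
 *	}
 *
 *	ip_squeue_init(my_squeue_create_hook);
 *
 * my_squeue_create_hook is a made-up name used only for illustration.
 */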

/*
 * Get the squeue_t structure based on index.
 * Since the squeue list can only grow, there is no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
    squeue_set_t *sqs;

    sqs = sqset_global_list[index % sqset_global_size];
    return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
    squeue_t *sqp = arg2;
    ill_rx_ring_t *ring = sqp->sq_rx_ring;
    ill_t *ill;

    ASSERT(sqp != NULL);

    if (ring == NULL) {
        return;
    }

    /*
     * Clean up the squeue.
     */
    mutex_enter(&sqp->sq_lock);
    sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
    sqp->sq_rx_ring = NULL;
    mutex_exit(&sqp->sq_lock);

    ill = ring->rr_ill;
    if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
        ASSERT(ring->rr_handle != NULL);
        ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
    }

    /*
     * Clean up the ring.
     */

    ring->rr_blank = NULL;
    ring->rr_handle = NULL;
    ring->rr_sqp = NULL;

    /*
     * Signal the ill that cleanup is done.
     */
    mutex_enter(&ill->ill_lock);
    ring->rr_ring_state = ILL_RING_FREE;
    cv_signal(&ill->ill_cv);
    mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
    ill_t		*ip_taskq_ill;
    ill_rx_ring_t	*ip_taskq_ill_rx_ring;
    cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do an Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding an Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that case because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
    ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
    ill_t		*ill = sq_arg->ip_taskq_ill;
    ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
    squeue_set_t	*sqs;
    squeue_t	*sqp = NULL;

    ASSERT(ill != NULL);
    ASSERT(ill_rx_ring != NULL);
    kmem_free(arg, sizeof (ip_taskq_arg_t));

    /*
     * Make sure the CPU that originally took the interrupt still
     * exists.
     */
    if (!CPU_ISON(intr_cpu))
        intr_cpu = CPU;

    sqs = intr_cpu->cpu_squeue_set;

    /*
     * If this ill represents link aggregation, then there might be
     * multiple NICs trying to register themselves at the same time,
     * and in order to ensure that test and assignment of free rings
     * is sequential, we need to hold the ill_lock.
     */
    mutex_enter(&ill->ill_lock);
    sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
    if (sqp == NULL) {
        /*
         * We hit the max limit of squeues allowed per CPU.
         * Assign this rx_ring to the DEFAULT squeue of the
         * interrupted CPU, but the squeue will not manage
         * the ring. Also print a warning.
         */
        cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
            "has max number of squeues. System performance might "
            "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

        /* the first squeue in the list is the default squeue */
        sqp = sqs->sqs_list[0];
        ASSERT(sqp != NULL);
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        mutex_exit(&ill->ill_lock);
        ill_waiter_dcr(ill);
        return;
    }

    ASSERT(MUTEX_HELD(&sqp->sq_lock));
    sqp->sq_rx_ring = ill_rx_ring;
    ill_rx_ring->rr_sqp = sqp;
    ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

    sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
    mutex_exit(&sqp->sq_lock);

    mutex_exit(&ill->ill_lock);

    /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    ill_waiter_dcr(ill);
}
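
/*
 * Illustrative sketch (not in the original source): ip_squeue_extend()
 * above and ip_squeue_soft_ring_affinity() below both rely on the lock
 * order stated in their comments, ill_lock -> sqs_lock -> sq_lock. The
 * shape of a successful binding is therefore:
 *
 *	mutex_enter(&ill->ill_lock);
 *	sqp = ip_find_unused_squeue(sqs, cpu, fanout);	returns with
 *							sqp->sq_lock held
 *	sqp->sq_rx_ring = ill_rx_ring;
 *	ill_rx_ring->rr_sqp = sqp;
 *	sqp->sq_state |= SQS_ILL_BOUND;
 *	mutex_exit(&sqp->sq_lock);
 *	mutex_exit(&ill->ill_lock);
 *
 * ip_find_unused_squeue() acquires and drops sqs_lock internally, so the
 * callers only ever see ill_lock and sq_lock directly.
 */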

/*
 * Do an Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding an Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that case because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
    ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
    ill_t		*ill = sq_arg->ip_taskq_ill;
    ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
    ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
    cpu_t		*bind_cpu;
    int		cpu_id = intr_cpu->cpu_id;
    int		min_cpu_id, max_cpu_id;
    boolean_t	enough_uniq_cpus = B_FALSE;
    boolean_t	enough_cpus = B_FALSE;
    squeue_set_t	*sqs, *last_sqs;
    squeue_t	*sqp = NULL;
    int		i, j;

    ASSERT(ill != NULL);
    kmem_free(arg, sizeof (ip_taskq_arg_t));

    /*
     * Make sure the CPU that originally took the interrupt still
     * exists.
     */
    if (!CPU_ISON(intr_cpu)) {
        intr_cpu = CPU;
        cpu_id = intr_cpu->cpu_id;
    }

    /*
     * If this ill represents link aggregation, then there might be
     * multiple NICs trying to register themselves at the same time,
     * and in order to ensure that test and assignment of free rings
     * is sequential, we need to hold the ill_lock.
     */
    mutex_enter(&ill->ill_lock);

    if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
        mutex_exit(&ill->ill_lock);
        return;
    }
    /*
     * We need to fan out the interrupts from the NIC. We do that by
     * telling the driver underneath to create soft rings and use
     * worker threads (if the driver advertised the SOFT_RING
     * capability). It is still a big performance win if we can fan
     * out to the threads on the same core that is taking interrupts.
     *
     * Since we don't know the interrupt to CPU binding, we don't
     * assign any squeues or affinity to worker threads in the NIC.
     * At the time of the first interrupt, we know which CPU is
     * taking interrupts and try to find other threads on the same
     * core. Assuming ip_threads_per_cpu is correct and CPUs are
     * numbered sequentially for each core (XXX need something better
     * than this in future), find the lowest numbered and highest
     * numbered thread for that core.
     *
     * If we have one more thread per core than the number of soft
     * rings, then don't assign any worker threads to the H/W thread
     * (cpu) taking interrupts (capability negotiation tries to
     * ensure this).
     *
     * If the number of threads per core is the same as the number of
     * soft rings, then assign the worker affinity and squeue to
     * the same cpu.
     *
     * Otherwise, just fan out to higher numbered CPUs starting from
     * the interrupted CPU.
     */

    min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
    max_cpu_id = min_cpu_id + ip_threads_per_cpu;

    /*
     * Quickly check that there are enough CPUs present for fanout
     * and that max_cpu_id does not exceed the id of the highest
     * CPU that has an squeue set. We use the cpu_id stored in the
     * last squeue_set to get an idea. The scheme is by no means
     * perfect since it doesn't take into account CPU DR operations
     * and the fact that interrupts themselves might change. An ideal
     * scenario would be to ensure that interrupts run on CPUs by
     * themselves and worker threads never have affinity to those
     * CPUs. If the interrupts move to a CPU which has a worker
     * thread, it should be changed. Probably callbacks similar to
     * CPU offline are needed to make it work perfectly.
     */
    last_sqs = sqset_global_list[sqset_global_size - 1];
    if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
        if ((max_cpu_id - min_cpu_id) >
            ill_soft_ring->ill_dls_soft_ring_cnt)
            enough_uniq_cpus = B_TRUE;
        else if ((max_cpu_id - min_cpu_id) >=
            ill_soft_ring->ill_dls_soft_ring_cnt)
            enough_cpus = B_TRUE;
    }

    j = 0;
    for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
        if (enough_uniq_cpus) {
            if ((min_cpu_id + i) == cpu_id) {
                j++;
                continue;
            }
            bind_cpu = cpu[min_cpu_id + i];
        } else if (enough_cpus) {
            bind_cpu = cpu[min_cpu_id + i];
        } else {
            /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
            bind_cpu = cpu[(cpu_id + i) % ncpus];
        }

        /*
         * Check that the CPU actually exists and is active. If not,
         * use the interrupted CPU. ip_find_unused_squeue() will
         * find the right CPU to fanout anyway.
         */
        if (!CPU_ISON(bind_cpu))
            bind_cpu = intr_cpu;

        sqs = bind_cpu->cpu_squeue_set;
        ASSERT(sqs != NULL);
        ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

        sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
        if (sqp == NULL) {
            /*
             * We hit the max limit of squeues allowed per CPU.
             * Assign this rx_ring to the DEFAULT squeue of the
             * interrupted CPU, but the squeue will not manage
             * the ring. Also print a warning.
             */
            cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
                "%d/%p already has max number of squeues. System "
                "performance might become suboptimal\n",
                sqs->sqs_bind, (void *)sqs);

            /* the first squeue in the list is the default squeue */
            sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
            ASSERT(sqp != NULL);

            ill_rx_ring->rr_sqp = sqp;
            ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
            continue;
        }

        ASSERT(MUTEX_HELD(&sqp->sq_lock));
        ill_rx_ring->rr_sqp = sqp;
        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        sqp->sq_state |= SQS_ILL_BOUND;

        /* assign affinity to the soft ring */
        if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
            ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
                sqp->sq_bind);
        }
        mutex_exit(&sqp->sq_lock);
    }
    mutex_exit(&ill->ill_lock);

    ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
        SOFT_RING_SRC_HASH);

    mutex_enter(&ill->ill_lock);
    ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    mutex_exit(&ill->ill_lock);

    /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    ill_waiter_dcr(ill);
}
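
/*
 * Illustrative worked example (not in the original source, numbers made
 * up): with ip_threads_per_cpu == 4 and an interrupt landing on
 * cpu_id == 6, the computation in ip_squeue_soft_ring_affinity() above
 * yields
 *
 *	min_cpu_id = (6 / 4) * 4 = 4
 *	max_cpu_id = 4 + 4       = 8
 *
 * so the candidate CPUs for soft ring workers are 4..7. With
 * ill_dls_soft_ring_cnt == 2, (max_cpu_id - min_cpu_id) == 4 > 2, hence
 * enough_uniq_cpus is set and the interrupted CPU (6) is skipped when
 * assigning workers. With a soft ring count of 4 only enough_cpus would
 * be set and CPU 6 would be used as well; with more rings than threads
 * per core, neither flag is set and the assignment simply wraps around
 * the higher numbered CPUs starting from the interrupted one.
 */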

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, size_t hdrlen)
{
    ip_taskq_arg_t	*taskq_arg;
    boolean_t	refheld;

    ASSERT(servicing_interrupt());

    mutex_enter(&ill->ill_lock);
    if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
        taskq_arg = (ip_taskq_arg_t *)
            kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

        if (taskq_arg == NULL)
            goto out;

        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = NULL;
        taskq_arg->ip_taskq_cpu = CPU;

        /*
         * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
         * the next interrupt to schedule another task for
         * calling ip_squeue_soft_ring_affinity().
         */
        ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
    } else {
        mutex_exit(&ill->ill_lock);
        goto out;
    }
    mutex_exit(&ill->ill_lock);
    refheld = ill_waiter_inc(ill);
    if (refheld) {
        if (taskq_dispatch(system_taskq,
            ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
            goto out;

        /* release the ref on the ill if taskq dispatch fails */
        ill_waiter_dcr(ill);
    }
    /*
     * Clear ILL_SOFT_RING_ASSIGN so that the affinity assignment
     * can be tried again later.
     */
    mutex_enter(&ill->ill_lock);
    ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    mutex_exit(&ill->ill_lock);
    kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
    ip_input(ill, NULL, mp_chain, hdrlen);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
    int		i;
    squeue_set_t	*best_sqs = NULL;
    squeue_set_t	*curr_sqs = NULL;
    int		min_sq = 0;
    squeue_t	*sqp = NULL;
    char		sqname[64];

    /*
     * If fanout is set and the passed squeue_set already has some
     * squeues which are managing the NICs, try to find squeues on
     * an unused CPU.
     */
    if (sqs->sqs_size > 1 && fanout) {
        /*
         * First check to see if any squeue on the CPU passed
         * is managing a NIC.
         */
        for (i = 0; i < sqs->sqs_size; i++) {
            mutex_enter(&sqs->sqs_list[i]->sq_lock);
            if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
                !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
                break;
            }
            mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }
        if (i != sqs->sqs_size) {
            best_sqs = sqset_global_list[sqset_global_size - 1];
            min_sq = best_sqs->sqs_size;

            for (i = sqset_global_size - 2; i >= 0; i--) {
                curr_sqs = sqset_global_list[i];
                if (curr_sqs->sqs_size < min_sq) {
                    best_sqs = curr_sqs;
                    min_sq = curr_sqs->sqs_size;
                }
            }

            ASSERT(best_sqs != NULL);
            sqs = best_sqs;
            bind_cpu = cpu[sqs->sqs_bind];
        }
    }

    mutex_enter(&sqs->sqs_lock);

    for (i = 0; i < sqs->sqs_size; i++) {
        mutex_enter(&sqs->sqs_list[i]->sq_lock);
        if ((sqs->sqs_list[i]->sq_state &
            (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
            sqp = sqs->sqs_list[i];
            break;
        }
        mutex_exit(&sqs->sqs_list[i]->sq_lock);
    }

    if (sqp == NULL) {
        /* Need to create a new squeue */
        if (sqs->sqs_size == sqs->sqs_max_size) {
            /*
             * Reached the max limit for squeues
             * we can allocate on this CPU.
             */
            mutex_exit(&sqs->sqs_lock);
            return (NULL);
        }

        bzero(sqname, sizeof (sqname));
        (void) snprintf(sqname, sizeof (sqname),
            "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
            bind_cpu->cpu_id, sqs->sqs_size);

        sqp = squeue_create(sqname, bind_cpu->cpu_id,
            ip_squeue_worker_wait, minclsyspri);

        ASSERT(sqp != NULL);

        squeue_profile_enable(sqp);
        sqs->sqs_list[sqs->sqs_size++] = sqp;

        if (ip_squeue_create_callback != NULL)
            ip_squeue_create_callback(sqp);

        mutex_enter(&cpu_lock);
        if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
            squeue_bind(sqp, -1);
        }
        mutex_exit(&cpu_lock);

        mutex_enter(&sqp->sq_lock);
    }

    mutex_exit(&sqs->sqs_lock);
    ASSERT(sqp != NULL);
    return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by an squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
    squeue_t	*sqp;
    ill_t		*ill;
    int		interrupt;
    ip_taskq_arg_t	*taskq_arg;
    boolean_t	refheld;

    if (ill_rx_ring == NULL)
        return (IP_SQUEUE_GET(lbolt));

    sqp = ill_rx_ring->rr_sqp;
    /*
     * Do a quick check. If it's not NULL, we are done.
     * Squeues are never destroyed, so at worst we will bind
     * this connection to a suboptimal squeue.
     *
     * This is the fast path case.
     */
    if (sqp != NULL)
        return (sqp);

    ill = ill_rx_ring->rr_ill;
    ASSERT(ill != NULL);

    interrupt = servicing_interrupt();
    taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
        KM_NOSLEEP);

    mutex_enter(&ill->ill_lock);
    if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
        taskq_arg == NULL) {
        /*
         * Do the ring to squeue binding only if we are in interrupt
         * context and there is no one else trying the bind already.
         */
        mutex_exit(&ill->ill_lock);
        if (taskq_arg != NULL)
            kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        return (IP_SQUEUE_GET(lbolt));
    }

    /*
     * No sqp assigned yet. Can't really do that in interrupt
     * context. Assign the default sqp to this connection and
     * trigger creation of a new sqp and binding it to this ring
     * via the taskq. Need to make sure the ill stays around.
     */
    taskq_arg->ip_taskq_ill = ill;
    taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
    taskq_arg->ip_taskq_cpu = CPU;
    ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
    mutex_exit(&ill->ill_lock);
    refheld = ill_waiter_inc(ill);
    if (refheld) {
        if (taskq_dispatch(system_taskq, ip_squeue_extend,
            taskq_arg, TQ_NOSLEEP) != NULL) {
            return (IP_SQUEUE_GET(lbolt));
        }
    }
    /*
     * The ill is closing and we could not get a reference on the ill OR
     * taskq_dispatch failed, probably due to memory allocation failure.
     * We will try again next time.
     */
    mutex_enter(&ill->ill_lock);
    ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    mutex_exit(&ill->ill_lock);
    kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
    if (refheld)
        ill_waiter_dcr(ill);

    return (IP_SQUEUE_GET(lbolt));
}
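
/*
 * Illustrative sketch (not in the original source): ip_squeue_get() above
 * and ip_soft_ring_assignment() earlier both defer the actual binding work
 * to the system taskq and follow the same "keep the ill around" pattern:
 *
 *	refheld = ill_waiter_inc(ill);		take a hold on the ill
 *	if (refheld &&
 *	    taskq_dispatch(system_taskq, func, arg, TQ_NOSLEEP) != NULL)
 *		return;				the taskq now owns the hold
 *	if (refheld)
 *		ill_waiter_dcr(ill);		dispatch failed, drop hold
 *
 * The dispatched function (ip_squeue_extend() or
 * ip_squeue_soft_ring_affinity()) is then responsible for the final
 * ill_waiter_dcr(), which also wakes up anyone waiting on the ring state.
 */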

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
    int *bind_enabled = (int *)addr;
    long new_value;
    int i;

    if (ddi_strtol(value, NULL, 10, &new_value) != 0)
        return (EINVAL);

    if (ip_squeue_bind == new_value)
        return (0);

    *bind_enabled = new_value;
    mutex_enter(&cpu_lock);
    if (new_value == 0) {
        for (i = 0; i < sqset_global_size; i++)
            ip_squeue_set_unbind(sqset_global_list[i]);
    } else {
        for (i = 0; i < sqset_global_size; i++)
            ip_squeue_set_bind(sqset_global_list[i]);
    }

    mutex_exit(&cpu_lock);
    return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
    int *profile_enabled = (int *)cp;
    long new_value;
    squeue_set_t *sqs;

    if (ddi_strtol(value, NULL, 10, &new_value) != 0)
        return (EINVAL);

    if (new_value == 0)
        squeue_profile_stop();
    else if (new_value == 1)
        squeue_profile_start();
    else if (new_value == 2) {
        int i, j;

        squeue_profile_stop();
        mutex_enter(&cpu_lock);
        for (i = 0; i < sqset_global_size; i++) {
            sqs = sqset_global_list[i];
            for (j = 0; j < sqs->sqs_size; j++) {
                squeue_profile_reset(sqs->sqs_list[j]);
            }
        }
        mutex_exit(&cpu_lock);

        new_value = 1;
        squeue_profile_start();
    }
    *profile_enabled = new_value;

    return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
    cpu_t *cp = cpu[id];

    ASSERT(MUTEX_HELD(&cpu_lock));
    switch (what) {
    case CPU_CONFIG:
        /*
         * A new CPU is added. Create an squeue for it but do not bind
         * it yet.
         */
        if (cp->cpu_squeue_set == NULL)
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
        break;
    case CPU_ON:
    case CPU_INIT:
    case CPU_CPUPART_IN:
        if (cp->cpu_squeue_set == NULL) {
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
        }
        if (ip_squeue_bind)
            ip_squeue_set_bind(cp->cpu_squeue_set);
        break;
    case CPU_UNCONFIG:
    case CPU_OFF:
    case CPU_CPUPART_OUT:
        ASSERT((cp->cpu_squeue_set != NULL) ||
            (cp->cpu_flags & CPU_OFFLINE));

        if (cp->cpu_squeue_set != NULL) {
            ip_squeue_set_unbind(cp->cpu_squeue_set);
        }
        break;
    default:
        break;
    }
    return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
    int i;
    squeue_t *sqp;

    if (!ip_squeue_bind)
        return;

    mutex_enter(&sqs->sqs_lock);
    for (i = 0; i < sqs->sqs_size; i++) {
        sqp = sqs->sqs_list[i];
        if (sqp->sq_state & SQS_BOUND)
            continue;
        squeue_bind(sqp, -1);
    }
    mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
    int i;
    squeue_t *sqp;

    mutex_enter(&sqs->sqs_lock);
    for (i = 0; i < sqs->sqs_size; i++) {
        sqp = sqs->sqs_list[i];

        /*
         * CPU is going offline. Remove the thread affinity
         * for any soft ring threads the squeue is managing.
         */
        if (sqp->sq_state & SQS_ILL_BOUND) {
            ill_rx_ring_t	*ring = sqp->sq_rx_ring;
            ill_t		*ill = ring->rr_ill;

            if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                ASSERT(ring->rr_handle != NULL);
                ill->ill_dls_capab->ill_dls_unbind(
                    ring->rr_handle);
            }
        }
        if (!(sqp->sq_state & SQS_BOUND))
            continue;
        squeue_unbind(sqp);
    }
    mutex_exit(&sqs->sqs_lock);
}