/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, squeue creation and association
 * with the CPU happens at MP initialization time. For CPUs added during
 * dynamic reconfiguration, the initialization happens when the new CPU is
 * configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and an
 * squeue, and between each squeue and a CPU, does not mean that each
 * connection is always processed on this CPU and on this CPU only. Any thread
 * calling squeue_enter() may process the connection on whatever CPU it is
 * scheduled on. The squeue to CPU binding is only relevant for the worker
 * thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way the hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as the hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
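 *
 * A minimal sketch of the registration that ip_squeue_init() performs (see
 * the function below; shown here purely for illustration):
 *
 *	mutex_enter(&cpu_lock);
 *	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
 *	mutex_exit(&cpu_lock);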
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time the squeue code waits before waking up the
 *	worker thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>

/*
 * We allow multiple NICs to bind to the same CPU but want to preserve a
 * 1 <-> 1 mapping between squeue and NIC (or Rx ring) for performance
 * reasons, so that each squeue can uniquely own a NIC or a Rx ring and do
 * polling (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues
 * per CPU. We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more
 * squeues can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t	ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time the squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create an squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int		i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);
		ASSERT(sqp != NULL);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create squeues for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get the squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
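 *
 * For example (illustrative numbers): with sqset_global_size == 4 and each
 * squeue_set holding two squeues, index 7 selects set 7 % 4 == 3 and then
 * squeue 7 % 2 == 1 within that set.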
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
	ill_t		*ill;

	ASSERT(sqp != NULL);

	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up the squeue
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Clean up the ring
	 */

	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal the ill that cleanup is done
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that because squeues are never destroyed
 * once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU, but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that because squeues are never destroyed
 * once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t			*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t		*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t			*bind_cpu;
	int			cpu_id = intr_cpu->cpu_id;
	int			min_cpu_id, max_cpu_id;
	boolean_t		enough_uniq_cpus = B_FALSE;
	boolean_t		enough_cpus = B_FALSE;
	squeue_set_t		*sqs, *last_sqs;
	squeue_t		*sqp = NULL;
	int			i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fan out the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING capability).
	 * It is still a big performance win if we can fan out to the
	 * threads on the same core that is taking interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and cpus are
	 * numbered sequentially for each core (XXX need something better
	 * than this in future), find the lowest number and highest
	 * number thread for that core.
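	 *
	 * For example (illustrative numbers only): with ip_threads_per_cpu
	 * == 4 and the interrupted cpu_id == 6, min_cpu_id = (6 / 4) * 4 = 4
	 * and max_cpu_id = 4 + 4 = 8, i.e. the candidate threads for this
	 * core are CPUs 4 through 7.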
	 *
	 * If we have one more thread per core than the number of soft rings,
	 * then don't assign any worker threads to the H/W thread (cpu)
	 * taking interrupts (capability negotiation tries to ensure this).
	 *
	 * If the number of threads per core is the same as the number of
	 * soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fan out to higher number CPUs starting from
	 * the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	/*
	 * Quickly check if there are enough CPUs present for fanout
	 * and also that max_cpu_id does not exceed the id of the last
	 * CPU that has a squeue set. We use the cpu_id stored in the
	 * last squeue_set to get an idea. The scheme is by no means
	 * perfect since it doesn't take into account CPU DR operations
	 * and the fact that interrupts themselves might change. An ideal
	 * scenario would be to ensure that interrupts run on CPUs by
	 * themselves and worker threads never have affinity to those
	 * CPUs. If an interrupt moves to a CPU that has a worker thread,
	 * the affinity should be changed. Probably callbacks similar to
	 * CPU offline are needed to make it work perfectly.
	 */
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check if the CPU actually exists and is active. If not,
		 * use the interrupted CPU. ip_find_unused_squeue() will
		 * find the right CPU to fan out to anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU, but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. System "
			    "performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;
		}

		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to the soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_FANOUT);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	ASSERT(servicing_interrupt());

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL)
			goto out;

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = NULL;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
		 * interrupt does not schedule another task for calling
		 * ip_squeue_soft_ring_affinity().
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill if taskq dispatch fails */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear ILL_SOFT_RING_ASSIGN so that affinity assignment
	 * can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, NULL, mp_chain, mhip);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing the NICs, try to find squeues on
	 * an unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
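		 * If one is (i.e. the loop below breaks out early), fall
		 * back to scanning the global list and pick the squeue_set
		 * with the fewest squeues, so that the ring lands on the
		 * least loaded CPU.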
		 */
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		if (i != sqs->sqs_size) {
			best_sqs = sqset_global_list[sqset_global_size - 1];
			min_sq = best_sqs->sqs_size;

			for (i = sqset_global_size - 2; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				if (curr_sqs->sqs_size < min_sq) {
					best_sqs = curr_sqs;
					min_sq = curr_sqs->sqs_size;
				}
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
			bind_cpu = cpu[sqs->sqs_bind];
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit for squeues
			 * we can allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);

		sqp = squeue_create(sqname, bind_cpu->cpu_id,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		mutex_enter(&cpu_lock);
		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
			squeue_bind(sqp, -1);
		}
		mutex_exit(&cpu_lock);

		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by an squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so at worst we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	/*
	 * Check sqp under the lock again for atomicity. Possible race with
	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
	 * Do the ring to squeue binding only if we are in interrupt context
	 * AND the ring is not already bound AND there is no one else trying
	 * the bind already.
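	 *
	 * (Roughly, the ring state moves ILL_RING_INUSE -> ILL_RING_INPROC
	 * while the taskq request below is pending, back to ILL_RING_INUSE
	 * if the dispatch fails, and stays at ILL_RING_INUSE once
	 * ip_squeue_extend() has bound the ring to an squeue.)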
	 */
	sqp = ill_rx_ring->rr_sqp;
	if (sqp != NULL || !interrupt ||
	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
		/*
		 * Note that the ring might get bound once we drop the lock
		 * below, if a previous request is in progress, i.e. if the
		 * ring state is ILL_RING_INPROC. The incoming connection on
		 * whose behalf we are currently here might get a suboptimal
		 * squeue via the call to IP_SQUEUE_GET below, but there is
		 * no correctness issue.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		if (sqp != NULL)
			return (sqp);
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of a new sqp and binding it to this ring
	 * via taskq. Need to make sure the ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill, OR
	 * taskq_dispatch failed, probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tunables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Set squeue profiling.
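 * The value can be changed at runtime with ndd on /dev/ip or /dev/tcp,
 * e.g. "ndd -set /dev/ip ip_squeue_profile 2" (shown for illustration
 * only). The accepted values are: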
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * The CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}