/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. A pointer to the per-CPU
 * squeue set is saved in the cpu_squeue_set field of the cpu structure.
 * Each connection instance (conn_t) is associated with an squeue.
 *
 * For CPUs available at system startup time, squeue creation and the
 * association with the CPU happen at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new
 * CPU is configured in the system. The squeue is chosen using the
 * IP_SQUEUE_GET macro, which returns either the per-CPU squeue or a random
 * squeue based on the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first
 * mode associates each connection with the CPU that creates the connection
 * (either during open time or during accept time). The second mode
 * associates each connection with a random CPU, effectively distributing
 * load over all CPUs and all squeues in the system. The mode is controlled
 * by the ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each
 * connection is always processed on this CPU and on this CPU only. Any
 * thread calling squeue_enter() may process the connection on whatever CPU
 * it happens to be scheduled on. The squeue to CPU binding is only relevant
 * for the worker thread.
 *
 * The list of all created squeue sets is kept in sqset_global_list. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *    Find an squeue based on the 'hint' value. The hint is used as an
 *    index in the array of IP squeues available. The way the hint is
 *    computed may affect the effectiveness of the squeue distribution.
 *    Currently squeues are assigned in round-robin fashion using lbolt
 *    as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *    unbound from the CPU. This allows the CPU unconfig code to move it to
 *    another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *    necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *    associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *    should be compiled with SQUEUE_PROFILE enabled for this variable to
 *    have an impact.
 *
 * ip_squeue_fanout: if set to 1 use a randomly chosen squeue (see
 *    ip_squeue_random()), otherwise get it from CPU->cpu_squeue_set.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *    created. This is the time the squeue code waits before waking up the
 *    worker thread after queuing a request.
 */
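
/*
 * For illustration, a caller that needs an squeue for a new connection is
 * expected to do roughly the following. The exact expansion of the
 * IP_SQUEUE_GET() macro lives in the IP header files; the member names used
 * here are inferred from the code below and may differ slightly.
 *
 *	squeue_t *sqp;
 *
 *	if (ip_squeue_fanout)
 *		sqp = ip_squeue_random(lbolt);
 *	else
 *		sqp = CPU->cpu_squeue_set->sqs_list[0];
 */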

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU but want to preserve a
 * 1 <-> 1 mapping between an squeue and a NIC (or Rx ring) for performance
 * reasons, so that each squeue can uniquely own a NIC or a Rx ring and do
 * polling (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues
 * per CPU. We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more
 * squeues can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t **sqset_global_list;
uint_t sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time the squeue code waits before waking up the
 * worker thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
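
/*
 * The squeue_set_t structure itself is defined in <sys/squeue_impl.h>. The
 * fields this file relies on are, roughly: sqs_lock (protects the set),
 * sqs_list (array of squeue pointers, sized at sqs_max_size), sqs_size
 * (number of squeues currently in the set) and sqs_bind (the id of the CPU
 * the set is associated with).
 */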

/*
 * Create an squeue set containing ip_squeues_per_cpu squeues for this CPU
 * and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
        int i;
        squeue_set_t *sqs;
        squeue_t *sqp;
        char sqname[64];
        processorid_t id = cp->cpu_id;

        if (reuse) {
                int i;

                /*
                 * We may already have an squeue created for this CPU. Try to
                 * find one and reuse it if possible.
                 */
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        if (id == sqs->sqs_bind)
                                return (sqs);
                }
        }

        sqs = kmem_zalloc(sizeof (squeue_set_t) +
            (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
        mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
        sqs->sqs_list = (squeue_t **)&sqs[1];
        sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
        sqs->sqs_bind = id;

        for (i = 0; i < ip_squeues_per_cpu; i++) {
                bzero(sqname, sizeof (sqname));

                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
                    cp->cpu_id, i);

                sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
                    minclsyspri);
                ASSERT(sqp != NULL);

                /*
                 * The first squeue in each squeue_set is the DEFAULT
                 * squeue.
                 */
                sqp->sq_state |= SQS_DEFAULT;

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);
        }

        if (ip_squeue_bind && cpu_is_online(cp))
                ip_squeue_set_bind(sqs);

        sqset_global_list[sqset_global_size++] = sqs;
        ASSERT(sqset_global_size <= NCPU);
        return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
        int i;

        ASSERT(sqset_global_list == NULL);

        if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
        else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

        ip_squeue_create_callback = callback;
        squeue_init();
        sqset_global_list =
            kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
        sqset_global_size = 0;
        mutex_enter(&cpu_lock);

        /* Create an squeue set for each active CPU available */
        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu[i];

                if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
                }
        }

        register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

        mutex_exit(&cpu_lock);

        if (ip_squeue_profile)
                squeue_profile_start();
}

/*
 * Get an squeue_t structure based on index.
 * Since the squeue list can only grow, there is no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
        squeue_set_t *sqs;

        sqs = sqset_global_list[index % sqset_global_size];
        return (sqs->sqs_list[index % sqs->sqs_size]);
}
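
/*
 * For example, with sqset_global_size == 4 and two squeues in each set, an
 * index of 5 selects set 5 % 4 == 1 and squeue 5 % 2 == 1 within that set;
 * callers feed a monotonically increasing hint (lbolt) to spread connections
 * round-robin across sets and squeues.
 */

/*
 * ip_squeue_clean() runs with the squeue passed as arg2 (the squeue
 * framework's callback convention): it dissociates the squeue from the Rx
 * ring it has been managing, tells the driver to drop any soft ring binding,
 * marks the ring free and wakes up anyone waiting on ill_cv for the ring
 * state to change.
 */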

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
        squeue_t *sqp = arg2;
        ill_rx_ring_t *ring = sqp->sq_rx_ring;
        ill_t *ill;

        ASSERT(sqp != NULL);

        if (ring == NULL) {
                return;
        }

        /*
         * Clean up the squeue.
         */
        mutex_enter(&sqp->sq_lock);
        sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
        sqp->sq_rx_ring = NULL;
        mutex_exit(&sqp->sq_lock);

        ill = ring->rr_ill;
        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                ASSERT(ring->rr_handle != NULL);
                ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
        }

        /*
         * Clean up the ring.
         */
        ring->rr_blank = NULL;
        ring->rr_handle = NULL;
        ring->rr_sqp = NULL;

        /*
         * Signal the ill that the cleanup is done.
         */
        mutex_enter(&ill->ill_lock);
        ring->rr_ring_state = ILL_RING_FREE;
        cv_signal(&ill->ill_cv);
        mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
        ill_t           *ip_taskq_ill;
        ill_rx_ring_t   *ip_taskq_ill_rx_ring;
        cpu_t           *ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we end up binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that: squeues are never destroyed once
 * created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
        ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t *ill = sq_arg->ip_taskq_ill;
        ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
        squeue_set_t *sqs;
        squeue_t *sqp = NULL;

        ASSERT(ill != NULL);
        ASSERT(ill_rx_ring != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu))
                intr_cpu = CPU;

        sqs = intr_cpu->cpu_squeue_set;

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time,
         * and in order to ensure that the test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);
        sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
        if (sqp == NULL) {
                /*
                 * We hit the max limit of squeues allowed per CPU.
                 * Assign this rx_ring to the DEFAULT squeue of the
                 * interrupted CPU, but the squeue will not manage
                 * the ring. Also print a warning.
                 */
                cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
                    "has max number of squeues. System performance might "
                    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

                /* the first squeue in the list is the default squeue */
                sqp = sqs->sqs_list[0];
                ASSERT(sqp != NULL);
                ill_rx_ring->rr_sqp = sqp;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

                mutex_exit(&ill->ill_lock);
                ill_waiter_dcr(ill);
                return;
        }

        ASSERT(MUTEX_HELD(&sqp->sq_lock));
        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
        mutex_exit(&sqp->sq_lock);

        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we end up binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that: squeues are never destroyed once
 * created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
        ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t *ill = sq_arg->ip_taskq_ill;
        ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
        ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
        cpu_t *bind_cpu;
        int cpu_id = intr_cpu->cpu_id;
        int min_cpu_id, max_cpu_id;
        boolean_t enough_uniq_cpus = B_FALSE;
        boolean_t enough_cpus = B_FALSE;
        squeue_set_t *sqs, *last_sqs;
        squeue_t *sqp = NULL;
        int i, j;

        ASSERT(ill != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu)) {
                intr_cpu = CPU;
                cpu_id = intr_cpu->cpu_id;
        }

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time,
         * and in order to ensure that the test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);

        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                mutex_exit(&ill->ill_lock);
                return;
        }

        /*
         * We need to fan out the interrupt load from the NIC. We do that by
         * telling the driver underneath to create soft rings and use worker
         * threads (if the driver advertised the SOFT_RING capability). It is
         * still a big performance win if we can fan out to the threads on
         * the same core that is taking the interrupts.
         *
         * Since we don't know the interrupt-to-CPU binding, we don't
         * assign any squeues or affinity to worker threads in the NIC.
         * At the time of the first interrupt, we know which CPU is
         * taking interrupts and try to find other threads on the same
         * core. Assuming ip_threads_per_cpu is correct and CPUs are
         * numbered sequentially for each core (XXX need something better
         * than this in the future), find the lowest numbered and highest
         * numbered thread for that core.
         *
         * If we have one more thread per core than the number of soft
         * rings, then don't assign any worker threads to the H/W thread
         * (CPU) taking interrupts (capability negotiation tries to ensure
         * this).
         *
         * If the number of threads per core is the same as the number of
         * soft rings, then assign the worker affinity and squeue to
         * the same CPU.
         *
         * Otherwise, just fan out to higher numbered CPUs starting from
         * the interrupted CPU.
         */
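        /*
         * For example, with ip_threads_per_cpu == 4 and the interrupt taken
         * on cpu_id 6, min_cpu_id works out to 4 and max_cpu_id to 8, i.e.
         * the four hardware threads 4..7 that (by the numbering assumption
         * above) share the interrupted core.
         */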

        min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
        max_cpu_id = min_cpu_id + ip_threads_per_cpu;

        cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
            min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);

        /*
         * Quickly check if there are enough CPUs present for fanout
         * and that max_cpu_id does not exceed the id of the highest
         * CPU for which an squeue set exists. We use the cpu_id stored
         * in the last squeue_set to get an idea. The scheme is by no
         * means perfect since it doesn't take into account CPU DR
         * operations and the fact that the interrupt binding itself
         * might change. An ideal scenario would be to ensure that
         * interrupts run on CPUs by themselves and worker threads never
         * have affinity to those CPUs. If an interrupt moves to a CPU
         * which had a worker thread, the affinity should be changed.
         * Probably callbacks similar to CPU offline are needed to make
         * it work perfectly.
         */
        last_sqs = sqset_global_list[sqset_global_size - 1];
        if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
                if ((max_cpu_id - min_cpu_id) >
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_uniq_cpus = B_TRUE;
                else if ((max_cpu_id - min_cpu_id) >=
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_cpus = B_TRUE;
        }

        j = 0;
        for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
                if (enough_uniq_cpus) {
                        if ((min_cpu_id + i) == cpu_id) {
                                j++;
                                continue;
                        }
                        bind_cpu = cpu[min_cpu_id + i];
                } else if (enough_cpus) {
                        bind_cpu = cpu[min_cpu_id + i];
                } else {
                        /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
                        bind_cpu = cpu[(cpu_id + i) % ncpus];
                }

                /*
                 * Check that the CPU actually exists and is active. If not,
                 * use the interrupted CPU. ip_find_unused_squeue() will
                 * find the right CPU to fan out to anyway.
                 */
                if (!CPU_ISON(bind_cpu))
                        bind_cpu = intr_cpu;

                sqs = bind_cpu->cpu_squeue_set;
                ASSERT(sqs != NULL);
                ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

                sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
                if (sqp == NULL) {
                        /*
                         * We hit the max limit of squeues allowed per CPU.
                         * Assign this rx_ring to the DEFAULT squeue of the
                         * interrupted CPU, but the squeue will not manage
                         * the ring. Also print a warning.
                         */
                        cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
                            "%d/%p already has max number of squeues. System "
                            "performance might become suboptimal\n",
                            sqs->sqs_bind, (void *)sqs);

                        /* the first squeue in the list is the default squeue */
                        sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
                        ASSERT(sqp != NULL);

                        ill_rx_ring->rr_sqp = sqp;
                        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                        continue;
                }

                ASSERT(MUTEX_HELD(&sqp->sq_lock));
                ill_rx_ring->rr_sqp = sqp;
                sqp->sq_rx_ring = ill_rx_ring;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                sqp->sq_state |= SQS_ILL_BOUND;

                /* assign affinity to the soft ring */
                if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
                        ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
                            sqp->sq_bind);
                }
                mutex_exit(&sqp->sq_lock);

                cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
                    i - j, sqp->sq_bind);
        }
        mutex_exit(&ill->ill_lock);

        ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
            SOFT_RING_SRC_HASH);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}
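
/*
 * Entry point for soft ring assignment: called at interrupt time with an
 * inbound packet chain (note the servicing_interrupt() ASSERT below). On the
 * first call it sets ILL_SOFT_RING_ASSIGN and dispatches
 * ip_squeue_soft_ring_affinity() via the system taskq to do the actual
 * fanout; in all cases the packet chain is then handed to ip_input().
 */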

void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, size_t hdrlen)
{
        ip_taskq_arg_t *taskq_arg;
        boolean_t refheld;

        ASSERT(servicing_interrupt());
        ASSERT(ip_ring == NULL);

        mutex_enter(&ill->ill_lock);
        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                taskq_arg = (ip_taskq_arg_t *)
                    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

                if (taskq_arg == NULL) {
                        mutex_exit(&ill->ill_lock);
                        goto out;
                }

                taskq_arg->ip_taskq_ill = ill;
                taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
                taskq_arg->ip_taskq_cpu = CPU;

                /*
                 * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
                 * the next interrupt to schedule another task for calling
                 * ip_squeue_soft_ring_affinity().
                 */
                ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
        } else {
                mutex_exit(&ill->ill_lock);
                goto out;
        }
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq,
                    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
                        goto out;

                /* release ref on ill if taskq dispatch fails */
                ill_waiter_dcr(ill);
        }
        /*
         * Clear ILL_SOFT_RING_ASSIGN so that the affinity assignment
         * can be tried again later.
         */
        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
        ip_input(ill, ip_ring, mp_chain, hdrlen);
}
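
/*
 * Find an squeue in the given squeue set that is not already managing an Rx
 * ring. If 'fanout' is set and some squeue in the set is already managing a
 * NIC, switch to the least populated squeue set in the system so that the
 * load spreads across CPUs. Within the chosen set, pick a squeue that is
 * neither the DEFAULT squeue nor already bound to a ring; if none exists,
 * create a new one, provided the set has not reached sqs_max_size, otherwise
 * return NULL. On success the squeue is returned with its sq_lock held.
 */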

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
        int i;
        squeue_set_t *best_sqs = NULL;
        squeue_set_t *curr_sqs = NULL;
        int min_sq = 0;
        squeue_t *sqp = NULL;
        char sqname[64];

        /*
         * If fanout is set and the passed squeue_set already has some
         * squeues which are managing the NICs, try to find squeues on
         * an unused CPU.
         */
        if (sqs->sqs_size > 1 && fanout) {
                /*
                 * First check to see if any squeue on the passed CPU
                 * is managing a NIC.
                 */
                for (i = 0; i < sqs->sqs_size; i++) {
                        mutex_enter(&sqs->sqs_list[i]->sq_lock);
                        if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
                            !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
                                mutex_exit(&sqs->sqs_list[i]->sq_lock);
                                break;
                        }
                        mutex_exit(&sqs->sqs_list[i]->sq_lock);
                }
                if (i != sqs->sqs_size) {
                        best_sqs = sqset_global_list[sqset_global_size - 1];
                        min_sq = best_sqs->sqs_size;

                        for (i = sqset_global_size - 2; i >= 0; i--) {
                                curr_sqs = sqset_global_list[i];
                                if (curr_sqs->sqs_size < min_sq) {
                                        best_sqs = curr_sqs;
                                        min_sq = curr_sqs->sqs_size;
                                }
                        }

                        ASSERT(best_sqs != NULL);
                        sqs = best_sqs;
                        bind_cpu = cpu[sqs->sqs_bind];
                }
        }

        mutex_enter(&sqs->sqs_lock);

        for (i = 0; i < sqs->sqs_size; i++) {
                mutex_enter(&sqs->sqs_list[i]->sq_lock);
                if ((sqs->sqs_list[i]->sq_state &
                    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
                        sqp = sqs->sqs_list[i];
                        break;
                }
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }

        if (sqp == NULL) {
                /* Need to create a new squeue */
                if (sqs->sqs_size == sqs->sqs_max_size) {
                        /*
                         * Reached the max number of squeues we can
                         * allocate on this CPU.
                         */
                        mutex_exit(&sqs->sqs_lock);
                        return (NULL);
                }

                bzero(sqname, sizeof (sqname));
                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
                    bind_cpu->cpu_id, sqs->sqs_size);

                sqp = squeue_create(sqname, bind_cpu->cpu_id,
                    ip_squeue_worker_wait, minclsyspri);

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);

                mutex_enter(&cpu_lock);
                if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
                        squeue_bind(sqp, -1);
                }
                mutex_exit(&cpu_lock);

                mutex_enter(&sqp->sq_lock);
        }

        mutex_exit(&sqs->sqs_lock);
        ASSERT(sqp != NULL);
        return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by an squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land, and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
        squeue_t *sqp;
        ill_t *ill;
        int interrupt;
        ip_taskq_arg_t *taskq_arg;
        boolean_t refheld;

        if (ill_rx_ring == NULL)
                return (IP_SQUEUE_GET(lbolt));

        sqp = ill_rx_ring->rr_sqp;
        /*
         * Do a quick check. If it's not NULL, we are done.
         * Squeues are never destroyed, so at worst we will bind
         * this connection to a suboptimal squeue.
         *
         * This is the fast path case.
         */
        if (sqp != NULL)
                return (sqp);

        ill = ill_rx_ring->rr_ill;
        ASSERT(ill != NULL);

        interrupt = servicing_interrupt();
        taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
            KM_NOSLEEP);

        mutex_enter(&ill->ill_lock);
        if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
            taskq_arg == NULL) {
                /*
                 * Do the ring to squeue binding only if we are in interrupt
                 * context and no one else is already trying the bind.
                 */
                mutex_exit(&ill->ill_lock);
                if (taskq_arg != NULL)
                        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
                return (IP_SQUEUE_GET(lbolt));
        }

        /*
         * No sqp assigned yet. We can't really do the binding in interrupt
         * context, so return a generic squeue for this connection for now
         * and trigger creation of a new sqp and its binding to this ring
         * via the taskq. Need to make sure the ill stays around.
         */
        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
        taskq_arg->ip_taskq_cpu = CPU;
        ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq, ip_squeue_extend,
                    taskq_arg, TQ_NOSLEEP) != NULL) {
                        return (IP_SQUEUE_GET(lbolt));
                }
        }
        /*
         * Either the ill is closing and we could not get a reference on it,
         * or taskq_dispatch failed, probably due to a memory allocation
         * failure. We will try again next time.
         */
        mutex_enter(&ill->ill_lock);
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        if (refheld)
                ill_waiter_dcr(ill);

        return (IP_SQUEUE_GET(lbolt));
}
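
/*
 * A rough sketch of how the receive path is expected to use the function
 * above (the exact entry point and argument list are defined by the squeue
 * code in squeue.c, so this is illustrative only):
 *
 *	sqp = ip_squeue_get(ill_rx_ring);
 *	squeue_enter(sqp, mp, ...);
 *
 * The first call made from interrupt context for a ring kicks off the
 * taskq-driven binding (ip_squeue_extend()); until that completes, callers
 * simply get a generic squeue from IP_SQUEUE_GET(lbolt).
 */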

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
        int *bind_enabled = (int *)addr;
        long new_value;
        int i;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (ip_squeue_bind == new_value)
                return (0);

        *bind_enabled = new_value;
        mutex_enter(&cpu_lock);
        if (new_value == 0) {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_unbind(sqset_global_list[i]);
        } else {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_bind(sqset_global_list[i]);
        }

        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
        int *profile_enabled = (int *)cp;
        long new_value;
        squeue_set_t *sqs;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (new_value == 0)
                squeue_profile_stop();
        else if (new_value == 1)
                squeue_profile_start();
        else if (new_value == 2) {
                int i, j;

                squeue_profile_stop();
                mutex_enter(&cpu_lock);
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        for (j = 0; j < sqs->sqs_size; j++) {
                                squeue_profile_reset(sqs->sqs_list[j]);
                        }
                }
                mutex_exit(&cpu_lock);

                new_value = 1;
                squeue_profile_start();
        }
        *profile_enabled = new_value;

        return (0);
}
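
/*
 * As noted at the top of the file, these handlers sit behind ndd parameters
 * on /dev/ip (or /dev/tcp). Assuming the parameters keep the variable names,
 * profiling would be reset and re-enabled from the shell with something
 * like:
 *
 *	# ndd -set /dev/ip ip_squeue_profile 2
 */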

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        cpu_t *cp = cpu[id];

        ASSERT(MUTEX_HELD(&cpu_lock));
        switch (what) {
        case CPU_CONFIG:
                /*
                 * A new CPU is added. Create an squeue for it but do not bind
                 * it yet.
                 */
                if (cp->cpu_squeue_set == NULL)
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                break;
        case CPU_ON:
        case CPU_INIT:
        case CPU_CPUPART_IN:
                if (cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                }
                if (ip_squeue_bind)
                        ip_squeue_set_bind(cp->cpu_squeue_set);
                break;
        case CPU_UNCONFIG:
        case CPU_OFF:
        case CPU_CPUPART_OUT:
                ASSERT((cp->cpu_squeue_set != NULL) ||
                    (cp->cpu_flags & CPU_OFFLINE));

                if (cp->cpu_squeue_set != NULL) {
                        ip_squeue_set_unbind(cp->cpu_squeue_set);
                }
                break;
        default:
                break;
        }
        return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        if (!ip_squeue_bind)
                return;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];
                if (sqp->sq_state & SQS_BOUND)
                        continue;
                squeue_bind(sqp, -1);
        }
        mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];

                /*
                 * CPU is going offline. Remove the thread affinity
                 * for any soft ring threads the squeue is managing.
                 */
                if (sqp->sq_state & SQS_ILL_BOUND) {
                        ill_rx_ring_t *ring = sqp->sq_rx_ring;
                        ill_t *ill = ring->rr_ill;

                        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                                ASSERT(ring->rr_handle != NULL);
                                ill->ill_dls_capab->ill_dls_unbind(
                                    ring->rr_handle);
                        }
                }
                if (!(sqp->sq_state & SQS_BOUND))
                        continue;
                squeue_unbind(sqp);
        }
        mutex_exit(&sqs->sqs_lock);
}