/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, the squeue creation and
 * association with the CPU happens at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new CPU
 * is configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating a connection with squeues. The first
 * mode associates each connection with the CPU that creates the connection
 * (either during open time or during accept time). The second mode associates
 * each connection with a random CPU, effectively distributing load over all
 * CPUs and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an
 *	index in the array of IP squeues available. The way the hint is
 *	computed may affect the effectiveness of the squeue distribution.
 *	Currently squeues are assigned in round-robin fashion using lbolt
 *	as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *   unbound from the CPU. This allows the CPU unconfig code to move it to
 *   another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *   necessary and binds the squeue worker thread to this CPU.
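 *
 * For illustration, the CPU_ON case of the call-back (see
 * ip_squeue_cpu_setup() below) amounts to:
 *
 *	if (cp->cpu_squeue_set == NULL)
 *		cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
 *	if (ip_squeue_bind)
 *		ip_squeue_set_bind(cp->cpu_squeue_set);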
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create a squeue set containing ip_squeues_per_cpu squeues for this CPU
 * and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int		i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);
		ASSERT(sqp != NULL);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create an squeue set for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get the squeue_t structure based on the index.
 * Since the squeue list can only grow, no need to grab any lock.
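 *
 * For example (an illustrative sketch), with four squeue sets of one squeue
 * each, an index of 5 maps to sqset_global_list[1]->sqs_list[0].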
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
	ill_t		*ill;

	ASSERT(sqp != NULL);

	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up the squeue.
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Clean up the ring.
	 */
	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal the ill that cleanup is done.
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that case, since squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU, but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. "
		    "System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that case, since squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t		*bind_cpu;
	int		cpu_id = intr_cpu->cpu_id;
	int		min_cpu_id, max_cpu_id;
	boolean_t	enough_uniq_cpus = B_FALSE;
	boolean_t	enough_cpus = B_FALSE;
	squeue_set_t	*sqs, *last_sqs;
	squeue_t	*sqp = NULL;
	int		i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fan out the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING
	 * capability). It is still a big performance win if we can fan
	 * out to the threads on the same core that is taking interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and cpus are
	 * numbered sequentially for each core (XXX need something better
	 * than this in future), find the lowest number and highest
	 * number thread for that core.
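	 *
	 * For example (an illustrative sketch), with ip_threads_per_cpu set
	 * to 4 and the interrupt taken on CPU 6, min_cpu_id below works out
	 * to 4 and max_cpu_id to 8, i.e. the H/W threads of that core.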
	 *
	 * If we have one more thread per core than the number of soft
	 * rings, then don't assign any worker threads to the H/W thread
	 * (cpu) taking interrupts (capability negotiation tries to ensure
	 * this).
	 *
	 * If the number of threads per core is the same as the number of
	 * soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fan out to higher number CPUs starting from
	 * the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);

	/*
	 * Quickly check if there are enough CPUs present for fanout and
	 * also that max_cpu_id is within the range of CPUs that have
	 * squeue sets. We use the cpu_id stored in the last squeue_set
	 * to get an idea. The scheme is by no means perfect since it
	 * doesn't take into account CPU DR operations and the fact that
	 * the interrupts themselves might change. An ideal scenario
	 * would be to ensure that interrupts run on CPUs by themselves
	 * and worker threads never have affinity to those CPUs. If
	 * the interrupts move to a CPU which had a worker thread, the
	 * affinity should be changed. Probably callbacks similar to CPU
	 * offline are needed to make it work perfectly.
	 */
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check if the CPU actually exists and is active. If not,
		 * use the interrupted CPU. ip_find_unused_squeue() will
		 * find the right CPU to fan out anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU, but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. "
			    "System performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;

		}
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to the soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);

		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
		    i - j, sqp->sq_bind);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_SRC_HASH);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, size_t hdrlen)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	ASSERT(servicing_interrupt());
	ASSERT(ip_ring == NULL);

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL)
			goto out;

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
		 * the next interrupt to schedule another task for calling
		 * ip_squeue_soft_ring_affinity().
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill if taskq dispatch fails */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear the ILL_SOFT_RING_ASSIGN flag so that the affinity
	 * assignment can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, ip_ring, mp_chain, hdrlen);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing the NICs, try to find squeues on
	 * an unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
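		 * If one is, fall back below to the squeue set with the
		 * fewest squeues in the system and do the fanout there
		 * instead.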
		 */
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		if (i != sqs->sqs_size) {
			best_sqs = sqset_global_list[sqset_global_size - 1];
			min_sq = best_sqs->sqs_size;

			for (i = sqset_global_size - 2; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				if (curr_sqs->sqs_size < min_sq) {
					best_sqs = curr_sqs;
					min_sq = curr_sqs->sqs_size;
				}
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
			bind_cpu = cpu[sqs->sqs_bind];
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit of squeues we can
			 * allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);

		sqp = squeue_create(sqname, bind_cpu->cpu_id,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		mutex_enter(&cpu_lock);
		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
			squeue_bind(sqp, -1);
		}
		mutex_exit(&cpu_lock);

		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so at worst we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
	    taskq_arg == NULL) {
		/*
		 * Do the ring to squeue binding only if we are in interrupt
		 * context and there is no one else trying the bind already.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet.
	 * Can't really do that in interrupt context. Assign the
	 * default sqp to this connection and trigger creation of a
	 * new sqp and binding it to this ring via taskq. Need to make
	 * sure the ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed, probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
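		 * The worker threads are bound when the CPU comes online
		 * (see the CPU_ON case below), provided ip_squeue_bind is
		 * set.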
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}