/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, the squeue creation and
 * association with a CPU happen at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new CPU
 * is configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whichever CPU it happens to be
 * scheduled on. The squeue to CPU binding is only relevant for the worker
 * thread.
 *
 * The list of all created squeue sets is kept in sqset_global_list. This list
 * is used when ip_squeue_fanout is set and the load is distributed across all
 * squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *      Find an squeue based on the 'hint' value. The hint is used as an index
 *      in the array of IP squeues available. The way hint is computed may
 *      affect the effectiveness of the squeue distribution. Currently squeues
 *      are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *      unbound from the CPU. This allows the CPU unconfig code to move it to
 *      another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *      necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *      associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *      should be compiled with SQUEUE_PROFILE enabled for this variable to
 *      have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *      otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *      created. This is the time squeue code waits before waking up the
 *      worker thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>

/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define MAX_SQUEUES_PER_CPU     32
#define MIN_SQUEUES_PER_CPU     1
uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define IP_NUM_SOFT_RINGS       2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t    **sqset_global_list;
uint_t          sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
static void ip_squeue_clean(void *, mblk_t *, void *);
static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);

#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
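
/*
 * Illustrative sketch of the squeue selection policy described in the block
 * comment at the top of this file. The authoritative IP_SQUEUE_GET()
 * definition lives in the IP header files, not here, and the function name
 * below is hypothetical; this only shows the intended effect of the
 * ip_squeue_fanout tunable: fanout spreads connections over all squeue sets
 * via ip_squeue_random(), while the default picks the current CPU's own
 * (default) squeue.
 *
 *      static squeue_t *
 *      example_pick_squeue(uint_t hint)
 *      {
 *              if (ip_squeue_fanout)
 *                      return (ip_squeue_random(hint));
 *              return (CPU->cpu_squeue_set->sqs_list[0]);
 *      }
 */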

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
        int             i;
        squeue_set_t    *sqs;
        squeue_t        *sqp;
        char            sqname[64];
        processorid_t   id = cp->cpu_id;

        if (reuse) {
                /*
                 * We may already have an squeue created for this CPU. Try to
                 * find one and reuse it if possible.
                 */
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        if (id == sqs->sqs_bind)
                                return (sqs);
                }
        }

        sqs = kmem_zalloc(sizeof (squeue_set_t) +
            (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
        mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
        sqs->sqs_list = (squeue_t **)&sqs[1];
        sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
        sqs->sqs_bind = id;

        for (i = 0; i < ip_squeues_per_cpu; i++) {
                bzero(sqname, sizeof (sqname));

                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
                    cp->cpu_id, i);

                sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
                    minclsyspri);
                ASSERT(sqp != NULL);

                /*
                 * The first squeue in each squeue_set is the DEFAULT
                 * squeue.
                 */
                sqp->sq_state |= SQS_DEFAULT;

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);
        }

        if (ip_squeue_bind && cpu_is_online(cp))
                ip_squeue_set_bind(sqs);

        sqset_global_list[sqset_global_size++] = sqs;
        ASSERT(sqset_global_size <= NCPU);
        return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
        int i;

        ASSERT(sqset_global_list == NULL);

        if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
        else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

        ip_squeue_create_callback = callback;
        squeue_init();
        sqset_global_list =
            kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
        sqset_global_size = 0;
        mutex_enter(&cpu_lock);

        /* Create squeue for each active CPU available */
        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu[i];
                if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
                }
        }

        register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

        mutex_exit(&cpu_lock);

        if (ip_squeue_profile)
                squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
        squeue_set_t *sqs;

        sqs = sqset_global_list[index % sqset_global_size];
        return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
static void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
        squeue_t        *sqp = arg2;
        ill_rx_ring_t   *ring;
        ill_t           *ill;

        ASSERT(sqp != NULL);

        ring = sqp->sq_rx_ring;
        if (ring == NULL) {
                return;
        }

        /*
         * Clean up squeue
         */
        mutex_enter(&sqp->sq_lock);
        sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
        sqp->sq_rx_ring = NULL;
        mutex_exit(&sqp->sq_lock);

        ill = ring->rr_ill;
        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                ASSERT(ring->rr_handle != NULL);
                ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
        }

        /*
         * Cleanup the ring
         */
        ring->rr_blank = NULL;
        ring->rr_handle = NULL;
        ring->rr_sqp = NULL;

        /*
         * Signal ill that cleanup is done
         */
        mutex_enter(&ill->ill_lock);
        ring->rr_ring_state = ILL_RING_FREE;
        cv_signal(&ill->ill_cv);
        mutex_exit(&ill->ill_lock);
}
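
/*
 * Rx ring state transitions, summarized from the code in this file (the
 * states themselves are defined in the IP headers):
 *
 *      ILL_RING_FREE           ring not associated with any squeue
 *      ILL_RING_INPROC         ip_squeue_get() has dispatched a taskq
 *                              request to do the squeue assignment
 *      ILL_RING_INUSE          ring is assigned to a squeue
 *      ILL_RING_BEING_FREED    ip_squeue_clean_ring() has started cleanup;
 *                              ip_squeue_clean() moves the ring back to
 *                              ILL_RING_FREE and signals ill_cv
 */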

/*
 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via the ip_squeue_clean function
 * but we need to protect ourselves from 2 threads trying to clean up at the
 * same time (possible with one port going down for aggr and someone tearing
 * down the entire aggr simultaneously). So we use ill_inuse_ref protected by
 * ill_lock to indicate when the cleanup has started (1 ref) and when the
 * cleanup is done (0 ref). When a new ring gets assigned to the squeue, we
 * start by putting 2 refs on ill_inuse_ref.
 */
static void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
        conn_t          *connp;
        squeue_t        *sqp;
        mblk_t          *mp;

        ASSERT(rx_ring != NULL);

        /* Just clean one squeue */
        mutex_enter(&ill->ill_lock);
        /*
         * Reset the ILL_SOFT_RING_ASSIGN bit so that
         * ip_squeue_soft_ring_affinity() will not go
         * ahead with assigning rings.
         */
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        while (rx_ring->rr_ring_state == ILL_RING_INPROC)
                /* Some operations pending on the ring. Wait */
                cv_wait(&ill->ill_cv, &ill->ill_lock);

        if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
                /*
                 * Someone is already trying to clean
                 * this squeue or it's already been cleaned.
                 */
                mutex_exit(&ill->ill_lock);
                return;
        }
        sqp = rx_ring->rr_sqp;

        if (sqp == NULL) {
                /*
                 * The rx_ring never had a squeue assigned to it.
                 * We are under ill_lock so we can clean it up
                 * right here since no one can get to it.
                 */
                rx_ring->rr_blank = NULL;
                rx_ring->rr_handle = NULL;
                rx_ring->rr_sqp = NULL;
                rx_ring->rr_ring_state = ILL_RING_FREE;
                mutex_exit(&ill->ill_lock);
                return;
        }

        /* Indicate that it's being cleaned */
        rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
        ASSERT(sqp != NULL);
        mutex_exit(&ill->ill_lock);

        /*
         * Use the preallocated ill_unbind_conn for this purpose
         */
        connp = ill->ill_dls_capab->ill_unbind_conn;

        if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
                connp->conn_tcp->tcp_closemp_used = B_TRUE;
        } else {
                cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
                    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
                    (void *)connp, (void *)connp->conn_tcp);
        }

        TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
        mp = &connp->conn_tcp->tcp_closemp;
        CONN_INC_REF(connp);
        squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

        mutex_enter(&ill->ill_lock);
        while (rx_ring->rr_ring_state != ILL_RING_FREE)
                cv_wait(&ill->ill_cv, &ill->ill_lock);
        mutex_exit(&ill->ill_lock);
}

void
ip_squeue_clean_all(ill_t *ill)
{
        int idx;

        /*
         * No need to clean if poll_capab isn't set for this ill
         */
        if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
                return;

        for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
                ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

                ip_squeue_clean_ring(ill, ipr);
        }

        ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}

typedef struct ip_taskq_arg {
        ill_t           *ip_taskq_ill;
        ill_rx_ring_t   *ip_taskq_ill_rx_ring;
        cpu_t           *ip_taskq_cpu;
} ip_taskq_arg_t;
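
/*
 * Note on ip_taskq_arg_t lifetime (a summary of the code in this file): the
 * structure is allocated with KM_NOSLEEP by the interrupt-side callers
 * (ip_squeue_get() and ip_soft_ring_assignment()) and handed to the system
 * taskq; the taskq routines ip_squeue_extend() and
 * ip_squeue_soft_ring_affinity() free it as their first order of business.
 */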

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * It is fine to bind a Rx ring to a squeue attached to an offline CPU;
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
        ip_taskq_arg_t  *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t           *ill = sq_arg->ip_taskq_ill;
        ill_rx_ring_t   *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t           *intr_cpu = sq_arg->ip_taskq_cpu;
        squeue_set_t    *sqs;
        squeue_t        *sqp = NULL;

        ASSERT(ill != NULL);
        ASSERT(ill_rx_ring != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu))
                intr_cpu = CPU;

        sqs = intr_cpu->cpu_squeue_set;

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time
         * and in order to ensure that test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);
        sqp = ip_find_unused_squeue(sqs, B_FALSE);
        if (sqp == NULL) {
                /*
                 * We hit the max limit of squeues allowed per CPU.
                 * Assign this rx_ring to the DEFAULT squeue of the
                 * interrupted CPU but the squeue will not manage
                 * the ring. Also print a warning.
                 */
                cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
                    "has max number of squeues. System performance might "
                    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

                /* the first squeue in the list is the default squeue */
                sqp = sqs->sqs_list[0];
                ASSERT(sqp != NULL);
                ill_rx_ring->rr_sqp = sqp;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

                mutex_exit(&ill->ill_lock);
                ill_waiter_dcr(ill);
                return;
        }

        ASSERT(MUTEX_HELD(&sqp->sq_lock));
        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
        mutex_exit(&sqp->sq_lock);

        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}

/*
 * Bind each of the ill's soft rings to a squeue. For every soft ring, find
 * a squeue that is not already managing a receive ring; if no such squeue
 * exists, dynamically create a new one in the chosen squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * It is fine to bind a Rx ring to a squeue attached to an offline CPU;
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
        ip_taskq_arg_t  *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t           *ill = sq_arg->ip_taskq_ill;
        ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
        ill_rx_ring_t   *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t           *intr_cpu = sq_arg->ip_taskq_cpu;
        cpu_t           *bind_cpu;
        int             cpu_id = intr_cpu->cpu_id;
        int             min_cpu_id, max_cpu_id;
        boolean_t       enough_uniq_cpus = B_FALSE;
        boolean_t       enough_cpus = B_FALSE;
        squeue_set_t    *sqs, *last_sqs;
        squeue_t        *sqp = NULL;
        int             i, j;

        ASSERT(ill != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu)) {
                intr_cpu = CPU;
                cpu_id = intr_cpu->cpu_id;
        }

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time
         * and in order to ensure that test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);

        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                mutex_exit(&ill->ill_lock);
                return;
        }
        /*
         * We need to fan out the interrupts from the NIC. We do that by
         * telling the driver underneath to create soft rings and use
         * worker threads (if the driver advertised the SOFT_RING
         * capability). It is still a big performance win if we can fan
         * out to the threads on the same core that is taking interrupts.
         *
         * Since we don't know the interrupt to CPU binding, we don't
         * assign any squeues or affinity to worker threads in the NIC.
         * At the time of the first interrupt, we know which CPU is
         * taking interrupts and try to find other threads on the same
         * core. Assuming ip_threads_per_cpu is correct and CPUs are
         * numbered sequentially for each core (XXX need something better
         * than this in future), find the lowest numbered and highest
         * numbered thread for that core.
         *
         * If we have one more thread per core than the number of soft
         * rings, then don't assign any worker threads to the H/W thread
         * (CPU) taking interrupts (capability negotiation tries to ensure
         * this).
         *
         * If the number of threads per core is the same as the number of
         * soft rings, then assign the worker affinity and squeue to
         * the same CPU.
         *
         * Otherwise, just fan out to higher numbered CPUs starting from
         * the interrupted CPU.
         */

        min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
        max_cpu_id = min_cpu_id + ip_threads_per_cpu;
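
        /*
         * Worked example of the computation above (for illustration only):
         * with ip_threads_per_cpu == 4 and an interrupt landing on cpu_id 6,
         * min_cpu_id is (6 / 4) * 4 == 4 and max_cpu_id is 8, so the fanout
         * candidates are CPUs 4 through 7, i.e. the H/W threads of the
         * interrupted core. If the negotiated soft ring count is 2, then
         * 8 - 4 > 2 and enough_uniq_cpus is set below, which makes the loop
         * skip the interrupted CPU itself when assigning worker affinity.
         */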

        /*
         * Quickly check if there are enough CPUs present for fanout
         * and also that max_cpu_id does not exceed the id of the CPU
         * bound to the last squeue_set. We use the cpu_id stored in the
         * last squeue_set to get an idea. The scheme is by no means
         * perfect since it doesn't take into account CPU DR operations
         * and the fact that interrupts themselves might change. An ideal
         * scenario would be to ensure that interrupts run on CPUs by
         * themselves and worker threads never have affinity to those
         * CPUs. If the interrupts move to a CPU which had a worker
         * thread, the affinity should be changed. Probably callbacks
         * similar to CPU offline are needed to make it work perfectly.
         */
        last_sqs = sqset_global_list[sqset_global_size - 1];
        if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
                if ((max_cpu_id - min_cpu_id) >
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_uniq_cpus = B_TRUE;
                else if ((max_cpu_id - min_cpu_id) >=
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_cpus = B_TRUE;
        }

        j = 0;
        for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
                if (enough_uniq_cpus) {
                        if ((min_cpu_id + i) == cpu_id) {
                                j++;
                                continue;
                        }
                        bind_cpu = cpu[min_cpu_id + i];
                } else if (enough_cpus) {
                        bind_cpu = cpu[min_cpu_id + i];
                } else {
                        /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
                        bind_cpu = cpu[(cpu_id + i) % ncpus];
                }

                /*
                 * Check if the CPU actually exists and is active. If not,
                 * use the interrupted CPU. ip_find_unused_squeue() will
                 * find the right CPU to fanout anyway.
                 */
                if (!CPU_ISON(bind_cpu))
                        bind_cpu = intr_cpu;

                sqs = bind_cpu->cpu_squeue_set;
                ASSERT(sqs != NULL);
                ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

                sqp = ip_find_unused_squeue(sqs, B_TRUE);
                if (sqp == NULL) {
                        /*
                         * We hit the max limit of squeues allowed per CPU.
                         * Assign this rx_ring to the DEFAULT squeue of the
                         * interrupted CPU but the squeue will not manage
                         * the ring. Also print a warning.
                         */
                        cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
                            "%d/%p already has max number of squeues. System "
                            "performance might become suboptimal\n",
                            sqs->sqs_bind, (void *)sqs);

                        /* the first squeue in the list is the default squeue */
                        sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
                        ASSERT(sqp != NULL);

                        ill_rx_ring->rr_sqp = sqp;
                        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                        continue;
                }
                ASSERT(MUTEX_HELD(&sqp->sq_lock));
                ill_rx_ring->rr_sqp = sqp;
                sqp->sq_rx_ring = ill_rx_ring;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                sqp->sq_state |= SQS_ILL_BOUND;

                /* assign affinity to soft ring */
                if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
                        ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
                            sqp->sq_bind);
                }
                mutex_exit(&sqp->sq_lock);
        }
        mutex_exit(&ill->ill_lock);

        ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
            SOFT_RING_FANOUT);

        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}
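
/*
 * Receive-side entry point used while soft ring assignment is still pending
 * (a summary of the code below): called in interrupt context, it dispatches
 * ip_squeue_soft_ring_affinity() to the system taskq at most once at a time
 * per ill (guarded by ILL_SOFT_RING_ASSIGN) and then always hands the packet
 * chain to ip_input().
 */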

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
        ip_taskq_arg_t  *taskq_arg;
        boolean_t       refheld;

        ASSERT(servicing_interrupt());

        mutex_enter(&ill->ill_lock);
        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                taskq_arg = (ip_taskq_arg_t *)
                    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

                if (taskq_arg == NULL) {
                        mutex_exit(&ill->ill_lock);
                        goto out;
                }

                taskq_arg->ip_taskq_ill = ill;
                taskq_arg->ip_taskq_ill_rx_ring = NULL;
                taskq_arg->ip_taskq_cpu = CPU;

                /*
                 * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
                 * the next interrupt to schedule a task for calling
                 * ip_squeue_soft_ring_affinity().
                 */
                ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
        } else {
                mutex_exit(&ill->ill_lock);
                goto out;
        }
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq,
                    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
                        goto out;

                /* release ref on ill if taskq dispatch fails */
                ill_waiter_dcr(ill);
        }
        /*
         * Clear the ILL_SOFT_RING_ASSIGN flag so that affinity
         * assignment can be tried again later.
         */
        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
        ip_input(ill, NULL, mp_chain, mhip);
}
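
/*
 * Find (or create) a squeue in the given squeue set that is neither the
 * default squeue nor already bound to a Rx ring. If 'fanout' is set and
 * every non-default squeue in the passed set is already managing a NIC, the
 * squeue set with the fewest squeues in the system is chosen instead.
 * Returns with the chosen squeue's sq_lock held, or NULL if the set has
 * reached sqs_max_size or its CPU has been DR'd out.
 */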

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
{
        int             i;
        squeue_set_t    *best_sqs = NULL;
        squeue_set_t    *curr_sqs = NULL;
        int             min_sq = 0;
        squeue_t        *sqp = NULL;
        char            sqname[64];
        cpu_t           *bind_cpu;

        /*
         * If fanout is set and the passed squeue_set already has some
         * squeues which are managing the NICs, try to find squeues on
         * an unused CPU.
         */
        if (sqs->sqs_size > 1 && fanout) {
                /*
                 * First check to see if any squeue on the CPU passed
                 * is managing a NIC.
                 */
                for (i = 0; i < sqs->sqs_size; i++) {
                        mutex_enter(&sqs->sqs_list[i]->sq_lock);
                        if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
                            !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
                                mutex_exit(&sqs->sqs_list[i]->sq_lock);
                                break;
                        }
                        mutex_exit(&sqs->sqs_list[i]->sq_lock);
                }
                if (i != sqs->sqs_size) {
                        best_sqs = NULL;

                        for (i = sqset_global_size - 1; i >= 0; i--) {
                                curr_sqs = sqset_global_list[i];
                                /*
                                 * Check and make sure the CPU that sqs
                                 * is bound to is valid. There could be
                                 * sqs's around whose CPUs could have
                                 * been DR'd out.
                                 */
                                mutex_enter(&cpu_lock);
                                if (cpu_get(curr_sqs->sqs_bind) != NULL) {
                                        if (best_sqs == NULL) {
                                                best_sqs = curr_sqs;
                                                min_sq = curr_sqs->sqs_size;
                                        } else if (curr_sqs->sqs_size <
                                            min_sq) {
                                                best_sqs = curr_sqs;
                                                min_sq = curr_sqs->sqs_size;
                                        }
                                }
                                mutex_exit(&cpu_lock);
                        }

                        ASSERT(best_sqs != NULL);
                        sqs = best_sqs;
                }
        }

        mutex_enter(&sqs->sqs_lock);

        for (i = 0; i < sqs->sqs_size; i++) {
                mutex_enter(&sqs->sqs_list[i]->sq_lock);
                if ((sqs->sqs_list[i]->sq_state &
                    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
                        sqp = sqs->sqs_list[i];
                        break;
                }
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }

        if (sqp == NULL) {
                /* Need to create a new squeue */
                if (sqs->sqs_size == sqs->sqs_max_size) {
                        /*
                         * Reached the max limit for squeues
                         * we can allocate on this CPU.
                         */
                        mutex_exit(&sqs->sqs_lock);
                        return (NULL);
                }

                mutex_enter(&cpu_lock);
                if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
                        /* Too bad, CPU got DR'd out, return NULL */
                        mutex_exit(&cpu_lock);
                        mutex_exit(&sqs->sqs_lock);
                        return (NULL);
                }

                bzero(sqname, sizeof (sqname));
                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
                    bind_cpu->cpu_id, sqs->sqs_size);
                mutex_exit(&cpu_lock);

                sqp = squeue_create(sqname, sqs->sqs_bind,
                    ip_squeue_worker_wait, minclsyspri);

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);

                if (ip_squeue_bind) {
                        mutex_enter(&cpu_lock);
                        bind_cpu = cpu_get(sqs->sqs_bind);
                        if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
                                squeue_bind(sqp, -1);
                        }
                        mutex_exit(&cpu_lock);
                }
                mutex_enter(&sqp->sq_lock);
        }

        mutex_exit(&sqs->sqs_lock);
        ASSERT(sqp != NULL);
        return (sqp);
}
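
/*
 * Overview of the Rx ring to squeue assignment flow (a summary of the code
 * in this file; driver entry points are not shown):
 *
 *   interrupt -> ip_squeue_get(ring)
 *      fast path:      ring->rr_sqp already set, use it
 *      slow path:      mark the ring ILL_RING_INPROC, dispatch
 *                      ip_squeue_extend() to the system taskq and fall
 *                      back to IP_SQUEUE_GET(lbolt) for the time being
 *
 *   taskq -> ip_squeue_extend() -> ip_find_unused_squeue()
 *      bind the ring to the chosen squeue and mark it ILL_RING_INUSE
 */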

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
        squeue_t        *sqp;
        ill_t           *ill;
        int             interrupt;
        ip_taskq_arg_t  *taskq_arg;
        boolean_t       refheld;

        if (ill_rx_ring == NULL)
                return (IP_SQUEUE_GET(lbolt));

        sqp = ill_rx_ring->rr_sqp;
        /*
         * Do a quick check. If it's not NULL, we are done.
         * Squeues are never destroyed, so at worst we will bind
         * this connection to a suboptimal squeue.
         *
         * This is the fast path case.
         */
        if (sqp != NULL)
                return (sqp);

        ill = ill_rx_ring->rr_ill;
        ASSERT(ill != NULL);

        interrupt = servicing_interrupt();
        taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
            KM_NOSLEEP);

        mutex_enter(&ill->ill_lock);
        /*
         * Check sqp under the lock again for atomicity. Possible race with
         * a previously scheduled ip_squeue_get -> ip_squeue_extend.
         * Do the ring to squeue binding only if we are in interrupt context
         * AND the ring is not already bound AND there is no one else trying
         * the bind already.
         */
        sqp = ill_rx_ring->rr_sqp;
        if (sqp != NULL || !interrupt ||
            ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
                /*
                 * Note that the ring might get bound once we drop the lock
                 * below, if a previous request is in progress i.e. if the ring
                 * state is ILL_RING_INPROC. The incoming connection on whose
                 * behalf we are currently here might get a suboptimal squeue
                 * via the call to IP_SQUEUE_GET below, but there is no
                 * correctness issue.
                 */
                mutex_exit(&ill->ill_lock);
                if (taskq_arg != NULL)
                        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
                if (sqp != NULL)
                        return (sqp);
                return (IP_SQUEUE_GET(lbolt));
        }

        /*
         * No sqp assigned yet. Can't really do that in interrupt
         * context. Assign the default sqp to this connection and
         * trigger creation of a new sqp and binding it to this ring
         * via taskq. Need to make sure the ill stays around.
         */
        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
        taskq_arg->ip_taskq_cpu = CPU;
        ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq, ip_squeue_extend,
                    taskq_arg, TQ_NOSLEEP) != NULL) {
                        return (IP_SQUEUE_GET(lbolt));
                }
        }
        /*
         * The ill is closing and we could not get a reference on the ill OR
         * taskq_dispatch failed probably due to memory allocation failure.
         * We will try again next time.
         */
        mutex_enter(&ill->ill_lock);
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        if (refheld)
                ill_waiter_dcr(ill);

        return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tunables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
        int *bind_enabled = (int *)addr;
        long new_value;
        int i;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (ip_squeue_bind == new_value)
                return (0);

        *bind_enabled = new_value;
        mutex_enter(&cpu_lock);
        if (new_value == 0) {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_unbind(sqset_global_list[i]);
        } else {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_bind(sqset_global_list[i]);
        }

        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
        int *profile_enabled = (int *)cp;
        long new_value;
        squeue_set_t *sqs;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (new_value == 0)
                squeue_profile_stop();
        else if (new_value == 1)
                squeue_profile_start();
        else if (new_value == 2) {
                int i, j;

                squeue_profile_stop();
                mutex_enter(&cpu_lock);
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        for (j = 0; j < sqs->sqs_size; j++) {
                                squeue_profile_reset(sqs->sqs_list[j]);
                        }
                }
                mutex_exit(&cpu_lock);

                new_value = 1;
                squeue_profile_start();
        }
        *profile_enabled = new_value;

        return (0);
}
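
/*
 * As noted in the block comment at the top of this file, these tunables are
 * reachable through ndd on /dev/ip (or /dev/tcp). For example, assuming the
 * variables are registered under the same names, worker thread binding could
 * be turned off and squeue profiling reset with:
 *
 *      ndd -set /dev/ip ip_squeue_bind 0
 *      ndd -set /dev/ip ip_squeue_profile 2
 */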

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        cpu_t *cp = cpu[id];

        ASSERT(MUTEX_HELD(&cpu_lock));
        switch (what) {
        case CPU_CONFIG:
                /*
                 * A new CPU is added. Create an squeue for it but do not bind
                 * it yet.
                 */
                if (cp->cpu_squeue_set == NULL)
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                break;
        case CPU_ON:
        case CPU_INIT:
        case CPU_CPUPART_IN:
                if (cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                }
                if (ip_squeue_bind)
                        ip_squeue_set_bind(cp->cpu_squeue_set);
                break;
        case CPU_UNCONFIG:
        case CPU_OFF:
        case CPU_CPUPART_OUT:
                ASSERT((cp->cpu_squeue_set != NULL) ||
                    (cp->cpu_flags & CPU_OFFLINE));

                if (cp->cpu_squeue_set != NULL) {
                        ip_squeue_set_unbind(cp->cpu_squeue_set);
                }
                break;
        default:
                break;
        }
        return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        if (!ip_squeue_bind)
                return;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];
                if (sqp->sq_state & SQS_BOUND)
                        continue;
                squeue_bind(sqp, -1);
        }
        mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];

                /*
                 * CPU is going offline. Remove the thread affinity
                 * for any soft ring threads the squeue is managing.
                 */
                if (sqp->sq_state & SQS_ILL_BOUND) {
                        ill_rx_ring_t   *ring = sqp->sq_rx_ring;
                        ill_t           *ill = ring->rr_ill;

                        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                                ASSERT(ring->rr_handle != NULL);
                                ill->ill_dls_capab->ill_dls_unbind(
                                    ring->rr_handle);
                        }
                }
                if (!(sqp->sq_state & SQS_BOUND))
                        continue;
                squeue_unbind(sqp);
        }
        mutex_exit(&sqs->sqs_lock);
}