/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, squeue creation and association
 * with the CPU happens at MP initialization time. For CPUs added during
 * dynamic reconfiguration, the initialization happens when the new CPU is
 * configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in the squeue_set structures. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
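 *
 * As an illustration only (a sketch, not part of the interface contract):
 * a caller with no Rx ring affinity typically picks an squeue with the
 * IP_SQUEUE_GET() macro, which resolves either to the interrupted CPU's
 * cpu_squeue or to ip_squeue_get(), depending on ip_squeue_fanout:
 *
 *	squeue_t *sqp = IP_SQUEUE_GET(lbolt);
 *
 * ip_squeue_get() below falls back to the same IP_SQUEUE_GET(lbolt) call
 * whenever a Rx ring has no squeue bound to it yet.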
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the
 *	worker thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>

/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons, so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
static void ip_squeue_clean(void *, mblk_t *, void *);
static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
        int i;
        squeue_set_t    *sqs;
        squeue_t        *sqp;
        char            sqname[64];
        processorid_t   id = cp->cpu_id;

        if (reuse) {
                int i;

                /*
                 * We may already have an squeue created for this CPU. Try to
                 * find one and reuse it if possible.
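                 * (reuse is passed as B_TRUE only from the DR callback
                 * ip_squeue_cpu_setup() below; ip_squeue_init() always
                 * creates fresh sets with B_FALSE.)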
                 */
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        if (id == sqs->sqs_bind)
                                return (sqs);
                }
        }

        sqs = kmem_zalloc(sizeof (squeue_set_t) +
            (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
        mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
        sqs->sqs_list = (squeue_t **)&sqs[1];
        sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
        sqs->sqs_bind = id;

        for (i = 0; i < ip_squeues_per_cpu; i++) {
                bzero(sqname, sizeof (sqname));

                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
                    cp->cpu_id, i);

                sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
                    minclsyspri);

                /*
                 * The first squeue in each squeue_set is the DEFAULT
                 * squeue.
                 */
                sqp->sq_state |= SQS_DEFAULT;

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);
        }

        if (ip_squeue_bind && cpu_is_online(cp))
                ip_squeue_set_bind(sqs);

        sqset_global_list[sqset_global_size++] = sqs;
        ASSERT(sqset_global_size <= NCPU);
        return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
        int i;

        ASSERT(sqset_global_list == NULL);

        if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
        else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
                ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

        ip_squeue_create_callback = callback;
        squeue_init();
        sqset_global_list =
            kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
        sqset_global_size = 0;
        mutex_enter(&cpu_lock);

        /* Create squeue for each active CPU available */
        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu[i];
                if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
                }
        }

        register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

        mutex_exit(&cpu_lock);

        if (ip_squeue_profile)
                squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
        squeue_set_t *sqs;

        sqs = sqset_global_list[index % sqset_global_size];
        return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
static void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
        squeue_t        *sqp = arg2;
        ill_rx_ring_t   *ring = sqp->sq_rx_ring;
        ill_t           *ill;

        ASSERT(sqp != NULL);

        if (ring == NULL) {
                return;
        }

        /*
         * Clean up squeue
         */
        mutex_enter(&sqp->sq_lock);
        sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
        sqp->sq_rx_ring = NULL;
        mutex_exit(&sqp->sq_lock);

        ill = ring->rr_ill;
        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                ASSERT(ring->rr_handle != NULL);
                ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
        }

        /*
         * Cleanup the ring
         */

        ring->rr_blank = NULL;
        ring->rr_handle = NULL;
        ring->rr_sqp = NULL;

        /*
         * Signal ill that cleanup is done
         */
        mutex_enter(&ill->ill_lock);
        ring->rr_ring_state = ILL_RING_FREE;
        cv_signal(&ill->ill_cv);
        mutex_exit(&ill->ill_lock);
}

/*
 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via the ip_squeue_clean function
 * but we need to protect ourselves from 2 threads trying to cleanup at the
 * same time (possible with one port going down for aggr and someone tearing
 * down the entire aggr simultaneously). So we use ill_inuse_ref protected by
 * ill_lock to indicate when the cleanup has started (1 ref) and when the
 * cleanup is done (0 ref). When a new ring gets assigned to an squeue, we
 * start by putting 2 refs on ill_inuse_ref.
 */
static void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
        conn_t *connp;
        squeue_t *sqp;
        mblk_t *mp;

        ASSERT(rx_ring != NULL);

        /* Just clean one squeue */
        mutex_enter(&ill->ill_lock);
        /*
         * Reset the ILL_SOFT_RING_ASSIGN bit so that
         * ip_squeue_soft_ring_affinity() will not go
         * ahead with assigning rings.
         */
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        while (rx_ring->rr_ring_state == ILL_RING_INPROC)
                /* Some operations pending on the ring. Wait */
                cv_wait(&ill->ill_cv, &ill->ill_lock);

        if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
                /*
                 * Someone is already trying to clean
                 * this squeue or it's already been cleaned.
                 */
                mutex_exit(&ill->ill_lock);
                return;
        }
        sqp = rx_ring->rr_sqp;

        if (sqp == NULL) {
                /*
                 * The rx_ring never had a squeue assigned to it.
                 * We are under ill_lock so we can clean it up
                 * here itself since no one can get to it.
                 */
                rx_ring->rr_blank = NULL;
                rx_ring->rr_handle = NULL;
                rx_ring->rr_sqp = NULL;
                rx_ring->rr_ring_state = ILL_RING_FREE;
                mutex_exit(&ill->ill_lock);
                return;
        }

        /* Indicate that it's being cleaned */
        rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
        ASSERT(sqp != NULL);
        mutex_exit(&ill->ill_lock);

        /*
         * Use the preallocated ill_unbind_conn for this purpose
         */
        connp = ill->ill_dls_capab->ill_unbind_conn;

        if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
                connp->conn_tcp->tcp_closemp_used = B_TRUE;
        } else {
                cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
                    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
                    (void *)connp, (void *)connp->conn_tcp);
        }

        TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
        mp = &connp->conn_tcp->tcp_closemp;
        CONN_INC_REF(connp);
        squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

        mutex_enter(&ill->ill_lock);
        while (rx_ring->rr_ring_state != ILL_RING_FREE)
                cv_wait(&ill->ill_cv, &ill->ill_lock);
        mutex_exit(&ill->ill_lock);
}

void
ip_squeue_clean_all(ill_t *ill)
{
        int idx;

        /*
         * No need to clean if poll_capab isn't set for this ill
         */
        if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
                return;

        for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
                ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

                ip_squeue_clean_ring(ill, ipr);
        }

        ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}

typedef struct ip_taskq_arg {
        ill_t           *ip_taskq_ill;
        ill_rx_ring_t   *ip_taskq_ill_rx_ring;
        cpu_t           *ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU, there
 * is no need to check for that because squeues are never destroyed once
 * created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
        ip_taskq_arg_t  *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t           *ill = sq_arg->ip_taskq_ill;
        ill_rx_ring_t   *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t           *intr_cpu = sq_arg->ip_taskq_cpu;
        squeue_set_t    *sqs;
        squeue_t        *sqp = NULL;

        ASSERT(ill != NULL);
        ASSERT(ill_rx_ring != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu))
                intr_cpu = CPU;

        sqs = intr_cpu->cpu_squeue_set;

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time
         * and in order to ensure that test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);
        sqp = ip_find_unused_squeue(sqs, B_FALSE);
        if (sqp == NULL) {
                /*
                 * We hit the max limit of squeues allowed per CPU.
                 * Assign this rx_ring to the DEFAULT squeue of the
                 * interrupted CPU but the squeue will not manage
                 * the ring. Also print a warning.
                 */
                cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
                    "has max number of squeues. System performance might "
                    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

                /* the first squeue in the list is the default squeue */
                sqp = sqs->sqs_list[0];
                ASSERT(sqp != NULL);
                ill_rx_ring->rr_sqp = sqp;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

                mutex_exit(&ill->ill_lock);
                ill_waiter_dcr(ill);
                return;
        }

        ASSERT(MUTEX_HELD(&sqp->sq_lock));
        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
        mutex_exit(&sqp->sq_lock);

        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU, there
 * is no need to check for that because squeues are never destroyed once
 * created.
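 *
 * Unlike ip_squeue_extend(), this variant spreads the ill's soft rings
 * across squeues on the CPUs of the interrupted CPU's core and, where
 * possible, gives the driver's soft ring worker threads affinity to the
 * squeues' CPUs via ill_dls_bind().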
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
        ip_taskq_arg_t  *sq_arg = (ip_taskq_arg_t *)arg;
        ill_t           *ill = sq_arg->ip_taskq_ill;
        ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
        ill_rx_ring_t   *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
        cpu_t           *intr_cpu = sq_arg->ip_taskq_cpu;
        cpu_t           *bind_cpu;
        int             cpu_id = intr_cpu->cpu_id;
        int             min_cpu_id, max_cpu_id;
        boolean_t       enough_uniq_cpus = B_FALSE;
        boolean_t       enough_cpus = B_FALSE;
        squeue_set_t    *sqs, *last_sqs;
        squeue_t        *sqp = NULL;
        int             i, j;

        ASSERT(ill != NULL);
        kmem_free(arg, sizeof (ip_taskq_arg_t));

        /*
         * Make sure the CPU that originally took the interrupt still
         * exists.
         */
        if (!CPU_ISON(intr_cpu)) {
                intr_cpu = CPU;
                cpu_id = intr_cpu->cpu_id;
        }

        /*
         * If this ill represents link aggregation, then there might be
         * multiple NICs trying to register themselves at the same time
         * and in order to ensure that test and assignment of free rings
         * is sequential, we need to hold the ill_lock.
         */
        mutex_enter(&ill->ill_lock);

        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                mutex_exit(&ill->ill_lock);
                return;
        }
        /*
         * We need to fanout the interrupts from the NIC. We do that by
         * telling the driver underneath to create soft rings and use
         * worker threads (if the driver advertised the SOFT_RING
         * capability). It is still a big performance win if we can fan
         * out to the threads on the same core that is taking interrupts.
         *
         * Since we don't know the interrupt to CPU binding, we don't
         * assign any squeues or affinity to worker threads in the NIC.
         * At the time of the first interrupt, we know which CPU is
         * taking interrupts and try to find other threads on the same
         * core. Assuming ip_threads_per_cpu is correct and cpus are
         * numbered sequentially for each core (XXX need something better
         * than this in the future), find the lowest and highest numbered
         * thread for that core.
         *
         * If we have one more thread per core than the number of soft
         * rings, then don't assign any worker threads to the H/W thread
         * (cpu) taking interrupts (capability negotiation tries to
         * ensure this).
         *
         * If the number of threads per core is the same as the number of
         * soft rings, then assign the worker affinity and squeue to
         * the same cpu.
         *
         * Otherwise, just fanout to higher number CPUs starting from
         * the interrupted CPU.
         */

        min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
        max_cpu_id = min_cpu_id + ip_threads_per_cpu;

        /*
         * Quickly check if there are enough CPUs present for fanout
         * and also that max_cpu_id does not go beyond the id of the
         * last CPU that has an squeue set. We use the cpu_id stored in
         * the last squeue_set to get an idea. The scheme is by no means
         * perfect since it doesn't take into account CPU DR operations
         * and the fact that interrupts themselves might change. An ideal
         * scenario would be to ensure that interrupts run on CPUs by
         * themselves and worker threads never have affinity to those
         * CPUs. If the interrupts move to a CPU which had a worker
         * thread, the affinity should be changed. Probably callbacks
         * similar to CPU offline are needed to make it work perfectly.
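         *
         * For example (illustrative numbers only): with ip_threads_per_cpu
         * of 4 and an interrupt taken on cpu_id 6, min_cpu_id is 4 and
         * max_cpu_id is 8, so the fanout candidates are CPUs 4 through 7;
         * with 2 soft rings that also leaves enough unique CPUs (4 > 2)
         * to skip the interrupted CPU when assigning worker threads.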
         */
        last_sqs = sqset_global_list[sqset_global_size - 1];
        if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
                if ((max_cpu_id - min_cpu_id) >
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_uniq_cpus = B_TRUE;
                else if ((max_cpu_id - min_cpu_id) >=
                    ill_soft_ring->ill_dls_soft_ring_cnt)
                        enough_cpus = B_TRUE;
        }

        j = 0;
        for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
                if (enough_uniq_cpus) {
                        if ((min_cpu_id + i) == cpu_id) {
                                j++;
                                continue;
                        }
                        bind_cpu = cpu[min_cpu_id + i];
                } else if (enough_cpus) {
                        bind_cpu = cpu[min_cpu_id + i];
                } else {
                        /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
                        bind_cpu = cpu[(cpu_id + i) % ncpus];
                }

                /*
                 * Check if the CPU actually exists and is active. If not,
                 * use the interrupted CPU. ip_find_unused_squeue() will
                 * find the right CPU to fanout anyway.
                 */
                if (!CPU_ISON(bind_cpu))
                        bind_cpu = intr_cpu;

                sqs = bind_cpu->cpu_squeue_set;
                ASSERT(sqs != NULL);
                ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

                sqp = ip_find_unused_squeue(sqs, B_TRUE);
                if (sqp == NULL) {
                        /*
                         * We hit the max limit of squeues allowed per CPU.
                         * Assign this rx_ring to the DEFAULT squeue of the
                         * interrupted CPU but the squeue will not manage
                         * the ring. Also print a warning.
                         */
                        cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
                            "%d/%p already has max number of squeues. System "
                            "performance might become suboptimal\n",
                            sqs->sqs_bind, (void *)sqs);

                        /* the first squeue in the list is the default squeue */
                        sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
                        ASSERT(sqp != NULL);

                        ill_rx_ring->rr_sqp = sqp;
                        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                        continue;
                }
                ASSERT(MUTEX_HELD(&sqp->sq_lock));
                ill_rx_ring->rr_sqp = sqp;
                sqp->sq_rx_ring = ill_rx_ring;
                ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
                sqp->sq_state |= SQS_ILL_BOUND;

                /* assign affinity to soft ring */
                if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
                        ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
                            sqp->sq_bind);
                }
                mutex_exit(&sqp->sq_lock);
        }
        mutex_exit(&ill->ill_lock);

        ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
            SOFT_RING_FANOUT);

        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        mutex_exit(&ill->ill_lock);

        /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
        ill_waiter_dcr(ill);
}

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
        ip_taskq_arg_t  *taskq_arg;
        boolean_t       refheld;

        ASSERT(servicing_interrupt());

        mutex_enter(&ill->ill_lock);
        if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
                taskq_arg = (ip_taskq_arg_t *)
                    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

                if (taskq_arg == NULL) {
                        /* drop ill_lock before the common exit path */
                        mutex_exit(&ill->ill_lock);
                        goto out;
                }

                taskq_arg->ip_taskq_ill = ill;
                taskq_arg->ip_taskq_ill_rx_ring = NULL;
                taskq_arg->ip_taskq_cpu = CPU;

                /*
                 * Set the ILL_SOFT_RING_ASSIGN flag so that the
                 * next interrupt does not schedule another task for
                 * calling ip_squeue_soft_ring_affinity().
                 */
                ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
        } else {
                mutex_exit(&ill->ill_lock);
                goto out;
        }
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq,
                    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
                        goto out;

                /* release ref on ill if taskq dispatch fails */
                ill_waiter_dcr(ill);
        }
        /*
         * Clear ILL_SOFT_RING_ASSIGN so that affinity assignment
         * can be tried again later.
         */
        mutex_enter(&ill->ill_lock);
        ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
        ip_input(ill, NULL, mp_chain, mhip);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
{
        int             i;
        squeue_set_t    *best_sqs = NULL;
        squeue_set_t    *curr_sqs = NULL;
        int             min_sq = 0;
        squeue_t        *sqp = NULL;
        char            sqname[64];
        cpu_t           *bind_cpu;

        /*
         * If fanout is set and the passed squeue_set already has some
         * squeues which are managing the NICs, try to find an squeue on
         * an unused CPU.
         */
        if (sqs->sqs_size > 1 && fanout) {
                /*
                 * First check to see if any squeue on the CPU passed
                 * is managing a NIC.
                 */
                for (i = 0; i < sqs->sqs_size; i++) {
                        mutex_enter(&sqs->sqs_list[i]->sq_lock);
                        if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
                            !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
                                mutex_exit(&sqs->sqs_list[i]->sq_lock);
                                break;
                        }
                        mutex_exit(&sqs->sqs_list[i]->sq_lock);
                }
                if (i != sqs->sqs_size) {
                        best_sqs = NULL;

                        for (i = sqset_global_size - 1; i >= 0; i--) {
                                curr_sqs = sqset_global_list[i];
                                /*
                                 * Check and make sure the CPU that sqs
                                 * is bound to is valid. There could be
                                 * sqs's around whose CPUs could have
                                 * been DR'd out. Also note cpu_lock is
                                 * not held here. It is ok as later we
                                 * do cpu_lock when we access cpu_t
                                 * members.
                                 */
                                if (cpu_get(curr_sqs->sqs_bind) != NULL) {
                                        if (best_sqs == NULL) {
                                                best_sqs = curr_sqs;
                                                min_sq = curr_sqs->sqs_size;
                                        } else if (curr_sqs->sqs_size <
                                            min_sq) {
                                                best_sqs = curr_sqs;
                                                min_sq = curr_sqs->sqs_size;
                                        }
                                }
                        }

                        ASSERT(best_sqs != NULL);
                        sqs = best_sqs;
                }
        }

        mutex_enter(&sqs->sqs_lock);

        for (i = 0; i < sqs->sqs_size; i++) {
                mutex_enter(&sqs->sqs_list[i]->sq_lock);
                if ((sqs->sqs_list[i]->sq_state &
                    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
                        sqp = sqs->sqs_list[i];
                        break;
                }
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }

        if (sqp == NULL) {
                /* Need to create a new squeue */
                if (sqs->sqs_size == sqs->sqs_max_size) {
                        /*
                         * Reached the max limit of squeues
                         * we can allocate on this CPU.
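                         * (sqs_max_size is MAX_SQUEUES_PER_CPU, 32 by
                         * default, as set up in ip_squeue_set_create().)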
                         */
                        mutex_exit(&sqs->sqs_lock);
                        return (NULL);
                }

                mutex_enter(&cpu_lock);
                if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
                        /* Too bad, CPU got DR'd out, return NULL */
                        mutex_exit(&cpu_lock);
                        mutex_exit(&sqs->sqs_lock);
                        return (NULL);
                }

                bzero(sqname, sizeof (sqname));
                (void) snprintf(sqname, sizeof (sqname),
                    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
                    bind_cpu->cpu_id, sqs->sqs_size);
                mutex_exit(&cpu_lock);

                sqp = squeue_create(sqname, sqs->sqs_bind,
                    ip_squeue_worker_wait, minclsyspri);

                ASSERT(sqp != NULL);

                squeue_profile_enable(sqp);
                sqs->sqs_list[sqs->sqs_size++] = sqp;

                if (ip_squeue_create_callback != NULL)
                        ip_squeue_create_callback(sqp);

                if (ip_squeue_bind) {
                        mutex_enter(&cpu_lock);
                        bind_cpu = cpu_get(sqs->sqs_bind);
                        if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
                                squeue_bind(sqp, -1);
                        }
                        mutex_exit(&cpu_lock);
                }
                mutex_enter(&sqp->sq_lock);
        }

        mutex_exit(&sqs->sqs_lock);
        ASSERT(sqp != NULL);
        return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
        squeue_t        *sqp;
        ill_t           *ill;
        int             interrupt;
        ip_taskq_arg_t  *taskq_arg;
        boolean_t       refheld;

        if (ill_rx_ring == NULL)
                return (IP_SQUEUE_GET(lbolt));

        sqp = ill_rx_ring->rr_sqp;
        /*
         * Do a quick check. If it's not NULL, we are done.
         * Squeues are never destroyed, so at worst we will bind
         * this connection to a suboptimal squeue.
         *
         * This is the fast path case.
         */
        if (sqp != NULL)
                return (sqp);

        ill = ill_rx_ring->rr_ill;
        ASSERT(ill != NULL);

        interrupt = servicing_interrupt();
        taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
            KM_NOSLEEP);

        mutex_enter(&ill->ill_lock);
        /*
         * Check sqp under the lock again for atomicity. Possible race with
         * a previously scheduled ip_squeue_get -> ip_squeue_extend.
         * Do the ring to squeue binding only if we are in interrupt context
         * AND the ring is not already bound AND there is no one else trying
         * the bind already.
         */
        sqp = ill_rx_ring->rr_sqp;
        if (sqp != NULL || !interrupt ||
            ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
                /*
                 * Note that the ring might get bound once we drop the lock
                 * below, if a previous request is in progress i.e. if the
                 * ring state is ILL_RING_INPROC. The incoming connection on
                 * whose behalf we are currently here might get a suboptimal
                 * squeue via the call to IP_SQUEUE_GET below, but there is
                 * no correctness issue.
                 */
                mutex_exit(&ill->ill_lock);
                if (taskq_arg != NULL)
                        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
                if (sqp != NULL)
                        return (sqp);
                return (IP_SQUEUE_GET(lbolt));
        }

        /*
         * No sqp assigned yet. Can't really do that in interrupt
         * context. Assign the default sqp to this connection and
         * trigger creation of a new sqp and binding it to this ring
         * via taskq. Need to make sure ill stays around.
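         *
         * (For illustration of the resulting flow: this call returns
         * IP_SQUEUE_GET(lbolt) right away, ip_squeue_extend() later does
         * the actual ring to squeue binding from the system taskq, and
         * subsequent calls take the fast path above and return rr_sqp.)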
         */
        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
        taskq_arg->ip_taskq_cpu = CPU;
        ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
        mutex_exit(&ill->ill_lock);
        refheld = ill_waiter_inc(ill);
        if (refheld) {
                if (taskq_dispatch(system_taskq, ip_squeue_extend,
                    taskq_arg, TQ_NOSLEEP) != NULL) {
                        return (IP_SQUEUE_GET(lbolt));
                }
        }
        /*
         * The ill is closing and we could not get a reference on the ill OR
         * taskq_dispatch failed probably due to memory allocation failure.
         * We will try again next time.
         */
        mutex_enter(&ill->ill_lock);
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        mutex_exit(&ill->ill_lock);
        kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        if (refheld)
                ill_waiter_dcr(ill);

        return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
        int *bind_enabled = (int *)addr;
        long new_value;
        int i;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (ip_squeue_bind == new_value)
                return (0);

        *bind_enabled = new_value;
        mutex_enter(&cpu_lock);
        if (new_value == 0) {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_unbind(sqset_global_list[i]);
        } else {
                for (i = 0; i < sqset_global_size; i++)
                        ip_squeue_set_bind(sqset_global_list[i]);
        }

        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
        int *profile_enabled = (int *)cp;
        long new_value;
        squeue_set_t *sqs;

        if (ddi_strtol(value, NULL, 10, &new_value) != 0)
                return (EINVAL);

        if (new_value == 0)
                squeue_profile_stop();
        else if (new_value == 1)
                squeue_profile_start();
        else if (new_value == 2) {
                int i, j;

                squeue_profile_stop();
                mutex_enter(&cpu_lock);
                for (i = 0; i < sqset_global_size; i++) {
                        sqs = sqset_global_list[i];
                        for (j = 0; j < sqs->sqs_size; j++) {
                                squeue_profile_reset(sqs->sqs_list[j]);
                        }
                }
                mutex_exit(&cpu_lock);

                new_value = 1;
                squeue_profile_start();
        }
        *profile_enabled = new_value;

        return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        cpu_t *cp = cpu[id];

        ASSERT(MUTEX_HELD(&cpu_lock));
        switch (what) {
        case CPU_CONFIG:
                /*
                 * A new CPU is added. Create an squeue for it but do not bind
                 * it yet.
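                 * (Binding happens later, from the CPU_ON / CPU_INIT /
                 * CPU_CPUPART_IN cases below.)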
                 */
                if (cp->cpu_squeue_set == NULL)
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                break;
        case CPU_ON:
        case CPU_INIT:
        case CPU_CPUPART_IN:
                if (cp->cpu_squeue_set == NULL) {
                        cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
                }
                if (ip_squeue_bind)
                        ip_squeue_set_bind(cp->cpu_squeue_set);
                break;
        case CPU_UNCONFIG:
        case CPU_OFF:
        case CPU_CPUPART_OUT:
                ASSERT((cp->cpu_squeue_set != NULL) ||
                    (cp->cpu_flags & CPU_OFFLINE));

                if (cp->cpu_squeue_set != NULL) {
                        ip_squeue_set_unbind(cp->cpu_squeue_set);
                }
                break;
        default:
                break;
        }
        return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        if (!ip_squeue_bind)
                return;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];
                if (sqp->sq_state & SQS_BOUND)
                        continue;
                squeue_bind(sqp, -1);
        }
        mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
        int i;
        squeue_t *sqp;

        mutex_enter(&sqs->sqs_lock);
        for (i = 0; i < sqs->sqs_size; i++) {
                sqp = sqs->sqs_list[i];

                /*
                 * CPU is going offline. Remove the thread affinity
                 * for any soft ring threads the squeue is managing.
                 */
                if (sqp->sq_state & SQS_ILL_BOUND) {
                        ill_rx_ring_t   *ring = sqp->sq_rx_ring;
                        ill_t           *ill = ring->rr_ill;

                        if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                                ASSERT(ring->rr_handle != NULL);
                                ill->ill_dls_capab->ill_dls_unbind(
                                    ring->rr_handle);
                        }
                }
                if (!(sqp->sq_state & SQS_BOUND))
                        continue;
                squeue_unbind(sqp);
        }
        mutex_exit(&sqs->sqs_lock);
}