/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time the squeue creation and association
 * with CPU happens at MP initialization time. For CPUs added during dynamic
 * reconfiguration, the initialization happens when the new CPU is configured in
 * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
 * return per-CPU squeue or random squeue based on the ip_squeue_fanout
 * variable.
 *
 * There are two modes of associating connection with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
 * may process the connection on whatever CPU it is scheduled. The squeue to CPU
 * binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in squeue_set structure. This list is
 * used when ip_squeue_fanout is set and the load is distributed across all
 * squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
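 *
 * For example (an illustrative sketch only, not a fixed interface): a caller
 * that simply needs some squeue on which to run a handler for a conn_t can do
 *
 *		squeue_t *sqp = IP_SQUEUE_GET(lbolt);
 *		squeue_enter(sqp, mp, conn_proc, connp, NULL);
 *
 * where conn_proc and connp stand in for whatever handler and argument the
 * caller actually uses; see squeue.c for the authoritative entry points.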
 *
 *
 * DR Notes
 * ========
 *
 * The ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
 *	an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>
#include <sys/atomic.h>

/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t	ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
static void ip_squeue_clean(void *, mblk_t *, void *);
static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
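
/*
 * Note that sqset_global_list[] is a dense, append-only array (it is not
 * indexed by cpu_id): ip_squeue_set_create() below adds entries and nothing
 * ever removes them, which is why ip_squeue_random() can walk the list
 * without taking cpu_lock.
 */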

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);
		ASSERT(sqp != NULL);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
static void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring = (ill_rx_ring_t *)mp->b_wptr;
	ill_t		*ill;

	ASSERT(sqp != NULL);
	mp->b_wptr = NULL;

	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up squeue
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Cleanup the ring
	 */

	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal ill that cleanup is done
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}
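
/*
 * ip_squeue_clean() above is not called directly; ip_squeue_clean_ring()
 * below queues it on the victim squeue via squeue_enter(), passing the ring
 * pointer in b_wptr, so the unbinding runs serialized with any other work
 * on that squeue.
 */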

/*
 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via ip_squeue_clean function but
 * we need to protect ourselves from 2 threads trying to cleanup at the same
 * time (possible with one port going down for aggr and someone tearing down the
 * entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock
 * to indicate when the cleanup has started (1 ref) and when the cleanup
 * is done (0 ref). When a new ring gets assigned to squeue, we start by
 * putting 2 ref on ill_inuse_ref.
 */
static void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	conn_t *connp;
	squeue_t *sqp;
	mblk_t *mp;

	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	/*
	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
	 * ip_squeue_soft_ring_affinity() will not go
	 * ahead with assigning rings.
	 */
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
		/* Some operations pending on the ring. Wait */
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
		/*
		 * Someone already trying to clean
		 * this squeue or it's already been cleaned.
		 */
		mutex_exit(&ill->ill_lock);
		return;
	}
	sqp = rx_ring->rr_sqp;

	if (sqp == NULL) {
		/*
		 * The rx_ring never had a squeue assigned to it.
		 * We are under ill_lock so we can clean it up
		 * here itself since no one can get to it.
		 */
		rx_ring->rr_blank = NULL;
		rx_ring->rr_handle = NULL;
		rx_ring->rr_sqp = NULL;
		rx_ring->rr_ring_state = ILL_RING_FREE;
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Indicate that it's being cleaned */
	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
	ASSERT(sqp != NULL);
	mutex_exit(&ill->ill_lock);

	/*
	 * Use the preallocated ill_unbind_conn for this purpose
	 */
	connp = ill->ill_dls_capab->ill_unbind_conn;

	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
		connp->conn_tcp->tcp_closemp_used = B_TRUE;
	} else {
		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
		    (void *)connp, (void *)connp->conn_tcp);
	}

	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
	mp = &connp->conn_tcp->tcp_closemp;
	CONN_INC_REF(connp);

	/*
	 * Since the field sq_rx_ring for default squeue is NULL,
	 * ip_squeue_clean() will have no way to get the ring if we
	 * don't pass the pointer to it. We use b_wptr to do so
	 * as use of b_wptr for any other purpose is not expected.
	 */

	ASSERT(mp->b_wptr == NULL);
	mp->b_wptr = (unsigned char *)rx_ring;
	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

	mutex_enter(&ill->ill_lock);
	while (rx_ring->rr_ring_state != ILL_RING_FREE)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
}

void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;

	/*
	 * No need to clean if poll_capab isn't set for this ill
	 */
	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
		return;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

		ip_squeue_clean_ring(ill, ipr);
	}

	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}

typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * that is fine, because squeues are never destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * that is fine, because squeues are never destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t			*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t		*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t			*bind_cpu;
	int			cpu_id = intr_cpu->cpu_id;
	int			min_cpu_id, max_cpu_id;
	boolean_t		enough_uniq_cpus = B_FALSE;
	boolean_t		enough_cpus = B_FALSE;
	squeue_set_t		*sqs, *last_sqs;
	squeue_t		*sqp = NULL;
	int			i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fanout the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING capability).
	 * It is still a big performance win if we can fan out to the
	 * threads on the same core that is taking interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and cpus are
	 * numbered sequentially for each core (XXX need something better
	 * than this in future), find the lowest number and highest
	 * number thread for that core.
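	 *
	 * For example (illustrative numbers only): with ip_threads_per_cpu
	 * equal to 4 and the interrupt landing on cpu_id 6, min_cpu_id below
	 * works out to 4 and max_cpu_id to 8, so CPUs 4..7 are the fanout
	 * candidates for that core.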
	 *
	 * If we have one more thread per core than number of soft rings,
	 * then don't assign any worker threads to the H/W thread (cpu)
	 * taking interrupts (capability negotiation tries to ensure this).
	 *
	 * If the number of threads per core is the same as the number of
	 * soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fanout to higher number CPUs starting from
	 * the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	/*
	 * Quickly check if there are enough CPUs present for fanout
	 * and also max_cpu_id is less than the id of the active CPU.
	 * We use the cpu_id stored in the last squeue_set to get
	 * an idea. The scheme is by no means perfect since it doesn't
	 * take into account CPU DR operations and the fact that
	 * interrupts themselves might change. An ideal scenario
	 * would be to ensure that interrupts run on CPUs by themselves
	 * and worker threads never have affinity to those CPUs. If
	 * an interrupt moves to a CPU which had a worker thread, the
	 * affinity should be changed. Probably callbacks similar to
	 * CPU offline are needed to make it work perfectly.
	 */
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check if the CPU actually exists and is active. If not,
		 * use the interrupted CPU. ip_find_unused_squeue() will
		 * find the right CPU to fanout anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. System "
			    "performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;

		}
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_FANOUT);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL)
			goto out;

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = NULL;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
		 * interrupt does not schedule another task for calling
		 * ip_squeue_soft_ring_affinity().
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill if taskq dispatch fails */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear the ILL_SOFT_RING_ASSIGN flag so that affinity assignment
	 * can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, NULL, mp_chain, mhip);
}
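
/*
 * Find an squeue in the given set that is not already managing an Rx ring
 * (and is not the default squeue), creating a new squeue if there is still
 * room in the set. With 'fanout' set and the passed set already managing a
 * ring, fall back to the least populated squeue set in the system whose CPU
 * is still present. Returns with the chosen squeue's sq_lock held, or NULL
 * if the set is full or its CPU has been DR'd out.
 */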
static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];
	cpu_t		*bind_cpu;

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing the NICs, try to find squeues on
	 * an unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
		 */
		mutex_enter(&sqs->sqs_lock);
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		mutex_exit(&sqs->sqs_lock);
		if (i != sqs->sqs_size) {
			best_sqs = NULL;

			for (i = sqset_global_size - 1; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				/*
				 * Check and make sure the CPU that sqs
				 * is bound to is valid. There could be
				 * sqs's around whose CPUs could have
				 * been DR'd out.
				 */
				mutex_enter(&cpu_lock);
				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
					if (best_sqs == NULL) {
						best_sqs = curr_sqs;
						min_sq = curr_sqs->sqs_size;
					} else if (curr_sqs->sqs_size <
					    min_sq) {
						best_sqs = curr_sqs;
						min_sq = curr_sqs->sqs_size;
					}
				}
				mutex_exit(&cpu_lock);
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit for squeues
			 * we can allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		mutex_enter(&cpu_lock);
		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
			/* Too bad, CPU got DR'd out, return NULL */
			mutex_exit(&cpu_lock);
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);
		mutex_exit(&cpu_lock);

		sqp = squeue_create(sqname, sqs->sqs_bind,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		/*
		 * Other functions scanning sqs_list don't take sqs_lock.
		 * Once sqp is stored in sqs_list[], global visibility is
		 * ensured before the sqs_size counter is incremented
		 * (hence the membar_producer() below).
		 */
		sqs->sqs_list[sqs->sqs_size] = sqp;
		membar_producer();
		sqs->sqs_size++;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		if (ip_squeue_bind) {
			mutex_enter(&cpu_lock);
			bind_cpu = cpu_get(sqs->sqs_bind);
			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
				squeue_bind(sqp, -1);
			}
			mutex_exit(&cpu_lock);
		}
		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}
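
/*
 * Assignment flow, in summary: a driver interrupt thread calls
 * ip_squeue_get() below with its ill_rx_ring_t; if the ring has no squeue
 * yet, the actual binding is pushed to the system taskq (ip_squeue_extend()
 * -> ip_find_unused_squeue() above) and the caller temporarily falls back
 * to IP_SQUEUE_GET(lbolt).
 */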

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so at worst we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	/*
	 * Check sqp under the lock again for atomicity. Possible race with
	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
	 * Do the ring to squeue binding only if we are in interrupt context
	 * AND the ring is not already bound AND there is no one else trying
	 * the bind already.
	 */
	sqp = ill_rx_ring->rr_sqp;
	if (sqp != NULL || !interrupt ||
	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
		/*
		 * Note that the ring might get bound once we drop the lock
		 * below, if a previous request is in progress i.e. if the ring
		 * state is ILL_RING_INPROC. The incoming connection on whose
		 * behalf we are currently here might get a suboptimal squeue
		 * via the call to IP_SQUEUE_GET below, but there is no
		 * correctness issue.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		if (sqp != NULL)
			return (sqp);
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of new sqp and binding it to this ring
	 * via taskq. Need to make sure ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tunables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}
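
/*
 * For example (illustrative only), an administrator could drop all worker
 * thread bindings at run time with
 *
 *	ndd -set /dev/ip ip_squeue_bind 0
 *
 * which funnels through ip_squeue_bind_set() above and unbinds every
 * squeue set.
 */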

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}