/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, the squeue creation and
 * association with the CPU happens at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new CPU
 * is configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it happens to be
 * scheduled on. The squeue to CPU binding is only relevant for the worker
 * thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 *  o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 *  o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
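 *
 * As a purely illustrative sketch (the actual call is made in
 * ip_squeue_init() below, under cpu_lock), the DR hookup amounts to:
 *
 *	mutex_enter(&cpu_lock);
 *	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
 *	mutex_exit(&cpu_lock);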
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>

/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
static void ip_squeue_clean(void *, mblk_t *, void *);
static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);
		ASSERT(sqp != NULL);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
static void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring = (ill_rx_ring_t *)mp->b_wptr;
	ill_t		*ill;

	ASSERT(sqp != NULL);
	mp->b_wptr = NULL;

	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up squeue
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Cleanup the ring
	 */

	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal ill that cleanup is done
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}

/*
 * Clean up one squeue element.
 * ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via the ip_squeue_clean function,
 * but we need to protect ourselves from 2 threads trying to clean up at the
 * same time (possible with one port going down for aggr and someone tearing
 * down the entire aggr simultaneously). So we use ill_inuse_ref protected by
 * ill_lock to indicate when the cleanup has started (1 ref) and when the
 * cleanup is done (0 ref). When a new ring gets assigned to squeue, we start
 * by putting 2 ref on ill_inuse_ref.
 */
static void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	conn_t *connp;
	squeue_t *sqp;
	mblk_t *mp;

	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	/*
	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
	 * ip_squeue_soft_ring_affinity() will not go
	 * ahead with assigning rings.
	 */
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
		/* Some operations pending on the ring. Wait */
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
		/*
		 * Someone is already trying to clean
		 * this squeue or it's already been cleaned.
		 */
		mutex_exit(&ill->ill_lock);
		return;
	}
	sqp = rx_ring->rr_sqp;

	if (sqp == NULL) {
		/*
		 * The rx_ring never had a squeue assigned to it.
		 * We are under ill_lock so we can clean it up
		 * here itself since no one can get to it.
		 */
		rx_ring->rr_blank = NULL;
		rx_ring->rr_handle = NULL;
		rx_ring->rr_sqp = NULL;
		rx_ring->rr_ring_state = ILL_RING_FREE;
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Indicate that it's being cleaned */
	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
	ASSERT(sqp != NULL);
	mutex_exit(&ill->ill_lock);

	/*
	 * Use the preallocated ill_unbind_conn for this purpose
	 */
	connp = ill->ill_dls_capab->ill_unbind_conn;

	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
		connp->conn_tcp->tcp_closemp_used = B_TRUE;
	} else {
		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
		    (void *)connp, (void *)connp->conn_tcp);
	}

	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
	mp = &connp->conn_tcp->tcp_closemp;
	CONN_INC_REF(connp);

	/*
	 * Since the field sq_rx_ring for the default squeue is NULL,
	 * ip_squeue_clean() will have no way to get the ring if we
	 * don't pass the pointer to it. We use b_wptr to do so
	 * as use of b_wptr for any other purpose is not expected.
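	 *
	 * (On the other side, ip_squeue_clean() above reads the ring back
	 * with (ill_rx_ring_t *)mp->b_wptr and immediately resets b_wptr to
	 * NULL before doing the actual cleanup.)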
	 */

	ASSERT(mp->b_wptr == NULL);
	mp->b_wptr = (unsigned char *)rx_ring;
	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

	mutex_enter(&ill->ill_lock);
	while (rx_ring->rr_ring_state != ILL_RING_FREE)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
}

void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;

	/*
	 * No need to clean if poll_capab isn't set for this ill
	 */
	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
		return;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

		ip_squeue_clean_ring(ill, ipr);
	}

	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}

typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU, but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t			*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t		*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t			*bind_cpu;
	int			cpu_id = intr_cpu->cpu_id;
	int			min_cpu_id, max_cpu_id;
	boolean_t		enough_uniq_cpus = B_FALSE;
	boolean_t		enough_cpus = B_FALSE;
	squeue_set_t		*sqs, *last_sqs;
	squeue_t		*sqp = NULL;
	int			i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fan out the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING
	 * capability). It's still a big performance win if we can fan out
	 * to the threads on the same core that is taking interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and cpus are
	 * numbered sequentially for each core (XXX need something better
	 * than this in future), find the lowest number and highest
	 * number thread for that core.
	 *
	 * If we have one more thread per core than the number of soft
	 * rings, then don't assign any worker threads to the H/W thread
	 * (cpu) taking interrupts (capability negotiation tries to ensure
	 * this).
	 *
	 * If the number of threads per core is the same as the number of
	 * soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fan out to higher numbered CPUs starting from
	 * the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	/*
	 * Quickly check if there are enough CPUs present for fanout
	 * and also that max_cpu_id does not exceed the id of the last
	 * active CPU. We use the cpu_id stored in the last squeue_set to
	 * get an idea. The scheme is by no means perfect since it doesn't
	 * take into account CPU DR operations and the fact that
	 * interrupts themselves might change. An ideal scenario
	 * would be to ensure that interrupts run on cpus by themselves
	 * and worker threads never have affinity to those CPUs. If
	 * the interrupts move to a CPU which had a worker thread, it
	 * should be changed. Probably callbacks similar to CPU offline
	 * are needed to make it work perfectly.
	 */
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check that the CPU actually exists and is active. If not,
		 * use the interrupted CPU. ip_find_unused_squeue() will
		 * find the right CPU to fan out to anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU, but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. System "
			    "performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;

		}
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_FANOUT);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL)
			goto out;

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = NULL;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
		 * interrupt does not schedule another task for calling
		 * ip_squeue_soft_ring_affinity().
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill if taskq dispatch fails */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear ILL_SOFT_RING_ASSIGN again so that the affinity
	 * assignment can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, NULL, mp_chain, mhip);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];
	cpu_t		*bind_cpu;

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing the NICs, try to find squeues on
	 * an unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
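		 * If one is, the search below switches to the squeue set
		 * from sqset_global_list with the fewest squeues whose
		 * CPU is still present, so that rings get spread across
		 * CPUs instead of piling up on one squeue set.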
		 */
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		if (i != sqs->sqs_size) {
			best_sqs = NULL;

			for (i = sqset_global_size - 1; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				/*
				 * Check and make sure the CPU that sqs
				 * is bound to is valid. There could be
				 * sqs's around whose CPUs could have
				 * been DR'd out.
				 */
				mutex_enter(&cpu_lock);
				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
					if (best_sqs == NULL) {
						best_sqs = curr_sqs;
						min_sq = curr_sqs->sqs_size;
					} else if (curr_sqs->sqs_size <
					    min_sq) {
						best_sqs = curr_sqs;
						min_sq = curr_sqs->sqs_size;
					}
				}
				mutex_exit(&cpu_lock);
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit for squeues
			 * we can allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		mutex_enter(&cpu_lock);
		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
			/* Too bad, CPU got DR'd out, return NULL */
			mutex_exit(&cpu_lock);
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);
		mutex_exit(&cpu_lock);

		sqp = squeue_create(sqname, sqs->sqs_bind,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		if (ip_squeue_bind) {
			mutex_enter(&cpu_lock);
			bind_cpu = cpu_get(sqs->sqs_bind);
			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
				squeue_bind(sqp, -1);
			}
			mutex_exit(&cpu_lock);
		}
		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so at worst we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
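	 *
	 * (The slow path below cannot do the ring to squeue binding in
	 * interrupt context; it hands the work off to ip_squeue_extend()
	 * via the system taskq instead.)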
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	/*
	 * Check sqp under the lock again for atomicity. Possible race with
	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
	 * Do the ring to squeue binding only if we are in interrupt context
	 * AND the ring is not already bound AND there is no one else trying
	 * the bind already.
	 */
	sqp = ill_rx_ring->rr_sqp;
	if (sqp != NULL || !interrupt ||
	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
		/*
		 * Note that the ring might get bound once we drop the lock
		 * below, if a previous request is in progress i.e. if the ring
		 * state is ILL_RING_INPROC. The incoming connection on whose
		 * behalf we are currently here might get a suboptimal squeue
		 * via the call to IP_SQUEUE_GET below, but there is no
		 * correctness issue.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		if (sqp != NULL)
			return (sqp);
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of new sqp and binding it to this ring
	 * via taskq. Need to make sure ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}