/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeues for a CPU are
 * grouped in an squeue set, a pointer to which is saved in the
 * cpu_squeue_set field of the cpu structure. Each connection instance
 * (conn_t) is in turn associated with an squeue.
 *
 * For CPUs available at system startup time, squeue creation and association
 * with the CPU happen at MP initialization time. For CPUs added during
 * dynamic reconfiguration, the initialization happens when the new CPU is
 * configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating a connection with squeues. The first
 * mode associates each connection with the CPU that creates the connection
 * (either at open time or at accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each connection
 * is always processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeue sets is kept in sqset_global_list. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *IP_SQUEUE_GET(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an
 *	index into the array of IP squeues available. The way the hint is
 *	computed may affect the effectiveness of the squeue distribution.
 *	Currently squeues are assigned in a round-robin fashion using lbolt
 *	as the hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does
 * two things:
 *
 * o When the CPU is going offline or being unconfigured, the worker thread
 *   is unbound from the CPU. This allows the CPU unconfig code to move it
 *   to another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *   necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *	associated with its squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *	must be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *	otherwise use the squeue assigned to the current CPU
 *	(CPU->cpu_squeue_set).
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time the squeue code waits before waking up the
 *	worker thread after queuing a request.
 */
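/*
 * For illustration only, the selection described above amounts to roughly
 * the following; IP_SQUEUE_GET() and ip_squeue_random() are the real
 * interfaces, while the wrapper name below is just a sketch and not part
 * of IP:
 *
 *	squeue_t *
 *	example_select_squeue(void)
 *	{
 *		return (IP_SQUEUE_GET(lbolt));
 *	}
 *
 * With ip_squeue_fanout set, the hint (lbolt here) picks an squeue out of
 * the global list via ip_squeue_random(); otherwise the squeue of the
 * current CPU's set is used.
 */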
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>

/*
 * We allow multiple NICs to bind to the same CPU, but want to preserve a
 * 1 <-> 1 mapping between squeue and NIC (or Rx ring) for performance
 * reasons, so that each squeue can uniquely own a NIC or an Rx ring and do
 * polling (PSARC 2004/630). We therefore allow up to MAX_SQUEUES_PER_CPU
 * squeues per CPU. We start by creating MIN_SQUEUES_PER_CPU squeues per CPU;
 * more squeues can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time the squeue code waits before waking up the
 *	worker thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
static void ip_squeue_clean(void *, mblk_t *, void *);
static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create an squeue set containing ip_squeues_per_cpu squeues for this CPU
 * and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);

		ASSERT(sqp != NULL);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}
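/*
 * As an illustration, the number of squeues created per CPU at startup can
 * be raised by tuning the ip_squeues_per_cpu variable, for example from
 * /etc/system (assuming the usual set module:variable syntax and that this
 * file is built into the ip module):
 *
 *	set ip:ip_squeues_per_cpu = 2
 *
 * ip_squeue_init() below clamps the value to the
 * [MIN_SQUEUES_PER_CPU, MAX_SQUEUES_PER_CPU] range.
 */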
/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create an squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get an squeue based on the supplied index.
 * Since the squeue list can only grow, there is no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
static void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring;
	ill_t		*ill;

	ASSERT(sqp != NULL);

	ring = sqp->sq_rx_ring;
	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up the squeue.
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Clean up the ring.
	 */
	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal the ill that the cleanup is done.
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}

/*
 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via the ip_squeue_clean()
 * function, but we need to protect ourselves from two threads trying to
 * clean up at the same time (possible with one port going down for aggr
 * and someone tearing down the entire aggr simultaneously). So we use
 * ill_inuse_ref, protected by ill_lock, to indicate when the cleanup has
 * started (1 ref) and when the cleanup is done (0 ref). When a new ring
 * gets assigned to an squeue, we start by putting 2 refs on ill_inuse_ref.
 */
static void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	conn_t *connp;
	squeue_t *sqp;
	mblk_t *mp;

	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	/*
	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
	 * ip_squeue_soft_ring_affinity() will not go
	 * ahead with assigning rings.
	 */
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
		/* Some operations pending on the ring. Wait */
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
		/*
		 * Someone is already trying to clean this squeue, or
		 * it has already been cleaned.
		 */
		mutex_exit(&ill->ill_lock);
		return;
	}
	sqp = rx_ring->rr_sqp;

	if (sqp == NULL) {
		/*
		 * The rx_ring never had an squeue assigned to it.
		 * We are under ill_lock so we can clean it up right
		 * here since no one else can get to it.
		 */
		rx_ring->rr_blank = NULL;
		rx_ring->rr_handle = NULL;
		rx_ring->rr_sqp = NULL;
		rx_ring->rr_ring_state = ILL_RING_FREE;
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Indicate that it's being cleaned */
	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
	ASSERT(sqp != NULL);
	mutex_exit(&ill->ill_lock);

	/*
	 * Use the preallocated ill_unbind_conn for this purpose.
	 */
	connp = ill->ill_dls_capab->ill_unbind_conn;

	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
		connp->conn_tcp->tcp_closemp_used = B_TRUE;
	} else {
		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
		    (void *)connp, (void *)connp->conn_tcp);
	}

	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
	mp = &connp->conn_tcp->tcp_closemp;
	CONN_INC_REF(connp);
	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

	mutex_enter(&ill->ill_lock);
	while (rx_ring->rr_ring_state != ILL_RING_FREE)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
}

void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;

	/*
	 * No need to clean if polling/soft ring capability isn't set
	 * for this ill.
	 */
	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
		return;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

		ip_squeue_clean_ring(ill, ipr);
	}

	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}
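/*
 * Argument block handed to ip_squeue_extend() and
 * ip_squeue_soft_ring_affinity() through the system taskq. It identifies
 * the ill, the Rx ring being bound (if any) and the CPU that took the
 * interrupt.
 */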
typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to an squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time,
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU, but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}
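/*
 * For reference, a sketch of the locking pattern used by the binding code
 * above and below (ip_find_unused_squeue() acquires sqs_lock internally and
 * returns with the chosen squeue's sq_lock held, which gives the documented
 * ill_lock -> sqs_lock -> sq_lock order):
 *
 *	mutex_enter(&ill->ill_lock);
 *	sqp = ip_find_unused_squeue(sqs, cpu, fanout);
 *	... bind the ring to sqp ...
 *	mutex_exit(&sqp->sq_lock);
 *	mutex_exit(&ill->ill_lock);
 */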
/*
 * Do the Rx ring to soft ring affinity assignment for this ill. For each
 * soft ring, find an unused squeue, preferably on a CPU sharing the core
 * with the CPU taking the interrupts, and bind the soft ring worker thread
 * accordingly. If no suitable squeue exists, dynamically create a new one
 * in the chosen squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a ring to an squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t			*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t		*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t			*bind_cpu;
	int			cpu_id = intr_cpu->cpu_id;
	int			min_cpu_id, max_cpu_id;
	boolean_t		enough_uniq_cpus = B_FALSE;
	boolean_t		enough_cpus = B_FALSE;
	squeue_set_t		*sqs, *last_sqs;
	squeue_t		*sqp = NULL;
	int			i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time,
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fan out the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING
	 * capability). It is still a big performance win if we can fan
	 * out to the threads on the same core that is taking the
	 * interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and CPUs are
	 * numbered sequentially for each core (XXX need something better
	 * than this in the future), find the lowest-numbered and
	 * highest-numbered thread for that core.
	 *
	 * If we have one more thread per core than the number of soft
	 * rings, then don't assign any worker threads to the H/W thread
	 * (cpu) taking interrupts (capability negotiation tries to
	 * ensure this).
	 *
	 * If the number of threads per core is the same as the number of
	 * soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fan out to the higher-numbered CPUs starting
	 * from the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	/*
	 * Quickly check if there are enough CPUs present for fanout,
	 * and also that max_cpu_id does not go beyond the last CPU for
	 * which an squeue set exists. We use the cpu_id stored in the
	 * last squeue_set to get an idea. The scheme is by no means
	 * perfect since it doesn't take into account CPU DR operations
	 * and the fact that the interrupt binding itself might change.
	 * An ideal scenario would be to ensure that interrupt CPUs run
	 * by themselves and worker threads never have affinity to those
	 * CPUs. If an interrupt moves to a CPU that has a worker thread,
	 * the affinity should be changed. Probably callbacks similar to
	 * CPU offline are needed to make it work perfectly.
	 */
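	/*
	 * Worked example for the checks below (assuming
	 * ip_threads_per_cpu == 4, two negotiated soft rings and squeue
	 * sets existing at least up to CPU id 8): an interrupt on CPU 5
	 * gives min_cpu_id = (5 / 4) * 4 = 4 and max_cpu_id = 8, so the
	 * candidate CPUs are 4..7. Four candidates for two soft rings
	 * sets enough_uniq_cpus, and the rings end up on CPUs 4 and 6,
	 * skipping the interrupted CPU 5.
	 */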
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check that the CPU actually exists and is active. If
		 * not, use the interrupted CPU; ip_find_unused_squeue()
		 * will find the right CPU to fan out to anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU, but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. System "
			    "performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;
		}
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to the soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_FANOUT);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}
/* ARGSUSED */
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, struct mac_header_info_s *mhip)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	ASSERT(servicing_interrupt());

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL) {
			mutex_exit(&ill->ill_lock);
			goto out;
		}

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = NULL;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
		 * interrupt does not schedule another task to call
		 * ip_squeue_soft_ring_affinity().
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill since taskq dispatch failed */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear the ILL_SOFT_RING_ASSIGN flag so that affinity assignment
	 * can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, NULL, mp_chain, mhip);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing NICs, try to find an squeue on an
	 * unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
		 */
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		if (i != sqs->sqs_size) {
			best_sqs = sqset_global_list[sqset_global_size - 1];
			min_sq = best_sqs->sqs_size;

			for (i = sqset_global_size - 2; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				if (curr_sqs->sqs_size < min_sq) {
					best_sqs = curr_sqs;
					min_sq = curr_sqs->sqs_size;
				}
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
			bind_cpu = cpu[sqs->sqs_bind];
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit of squeues we can
			 * allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);

		sqp = squeue_create(sqname, bind_cpu->cpu_id,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		mutex_enter(&cpu_lock);
		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
			squeue_bind(sqp, -1);
		}
		mutex_exit(&cpu_lock);

		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}
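/*
 * Illustrative caller-side sketch of the interface below (the proc and
 * connp arguments are placeholders; only ip_squeue_get() and
 * squeue_enter() are real interfaces): the receive path asks for the
 * ring's squeue and queues the packet on it. The first call from
 * interrupt context triggers the taskq-based ring binding.
 *
 *	sqp = ip_squeue_get(ill_rx_ring);
 *	squeue_enter(sqp, mp, proc, connp, NULL);
 */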
/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by an squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land, and
 * hence we need to wait until this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so at worst we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	/*
	 * Check sqp under the lock again for atomicity. Possible race with
	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
	 * Do the ring to squeue binding only if we are in interrupt context
	 * AND the ring is not already bound AND there is no one else trying
	 * the bind already.
	 */
	sqp = ill_rx_ring->rr_sqp;
	if (sqp != NULL || !interrupt ||
	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
	    taskq_arg == NULL) {
		/*
		 * Note that the ring might get bound once we drop the lock
		 * below, if a previous request is in progress, i.e. if the
		 * ring state is ILL_RING_INPROC. The incoming connection on
		 * whose behalf we are currently here might get a suboptimal
		 * squeue via the call to IP_SQUEUE_GET below, but there is
		 * no correctness issue.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		if (sqp != NULL)
			return (sqp);
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. We can't really do the binding in interrupt
	 * context, so return a default squeue for this connection and
	 * trigger the creation of a new sqp and its binding to this ring
	 * via the taskq. Need to make sure the ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * Either the ill is closing and we could not get a reference on it,
	 * or taskq_dispatch failed, probably due to a memory allocation
	 * failure. We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting the ip_squeue_xxx tunables.
 */
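/*
 * For example, squeue worker binding can be turned off from user land with
 * something like the following (illustrative; the current value can be read
 * back with ndd -get, and ip_squeue_profile is handled the same way):
 *
 *	ndd -set /dev/ip ip_squeue_bind 0
 */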
/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not
		 * bind it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}