/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * IP interface to squeues.
 *
 * IP uses squeues to force serialization of packets, both incoming and
 * outgoing. Each squeue is associated with a connection instance (conn_t)
 * above, and a soft ring (if enabled) below. Each CPU will have a default
 * squeue for outbound connections, and each soft ring of an interface will
 * have an squeue to which it sends incoming packets. squeues are never
 * destroyed, and if they become unused they are kept around against future
 * needs.
 *
 * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
 * in the system there will be one squeue set, all of whose squeues will be
 * bound to that CPU, plus one additional set known as the unbound set. Sets
 * associated with CPUs will have one default squeue, for outbound
 * connections, and a linked list of squeues used by various NICs for inbound
 * packets. The unbound set also has a linked list of squeues, but no default
 * squeue.
 *
 * When a CPU goes offline its squeue set is destroyed, and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly bound
 * to this CPU. If no default squeue is found, a new one is created.
 *
 * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
 * and not by the squeue code. squeue.c will not touch them, and we can modify
 * them without holding the squeue lock because of the guarantee that squeues
 * are never destroyed. The sqset_lock must be held, however.
 *
 * All the squeue sets are protected by a single lock, the sqset_lock. This
 * is also used to protect the sq_next and sq_set fields of an squeue_t.
 *
 * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(ill_rx_ring_t)
 *
 * Returns the squeue associated with an ill receive ring. If the ring is
 * not bound to a CPU, and we're currently servicing the interrupt which
 * generated the packet, then bind the squeue to the CPU.
 *
 *
 * DR Notes
 * ========
 *
 * The ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *   unbound from the CPU. This allows the CPU unconfig code to move it to
 *   another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *   necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
 * pick the default squeue from a random CPU, otherwise use our CPU's default
 * squeue.
 *
 * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
 * /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
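/*
 * Illustrative sketch (not compiled code): what the IP_SQUEUE_GET() path
 * described above amounts to under the two fanout modes. This assumes the
 * macro expands roughly as shown (its exact definition lives elsewhere in
 * IP), and the conn_sqp assignment is only an example of how a caller such
 * as TCP might record the result:
 *
 *	squeue_t *sqp;
 *
 *	if (ip_squeue_fanout != 0)
 *		sqp = ip_squeue_random(hint);
 *	else
 *		sqp = CPU->cpu_squeue_set->sqs_default;
 *	connp->conn_sqp = sqp;
 */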
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * List of all created squeue sets. The list and its size are protected by
 * sqset_lock.
 */
static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
static uint_t		sqset_global_size;
kmutex_t		sqset_lock;

static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_t *ip_squeue_create(pri_t);
static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
static void ip_squeue_set_destroy(cpu_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

static squeue_t *
ip_squeue_create(pri_t pri)
{
	squeue_t *sqp;

	sqp = squeue_create(ip_squeue_worker_wait, pri);
	ASSERT(sqp != NULL);
	if (ip_squeue_create_callback != NULL)
		ip_squeue_create_callback(sqp);
	return (sqp);
}

/*
 * Create a new squeue_set. If id == -1, then we're creating the unbound set,
 * which should only happen once when we are first initialized. Otherwise id
 * is the id of the CPU that needs a set, either because we are initializing
 * or because the CPU has come online.
 *
 * If id != -1, then we need at a minimum to provide a default squeue for the
 * new set. We search the unbound set for candidates, and if none are found we
 * create a new one.
 */
static squeue_set_t *
ip_squeue_set_create(processorid_t id)
{
	squeue_set_t	*sqs;
	squeue_set_t	*src = sqset_global_list[0];
	squeue_t	**lastsqp, *sq;
	squeue_t	**defaultq_lastp = NULL;

	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
	sqs->sqs_cpuid = id;

	if (id == -1) {
		ASSERT(sqset_global_size == 0);
		sqset_global_list[0] = sqs;
		sqset_global_size = 1;
		return (sqs);
	}

	/*
	 * When we create an squeue set with id != -1, we need to give it a
	 * default squeue, in order to support fanout of conns across
	 * CPUs. Try to find a former default squeue that matches this
	 * cpu id on the unbound squeue set. If no such squeue is found,
	 * find some non-default TCP squeue and steal it. If still no such
	 * candidate is found, create a new squeue.
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&sqset_lock);

	lastsqp = &src->sqs_head;

	while (*lastsqp) {
		if ((*lastsqp)->sq_bind == id &&
		    (*lastsqp)->sq_state & SQS_DEFAULT) {
			defaultq_lastp = lastsqp;
			break;
		}
		if (defaultq_lastp == NULL &&
		    !((*lastsqp)->sq_state & SQS_DEFAULT)) {
			defaultq_lastp = lastsqp;
		}
		lastsqp = &(*lastsqp)->sq_next;
	}
	if (defaultq_lastp) {
		/* Remove from src set and set SQS_DEFAULT */
		sq = *defaultq_lastp;
		*defaultq_lastp = sq->sq_next;
		sq->sq_next = NULL;
		if (!(sq->sq_state & SQS_DEFAULT)) {
			mutex_enter(&sq->sq_lock);
			sq->sq_state |= SQS_DEFAULT;
			mutex_exit(&sq->sq_lock);
		}
	} else {
		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
		sq->sq_state |= SQS_DEFAULT;
	}

	sq->sq_set = sqs;
	sqs->sqs_default = sq;
	squeue_bind(sq, id); /* this locks squeue mutex */

	ASSERT(sqset_global_size <= NCPU);
	sqset_global_list[sqset_global_size++] = sqs;
	mutex_exit(&sqset_lock);
	return (sqs);
}
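/*
 * Sketch (not compiled code) of the per-CPU layout the function above
 * maintains: once ip_squeue_set_create() has run for an online CPU,
 * getting from that CPU to its default squeue is simply
 *
 *	squeue_set_t *sqs = cp->cpu_squeue_set;
 *	squeue_t *sqp = (sqs == NULL) ? NULL : sqs->sqs_default;
 *
 * where cp is a cpu_t; cpu_squeue_set stays NULL until the set has been
 * created for that CPU, and all manipulation of the sets themselves is
 * done under sqset_lock.
 */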
/*
 * Called by ip_squeue_add_ring() to find an squeue to associate with a new
 * ring.
 */
squeue_t *
ip_squeue_getfree(pri_t pri)
{
	squeue_set_t	*sqs = sqset_global_list[0];
	squeue_t	*sq;

	mutex_enter(&sqset_lock);
	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
		/*
		 * Select a non-default squeue
		 */
		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
			break;
	}

	if (sq == NULL) {
		sq = ip_squeue_create(pri);
		sq->sq_set = sqs;
		sq->sq_next = sqs->sqs_head;
		sqs->sqs_head = sq;
	}

	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	mutex_enter(&sq->sq_lock);
	sq->sq_state |= SQS_ILL_BOUND;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&sqset_lock);

	if (sq->sq_priority != pri) {
		thread_lock(sq->sq_worker);
		(void) thread_change_pri(sq->sq_worker, pri, 0);
		thread_unlock(sq->sq_worker);

		thread_lock(sq->sq_poll_thr);
		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
		thread_unlock(sq->sq_poll_thr);

		sq->sq_priority = pri;
	}
	return (sq);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;
	squeue_set_t	*sqs;

	ASSERT(sqset_global_list == NULL);

	ip_squeue_create_callback = callback;
	squeue_init();
	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
	sqset_global_size = 0;
	/*
	 * We are called at system boot time and we don't
	 * expect memory allocation failure.
	 */
	sqs = ip_squeue_set_create(-1);
	ASSERT(sqs != NULL);

	mutex_enter(&cpu_lock);
	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			/*
			 * We are called at system boot time and we don't
			 * expect memory allocation failure then
			 */
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
			ASSERT(cp->cpu_squeue_set != NULL);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

/*
 * Get a default squeue, either from the current CPU or a CPU derived by hash
 * from the index argument, depending upon the setting of ip_squeue_fanout.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs = NULL;
	squeue_t *sq;

	/*
	 * The minimum value of sqset_global_size is 2, one for the unbound
	 * squeue set and another for the squeue set of the zeroth CPU.
	 * Even though the value could be changing, it can never go below 2,
	 * so the assert does not need the lock protection.
	 */
	ASSERT(sqset_global_size > 1);

	/* Protect against changes to sqset_global_list */
	mutex_enter(&sqset_lock);

	if (!ip_squeue_fanout)
		sqs = CPU->cpu_squeue_set;

	/*
	 * sqset_global_list[0] corresponds to the unbound squeue set.
	 * The computation below picks a set other than the unbound set.
	 */
	if (sqs == NULL)
		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
	sq = sqs->sqs_default;

	mutex_exit(&sqset_lock);
	ASSERT(sq);
	return (sq);
}
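/*
 * Worked example for the fanout computation above (the CPU count is only
 * illustrative): with 4 online CPUs, sqset_global_size is 5 (the unbound
 * set plus one set per CPU). An index of 7 selects list entry
 * (7 % 4) + 1 == 4, i.e. the default squeue of the set at
 * sqset_global_list[4]. Index values therefore spread connections over
 * entries 1 .. sqset_global_size - 1 and never land on the unbound set in
 * entry 0.
 */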
/*
 * Move squeue from its current set to newset. Not used for default squeues.
 * Bind or unbind the worker thread as appropriate.
 */
static void
ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
{
	squeue_set_t	*set;
	squeue_t	**lastsqp;
	processorid_t	cpuid = newset->sqs_cpuid;

	ASSERT(!(sq->sq_state & SQS_DEFAULT));
	ASSERT(!MUTEX_HELD(&sq->sq_lock));
	ASSERT(MUTEX_HELD(&sqset_lock));

	set = sq->sq_set;

	if (set == newset)
		return;

	lastsqp = &set->sqs_head;
	while (*lastsqp != sq)
		lastsqp = &(*lastsqp)->sq_next;

	*lastsqp = sq->sq_next;
	sq->sq_next = newset->sqs_head;
	newset->sqs_head = sq;
	sq->sq_set = newset;
	if (cpuid == -1)
		squeue_unbind(sq);
	else
		squeue_bind(sq, cpuid);
}

/*
 * Move squeue from its current set to cpuid's set and bind to cpuid.
 */
int
ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
{
	cpu_t *cpu;
	squeue_set_t *set;

	if (sq->sq_state & SQS_DEFAULT)
		return (-1);

	ASSERT(MUTEX_HELD(&cpu_lock));

	cpu = cpu_get(cpuid);
	if (!CPU_ISON(cpu))
		return (-1);

	mutex_enter(&sqset_lock);
	set = cpu->cpu_squeue_set;
	if (set != NULL)
		ip_squeue_set_move(sq, set);
	mutex_exit(&sqset_lock);
	return ((set == NULL) ? -1 : 0);
}

/*
 * The mac layer is calling, asking us to move an squeue to a
 * new CPU. This routine is called with cpu_lock held.
 */
void
ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
{
	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring->rr_ill == ill);

	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE ||
	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;

	mutex_exit(&ill->ill_lock);
}

void *
ip_squeue_add_ring(ill_t *ill, void *mrp)
{
	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
	ill_rx_ring_t		*rx_ring, *ring_tbl;
	int			ip_rx_index;
	squeue_t		*sq = NULL;
	pri_t			pri;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
	ASSERT(ill->ill_dld_capab != NULL);

	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;

	mutex_enter(&ill->ill_lock);
	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
		rx_ring = &ring_tbl[ip_rx_index];
		if (rx_ring->rr_ring_state == RR_FREE)
			break;
	}

	if (ip_rx_index == ILL_MAX_RINGS) {
		/*
		 * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
		 * we have devices which can overwhelm this limit,
		 * ILL_MAX_RINGS should be made configurable. Meanwhile it
		 * causes no panic because the driver will pass ip_input a
		 * NULL handle, which will make IP allocate the default squeue
		 * and polling mode will not be used for this ring.
		 */
		cmn_err(CE_NOTE,
		    "Reached maximum number of receiving rings (%d) for %s\n",
		    ILL_MAX_RINGS, ill->ill_name);
		mutex_exit(&ill->ill_lock);
		return (NULL);
	}

	bzero(rx_ring, sizeof (ill_rx_ring_t));
	rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
	/* XXX: Hard code it to tcp accept for now */
	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;

	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
	rx_ring->rr_intr_disable =
	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
	rx_ring->rr_ill = ill;

	pri = mrfp->mrf_flow_priority;

	sq = ip_squeue_getfree(pri);

	mutex_enter(&sq->sq_lock);
	sq->sq_rx_ring = rx_ring;
	rx_ring->rr_sqp = sq;

	sq->sq_state |= SQS_POLL_CAPAB;

	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
	sq->sq_ill = ill;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&ill->ill_lock);

	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill,
	    int, ip_rx_index, void *, mrfp->mrf_rx_arg);

	/* Assign the squeue to the specified CPU as well */
	mutex_enter(&cpu_lock);
	(void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
	mutex_exit(&cpu_lock);

	return (rx_ring);
}
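/*
 * Rough call sequence the ring-management routines above take part in
 * (illustrative; the entry point from the mac/dld layer is described by
 * role rather than by its exact name):
 *
 *	mac/dld announces a new rx ring
 *	    -> ip_squeue_add_ring(ill, mrp)
 *		-> ip_squeue_getfree(pri)	pick or create an squeue
 *		-> ip_squeue_bind_ring(...)	bind it to mrf_cpu_id
 *		    -> ip_squeue_cpu_move(rr_sqp, cpuid)
 *			-> ip_squeue_set_move(sq, cpu's squeue set)
 */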
/*
 * Sanitize the squeue etc. Some of the processing
 * needs to be done from inside the perimeter.
 */
void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	rx_ring->rr_ring_state = RR_FREE_INPROG;
	sqp = rx_ring->rr_sqp;

	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_CLEANUP;
	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&ill->ill_lock);
	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND);

	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&sqp->sq_lock);

	/*
	 * Logically free the squeue. It goes back to the set of unused
	 * squeues
	 */
	mutex_enter(&sqset_lock);
	ip_squeue_set_move(sqp, sqset_global_list[0]);
	mutex_exit(&sqset_lock);

	mutex_enter(&ill->ill_lock);
	rx_ring->rr_ring_state = RR_FREE;
	mutex_exit(&ill->ill_lock);
}

/*
 * Stop the squeue from polling. This needs to be done
 * from inside the perimeter.
 */
void
ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_QUIESCE;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);

	mutex_exit(&sqp->sq_lock);
}

/*
 * Restart polling etc. Needs to be inside the perimeter to
 * prevent races.
 */
void
ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	/*
	 * Handle change in number of rings between the quiesce and
	 * restart operations by checking for a previous quiesce before
	 * attempting a restart.
	 */
	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
		mutex_exit(&sqp->sq_lock);
		return;
	}
	sqp->sq_state |= SQS_POLL_RESTART;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
	mutex_exit(&sqp->sq_lock);
}
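/*
 * Expected pairing of the control operations above, as seen by a caller
 * holding the mac perimeter (illustrative; the surrounding datapath
 * reconfiguration steps are omitted):
 *
 *	ip_squeue_quiesce_ring(ill, rx_ring);
 *	... reconfigure the ring / datapath ...
 *	ip_squeue_restart_ring(ill, rx_ring);
 *
 * or, when the ring is going away for good:
 *
 *	ip_squeue_clean_ring(ill, rx_ring);
 *
 * Each call blocks on sq_ctrlop_done_cv until the squeue worker
 * acknowledges the requested state change.
 */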
/*
 * Sanitize all squeues associated with the ill.
 */
void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;
	ill_rx_ring_t	*rx_ring;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
		ip_squeue_clean_ring(ill, rx_ring);
	}
}

/*
 * Used by IP to get the squeue associated with a ring. If the squeue isn't
 * yet bound to a CPU, and we're being called directly from the NIC's
 * interrupt, then we know what CPU we want to assign the squeue to, so
 * dispatch that task to a taskq.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;

	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
		return (IP_SQUEUE_GET(lbolt));

	return (sqp);
}

/*
 * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
 * its squeues are unbound and moved to the unbound set.
 */
static void
ip_squeue_set_destroy(cpu_t *cpu)
{
	int i;
	squeue_t *sqp, *lastsqp = NULL;
	squeue_set_t *sqs, *unbound = sqset_global_list[0];

	mutex_enter(&sqset_lock);
	if ((sqs = cpu->cpu_squeue_set) == NULL) {
		mutex_exit(&sqset_lock);
		return;
	}

	/* Move all squeues to unbound set */
	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
		squeue_unbind(sqp);
		sqp->sq_set = unbound;
	}
	if (sqs->sqs_head) {
		lastsqp->sq_next = unbound->sqs_head;
		unbound->sqs_head = sqs->sqs_head;
	}

	/* Also move default squeue to unbound set */
	sqp = sqs->sqs_default;
	ASSERT(sqp);
	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);

	sqp->sq_next = unbound->sqs_head;
	unbound->sqs_head = sqp;
	squeue_unbind(sqp);
	sqp->sq_set = unbound;

	for (i = 1; i < sqset_global_size; i++)
		if (sqset_global_list[i] == sqs)
			break;

	ASSERT(i < sqset_global_size);
	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
	sqset_global_list[sqset_global_size - 1] = NULL;
	sqset_global_size--;

	mutex_exit(&sqset_lock);
	kmem_free(sqs, sizeof (*sqs));
}

/*
 * Reconfiguration callback
 */
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu_get(id);

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));
		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_destroy(cp);
			cp->cpu_squeue_set = NULL;
		}
		break;
	default:
		break;
	}
	return (0);
}
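/*
 * Rough lifecycle of a CPU's squeue set under DR, as driven by the
 * callback above (an illustrative summary of the code in this file, not
 * an additional interface):
 *
 *	CPU_INIT / CPU_CONFIG / CPU_ON / CPU_CPUPART_IN
 *	    -> ip_squeue_set_create(cpu_id)
 *		reuse a former default squeue from the unbound set if one
 *		is still bound to this cpu id, otherwise steal or create one
 *
 *	CPU_OFF / CPU_UNCONFIG / CPU_CPUPART_OUT
 *	    -> ip_squeue_set_destroy(cp)
 *		unbind every squeue in the set, push them onto the unbound
 *		set, and free the squeue_set_t
 */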