1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation - Send code. 28 * 29 * Implements the Distributor function. 30 */ 31 32 #include <sys/conf.h> 33 #include <sys/modctl.h> 34 #include <sys/sunddi.h> 35 #include <sys/callb.h> 36 #include <sys/vlan.h> 37 #include <sys/strsun.h> 38 #include <sys/strsubr.h> 39 #include <sys/dlpi.h> 40 41 #include <inet/common.h> 42 #include <inet/led.h> 43 #include <inet/ip.h> 44 #include <inet/ip6.h> 45 #include <inet/tcp.h> 46 #include <netinet/udp.h> 47 48 #include <sys/aggr.h> 49 #include <sys/aggr_impl.h> 50 51 /* 52 * Update the TX load balancing policy of the specified group. 53 */ 54 void 55 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy) 56 { 57 uint8_t mac_policy = 0; 58 59 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 60 61 if ((policy & AGGR_POLICY_L2) != 0) 62 mac_policy |= MAC_PKT_HASH_L2; 63 if ((policy & AGGR_POLICY_L3) != 0) 64 mac_policy |= MAC_PKT_HASH_L3; 65 if ((policy & AGGR_POLICY_L4) != 0) 66 mac_policy |= MAC_PKT_HASH_L4; 67 68 grp->lg_tx_policy = policy; 69 grp->lg_mac_tx_policy = mac_policy; 70 } 71 72 #define HASH_HINT(hint) \ 73 ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8)) 74 75 /* 76 * Function invoked by mac layer to find a specific TX ring on a port 77 * to send data. 78 */ 79 mblk_t * 80 aggr_find_tx_ring(void *arg, mblk_t *mp, uintptr_t hint, mac_ring_handle_t *rh) 81 { 82 aggr_grp_t *grp = arg; 83 aggr_port_t *port; 84 uint64_t hash; 85 86 rw_enter(&grp->lg_tx_lock, RW_READER); 87 if (grp->lg_ntx_ports == 0) { 88 /* 89 * We could have returned from aggr_m_start() before 90 * the ports were actually attached. Drop the chain. 91 */ 92 rw_exit(&grp->lg_tx_lock); 93 freemsgchain(mp); 94 return (NULL); 95 } 96 hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, B_TRUE); 97 port = grp->lg_tx_ports[hash % grp->lg_ntx_ports]; 98 99 /* 100 * Use hash as the hint so to direct traffic to 101 * different TX rings. Note below bit operation 102 * is needed in case hint is 0 to get the most 103 * benefit from HASH_HINT() algorithm. 104 */ 105 if (port->lp_tx_ring_cnt > 1) { 106 if (hint == 0) { 107 hash = (hash << 24 | hash << 16 | hash); 108 hash = (hash << 32 | hash); 109 } else { 110 hash = hint; 111 } 112 hash = HASH_HINT(hash); 113 *rh = port->lp_pseudo_tx_rings[hash % port->lp_tx_ring_cnt]; 114 } else { 115 *rh = port->lp_pseudo_tx_rings[0]; 116 } 117 rw_exit(&grp->lg_tx_lock); 118 119 return (mp); 120 } 121 122 /* 123 * aggr_tx_notify_thread: 124 * 125 * aggr_tx_ring_update() callback function wakes up this thread when 126 * it gets called. This thread will call mac_tx_ring_update() to 127 * notify upper mac of flow control getting relieved. Note that 128 * aggr_tx_ring_update() cannot call mac_tx_ring_update() directly 129 * because aggr_tx_ring_update() is called from lower mac with 130 * mi_rw_lock held. 131 */ 132 void 133 aggr_tx_notify_thread(void *arg) 134 { 135 callb_cpr_t cprinfo; 136 aggr_grp_t *grp = (aggr_grp_t *)arg; 137 mac_ring_handle_t pseudo_mrh; 138 139 CALLB_CPR_INIT(&cprinfo, &grp->lg_tx_flowctl_lock, callb_generic_cpr, 140 "aggr_tx_notify_thread"); 141 142 mutex_enter(&grp->lg_tx_flowctl_lock); 143 while (!grp->lg_tx_notify_done) { 144 if ((grp->lg_tx_blocked_cnt) == 0) { 145 CALLB_CPR_SAFE_BEGIN(&cprinfo); 146 cv_wait(&grp->lg_tx_flowctl_cv, 147 &grp->lg_tx_flowctl_lock); 148 CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_tx_flowctl_lock); 149 continue; 150 } 151 while (grp->lg_tx_blocked_cnt != 0) { 152 grp->lg_tx_blocked_cnt--; 153 pseudo_mrh = 154 grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt]; 155 mutex_exit(&grp->lg_tx_flowctl_lock); 156 mac_tx_ring_update(grp->lg_mh, pseudo_mrh); 157 mutex_enter(&grp->lg_tx_flowctl_lock); 158 } 159 } 160 /* 161 * The grp is being destroyed, exit the thread. 162 */ 163 grp->lg_tx_notify_thread = NULL; 164 CALLB_CPR_EXIT(&cprinfo); 165 thread_exit(); 166 } 167 168 /* 169 * Callback function registered with lower mac to receive wakeups from 170 * drivers when flow control is relieved (i.e. Tx descriptors are 171 * available). 172 */ 173 void 174 aggr_tx_ring_update(void *arg1, uintptr_t arg2) 175 { 176 aggr_port_t *port = (aggr_port_t *)arg1; 177 mac_ring_handle_t mrh = (mac_ring_handle_t)arg2; 178 mac_ring_handle_t pseudo_mrh; 179 aggr_grp_t *grp = port->lp_grp; 180 int i = 0; 181 182 if (mrh == NULL) { 183 /* 184 * If the underlying NIC does not expose TX rings, 185 * still as pseudo TX ring is presented to the 186 * aggr mac. 187 */ 188 pseudo_mrh = port->lp_pseudo_tx_rings[0]; 189 } else { 190 for (i = 0; i < port->lp_tx_ring_cnt; i++) { 191 if (port->lp_tx_rings[i] == mrh) 192 break; 193 } 194 ASSERT(i < port->lp_tx_ring_cnt); 195 pseudo_mrh = port->lp_pseudo_tx_rings[i]; 196 } 197 mutex_enter(&grp->lg_tx_flowctl_lock); 198 /* 199 * It could be possible that some (broken?) device driver 200 * could send more than one wakeup on the same ring. In 201 * such a case, multiple instances of the same pseudo TX 202 * ring should not be saved in lg_tx_blocked_rings[] 203 * array. So first check if woken up ring (pseudo_mrh) is 204 * already in the lg_tx_blocked_rings[] array. 205 */ 206 for (i = 0; i < grp->lg_tx_blocked_cnt; i++) { 207 if (grp->lg_tx_blocked_rings[i] == pseudo_mrh) { 208 mutex_exit(&grp->lg_tx_flowctl_lock); 209 return; 210 } 211 } 212 /* A distinct mac_ring_handle. Save and increment count */ 213 grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt] = pseudo_mrh; 214 grp->lg_tx_blocked_cnt++; 215 cv_signal(&grp->lg_tx_flowctl_cv); 216 mutex_exit(&grp->lg_tx_flowctl_lock); 217 } 218 219 /* 220 * Send function invoked by the MAC service module. 221 */ 222 mblk_t * 223 aggr_ring_tx(void *arg, mblk_t *mp) 224 { 225 aggr_pseudo_tx_ring_t *pseudo_ring = (aggr_pseudo_tx_ring_t *)arg; 226 aggr_port_t *port = pseudo_ring->atr_port; 227 228 return (mac_hwring_send_priv(port->lp_mch, pseudo_ring->atr_hw_rh, mp)); 229 } 230 231 /* 232 * Enable sending on the specified port. 233 */ 234 void 235 aggr_send_port_enable(aggr_port_t *port) 236 { 237 aggr_grp_t *grp = port->lp_grp; 238 239 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 240 241 if (port->lp_tx_enabled || (port->lp_state != 242 AGGR_PORT_STATE_ATTACHED)) { 243 /* already enabled or port not yet attached */ 244 return; 245 } 246 247 /* 248 * Add to group's array of tx ports. 249 */ 250 rw_enter(&grp->lg_tx_lock, RW_WRITER); 251 if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) { 252 /* current array too small */ 253 aggr_port_t **new_ports; 254 uint_t new_size; 255 256 new_size = grp->lg_ntx_ports+1; 257 new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *), 258 KM_SLEEP); 259 260 if (grp->lg_tx_ports_size > 0) { 261 ASSERT(grp->lg_tx_ports != NULL); 262 bcopy(grp->lg_tx_ports, new_ports, 263 grp->lg_ntx_ports * sizeof (aggr_port_t *)); 264 kmem_free(grp->lg_tx_ports, 265 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 266 } 267 268 grp->lg_tx_ports = new_ports; 269 grp->lg_tx_ports_size = new_size; 270 } 271 272 grp->lg_tx_ports[grp->lg_ntx_ports++] = port; 273 port->lp_tx_idx = grp->lg_ntx_ports-1; 274 rw_exit(&grp->lg_tx_lock); 275 276 port->lp_tx_enabled = B_TRUE; 277 } 278 279 /* 280 * Disable sending from the specified port. 281 */ 282 void 283 aggr_send_port_disable(aggr_port_t *port) 284 { 285 uint_t idx, ntx; 286 aggr_grp_t *grp = port->lp_grp; 287 288 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 289 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 290 291 if (!port->lp_tx_enabled) { 292 /* not yet enabled */ 293 return; 294 } 295 296 rw_enter(&grp->lg_tx_lock, RW_WRITER); 297 idx = port->lp_tx_idx; 298 ntx = grp->lg_ntx_ports; 299 ASSERT(idx < ntx); 300 301 /* remove from array of attached ports */ 302 if (idx == (ntx - 1)) { 303 grp->lg_tx_ports[idx] = NULL; 304 } else { 305 /* not the last entry, replace with last one */ 306 aggr_port_t *victim; 307 308 victim = grp->lg_tx_ports[ntx - 1]; 309 grp->lg_tx_ports[ntx - 1] = NULL; 310 victim->lp_tx_idx = idx; 311 grp->lg_tx_ports[idx] = victim; 312 } 313 314 port->lp_tx_idx = 0; 315 grp->lg_ntx_ports--; 316 rw_exit(&grp->lg_tx_lock); 317 318 port->lp_tx_enabled = B_FALSE; 319 } 320