1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation - Send code. 28 * 29 * Implements the Distributor function. 30 */ 31 32 #include <sys/conf.h> 33 #include <sys/modctl.h> 34 #include <sys/sunddi.h> 35 #include <sys/vlan.h> 36 #include <sys/strsun.h> 37 #include <sys/strsubr.h> 38 39 #include <inet/common.h> 40 #include <inet/led.h> 41 #include <inet/ip.h> 42 #include <inet/ip6.h> 43 #include <inet/tcp.h> 44 #include <netinet/udp.h> 45 #include <inet/ipsec_impl.h> 46 #include <inet/sadb.h> 47 #include <inet/ipsecesp.h> 48 #include <inet/ipsecah.h> 49 50 #include <sys/aggr.h> 51 #include <sys/aggr_impl.h> 52 53 #define HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 54 #define HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 55 56 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *); 57 58 static uint64_t 59 aggr_send_hash(aggr_grp_t *grp, mblk_t *mp) 60 { 61 struct ether_header *ehp; 62 uint16_t sap; 63 uint_t skip_len; 64 uint8_t proto; 65 uint32_t policy = grp->lg_tx_policy; 66 uint64_t hash = 0; 67 68 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 69 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 70 ASSERT(RW_READ_HELD(&grp->lg_tx_lock)); 71 72 /* compute MAC hash */ 73 74 ehp = (struct ether_header *)mp->b_rptr; 75 76 if (policy & AGGR_POLICY_L2) { 77 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 78 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 79 hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst); 80 policy &= ~AGGR_POLICY_L2; 81 } 82 83 if (policy == 0) 84 goto done; 85 86 /* skip ethernet header */ 87 88 if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) { 89 struct ether_vlan_header *evhp; 90 mblk_t *newmp = NULL; 91 92 skip_len = sizeof (struct ether_vlan_header); 93 if (MBLKL(mp) < skip_len) { 94 /* the vlan tag is the payload, pull up first */ 95 newmp = msgpullup(mp, -1); 96 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { 97 goto done; 98 } 99 evhp = (struct ether_vlan_header *)newmp->b_rptr; 100 } else { 101 evhp = (struct ether_vlan_header *)mp->b_rptr; 102 } 103 104 sap = ntohs(evhp->ether_type); 105 freemsg(newmp); 106 } else { 107 sap = ntohs(ehp->ether_type); 108 skip_len = sizeof (struct ether_header); 109 } 110 111 /* if ethernet header is in its own mblk, skip it */ 112 if (MBLKL(mp) <= skip_len) { 113 skip_len -= MBLKL(mp); 114 mp = mp->b_cont; 115 } 116 117 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 118 119 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 120 121 switch (sap) { 122 case ETHERTYPE_IP: { 123 ipha_t *iphp; 124 125 if (MBLKL(mp) < (skip_len + sizeof (ipha_t))) 126 goto done; 127 128 iphp = (ipha_t *)(mp->b_rptr + skip_len); 129 proto = iphp->ipha_protocol; 130 skip_len += IPH_HDR_LENGTH(iphp); 131 132 if (policy & AGGR_POLICY_L3) { 133 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 134 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 135 136 hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst)); 137 policy &= ~AGGR_POLICY_L3; 138 } 139 break; 140 } 141 case ETHERTYPE_IPV6: { 142 ip6_t *ip6hp; 143 144 /* 145 * if ipv6 packet has options, the proto will not be one of the 146 * ones handled by the ULP processor below, and will return 0 147 * as the index 148 */ 149 if (MBLKL(mp) < (skip_len + sizeof (ip6_t))) 150 goto done; 151 152 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 153 proto = ip6hp->ip6_nxt; 154 skip_len += aggr_send_ip6_hdr_len(mp, ip6hp); 155 156 if (policy & AGGR_POLICY_L3) { 157 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 158 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 159 160 hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst)); 161 policy &= ~AGGR_POLICY_L3; 162 } 163 break; 164 } 165 default: 166 goto done; 167 } 168 169 if (!(policy & AGGR_POLICY_L4)) 170 goto done; 171 172 /* if ip header is in its own mblk, skip it */ 173 if (MBLKL(mp) <= skip_len) { 174 skip_len -= MBLKL(mp); 175 mp = mp->b_cont; 176 } 177 178 /* parse ULP header */ 179 again: 180 switch (proto) { 181 case IPPROTO_TCP: 182 case IPPROTO_UDP: 183 case IPPROTO_ESP: 184 case IPPROTO_SCTP: 185 /* 186 * These Internet Protocols are intentionally designed 187 * for hashing from the git-go. Port numbers are in the first 188 * word for transports, SPI is first for ESP. 189 */ 190 hash ^= HASH_4BYTES((mp->b_rptr + skip_len)); 191 break; 192 193 case IPPROTO_AH: { 194 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 195 196 uint_t ah_length = AH_TOTAL_LEN(ah); 197 proto = ah->ah_nexthdr; 198 skip_len += ah_length; 199 200 /* if ip header is in its own mblk, skip it */ 201 if (MBLKL(mp) <= skip_len) { 202 skip_len -= MBLKL(mp); 203 mp = mp->b_cont; 204 } 205 206 goto again; 207 } 208 } 209 210 done: 211 return (hash); 212 } 213 214 /* 215 * Update the TX load balancing policy of the specified group. 216 */ 217 void 218 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy) 219 { 220 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 221 222 grp->lg_tx_policy = policy; 223 } 224 225 /* 226 * Send function invoked by the MAC service module. 227 */ 228 mblk_t * 229 aggr_m_tx(void *arg, mblk_t *mp) 230 { 231 aggr_grp_t *grp = arg; 232 aggr_port_t *port; 233 mblk_t *nextp; 234 mac_tx_cookie_t cookie; 235 uint64_t hash; 236 void *mytx_handle; 237 238 for (;;) { 239 rw_enter(&grp->lg_tx_lock, RW_READER); 240 if (grp->lg_ntx_ports == 0) { 241 /* 242 * We could have returned from aggr_m_start() before 243 * the ports were actually attached. Drop the chain. 244 */ 245 rw_exit(&grp->lg_tx_lock); 246 freemsgchain(mp); 247 return (NULL); 248 } 249 250 nextp = mp->b_next; 251 mp->b_next = NULL; 252 253 hash = aggr_send_hash(grp, mp); 254 port = grp->lg_tx_ports[hash % grp->lg_ntx_ports]; 255 256 /* 257 * Bump the active Tx ref count so that the port won't 258 * be deleted. The reference count will be dropped in mac_tx(). 259 */ 260 mytx_handle = mac_tx_hold(port->lp_mch); 261 rw_exit(&grp->lg_tx_lock); 262 263 if (mytx_handle == NULL) { 264 /* 265 * The port is quiesced. 266 */ 267 freemsg(mp); 268 } else { 269 mblk_t *ret_mp; 270 271 /* 272 * It is fine that the port state changes now. 273 * Set MAC_TX_NO_HOLD to inform mac_tx() not to bump 274 * the active Tx ref again. Use hash as the hint so 275 * to direct traffic to different TX rings. Note below 276 * bit operation is needed to get the most benefit 277 * from the mac_tx() hash algorithm. 278 */ 279 hash = (hash << 24 | hash << 16 | hash); 280 hash = (hash << 32 | hash); 281 cookie = mac_tx(port->lp_mch, mp, (uintptr_t)hash, 282 MAC_TX_NO_ENQUEUE | MAC_TX_NO_HOLD, &ret_mp); 283 284 mac_tx_rele(port->lp_mch, mytx_handle); 285 286 if (cookie != NULL) { 287 ret_mp->b_next = nextp; 288 mp = ret_mp; 289 break; 290 } 291 } 292 293 if ((mp = nextp) == NULL) 294 break; 295 } 296 return (mp); 297 } 298 299 /* 300 * Enable sending on the specified port. 301 */ 302 void 303 aggr_send_port_enable(aggr_port_t *port) 304 { 305 aggr_grp_t *grp = port->lp_grp; 306 307 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 308 309 if (port->lp_tx_enabled || (port->lp_state != 310 AGGR_PORT_STATE_ATTACHED)) { 311 /* already enabled or port not yet attached */ 312 return; 313 } 314 315 /* 316 * Add to group's array of tx ports. 317 */ 318 rw_enter(&grp->lg_tx_lock, RW_WRITER); 319 if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) { 320 /* current array too small */ 321 aggr_port_t **new_ports; 322 uint_t new_size; 323 324 new_size = grp->lg_ntx_ports+1; 325 new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *), 326 KM_SLEEP); 327 328 if (grp->lg_tx_ports_size > 0) { 329 ASSERT(grp->lg_tx_ports != NULL); 330 bcopy(grp->lg_tx_ports, new_ports, 331 grp->lg_ntx_ports * sizeof (aggr_port_t *)); 332 kmem_free(grp->lg_tx_ports, 333 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 334 } 335 336 grp->lg_tx_ports = new_ports; 337 grp->lg_tx_ports_size = new_size; 338 } 339 340 grp->lg_tx_ports[grp->lg_ntx_ports++] = port; 341 port->lp_tx_idx = grp->lg_ntx_ports-1; 342 rw_exit(&grp->lg_tx_lock); 343 344 port->lp_tx_enabled = B_TRUE; 345 } 346 347 /* 348 * Disable sending from the specified port. 349 */ 350 void 351 aggr_send_port_disable(aggr_port_t *port) 352 { 353 uint_t idx, ntx; 354 aggr_grp_t *grp = port->lp_grp; 355 356 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 357 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 358 359 if (!port->lp_tx_enabled) { 360 /* not yet enabled */ 361 return; 362 } 363 364 rw_enter(&grp->lg_tx_lock, RW_WRITER); 365 idx = port->lp_tx_idx; 366 ntx = grp->lg_ntx_ports; 367 ASSERT(idx < ntx); 368 369 /* remove from array of attached ports */ 370 if (idx == (ntx - 1)) { 371 grp->lg_tx_ports[idx] = NULL; 372 } else { 373 /* not the last entry, replace with last one */ 374 aggr_port_t *victim; 375 376 victim = grp->lg_tx_ports[ntx - 1]; 377 grp->lg_tx_ports[ntx - 1] = NULL; 378 victim->lp_tx_idx = idx; 379 grp->lg_tx_ports[idx] = victim; 380 } 381 382 port->lp_tx_idx = 0; 383 grp->lg_ntx_ports--; 384 rw_exit(&grp->lg_tx_lock); 385 386 port->lp_tx_enabled = B_FALSE; 387 } 388 389 static uint16_t 390 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h) 391 { 392 uint16_t length; 393 uint_t ehdrlen; 394 uint8_t *nexthdrp; 395 uint8_t *whereptr; 396 uint8_t *endptr; 397 ip6_dest_t *desthdr; 398 ip6_rthdr_t *rthdr; 399 ip6_frag_t *fraghdr; 400 401 length = IPV6_HDR_LEN; 402 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 403 endptr = mp->b_wptr; 404 405 nexthdrp = &ip6h->ip6_nxt; 406 while (whereptr < endptr) { 407 switch (*nexthdrp) { 408 case IPPROTO_HOPOPTS: 409 case IPPROTO_DSTOPTS: 410 /* Assumes the headers are identical for hbh and dst */ 411 desthdr = (ip6_dest_t *)whereptr; 412 ehdrlen = 8 * (desthdr->ip6d_len + 1); 413 nexthdrp = &desthdr->ip6d_nxt; 414 break; 415 case IPPROTO_ROUTING: 416 rthdr = (ip6_rthdr_t *)whereptr; 417 ehdrlen = 8 * (rthdr->ip6r_len + 1); 418 nexthdrp = &rthdr->ip6r_nxt; 419 break; 420 case IPPROTO_FRAGMENT: 421 fraghdr = (ip6_frag_t *)whereptr; 422 ehdrlen = sizeof (ip6_frag_t); 423 nexthdrp = &fraghdr->ip6f_nxt; 424 break; 425 case IPPROTO_NONE: 426 /* No next header means we're finished */ 427 default: 428 return (length); 429 } 430 length += ehdrlen; 431 whereptr += ehdrlen; 432 } 433 434 return (length); 435 } 436