/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _INET_IP_IMPL_H
#define	_INET_IP_IMPL_H

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * IP implementation private declarations. These interfaces are
 * used to build the IP module and are not meant to be accessed
 * by any modules except IP itself. They are undocumented and are
 * subject to change without notice.
 *
 * General note on the macros below: they are function-like macros, not
 * functions, so most of them evaluate their arguments more than once.
 * Pass only side-effect-free expressions. The multi-statement macros
 * expand to a brace-enclosed block.
 */

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _KERNEL

#define	IP_MOD_ID		5701

/*
 * Checksum constants whose values depend on host byte order. On
 * little-endian hosts the protocol numbers are pre-shifted by 8 bits.
 */
#ifdef _BIG_ENDIAN
#define	IP_HDR_CSUM_TTL_ADJUST	256
#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
#else
#define	IP_HDR_CSUM_TTL_ADJUST	1
#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
#endif

/*
 * Byte offset and size of the checksum field within the TCP and UDP
 * headers.
 */
#define	TCP_CHECKSUM_OFFSET	16
#define	TCP_CHECKSUM_SIZE	2

#define	UDP_CHECKSUM_OFFSET	6
#define	UDP_CHECKSUM_SIZE	2

/*
 * Given a pointer to the IP header and the IP header length in bytes,
 * yield a (uint16_t *) to the TCP (resp. UDP) checksum field that
 * follows it.
 */
#define	IPH_TCPH_CHECKSUMP(ipha, hlen) \
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))

#define	IPH_UDPH_CHECKSUMP(ipha, hlen) \
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))

/*
 * True when the ILL has the ILL_CAPAB_HCKSUM capability bit set.
 */
#define	ILL_HCKSUM_CAPABLE(ill) \
	(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)

/*
 * Macro that performs software checksum calculation on the IP header.
 * `sum' is both an input (the partial sum accumulated by the caller) and
 * a scratch lvalue; the folded, complemented result is stored into
 * ipha_hdr_checksum.
 */
#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \
	(sum) += (ttl_protocol) + (ipha)->ipha_ident + \
	    ((v_hlen_tos_len) >> 16) + \
	    ((v_hlen_tos_len) & 0xFFFF) + \
	    (ipha)->ipha_fragment_offset_and_flags; \
	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \
	(sum) = ~((sum) + ((sum) >> 16)); \
	(ipha)->ipha_hdr_checksum = (uint16_t)(sum); \
}

/*
 * True when the IP header checksum of `mp' has been left to the hardware:
 * no IPsec, the dblk carries HCK_IPV4_HDRCKSUM, the ILL is checksum
 * capable and offload is globally enabled.
 *
 * NOTE: this macro references `dohwcksum', which is not a parameter; it
 * must be visible at the point of expansion.
 */
#define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \
	(!(ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \
	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)

/*
 * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
 * several checks on the IRE and ILL (among other things) in order to see
 * whether or not hardware checksum offload is allowed for the outgoing
 * packet. It assumes that the caller has held a reference to the IRE.
 *
 * NOTE: this macro references `dohwcksum', which is not a parameter; it
 * must be visible at the point of expansion.
 */
#define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \
	max_frag, ipsec_len, pseudo) { \
	uint32_t _hck_flags; \
	/* \
	 * We offload checksum calculation to hardware when IPsec isn't \
	 * present and if fragmentation isn't required. We also check \
	 * if M_DATA fastpath is safe to be used on the corresponding \
	 * IRE; this check is performed without grabbing ire_lock but \
	 * instead by holding a reference to it. This is sufficient \
	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \
	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the \
	 * driver and trigger the IRE (hence fp_mp) deletion. This is \
	 * why only IRE_CACHE type is eligible for offload. \
	 * \
	 * The presence of IP options also forces the network stack to \
	 * calculate the checksum in software. This is because: \
	 * \
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit \
	 * the size of "start offset" width to 6-bit. This effectively \
	 * sets the largest value of the offset to 64-bytes, starting \
	 * from the MAC header. When the cumulative MAC and IP headers \
	 * exceed such limit, the offset will wrap around. This causes \
	 * the checksum to be calculated at the wrong place. \
	 * \
	 * IPv4 source routing: none of the full-checksum capable NICs \
	 * is capable of correctly handling the IPv4 source-routing \
	 * option for purposes of calculating the pseudo-header; the \
	 * actual destination is different from the destination in the \
	 * header which is that of the next-hop. (This case may not be \
	 * true for NICs which can parse IPv6 extension headers, but \
	 * we choose to simplify the implementation by not offloading \
	 * checksum when they are present.) \
	 */ \
	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \
	    !((ire)->ire_flags & RTF_MULTIRT) && \
	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) || \
	    (ill)->ill_type == IFT_ETHER) && \
	    (ipsec_len) == 0 && \
	    (((ire)->ire_ipversion == IPV4_VERSION && \
	    (start) == IP_SIMPLE_HDR_LENGTH && \
	    (ire)->ire_fp_mp != NULL && \
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) || \
	    ((ire)->ire_ipversion == IPV6_VERSION && \
	    (start) == IPV6_HDR_LEN && \
	    (ire)->ire_nce->nce_fp_mp != NULL && \
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \
	    (max_frag) >= (uint_t)((end) + (ipsec_len)) && \
	    dohwcksum) { \
		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
	} else { \
		_hck_flags = 0; \
	} \
	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \
	    up, proto, start, end, pseudo); \
}

/*
 * Based on the device capabilities, this macro either marks an outgoing
 * packet with hardware checksum offload information or calculates the
 * checksum in software. If the latter is performed, the checksum flags
 * field of the dblk is cleared; otherwise it will be non-zero and contain
 * the necessary flag(s) for the driver.
 */
#define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \
	end, pseudo) { \
	uint32_t _sum; \
	/* \
	 * Underlying interface supports hardware checksum offload for \
	 * the payload; leave the payload checksum for the hardware to \
	 * calculate. N.B: We only need to set up checksum info on the \
	 * first mblk. \
	 */ \
	DB_CKSUMFLAGS(mp) = 0; \
	if (((ipver) == IPV4_VERSION && \
	    ((hck_flags) & HCKSUM_INET_FULL_V4)) || \
	    ((ipver) == IPV6_VERSION && \
	    ((hck_flags) & HCKSUM_INET_FULL_V6))) { \
		/* \
		 * Hardware calculates pseudo-header, header and the \
		 * payload checksums, so clear the checksum field in \
		 * the protocol header. \
		 */ \
		*(up) = 0; \
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \
	} else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \
		/* \
		 * Partial checksum offload has been enabled. Fill \
		 * the checksum field in the protocol header with the \
		 * pseudo-header checksum value. \
		 */ \
		_sum = ((proto) == IPPROTO_UDP) ? \
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
		_sum += *(up) + (pseudo); \
		_sum = (_sum & 0xFFFF) + (_sum >> 16); \
		*(up) = (_sum & 0xFFFF) + (_sum >> 16); \
		/* \
		 * Offsets are relative to beginning of IP header. \
		 */ \
		DB_CKSUMSTART(mp) = (start); \
		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \
		    (start) + UDP_CHECKSUM_OFFSET : \
		    (start) + TCP_CHECKSUM_OFFSET; \
		DB_CKSUMEND(mp) = (end); \
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \
	} else { \
		/* \
		 * Software checksumming. \
		 */ \
		_sum = ((proto) == IPPROTO_UDP) ? \
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
		_sum += (pseudo); \
		_sum = IP_CSUM(mp, start, _sum); \
		*(up) = (uint16_t)(_sum ? _sum : ~_sum); \
	} \
	/* \
	 * Hardware supports IP header checksum offload; clear the \
	 * contents of IP header checksum field as expected by NIC. \
	 * Do this only if we offloaded either full or partial sum. \
	 */ \
	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \
	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \
		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \
		((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \
	} \
}

/*
 * Macro to inspect the checksum of a fully-reassembled incoming datagram.
 * `sum' is an lvalue that is read and may be overwritten; `err' is set to
 * B_TRUE when the checksum is found to be invalid, B_FALSE otherwise.
 *
 * NOTE: the software fallback path references `mp', which is not a
 * parameter of this macro; a variable of that name must be in scope at
 * the point of expansion.
 */
#define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \
	(err) = B_FALSE; \
	if ((hck_flags) & HCK_FULLCKSUM) { \
		/* \
		 * The sum of all fragment checksums should \
		 * result in -0 (0xFFFF) or otherwise invalid. \
		 */ \
		if ((sum) != 0xFFFF) \
			(err) = B_TRUE; \
	} else if ((hck_flags) & HCK_PARTIALCKSUM) { \
		(sum) += (pseudo); \
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
		if (~(sum) & 0xFFFF) \
			(err) = B_TRUE; \
	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \
		(err) = B_TRUE; \
	} \
}

/*
 * This macro inspects an incoming packet to see if the checksum value
 * contained in it is valid; if the hardware has provided the information,
 * the value is verified, otherwise it performs software checksumming.
 * The checksum value is returned to caller in `sum'; `err' is set to
 * B_TRUE when the checksum is found to be invalid, B_FALSE otherwise.
 * `mp1' may be modified (see IP_ADJCKSUM_PARTIAL).
 */
#define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
	int32_t _len; \
	\
	(err) = B_FALSE; \
	if ((hck_flags) & HCK_FULLCKSUM) { \
		/* \
		 * Full checksum has been computed by the hardware \
		 * and has been attached. If the driver wants us to \
		 * verify the correctness of the attached value, in \
		 * order to protect against faulty hardware, compare \
		 * it against -0 (0xFFFF) to see if it's valid. \
		 */ \
		(sum) = DB_CKSUM16(mp); \
		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
			(err) = B_TRUE; \
	} else if (((hck_flags) & HCK_PARTIALCKSUM) && \
	    ((mp1) == NULL || (mp1)->b_cont == NULL) && \
	    (ulph_off) >= DB_CKSUMSTART(mp) && \
	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \
		uint32_t _adj; \
		/* \
		 * Partial checksum has been calculated by hardware \
		 * and attached to the packet; in addition, any \
		 * prepended extraneous data is even byte aligned, \
		 * and there are at most two mblks associated with \
		 * the packet. If any such data exists, we adjust \
		 * the checksum; also take care of any postpended data. \
		 */ \
		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \
		/* \
		 * One's complement subtract extraneous checksum \
		 */ \
		(sum) += DB_CKSUM16(mp); \
		if (_adj >= (sum)) \
			(sum) = ~(_adj - (sum)) & 0xFFFF; \
		else \
			(sum) -= _adj; \
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
		if (~(sum) & 0xFFFF) \
			(err) = B_TRUE; \
	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \
		(err) = B_TRUE; \
	} \
}

/*
 * Macro to adjust a given checksum value depending on any prepended
 * or postpended data on the packet. It expects the start offset to
 * begin at an even boundary and that the packet consists of at most
 * two mblks.
 *
 * `len', `adj' and `mp1' must be lvalues: `len' is overwritten, `adj'
 * receives the adjustment, and `mp1' is set to `mp' when it is NULL.
 */
#define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) { \
	/* \
	 * Prepended extraneous data; adjust checksum. \
	 */ \
	if ((len) > 0) \
		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0); \
	else \
		(adj) = 0; \
	/* \
	 * len is now the total length of mblk(s) \
	 */ \
	(len) = MBLKL(mp); \
	if ((mp1) == NULL) \
		(mp1) = (mp); \
	else \
		(len) += MBLKL(mp1); \
	/* \
	 * Postpended extraneous data; adjust checksum. \
	 */ \
	if (((len) = (DB_CKSUMEND(mp) - (len))) > 0) { \
		uint32_t _pad; \
		\
		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0); \
		/* \
		 * If the postpended extraneous data was odd \
		 * byte aligned, swap resulting checksum bytes. \
		 */ \
		if ((uintptr_t)(mp1)->b_wptr & 1) \
			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8); \
		else \
			(adj) += _pad; \
		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16); \
	} \
}

/*
 * True when the ILL has the ILL_CAPAB_MDT capability bit set.
 */
#define	ILL_MDT_CAPABLE(ill) \
	(((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)

/*
 * ioctl identifier and structure for Multidata Transmit update
 * private M_CTL communication from IP to ULP.
 */
#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)

typedef struct ip_mdt_info_s {
	uint_t mdt_info_id;		/* MDT_IOC_INFO_UPDATE */
	ill_mdt_capab_t mdt_capab;	/* ILL MDT capabilities */
} ip_mdt_info_t;

/*
 * Macro that determines whether or not a given ILL is allowed for MDT.
 */
#define	ILL_MDT_USABLE(ill) \
	(ILL_MDT_CAPABLE(ill) && \
	(ill)->ill_mdt_capab != NULL && \
	(ill)->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \
	(ill)->ill_mdt_capab->ill_mdt_on != 0)

/*
 * Macro that determines whether or not a given CONN may be considered
 * for fast path prior to proceeding further with Multidata.
 */
#define	CONN_IS_MD_FASTPATH(connp) \
	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */ \
	!((connp)->conn_nexthop_set) &&		/* IP_NEXTHOP */ \
	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */ \
	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */ \
	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */ \
	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */

/* Definitions for fragmenting IP packets using MDT. */

/*
 * Smaller and private version of pdescinfo_t used specifically for IP,
 * which allows for only a single payload span per packet.
 */
typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;

/*
 * Macro version of ip_can_frag_mdt() which avoids the function call if we
 * only examine a single message block.
 */
#define	IP_CAN_FRAG_MDT(mp, hdr_len, len) \
	(((mp)->b_cont == NULL) ? \
	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \
	ip_can_frag_mdt((mp), (hdr_len), (len)))

/*
 * Macro that determines whether or not a given IPC requires
 * outbound IPSEC processing.
 */
#define	CONN_IPSEC_OUT_ENCAPSULATED(connp) \
	((connp)->conn_out_enforce_policy || \
	((connp)->conn_latch != NULL && \
	(connp)->conn_latch->ipl_out_policy != NULL))

/*
 * These are used by the synchronous streams code in tcp and udp.
 * Each of them acquires and releases (stp)->sd_lock internally.
 */

/* Clear the RSLEEP bit in sd_wakeq. */
#define	STR_WAKEUP_CLEAR(stp) { \
	mutex_enter(&(stp)->sd_lock); \
	(stp)->sd_wakeq &= ~RSLEEP; \
	mutex_exit(&(stp)->sd_lock); \
}

/*
 * If RSLEEP is set in sd_flag, clear it and broadcast on the read
 * queue's q_wait; otherwise record RSLEEP in sd_wakeq.
 */
#define	STR_WAKEUP_SET(stp) { \
	mutex_enter(&(stp)->sd_lock); \
	if ((stp)->sd_flag & RSLEEP) { \
		(stp)->sd_flag &= ~RSLEEP; \
		cv_broadcast(&_RD((stp)->sd_wrq)->q_wait); \
	} else { \
		(stp)->sd_wakeq |= RSLEEP; \
	} \
	mutex_exit(&(stp)->sd_lock); \
}

/*
 * Send S_INPUT/S_RDNORM signals to any registered listeners and, if
 * SR_POLLIN is set, clear it and issue a pollwakeup. Note that
 * pollwakeup() is called after sd_lock has been dropped.
 */
#define	STR_SENDSIG(stp) { \
	int _events; \
	mutex_enter(&(stp)->sd_lock); \
	if ((_events = (stp)->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \
		strsendsig((stp)->sd_siglist, _events, 0, 0); \
	if ((stp)->sd_rput_opt & SR_POLLIN) { \
		(stp)->sd_rput_opt &= ~SR_POLLIN; \
		mutex_exit(&(stp)->sd_lock); \
		pollwakeup(&(stp)->sd_pollist, POLLIN | POLLRDNORM); \
	} else { \
		mutex_exit(&(stp)->sd_lock); \
	} \
}

/*
 * True when the conn is a UDP conn whose udp_direct_sockfs flag is set.
 */
#define	CONN_UDP_SYNCSTR(connp) \
	(IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)

/*
 * Macro that checks whether or not a particular UDP conn is
 * flow-controlling on the read-side. If udp module is directly
 * above ip, check to see if the drain queue is full; note here
 * that we check this without any lock protection because this
 * is a coarse granularity inbound flow-control. If the module
 * above ip is not udp, then use canputnext to determine the
 * flow-control.
 *
 * Note that these checks are done after the conn is found in
 * the UDP fanout table. A UDP conn in that table may have its
 * IPCL_UDP bit cleared from the conn_flags when the application
 * pops the udp module without issuing an unbind; in this case
 * IP will still receive packets for the conn and deliver it
 * upstream via putnext. This is the reason why we have to test
 * against IPCL_UDP.
 */
#define	CONN_UDP_FLOWCTLD(connp) \
	((CONN_UDP_SYNCSTR(connp) && \
	(connp)->conn_udp->udp_drain_qfull) || \
	(!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))

/*
 * Macro that delivers a given message upstream; if udp module
 * is directly above ip, the message is passed directly into
 * the stream-less entry point. Otherwise putnext is used.
 */
#define	CONN_UDP_RECV(connp, mp) { \
	if (IPCL_IS_UDP(connp)) \
		udp_conn_recv(connp, mp); \
	else \
		putnext((connp)->conn_rq, mp); \
}

/*
 * True when the ILL has the ILL_CAPAB_POLL capability bit set.
 */
#define	ILL_POLL_CAPABLE(ill) \
	(((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0)

/*
 * Macro that hands off one or more messages directly to DLD
 * when the interface is marked with ILL_CAPAB_POLL.
 */
#define	IP_POLL_ILL_TX(ill, mp) { \
	ill_poll_capab_t *ill_poll = (ill)->ill_poll_capab; \
	ASSERT(ILL_POLL_CAPABLE(ill)); \
	ASSERT(ill_poll != NULL); \
	ASSERT(ill_poll->ill_tx != NULL); \
	ASSERT(ill_poll->ill_tx_handle != NULL); \
	ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); \
}

extern int ip_wput_frag_mdt_min;
extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);

#endif	/* _KERNEL */

#ifdef __cplusplus
}
#endif

#endif	/* _INET_IP_IMPL_H */