1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #ifndef _INET_IP_IMPL_H 27 #define _INET_IP_IMPL_H 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 /* 32 * IP implementation private declarations. These interfaces are 33 * used to build the IP module and are not meant to be accessed 34 * by any modules except IP itself. They are undocumented and are 35 * subject to change without notice. 36 */ 37 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 #ifdef _KERNEL 43 44 #define IP_MOD_ID 5701 45 46 #ifdef _BIG_ENDIAN 47 #define IP_HDR_CSUM_TTL_ADJUST 256 48 #define IP_TCP_CSUM_COMP IPPROTO_TCP 49 #define IP_UDP_CSUM_COMP IPPROTO_UDP 50 #else 51 #define IP_HDR_CSUM_TTL_ADJUST 1 52 #define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) 53 #define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8) 54 #endif 55 56 #define TCP_CHECKSUM_OFFSET 16 57 #define TCP_CHECKSUM_SIZE 2 58 59 #define UDP_CHECKSUM_OFFSET 6 60 #define UDP_CHECKSUM_SIZE 2 61 62 #define IPH_TCPH_CHECKSUMP(ipha, hlen) \ 63 ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET))) 64 65 #define IPH_UDPH_CHECKSUMP(ipha, hlen) \ 66 ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET))) 67 68 #define ILL_HCKSUM_CAPABLE(ill) \ 69 (((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0) 70 /* 71 * Macro that performs software checksum calculation on the IP header. 72 */ 73 #define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ 74 (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ 75 ((v_hlen_tos_len) >> 16) + \ 76 ((v_hlen_tos_len) & 0xFFFF) + \ 77 (ipha)->ipha_fragment_offset_and_flags; \ 78 (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ 79 (sum) = ~((sum) + ((sum) >> 16)); \ 80 (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ 81 } 82 83 #define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \ 84 ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \ 85 ILL_HCKSUM_CAPABLE(ill) && dohwcksum) 86 87 /* 88 * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs 89 * several checks on the IRE and ILL (among other things) in order to see 90 * whether or not hardware checksum offload is allowed for the outgoing 91 * packet. It assumes that the caller has held a reference to the IRE. 92 */ 93 #define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \ 94 max_frag, ipsec_len, pseudo) { \ 95 uint32_t _hck_flags; \ 96 /* \ 97 * We offload checksum calculation to hardware when IPsec isn't \ 98 * present and if fragmentation isn't required. We also check \ 99 * if M_DATA fastpath is safe to be used on the corresponding \ 100 * IRE; this check is performed without grabbing ire_lock but \ 101 * instead by holding a reference to it. This is sufficient \ 102 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \ 103 * DL_NOTE_FASTPATH_FLUSH indication could come up from the \ 104 * driver and trigger the IRE (hence fp_mp) deletion. This is \ 105 * why only IRE_CACHE type is eligible for offload. \ 106 * \ 107 * The presense of IP options also forces the network stack to \ 108 * calculate the checksum in software. This is because: \ 109 * \ 110 * Wrap around: certain partial-checksum NICs (eri, ce) limit \ 111 * the size of "start offset" width to 6-bit. This effectively \ 112 * sets the largest value of the offset to 64-bytes, starting \ 113 * from the MAC header. When the cumulative MAC and IP headers \ 114 * exceed such limit, the offset will wrap around. This causes \ 115 * the checksum to be calculated at the wrong place. \ 116 * \ 117 * IPv4 source routing: none of the full-checksum capable NICs \ 118 * is capable of correctly handling the IPv4 source-routing \ 119 * option for purposes of calculating the pseudo-header; the \ 120 * actual destination is different from the destination in the \ 121 * header which is that of the next-hop. (This case may not be \ 122 * true for NICs which can parse IPv6 extension headers, but \ 123 * we choose to simplify the implementation by not offloading \ 124 * checksum when they are present.) \ 125 * \ 126 */ \ 127 if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \ 128 !((ire)->ire_flags & RTF_MULTIRT) && \ 129 (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) || \ 130 (ill)->ill_type == IFT_ETHER) && \ 131 (ipsec_len) == 0 && \ 132 (((ire)->ire_ipversion == IPV4_VERSION && \ 133 (start) == IP_SIMPLE_HDR_LENGTH && \ 134 (ire)->ire_fp_mp != NULL && \ 135 MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) || \ 136 ((ire)->ire_ipversion == IPV6_VERSION && \ 137 (start) == IPV6_HDR_LEN && \ 138 (ire)->ire_nce->nce_fp_mp != NULL && \ 139 MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \ 140 (max_frag) >= (uint_t)((end) + (ipsec_len)) && \ 141 dohwcksum) { \ 142 _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \ 143 } else { \ 144 _hck_flags = 0; \ 145 } \ 146 IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \ 147 up, proto, start, end, pseudo); \ 148 } 149 150 /* 151 * Based on the device capabilities, this macro either marks an outgoing 152 * packet with hardware checksum offload information or calculate the 153 * checksum in software. If the latter is performed, the checksum field 154 * of the dblk is cleared; otherwise it will be non-zero and contain the 155 * necessary flag(s) for the driver. 156 */ 157 #define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \ 158 end, pseudo) { \ 159 uint32_t _sum; \ 160 /* \ 161 * Underlying interface supports hardware checksum offload for \ 162 * the payload; leave the payload checksum for the hardware to \ 163 * calculate. N.B: We only need to set up checksum info on the \ 164 * first mblk. \ 165 */ \ 166 DB_CKSUMFLAGS(mp) = 0; \ 167 if (((ipver) == IPV4_VERSION && \ 168 ((hck_flags) & HCKSUM_INET_FULL_V4)) || \ 169 ((ipver) == IPV6_VERSION && \ 170 ((hck_flags) & HCKSUM_INET_FULL_V6))) { \ 171 /* \ 172 * Hardware calculates pseudo-header, header and the \ 173 * payload checksums, so clear the checksum field in \ 174 * the protocol header. \ 175 */ \ 176 *(up) = 0; \ 177 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \ 178 } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \ 179 /* \ 180 * Partial checksum offload has been enabled. Fill \ 181 * the checksum field in the protocl header with the \ 182 * pseudo-header checksum value. \ 183 */ \ 184 _sum = ((proto) == IPPROTO_UDP) ? \ 185 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ 186 _sum += *(up) + (pseudo); \ 187 _sum = (_sum & 0xFFFF) + (_sum >> 16); \ 188 *(up) = (_sum & 0xFFFF) + (_sum >> 16); \ 189 /* \ 190 * Offsets are relative to beginning of IP header. \ 191 */ \ 192 DB_CKSUMSTART(mp) = (start); \ 193 DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \ 194 (start) + UDP_CHECKSUM_OFFSET : \ 195 (start) + TCP_CHECKSUM_OFFSET; \ 196 DB_CKSUMEND(mp) = (end); \ 197 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \ 198 } else { \ 199 /* \ 200 * Software checksumming. \ 201 */ \ 202 _sum = ((proto) == IPPROTO_UDP) ? \ 203 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ 204 _sum += (pseudo); \ 205 _sum = IP_CSUM(mp, start, _sum); \ 206 *(up) = (uint16_t)(_sum ? _sum : ~_sum); \ 207 } \ 208 /* \ 209 * Hardware supports IP header checksum offload; clear the \ 210 * contents of IP header checksum field as expected by NIC. \ 211 * Do this only if we offloaded either full or partial sum. \ 212 */ \ 213 if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \ 214 ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \ 215 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \ 216 ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \ 217 } \ 218 } 219 220 /* 221 * Macro to inspect the checksum of a fully-reassembled incoming datagram. 222 */ 223 #define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \ 224 (err) = B_FALSE; \ 225 if ((hck_flags) & HCK_FULLCKSUM) { \ 226 /* \ 227 * The sum of all fragment checksums should \ 228 * result in -0 (0xFFFF) or otherwise invalid. \ 229 */ \ 230 if ((sum) != 0xFFFF) \ 231 (err) = B_TRUE; \ 232 } else if ((hck_flags) & HCK_PARTIALCKSUM) { \ 233 (sum) += (pseudo); \ 234 (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ 235 (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ 236 if (~(sum) & 0xFFFF) \ 237 (err) = B_TRUE; \ 238 } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \ 239 (err) = B_TRUE; \ 240 } \ 241 } 242 243 /* 244 * This macro inspects an incoming packet to see if the checksum value 245 * contained in it is valid; if the hardware has provided the information, 246 * the value is verified, otherwise it performs software checksumming. 247 * The checksum value is returned to caller. 248 */ 249 #define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \ 250 int32_t _len; \ 251 \ 252 (err) = B_FALSE; \ 253 if ((hck_flags) & HCK_FULLCKSUM) { \ 254 /* \ 255 * Full checksum has been computed by the hardware \ 256 * and has been attached. If the driver wants us to \ 257 * verify the correctness of the attached value, in \ 258 * order to protect against faulty hardware, compare \ 259 * it against -0 (0xFFFF) to see if it's valid. \ 260 */ \ 261 (sum) = DB_CKSUM16(mp); \ 262 if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \ 263 (err) = B_TRUE; \ 264 } else if (((hck_flags) & HCK_PARTIALCKSUM) && \ 265 ((mp1) == NULL || (mp1)->b_cont == NULL) && \ 266 (ulph_off) >= DB_CKSUMSTART(mp) && \ 267 ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \ 268 uint32_t _adj; \ 269 /* \ 270 * Partial checksum has been calculated by hardware \ 271 * and attached to the packet; in addition, any \ 272 * prepended extraneous data is even byte aligned, \ 273 * and there are at most two mblks associated with \ 274 * the packet. If any such data exists, we adjust \ 275 * the checksum; also take care any postpended data. \ 276 */ \ 277 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \ 278 /* \ 279 * One's complement subtract extraneous checksum \ 280 */ \ 281 (sum) += DB_CKSUM16(mp); \ 282 if (_adj >= (sum)) \ 283 (sum) = ~(_adj - (sum)) & 0xFFFF; \ 284 else \ 285 (sum) -= _adj; \ 286 (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ 287 (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ 288 if (~(sum) & 0xFFFF) \ 289 (err) = B_TRUE; \ 290 } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \ 291 (err) = B_TRUE; \ 292 } \ 293 } 294 295 /* 296 * Macro to adjust a given checksum value depending on any prepended 297 * or postpended data on the packet. It expects the start offset to 298 * begin at an even boundary and that the packet consists of at most 299 * two mblks. 300 */ 301 #define IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) { \ 302 /* \ 303 * Prepended extraneous data; adjust checksum. \ 304 */ \ 305 if ((len) > 0) \ 306 (adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0); \ 307 else \ 308 (adj) = 0; \ 309 /* \ 310 * len is now the total length of mblk(s) \ 311 */ \ 312 (len) = MBLKL(mp); \ 313 if ((mp1) == NULL) \ 314 (mp1) = (mp); \ 315 else \ 316 (len) += MBLKL(mp1); \ 317 /* \ 318 * Postpended extraneous data; adjust checksum. \ 319 */ \ 320 if (((len) = (DB_CKSUMEND(mp) - len)) > 0) { \ 321 uint32_t _pad; \ 322 \ 323 _pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0); \ 324 /* \ 325 * If the postpended extraneous data was odd \ 326 * byte aligned, swap resulting checksum bytes. \ 327 */ \ 328 if ((uintptr_t)(mp1)->b_wptr & 1) \ 329 (adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8); \ 330 else \ 331 (adj) += _pad; \ 332 (adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16); \ 333 } \ 334 } 335 336 #define ILL_MDT_CAPABLE(ill) \ 337 (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0) 338 339 /* 340 * ioctl identifier and structure for Multidata Transmit update 341 * private M_CTL communication from IP to ULP. 342 */ 343 #define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020) 344 345 typedef struct ip_mdt_info_s { 346 uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */ 347 ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */ 348 } ip_mdt_info_t; 349 350 /* 351 * Macro that determines whether or not a given ILL is allowed for MDT. 352 */ 353 #define ILL_MDT_USABLE(ill) \ 354 (ILL_MDT_CAPABLE(ill) && \ 355 ill->ill_mdt_capab != NULL && \ 356 ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \ 357 ill->ill_mdt_capab->ill_mdt_on != 0) 358 359 /* 360 * Macro that determines whether or not a given CONN may be considered 361 * for fast path prior to proceeding further with Multidata. 362 */ 363 #define CONN_IS_MD_FASTPATH(connp) \ 364 ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ 365 !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ 366 (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ 367 (connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \ 368 (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ 369 (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ 370 371 /* Definitons for fragmenting IP packets using MDT. */ 372 373 /* 374 * Smaller and private version of pdescinfo_t used specifically for IP, 375 * which allows for only a single payload span per packet. 376 */ 377 typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; 378 379 /* 380 * Macro version of ip_can_frag_mdt() which avoids the function call if we 381 * only examine a single message block. 382 */ 383 #define IP_CAN_FRAG_MDT(mp, hdr_len, len) \ 384 (((mp)->b_cont == NULL) ? \ 385 (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \ 386 ip_can_frag_mdt((mp), (hdr_len), (len))) 387 388 /* 389 * Macro that determines whether or not a given IPC requires 390 * outbound IPSEC processing. 391 */ 392 #define CONN_IPSEC_OUT_ENCAPSULATED(connp) \ 393 ((connp)->conn_out_enforce_policy || \ 394 ((connp)->conn_latch != NULL && \ 395 (connp)->conn_latch->ipl_out_policy != NULL)) 396 397 /* 398 * These are used by the synchronous streams code in tcp and udp. 399 * When we set the flags for a wakeup from a synchronous stream we 400 * always set RSLEEP in sd_wakeq, even if we have a read thread waiting 401 * to do the io. This is in case the read thread gets interrupted 402 * before completing the io. The RSLEEP flag in sd_wakeq is used to 403 * indicate that there is data available at the synchronous barrier. 404 * The assumption is that subsequent functions calls through rwnext() 405 * will reset sd_wakeq appropriately. 406 */ 407 #define STR_WAKEUP_CLEAR(stp) { \ 408 mutex_enter(&stp->sd_lock); \ 409 stp->sd_wakeq &= ~RSLEEP; \ 410 mutex_exit(&stp->sd_lock); \ 411 } 412 413 #define STR_WAKEUP_SET(stp) { \ 414 mutex_enter(&stp->sd_lock); \ 415 if (stp->sd_flag & RSLEEP) { \ 416 stp->sd_flag &= ~RSLEEP; \ 417 cv_broadcast(&_RD(stp->sd_wrq)->q_wait); \ 418 } \ 419 stp->sd_wakeq |= RSLEEP; \ 420 mutex_exit(&stp->sd_lock); \ 421 } 422 423 #define STR_SENDSIG(stp) { \ 424 int _events; \ 425 mutex_enter(&stp->sd_lock); \ 426 if ((_events = stp->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \ 427 strsendsig(stp->sd_siglist, _events, 0, 0); \ 428 if (stp->sd_rput_opt & SR_POLLIN) { \ 429 stp->sd_rput_opt &= ~SR_POLLIN; \ 430 mutex_exit(&stp->sd_lock); \ 431 pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM); \ 432 } else { \ 433 mutex_exit(&stp->sd_lock); \ 434 } \ 435 } 436 437 #define CONN_UDP_SYNCSTR(connp) \ 438 (IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs) 439 440 /* 441 * Macro that checks whether or not a particular UDP conn is 442 * flow-controlling on the read-side. If udp module is directly 443 * above ip, check to see if the drain queue is full; note here 444 * that we check this without any lock protection because this 445 * is a coarse granularity inbound flow-control. If the module 446 * above ip is not udp, then use canputnext to determine the 447 * flow-control. 448 * 449 * Note that these checks are done after the conn is found in 450 * the UDP fanout table. A UDP conn in that table may have its 451 * IPCL_UDP bit cleared from the conn_flags when the application 452 * pops the udp module without issuing an unbind; in this case 453 * IP will still receive packets for the conn and deliver it 454 * upstream via putnext. This is the reason why we have to test 455 * against IPCL_UDP. 456 */ 457 #define CONN_UDP_FLOWCTLD(connp) \ 458 ((CONN_UDP_SYNCSTR(connp) && \ 459 (connp)->conn_udp->udp_drain_qfull) || \ 460 (!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq))) 461 462 /* 463 * Macro that delivers a given message upstream; if udp module 464 * is directly above ip, the message is passed directly into 465 * the stream-less entry point. Otherwise putnext is used. 466 */ 467 #define CONN_UDP_RECV(connp, mp) { \ 468 if (IPCL_IS_UDP(connp)) \ 469 udp_conn_recv(connp, mp); \ 470 else \ 471 putnext((connp)->conn_rq, mp); \ 472 } 473 474 #define ILL_DLS_CAPABLE(ill) \ 475 (((ill)->ill_capabilities & \ 476 (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0) 477 478 /* 479 * Macro that hands off one or more messages directly to DLD 480 * when the interface is marked with ILL_CAPAB_POLL. 481 */ 482 #define IP_DLS_ILL_TX(ill, mp) { \ 483 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \ 484 ASSERT(ILL_DLS_CAPABLE(ill)); \ 485 ASSERT(ill_dls != NULL); \ 486 ASSERT(ill_dls->ill_tx != NULL); \ 487 ASSERT(ill_dls->ill_tx_handle != NULL); \ 488 ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \ 489 } 490 491 extern int ip_wput_frag_mdt_min; 492 extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); 493 494 #endif /* _KERNEL */ 495 496 #ifdef __cplusplus 497 } 498 #endif 499 500 #endif /* _INET_IP_IMPL_H */ 501