/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_INET_IP_IMPL_H
#define	_INET_IP_IMPL_H

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * IP implementation private declarations. These interfaces are
 * used to build the IP module and are not meant to be accessed
 * by any modules except IP itself. They are undocumented and are
 * subject to change without notice.
 */

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef	_KERNEL

#define	IP_MOD_ID		5701

/*
 * Byte-order dependent checksum constants. On little-endian builds the
 * protocol components are shifted into the upper byte of the 16-bit word.
 */
#ifdef	_BIG_ENDIAN
#define	IP_HDR_CSUM_TTL_ADJUST	256
#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
#else
#define	IP_HDR_CSUM_TTL_ADJUST	1
#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
#endif

/*
 * Byte offset (relative to the start of the transport header) and size
 * of the checksum field, as used by the IPH_*_CHECKSUMP macros below.
 */
#define	TCP_CHECKSUM_OFFSET	16
#define	TCP_CHECKSUM_SIZE	2

#define	UDP_CHECKSUM_OFFSET	6
#define	UDP_CHECKSUM_SIZE	2

/*
 * Given a pointer to the IP header and the IP header length in bytes,
 * yield a uint16_t pointer to the TCP (resp. UDP) checksum field.
 */
#define	IPH_TCPH_CHECKSUMP(ipha, hlen)	\
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))

#define	IPH_UDPH_CHECKSUMP(ipha, hlen)	\
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))

/*
 * True if the ILL has the ILL_CAPAB_HCKSUM capability bit set.
 */
#define	ILL_HCKSUM_CAPABLE(ill)		\
	(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)

/*
 * Macro that performs software checksum calculation on the IP header.
 * "sum" must be a modifiable lvalue; it is updated in place and the
 * folded, complemented result is stored in ipha_hdr_checksum.
 */
#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {	\
	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
	    ((v_hlen_tos_len) >> 16) +					\
	    ((v_hlen_tos_len) & 0xFFFF) +				\
	    (ipha)->ipha_fragment_offset_and_flags;			\
	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
	(sum) = ~((sum) + ((sum) >> 16));				\
	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
}

/*
 * True if the IP header checksum of mp may be left to the hardware:
 * no IPsec, the dblk carries HCK_IPV4_HDRCKSUM, the ILL is checksum
 * capable, and dohwcksum is non-zero. Note that dohwcksum is not
 * declared by this header; it must be visible at the point of use.
 */
#define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill)				\
	((!(ipsec)) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&	\
	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)

/*
 * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
 * several checks on the IRE and ILL (among other things) in order to see
 * whether or not hardware checksum offload is allowed for the outgoing
 * packet. It assumes that the caller has held a reference to the IRE.
 * Like IS_IP_HDR_HWCKSUM, it reads dohwcksum, which is not declared here.
 */
#define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end,	\
	    max_frag, ipsec_len, pseudo) {				\
	uint32_t _hck_flags;						\
	/*								\
	 * We offload checksum calculation to hardware when IPsec isn't \
	 * present and if fragmentation isn't required. We also check	\
	 * if M_DATA fastpath is safe to be used on the corresponding	\
	 * IRE; this check is performed without grabbing ire_lock but	\
	 * instead by holding a reference to it. This is sufficient	\
	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the	\
	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the	\
	 * driver and trigger the IRE (hence fp_mp) deletion. This is	\
	 * why only IRE_CACHE type is eligible for offload.		\
	 *								\
	 * The presence of IP options also forces the network stack to	\
	 * calculate the checksum in software. This is because:	\
	 *								\
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit	\
	 * the size of "start offset" width to 6-bit. This effectively	\
	 * sets the largest value of the offset to 64-bytes, starting	\
	 * from the MAC header. When the cumulative MAC and IP headers	\
	 * exceed such limit, the offset will wrap around. This causes	\
	 * the checksum to be calculated at the wrong place.		\
	 *								\
	 * IPv4 source routing: none of the full-checksum capable NICs	\
	 * is capable of correctly handling the IPv4 source-routing	\
	 * option for purposes of calculating the pseudo-header; the	\
	 * actual destination is different from the destination in the	\
	 * header which is that of the next-hop. (This case may not be	\
	 * true for NICs which can parse IPv6 extension headers, but	\
	 * we choose to simplify the implementation by not offloading	\
	 * checksum when they are present.)				\
	 *								\
	 * Both the IPv4 and the IPv6 branch verify that ire_nce is	\
	 * non-NULL before looking at nce_fp_mp; a missing nce simply	\
	 * selects software checksumming.				\
	 */								\
	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) &&		\
	    !((ire)->ire_flags & RTF_MULTIRT) &&			\
	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) ||	\
	    (ill)->ill_type == IFT_ETHER) &&				\
	    (ipsec_len) == 0 &&						\
	    (((ire)->ire_ipversion == IPV4_VERSION &&			\
	    (start) == IP_SIMPLE_HDR_LENGTH &&				\
	    ((ire)->ire_nce != NULL &&					\
	    (ire)->ire_nce->nce_fp_mp != NULL &&			\
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) ||	\
	    ((ire)->ire_ipversion == IPV6_VERSION &&			\
	    (start) == IPV6_HDR_LEN &&					\
	    (ire)->ire_nce != NULL &&					\
	    (ire)->ire_nce->nce_fp_mp != NULL &&			\
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) &&	\
	    (max_frag) >= (uint_t)((end) + (ipsec_len)) &&		\
	    dohwcksum) {						\
		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
	} else {							\
		_hck_flags = 0;						\
	}								\
	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp,	\
	    up, proto, start, end, pseudo);				\
}

/*
 * Based on the device capabilities, this macro either marks an outgoing
 * packet with hardware checksum offload information or calculates the
 * checksum in software. If the latter is performed, the checksum field
 * of the dblk is cleared; otherwise it will be non-zero and contain the
 * necessary flag(s) for the driver.
 */
#define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \
	    end, pseudo) {						\
	uint32_t _sum;							\
	/*								\
	 * Underlying interface supports hardware checksum offload for	\
	 * the payload; leave the payload checksum for the hardware to	\
	 * calculate. N.B: We only need to set up checksum info on the	\
	 * first mblk.							\
	 */								\
	DB_CKSUMFLAGS(mp) = 0;						\
	if (((ipver) == IPV4_VERSION &&				\
	    ((hck_flags) & HCKSUM_INET_FULL_V4)) ||			\
	    ((ipver) == IPV6_VERSION &&					\
	    ((hck_flags) & HCKSUM_INET_FULL_V6))) {			\
		/*							\
		 * Hardware calculates pseudo-header, header and the	\
		 * payload checksums, so clear the checksum field in	\
		 * the protocol header.					\
		 */							\
		*(up) = 0;						\
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;			\
	} else if ((hck_flags) & HCKSUM_INET_PARTIAL) {		\
		/*							\
		 * Partial checksum offload has been enabled. Fill	\
		 * the checksum field in the protocol header with the	\
		 * pseudo-header checksum value.			\
		 */							\
		_sum = ((proto) == IPPROTO_UDP) ?			\
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
		_sum += *(up) + (pseudo);				\
		_sum = (_sum & 0xFFFF) + (_sum >> 16);			\
		*(up) = (_sum & 0xFFFF) + (_sum >> 16);			\
		/*							\
		 * Offsets are relative to beginning of IP header.	\
		 */							\
		DB_CKSUMSTART(mp) = (start);				\
		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?		\
		    (start) + UDP_CHECKSUM_OFFSET :			\
		    (start) + TCP_CHECKSUM_OFFSET;			\
		DB_CKSUMEND(mp) = (end);				\
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;			\
	} else {							\
		/*							\
		 * Software checksumming. A computed value of zero is	\
		 * stored as its complement instead.			\
		 */							\
		_sum = ((proto) == IPPROTO_UDP) ?			\
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
		_sum += (pseudo);					\
		_sum = IP_CSUM(mp, start, _sum);			\
		*(up) = (uint16_t)(_sum ? _sum : ~_sum);		\
	}								\
	/*								\
	 * Hardware supports IP header checksum offload; clear the	\
	 * contents of IP header checksum field as expected by NIC.	\
	 * Do this only if we offloaded either full or partial sum.	\
	 */								\
	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 &&	\
	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) {			\
		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;			\
		((ipha_t *)(ihp))->ipha_hdr_checksum = 0;		\
	}								\
}

/*
 * Macro to inspect the checksum of a fully-reassembled incoming datagram.
 * "sum" and "err" must be modifiable lvalues; err is set to B_TRUE when
 * the checksum is found to be invalid.
 *
 * NOTE: the software fallback passes "mp" to IP_CSUM, but mp is not a
 * parameter of this macro; a variable of that name must therefore be in
 * scope wherever the macro is expanded.
 */
#define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) {	\
	(err) = B_FALSE;						\
	if ((hck_flags) & HCK_FULLCKSUM) {				\
		/*							\
		 * The sum of all fragment checksums should		\
		 * result in -0 (0xFFFF) or otherwise invalid.		\
		 */							\
		if ((sum) != 0xFFFF)					\
			(err) = B_TRUE;					\
	} else if ((hck_flags) & HCK_PARTIALCKSUM) {			\
		(sum) += (pseudo);					\
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
		if (~(sum) & 0xFFFF)					\
			(err) = B_TRUE;					\
	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) {		\
		(err) = B_TRUE;						\
	}								\
}

/*
 * This macro inspects an incoming packet to see if the checksum value
 * contained in it is valid; if the hardware has provided the information,
 * the value is verified, otherwise it performs software checksumming.
 * The checksum value is returned to caller.
 */
#define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
	int32_t _len;							\
									\
	(err) = B_FALSE;						\
	if ((hck_flags) & HCK_FULLCKSUM) {				\
		/*							\
		 * Full checksum has been computed by the hardware	\
		 * and has been attached. If the driver wants us to	\
		 * verify the correctness of the attached value, in	\
		 * order to protect against faulty hardware, compare	\
		 * it against -0 (0xFFFF) to see if it's valid.		\
		 */							\
		(sum) = DB_CKSUM16(mp);					\
		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
			(err) = B_TRUE;					\
	} else if (((hck_flags) & HCK_PARTIALCKSUM) &&			\
	    ((mp1) == NULL || (mp1)->b_cont == NULL) &&			\
	    (ulph_off) >= DB_CKSUMSTART(mp) &&				\
	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) {	\
		uint32_t _adj;						\
		/*							\
		 * Partial checksum has been calculated by hardware	\
		 * and attached to the packet; in addition, any		\
		 * prepended extraneous data is even byte aligned,	\
		 * and there are at most two mblks associated with	\
		 * the packet. If any such data exists, we adjust	\
		 * the checksum; also take care of any postpended data. \
		 */							\
		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj);	\
		/*							\
		 * One's complement subtract extraneous checksum	\
		 */							\
		(sum) += DB_CKSUM16(mp);				\
		if (_adj >= (sum))					\
			(sum) = ~(_adj - (sum)) & 0xFFFF;		\
		else							\
			(sum) -= _adj;					\
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
		if (~(sum) & 0xFFFF)					\
			(err) = B_TRUE;					\
	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) {	\
		(err) = B_TRUE;						\
	}								\
}

/*
 * Macro to adjust a given checksum value depending on any prepended
 * or postpended data on the packet. It expects the start offset to
 * begin at an even boundary and that the packet consists of at most
 * two mblks.
 *
 * "len", "adj" and "mp1" must be modifiable lvalues: len is reused as
 * scratch space, adj receives the adjustment, and mp1 is set to mp when
 * it is NULL on entry.
 */
#define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) {		\
	/*								\
	 * Prepended extraneous data; adjust checksum.			\
	 */								\
	if ((len) > 0)							\
		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0);		\
	else								\
		(adj) = 0;						\
	/*								\
	 * len is now the total length of mblk(s)			\
	 */								\
	(len) = MBLKL(mp);						\
	if ((mp1) == NULL)						\
		(mp1) = (mp);						\
	else								\
		(len) += MBLKL(mp1);					\
	/*								\
	 * Postpended extraneous data; adjust checksum.			\
	 */								\
	if (((len) = (DB_CKSUMEND(mp) - (len))) > 0) {			\
		uint32_t _pad;						\
									\
		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0);		\
		/*							\
		 * If the postpended extraneous data was odd		\
		 * byte aligned, swap resulting checksum bytes.		\
		 */							\
		if ((uintptr_t)(mp1)->b_wptr & 1)			\
			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8);	\
		else							\
			(adj) += _pad;					\
		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16);		\
	}								\
}

/*
 * True if the ILL has the ILL_CAPAB_MDT capability bit set.
 */
#define	ILL_MDT_CAPABLE(ill)		\
	(((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)

/*
 * ioctl identifier and structure for Multidata Transmit update
 * private M_CTL communication from IP to ULP.
 */
#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)

typedef struct ip_mdt_info_s {
	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
	ill_mdt_capab_t	mdt_capab;	/* ILL MDT capabilities */
} ip_mdt_info_t;

/*
 * Macro that determines whether or not a given ILL is allowed for MDT.
 */
#define	ILL_MDT_USABLE(ill)	\
	(ILL_MDT_CAPABLE(ill) &&					\
	(ill)->ill_mdt_capab != NULL &&					\
	(ill)->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&	\
	(ill)->ill_mdt_capab->ill_mdt_on != 0)

/*
 * Macro that determines whether or not a given CONN may be considered
 * for fast path prior to proceeding further with Multidata.
 */
#define	CONN_IS_MD_FASTPATH(connp)	\
	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
	!((connp)->conn_nexthop_set) &&		/* IP_NEXTHOP */	\
	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */

/* Definitions for fragmenting IP packets using MDT. */

/*
 * Smaller and private version of pdescinfo_t used specifically for IP,
 * which allows for only a single payload span per packet.
 */
typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2)	ip_pdescinfo_t;

/*
 * Macro version of ip_can_frag_mdt() which avoids the function call if we
 * only examine a single message block.
 */
#define	IP_CAN_FRAG_MDT(mp, hdr_len, len)			\
	(((mp)->b_cont == NULL) ?				\
	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) :	\
	ip_can_frag_mdt((mp), (hdr_len), (len)))

/*
 * Macro that determines whether or not a given IPC requires
 * outbound IPSEC processing.
 */
#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
	((connp)->conn_out_enforce_policy ||	\
	((connp)->conn_latch != NULL &&		\
	(connp)->conn_latch->ipl_out_policy != NULL))

/*
 * These are used by the synchronous streams code in tcp and udp.
 * When we set the flags for a wakeup from a synchronous stream we
 * always set RSLEEP in sd_wakeq, even if we have a read thread waiting
 * to do the io. This is in case the read thread gets interrupted
 * before completing the io. The RSLEEP flag in sd_wakeq is used to
 * indicate that there is data available at the synchronous barrier.
 * The assumption is that subsequent function calls through rwnext()
 * will reset sd_wakeq appropriately.
 *
 * Each macro acquires and releases sd_lock itself and evaluates its
 * argument more than once, so "stp" should be free of side effects.
 */
#define	STR_WAKEUP_CLEAR(stp) {						\
	mutex_enter(&(stp)->sd_lock);					\
	(stp)->sd_wakeq &= ~RSLEEP;					\
	mutex_exit(&(stp)->sd_lock);					\
}

#define	STR_WAKEUP_SET(stp) {						\
	mutex_enter(&(stp)->sd_lock);					\
	if ((stp)->sd_flag & RSLEEP) {					\
		(stp)->sd_flag &= ~RSLEEP;				\
		cv_broadcast(&_RD((stp)->sd_wrq)->q_wait);		\
	}								\
	(stp)->sd_wakeq |= RSLEEP;					\
	mutex_exit(&(stp)->sd_lock);					\
}

/*
 * Deliver S_INPUT/S_RDNORM signals to registered listeners and, if
 * SR_POLLIN is set, clear it and issue a poll wakeup. sd_lock is
 * dropped before pollwakeup() is called.
 */
#define	STR_SENDSIG(stp) {						\
	int _events;							\
	mutex_enter(&(stp)->sd_lock);					\
	if ((_events = (stp)->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \
		strsendsig((stp)->sd_siglist, _events, 0, 0);		\
	if ((stp)->sd_rput_opt & SR_POLLIN) {				\
		(stp)->sd_rput_opt &= ~SR_POLLIN;			\
		mutex_exit(&(stp)->sd_lock);				\
		pollwakeup(&(stp)->sd_pollist, POLLIN | POLLRDNORM);	\
	} else {							\
		mutex_exit(&(stp)->sd_lock);				\
	}								\
}

/*
 * True if the conn is a UDP conn whose udp_direct_sockfs flag is set.
 */
#define	CONN_UDP_SYNCSTR(connp)	\
	(IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)

/*
 * Macro that checks whether or not a particular UDP conn is
 * flow-controlling on the read-side. If udp module is directly
 * above ip, check to see if the drain queue is full; note here
 * that we check this without any lock protection because this
 * is a coarse granularity inbound flow-control. If the module
 * above ip is not udp, then use canputnext to determine the
 * flow-control.
 *
 * Note that these checks are done after the conn is found in
 * the UDP fanout table. A UDP conn in that table may have its
 * IPCL_UDP bit cleared from the conn_flags when the application
 * pops the udp module without issuing an unbind; in this case
 * IP will still receive packets for the conn and deliver it
 * upstream via putnext. This is the reason why we have to test
 * against IPCL_UDP.
 */
#define	CONN_UDP_FLOWCTLD(connp)	\
	((CONN_UDP_SYNCSTR(connp) &&				\
	(connp)->conn_udp->udp_drain_qfull) ||			\
	(!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))

/*
 * Macro that delivers a given message upstream; if udp module
 * is directly above ip, the message is passed directly into
 * the stream-less entry point. Otherwise putnext is used.
 */
#define	CONN_UDP_RECV(connp, mp) {				\
	if (IPCL_IS_UDP(connp))					\
		udp_conn_recv(connp, mp);			\
	else							\
		putnext((connp)->conn_rq, mp);			\
}

/*
 * True if the ILL has either the ILL_CAPAB_POLL or the
 * ILL_CAPAB_SOFT_RING capability bit set.
 */
#define	ILL_DLS_CAPABLE(ill)	\
	(((ill)->ill_capabilities &				\
	(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0)

/*
 * Macro that hands off one or more messages directly to DLD
 * when the interface is DLS capable (see ILL_DLS_CAPABLE). The
 * capability, the transmit function and its handle are only
 * checked by ASSERT, i.e. on DEBUG builds.
 */
#define	IP_DLS_ILL_TX(ill, mp) {				\
	ill_dls_capab_t *ill_dls = (ill)->ill_dls_capab;	\
	ASSERT(ILL_DLS_CAPABLE(ill));				\
	ASSERT(ill_dls != NULL);				\
	ASSERT(ill_dls->ill_tx != NULL);			\
	ASSERT(ill_dls->ill_tx_handle != NULL);			\
	ill_dls->ill_tx(ill_dls->ill_tx_handle, mp);		\
}

extern int	ip_wput_frag_mdt_min;
extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
extern mblk_t	*ip_prepend_zoneid(mblk_t *, zoneid_t);

#endif	/* _KERNEL */

#ifdef	__cplusplus
}
#endif

#endif	/* _INET_IP_IMPL_H */