/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_INET_IP_IMPL_H
#define	_INET_IP_IMPL_H

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * IP implementation private declarations.  These interfaces are
 * used to build the IP module and are not meant to be accessed
 * by any modules except IP itself.  They are undocumented and are
 * subject to change without notice.
 */

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef _KERNEL

#include <sys/sdt.h>

#define	IP_MOD_ID		5701

/*
 * Byte-order dependent constants used when building checksums:
 * the TTL adjustment for the header checksum and the protocol
 * number component that is folded into TCP/UDP checksums.
 */
#ifdef	_BIG_ENDIAN
#define	IP_HDR_CSUM_TTL_ADJUST	256
#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
#else
#define	IP_HDR_CSUM_TTL_ADJUST	1
#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
#endif

/* Byte offset and size of the checksum field within the TCP header. */
#define	TCP_CHECKSUM_OFFSET	16
#define	TCP_CHECKSUM_SIZE	2

/* Byte offset and size of the checksum field within the UDP header. */
#define	UDP_CHECKSUM_OFFSET	6
#define	UDP_CHECKSUM_SIZE	2

/*
 * Return a uint16_t pointer to the TCP checksum field, given a pointer
 * to the IP header and the length (hlen) of that IP header in bytes.
 */
#define	IPH_TCPH_CHECKSUMP(ipha, hlen)	\
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))

/*
 * Return a uint16_t pointer to the UDP checksum field, given a pointer
 * to the IP header and the length (hlen) of that IP header in bytes.
 */
#define	IPH_UDPH_CHECKSUMP(ipha, hlen)	\
	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))

/*
 * True if the ILL has the hardware checksum capability bit set.
 */
#define	ILL_HCKSUM_CAPABLE(ill)		\
	(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)

/*
 * Macro that performs software checksum calculation on the IP header.
 *
 * The macro adds the ttl/protocol word, the ident, both 16-bit halves
 * of v_hlen_tos_len and the fragment offset/flags word to `sum', folds
 * the result to 16 bits, complements it and stores it in the header's
 * ipha_hdr_checksum field.  `sum' must be a modifiable lvalue and is
 * overwritten.  The header words not added here are presumably already
 * accumulated in `sum' by the caller -- confirm against callers.
 */
#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {	\
	(sum) += (ttl_protocol) + (ipha)->ipha_ident +	\
	    ((v_hlen_tos_len) >> 16) +	\
	    ((v_hlen_tos_len) & 0xFFFF) +	\
	    (ipha)->ipha_fragment_offset_and_flags;	\
	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));	\
	(sum) = ~((sum) + ((sum) >> 16));	\
	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);	\
}

/*
 * True when the IP header checksum of mp may be left to the hardware:
 * `ipsec' is false, the dblk carries HCK_IPV4_HDRCKSUM, the ILL is
 * checksum capable and the `dohwcksum' variable (which must be visible
 * at the point of use; it is not declared in this header) is non-zero.
 */
#define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill)	\
	((!(ipsec)) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&	\
	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)

/*
 * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
 * several checks on the IRE and ILL (among other things) in order to see
 * whether or not hardware checksum offload is allowed for the outgoing
 * packet.  It assumes that the caller has held a reference to the IRE.
 *
 * Like IS_IP_HDR_HWCKSUM, it relies on `dohwcksum' being visible at the
 * point of use.
 */
#define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end,	\
	    max_frag, ipsec_len, pseudo) {	\
	uint32_t _hck_flags;	\
	/*	\
	 * We offload checksum calculation to hardware when IPsec isn't	\
	 * present and if fragmentation isn't required.  We also check	\
	 * if M_DATA fastpath is safe to be used on the corresponding	\
	 * IRE; this check is performed without grabbing ire_lock but	\
	 * instead by holding a reference to it.  This is sufficient	\
	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the	\
	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the	\
	 * driver and trigger the IRE (hence fp_mp) deletion.  This is	\
	 * why only IRE_CACHE type is eligible for offload.	\
	 *	\
	 * The presence of IP options also forces the network stack to	\
	 * calculate the checksum in software.  This is because:	\
	 *	\
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit	\
	 * the size of "start offset" width to 6-bit.  This effectively	\
	 * sets the largest value of the offset to 64-bytes, starting	\
	 * from the MAC header.  When the cumulative MAC and IP headers	\
	 * exceed such limit, the offset will wrap around.  This causes	\
	 * the checksum to be calculated at the wrong place.	\
	 *	\
	 * IPv4 source routing: none of the full-checksum capable NICs	\
	 * is capable of correctly handling the IPv4 source-routing	\
	 * option for purposes of calculating the pseudo-header; the	\
	 * actual destination is different from the destination in the	\
	 * header which is that of the next-hop.  (This case may not be	\
	 * true for NICs which can parse IPv6 extension headers, but	\
	 * we choose to simplify the implementation by not offloading	\
	 * checksum when they are present.)	\
	 *	\
	 * Both the IPv4 and the IPv6 arm verify that ire_nce is	\
	 * non-NULL before looking at nce_fp_mp; if it is NULL we	\
	 * simply fall back to software checksumming.	\
	 */	\
	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) &&	\
	    !((ire)->ire_flags & RTF_MULTIRT) &&	\
	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) ||	\
	    (ill)->ill_type == IFT_ETHER) &&	\
	    (ipsec_len) == 0 &&	\
	    (((ire)->ire_ipversion == IPV4_VERSION &&	\
	    (start) == IP_SIMPLE_HDR_LENGTH &&	\
	    ((ire)->ire_nce != NULL &&	\
	    (ire)->ire_nce->nce_fp_mp != NULL &&	\
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) ||	\
	    ((ire)->ire_ipversion == IPV6_VERSION &&	\
	    (start) == IPV6_HDR_LEN &&	\
	    (ire)->ire_nce != NULL &&	\
	    (ire)->ire_nce->nce_fp_mp != NULL &&	\
	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) &&	\
	    (max_frag) >= (uint_t)((end) + (ipsec_len)) &&	\
	    dohwcksum) {	\
		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
	} else {	\
		_hck_flags = 0;	\
	}	\
	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp,	\
	    up, proto, start, end, pseudo);	\
}

/*
 * Based on the device capabilities, this macro either marks an outgoing
 * packet with hardware checksum offload information or calculates the
 * checksum in software.  If the latter is performed, the checksum field
 * of the dblk is cleared; otherwise it will be non-zero and contain the
 * necessary flag(s) for the driver.
 *
 * `up' points at the transport checksum field and is written in every
 * case; `ihp' is only used (as an ipha_t *) for IPv4 when the IP header
 * checksum is offloaded as well.
 */
#define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start,	\
	    end, pseudo) {	\
	uint32_t _sum;	\
	/*	\
	 * Underlying interface supports hardware checksum offload for	\
	 * the payload; leave the payload checksum for the hardware to	\
	 * calculate.  N.B: We only need to set up checksum info on the	\
	 * first mblk.	\
	 */	\
	DB_CKSUMFLAGS(mp) = 0;	\
	if (((ipver) == IPV4_VERSION &&	\
	    ((hck_flags) & HCKSUM_INET_FULL_V4)) ||	\
	    ((ipver) == IPV6_VERSION &&	\
	    ((hck_flags) & HCKSUM_INET_FULL_V6))) {	\
		/*	\
		 * Hardware calculates pseudo-header, header and the	\
		 * payload checksums, so clear the checksum field in	\
		 * the protocol header.	\
		 */	\
		*(up) = 0;	\
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;	\
	} else if ((hck_flags) & HCKSUM_INET_PARTIAL) {	\
		/*	\
		 * Partial checksum offload has been enabled.  Fill	\
		 * the checksum field in the protocol header with the	\
		 * pseudo-header checksum value.	\
		 */	\
		_sum = ((proto) == IPPROTO_UDP) ?	\
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;	\
		_sum += *(up) + (pseudo);	\
		_sum = (_sum & 0xFFFF) + (_sum >> 16);	\
		*(up) = (_sum & 0xFFFF) + (_sum >> 16);	\
		/*	\
		 * Offsets are relative to beginning of IP header.	\
		 */	\
		DB_CKSUMSTART(mp) = (start);	\
		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?	\
		    (start) + UDP_CHECKSUM_OFFSET :	\
		    (start) + TCP_CHECKSUM_OFFSET;	\
		DB_CKSUMEND(mp) = (end);	\
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;	\
	} else {	\
		/*	\
		 * Software checksumming.  A computed value of zero	\
		 * is stored as its complement instead.	\
		 */	\
		_sum = ((proto) == IPPROTO_UDP) ?	\
		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;	\
		_sum += (pseudo);	\
		_sum = IP_CSUM(mp, start, _sum);	\
		*(up) = (uint16_t)(_sum ? _sum : ~_sum);	\
	}	\
	/*	\
	 * Hardware supports IP header checksum offload; clear the	\
	 * contents of IP header checksum field as expected by NIC.	\
	 * Do this only if we offloaded either full or partial sum.	\
	 */	\
	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 &&	\
	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) {	\
		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;	\
		((ipha_t *)(ihp))->ipha_hdr_checksum = 0;	\
	}	\
}

/*
 * Macro to inspect the checksum of a fully-reassembled incoming datagram.
 * `err' is set to B_TRUE if the checksum is found to be invalid and to
 * B_FALSE otherwise; `sum' must be a modifiable lvalue.
 *
 * NOTE: the software fallback passes a variable named `mp' to IP_CSUM();
 * `mp' is not a parameter of this macro, so it must be in scope at the
 * point of use.
 */
#define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) {	\
	(err) = B_FALSE;	\
	if ((hck_flags) & HCK_FULLCKSUM) {	\
		/*	\
		 * The sum of all fragment checksums should	\
		 * result in -0 (0xFFFF) or otherwise invalid.	\
		 */	\
		if ((sum) != 0xFFFF)	\
			(err) = B_TRUE;	\
	} else if ((hck_flags) & HCK_PARTIALCKSUM) {	\
		(sum) += (pseudo);	\
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);	\
		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);	\
		if (~(sum) & 0xFFFF)	\
			(err) = B_TRUE;	\
	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) {	\
		(err) = B_TRUE;	\
	}	\
}

/*
 * This macro inspects an incoming packet to see if the checksum value
 * contained in it is valid; if the hardware has provided the information,
 * the value is verified, otherwise it performs software checksumming.
 * The checksum value is returned to caller.
 *
 * `sum' is used both as input (it is added to, or passed as the seed of
 * the software checksum) and as output.  `mp1' must be a modifiable
 * lvalue because IP_ADJCKSUM_PARTIAL may assign to it.  `err' is set to
 * B_TRUE when the checksum does not verify.
 */
#define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
	int32_t _len;	\
	\
	(err) = B_FALSE;	\
	if ((hck_flags) & HCK_FULLCKSUM) {	\
		/*	\
		 * Full checksum has been computed by the hardware	\
		 * and has been attached.  If the driver wants us to	\
		 * verify the correctness of the attached value, in	\
		 * order to protect against faulty hardware, compare	\
		 * it against -0 (0xFFFF) to see if it's valid.	\
		 */	\
		(sum) = DB_CKSUM16(mp);	\
		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
			(err) = B_TRUE;	\
	} else if (((hck_flags) & HCK_PARTIALCKSUM) &&	\
	    ((mp1) == NULL || (mp1)->b_cont == NULL) &&	\
	    (ulph_off) >= DB_CKSUMSTART(mp) &&	\
	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) {	\
		uint32_t _adj;	\
		/*	\
		 * Partial checksum has been calculated by hardware	\
		 * and attached to the packet; in addition, any	\
		 * prepended extraneous data is even byte aligned,	\
		 * and there are at most two mblks associated with	\
		 * the packet.  If any such data exists, we adjust	\
		 * the checksum; also take care of any postpended data.	\
		 */	\
		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj);	\
		/*	\
		 * One's complement subtract extraneous checksum	\
		 */	\
		(sum) += DB_CKSUM16(mp);	\
		if (_adj >= (sum))	\
			(sum) = ~(_adj - (sum)) & 0xFFFF;	\
		else	\
			(sum) -= _adj;	\
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);	\
		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);	\
		if (~(sum) & 0xFFFF)	\
			(err) = B_TRUE;	\
	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) {	\
		(err) = B_TRUE;	\
	}	\
}

/*
 * Macro to adjust a given checksum value depending on any prepended
 * or postpended data on the packet.  It expects the start offset to
 * begin at an even boundary and that the packet consists of at most
 * two mblks.
 *
 * `len', `adj' and `mp1' must be modifiable lvalues: on entry `len' is
 * the number of prepended bytes, and it is reused as scratch space;
 * `adj' receives the adjustment; `mp1' is set to `mp' when it is NULL.
 */
#define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) {	\
	/*	\
	 * Prepended extraneous data; adjust checksum.	\
	 */	\
	if ((len) > 0)	\
		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0);	\
	else	\
		(adj) = 0;	\
	/*	\
	 * len is now the total length of mblk(s)	\
	 */	\
	(len) = MBLKL(mp);	\
	if ((mp1) == NULL)	\
		(mp1) = (mp);	\
	else	\
		(len) += MBLKL(mp1);	\
	/*	\
	 * Postpended extraneous data; adjust checksum.	\
	 */	\
	if (((len) = (DB_CKSUMEND(mp) - (len))) > 0) {	\
		uint32_t _pad;	\
	\
		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0);	\
		/*	\
		 * If the postpended extraneous data was odd	\
		 * byte aligned, swap resulting checksum bytes.	\
		 */	\
		if ((uintptr_t)(mp1)->b_wptr & 1)	\
			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8);	\
		else	\
			(adj) += _pad;	\
		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16);	\
	}	\
}

/*
 * True if the ILL has the Multidata Transmit capability bit set.
 */
#define	ILL_MDT_CAPABLE(ill)	\
	(((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)

/*
 * ioctl identifier and structure for Multidata Transmit update
 * private M_CTL communication from IP to ULP.
 */
#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)

typedef struct ip_mdt_info_s {
	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
	ill_mdt_capab_t	mdt_capab;	/* ILL MDT capabilities */
} ip_mdt_info_t;

/*
 * Macro that determines whether or not a given ILL is allowed for MDT:
 * the capability bit must be set, the capability structure must be
 * present, its version must be MDT_VERSION_2 and it must be turned on.
 */
#define	ILL_MDT_USABLE(ill)	\
	(ILL_MDT_CAPABLE(ill) &&	\
	(ill)->ill_mdt_capab != NULL &&	\
	(ill)->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&	\
	(ill)->ill_mdt_capab->ill_mdt_on != 0)

/*
 * Macro that determines whether or not a given CONN may be considered
 * for fast path prior to proceeding further with Multidata.
 */
#define	CONN_IS_MD_FASTPATH(connp)	\
	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
	!((connp)->conn_nexthop_set) &&		/* IP_NEXTHOP */	\
	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */

/* Definitions for fragmenting IP packets using MDT. */

/*
 * Smaller and private version of pdescinfo_t used specifically for IP,
 * which allows for only a single payload span per packet.
 */
typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2)	ip_pdescinfo_t;

/*
 * Macro version of ip_can_frag_mdt() which avoids the function call if we
 * only examine a single message block.
 */
#define	IP_CAN_FRAG_MDT(mp, hdr_len, len)	\
	(((mp)->b_cont == NULL) ?	\
	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) :	\
	ip_can_frag_mdt((mp), (hdr_len), (len)))

/*
 * Macro that determines whether or not a given IPC requires
 * outbound IPSEC processing.
 */
#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
	((connp)->conn_out_enforce_policy ||	\
	((connp)->conn_latch != NULL &&	\
	(connp)->conn_latch->ipl_out_policy != NULL))

/*
 * These are used by the synchronous streams code in tcp and udp.
 * When we set the flags for a wakeup from a synchronous stream we
 * always set RSLEEP in sd_wakeq, even if we have a read thread waiting
 * to do the io.  This is in case the read thread gets interrupted
 * before completing the io.  The RSLEEP flag in sd_wakeq is used to
 * indicate that there is data available at the synchronous barrier.
 * The assumption is that subsequent function calls through rwnext()
 * will reset sd_wakeq appropriately.
 *
 * Each of these macros acquires and releases sd_lock itself and
 * evaluates `stp' more than once, so `stp' should be free of side
 * effects.
 */
#define	STR_WAKEUP_CLEAR(stp) {	\
	mutex_enter(&(stp)->sd_lock);	\
	(stp)->sd_wakeq &= ~RSLEEP;	\
	mutex_exit(&(stp)->sd_lock);	\
}

#define	STR_WAKEUP_SET(stp) {	\
	mutex_enter(&(stp)->sd_lock);	\
	if ((stp)->sd_flag & RSLEEP) {	\
		(stp)->sd_flag &= ~RSLEEP;	\
		cv_broadcast(&_RD((stp)->sd_wrq)->q_wait);	\
	}	\
	(stp)->sd_wakeq |= RSLEEP;	\
	mutex_exit(&(stp)->sd_lock);	\
}

/*
 * Send the S_INPUT/S_RDNORM signals that are registered on the stream
 * and, if SR_POLLIN is set, clear it and wake up pollers.  Note that
 * pollwakeup() is called only after sd_lock has been dropped.
 */
#define	STR_SENDSIG(stp) {	\
	int _events;	\
	mutex_enter(&(stp)->sd_lock);	\
	if ((_events = (stp)->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \
		strsendsig((stp)->sd_siglist, _events, 0, 0);	\
	if ((stp)->sd_rput_opt & SR_POLLIN) {	\
		(stp)->sd_rput_opt &= ~SR_POLLIN;	\
		mutex_exit(&(stp)->sd_lock);	\
		pollwakeup(&(stp)->sd_pollist, POLLIN | POLLRDNORM);	\
	} else {	\
		mutex_exit(&(stp)->sd_lock);	\
	}	\
}

/*
 * True if the conn is a UDP conn whose udp_direct_sockfs flag is set.
 */
#define	CONN_UDP_SYNCSTR(connp)	\
	(IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)

/*
 * Macro that checks whether or not a particular UDP conn is
 * flow-controlling on the read-side.  If udp module is directly
 * above ip, check to see if the drain queue is full; note here
 * that we check this without any lock protection because this
 * is a coarse granularity inbound flow-control.  If the module
 * above ip is not udp, then use canputnext to determine the
 * flow-control.
 *
 * Note that these checks are done after the conn is found in
 * the UDP fanout table.  A UDP conn in that table may have its
 * IPCL_UDP bit cleared from the conn_flags when the application
 * pops the udp module without issuing an unbind; in this case
 * IP will still receive packets for the conn and deliver it
 * upstream via putnext.  This is the reason why we have to test
 * against IPCL_UDP.
 */
#define	CONN_UDP_FLOWCTLD(connp)	\
	((CONN_UDP_SYNCSTR(connp) &&	\
	(connp)->conn_udp->udp_drain_qfull) ||	\
	(!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))

/*
 * Macro that delivers a given message upstream; if udp module
 * is directly above ip, the message is passed directly into
 * the stream-less entry point.  Otherwise putnext is used.
 */
#define	CONN_UDP_RECV(connp, mp) {	\
	if (IPCL_IS_UDP(connp))	\
		udp_conn_recv(connp, mp);	\
	else	\
		putnext((connp)->conn_rq, mp);	\
}

/*
 * True if the ILL has either the polling or the soft ring capability
 * bit set.
 */
#define	ILL_DLS_CAPABLE(ill)	\
	(((ill)->ill_capabilities &	\
	(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0)

/*
 * Macro that hands off one or more messages directly to DLD
 * when the interface is DLS capable (ILL_CAPAB_POLL or
 * ILL_CAPAB_SOFT_RING, as asserted via ILL_DLS_CAPABLE).
 *
 * The message is first run through the IPv4 physical-out firewall
 * hooks; `mp' is handed to FW_HOOKS and is re-tested against NULL
 * afterwards, so it must be a plain variable.  Presumably the hooks
 * set it to NULL when the packet is consumed -- confirm against the
 * FW_HOOKS definition.
 */
#define	IP_DLS_ILL_TX(ill, ipha, mp) {	\
	ill_dls_capab_t *ill_dls = (ill)->ill_dls_capab;	\
	ASSERT(ILL_DLS_CAPABLE(ill));	\
	ASSERT(ill_dls != NULL);	\
	ASSERT(ill_dls->ill_tx != NULL);	\
	ASSERT(ill_dls->ill_tx_handle != NULL);	\
	DTRACE_PROBE4(ip4__physical__out__start,	\
	    ill_t *, NULL, ill_t *, ill,	\
	    ipha_t *, ipha, mblk_t *, mp);	\
	FW_HOOKS(ip4_physical_out_event, ipv4firewall_physical_out,	\
	    NULL, ill, ipha, mp, mp);	\
	DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);	\
	if ((mp) != NULL)	\
		ill_dls->ill_tx(ill_dls->ill_tx_handle, mp);	\
}

extern int	ip_wput_frag_mdt_min;
extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
extern mblk_t	*ip_prepend_zoneid(mblk_t *, zoneid_t);

#endif	/* _KERNEL */

#ifdef	__cplusplus
}
#endif

#endif	/* _INET_IP_IMPL_H */