/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
#include <sys/modctl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/taskq.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/snmpcom.h>
#include <inet/optcom.h>
#include <inet/kstatcom.h>

#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/sctp.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
#include <inet/iptun/iptun_impl.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>

#include <sys/ethernet.h>
#include <net/if_types.h>
#include <sys/cpuvar.h>

#include <ipp/ipp.h>
#include <ipp/ipp_impl.h>
#include <ipp/ipgpc/ipgpc.h>

#include <sys/multidata.h>
#include <sys/pattr.h>

#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#include <rpc/pmap_prot.h>
#include <sys/squeue_impl.h>

/*
 * Values for squeue switch:
 * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
 * IP_SQUEUE_ENTER: SQ_PROCESS
 * IP_SQUEUE_FILL: SQ_FILL
 */
int ip_squeue_enter = 2;	/* Settable in /etc/system */

int ip_squeue_flag;
#define	SET_BPREV_FLAG(x)	((mblk_t *)(uintptr_t)(x))

/*
 * Settable in /etc/system
 */
int ip_poll_normal_ms = 100;
int ip_poll_normal_ticks = 0;
int ip_modclose_ackwait_ms = 3000;

/*
 * It would be nice to have these present only in DEBUG systems, but the
 * current design of the global symbol checking logic requires them to be
 * unconditionally present.
 */
uint_t ip_thread_data;		/* TSD key for debug support */
krwlock_t ip_thread_rwlock;
list_t	ip_thread_list;

/*
 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
 */

struct listptr_s {
	mblk_t	*lp_head;	/* pointer to the head of the list */
	mblk_t	*lp_tail;	/* pointer to the tail of the list */
};

typedef struct listptr_s listptr_t;

/*
 * This is used by ip_snmp_get_mib2_ip_route_media and
 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
 */
typedef struct iproutedata_s {
	uint_t		ird_idx;
	uint_t		ird_flags;	/* see below */
	listptr_t	ird_route;	/* ipRouteEntryTable */
	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
} iproutedata_t;

#define	IRD_REPORT_TESTHIDDEN	0x01	/* include IRE_MARK_TESTHIDDEN routes */

/*
 * Cluster specific hooks. These should be NULL when booted as a non-cluster.
 */

/*
 * Hook functions to enable cluster networking.
 * On non-clustered systems these vectors must always be NULL.
 *
 * Hook function to check whether a specified IP address is a shared IP
 * address in the cluster.
 */
int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;

/*
 * Hook function to generate a cluster wide IP fragment identifier.
 */
uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    void *args) = NULL;

/*
 * Hook function to generate a cluster wide SPI.
 */
void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    void *) = NULL;

/*
 * Hook function to verify if the SPI is already utilized.
 */

int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to delete the SPI from the cluster wide repository.
 */

void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to inform the cluster when a packet is received on an IDLE SA.
 */

void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    in6_addr_t, in6_addr_t, void *) = NULL;

/*
 * Synchronization notes:
 *
 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
 * MT level protection given by STREAMS. IP uses a combination of its own
 * internal serialization mechanism and standard Solaris locking techniques.
 * The internal serialization is per phyint. This is used to serialize
 * plumbing operations, certain multicast operations, most set ioctls,
 * igmp/mld timers, etc.
 *
 * Plumbing is a long sequence of operations involving message
 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
 * involved in plumbing operations. A natural model is to serialize these
 * ioctls one per ill. For example, plumbing of hme0 and qfe0 can go on in
 * parallel without any interference. But various set ioctls on hme0 are best
 * serialized, along with multicast join/leave operations, igmp/mld timer
 * operations, and processing of DLPI control messages received from drivers
 * on a per phyint basis. This serialization is provided by the ipsq_t and
 * primitives operating on this. Details can be found in ip_if.c above the
 * core primitives operating on ipsq_t.
 *
 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
 * In addition, ipif's and ill's referenced by the ire are also indirectly
 * refheld. Thus no ipif or ill can vanish, nor can critical parameters like
 * the ipif's address or netmask change, as long as an ipif is refheld
 * directly or indirectly. For example, an SIOCSLIFADDR ioctl that changes the
 * address of an ipif has to go through the ipsq_t. This ensures that only
 * 1 such exclusive operation proceeds at any time on the ipif. It then
 * deletes all ires associated with this ipif, and waits for all refcnts
 * associated with this ipif to come down to zero. The address is changed
 * only after the ipif has been quiesced. Then the ipif is brought up again.
 * More details are described above the comment in ip_sioctl_flags.
 *
 * Packet processing is based mostly on IREs and is fully multi-threaded
 * using standard Solaris MT techniques.
 *
 * There are explicit locks in IP to handle:
 * - The ip_g_head list maintained by mi_open_link() and friends.
 *
 * - The reassembly data structures (one lock per hash bucket)
 *
 * - conn_lock is meant to protect conn_t fields. The fields actually
 *   protected by conn_lock are documented in the conn_t definition.
 *
 * - ire_lock to protect some of the fields of the ire, IRE tables
 *   (one lock per hash bucket). Refer to ip_ire.c for details.
 *
 * - ndp_g_lock and nce_lock for protecting NCEs.
 *
 * - ill_lock protects fields of the ill and ipif. Details in ip.h
 *
 * - ill_g_lock: This is a global reader/writer lock. Protects the following
 *	* The AVL tree based global multi list of all ills.
 *	* The linked list of all ipifs of an ill
 *	* The <ipsq-xop> mapping
 *	* <ill-phyint> association
 *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
 *   into an ill, changing the <ipsq-xop> mapping of an ill, and changing the
 *   <ill-phyint> assoc of an ill all have to hold the ill_g_lock as
 *   writer for the actual duration of the insertion/deletion/change.
 *
 * - ill_lock: This is a per ill mutex.
 *   It protects some members of the ill_t struct; see ip.h for details.
 *   It also protects the <ill-phyint> assoc.
 *   It also protects the list of ipifs hanging off the ill.
 *
 * - ipsq_lock: This is a per ipsq_t mutex lock.
 *   This protects some members of the ipsq_t struct; see ip.h for details.
 *   It also protects the <ipsq-ipxop> mapping.
 *
 * - ipx_lock: This is a per ipxop_t mutex lock.
 *   This protects some members of the ipxop_t struct; see ip.h for details.
 *
 * - phyint_lock: This is a per phyint mutex lock. Protects just the
 *   phyint_flags.
 *
 * - ip_g_nd_lock: This is a global reader/writer lock.
 *   Any call to nd_load to load a new parameter to the ND table must hold the
 *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the
 *   lock as reader.
 *
 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 *   This lock is held in ipif_up_done so that the ipif is marked IPIF_UP and
 *   the uniqueness check is done atomically.
 *
 * - ipsec_capab_ills_lock: This readers/writer lock protects the global
 *   lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
 *   as a writer when adding or deleting elements from these lists, and
 *   as a reader when walking these lists to send a SADB update to the
 *   IPsec capable ills.
 *
 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
 *   group list linked by ill_usesrc_grp_next. It also protects the
 *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
 *   group is being added or deleted. This lock is taken as a reader when
 *   walking the list/group (e.g. to get the number of members in a usesrc
 *   group).
 *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
 *   field is changing state, i.e. from NULL to non-NULL or vice-versa. For
 *   example, it is not necessary to take this lock in the initial portion
 *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
 *   operations are executed exclusively and that ensures that the "usesrc
 *   group state" cannot change. The "usesrc group state" change can happen
 *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
 *
 * Changing <ill-phyint>, <ipsq-xop> associations:
 *
 * To change the <ill-phyint> association, the ill_g_lock must be held
 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
 * must be held.
 *
 * To change the <ipsq-xop> association, the ill_g_lock must be held as
 * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
 * This is only done when ills are added or removed from IPMP groups.
 *
 * To add or delete an ipif from the list of ipifs hanging off the ill,
 * ill_g_lock (writer) and ill_lock must be held and the thread must be
 * a writer on the associated ipsq.
 *
 * To add or delete an ill to the system, the ill_g_lock must be held as
 * writer and the thread must be a writer on the associated ipsq.
 *
 * To add or delete an ilm to an ill, the ill_lock must be held and the thread
 * must be a writer on the associated ipsq.
 *
 * Lock hierarchy
 *
 * Some lock hierarchy scenarios are listed below.
 *
 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
 * ill_g_lock -> ill_lock(s) -> phyint_lock
 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
 * ill_g_lock -> ip_addr_avail_lock
 * conn_lock -> irb_lock -> ill_lock -> ire_lock
 * ill_g_lock -> ip_g_nd_lock
 *
 * When more than 1 ill lock needs to be held, all ill lock addresses
 * are sorted on address and locked starting from the highest addressed lock
 * downward.
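 *
 * As an illustrative sketch of that ordering rule (a hypothetical
 * fragment, not a helper in this file; ill_a and ill_b are assumed
 * names), two ill locks would be taken highest address first:
 *
 *	ill_t *hi = (ill_a > ill_b) ? ill_a : ill_b;
 *	ill_t *lo = (ill_a > ill_b) ? ill_b : ill_a;
 *	mutex_enter(&hi->ill_lock);
 *	mutex_enter(&lo->ill_lock);
 *	...
 *	mutex_exit(&lo->ill_lock);
 *	mutex_exit(&hi->ill_lock);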
 *
 * IPsec scenarios
 *
 * ipsa_lock -> ill_g_lock -> ill_lock
 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
 * ipsec_capab_ills_lock -> ipsa_lock
 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
 *
 * Trusted Solaris scenarios
 *
 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
 * igsa_lock -> gcdb_lock
 * gcgrp_rwlock -> ire_lock
 * gcgrp_rwlock -> gcdb_lock
 *
 * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
 *
 * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
 * sq_lock -> conn_lock -> QLOCK(q)
 * ill_lock -> ft_lock -> fe_lock
 *
 * Routing/forwarding table locking notes:
 *
 * Lock acquisition order: Radix tree lock, irb_lock.
 * Requirements:
 * i.   Walker must not hold any locks during the walker callback.
 * ii.  Walker must not see a truncated tree during the walk because of any
 *      node deletion.
 * iii. Existing code assumes ire_bucket is valid if it is non-null and is used
 *      in many places in the code to walk the irb list. Thus even if all the
 *      ires in a bucket have been deleted, we still can't free the radix node
 *      until the ires have actually been inactive'd (freed).
 *
 * Tree traversal - Need to hold the global tree lock in read mode.
 * Before dropping the global tree lock, need to increment the ire_refcnt
 * to ensure that the radix node can't be deleted.
 *
 * Tree add - Need to hold the global tree lock in write mode to add a
 * radix node. To prevent the node from being deleted, increment the
 * irb_refcnt, after the node is added to the tree. The ire itself is
 * added later while holding the irb_lock, but not the tree lock.
 *
 * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
 * All associated ires must be inactive (i.e. freed), and irb_refcnt
 * must be zero.
 *
 * Walker - Increment irb_refcnt before calling the walker callback. Hold the
 * global tree lock (read mode) for traversal.
 *
 * IPsec notes:
 *
 * IP interacts with the IPsec code (AH/ESP) by tagging an M_CTL message
 * in front of the actual packet. For outbound datagrams, the M_CTL
 * contains an ipsec_out_t (defined in ipsec_info.h), which has the
 * information used by the IPsec code for applying the right level of
 * protection. The information initialized by IP in the ipsec_out_t
 * is determined by the per-socket policy or global policy in the system.
 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in
 * ipsec_info.h) which starts out with nothing in it. It gets filled
 * with the right information if it goes through the AH/ESP code, which
 * happens if the incoming packet is secure. The information initialized
 * by AH/ESP is later used by IP (during fanouts to ULP) to see whether
 * the policy requirements of the per-socket policy or global policy
 * are met or not.
 *
 * If there is both a per-socket policy (set using setsockopt) and a
 * global policy match for the 5 tuples of the socket,
 * ipsec_override_policy() makes the decision of which one to use.
 *
 * For fully connected sockets, i.e. dst, src [addr, port] are known,
 * conn_policy_cached is set, indicating that policy has been cached.
 * conn_in_enforce_policy may or may not be set depending on whether
 * there is a global policy match or per-socket policy match.
 * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
 * Once the right policy is set on the conn_t, policy cannot change for
 * this socket. This makes life simpler for TCP (UDP ?) where
 * re-transmissions go out with the same policy. For symmetry, policy
 * is cached for fully connected UDP sockets also. Thus if policy is cached,
 * it also implies that policy is latched, i.e. policy cannot change
 * on these sockets. As we have the right policy on the conn, we don't
 * have to look up global policy for every outbound and inbound datagram,
 * which serves as an optimization. Note that a global policy change
 * does not affect fully connected sockets if they have policy. If fully
 * connected sockets did not have any policy associated with them, a global
 * policy change may affect them.
 *
 * IP Flow control notes:
 * ---------------------
 * Non-TCP streams are flow controlled by IP. The way this is accomplished
 * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
 * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
 * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
 * functions.
 *
 * Per Tx ring udp flow control:
 * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
 * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
 *
 * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
 * To achieve best performance, outgoing traffic needs to be fanned out among
 * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
 * traffic out of the NIC and it takes a fanout hint. UDP connections pass
 * the address of connp as the fanout hint to mac_tx(). Under flow-controlled
 * conditions, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
 * cookie points to a specific Tx ring that is blocked. The cookie is used to
 * hash into an idl_tx_list[] entry in the idl_tx_list[] array. Each
 * idl_tx_list_t points to drain_lists (idl_t's). These drain lists store the
 * blocked UDP connp's. The drain list is not a single list but a configurable
 * number of lists.
 *
 * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
 * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE,
 * which is equal to 128. This array in turn contains a pointer to idl_t[],
 * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
 * list will point to the list of connp's that are flow controlled.
 *
 *                      ---------------   -------   -------   -------
 *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 *                   |  ---------------   -------   -------   -------
 *                   |  ---------------   -------   -------   -------
 *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 * ----------------  |  ---------------   -------   -------   -------
 * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
 * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
 *                   |  ---------------   -------   -------   -------
 *                   .     .              .         .         .
 *                   |  ---------------   -------   -------   -------
 *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 *                      ---------------   -------   -------   -------
 *                      ---------------   -------   -------   -------
 *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 *                   |  ---------------   -------   -------   -------
 *                   |  ---------------   -------   -------   -------
 * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
 * ----------------  |     .              .         .         .
 *                   |  ---------------   -------   -------   -------
 *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 *                      ---------------   -------   -------   -------
 *     .....
 * ----------------
 * |idl_tx_list[n]|-> ...
 * ----------------
 *
 * When mac_tx() returns a cookie, the cookie is used to hash into an
 * idl_tx_list in the ips_idl_tx_list[] array. Then conn_drain_insert() is
 * called, passing the idl_tx_list. The connp gets inserted in a drain list
 * pointed to by the idl_tx_list. conn_drain_list() asserts flow control for
 * the sockets (non stream based) and sets the QFULL condition for the
 * conn_wq. connp->conn_direct_blocked will be set to indicate the blocked
 * condition.
 *
 * The GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
 * A cookie is passed in the call to ill_flow_enable() that identifies the
 * blocked Tx ring. This cookie is used to get to the idl_tx_list that
 * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
 * and goes through each of the drain lists (q)enabling the conn_wq of the
 * first conn in each of the drain lists. This causes ip_wsrv to run for the
 * conn. ip_wsrv drains the queued messages, and removes the conn from the
 * drain list, if all messages were drained. It also qenables the next conn
 * in the drain list to continue the drain process.
 *
 * In reality the drain list is not a single list, but a configurable number
 * of lists. conn_drain_walk() in the IP module qenables the first conn in
 * each list. If the ip_wsrv of the next qenabled conn does not run, because
 * the stream closes, ip_close takes responsibility to qenable the next conn
 * in the drain list. conn_drain_insert and conn_drain_tail are the only
 * functions that manipulate this drain list. conn_drain_insert is called in
 * the ip_wput context itself (as opposed to from the ip_wsrv context for the
 * STREAMS case -- see below). The synchronization between drain insertion
 * and flow control wakeup is handled by using idl_txl->txl_lock.
 *
 * Flow control using STREAMS:
 * When ILL_DIRECT_CAPABLE() is not TRUE, the STREAMS flow control mechanism
 * is used. On the send side, if the packet cannot be sent down to the
 * driver by IP, because of a canput failure, IP does a putq on the conn_wq.
 * This will cause ip_wsrv to run on the conn_wq. ip_wsrv, in turn, inserts
 * the conn in a list of conn's that need to be drained when the flow
 * control condition subsides. The blocked connps are put in the first member
 * of the ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv
 * on the IP module. It calls conn_walk_drain(), passing ips_idl_tx_list[0].
 * ips_idl_tx_list[0] contains the drain lists of blocked conns. The
 * conn_wq of the first conn in the drain lists is (q)enabled to run.
 * ip_wsrv on this conn drains the queued messages, and removes the conn
 * from the drain list, if all messages were drained.
 * It also qenables the next conn in the drain list to continue the drain
 * process.
 *
 * If the ip_wsrv of the next qenabled conn does not run, because the
 * stream closes, ip_close takes responsibility to qenable the next conn in
 * the drain list. The directly called ip_wput path always does a putq, if
 * it cannot putnext. Thus synchronization problems are handled between
 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
 * functions that manipulate this drain list. Furthermore conn_drain_insert
 * is called only from ip_wsrv for the STREAMS case, and there can be only 1
 * instance of ip_wsrv running on a queue at any time. conn_drain_tail can
 * be simultaneously called from both ip_wsrv and ip_close.
 *
 * IPQOS notes:
 *
 * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
 * and IPQoS modules. IPPF includes hooks in IP at different control points
 * (callout positions) which direct packets to IPQoS modules for policy
 * processing. Policies, if present, are global.
 *
 * The callout positions are located in the following paths:
 *	o local_in (packets destined for this host)
 *	o local_out (packets originating from this host)
 *	o fwd_in (packets forwarded by this m/c - inbound)
 *	o fwd_out (packets forwarded by this m/c - outbound)
 * Hooks at these callout points can be enabled/disabled using the ndd variable
 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
 * By default all the callout positions are enabled.
 *
 * Outbound (local_out)
 * Hooks are placed in ip_wput_ire and ipsec_out_process.
 *
 * Inbound (local_in)
 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and the
 * TCP and UDP fanout routines.
 *
 * Forwarding (in and out)
 * Hooks are placed in ip_rput_forward.
 *
 * IP Policy Framework processing (IPPF processing)
 * Policy processing for a packet is initiated by ip_process, which ascertains
 * that the classifier (ipgpc) is loaded and configured, failing which the
 * packet resumes normal processing in IP. If the classifier is present, the
 * packet is acted upon by one or more IPQoS modules (action instances), per
 * filters configured in ipgpc, and resumes normal IP processing thereafter.
 * An action instance can drop a packet in the course of its processing.
 *
 * A boolean variable, ip_policy, is used in all the fanout routines that can
 * invoke ip_process for a packet. This variable indicates if the packet
 * should be sent for policy processing. The variable is set to B_TRUE by
 * default, i.e. when the routines are invoked in the normal ip processing
 * path for a packet. The two exceptions are ip_wput_local and
 * icmp_inbound_error_fanout; ip_policy is set to B_FALSE for all the routines
 * called in these two functions because, in the former case, we don't process
 * loopback traffic currently, while in the latter, the packets have already
 * been processed in icmp_inbound.
 *
 * Zones notes:
 *
 * The partitioning rules for networking are as follows:
 * 1) Packets coming from a zone must have a source address belonging to that
 * zone.
 * 2) Packets coming from a zone can only be sent on a physical interface on
 * which the zone has an IP address.
 * 3) Between two zones on the same machine, packet delivery is only allowed if
 * there's a matching route for the destination and zone in the forwarding
 * table.
 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
 * different zones can bind to the same port with the wildcard address
 * (INADDR_ANY).
 *
 * The granularity of interface partitioning is at the logical interface level.
 * Therefore, every zone has its own IP addresses, and incoming packets can be
 * attributed to a zone unambiguously. A logical interface is placed into a
 * zone using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the
 * ipif_t structure. Rule (1) is implemented by modifying the source address
 * selection algorithm so that the list of eligible addresses is filtered
 * based on the sending process zone.
 *
 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
 * across all zones, depending on their type. Here is the break-up:
 *
 * IRE type				Shared/exclusive
 * --------				----------------
 * IRE_BROADCAST			Exclusive
 * IRE_DEFAULT (default routes)		Shared (*)
 * IRE_LOCAL				Exclusive (x)
 * IRE_LOOPBACK				Exclusive
 * IRE_PREFIX (net routes)		Shared (*)
 * IRE_CACHE				Exclusive
 * IRE_IF_NORESOLVER (interface routes)	Exclusive
 * IRE_IF_RESOLVER (interface routes)	Exclusive
 * IRE_HOST (host routes)		Shared (*)
 *
 * (*) A zone can only use a default or off-subnet route if the gateway is
 * directly reachable from the zone, that is, if the gateway's address matches
 * one of the zone's logical interfaces.
 *
 * (x) IRE_LOCAL are handled a bit differently, since for all other entries
 * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source
 * when sending packets using the IRE. For IRE_LOCAL, ire_src_addr is the IP
 * address of the zone itself (the destination). Since IRE_LOCAL is used
 * for communication between zones, ip_wput_ire has special logic to set
 * the right source address when sending using an IRE_LOCAL.
 *
 * Furthermore, when ip_restrict_interzone_loopback is set (the default),
 * ire_cache_lookup restricts loopback using an IRE_LOCAL
 * between zones to the case when L2 would have conceptually looped the packet
 * back, i.e. the loopback which is required since neither Ethernet drivers
 * nor Ethernet hardware loops them back. This is the case when the normal
 * routes (ignoring IREs with different zoneids) would send out the packet on
 * the same ill as the ill with which the IRE_LOCAL is associated.
 *
 * Multiple zones can share a common broadcast address; typically all zones
 * share the 255.255.255.255 address. Incoming as well as locally originated
 * broadcast packets must be dispatched to all the zones on the broadcast
 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
 * since some zones may not be on the 10.16.72/24 network. To handle this, each
 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
 * sent to every zone that has an IRE_BROADCAST entry for the destination
 * address on the input ill, see conn_wantpacket().
 *
 * Applications in different zones can join the same multicast group address.
 * For IPv4, group memberships are per-logical interface, so they're already
 * inherently part of a zone.
 * For IPv6, group memberships are per-physical
 * interface, so we distinguish IPv6 group memberships based on group address,
 * interface and zoneid. In both cases, received multicast packets are sent to
 * every zone for which a group membership entry exists. On IPv6 we need to
 * check that the target zone still has an address on the receiving physical
 * interface; it could have been removed since the application issued the
 * IPV6_JOIN_GROUP.
 */

/*
 * Squeue Fanout flags:
 *	0: No fanout.
 *	1: Fanout across all squeues
 */
boolean_t	ip_squeue_fanout = 0;

/*
 * Maximum dups allowed per packet.
 */
uint_t ip_max_frag_dups = 10;

#define	IS_SIMPLE_IPH(ipha)						\
	((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)

/* RFC 1122 Conformance */
#define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER

#define	ILL_MAX_NAMELEN		LIFNAMSIZ

static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);

static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp, boolean_t isv6);
static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t,
		    ipha_t **);

static void	icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
		    ip_stack_t *);
static void	icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
		    uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static ipaddr_t	icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
		    mblk_t *, int, ip_stack_t *);
static void	icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
		    icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
		    ill_t *, zoneid_t);
static void	icmp_options_update(ipha_t *);
static void	icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
		    ip_stack_t *);
static void	icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
		    zoneid_t zoneid, ip_stack_t *);
static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
static void	icmp_redirect(ill_t *, mblk_t *);
static void	icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
		    ip_stack_t *);

static void	ip_arp_news(queue_t *, mblk_t *);
static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *,
		    ip_stack_t *);
mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
char		*ip_dot_addr(ipaddr_t, char *);
mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
int		ip_close(queue_t *, int);
static char	*ip_dot_saddr(uchar_t *, char *);
static void	ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
		    boolean_t, boolean_t, ill_t *, zoneid_t);
static void	ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
		    boolean_t, boolean_t, zoneid_t);
static void	ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
		    boolean_t, uint_t, boolean_t, boolean_t, ill_t *,
		    zoneid_t);
static void	ip_lrput(queue_t *, mblk_t *);
ipaddr_t	ip_net_mask(ipaddr_t);
void		ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
		    ip_stack_t *);
static void	ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
		    conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
char		*ip_nv_lookup(nv_t *, int);
static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *);
static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t
ip_param_register(IDP *ndp, ipparam_t *, size_t,
		    ipndp_t *, size_t);
static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
void		ip_rput(queue_t *, mblk_t *);
static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);
void		ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
static int	ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
		    ip_stack_t *);
static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
		    ire_t *, ip_stack_t *);
static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
		    mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
		    ip_stack_t *);
static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *,
		    uint32_t *, uint16_t *);
int		ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
		    ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
		    ip_stack_t *ipst);
static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static boolean_t ip_source_routed(ipha_t *, ip_stack_t *);
static boolean_t ip_source_route_included(ipha_t *);
static void	ip_trash_ire_reclaim_stack(ip_stack_t *);

static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
		    zoneid_t, ip_stack_t *, conn_t *);
static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *,
		    mblk_t *);
static void	ip_wput_local_options(ipha_t *, ip_stack_t *);
static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
		    zoneid_t, ip_stack_t *);

static void	conn_drain_init(ip_stack_t *);
static void	conn_drain_fini(ip_stack_t *);
static void	conn_drain_tail(conn_t *connp, boolean_t closing);

static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
static void	conn_setqfull(conn_t *);
static void	conn_clrqfull(conn_t *);

static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
static void	ip_stack_fini(netstackid_t stackid, void *arg);

static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
		    zoneid_t);
static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);

static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);

static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
		    ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t,
		    mblk_t *), ire_t *, conn_t *, boolean_t, ipaddr_t,
		    mcast_record_t, ipaddr_t, mblk_t *);
static void	ip_multirt_bad_mtu(ire_t *, uint32_t);

static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
		    caddr_t, cred_t *);
extern int	ip_helper_stream_setup(queue_t *, dev_t *, int, int,
		    cred_t *, boolean_t);
static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
		    cred_t *);
static int	ip_squeue_switch(int);

static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
static void	ip_kstat_fini(netstackid_t, kstat_t *);
static int	ip_kstat_update(kstat_t *kp, int rw);
static void	*icmp_kstat_init(netstackid_t);
static void	icmp_kstat_fini(netstackid_t, kstat_t *);
static int	icmp_kstat_update(kstat_t *kp, int rw);
static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
static void	ip_kstat2_fini(netstackid_t, kstat_t *);

static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
		    ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);

static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
		    ipha_t *, ill_t *, boolean_t, boolean_t);

static void	ipobs_init(ip_stack_t *);
static void	ipobs_fini(ip_stack_t *);
ipaddr_t	ip_g_all_ones = IP_HOST_MASK;

/* How long, in seconds, we allow frags to hang around. */
#define	IP_FRAG_TIMEOUT		15
#define	IPV6_FRAG_TIMEOUT	60

/*
 * Threshold which determines whether MDT should be used when
 * generating IP fragments; payload size must be greater than
 * this threshold for MDT to take place.
 */
#define	IP_WPUT_FRAG_MDT_MIN	32768

/* Settable in /etc/system only */
int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;

static long ip_rput_pullups;
int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */

vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */

int	ip_debug;

#ifdef DEBUG
uint32_t ipsechw_debug = 0;
#endif

/*
 * Multirouting/CGTP stuff
 */
int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */

/*
 * XXX following really should only be in a header. Would need more
 * header and .c clean up first.
 */
extern optdb_obj_t	ip_opt_obj;

ulong_t	ip_squeue_enter_unbound = 0;

/*
 * Named Dispatch Parameter Table.
 * All of these are alterable, within the min/max values given, at run time.
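 *
 * For example (a usage sketch, not code in this file), a parameter such
 * as ip_def_ttl below can be inspected or changed at run time with
 * ndd(1M):
 *
 *	# ndd -get /dev/ip ip_def_ttl
 *	# ndd -set /dev/ip ip_def_ttl 64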
 */
static ipparam_t	lcl_param_arr[] = {
	/* min	max		value	name */
	{ 0,	1,		0,	"ip_respond_to_address_mask_broadcast"},
	{ 0,	1,		1,	"ip_respond_to_echo_broadcast"},
	{ 0,	1,		1,	"ip_respond_to_echo_multicast"},
	{ 0,	1,		0,	"ip_respond_to_timestamp"},
	{ 0,	1,		0,	"ip_respond_to_timestamp_broadcast"},
	{ 0,	1,		1,	"ip_send_redirects"},
	{ 0,	1,		0,	"ip_forward_directed_broadcasts"},
	{ 0,	10,		0,	"ip_mrtdebug"},
	{ 5000,	999999999,	60000,	"ip_ire_timer_interval" },
	{ 60000, 999999999,	1200000, "ip_ire_arp_interval" },
	{ 60000, 999999999,	60000,	"ip_ire_redirect_interval" },
	{ 1,	255,		255,	"ip_def_ttl" },
	{ 0,	1,		0,	"ip_forward_src_routed"},
	{ 0,	256,		32,	"ip_wroff_extra" },
	{ 5000,	999999999,	600000,	"ip_ire_pathmtu_interval" },
	{ 8,	65536,		64,	"ip_icmp_return_data_bytes" },
	{ 0,	1,		1,	"ip_path_mtu_discovery" },
	{ 0,	240,		30,	"ip_ignore_delete_time" },
	{ 0,	1,		0,	"ip_ignore_redirect" },
	{ 0,	1,		1,	"ip_output_queue" },
	{ 1,	254,		1,	"ip_broadcast_ttl" },
	{ 0,	99999,		100,	"ip_icmp_err_interval" },
	{ 1,	99999,		10,	"ip_icmp_err_burst" },
	{ 0,	999999999,	1000000, "ip_reass_queue_bytes" },
	{ 0,	1,		0,	"ip_strict_dst_multihoming" },
	{ 1,	MAX_ADDRS_PER_IF, 256,	"ip_addrs_per_if"},
	{ 0,	1,		0,	"ipsec_override_persocket_policy" },
	{ 0,	1,		1,	"icmp_accept_clear_messages" },
	{ 0,	1,		1,	"igmp_accept_clear_messages" },
	{ 2,	999999999,	ND_DELAY_FIRST_PROBE_TIME,
				"ip_ndp_delay_first_probe_time"},
	{ 1,	999999999,	ND_MAX_UNICAST_SOLICIT,
				"ip_ndp_max_unicast_solicit"},
	{ 1,	255,		IPV6_MAX_HOPS,	"ip6_def_hops" },
	{ 8,	IPV6_MIN_MTU,	IPV6_MIN_MTU,	"ip6_icmp_return_data_bytes" },
	{ 0,	1,		0,	"ip6_forward_src_routed"},
	{ 0,	1,		1,	"ip6_respond_to_echo_multicast"},
	{ 0,	1,		1,	"ip6_send_redirects"},
	{ 0,	1,		0,	"ip6_ignore_redirect" },
	{ 0,	1,		0,	"ip6_strict_dst_multihoming" },

	{ 1,	8,		3,	"ip_ire_reclaim_fraction" },

	{ 0,	999999,		1000,	"ipsec_policy_log_interval" },

	{ 0,	1,		1,	"pim_accept_clear_messages" },
	{ 1000,	20000,		2000,	"ip_ndp_unsolicit_interval" },
	{ 1,	20,		3,	"ip_ndp_unsolicit_count" },
	{ 0,	1,		1,	"ip6_ignore_home_address_opt" },
	{ 0,	15,		0,	"ip_policy_mask" },
	{ 1000,	60000,		1000,	"ip_multirt_resolution_interval" },
	{ 0,	255,		1,	"ip_multirt_ttl" },
	{ 0,	1,		1,	"ip_multidata_outbound" },
	{ 0,	3600000,	300000,	"ip_ndp_defense_interval" },
	{ 0,	999999,		60*60*24, "ip_max_temp_idle" },
	{ 0,	1000,		1,	"ip_max_temp_defend" },
	{ 0,	1000,		3,	"ip_max_defend" },
	{ 0,	999999,		30,	"ip_defend_interval" },
	{ 0,	3600000,	300000,	"ip_dup_recovery" },
	{ 0,	1,		1,	"ip_restrict_interzone_loopback" },
	{ 0,	1,		1,	"ip_lso_outbound" },
	{ IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
	{ MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
	{ 68,	65535,		576,	"ip_pmtu_min" },
#ifdef DEBUG
	{ 0,	1,		0,	"ip6_drop_inbound_icmpv6" },
#else
	{ 0,	0,		0,	"" },
#endif
};

/*
 * Extended NDP table
 * The addresses for the first two are filled in to be ips_ip_g_forward
 * and ips_ipv6_forward at init time.
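 *
 * A minimal sketch of that fill-in (a hypothetical fragment; the actual
 * assignments are made in ip_stack_init on the per-stack copy of this
 * template and may differ in detail), assuming ndp points at that copy:
 *
 *	ndp[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data =
 *	    (caddr_t)&ipst->ips_ip_g_forward;
 *	ndp[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data =
 *	    (caddr_t)&ipst->ips_ipv6_forward;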
 */
static ipndp_t	lcl_ndp_arr[] = {
	/* getf			setf		data		name */
#define	IPNDP_IP_FORWARDING_OFFSET	0
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip_forwarding" },
#define	IPNDP_IP6_FORWARDING_OFFSET	1
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip6_forwarding" },
	{ ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
#define	IPNDP_CGTP_FILTER_OFFSET	4
	{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
	    "ip_cgtp_filter" },
	{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
	    "ip_debug" },
};

/*
 * Table of IP ioctls encoding the various properties of the ioctl and
 * indexed based on the last byte of the ioctl command. Occasionally there
 * is a clash, and there is more than 1 ioctl with the same last byte.
 * In such a case 1 ioctl is encoded in the ndx table and the remaining
 * ioctls are encoded in the misc table. An entry in the ndx table is
 * retrieved by indexing on the last byte of the ioctl command and comparing
 * the ioctl command with the value in the ndx table. In the event of a
 * mismatch the misc table is then searched sequentially for the desired
 * ioctl command.
 *
 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 */
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocdelrt, NULL },

	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_addr, NULL },

	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },

	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
			IPI_MODOK | IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_flags, NULL },

	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },

	/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
			IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
			ARP_CMD, ip_sioctl_arp, NULL },

	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
			MISC_CMD, if_unitsel, if_unitsel_restart },

	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			IF_CMD, ip_sioctl_sifname, NULL },

	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },

	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
			ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addif, NULL },
#define	SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_MODOK,
			LIF_CMD, ip_sioctl_get_flags, NULL },

	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 120 */ { O_SIOCGLIFCONF,
			0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			LIF_CMD, ip_sioctl_slifname,
			ip_sioctl_slifname_restart },

	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_lnkinfo, NULL },

	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
			MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* IPSEC ioctls handled in ip_sioctl_copyin_setup itself */
	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1278 1279 /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1280 1281 /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD, 1282 LIF_CMD, ip_sioctl_get_binding, NULL }, 1283 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), 1284 IPI_PRIV | IPI_WR, 1285 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 1286 /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), 1287 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, 1288 /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), 1289 IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, 1290 1291 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ 1292 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1293 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1294 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1295 1296 /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1297 1298 /* These are handled in ip_sioctl_copyin_setup itself */ 1299 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, 1300 MISC_CMD, NULL, NULL }, 1301 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, 1302 MISC_CMD, NULL, NULL }, 1303 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, 1304 1305 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1306 ip_sioctl_get_lifconf, NULL }, 1307 1308 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, 1309 XARP_CMD, ip_sioctl_arp, NULL }, 1310 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, 1311 XARP_CMD, ip_sioctl_arp, NULL }, 1312 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, 1313 XARP_CMD, ip_sioctl_arp, NULL }, 1314 1315 /* SIOCPOPSOCKFS is not handled by IP */ 1316 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, 1317 1318 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), 1319 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, 1320 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), 1321 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, 1322 ip_sioctl_slifzone_restart }, 1323 /* 172-174 are SCTP ioctls and not handled by IP */ 1324 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1325 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1326 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1327 /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq), 1328 IPI_GET_CMD, LIF_CMD, 1329 ip_sioctl_get_lifusesrc, 0 }, 1330 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), 1331 IPI_PRIV | IPI_WR, 1332 LIF_CMD, ip_sioctl_slifusesrc, 1333 NULL }, 1334 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, 1335 ip_sioctl_get_lifsrcof, NULL }, 1336 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, 1337 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1338 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, 1339 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1340 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, 1341 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1342 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, 1343 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1344 /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1345 /* SIOCSENABLESDP is handled by SDP */ 1346 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, 1347 /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, 1348 }; 1349 1350 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1351 1352 ip_ioctl_cmd_t ip_misc_ioctl_table[] = { 1353 { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1354 { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1355 
{ I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1356 { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1357 { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, 1358 { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1359 { IP_IOCTL, 0, 0, 0, NULL, NULL }, 1360 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, 1361 MISC_CMD, mrt_ioctl}, 1362 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD, 1363 MISC_CMD, mrt_ioctl}, 1364 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD, 1365 MISC_CMD, mrt_ioctl} 1366 }; 1367 1368 int ip_misc_ioctl_count = 1369 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1370 1371 int conn_drain_nthreads; /* Number of drainers reqd. */ 1372 /* Settable in /etc/system */ 1373 /* Defined in ip_ire.c */ 1374 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1375 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1376 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1377 1378 static nv_t ire_nv_arr[] = { 1379 { IRE_BROADCAST, "BROADCAST" }, 1380 { IRE_LOCAL, "LOCAL" }, 1381 { IRE_LOOPBACK, "LOOPBACK" }, 1382 { IRE_CACHE, "CACHE" }, 1383 { IRE_DEFAULT, "DEFAULT" }, 1384 { IRE_PREFIX, "PREFIX" }, 1385 { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1386 { IRE_IF_RESOLVER, "IF_RESOLV" }, 1387 { IRE_HOST, "HOST" }, 1388 { 0 } 1389 }; 1390 1391 nv_t *ire_nv_tbl = ire_nv_arr; 1392 1393 /* Simple ICMP IP Header Template */ 1394 static ipha_t icmp_ipha = { 1395 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1396 }; 1397 1398 struct module_info ip_mod_info = { 1399 IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT, 1400 IP_MOD_LOWAT 1401 }; 1402 1403 /* 1404 * Duplicate static symbols within a module confuses mdb; so we avoid the 1405 * problem by making the symbols here distinct from those in udp.c. 1406 */ 1407 1408 /* 1409 * Entry points for IP as a device and as a module. 1410 * FIXME: down the road we might want a separate module and driver qinit. 1411 * We have separate open functions for the /dev/ip and /dev/ip6 devices. 1412 */ 1413 static struct qinit iprinitv4 = { 1414 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL, 1415 &ip_mod_info 1416 }; 1417 1418 struct qinit iprinitv6 = { 1419 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL, 1420 &ip_mod_info 1421 }; 1422 1423 static struct qinit ipwinitv4 = { 1424 (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1425 &ip_mod_info 1426 }; 1427 1428 struct qinit ipwinitv6 = { 1429 (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1430 &ip_mod_info 1431 }; 1432 1433 static struct qinit iplrinit = { 1434 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL, 1435 &ip_mod_info 1436 }; 1437 1438 static struct qinit iplwinit = { 1439 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL, 1440 &ip_mod_info 1441 }; 1442 1443 /* For AF_INET aka /dev/ip */ 1444 struct streamtab ipinfov4 = { 1445 &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit 1446 }; 1447 1448 /* For AF_INET6 aka /dev/ip6 */ 1449 struct streamtab ipinfov6 = { 1450 &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit 1451 }; 1452 1453 #ifdef DEBUG 1454 static boolean_t skip_sctp_cksum = B_FALSE; 1455 #endif 1456 1457 /* 1458 * Prepend the zoneid using an ipsec_out_t for later use by functions like 1459 * ip_rput_v6(), ip_output(), etc. If the message 1460 * block already has a M_CTL at the front of it, then simply set the zoneid 1461 * appropriately. 
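 *
 * A hypothetical caller sketch (illustrative only; not taken from any
 * one real call site):
 *
 *	mp = ip_prepend_zoneid(mp, zoneid, ipst);
 *	if (mp == NULL)
 *		return;		(allocating the leading M_CTL failed)
 *
 * Callers must handle a NULL return, since allocating the leading
 * ipsec_out_t block can fail.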
 */
mblk_t *
ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
{
	mblk_t *first_mp;
	ipsec_out_t *io;

	ASSERT(zoneid != ALL_ZONES);
	if (mp->b_datap->db_type == M_CTL) {
		io = (ipsec_out_t *)mp->b_rptr;
		ASSERT(io->ipsec_out_type == IPSEC_OUT);
		io->ipsec_out_zoneid = zoneid;
		return (mp);
	}

	first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack);
	if (first_mp == NULL)
		return (NULL);
	io = (ipsec_out_t *)first_mp->b_rptr;
	/* This is not a secure packet */
	io->ipsec_out_secure = B_FALSE;
	io->ipsec_out_zoneid = zoneid;
	first_mp->b_cont = mp;
	return (first_mp);
}

/*
 * Copy an M_CTL-tagged message, preserving reference counts appropriately.
 */
mblk_t *
ip_copymsg(mblk_t *mp)
{
	mblk_t *nmp;
	ipsec_info_t *in;

	if (mp->b_datap->db_type != M_CTL)
		return (copymsg(mp));

	in = (ipsec_info_t *)mp->b_rptr;

	/*
	 * Note that M_CTL is also used for delivering ICMP error messages
	 * upstream to transport layers.
	 */
	if (in->ipsec_info_type != IPSEC_OUT &&
	    in->ipsec_info_type != IPSEC_IN)
		return (copymsg(mp));

	nmp = copymsg(mp->b_cont);

	if (in->ipsec_info_type == IPSEC_OUT) {
		return (ipsec_out_tag(mp, nmp,
		    ((ipsec_out_t *)in)->ipsec_out_ns));
	} else {
		return (ipsec_in_tag(mp, nmp,
		    ((ipsec_in_t *)in)->ipsec_in_ns));
	}
}

/* Generate an ICMP fragmentation needed message. */
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph.icmph_du_mtu = htons((uint16_t)mtu);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * icmp_inbound deals with ICMP messages in the following ways.
 *
 * 1) It needs to send a reply back and possibly deliver it
 *    to the "interested" upper clients.
 * 2) It needs to send it to the upper clients only.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers, e.g. TCP.
 *
 * We need to accommodate ICMP messages coming in clear until we get
 * everything secure from the wire. If icmp_accept_clear_messages
 * is zero, we check with the global policy and act accordingly. If
 * it is non-zero, we accept the message without any checks. But
 * *this does not mean* that this will be delivered to the upper
 * clients. By accepting we might send replies back, change our MTU
 * value etc., but delivery to the ULP/clients depends on their policy
 * dispositions.
 *
 * We handle the above four cases in the context of IPsec in the
 * following way:
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent a chosen
 *    plaintext attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both cases, if the returned
 *    message (IP header + 8 bytes) that caused the ICMP message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    the upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_clear_messages is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *	There are two cases.
 *
 *	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though the ULP
 *	   is *willing* to accept it in *clear*. This is fine as our global
 *	   disposition to ICMP messages asks us to reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_clear_messages is 1), but are not able
 *	   to deliver it to the ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DEST_UNREACHABLE messages with the following code
 *	   values:
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and the upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels. Currently, when we are unable to deliver
 *	     to the upper layer (due to policy failures) while IP has
 *	     adjusted ire_max_frag, the next outbound datagram would
 *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *	     will be sent with the right level of protection. Thus the
 *	     right value will be communicated even if we were not able
 *	     to communicate it when the message first arrived from the
 *	     wire. But this assumes there would be at least one outbound
 *	     datagram after IP has adjusted its ire_max_frag value. To
 *	     make things simpler, we accept in clear after the validation
 *	     of AH/ESP headers.
 *
 *	   - Other ICMP ERRORS : We may not be able to deliver it to the
 *	     upper layer depending on the level of protection the upper
 *	     layer expects and the disposition in ipsec_inbound_accept_clear().
 *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
 *	     should be accepted in clear when the upper layer expects secure.
 *	     Thus the communication may get aborted by some bad ICMP
 *	     packets.
 *
 * IPQoS Notes:
 * The only time a packet is sent for IPQoS processing here is when there
 * is no ICMP client and we are interested in the packet ourselves.
 * If there is a client, IPPF processing will take place in the
 * ip_fanout_proto routine.
 *
 * Zones notes:
 * The packet is only processed in the context of the specified zone: typically
 * only this zone will reply to an echo request, and only interested clients in
 * this zone will receive a copy of the packet. This means that the caller must
 * call icmp_inbound() for each relevant zone.
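 *
 * As a sketch only (not a verbatim caller), a broadcast packet visible
 * to several zones would therefore be handed in once per zone, along
 * the lines of:
 *
 *	for each zone interested in the packet:
 *		icmp_inbound(q, copymsg(mp), broadcast, ill, sum_valid,
 *		    sum, mctl_present, ip_policy, recv_ill, zoneid);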
 */
static void
icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
    int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
    ill_t *recv_ill, zoneid_t zoneid)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	int	iph_hdr_length;
	int	hdr_length;
	boolean_t	interested;
	uint32_t	ts;
	uchar_t	*wptr;
	ipif_t	*ipif;
	mblk_t *first_mp;
	ipsec_in_t *ii;
	timestruc_t now;
	uint32_t ill_index;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);
	}

	ipha = (ipha_t *)mp->b_rptr;
	if (ipst->ips_icmp_accept_clear_messages == 0) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present, ipst->ips_netstack);
		if (first_mp == NULL)
			return;
	}

	/*
	 * On a labeled system, we have to check whether the zone itself is
	 * permitted to receive raw traffic.
	 */
	if (is_system_labeled()) {
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		if (!tsol_can_accept_raw(mp, B_FALSE)) {
			ip1dbg(("icmp_inbound: zone %d can't receive raw",
			    zoneid));
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * We have accepted the ICMP message. It means that we will
	 * respond to the packet if needed. It may not be delivered
	 * to the upper client depending on the policy constraints
	 * and the disposition in ipsec_inbound_accept_clear.
	 */

	ASSERT(ill != NULL);

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
		/* Last chance to get real. */
		if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
		/* Refresh ipha following the pullup. */
		ipha = (ipha_t *)mp->b_rptr;
	}
	/* ICMP header checksum, including checksum field, should be zero. */
	if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
	    IP_CSUM(mp, iph_hdr_length, 0)) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
		freemsg(first_mp);
		return;
	}
	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));
	wptr = (uchar_t *)icmph + ICMPH_SIZE;
	/* We will set "interested" to "true" if we want a copy */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?). We aim to please, you pick it.
		 * Default is to do it.
		 */
		if (!broadcast && !CLASSD(ipha->ipha_dst)) {
			/* unicast: always respond */
			interested = B_TRUE;
		} else if (CLASSD(ipha->ipha_dst)) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (broadcast) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		break;
	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp &&
		    /* So is whether to respond if it was an IP broadcast. */
		    (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) {
			int tstamp_len = 3 * sizeof (uint32_t);

			if (wptr + tstamp_len > mp->b_wptr) {
				if (!pullupmsg(mp, wptr + tstamp_len -
				    mp->b_rptr)) {
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsInDiscards);
					freemsg(first_mp);
					return;
				}
				/* Refresh ipha following the pullup. */
				ipha = (ipha_t *)mp->b_rptr;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				wptr = (uchar_t *)icmph + ICMPH_SIZE;
			}
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		break;
	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if ((ipst->ips_ip_respond_to_address_mask_broadcast ||
		    !broadcast) &&
		    /* TODO m_pullup of complete header? */
		    (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) {
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		break;
	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/* See if there is an ICMP client. */
	if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */
		mblk_t *first_mp1;

		if (!interested) {
			ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
			    ip_policy, recv_ill, zoneid);
			return;
		}
		first_mp1 = ip_copymsg(first_mp);
		if (first_mp1 != NULL) {
			ip_fanout_proto(q, first_mp1, ill, ipha,
			    0, mctl_present, ip_policy, recv_ill, zoneid);
		}
	} else if (!interested) {
		freemsg(first_mp);
		return;
	} else {
		/*
		 * Initiate policy processing for this packet if ip_policy
		 * is true.
		 */
		if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
			ill_index = ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &mp, ill_index);
			if (mp == NULL) {
				if (mctl_present) {
					freeb(first_mp);
				}
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
				return;
			}
		}
	}
	/* We want to do something with it. */
	/* Check db_ref to make sure we can modify the packet. */
	if (mp->b_datap->db_ref > 1) {
		mblk_t	*first_mp1;

		first_mp1 = ip_copymsg(first_mp);
		freemsg(first_mp);
		if (!first_mp1) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
			return;
		}
		first_mp = first_mp1;
		if (mctl_present) {
			mp = first_mp->b_cont;
			ASSERT(mp != NULL);
		} else {
			mp = first_mp;
		}
		ipha = (ipha_t *)mp->b_rptr;
		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
		wptr = (uchar_t *)icmph + ICMPH_SIZE;
	}
	switch (icmph->icmph_type) {
	case ICMP_ADDRESS_MASK_REQUEST:
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		if (ipif == NULL) {
			freemsg(first_mp);
			return;
		}
		/*
		 * The outgoing interface must be IPv4.
		 */
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		break;
	case ICMP_ECHO_REQUEST:
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		break;
	case ICMP_TIME_STAMP_REQUEST: {
		uint32_t *tsp;

		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)wptr;
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		break;
	}
	default:
		ipha = (ipha_t *)&icmph[1];
		if ((uchar_t *)&ipha[1] > mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			return;
		}
		hdr_length = IPH_HDR_LENGTH(ipha);
		if (hdr_length < sizeof (ipha_t)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			return;
		}
		if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
			if (!pullupmsg(mp,
			    (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		switch (icmph->icmph_type) {
		case ICMP_REDIRECT:
			/*
			 * As there is no upper client to deliver, we don't
			 * need the first_mp any more.
			 */
			if (mctl_present) {
				freeb(first_mp);
			}
			icmp_redirect(ill, mp);
			return;
		case ICMP_DEST_UNREACHABLE:
			if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
				if (!icmp_inbound_too_big(icmph, ipha, ill,
				    zoneid, mp, iph_hdr_length, ipst)) {
					freemsg(first_mp);
					return;
				}
				/*
				 * icmp_inbound_too_big() may alter mp.
				 * Resynch ipha and icmph accordingly.
				 */
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/* FALLTHRU */
		default:
			/*
			 * IPQoS notes: Since we have already done IPQoS
			 * processing we don't want to do it again in
			 * the fanout routines called by
			 * icmp_inbound_error_fanout, hence the last
			 * argument, ip_policy, is B_FALSE.
			 */
			icmp_inbound_error_fanout(q, ill, first_mp, icmph,
			    ipha, iph_hdr_length, hdr_length, mctl_present,
			    B_FALSE, recv_ill, zoneid);
		}
		return;
	}
	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
	if (broadcast || CLASSD(ipha->ipha_dst)) {
		ipif_t	*ipif_chosen;
		/*
		 * Make it look like it was directed to us, so we don't look
		 * like a fool with a broadcast or multicast source address.
		 */
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		/*
		 * Make sure that we haven't grabbed an interface that's DOWN.
		 */
		if (ipif != NULL) {
			ipif_chosen = ipif_select_source(ipif->ipif_ill,
			    ipha->ipha_src, zoneid);
			if (ipif_chosen != NULL) {
				ipif_refrele(ipif);
				ipif = ipif_chosen;
			}
		}
		if (ipif == NULL) {
			ip0dbg(("icmp_inbound: "
			    "No source for broadcast/multicast:\n"
			    "\tsrc 0x%x dst 0x%x ill %p "
			    "ipif_lcl_addr 0x%x\n",
			    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
			    (void *)ill,
			    ill->ill_ipif->ipif_lcl_addr));
			freemsg(first_mp);
			return;
		}
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		ipha->ipha_dst = ipif->ipif_src_addr;
		ipif_refrele(ipif);
	}
	/* Reset time to live. */
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	ipha->ipha_ident = 0;
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	if (!mctl_present) {
		/*
		 * This packet should go out the same way as it
		 * came in, i.e. in clear. To make sure that global
		 * policy will not be applied to this in ip_wput_ire,
		 * we attach an IPSEC_IN mp and clear ipsec_in_secure.
		 */
		ASSERT(first_mp == mp);
		first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (first_mp == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(mp);
			return;
		}
		ii = (ipsec_in_t *)first_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		first_mp->b_cont = mp;
	} else {
		ii = (ipsec_in_t *)first_mp->b_rptr;
		ii->ipsec_in_ns = ipst->ips_netstack;	/* No netstack_hold */
	}
	ii->ipsec_in_zoneid = zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if (!ipsec_in_to_out(first_mp, ipha, NULL)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		return;
	}
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(WR(q), first_mp);
}

static ipaddr_t
icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp)
{
	conn_t *connp;
	connf_t *connfp;
	ipaddr_t nexthop_addr = INADDR_ANY;
	int hdr_length = IPH_HDR_LENGTH(ipha);
	uint16_t *up;
	uint32_t ports;
	ip_stack_t *ipst = ill->ill_ipst;

	up = (uint16_t *)((uchar_t *)ipha + hdr_length);
	switch (ipha->ipha_protocol) {
	case IPPROTO_TCP:
	{
		tcph_t *tcph;

		/* do a reverse lookup */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph,
		    TCPS_LISTEN, ipst);
		break;
	}
	case IPPROTO_UDP:
	{
		uint32_t dstport, srcport;

		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Extract ports in net byte order */
		dstport = htons(ntohl(ports) & 0xFFFF);
		srcport = htons(ntohl(ports) >> 16);

		connfp = &ipst->ips_ipcl_udp_fanout[
		    IPCL_UDP_HASH(dstport, ipst)];
		mutex_enter(&connfp->connf_lock);
		connp = connfp->connf_head;

		/* do a reverse lookup */
		while ((connp != NULL) &&
		    (!IPCL_UDP_MATCH(connp, dstport,
		    ipha->ipha_src, srcport, ipha->ipha_dst) ||
		    !IPCL_ZONE_MATCH(connp, zoneid))) {
			connp = connp->conn_next;
		}
		if (connp != NULL)
			CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		break;
	}
	case IPPROTO_SCTP:
	{
		in6_addr_t map_src, map_dst;

		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src);
		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst);
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		connp = sctp_find_conn(&map_src, &map_dst, ports,
		    zoneid, ipst->ips_netstack->netstack_sctp);
		if (connp == NULL) {
			connp = ipcl_classify_raw(mp, IPPROTO_SCTP,
			    zoneid, ports, ipha, ipst);
		} else {
			CONN_INC_REF(connp);
			SCTP_REFRELE(CONN2SCTP(connp));
		}
		break;
	}
	default:
	{
		ipha_t ripha;

		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;

		connfp = &ipst->ips_ipcl_proto_fanout[
		    ipha->ipha_protocol];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_PROTO_MATCH(connp,
			    ipha->ipha_protocol, &ripha, ill,
			    0, zoneid)) {
				CONN_INC_REF(connp);
				break;
			}
		}
		mutex_exit(&connfp->connf_lock);
	}
	}
	if (connp != NULL) {
		if (connp->conn_nexthop_set)
			nexthop_addr = connp->conn_nexthop_v4;
		CONN_DEC_REF(connp);
	}
	return (nexthop_addr);
}
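/*
 * Note on the reverse lookups above: the IP header quoted in an ICMP
 * error is the one we sent, so its source is our local address and its
 * destination is the peer. As an illustration (addresses and ports
 * hypothetical), if we sent a UDP datagram from 192.168.1.5:40000 to
 * 10.0.0.1:53 and a router returns an ICMP error quoting that header,
 * then up[0] holds our port 40000 and up[1] the peer's port 53;
 * icmp_get_nexthop_addr() swaps them when building "ports" so the
 * classifier sees the tuple the way an inbound reply would carry it.
 */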
/* Table from RFC 1191 */
static int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };

/*
 * Process received ICMP Packet too big.
 * After updating any IRE it does the fanout to any matching transport streams.
 * Assumes the message has been pulled up till the IP header that caused
 * the error.
 *
 * Returns B_FALSE on failure and B_TRUE on success.
 */
static boolean_t
icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
    zoneid_t zoneid, mblk_t *mp, int iph_hdr_length,
    ip_stack_t *ipst)
{
	ire_t *ire, *first_ire;
	int mtu, orig_mtu;
	int hdr_length;
	ipaddr_t nexthop_addr;
	boolean_t disable_pmtud;

	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
	ASSERT(ill != NULL);

	hdr_length = IPH_HDR_LENGTH(ipha);

	/* Drop if the original packet contained a source route */
	if (ip_source_route_included(ipha)) {
		return (B_FALSE);
	}
	/*
	 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of transport
	 * header.
	 */
	if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
	    mp->b_wptr) {
		if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
		    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip1dbg(("icmp_inbound_too_big: insufficient hdr\n"));
			return (B_FALSE);
		}
		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
		ipha = (ipha_t *)&icmph[1];
	}
	nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp);
	if (nexthop_addr != INADDR_ANY) {
		/* nexthop set */
		first_ire = ire_ctable_lookup(ipha->ipha_dst,
		    nexthop_addr, 0, NULL, ALL_ZONES, msg_getlabel(mp),
		    MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst);
	} else {
		/* nexthop not set */
		first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	}

	if (!first_ire) {
		ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
		    ntohl(ipha->ipha_dst)));
		return (B_FALSE);
	}

	/* Check for MTU discovery advice as described in RFC 1191 */
	mtu = ntohs(icmph->icmph_du_mtu);
	orig_mtu = mtu;
	disable_pmtud = B_FALSE;

	rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
	for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
	    ire = ire->ire_next) {
		/*
		 * Look for the connection to which this ICMP message is
		 * directed. If it has the IP_NEXTHOP option set, then the
		 * search is limited to IREs with the MATCH_IRE_PRIVATE
		 * option. Else the search is limited to regular IREs.
		 */
		if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
		    (nexthop_addr != ire->ire_gateway_addr)) ||
		    (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
		    (nexthop_addr != INADDR_ANY)))
			continue;

		mutex_enter(&ire->ire_lock);
		if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
			uint32_t length;
			int i;

			/*
			 * Use the table from RFC 1191 to figure out
			 * the next "plateau" based on the length in
			 * the original IP packet.
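			 *
			 * As a worked example: if the quoted packet
			 * claimed ipha_length 1500 and the router
			 * supplied no next-hop MTU (icmph_du_zero is
			 * nonzero), the scan below stops at the first
			 * table entry smaller than 1500, namely 1496,
			 * which becomes the guessed PMTU.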
			 */
			length = ntohs(ipha->ipha_length);
			DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire,
			    uint32_t, length);
			if (ire->ire_max_frag <= length &&
			    ire->ire_max_frag >= length - hdr_length) {
				/*
				 * Handle broken BSD 4.2 systems that
				 * return the wrong ipha_length in ICMP
				 * errors.
				 */
				length -= hdr_length;
			}
			for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
				if (length > icmp_frag_size_table[i])
					break;
			}
			if (i == A_CNT(icmp_frag_size_table)) {
				/* Smaller than 68! */
				disable_pmtud = B_TRUE;
				mtu = ipst->ips_ip_pmtu_min;
			} else {
				mtu = icmp_frag_size_table[i];
				if (mtu < ipst->ips_ip_pmtu_min) {
					mtu = ipst->ips_ip_pmtu_min;
					disable_pmtud = B_TRUE;
				}
			}
			/* Fool the ULP into believing our guessed PMTU. */
			icmph->icmph_du_zero = 0;
			icmph->icmph_du_mtu = htons(mtu);
		}
		if (disable_pmtud)
			ire->ire_frag_flag = 0;
		/* Reduce the IRE max frag value as advised. */
		ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
		if (ire->ire_max_frag == mtu) {
			/* Decreased it */
			ire->ire_marks |= IRE_MARK_PMTU;
		}
		mutex_exit(&ire->ire_lock);
		DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *,
		    ire, int, orig_mtu, int, mtu);
	}
	rw_exit(&first_ire->ire_bucket->irb_lock);
	ire_refrele(first_ire);
	return (B_TRUE);
}

/*
 * If the packet in error is self-encapsulated, icmp_inbound_error_fanout
 * calls this function.
 */
static mblk_t *
icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
{
	ipha_t *ipha;
	icmph_t *icmph;
	ipha_t *in_ipha;
	int length;

	ASSERT(mp->b_datap->db_type == M_DATA);

	/*
	 * For self-encapsulated packets, we added an extra IP header
	 * without the options. The inner IP header is the one from which
	 * the outer IP header was formed. Thus, we need to remove the
	 * outer IP header. To do this, we pull up the whole message
	 * and overlay whatever follows the outer IP header over the
	 * outer IP header.
	 */

	if (!pullupmsg(mp, -1))
		return (NULL);

	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];
	in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);

	/*
	 * The length that we want to overlay follows the inner
	 * IP header. Subtracting the IP header + ICMP header + outer
	 * IP header's length should give us the length that we want to
	 * overlay.
	 */
	length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) -
	    hdr_length;
	/*
	 * Overlay whatever follows the inner header over the
	 * outer header.
	 */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);

	/* Set the wptr to account for the outer header */
	mp->b_wptr -= hdr_length;
	return (mp);
}

/*
 * Fanout for ICMP errors containing IP-in-IPv4 packets. Returns B_TRUE if a
 * tunnel consumed the message, and B_FALSE otherwise.
 */
static boolean_t
icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill,
    ip_stack_t *ipst)
{
	conn_t *connp;

	if ((connp = ipcl_iptun_classify_v4(&ripha->ipha_src, &ripha->ipha_dst,
	    ipst)) == NULL)
		return (B_FALSE);

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
	connp->conn_recv(connp, first_mp, NULL);
	CONN_DEC_REF(connp);
	return (B_TRUE);
}

/*
 * Try to pass the ICMP message upstream in case the ULP cares.
 *
 * If the packet that caused the ICMP error is secure, we send
 * it to AH/ESP to make sure that the attached packet has a
 * valid association. ipha in the code below points to the
 * IP header of the packet that caused the error.
 *
 * For IPsec cases, we let the next-layer-up (which has access to
 * cached policy on the conn_t, or can query the SPD directly)
 * subtract out any IPsec overhead if they must. We therefore make no
 * adjustments here for IPsec overhead.
 *
 * IFN could have been generated locally or by some router.
 *
 * LOCAL : ip_wput_ire -> icmp_frag_needed could have generated this.
 *	    This happens because IP adjusted its value of MTU on an
 *	    earlier IFN message and could not tell the upper layer
 *	    the new adjusted value of MTU, e.g. the packet was encrypted
 *	    or there was not enough information to fan out to the upper
 *	    layers. Thus on the next outbound datagram, ip_wput_ire
 *	    generates the IFN, where IPsec processing has *not* been
 *	    done.
 *
 *	    ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed
 *	    could have generated this. This happens because the
 *	    ire_max_frag value in IP was set to a new value while the
 *	    IPsec processing was being done, and after we made the
 *	    fragmentation check in ip_wput_ire. Thus on return from
 *	    IPsec processing, ip_wput_ipsec_out finds that the new
 *	    length is > ire_max_frag and generates the IFN. As IPsec
 *	    processing is over, we fan out to AH/ESP to remove the
 *	    header.
 *
 *	    In both these cases, ipsec_in_loopback will be set indicating
 *	    that IFN was generated locally.
 *
 * ROUTER : IFN could be secure or non-secure.
 *
 *	    * SECURE : We use the IPSEC_IN to fan out to AH/ESP if the
 *	      packet in error has AH/ESP headers to validate the AH/ESP
 *	      headers. AH/ESP will verify whether there is a valid SA or
 *	      not and send it back. We will fan out again if we have more
 *	      data in the packet.
 *
 *	      If the packet in error does not have AH/ESP, we handle it
 *	      like any other case.
 *
 *	    * NON_SECURE : If the packet in error has AH/ESP headers,
 *	      we attach a dummy ipsec_in and send it up to AH/ESP
 *	      for validation. AH/ESP will verify whether there is a
 *	      valid SA or not and send it back. We will fan out again if
 *	      we have more data in the packet.
 *
 *	      If the packet in error does not have AH/ESP, we handle it
 *	      like any other case.
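 *
 * To summarize the dispositions described above:
 *
 *	origin	packet in error		action
 *	------	---------------		------
 *	local	any			ipsec_in_loopback is set
 *	router	has AH/ESP headers	fan out to AH/ESP to validate the SA
 *	router	no AH/ESP headers	handle like any other ICMP error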
 */
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t *up;	/* Pointer to ports in ULP header */
	uint32_t ports;	/* reversed ports for fanout */
	ipha_t ripha;	/* With reversed addresses */
	mblk_t *first_mp;
	ipsec_in_t *ii;
	tcph_t	*tcph;
	conn_t	*connp;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	} else {
		ii = NULL;
	}

	/*
	 * We need a separate IP header with the source and destination
	 * addresses reversed to do fanout/classification because the ipha in
	 * the ICMP error is in the form we sent it out.
	 */
	ripha.ipha_src = ipha->ipha_dst;
	ripha.ipha_dst = ipha->ipha_src;
	ripha.ipha_protocol = ipha->ipha_protocol;
	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;

	ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
	    ntohl(ipha->ipha_dst),
	    icmph->icmph_type, icmph->icmph_code));

	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/* Attempt to find a client stream based on port. */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];
		ip2dbg(("icmp_inbound_error: UDP ports %d to %d\n",
		    ntohs(up[0]), ntohs(up[1])));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
		    SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/* Find an SCTP client stream for this packet. */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
		    mctl_present, ip_policy, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;
		ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

		/*
		 * We need an IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii != NULL) {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is an ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPsec
			 * headers.
			 *
			 * NOTE: ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		} else {
			/*
			 * IPSEC_IN is not present. We attach an ipsec_in
			 * message and send up to IPsec for validating
			 * and removing the IPsec headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPsec, we don't mistakenly think that this
			 * is a secure packet that came from the network.
			 *
			 * NOTE: ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
			if (first_mp == NULL) {
				freemsg(mp);
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}

		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
			    mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    hdr_length + sizeof (ipha_t) -
				    mp->b_rptr)) {
					goto discard_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that the length has to be
			 * at least the size of the IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION) ||
			    IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t))
				goto discard_pkt;
			/* Check for self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto discard_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsulated,
				 * yet we found it further encapsulated,
				 * which we could not possibly have
				 * generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					goto discard_pkt;
				}
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}

		DB_TYPE(mp) = M_CTL;
		if (icmp_inbound_iptun_fanout(first_mp, &ripha, ill, ipst))
			return;
		/*
		 * No IP tunnel is interested; fall through and see
		 * if a raw socket will want it.
		 */
		/* FALLTHRU */
	default:
		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
discard_pkt:
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}

/*
 * Common IP options parser.
 *
 * Setup routine: fill in *optp with options-parsing state, then
 * tail-call ipoptp_next to return the first option.
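 *
 * The expected usage pattern, as in ip_opt_get_user() below, is:
 *
 *	for (optval = ipoptp_first(&opts, ipha);
 *	    optval != IPOPT_EOL;
 *	    optval = ipoptp_next(&opts)) {
 *		opt = opts.ipoptp_cur;
 *		...handle the option...
 *	}
 *
 * with opts.ipoptp_flags checked for IPOPTP_ERROR whenever the header
 * has not already been validated.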
 */
uint8_t
ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
{
	uint32_t totallen;	/* total length of all options */

	totallen = ipha->ipha_version_and_hdr_length -
	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
	totallen <<= 2;
	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
	optp->ipoptp_end = optp->ipoptp_next + totallen;
	optp->ipoptp_flags = 0;
	return (ipoptp_next(optp));
}

/*
 * Common IP options parser: extract next option.
 */
uint8_t
ipoptp_next(ipoptp_t *optp)
{
	uint8_t *end = optp->ipoptp_end;
	uint8_t *cur = optp->ipoptp_next;
	uint8_t opt, len, pointer;

	/*
	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
	 * has been corrupted.
	 */
	ASSERT(cur <= end);

	if (cur == end)
		return (IPOPT_EOL);

	opt = cur[IPOPT_OPTVAL];

	/*
	 * Skip any NOP options.
	 */
	while (opt == IPOPT_NOP) {
		cur++;
		if (cur == end)
			return (IPOPT_EOL);
		opt = cur[IPOPT_OPTVAL];
	}

	if (opt == IPOPT_EOL)
		return (IPOPT_EOL);

	/*
	 * Option requiring a length.
	 */
	if ((cur + 1) >= end) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	len = cur[IPOPT_OLEN];
	if (len < 2) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	optp->ipoptp_cur = cur;
	optp->ipoptp_len = len;
	optp->ipoptp_next = cur + len;
	if (cur + len > end) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}

	/*
	 * For the options which require a pointer field, make sure
	 * it's there, and make sure it points to either something
	 * inside this option, or the end of the option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_TS:
	case IPOPT_LSRR:
	case IPOPT_SSRR:
		if (len <= IPOPT_OFFSET) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		pointer = cur[IPOPT_OFFSET];
		if (pointer - 1 > len) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		break;
	}

	/*
	 * Sanity check the pointer field based on the type of the
	 * option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_SSRR:
	case IPOPT_LSRR:
		if (pointer < IPOPT_MINOFF_SR)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		break;
	case IPOPT_TS:
		if (pointer < IPOPT_MINOFF_IT)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		/*
		 * Note that the Internet Timestamp option also
		 * contains two four-bit fields (the Overflow field
		 * and the Flag field), which follow the pointer
		 * field. We don't need to check that these fields
		 * fall within the length of the option because this
		 * was implicitly done above. We've checked that the
		 * pointer value is at least IPOPT_MINOFF_IT, and that
		 * it falls within the option. Since IPOPT_MINOFF_IT >
		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
		 */
		ASSERT(len > IPOPT_POS_OV_FLG);
		break;
	}

	return (opt);
}

/*
 * Use the outgoing IP header to create an IP_OPTIONS option the way
 * it was passed down from the application.
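 *
 * The first IP_ADDR_LEN bytes of buf are reserved for the final
 * destination: for a source-routed header this is the last address in
 * the option (the true final destination), and it is zeroed otherwise.
 * As an illustration (addresses hypothetical), a header with ipha_dst
 * R1 carrying an LSRR option <R2, D> comes back as final destination D
 * followed by an LSRR option <R1, R2>.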
 */
int
ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
{
	ipoptp_t	opts;
	const uchar_t	*opt;
	uint8_t		optval;
	uint8_t		optlen;
	uint32_t	len = 0;
	uchar_t		*buf1 = buf;

	buf += IP_ADDR_LEN;	/* Leave room for final destination */
	len += IP_ADDR_LEN;
	bzero(buf1, IP_ADDR_LEN);

	/*
	 * OK to cast away const here, as we don't store through the returned
	 * opts.ipoptp_cur pointer.
	 */
	for (optval = ipoptp_first(&opts, (ipha_t *)ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		int	off;

		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:

			/*
			 * Insert ipha_dst as the first entry in the source
			 * route and move down the entries one step.
			 * The last entry gets placed at buf1.
			 */
			buf[IPOPT_OPTVAL] = optval;
			buf[IPOPT_OLEN] = optlen;
			buf[IPOPT_OFFSET] = optlen;

			off = optlen - IP_ADDR_LEN;
			if (off < 0) {
				/* No entries in source route */
				break;
			}
			/* Last entry in source route */
			bcopy(opt + off, buf1, IP_ADDR_LEN);
			off -= IP_ADDR_LEN;

			while (off > 0) {
				bcopy(opt + off,
				    buf + off + IP_ADDR_LEN,
				    IP_ADDR_LEN);
				off -= IP_ADDR_LEN;
			}
			/* ipha_dst into first slot */
			bcopy(&ipha->ipha_dst,
			    buf + off + IP_ADDR_LEN,
			    IP_ADDR_LEN);
			buf += optlen;
			len += optlen;
			break;

		case IPOPT_COMSEC:
		case IPOPT_SECURITY:
			/* if passing up a label is not ok, then remove */
			if (is_system_labeled())
				break;
			/* FALLTHROUGH */
		default:
			bcopy(opt, buf, optlen);
			buf += optlen;
			len += optlen;
			break;
		}
	}
done:
	/* Pad the resulting options */
	while (len & 0x3) {
		*buf++ = IPOPT_EOL;
		len++;
	}
	return (len);
}

/*
 * Update any record route or timestamp options to include this host.
 * Reverse any source route option.
 * This routine assumes that the options are well formed, i.e. that they
 * have already been checked.
 */
static void
icmp_options_update(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	ipaddr_t	src;		/* Our local address */
	ipaddr_t	dst;

	ip2dbg(("icmp_options_update\n"));
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		ip2dbg(("icmp_options_update: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
		int off1, off2;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Reverse the source route. The first entry
			 * should be the next to last one in the current
			 * source route (the last entry is our address).
			 * The last entry should be the final destination.
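			 *
			 * In other words, the code below first
			 * exchanges the last recorded address with
			 * ipha_dst, then swaps the remaining entries
			 * end for end (off1 walking forward, off2
			 * walking backward) so the reply retraces the
			 * path.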
2977 */ 2978 off1 = IPOPT_MINOFF_SR - 1; 2979 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 2980 if (off2 < 0) { 2981 /* No entries in source route */ 2982 ip1dbg(( 2983 "icmp_options_update: bad src route\n")); 2984 break; 2985 } 2986 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN); 2987 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN); 2988 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN); 2989 off2 -= IP_ADDR_LEN; 2990 2991 while (off1 < off2) { 2992 bcopy((char *)opt + off1, &src, IP_ADDR_LEN); 2993 bcopy((char *)opt + off2, (char *)opt + off1, 2994 IP_ADDR_LEN); 2995 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN); 2996 off1 += IP_ADDR_LEN; 2997 off2 -= IP_ADDR_LEN; 2998 } 2999 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 3000 break; 3001 } 3002 } 3003 } 3004 3005 /* 3006 * Process received ICMP Redirect messages. 3007 */ 3008 static void 3009 icmp_redirect(ill_t *ill, mblk_t *mp) 3010 { 3011 ipha_t *ipha; 3012 int iph_hdr_length; 3013 icmph_t *icmph; 3014 ipha_t *ipha_err; 3015 ire_t *ire; 3016 ire_t *prev_ire; 3017 ire_t *save_ire; 3018 ipaddr_t src, dst, gateway; 3019 iulp_t ulp_info = { 0 }; 3020 int error; 3021 ip_stack_t *ipst; 3022 3023 ASSERT(ill != NULL); 3024 ipst = ill->ill_ipst; 3025 3026 ipha = (ipha_t *)mp->b_rptr; 3027 iph_hdr_length = IPH_HDR_LENGTH(ipha); 3028 if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < 3029 sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { 3030 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3031 freemsg(mp); 3032 return; 3033 } 3034 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 3035 ipha_err = (ipha_t *)&icmph[1]; 3036 src = ipha->ipha_src; 3037 dst = ipha_err->ipha_dst; 3038 gateway = icmph->icmph_rd_gateway; 3039 /* Make sure the new gateway is reachable somehow. */ 3040 ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, 3041 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3042 /* 3043 * Make sure we had a route for the dest in question and that 3044 * that route was pointing to the old gateway (the source of the 3045 * redirect packet.) 3046 */ 3047 prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES, 3048 NULL, MATCH_IRE_GW, ipst); 3049 /* 3050 * Check that 3051 * the redirect was not from ourselves 3052 * the new gateway and the old gateway are directly reachable 3053 */ 3054 if (!prev_ire || 3055 !ire || 3056 ire->ire_type == IRE_LOCAL) { 3057 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); 3058 freemsg(mp); 3059 if (ire != NULL) 3060 ire_refrele(ire); 3061 if (prev_ire != NULL) 3062 ire_refrele(prev_ire); 3063 return; 3064 } 3065 3066 /* 3067 * Should we use the old ULP info to create the new gateway? From 3068 * a user's perspective, we should inherit the info so that it 3069 * is a "smooth" transition. If we do not do that, then new 3070 * connections going thru the new gateway will have no route metrics, 3071 * which is counter-intuitive to user. From a network point of 3072 * view, this may or may not make sense even though the new gateway 3073 * is still directly connected to us so the route metrics should not 3074 * change much. 3075 * 3076 * But if the old ire_uinfo is not initialized, we do another 3077 * recursive lookup on the dest using the new gateway. There may 3078 * be a route to that. If so, use it to initialize the redirect 3079 * route. 

/*
 * Process received ICMP Redirect messages.
 */
static void
icmp_redirect(ill_t *ill, mblk_t *mp)
{
	ipha_t	*ipha;
	int	iph_hdr_length;
	icmph_t	*icmph;
	ipha_t	*ipha_err;
	ire_t	*ire;
	ire_t	*prev_ire;
	ire_t	*save_ire;
	ipaddr_t src, dst, gateway;
	iulp_t	ulp_info = { 0 };
	int	error;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	ipha = (ipha_t *)mp->b_rptr;
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
		freemsg(mp);
		return;
	}
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha_err = (ipha_t *)&icmph[1];
	src = ipha->ipha_src;
	dst = ipha_err->ipha_dst;
	gateway = icmph->icmph_rd_gateway;
	/* Make sure the new gateway is reachable somehow. */
	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	/*
	 * Make sure we had a route for the dest in question and that
	 * that route was pointing to the old gateway (the source of the
	 * redirect packet.)
	 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW, ipst);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway?  From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition.  If we do not do that, then new
	 * connections going thru the new gateway will have no route metrics,
	 * which is counter-intuitive to user.  From a network point of
	 * view, this may or may not make sense even though the new gateway
	 * is still directly connected to us so the route metrics should not
	 * change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway.  There may
	 * be a route to that.  If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
		    ipst);
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		freemsg(mp);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association.  This will allow us to remember that
	 * someone we believe told us to use the particular gateway.
	 */
	save_ire = ire;
	ire = ire_create(
	    (uchar_t *)&dst,			/* dest addr */
	    (uchar_t *)&ip_g_all_ones,		/* mask */
	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
	    (uchar_t *)&gateway,		/* gateway addr */
	    &save_ire->ire_max_frag,		/* max frag */
	    NULL,				/* no src nce */
	    NULL,				/* no rfq */
	    NULL,				/* no stq */
	    IRE_HOST,
	    NULL,				/* ipif */
	    0,					/* cmask */
	    0,					/* phandle */
	    0,					/* ihandle */
	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
	    &ulp_info,
	    NULL,				/* tsol_gc_t */
	    NULL,				/* gcgrp */
	    ipst);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
	ire_refrele(save_ire);
	atomic_inc_32(&ipst->ips_ip_redirect_cnt);

	if (error == 0) {
		ire_refrele(ire);	/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
	}

	/*
	 * Delete any existing IRE_HOST type redirect ires for this destination.
	 * This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
	if (prev_ire != NULL) {
		if (prev_ire->ire_flags & RTF_DYNAMIC)
			ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}
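
/*
 * Illustrative sketch (not part of this file's build): the acceptance
 * test above reduced to a predicate.  A redirect is believed only when
 * the advertised gateway is directly reachable, the redirect does not
 * point back at ourselves, and the destination was previously routed
 * through the advertising router.  The boolean inputs stand for the IRE
 * lookups that icmp_redirect() performs; names are local to the sketch.
 */
#ifdef notdef
#include <stdbool.h>

static bool
ex_redirect_acceptable(bool gw_onlink, bool gw_is_self, bool had_route_via_src)
{
	if (!gw_onlink)			/* new gateway not directly reachable */
		return (false);
	if (gw_is_self)			/* redirect points back at us */
		return (false);
	if (!had_route_via_src)		/* old route didn't use the sender */
		return (false);
	return (true);
}
#endif /* notdef */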

/*
 * Generate an ICMP parameter problem message.
 */
static void
icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_PARAM_PROBLEM;
	icmph.icmph_pp_ptr = ptr;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * Build and ship an IPv4 ICMP message using the packet data in mp, and
 * the ICMP header pointed to by "stuff".  (May be called as writer.)
 * Note: assumes that icmp_pkt_err_ok has been called to verify that
 * an icmp error packet can be sent.
 * Assigns an appropriate source address to the packet.  If ipha_dst is
 * one of our addresses use it for source.  Otherwise pick a source based
 * on a route lookup back to ipha_src.
 * Note that ipha_src must be set here since the
 * packet is likely to arrive on an ill queue in ip_wput() which will
 * not set a source address.
 */
static void
icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
    boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
{
	ipaddr_t dst;
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*mp1;
	ipaddr_t src;
	ire_t	*ire;
	mblk_t	*ipsec_mp;
	ipsec_out_t	*io = NULL;

	if (mctl_present) {
		/*
		 * If it is:
		 *
		 * 1) an IPSEC_OUT, then this is caused by an outbound
		 *    datagram originating on this host.  IPsec processing
		 *    may or may not have been done.  Refer to comments above
		 *    icmp_inbound_error_fanout for details.
		 *
		 * 2) an IPSEC_IN, if we are generating an ICMP message
		 *    for an incoming datagram destined for us, i.e. called
		 *    from ip_fanout_send_icmp.
		 */
		ipsec_info_t *in;
		ipsec_mp = mp;
		mp = ipsec_mp->b_cont;

		in = (ipsec_info_t *)ipsec_mp->b_rptr;
		ipha = (ipha_t *)mp->b_rptr;

		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
		    in->ipsec_info_type == IPSEC_IN);

		if (in->ipsec_info_type == IPSEC_IN) {
			/*
			 * Convert the IPSEC_IN to IPSEC_OUT.
			 */
			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
				return;
			}
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			ASSERT(in->ipsec_info_type == IPSEC_OUT);
			io = (ipsec_out_t *)in;
			/*
			 * Clear out ipsec_out_proc_begin, so we do a fresh
			 * ire lookup.
			 */
			io->ipsec_out_proc_begin = B_FALSE;
		}
		ASSERT(zoneid != ALL_ZONES);
		/*
		 * The IPSEC_IN (now an IPSEC_OUT) didn't have its zoneid
		 * initialized.  We need to do that now.
		 */
		io->ipsec_out_zoneid = zoneid;
	} else {
		/*
		 * This is in clear.  The icmp message we are building
		 * here should go out in clear.
		 *
		 * Pardon the convolution of it all, but it's easier to
		 * allocate a "use cleartext" IPSEC_IN message and convert
		 * it than it is to allocate a new one.
		 */
		ipsec_in_t *ii;
		ASSERT(DB_TYPE(mp) == M_DATA);
		ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (ipsec_mp == NULL) {
			freemsg(mp);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		ii = (ipsec_in_t *)ipsec_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		/*
		 * For trusted extensions using a shared IP address we can
		 * send using any zoneid.
		 */
		if (zoneid == ALL_ZONES)
			ii->ipsec_in_zoneid = GLOBAL_ZONEID;
		else
			ii->ipsec_in_zoneid = zoneid;
		ipsec_mp->b_cont = mp;
		ipha = (ipha_t *)mp->b_rptr;
		/*
		 * Convert the IPSEC_IN to IPSEC_OUT.
		 */
		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
	}

	/* Remember our eventual destination */
	dst = ipha->ipha_src;

	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
	if (ire != NULL &&
	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
		src = ipha->ipha_dst;
	} else {
		if (ire != NULL)
			ire_refrele(ire);
		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
		    ipst);
		if (ire == NULL) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ire->ire_src_addr;
	}

	if (ire != NULL)
		ire_refrele(ire);

	/*
	 * Check if we can send back more than 8 bytes in addition to
	 * the IP header.  We try to send 64 bytes of data and the internal
	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
	 */
	len_needed = IPH_HDR_LENGTH(ipha);
	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
	    ipha->ipha_protocol == IPPROTO_IPV6) {

		if (!pullupmsg(mp, -1)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			freemsg(ipsec_mp);
			return;
		}
		ipha = (ipha_t *)mp->b_rptr;

		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
			    len_needed));
		} else {
			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);

			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
			len_needed += ip_hdr_length_v6(mp, ip6h);
		}
	}
	len_needed += ipst->ips_ip_icmp_return;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}
	/* Make sure we propagate the cred/label for TX */
	mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
	if (mp1 == NULL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
		freemsg(ipsec_mp);
		return;
	}
	mp1->b_cont = mp;
	mp = mp1;
	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
	    ipsec_mp->b_rptr == (uint8_t *)io &&
	    io->ipsec_out_type == IPSEC_OUT);
	ipsec_mp->b_cont = mp;

	/*
	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for ipsec_out_icmp_loopback).
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	*ipha = icmp_ipha;
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}
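
/*
 * Illustrative sketch (not part of this file's build): icmp_pkt() zeroes
 * icmph_checksum and then lets IP_CSUM() fold a ones'-complement sum
 * over the ICMP header and payload of the mblk chain.  Below is a
 * minimal flat-buffer version of that checksum, assuming the message is
 * contiguous (the real macro walks b_cont chains); names are local to
 * the sketch.
 */
#ifdef notdef
#include <stdint.h>
#include <stddef.h>

static uint16_t
ex_icmp_cksum(const void *buf, size_t len)
{
	const uint16_t *p = buf;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len == 1)			/* odd trailing byte */
		sum += *(const uint8_t *)p;

	/* Fold the carries back in and complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}
#endif /* notdef */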

/*
 * Determine if an ICMP error packet can be sent given the rate limit.
 * The limit consists of an average frequency (icmp_pkt_err_interval measured
 * in milliseconds) and a burst size.  Burst size number of packets can
 * be sent arbitrarily closely spaced.
 * The state is tracked using two variables to implement an approximate
 * token bucket filter:
 *	icmp_pkt_err_last - lbolt value when the last burst started
 *	icmp_pkt_err_sent - number of packets sent in current burst
 */
boolean_t
icmp_err_rate_limit(ip_stack_t *ipst)
{
	clock_t now = TICK_TO_MSEC(lbolt);
	uint_t refilled; /* Number of packets refilled in tbf since last */
	/* Guard against changes by loading into local variable */
	uint_t err_interval = ipst->ips_ip_icmp_err_interval;

	if (err_interval == 0)
		return (B_FALSE);

	if (ipst->ips_icmp_pkt_err_last > now) {
		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
		ipst->ips_icmp_pkt_err_last = 0;
		ipst->ips_icmp_pkt_err_sent = 0;
	}
	/*
	 * If we are in a burst update the token bucket filter.
	 * Update the "last" time to be close to "now" but make sure
	 * we don't lose precision.
	 */
	if (ipst->ips_icmp_pkt_err_sent != 0) {
		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
		if (refilled > ipst->ips_icmp_pkt_err_sent) {
			ipst->ips_icmp_pkt_err_sent = 0;
		} else {
			ipst->ips_icmp_pkt_err_sent -= refilled;
			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
		}
	}
	if (ipst->ips_icmp_pkt_err_sent == 0) {
		/* Start of new burst */
		ipst->ips_icmp_pkt_err_last = now;
	}
	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
		ipst->ips_icmp_pkt_err_sent++;
		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
		    ipst->ips_icmp_pkt_err_sent));
		return (B_FALSE);
	}
	ip1dbg(("icmp_err_rate_limit: dropped\n"));
	return (B_TRUE);
}
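
/*
 * Illustrative sketch (not part of this file's build): the approximate
 * token bucket above in isolation.  "now" is in milliseconds, interval
 * is the per-token refill time, burst the bucket depth; returns true
 * when the packet should be dropped.  This mirrors the
 * ips_icmp_pkt_err_{last,sent} bookkeeping; names are local to the
 * sketch.
 */
#ifdef notdef
#include <stdbool.h>
#include <stdint.h>

struct ex_tbf {
	int64_t	 last;	/* when the current burst started (ms) */
	uint32_t sent;	/* packets charged in the current burst */
};

static bool
ex_icmp_rate_limited(struct ex_tbf *t, int64_t now, uint32_t interval,
    uint32_t burst)
{
	uint32_t refilled;

	if (interval == 0)
		return (false);		/* rate limiting disabled */

	if (t->sent != 0) {
		/* Credit back one token per elapsed interval. */
		refilled = (uint32_t)((now - t->last) / interval);
		if (refilled > t->sent) {
			t->sent = 0;
		} else {
			t->sent -= refilled;
			t->last += (int64_t)refilled * interval;
		}
	}
	if (t->sent == 0)
		t->last = now;		/* start of a new burst */
	if (t->sent < burst) {
		t->sent++;
		return (false);		/* within budget: send it */
	}
	return (true);			/* over budget: drop */
}
#endif /* notdef */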

/*
 * Check if it is ok to send an IPv4 ICMP error packet in
 * response to the IPv4 packet in mp.
 * Free the message and return null if no
 * ICMP error packet should be sent.
 */
static mblk_t *
icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	ire_t	*src_ire;
	ire_t	*dst_ire;

	if (!mp)
		return (NULL);
	ipha = (ipha_t *)mp->b_rptr;
	if (ip_csum_hdr(ipha)) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
		freemsg(mp);
		return (NULL);
	}
	src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	if (src_ire != NULL || dst_ire != NULL ||
	    CLASSD(ipha->ipha_dst) ||
	    CLASSD(ipha->ipha_src) ||
	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
		/* Note: only errors to the fragment with offset 0 */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
		freemsg(mp);
		if (src_ire != NULL)
			ire_refrele(src_ire);
		if (dst_ire != NULL)
			ire_refrele(dst_ire);
		return (NULL);
	}
	if (ipha->ipha_protocol == IPPROTO_ICMP) {
		/*
		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
		 * errors in response to any ICMP errors.
		 */
		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			if (!pullupmsg(mp, len_needed)) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
				freemsg(mp);
				return (NULL);
			}
			ipha = (ipha_t *)mp->b_rptr;
		}
		icmph = (icmph_t *)
		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
		switch (icmph->icmph_type) {
		case ICMP_DEST_UNREACHABLE:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
		case ICMP_PARAM_PROBLEM:
		case ICMP_REDIRECT:
			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
			freemsg(mp);
			return (NULL);
		default:
			break;
		}
	}
	/*
	 * If this is a labeled system, then check to see if we're allowed to
	 * send a response to this particular sender.  If not, then just drop.
	 */
	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
		freemsg(mp);
		return (NULL);
	}
	if (icmp_err_rate_limit(ipst)) {
		/*
		 * Only send ICMP error packets every so often.
		 * This should be done on a per port/source basis,
		 * but for now this will suffice.
		 */
		freemsg(mp);
		return (NULL);
	}
	return (mp);
}

/*
 * Generate an ICMP redirect message.
 */
static void
icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst)
{
	icmph_t	icmph;

	/*
	 * We are called from ip_rput where we could
	 * not have attached an IPSEC_IN.
	 */
	ASSERT(mp->b_datap->db_type == M_DATA);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_REDIRECT;
	icmph.icmph_code = 1;
	icmph.icmph_rd_gateway = gateway;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
	/* Redirects sent by router, and router is global zone */
	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst);
}

/*
 * Generate an ICMP time exceeded message.
 */
void
icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_TIME_EXCEEDED;
	icmph.icmph_code = code;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * Generate an ICMP unreachable message.
 */
void
icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = code;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
	ip2dbg(("send icmp destination unreachable code %d\n", code));
	icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present,
	    zoneid, ipst);
}

/*
 * Attempt to start recovery of an IPv4 interface that's been shut down as a
 * duplicate.  As long as someone else holds the address, the interface will
 * stay down.  When that conflict goes away, the interface is brought back up.
 * This is done so that accidental shutdowns of addresses aren't made
 * permanent.  Your server will recover from a failure.
 *
 * For DHCP, recovery is not done in the kernel.  Instead, it's handled by a
 * user space process (dhcpagent).
 *
 * Recovery completes if ARP reports that the address is now ours (via
 * AR_CN_READY).  In that case, we go to ip_arp_excl to finish the operation.
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
static void
ipif_dup_recovery(void *arg)
{
	ipif_t *ipif = arg;
	ill_t *ill = ipif->ipif_ill;
	mblk_t *arp_add_mp;
	mblk_t *arp_del_mp;
	ip_stack_t *ipst = ill->ill_ipst;

	ipif->ipif_recovery_id = 0;

	/*
	 * No lock needed for moving or condemned check, as this is just an
	 * optimization.
	 */
	if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) ||
	    (ipif->ipif_flags & IPIF_POINTOPOINT) ||
	    (ipif->ipif_state_flags & (IPIF_CONDEMNED))) {
		/* No reason to try to bring this address back. */
		return;
	}

	/* ACE_F_UNVERIFIED restarts DAD */
	if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL)
		goto alloc_fail;

	if (ipif->ipif_arp_del_mp == NULL) {
		if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
			goto alloc_fail;
		ipif->ipif_arp_del_mp = arp_del_mp;
	}

	putnext(ill->ill_rq, arp_add_mp);
	return;

alloc_fail:
	/*
	 * On allocation failure, just restart the timer.  Note that the ipif
	 * is down here, so no other thread could be trying to start a recovery
	 * timer.  The ill_lock protects the condemned flag and the recovery
	 * timer ID.
	 */
	freemsg(arp_add_mp);
	mutex_enter(&ill->ill_lock);
	if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 &&
	    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
		ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif,
		    MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
	}
	mutex_exit(&ill->ill_lock);
}
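
/*
 * Illustrative sketch (not part of this file's build): the self-rearming
 * pattern used above, reduced to its skeleton.  A timeout(9F) handler
 * clears its own ID first (so a concurrent cancel can tell the handler
 * already ran), attempts recovery, and re-schedules itself under the
 * lock while recovery is still wanted.  The ex_* state and
 * ex_try_recover() are hypothetical stand-ins, not kernel interfaces.
 */
#ifdef notdef
struct ex_recov {
	kmutex_t	ex_lock;
	timeout_id_t	ex_id;		/* outstanding timeout, or 0 */
	boolean_t	ex_wanted;	/* keep trying? */
	clock_t		ex_interval;	/* ticks between attempts */
};

static boolean_t ex_try_recover(struct ex_recov *);	/* hypothetical */

static void
ex_recovery_handler(void *arg)
{
	struct ex_recov *r = arg;

	r->ex_id = 0;			/* we are running; the ID is stale */

	if (ex_try_recover(r))
		return;			/* recovered; nothing to rearm */

	/* Attempt failed: rearm for another try, unless shut down. */
	mutex_enter(&r->ex_lock);
	if (r->ex_wanted && r->ex_id == 0)
		r->ex_id = timeout(ex_recovery_handler, r, r->ex_interval);
	mutex_exit(&r->ex_lock);
}
#endif /* notdef */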

/*
 * This is for exclusive changes due to ARP.  Either tear down an interface due
 * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery.
 */
/* ARGSUSED */
static void
ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	arh_t	*arh;
	ipaddr_t src;
	ipif_t	*ipif;
	char	ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
	char	hbuf[MAC_STR_LEN];
	char	sbuf[INET_ADDRSTRLEN];
	const char *failtype;
	boolean_t bring_up;
	ip_stack_t *ipst = ill->ill_ipst;

	switch (((arcn_t *)mp->b_rptr)->arcn_code) {
	case AR_CN_READY:
		failtype = NULL;
		bring_up = B_TRUE;
		break;
	case AR_CN_FAILED:
		failtype = "in use";
		bring_up = B_FALSE;
		break;
	default:
		failtype = "claimed";
		bring_up = B_FALSE;
		break;
	}

	arh = (arh_t *)mp->b_cont->b_rptr;
	bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);

	(void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf,
	    sizeof (hbuf));
	(void) ip_dot_addr(src, sbuf);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
		    ipif->ipif_lcl_addr != src) {
			continue;
		}

		/*
		 * If we failed on a recovery probe, then restart the timer to
		 * try again later.
		 */
		if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) &&
		    !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
		    ill->ill_net_type == IRE_IF_RESOLVER &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
		    ipst->ips_ip_dup_recovery > 0 &&
		    ipif->ipif_recovery_id == 0) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
			continue;
		}

		/*
		 * If what we're trying to do has already been done, then do
		 * nothing.
		 */
		if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0))
			continue;

		ipif_get_name(ipif, ibuf, sizeof (ibuf));

		if (failtype == NULL) {
			cmn_err(CE_NOTE, "recovered address %s on %s", sbuf,
			    ibuf);
		} else {
			cmn_err(CE_WARN, "%s has duplicate address %s (%s "
			    "by %s); disabled", ibuf, sbuf, failtype, hbuf);
		}

		if (bring_up) {
			ASSERT(ill->ill_dl_up);
			/*
			 * Free up the ARP delete message so we can allocate
			 * a fresh one through the normal path.
			 */
			freemsg(ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
			if (ipif_resolver_up(ipif, Res_act_initial) !=
			    EINPROGRESS) {
				ipif->ipif_addr_ready = 1;
				(void) ipif_up_done(ipif);
				ASSERT(ill->ill_move_ipif == NULL);
			}
			continue;
		}

		mutex_enter(&ill->ill_lock);
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_flags |= IPIF_DUPLICATE;
		ill->ill_ipif_dup_count++;
		mutex_exit(&ill->ill_lock);
		/*
		 * Already exclusive on the ill; no need to handle deferred
		 * processing here.
		 */
		(void) ipif_down(ipif, NULL, NULL);
		ipif_down_tail(ipif);
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
		    ill->ill_net_type == IRE_IF_RESOLVER &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
		    ipst->ips_ip_dup_recovery > 0) {
			ASSERT(ipif->ipif_recovery_id == 0);
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	}
	freemsg(mp);
}

/* ARGSUSED */
static void
ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	arh_t	*arh;
	ipaddr_t src;
	ipif_t	*ipif;

	arh = (arh_t *)mp->b_cont->b_rptr;
	bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src)
			(void) ipif_resolver_up(ipif, Res_act_defend);
	}
	freemsg(mp);
}

/*
 * News from ARP.  ARP sends notification of interesting events down
 * to its clients using M_CTL messages with the interesting ARP packet
 * attached via b_cont.
 * The interesting event from a device comes up the corresponding ARP-IP-DEV
 * queue as opposed to ARP sending the message to all the clients, i.e. all
 * its ARP-IP-DEV instances.  Thus, for AR_CN_ANNOUNCE, we must walk the cache
 * table if a cache IRE is found to delete all the entries for the address in
 * the packet.
 */
static void
ip_arp_news(queue_t *q, mblk_t *mp)
{
	arcn_t	*arcn;
	arh_t	*arh;
	ire_t	*ire = NULL;
	char	hbuf[MAC_STR_LEN];
	char	sbuf[INET_ADDRSTRLEN];
	ipaddr_t src;
	in6_addr_t v6src;
	boolean_t isv6 = B_FALSE;
	ipif_t	*ipif;
	ill_t	*ill;
	ip_stack_t *ipst;

	if (CONN_Q(q)) {
		conn_t *connp = Q_TO_CONN(q);

		ipst = connp->conn_netstack->netstack_ip;
	} else {
		ill_t *ill = (ill_t *)q->q_ptr;

		ipst = ill->ill_ipst;
	}

	if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) {
		if (q->q_next) {
			putnext(q, mp);
		} else
			freemsg(mp);
		return;
	}
	arh = (arh_t *)mp->b_cont->b_rptr;
	/* Is it one we are interested in? */
	if (BE16_TO_U16(arh->arh_proto) == ETHERTYPE_IPV6) {
		isv6 = B_TRUE;
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src,
		    IPV6_ADDR_LEN);
	} else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) {
		bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src,
		    IP_ADDR_LEN);
	} else {
		freemsg(mp);
		return;
	}

	ill = q->q_ptr;

	arcn = (arcn_t *)mp->b_rptr;
	switch (arcn->arcn_code) {
	case AR_CN_BOGON:
		/*
		 * Someone is sending ARP packets with a source protocol
		 * address that we have published and for which we believe our
		 * entry is authoritative and (when ill_arp_extend is set)
		 * verified to be unique on the network.
		 *
		 * The ARP module internally handles the cases where the sender
		 * is just probing (for DAD) and where the hardware address of
		 * a non-authoritative entry has changed.  Thus, these are the
		 * real conflicts, and we have to do resolution.
		 *
		 * We back away quickly from the address if it's from DHCP or
		 * otherwise temporary and hasn't been used recently (or at
		 * all).  We'd like to include "deprecated" addresses here as
		 * well (as there's no real reason to defend something we're
		 * discarding), but IPMP "reuses" this flag to mean something
		 * other than the standard meaning.
		 *
		 * If the ARP module above is not extended (meaning that it
		 * doesn't know how to defend the address), then we just log
		 * the problem as we always did and continue on.  It's not
		 * right, but there's little else we can do, and those old ATM
		 * users are going away anyway.
		 */
		(void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen,
		    hbuf, sizeof (hbuf));
		(void) ip_dot_addr(src, sbuf);
		if (isv6) {
			ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL,
			    ipst);
		} else {
			ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst);
		}
		if (ire != NULL && IRE_IS_LOCAL(ire)) {
			uint32_t now;
			uint32_t maxage;
			clock_t lused;
			uint_t maxdefense;
			uint_t defs;

			/*
			 * First, figure out if this address hasn't been used
			 * in a while.  If it hasn't, then it's a better
			 * candidate for abandoning.
			 */
			ipif = ire->ire_ipif;
			ASSERT(ipif != NULL);
			now = gethrestime_sec();
			maxage = now - ire->ire_create_time;
			if (maxage > ipst->ips_ip_max_temp_idle)
				maxage = ipst->ips_ip_max_temp_idle;
			lused = drv_hztousec(ddi_get_lbolt() -
			    ire->ire_last_used_time) / MICROSEC + 1;
			if (lused >= maxage && (ipif->ipif_flags &
			    (IPIF_DHCPRUNNING | IPIF_TEMPORARY)))
				maxdefense = ipst->ips_ip_max_temp_defend;
			else
				maxdefense = ipst->ips_ip_max_defend;

			/*
			 * Now figure out how many times we've defended
			 * ourselves.  Ignore defenses that happened long in
			 * the past.
			 */
			mutex_enter(&ire->ire_lock);
			if ((defs = ire->ire_defense_count) > 0 &&
			    now - ire->ire_defense_time >
			    ipst->ips_ip_defend_interval) {
				ire->ire_defense_count = defs = 0;
			}
			ire->ire_defense_count++;
			ire->ire_defense_time = now;
			mutex_exit(&ire->ire_lock);
			ill_refhold(ill);
			ire_refrele(ire);

			/*
			 * If we've defended ourselves too many times already,
			 * then give up and tear down the interface(s) using
			 * this address.  Otherwise, defend by sending out a
			 * gratuitous ARP.
			 */
			if (defs >= maxdefense && ill->ill_arp_extend) {
				qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
				    B_FALSE);
			} else {
				cmn_err(CE_WARN,
				    "node %s is using our IP address %s on %s",
				    hbuf, sbuf, ill->ill_name);
				/*
				 * If this is an old (ATM) ARP module, then
				 * don't try to defend the address.  Remain
				 * compatible with the old behavior.  Defend
				 * only with new ARP.
				 */
				if (ill->ill_arp_extend) {
					qwriter_ip(ill, q, mp, ip_arp_defend,
					    NEW_OP, B_FALSE);
				} else {
					ill_refrele(ill);
				}
			}
			return;
		}
		cmn_err(CE_WARN,
		    "proxy ARP problem?  Node '%s' is using %s on %s",
		    hbuf, sbuf, ill->ill_name);
		if (ire != NULL)
			ire_refrele(ire);
		break;
	case AR_CN_ANNOUNCE:
		if (isv6) {
			/*
			 * For XRESOLV interfaces.
			 * Delete the IRE cache entry and NCE for this
			 * v6 address
			 */
			ip_ire_clookup_and_delete_v6(&v6src, ipst);
			/*
			 * If v6src is non-zero, it's a router address
			 * as below.  Do the same sort of thing to clean
			 * out off-net IRE_CACHE entries that go through
			 * the router.
			 */
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
				ire_walk_v6(ire_delete_cache_gw_v6,
				    (char *)&v6src, ALL_ZONES, ipst);
			}
		} else {
			nce_hw_map_t hwm;

			/*
			 * ARP gives us a copy of any packet where it thinks
			 * the address has changed, so that we can update our
			 * caches.  We're responsible for caching known answers
			 * in the current design.  We check whether the
			 * hardware address really has changed in all of our
			 * entries that have cached this mapping, and if so, we
			 * blow them away.  This way we will immediately pick
			 * up the rare case of a host changing hardware
			 * address.
			 */
			if (src == 0)
				break;
			hwm.hwm_addr = src;
			hwm.hwm_hwlen = arh->arh_hlen;
			hwm.hwm_hwaddr = (uchar_t *)(arh + 1);
			NDP_HW_CHANGE_INCR(ipst->ips_ndp4);
			ndp_walk_common(ipst->ips_ndp4, NULL,
			    (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES);
			NDP_HW_CHANGE_DECR(ipst->ips_ndp4);
		}
		break;
	case AR_CN_READY:
		/* No external v6 resolver has a contract to use this */
		if (isv6)
			break;
		/* If the link is down, we'll retry this later */
		if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING))
			break;
		ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL,
		    NULL, NULL, ipst);
		if (ipif != NULL) {
			/*
			 * If this is a duplicate recovery, then we now need to
			 * go exclusive to bring this thing back up.
			 */
			if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) ==
			    IPIF_DUPLICATE) {
				ipif_refrele(ipif);
				ill_refhold(ill);
				qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
				    B_FALSE);
				return;
			}
			/*
			 * If this is the first notice that this address is
			 * ready, then let the user know now.
			 */
			if ((ipif->ipif_flags & IPIF_UP) &&
			    !ipif->ipif_addr_ready) {
				ipif_mask_reply(ipif);
				ipif_up_notify(ipif);
			}
			ipif->ipif_addr_ready = 1;
			ipif_refrele(ipif);
		}
		ire = ire_cache_lookup(src, ALL_ZONES, msg_getlabel(mp), ipst);
		if (ire != NULL) {
			ire->ire_defense_count = 0;
			ire_refrele(ire);
		}
		break;
	case AR_CN_FAILED:
		/* No external v6 resolver has a contract to use this */
		if (isv6)
			break;
		if (!ill->ill_arp_extend) {
			(void) mac_colon_addr((uint8_t *)(arh + 1),
			    arh->arh_hlen, hbuf, sizeof (hbuf));
			(void) ip_dot_addr(src, sbuf);

			cmn_err(CE_WARN,
			    "node %s is using our IP address %s on %s",
			    hbuf, sbuf, ill->ill_name);
			break;
		}
		ill_refhold(ill);
		qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE);
		return;
	}
	freemsg(mp);
}
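
/*
 * Illustrative sketch (not part of this file's build): the
 * defend-or-abandon decision from the AR_CN_BOGON arm above.  Defenses
 * older than the configured interval are forgotten; once the
 * (idle-dependent) maximum is exceeded, the address is abandoned rather
 * than defended again.  Names are local to the sketch.
 */
#ifdef notdef
#include <stdbool.h>
#include <stdint.h>

struct ex_defense {
	uint32_t count;		/* defenses in the current window */
	uint32_t last;		/* time of the last defense (sec) */
};

/* Returns true when we should give up the address, false to defend it. */
static bool
ex_should_abandon(struct ex_defense *d, uint32_t now, uint32_t window,
    uint32_t maxdefense)
{
	if (d->count > 0 && now - d->last > window)
		d->count = 0;	/* stale defenses no longer count */
	d->count++;
	d->last = now;
	return (d->count > maxdefense);
}
#endif /* notdef */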

/*
 * Create a mblk suitable for carrying the interface index and/or source link
 * address.  This mblk is tagged as an M_CTL and is sent to ULP.  This is used
 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user
 * application.
 */
mblk_t *
ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	mblk_t		*mp;
	ip_pktinfo_t	*pinfo;
	ipha_t		*ipha;
	struct ether_header *pether;
	boolean_t	ipmp_ill_held = B_FALSE;

	mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED);
	if (mp == NULL) {
		ip1dbg(("ip_add_info: allocation failure.\n"));
		return (data_mp);
	}

	ipha = (ipha_t *)data_mp->b_rptr;
	pinfo = (ip_pktinfo_t *)mp->b_rptr;
	bzero(pinfo, sizeof (ip_pktinfo_t));
	pinfo->ip_pkt_flags = (uchar_t)flags;
	pinfo->ip_pkt_ulp_type = IN_PKTINFO;	/* Tell ULP what type of info */

	pether = (struct ether_header *)((char *)ipha
	    - sizeof (struct ether_header));

	/*
	 * Make sure the interface is an ethernet type, since this option
	 * is currently supported only on this type of interface.  Also make
	 * sure we are pointing correctly above db_base.
	 */
	if ((flags & IPF_RECVSLLA) &&
	    ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
	    (ill->ill_type == IFT_ETHER) &&
	    (ill->ill_net_type == IRE_IF_RESOLVER)) {
		pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
		bcopy(pether->ether_shost.ether_addr_octet,
		    pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
	} else {
		/*
		 * Clear the bit.  Indicate to upper layer that IP is not
		 * sending this ancillary info.
		 */
		pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
	}

	/*
	 * If `ill' is in an IPMP group, use the IPMP ill to determine
	 * IPF_RECVIF and IPF_RECVADDR.  (This currently assumes that
	 * IPF_RECVADDR support on test addresses is not needed.)
	 *
	 * Note that `ill' may already be an IPMP ill if e.g. we're
	 * processing a packet looped back to an IPMP data address
	 * (since those IRE_LOCALs are tied to IPMP ills).
	 */
	if (IS_UNDER_IPMP(ill)) {
		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) {
			ip1dbg(("ip_add_info: cannot hold IPMP ill.\n"));
			freemsg(mp);
			return (data_mp);
		}
		ipmp_ill_held = B_TRUE;
	}

	if (flags & (IPF_RECVIF | IPF_RECVADDR))
		pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex;
	if (flags & IPF_RECVADDR) {
		ipif_t	*ipif;
		ire_t	*ire;

		/*
		 * Only valid for V4
		 */
		ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) ==
		    (IPV4_VERSION << 4));

		ipif = ipif_get_next_ipif(NULL, ill);
		if (ipif != NULL) {
			/*
			 * Since a decision has already been made to deliver
			 * the packet, there is no need to test for SECATTR
			 * and ZONEONLY.
			 * When a multicast packet is transmitted
			 * a cache entry is created for the multicast address.
			 * When delivering a copy of the packet or when new
			 * packets are received we do not want to match on the
			 * cached entry so explicitly match on
			 * IRE_LOCAL and IRE_LOOPBACK
			 */
			ire = ire_ctable_lookup(ipha->ipha_dst, 0,
			    IRE_LOCAL | IRE_LOOPBACK,
			    ipif, zoneid, NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
			if (ire == NULL) {
				/*
				 * packet must have come on a different
				 * interface.
				 * Since a decision has already been made to
				 * deliver the packet, there is no need to test
				 * for SECATTR and ZONEONLY.
				 * Only match on local and broadcast ire's.
				 * See detailed comment above.
				 */
				ire = ire_ctable_lookup(ipha->ipha_dst, 0,
				    IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid,
				    NULL, MATCH_IRE_TYPE, ipst);
			}

			if (ire == NULL) {
				/*
				 * This is either a multicast packet or
				 * the address has been removed since
				 * the packet was received.
				 * Return INADDR_ANY so that normal source
				 * selection occurs for the response.
				 */
				pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY;
			} else {
				pinfo->ip_pkt_match_addr.s_addr =
				    ire->ire_src_addr;
				ire_refrele(ire);
			}
			ipif_refrele(ipif);
		} else {
			pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY;
		}
	}

	if (ipmp_ill_held)
		ill_refrele(ill);

	mp->b_datap->db_type = M_CTL;
	mp->b_wptr += sizeof (ip_pktinfo_t);
	mp->b_cont = data_mp;

	return (mp);
}
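
/*
 * Illustrative sketch (not part of this file's build): how a userland
 * consumer might see the ancillary data that ip_add_info() arranges for.
 * With the IP_RECVIF socket option set (see ip(7P)), the inbound
 * interface index is delivered as a control message alongside the
 * datagram.  This is a hedged sketch: error handling is elided, and the
 * exact cmsg level/type/payload should be verified against the headers
 * of the target release.  Names are local to the sketch.
 */
#ifdef notdef
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

static uint_t
ex_recv_with_ifindex(int fd, void *buf, size_t len)
{
	struct iovec iov = { buf, len };
	char cbuf[256];
	struct msghdr msg;
	struct cmsghdr *cmsg;
	uint_t ifindex = 0;

	(void) memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(fd, &msg, 0) == -1)
		return (0);

	/* Walk the control messages looking for the interface index. */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP &&
		    cmsg->cmsg_type == IP_RECVIF) {
			(void) memcpy(&ifindex, CMSG_DATA(cmsg),
			    sizeof (ifindex));
			break;
		}
	}
	return (ifindex);
}
#endif /* notdef */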

/*
 * Used to determine the most accurate cred_t to use for TX.
 * First priority is SCM_UCRED having set the label in the message,
 * which is used for MLP on UDP.  Second priority is the open credentials
 * with the peer's label (aka conn_effective_cred), which is needed for
 * MLP on TCP/SCTP and for MAC-Exempt.  Last priority is the open credentials.
 */
cred_t *
ip_best_cred(mblk_t *mp, conn_t *connp, pid_t *pidp)
{
	cred_t *cr;

	cr = msg_getcred(mp, pidp);
	if (cr != NULL && crgetlabel(cr) != NULL)
		return (cr);
	*pidp = NOPID;
	return (CONN_CRED(connp));
}

/*
 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as
 * part of the bind request.
 */
boolean_t
ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp)
{
	ipsec_in_t *ii;

	ASSERT(policy_mp != NULL);
	ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET);

	ii = (ipsec_in_t *)policy_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	connp->conn_policy = ii->ipsec_in_policy;
	ii->ipsec_in_policy = NULL;

	if (ii->ipsec_in_action != NULL) {
		if (connp->conn_latch == NULL) {
			connp->conn_latch = iplatch_create();
			if (connp->conn_latch == NULL)
				return (B_FALSE);
		}
		ipsec_latch_inbound(connp->conn_latch, ii);
	}
	return (B_TRUE);
}

/*
 * Upper level protocols (ULP) pass through bind requests to IP for inspection
 * and to arrange for power-fanout assist.  The ULP is identified by
 * adding a single byte at the end of the original bind message.
 * A ULP other than UDP or TCP that wishes to be recognized passes
 * down a bind with a zero length address.
 *
 * The binding works as follows:
 * - A zero byte address means just bind to the protocol.
 * - A four byte address is treated as a request to validate
 *   that the address is a valid local address, appropriate for
 *   an application to bind to.  This does not affect any fanout
 *   information in IP.
 * - A sizeof sin_t byte address is used to bind to only the local address
 *   and port.
 * - A sizeof ipa_conn_t byte address contains complete fanout information
 *   consisting of local and remote addresses and ports.  In
 *   this case, the addresses are both validated as appropriate
 *   for this operation, and, if so, the information is retained
 *   for use in the inbound fanout.
 *
 * The ULP (except in the zero-length bind) can append an
 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the
 * T_BIND_REQ/O_T_BIND_REQ.  IRE_DB_REQ_TYPE indicates that the ULP wants
 * a copy of the source or destination IRE (source for local bind;
 * destination for complete bind).  IPSEC_POLICY_SET indicates that the
 * policy information contained should be copied on to the conn.
 *
 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present.
 */
mblk_t *
ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
{
	ssize_t		len;
	struct T_bind_req	*tbr;
	sin_t		*sin;
	ipa_conn_t	*ac;
	uchar_t		*ucp;
	mblk_t		*mp1;
	int		error = 0;
	int		protocol;
	ipa_conn_x_t	*acx;
	cred_t		*cr;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto bad_addr;
	}

	ASSERT(!connp->conn_af_isv6);
	connp->conn_pkt_isv6 = B_FALSE;

	len = MBLKL(mp);
	if (len < (sizeof (*tbr) + 1)) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "ip_bind: bogus msg, len %ld", len);
		/* XXX: Need to return something better */
		goto bad_addr;
	}
	/* Back up and extract the protocol identifier. */
	mp->b_wptr--;
	protocol = *mp->b_wptr & 0xFF;
	tbr = (struct T_bind_req *)mp->b_rptr;
	/* Reset the message type in preparation for shipping it back. */
	DB_TYPE(mp) = M_PCPROTO;

	connp->conn_ulp = (uint8_t)protocol;

	/*
	 * Check for a zero length address.  This is from a protocol that
	 * wants to register to receive all packets of its type.
	 */
	if (tbr->ADDR_length == 0) {
		/*
		 * These protocols are now intercepted in ip_bind_v6().
		 * Reject protocol-level binds here for now.
		 *
		 * For SCTP raw socket, ICMP sends down a bind with sin_t
		 * so that the protocol type cannot be SCTP.
		 */
		if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH ||
		    protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) {
			goto bad_addr;
		}

		/*
		 * The udp module never sends down a zero-length address,
		 * and allowing this on a labeled system will break MLP
		 * functionality.
		 */
		if (is_system_labeled() && protocol == IPPROTO_UDP)
			goto bad_addr;

		if (connp->conn_mac_exempt)
			goto bad_addr;

		/* No hash here really.  The table is big enough. */
		connp->conn_srcv6 = ipv6_all_zeros;

		ipcl_proto_insert(connp, protocol);

		tbr->PRIM_type = T_BIND_ACK;
		return (mp);
	}

	/* Extract the address pointer from the message. */
	ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset,
	    tbr->ADDR_length);
	if (ucp == NULL) {
		ip1dbg(("ip_bind: no address\n"));
		goto bad_addr;
	}
	if (!OK_32PTR(ucp)) {
		ip1dbg(("ip_bind: unaligned address\n"));
		goto bad_addr;
	}
	/*
	 * Check for trailing mps.
	 */
	mp1 = mp->b_cont;

	switch (tbr->ADDR_length) {
	default:
		ip1dbg(("ip_bind: bad address length %d\n",
		    (int)tbr->ADDR_length));
		goto bad_addr;

	case IP_ADDR_LEN:
		/* Verification of local address only */
		error = ip_bind_laddr_v4(connp, &mp1, protocol,
		    *(ipaddr_t *)ucp, 0, B_FALSE);
		break;

	case sizeof (sin_t):
		sin = (sin_t *)ucp;
		error = ip_bind_laddr_v4(connp, &mp1, protocol,
		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
		break;

	case sizeof (ipa_conn_t):
		ac = (ipa_conn_t *)ucp;
		/* For raw socket, the local port is not set. */
		if (ac->ac_lport == 0)
			ac->ac_lport = connp->conn_lport;
		/* Always verify destination reachability. */
		error = ip_bind_connected_v4(connp, &mp1, protocol,
		    &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport,
		    B_TRUE, B_TRUE, cr);
		break;

	case sizeof (ipa_conn_x_t):
		acx = (ipa_conn_x_t *)ucp;
		/*
		 * Whether or not to verify destination reachability depends
		 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
		 */
		error = ip_bind_connected_v4(connp, &mp1, protocol,
		    &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport,
		    acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport,
		    B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0, cr);
		break;
	}
	ASSERT(error != EINPROGRESS);
	if (error != 0)
		goto bad_addr;

	/* Send it home. */
	mp->b_datap->db_type = M_PCPROTO;
	tbr->PRIM_type = T_BIND_ACK;
	return (mp);

bad_addr:
	/*
	 * If error = -1 then we generate a TBADADDR - otherwise error is
	 * a unix errno.
	 */
	if (error > 0)
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
	else
		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
	return (mp);
}
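
/*
 * Illustrative sketch (not part of this file's build): the ADDR_length
 * dispatch above, as a table.  Each ULP encodes its intent purely in the
 * length of the address it binds: 0 = protocol-only listen, IP_ADDR_LEN
 * = verify a local address, sizeof (sin_t) = local address + port,
 * sizeof (ipa_conn_t) = full 4-tuple with the destination verified, and
 * sizeof (ipa_conn_x_t) = full 4-tuple with the destination check
 * controlled by ACX_VERIFY_DST.
 */
#ifdef notdef
static const char *
ex_bind_kind(ssize_t addr_len)
{
	if (addr_len == 0)
		return ("protocol-only bind");
	if (addr_len == IP_ADDR_LEN)
		return ("local address verification");
	if (addr_len == sizeof (sin_t))
		return ("local address + port bind");
	if (addr_len == sizeof (ipa_conn_t))
		return ("full 4-tuple bind, dst verified");
	if (addr_len == sizeof (ipa_conn_x_t))
		return ("full 4-tuple bind, dst check optional");
	return ("bogus address length");
}
#endif /* notdef */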

/*
 * Here address is verified to be a valid local address.
 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast
 * address is also considered a valid local address.
 * In the case of a broadcast/multicast address, however, the
 * upper protocol is expected to reset the src address
 * to 0 if it sees a IRE_BROADCAST type returned, so that
 * no packets are emitted with a broadcast/multicast address as
 * the source (which would violate the Host Requirements RFC 1122).
 * The addresses valid for bind are:
 *	(1) - INADDR_ANY (0)
 *	(2) - IP address of an UP interface
 *	(3) - IP address of a DOWN interface
 *	(4) - valid local IP broadcast addresses.  In this case
 *	the conn will only receive packets destined to
 *	the specified broadcast address.
 *	(5) - a multicast address.  In this case
 *	the conn will only receive packets destined to
 *	the specified multicast address.  Note: the
 *	application still has to issue an
 *	IP_ADD_MEMBERSHIP socket option.
 *
 * On error, return -1 for TBADADDR otherwise pass the
 * errno with TSYSERR reply.
 *
 * In all the above cases, the bound address must be valid in the current zone.
 * When the address is loopback, multicast or broadcast, there might be many
 * matching IREs so bind has to look up based on the zone.
 *
 * Note: lport is in network byte order.
 */
int
ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
    ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
{
	int		error = 0;
	ire_t		*src_ire;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	mblk_t		*mp = NULL;
	boolean_t	ire_requested = B_FALSE;
	boolean_t	ipsec_policy_set = B_FALSE;

	if (mpp)
		mp = *mpp;

	if (mp != NULL) {
		ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
		ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
	}

	/*
	 * If it was previously connected, conn_fully_bound would have
	 * been set.
	 */
	connp->conn_fully_bound = B_FALSE;

	src_ire = NULL;

	zoneid = IPCL_ZONEID(connp);

	if (src_addr) {
		src_ire = ire_route_lookup(src_addr, 0, 0, 0,
		    NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
		/*
		 * If an address other than 0.0.0.0 is requested,
		 * we verify that it is a valid address for bind.
		 * Note: the following code is in if-else-if form for
		 * readability, compared to one chained condition check.
		 */
		/* LINTED - statement has no consequent */
		if (IRE_IS_LOCAL(src_ire)) {
			/*
			 * (2) Bind to address of local UP interface
			 */
		} else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
			/*
			 * (4) Bind to broadcast address
			 * Note: permitted only from transports that
			 * request IRE
			 */
			if (!ire_requested)
				error = EADDRNOTAVAIL;
		} else {
			/*
			 * (3) Bind to address of local DOWN interface
			 * (ipif_lookup_addr() looks up all interfaces
			 * but we do not get here for UP interfaces
			 * - case (2) above)
			 */
			/* LINTED - statement has no consequent */
			if (ip_addr_exists(src_addr, zoneid, ipst)) {
				/* The address exists */
			} else if (CLASSD(src_addr)) {
				error = 0;
				if (src_ire != NULL)
					ire_refrele(src_ire);
				/*
				 * (5) bind to multicast address.
				 * Fake out the IRE returned to upper
				 * layer to be a broadcast IRE.
				 */
				src_ire = ire_ctable_lookup(
				    INADDR_BROADCAST, INADDR_ANY,
				    IRE_BROADCAST, NULL, zoneid, NULL,
				    (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY),
				    ipst);
				if (src_ire == NULL || !ire_requested)
					error = EADDRNOTAVAIL;
			} else {
				/*
				 * Not a valid address for bind
				 */
				error = EADDRNOTAVAIL;
			}
		}
		if (error) {
			/* Red Alert!  Attempting to be a bogon! */
			ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n",
			    ntohl(src_addr)));
			goto bad_addr;
		}
	}

	/*
	 * Allow setting new policies.  For example, disconnects come
	 * down as ipa_t bind.  As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * If not fanout_insert, this was just an address verification.
	 */
	if (fanout_insert) {
		/*
		 * The addresses have been verified.  Time to insert in
		 * the correct fanout list.
		 */
		IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6);
		connp->conn_lport = lport;
		connp->conn_fport = 0;
		/*
		 * Do we need to add a check to reject Multicast packets?
		 */
		error = ipcl_bind_insert(connp, protocol, src_addr, lport);
	}

	if (error == 0) {
		if (ire_requested) {
			if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		} else if (ipsec_policy_set) {
			if (!ip_bind_ipsec_policy_set(connp, mp)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		}
	}
bad_addr:
	if (error != 0) {
		if (connp->conn_anon_port) {
			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
			    connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
			    B_FALSE);
		}
		connp->conn_mlp_type = mlptSingle;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	return (error);
}

int
ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol,
    ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
{
	int error;

	ASSERT(!connp->conn_af_isv6);
	connp->conn_pkt_isv6 = B_FALSE;
	connp->conn_ulp = protocol;

	error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport,
	    fanout_insert);
	if (error < 0)
		error = -TBADADDR;
	return (error);
}
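
/*
 * Illustrative sketch (not part of this file's build): the error
 * convention used by the bind routines, folded into one helper.  A
 * return of -1 means "bad address" (TBADADDR); any positive value is a
 * Unix errno and travels in a TSYSERR ack, exactly as ip_bind_v4() does
 * at its bad_addr label.  ex_bind_err_ack() itself is hypothetical.
 */
#ifdef notdef
static mblk_t *
ex_bind_err_ack(mblk_t *mp, int error)
{
	if (error > 0)
		return (mi_tpi_err_ack_alloc(mp, TSYSERR, error));
	return (mi_tpi_err_ack_alloc(mp, TBADADDR, 0));
}
#endif /* notdef */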

/*
 * Verify that both the source and destination addresses
 * are valid.  If verify_dst is false, then the destination address may be
 * unreachable, i.e. have no route to it.  Protocols like TCP want to verify
 * destination reachability, while tunnels do not.
 * Note that we allow connect to broadcast and multicast
 * addresses when ire_requested is set.  Thus the ULP
 * has to check for IRE_BROADCAST and multicast.
 *
 * Returns zero if ok.
 * On error: returns -1 to mean TBADADDR otherwise returns an errno
 * (for use with TSYSERR reply).
 *
 * Note: lport and fport are in network byte order.
 */
int
ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
    ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
    boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr)
{

	ire_t		*src_ire;
	ire_t		*dst_ire;
	int		error = 0;
	ire_t		*sire = NULL;
	ire_t		*md_dst_ire = NULL;
	ire_t		*lso_dst_ire = NULL;
	ill_t		*ill = NULL;
	zoneid_t	zoneid;
	ipaddr_t	src_addr = *src_addrp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	mblk_t		*mp = NULL;
	boolean_t	ire_requested = B_FALSE;
	boolean_t	ipsec_policy_set = B_FALSE;
	ts_label_t	*tsl = NULL;
	cred_t		*effective_cred = NULL;

	if (mpp)
		mp = *mpp;

	if (mp != NULL) {
		ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
		ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
	}

	src_ire = dst_ire = NULL;

	/*
	 * If we never got a disconnect before, clear it now.
	 */
	connp->conn_fully_bound = B_FALSE;

	zoneid = IPCL_ZONEID(connp);

	/*
	 * Check whether Trusted Solaris policy allows communication with this
	 * host, and pretend that the destination is unreachable if not.
	 *
	 * This is never a problem for TCP, since that transport is known to
	 * compute the label properly as part of the tcp_rput_other T_BIND_ACK
	 * handling.  If the remote is unreachable, it will be detected at that
	 * point, so there's no reason to check it here.
	 *
	 * Note that for sendto (and other datagram-oriented friends), this
	 * check is done as part of the data path label computation instead.
	 * The check here is just to make non-TCP connect() report the right
	 * error.
	 */
	if (is_system_labeled() && !IPCL_IS_TCP(connp)) {
		if ((error = tsol_check_dest(cr, &dst_addr, IPV4_VERSION,
		    connp->conn_mac_exempt, &effective_cred)) != 0) {
			if (ip_debug > 2) {
				pr_addr_dbg(
				    "ip_bind_connected_v4:"
				    " no label for dst %s\n",
				    AF_INET, &dst_addr);
			}
			goto bad_addr;
		}

		/*
		 * tsol_check_dest() may have created a new cred with
		 * a modified security label.  Use that cred if it exists
		 * for ire lookups.
		 */
		if (effective_cred == NULL) {
			tsl = crgetlabel(cr);
		} else {
			tsl = crgetlabel(effective_cred);
		}
	}

	if (CLASSD(dst_addr)) {
		/* Pick up an IRE_BROADCAST */
		dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL,
		    NULL, zoneid, tsl,
		    (MATCH_IRE_RECURSIVE |
		    MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE |
		    MATCH_IRE_SECATTR), ipst);
	} else {
		/*
		 * If conn_dontroute or conn_nexthop_set is set, and an
		 * onlink ipif is not found, set ENETUNREACH as the error.
		 */
		if (connp->conn_dontroute || connp->conn_nexthop_set) {
			ipif_t *ipif;

			ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ?
			    dst_addr : connp->conn_nexthop_v4, zoneid, ipst);
			if (ipif == NULL) {
				error = ENETUNREACH;
				goto bad_addr;
			}
			ipif_refrele(ipif);
		}

		if (connp->conn_nexthop_set) {
			dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
			    0, 0, NULL, NULL, zoneid, tsl,
			    MATCH_IRE_SECATTR, ipst);
		} else {
			dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
			    &sire, zoneid, tsl,
			    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
			    MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE |
			    MATCH_IRE_SECATTR), ipst);
		}
	}
	/*
	 * dst_ire can't be a broadcast when not ire_requested.
	 * We also prevent ire's with src address INADDR_ANY from
	 * being used, which are created temporarily for
	 * sending out packets from endpoints that have
	 * conn_unspec_src set.  If verify_dst is true, the destination must
	 * be reachable; otherwise it needn't be.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure.  May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY ||
	    (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 */
		if (verify_dst || (dst_ire != NULL)) {
			if (ip_debug > 2) {
				pr_addr_dbg("ip_bind_connected_v4:"
				    "bad connected dst %s\n",
				    AF_INET, &dst_addr);
			}
			if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
			goto bad_addr;
		}
	}

	/*
	 * If the app does a connect(), it means that it will most likely
	 * send more than 1 packet to the destination.  It makes sense
	 * to clear the temporary flag.
	 */
	if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE &&
	    (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) {
		irb_t *irb = dst_ire->ire_bucket;

		rw_enter(&irb->irb_lock, RW_WRITER);
		/*
		 * We need to recheck for IRE_MARK_TEMPORARY after acquiring
		 * the lock to guarantee irb_tmp_ire_cnt.
		 */
		if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) {
			dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY;
			irb->irb_tmp_ire_cnt--;
		}
		rw_exit(&irb->irb_lock);
	}

	/*
	 * See if we should notify ULP about LSO/MDT; we do this whether or not
	 * ire_requested is TRUE, in order to handle active connects; LSO/MDT
	 * eligibility tests for passive connects are handled separately
	 * through tcp_adapt_ire().  We do this before the source address
	 * selection, because dst_ire may change after a call to
	 * ipif_select_source().  This is a best-effort check, as the
	 * packet for this connection may not actually go through
	 * dst_ire->ire_stq, and the exact IRE can only be known after
	 * calling ip_newroute().  This is why we further check on the
	 * IRE during LSO/Multidata packet transmission in
	 * tcp_lsosend()/tcp_multisend().
	 */
	if (!ipsec_policy_set && dst_ire != NULL &&
	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
	    (ill = ire_to_ill(dst_ire), ill != NULL)) {
		if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) {
			lso_dst_ire = dst_ire;
			IRE_REFHOLD(lso_dst_ire);
		} else if (ipst->ips_ip_multidata_outbound &&
		    ILL_MDT_CAPABLE(ill)) {
			md_dst_ire = dst_ire;
			IRE_REFHOLD(md_dst_ire);
		}
	}
4964 */ 4965 ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; 4966 ipif_t *src_ipif = NULL; 4967 ire_t *ipif_ire; 4968 4969 /* 4970 * Supply a local source address such that inbound 4971 * load spreading happens. 4972 * 4973 * Determine the best source address on this ill for 4974 * the destination. 4975 * 4976 * 1) For broadcast, we should return a broadcast ire 4977 * found above so that upper layers know that the 4978 * destination address is a broadcast address. 4979 * 4980 * 2) If the ipif is DEPRECATED, select a better 4981 * source address. Similarly, if the ipif is on 4982 * the IPMP meta-interface, pick a source address 4983 * at random to improve inbound load spreading. 4984 * 4985 * 3) If the outgoing interface is part of a usesrc 4986 * group, then try selecting a source address from 4987 * the usesrc ILL. 4988 */ 4989 if ((dst_ire->ire_zoneid != zoneid && 4990 dst_ire->ire_zoneid != ALL_ZONES) || 4991 (!(dst_ire->ire_flags & RTF_SETSRC)) && 4992 (!(dst_ire->ire_type & IRE_BROADCAST) && 4993 (IS_IPMP(ire_ill) || 4994 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 4995 (ire_ill->ill_usesrc_ifindex != 0)))) { 4996 /* 4997 * If the destination is reachable via a 4998 * given gateway, the selected source address 4999 * should be in the same subnet as the gateway. 5000 * Otherwise, the destination is not reachable. 5001 * 5002 * If there are no interfaces on the same subnet 5003 * as the destination, ipif_select_source gives 5004 * the first non-deprecated interface, which might 5005 * be on a different subnet than the gateway. 5006 * This is not desirable. Hence we pass the dst_ire 5007 * source address to ipif_select_source; 5008 * the destination is known to be reachable 5009 * within the subnet of that source address, 5010 * so the source that ipif_select_source 5011 * picks is guaranteed to be on the 5012 * same subnet as the dst_ire 5013 * source address. 5014 */ 5015 ipaddr_t saddr = 5016 dst_ire->ire_ipif->ipif_src_addr; 5017 src_ipif = ipif_select_source(ire_ill, 5018 saddr, zoneid); 5019 if (src_ipif != NULL) { 5020 if (IS_VNI(src_ipif->ipif_ill)) { 5021 /* 5022 * For VNI there is no 5023 * interface route 5024 */ 5025 src_addr = 5026 src_ipif->ipif_src_addr; 5027 } else { 5028 ipif_ire = 5029 ipif_to_ire(src_ipif); 5030 if (ipif_ire != NULL) { 5031 IRE_REFRELE(dst_ire); 5032 dst_ire = ipif_ire; 5033 } 5034 src_addr = 5035 dst_ire->ire_src_addr; 5036 } 5037 ipif_refrele(src_ipif); 5038 } else { 5039 src_addr = dst_ire->ire_src_addr; 5040 } 5041 } else { 5042 src_addr = dst_ire->ire_src_addr; 5043 } 5044 } 5045 } 5046 5047 /* 5048 * We do ire_route_lookup() here (and not an 5049 * interface lookup) as we assert that 5050 * src_addr should only come from an 5051 * UP interface for hard binding. 5052 */ 5053 ASSERT(src_ire == NULL); 5054 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, 5055 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); 5056 /* src_ire must be a local|loopback */ 5057 if (!IRE_IS_LOCAL(src_ire)) { 5058 if (ip_debug > 2) { 5059 pr_addr_dbg("ip_bind_connected_v4: bad connected " 5060 "src %s\n", AF_INET, &src_addr); 5061 } 5062 error = EADDRNOTAVAIL; 5063 goto bad_addr; 5064 } 5065 5066 /* 5067 * If the source address is a loopback address, the 5068 * destination had best be local or multicast. 5069 * The transports that can't handle multicast will reject 5070 * those addresses.
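 *
 * (For example - a hedged illustration - a socket bound to 127.0.0.1
 * that connect()s to an off-link unicast address fails here; the -1
 * error below is mapped to TBADADDR by ip_proto_bind_connected_v4().)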
5071 */ 5072 if (src_ire->ire_type == IRE_LOOPBACK && 5073 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { 5074 ip1dbg(("ip_bind_connected_v4: bad connected loopback\n")); 5075 error = -1; 5076 goto bad_addr; 5077 } 5078 5079 /* 5080 * Allow setting new policies. For example, disconnects come 5081 * down as ipa_t bind. As we would have set conn_policy_cached 5082 * to B_TRUE before, we should set it to B_FALSE, so that policy 5083 * can change after the disconnect. 5084 */ 5085 connp->conn_policy_cached = B_FALSE; 5086 5087 /* 5088 * Set the conn addresses/ports immediately, so the IPsec policy calls 5089 * can handle their passed-in conn's. 5090 */ 5091 5092 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 5093 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); 5094 connp->conn_lport = lport; 5095 connp->conn_fport = fport; 5096 *src_addrp = src_addr; 5097 5098 ASSERT(!(ipsec_policy_set && ire_requested)); 5099 if (ire_requested) { 5100 iulp_t *ulp_info = NULL; 5101 5102 /* 5103 * Note that sire will not be NULL if this is an off-link 5104 * connection and there is no cache entry for that dest yet. 5105 * 5106 * XXX Because of an existing bug, if there are multiple 5107 * default routes, the IRE returned now may not be the actual 5108 * default route used (default routes are chosen in a 5109 * round-robin fashion). So if the metrics for different 5110 * default routes are different, we may return the wrong 5111 * metrics. This will not be a problem if the existing 5112 * bug is fixed. 5113 */ 5114 if (sire != NULL) { 5115 ulp_info = &(sire->ire_uinfo); 5116 } 5117 if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) { 5118 error = -1; 5119 goto bad_addr; 5120 } 5121 mp = *mpp; 5122 } else if (ipsec_policy_set) { 5123 if (!ip_bind_ipsec_policy_set(connp, mp)) { 5124 error = -1; 5125 goto bad_addr; 5126 } 5127 } 5128 5129 /* 5130 * Cache IPsec policy in this conn. If we have per-socket policy, 5131 * we'll cache that. If we don't, we'll inherit global policy. 5132 * 5133 * We can't insert until the conn reflects the policy. Note that 5134 * conn_policy_cached is set by ipsec_conn_cache_policy() even for 5135 * connections where we don't have a policy. This is to prevent 5136 * global policy lookups in the inbound path. 5137 * 5138 * If we insert before we set conn_policy_cached, the 5139 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true 5140 * because global policy could be non-empty. We normally call 5141 * ipsec_check_policy() for conn_policy_cached connections only if 5142 * ipc_in_enforce_policy is set. But in this case, 5143 * conn_policy_cached can get set at any time between the 5144 * CONN_INBOUND_POLICY_PRESENT() check and the call to 5145 * ipsec_check_policy(), which would make the above assumption false. 5146 * Thus, we need to insert after we set conn_policy_cached. 5147 */ 5148 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) 5149 goto bad_addr; 5150 5151 if (fanout_insert) { 5152 /* 5153 * The addresses have been verified. Time to insert in 5154 * the correct fanout list. 5155 */ 5156 error = ipcl_conn_insert(connp, protocol, src_addr, 5157 dst_addr, connp->conn_ports); 5158 } 5159 5160 if (error == 0) { 5161 connp->conn_fully_bound = B_TRUE; 5162 /* 5163 * Our initial checks for LSO/MDT have passed; the IRE is not 5164 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to 5165 * support LSO/MDT.
Pass the IRE, IPC and ILL into 5166 * ip_xxinfo_return(), which performs further checks 5167 * against them and, upon success, returns the LSO/MDT info 5168 * mblk that we will attach to the bind acknowledgment. 5169 */ 5170 if (lso_dst_ire != NULL) { 5171 mblk_t *lsoinfo_mp; 5172 5173 ASSERT(ill->ill_lso_capab != NULL); 5174 if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp, 5175 ill->ill_name, ill->ill_lso_capab)) != NULL) { 5176 if (mp == NULL) { 5177 *mpp = lsoinfo_mp; 5178 } else { 5179 linkb(mp, lsoinfo_mp); 5180 } 5181 } 5182 } else if (md_dst_ire != NULL) { 5183 mblk_t *mdinfo_mp; 5184 5185 ASSERT(ill->ill_mdt_capab != NULL); 5186 if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, 5187 ill->ill_name, ill->ill_mdt_capab)) != NULL) { 5188 if (mp == NULL) { 5189 *mpp = mdinfo_mp; 5190 } else { 5191 linkb(mp, mdinfo_mp); 5192 } 5193 } 5194 } 5195 } 5196 bad_addr: 5197 if (ipsec_policy_set) { 5198 ASSERT(mp != NULL); 5199 freeb(mp); 5200 /* 5201 * For now, assume that nothing else accompanies 5202 * IPSEC_POLICY_SET. 5203 */ 5204 *mpp = NULL; 5205 } 5206 if (src_ire != NULL) 5207 IRE_REFRELE(src_ire); 5208 if (dst_ire != NULL) 5209 IRE_REFRELE(dst_ire); 5210 if (sire != NULL) 5211 IRE_REFRELE(sire); 5212 if (md_dst_ire != NULL) 5213 IRE_REFRELE(md_dst_ire); 5214 if (lso_dst_ire != NULL) 5215 IRE_REFRELE(lso_dst_ire); 5216 if (effective_cred != NULL) 5217 crfree(effective_cred); 5218 return (error); 5219 } 5220 5221 int 5222 ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, 5223 ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 5224 boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr) 5225 { 5226 int error; 5227 5228 ASSERT(!connp->conn_af_isv6); 5229 connp->conn_pkt_isv6 = B_FALSE; 5230 connp->conn_ulp = protocol; 5231 5232 /* For raw sockets, the local port is not set. */ 5233 if (lport == 0) 5234 lport = connp->conn_lport; 5235 error = ip_bind_connected_v4(connp, ire_mpp, protocol, 5236 src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst, cr); 5237 if (error < 0) 5238 error = -TBADADDR; 5239 return (error); 5240 } 5241 5242 /* 5243 * Get the ire in *mpp. Returns B_FALSE if it fails (due to lack of space). 5244 * Prefers dst_ire over src_ire. 5245 */ 5246 static boolean_t 5247 ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) 5248 { 5249 mblk_t *mp = *mpp; 5250 ire_t *ret_ire; 5251 5252 ASSERT(mp != NULL); 5253 5254 if (ire != NULL) { 5255 /* 5256 * mp was initialized above as the appended 5257 * IRE_DB_REQ_TYPE mblk. It is the <upper protocol>'s 5258 * job to make sure there is room. 5259 */ 5260 if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) 5261 return (B_FALSE); 5262 5263 mp->b_datap->db_type = IRE_DB_TYPE; 5264 mp->b_wptr = mp->b_rptr + sizeof (ire_t); 5265 bcopy(ire, mp->b_rptr, sizeof (ire_t)); 5266 ret_ire = (ire_t *)mp->b_rptr; 5267 /* 5268 * Pass the latest setting of ip_path_mtu_discovery, and 5269 * copy the ulp info, if any. 5270 */ 5271 ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? 5272 IPH_DF : 0; 5273 if (ulp_info != NULL) { 5274 bcopy(ulp_info, &(ret_ire->ire_uinfo), 5275 sizeof (iulp_t)); 5276 } 5277 ret_ire->ire_mp = mp; 5278 } else { 5279 /* 5280 * No IRE was found. Remove IRE mblk. 5281 */ 5282 *mpp = mp->b_cont; 5283 freeb(mp); 5284 } 5285 return (B_TRUE); 5286 } 5287 5288 /* 5289 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping 5290 * the final piece where we don't.
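 * (Worked example, descriptive only: carving 20 bytes from a chain of
 * an 8-byte mblk followed by a 16-byte mblk consumes the first mblk
 * whole and dups the second, yielding an 8+12 byte result and leaving
 * the remaining 4 bytes behind for the caller's next carve.)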
Return a pointer to the first mblk in the 5291 * result, and update the pointer to the next mblk to chew on. If anything 5292 * goes wrong (i.e., dupb fails), we waste everything in sight and return a 5293 * NULL pointer. 5294 */ 5295 mblk_t * 5296 ip_carve_mp(mblk_t **mpp, ssize_t len) 5297 { 5298 mblk_t *mp0; 5299 mblk_t *mp1; 5300 mblk_t *mp2; 5301 5302 if (!len || !mpp || !(mp0 = *mpp)) 5303 return (NULL); 5304 /* If we aren't going to consume the first mblk, we need a dup. */ 5305 if (mp0->b_wptr - mp0->b_rptr > len) { 5306 mp1 = dupb(mp0); 5307 if (mp1) { 5308 /* Partition the data between the two mblks. */ 5309 mp1->b_wptr = mp1->b_rptr + len; 5310 mp0->b_rptr = mp1->b_wptr; 5311 /* 5312 * After the adjustments, if the mblk that was not 5313 * consumed is now unaligned, try to align it. If this 5314 * fails, free all messages and let the upper layer 5315 * recover. 5316 */ 5317 if (!OK_32PTR(mp0->b_rptr)) { 5318 if (!pullupmsg(mp0, -1)) { 5319 freemsg(mp0); 5320 freemsg(mp1); 5321 *mpp = NULL; 5322 return (NULL); 5323 } 5324 } 5325 } 5326 return (mp1); 5327 } 5328 /* Eat through as many mblks as we need to get len bytes. */ 5329 len -= mp0->b_wptr - mp0->b_rptr; 5330 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) { 5331 if (mp2->b_wptr - mp2->b_rptr > len) { 5332 /* 5333 * We won't consume the entire last mblk. Like 5334 * above, dup and partition it. 5335 */ 5336 mp1->b_cont = dupb(mp2); 5337 mp1 = mp1->b_cont; 5338 if (!mp1) { 5339 /* 5340 * Trouble. Rather than go to a lot of 5341 * trouble to clean up, we free the messages. 5342 * This won't be any worse than losing it on 5343 * the wire. 5344 */ 5345 freemsg(mp0); 5346 freemsg(mp2); 5347 *mpp = NULL; 5348 return (NULL); 5349 } 5350 mp1->b_wptr = mp1->b_rptr + len; 5351 mp2->b_rptr = mp1->b_wptr; 5352 /* 5353 * After the adjustments, if the mblk that was not 5354 * consumed is now unaligned, try to align it. If this 5355 * fails, free all messages and let the upper layer 5356 * recover. 5357 */ 5358 if (!OK_32PTR(mp2->b_rptr)) { 5359 if (!pullupmsg(mp2, -1)) { 5360 freemsg(mp0); 5361 freemsg(mp2); 5362 *mpp = NULL; 5363 return (NULL); 5364 } 5365 } 5366 *mpp = mp2; 5367 return (mp0); 5368 } 5369 /* Decrement len by the amount we just got. */ 5370 len -= mp2->b_wptr - mp2->b_rptr; 5371 } 5372 /* 5373 * len should be reduced to zero now. If not, our caller has 5374 * screwed up. 5375 */ 5376 if (len) { 5377 /* Shouldn't happen! */ 5378 freemsg(mp0); 5379 *mpp = NULL; 5380 return (NULL); 5381 } 5382 /* 5383 * We consumed up to exactly the end of an mblk. Detach the part 5384 * we are returning from the rest of the chain. 5385 */ 5386 mp1->b_cont = NULL; 5387 *mpp = mp2; 5388 return (mp0); 5389 } 5390 5391 /* The ill stream is being unplumbed. Called from ip_close */ 5392 int 5393 ip_modclose(ill_t *ill) 5394 { 5395 boolean_t success; 5396 ipsq_t *ipsq; 5397 ipif_t *ipif; 5398 queue_t *q = ill->ill_rq; 5399 ip_stack_t *ipst = ill->ill_ipst; 5400 int i; 5401 5402 /* 5403 * The punlink prior to this may have initiated a capability 5404 * negotiation. But ipsq_enter will block until that finishes or 5405 * times out. 5406 */ 5407 success = ipsq_enter(ill, B_FALSE, NEW_OP); 5408 5409 /* 5410 * Open/close/push/pop is guaranteed to be single threaded 5411 * per stream by STREAMS. FS guarantees that all references 5412 * from the top are gone before close is called. So there can't 5413 * be another close thread that has set CONDEMNED on this ill 5414 * and caused ipsq_enter to fail.
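 *
 * A rough sketch of the teardown ordering that follows (descriptive,
 * not normative):
 *
 *	ipsq_enter()			become exclusive on the ipsq
 *	ILL_CONDEMNED/IPIF_CONDEMNED	fail new lookups and refholds
 *	ill_dlpi_send_deferred()	flush deferred DLPI messages
 *	ill_delete()			tear down ipifs, ilms, the ill
 *	cv_wait()			until ill_is_freeable()
 *	ill_delete_tail()		DL_UNBIND_REQ, then qprocsoff
 *	ipsq_exit()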
5413 */ 5414 ASSERT(success); 5415 ipsq = ill->ill_phyint->phyint_ipsq; 5416 5417 /* 5418 * Mark it condemned. No new reference will be made to this ill. 5419 * Lookup functions will return an error. Threads that try to 5420 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures 5421 * that the refcnt will drop down to zero. 5422 */ 5423 mutex_enter(&ill->ill_lock); 5424 ill->ill_state_flags |= ILL_CONDEMNED; 5425 for (ipif = ill->ill_ipif; ipif != NULL; 5426 ipif = ipif->ipif_next) { 5427 ipif->ipif_state_flags |= IPIF_CONDEMNED; 5428 } 5429 /* 5430 * Wake up anybody waiting to enter the ipsq. ipsq_enter 5431 * returns error if ILL_CONDEMNED is set 5432 */ 5433 cv_broadcast(&ill->ill_cv); 5434 mutex_exit(&ill->ill_lock); 5435 5436 /* 5437 * Send all the deferred DLPI messages downstream which came in 5438 * during the small window right before ipsq_enter(). We do this 5439 * without waiting for the ACKs because all the ACKs for M_PROTO 5440 * messages are ignored in ip_rput() when ILL_CONDEMNED is set. 5441 */ 5442 ill_dlpi_send_deferred(ill); 5443 5444 /* 5445 * Shut down fragmentation reassembly. 5446 * ill_frag_timer won't start a timer again. 5447 * Now cancel any existing timer 5448 */ 5449 (void) untimeout(ill->ill_frag_timer_id); 5450 (void) ill_frag_timeout(ill, 0); 5451 5452 /* 5453 * Call ill_delete to bring down the ipifs, ilms and ill on 5454 * this ill. Then wait for the refcnts to drop to zero. 5455 * ill_is_freeable checks whether the ill is really quiescent. 5456 * Then make sure that threads that are waiting to enter the 5457 * ipsq have seen the error returned by ipsq_enter and have 5458 * gone away. Then we call ill_delete_tail which does the 5459 * DL_UNBIND_REQ with the driver and then qprocsoff. 5460 */ 5461 ill_delete(ill); 5462 mutex_enter(&ill->ill_lock); 5463 while (!ill_is_freeable(ill)) 5464 cv_wait(&ill->ill_cv, &ill->ill_lock); 5465 while (ill->ill_waiters) 5466 cv_wait(&ill->ill_cv, &ill->ill_lock); 5467 5468 mutex_exit(&ill->ill_lock); 5469 5470 /* 5471 * ill_delete_tail drops reference on ill_ipst, but we need to keep 5472 * it held until the end of the function since the cleanup 5473 * below needs to be able to use the ip_stack_t. 5474 */ 5475 netstack_hold(ipst->ips_netstack); 5476 5477 /* qprocsoff is done via ill_delete_tail */ 5478 ill_delete_tail(ill); 5479 ASSERT(ill->ill_ipst == NULL); 5480 5481 /* 5482 * Walk through all upper (conn) streams and qenable 5483 * those that have queued data. 5484 * close synchronization needs this to 5485 * be done to ensure that all upper layers blocked 5486 * due to flow control to the closing device 5487 * get unblocked. 5488 */ 5489 ip1dbg(("ip_wsrv: walking\n")); 5490 for (i = 0; i < TX_FANOUT_SIZE; i++) { 5491 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]); 5492 } 5493 5494 mutex_enter(&ipst->ips_ip_mi_lock); 5495 mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); 5496 mutex_exit(&ipst->ips_ip_mi_lock); 5497 5498 /* 5499 * credp could be null if the open didn't succeed and ip_modopen 5500 * itself calls ip_close. 5501 */ 5502 if (ill->ill_credp != NULL) 5503 crfree(ill->ill_credp); 5504 5505 /* 5506 * Now we are done with the module close pieces that 5507 * need the netstack_t. 5508 */ 5509 netstack_rele(ipst->ips_netstack); 5510 5511 mi_close_free((IDP)ill); 5512 q->q_ptr = WR(q)->q_ptr = NULL; 5513 5514 ipsq_exit(ipsq); 5515 5516 return (0); 5517 } 5518 5519 /* 5520 * This is called as part of close() for IP, UDP, ICMP, and RTS 5521 * in order to quiesce the conn. 
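 *
 * In outline (descriptive only): mark CONN_CLOSING under conn_lock,
 * run the applicable ioctl/drain/ilg cleanup and remove the conn from
 * its fanout, then set CONN_CONDEMNED and wait for conn_ref to drop
 * to 1 before marking it CONN_QUIESCED.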
5522 */ 5523 void 5524 ip_quiesce_conn(conn_t *connp) 5525 { 5526 boolean_t drain_cleanup_reqd = B_FALSE; 5527 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 5528 boolean_t ilg_cleanup_reqd = B_FALSE; 5529 ip_stack_t *ipst; 5530 5531 ASSERT(!IPCL_IS_TCP(connp)); 5532 ipst = connp->conn_netstack->netstack_ip; 5533 5534 /* 5535 * Mark the conn as closing; from now on this conn must not be 5536 * inserted into any list. E.g., conn_drain_insert() 5537 * won't insert this conn into the conn_drain_list. 5538 * Similarly, ill_pending_mp_add() will not add any mp to 5539 * the pending mp list after this conn has started closing. 5540 * 5541 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg 5542 * cannot get set henceforth. 5543 */ 5544 mutex_enter(&connp->conn_lock); 5545 ASSERT(!(connp->conn_state_flags & CONN_QUIESCED)); 5546 connp->conn_state_flags |= CONN_CLOSING; 5547 if (connp->conn_idl != NULL) 5548 drain_cleanup_reqd = B_TRUE; 5549 if (connp->conn_oper_pending_ill != NULL) 5550 conn_ioctl_cleanup_reqd = B_TRUE; 5551 if (connp->conn_dhcpinit_ill != NULL) { 5552 ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0); 5553 atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit); 5554 connp->conn_dhcpinit_ill = NULL; 5555 } 5556 if (connp->conn_ilg_inuse != 0) 5557 ilg_cleanup_reqd = B_TRUE; 5558 mutex_exit(&connp->conn_lock); 5559 5560 if (conn_ioctl_cleanup_reqd) 5561 conn_ioctl_cleanup(connp); 5562 5563 if (is_system_labeled() && connp->conn_anon_port) { 5564 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 5565 connp->conn_mlp_type, connp->conn_ulp, 5566 ntohs(connp->conn_lport), B_FALSE); 5567 connp->conn_anon_port = 0; 5568 } 5569 connp->conn_mlp_type = mlptSingle; 5570 5571 /* 5572 * Remove this conn from any fanout list it is on, 5573 * and then wait for any threads currently operating 5574 * on this endpoint to finish. 5575 */ 5576 ipcl_hash_remove(connp); 5577 5578 /* 5579 * Remove this conn from the drain list, and do 5580 * any other cleanup that may be required. 5581 * (Only non-tcp streams may have a non-null conn_idl. 5582 * TCP streams are never flow controlled, and 5583 * conn_idl will be null.) 5584 */ 5585 if (drain_cleanup_reqd) 5586 conn_drain_tail(connp, B_TRUE); 5587 5588 if (connp == ipst->ips_ip_g_mrouter) 5589 (void) ip_mrouter_done(NULL, ipst); 5590 5591 if (ilg_cleanup_reqd) 5592 ilg_delete_all(connp); 5593 5594 conn_delete_ire(connp, NULL); 5595 5596 /* 5597 * Now the conn refcnt can increase only through CONN_INC_REF_LOCKED. 5598 * Callers from the write side can't be there now because close 5599 * is in progress. The only other caller is ipcl_walk, 5600 * which checks for the condemned flag. 5601 */ 5602 mutex_enter(&connp->conn_lock); 5603 connp->conn_state_flags |= CONN_CONDEMNED; 5604 while (connp->conn_ref != 1) 5605 cv_wait(&connp->conn_cv, &connp->conn_lock); 5606 connp->conn_state_flags |= CONN_QUIESCED; 5607 mutex_exit(&connp->conn_lock); 5608 } 5609 5610 /* ARGSUSED */ 5611 int 5612 ip_close(queue_t *q, int flags) 5613 { 5614 conn_t *connp; 5615 5616 TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q); 5617 5618 /* 5619 * Call the appropriate delete routine depending on whether this is 5620 * a module or device.
5621 */ 5622 if (WR(q)->q_next != NULL) { 5623 /* This is a module close */ 5624 return (ip_modclose((ill_t *)q->q_ptr)); 5625 } 5626 5627 connp = q->q_ptr; 5628 ip_quiesce_conn(connp); 5629 5630 qprocsoff(q); 5631 5632 /* 5633 * Now we are truly single threaded on this stream, and can 5634 * delete the things hanging off the connp, and finally the connp. 5635 * We removed this connp from the fanout list, so it cannot be 5636 * accessed through the fanouts, and we already waited for the 5637 * conn_ref to drop to 0. We are already in close, so 5638 * there cannot be any other thread from the top. qprocsoff 5639 * has completed, and service has completed or won't run in 5640 * the future. 5641 */ 5642 ASSERT(connp->conn_ref == 1); 5643 5644 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 5645 5646 connp->conn_ref--; 5647 ipcl_conn_destroy(connp); 5648 5649 q->q_ptr = WR(q)->q_ptr = NULL; 5650 return (0); 5651 } 5652 5653 /* 5654 * Wrapper around putnext() so that ip_rts_request can merely use 5655 * conn_recv. 5656 */ 5657 /*ARGSUSED2*/ 5658 static void 5659 ip_conn_input(void *arg1, mblk_t *mp, void *arg2) 5660 { 5661 conn_t *connp = (conn_t *)arg1; 5662 5663 putnext(connp->conn_rq, mp); 5664 } 5665 5666 /* 5667 * Called when the module is about to be unloaded 5668 */ 5669 void 5670 ip_ddi_destroy(void) 5671 { 5672 tnet_fini(); 5673 5674 icmp_ddi_g_destroy(); 5675 rts_ddi_g_destroy(); 5676 udp_ddi_g_destroy(); 5677 sctp_ddi_g_destroy(); 5678 tcp_ddi_g_destroy(); 5679 ipsec_policy_g_destroy(); 5680 ipcl_g_destroy(); 5681 ip_net_g_destroy(); 5682 ip_ire_g_fini(); 5683 inet_minor_destroy(ip_minor_arena_sa); 5684 #if defined(_LP64) 5685 inet_minor_destroy(ip_minor_arena_la); 5686 #endif 5687 5688 #ifdef DEBUG 5689 list_destroy(&ip_thread_list); 5690 rw_destroy(&ip_thread_rwlock); 5691 tsd_destroy(&ip_thread_data); 5692 #endif 5693 5694 netstack_unregister(NS_IP); 5695 } 5696 5697 /* 5698 * First step in cleanup. 5699 */ 5700 /* ARGSUSED */ 5701 static void 5702 ip_stack_shutdown(netstackid_t stackid, void *arg) 5703 { 5704 ip_stack_t *ipst = (ip_stack_t *)arg; 5705 5706 #ifdef NS_DEBUG 5707 printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid); 5708 #endif 5709 5710 /* Get rid of loopback interfaces and their IREs */ 5711 ip_loopback_cleanup(ipst); 5712 5713 /* 5714 * The *_hook_shutdown()s start the process of notifying any 5715 * consumers that things are going away; nothing is destroyed yet. 5716 */ 5717 ipv4_hook_shutdown(ipst); 5718 ipv6_hook_shutdown(ipst); 5719 5720 mutex_enter(&ipst->ips_capab_taskq_lock); 5721 ipst->ips_capab_taskq_quit = B_TRUE; 5722 cv_signal(&ipst->ips_capab_taskq_cv); 5723 mutex_exit(&ipst->ips_capab_taskq_lock); 5724 5725 mutex_enter(&ipst->ips_mrt_lock); 5726 ipst->ips_mrt_flags |= IP_MRT_STOP; 5727 cv_signal(&ipst->ips_mrt_cv); 5728 mutex_exit(&ipst->ips_mrt_lock); 5729 } 5730 5731 /* 5732 * Free the IP stack instance. 5733 */ 5734 static void 5735 ip_stack_fini(netstackid_t stackid, void *arg) 5736 { 5737 ip_stack_t *ipst = (ip_stack_t *)arg; 5738 int ret; 5739 5740 #ifdef NS_DEBUG 5741 printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); 5742 #endif 5743 /* 5744 * At this point, all of the notifications that the events and 5745 * protocols are going away have been run, meaning that we can 5746 * now set about starting to clean things up.
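 * (Division of labor, descriptive only: ip_stack_shutdown() above just
 * signals - it shuts down the hooks and tells the capab taskq and mrt
 * worker threads to exit - while the actual destruction of hooks,
 * kstats, timers, and locks happens here.)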
5747 */ 5748 ipv4_hook_destroy(ipst); 5749 ipv6_hook_destroy(ipst); 5750 ip_net_destroy(ipst); 5751 5752 mutex_destroy(&ipst->ips_capab_taskq_lock); 5753 cv_destroy(&ipst->ips_capab_taskq_cv); 5754 5755 mutex_enter(&ipst->ips_mrt_lock); 5756 while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) 5757 cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); 5758 mutex_destroy(&ipst->ips_mrt_lock); 5759 cv_destroy(&ipst->ips_mrt_cv); 5760 cv_destroy(&ipst->ips_mrt_done_cv); 5761 5762 ipmp_destroy(ipst); 5763 rw_destroy(&ipst->ips_srcid_lock); 5764 5765 ip_kstat_fini(stackid, ipst->ips_ip_mibkp); 5766 ipst->ips_ip_mibkp = NULL; 5767 icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp); 5768 ipst->ips_icmp_mibkp = NULL; 5769 ip_kstat2_fini(stackid, ipst->ips_ip_kstat); 5770 ipst->ips_ip_kstat = NULL; 5771 bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics)); 5772 ip6_kstat_fini(stackid, ipst->ips_ip6_kstat); 5773 ipst->ips_ip6_kstat = NULL; 5774 bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics)); 5775 5776 nd_free(&ipst->ips_ip_g_nd); 5777 kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr)); 5778 ipst->ips_param_arr = NULL; 5779 kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 5780 ipst->ips_ndp_arr = NULL; 5781 5782 ip_mrouter_stack_destroy(ipst); 5783 5784 mutex_destroy(&ipst->ips_ip_mi_lock); 5785 rw_destroy(&ipst->ips_ipsec_capab_ills_lock); 5786 rw_destroy(&ipst->ips_ill_g_usesrc_lock); 5787 rw_destroy(&ipst->ips_ip_g_nd_lock); 5788 5789 ret = untimeout(ipst->ips_igmp_timeout_id); 5790 if (ret == -1) { 5791 ASSERT(ipst->ips_igmp_timeout_id == 0); 5792 } else { 5793 ASSERT(ipst->ips_igmp_timeout_id != 0); 5794 ipst->ips_igmp_timeout_id = 0; 5795 } 5796 ret = untimeout(ipst->ips_igmp_slowtimeout_id); 5797 if (ret == -1) { 5798 ASSERT(ipst->ips_igmp_slowtimeout_id == 0); 5799 } else { 5800 ASSERT(ipst->ips_igmp_slowtimeout_id != 0); 5801 ipst->ips_igmp_slowtimeout_id = 0; 5802 } 5803 ret = untimeout(ipst->ips_mld_timeout_id); 5804 if (ret == -1) { 5805 ASSERT(ipst->ips_mld_timeout_id == 0); 5806 } else { 5807 ASSERT(ipst->ips_mld_timeout_id != 0); 5808 ipst->ips_mld_timeout_id = 0; 5809 } 5810 ret = untimeout(ipst->ips_mld_slowtimeout_id); 5811 if (ret == -1) { 5812 ASSERT(ipst->ips_mld_slowtimeout_id == 0); 5813 } else { 5814 ASSERT(ipst->ips_mld_slowtimeout_id != 0); 5815 ipst->ips_mld_slowtimeout_id = 0; 5816 } 5817 ret = untimeout(ipst->ips_ip_ire_expire_id); 5818 if (ret == -1) { 5819 ASSERT(ipst->ips_ip_ire_expire_id == 0); 5820 } else { 5821 ASSERT(ipst->ips_ip_ire_expire_id != 0); 5822 ipst->ips_ip_ire_expire_id = 0; 5823 } 5824 5825 mutex_destroy(&ipst->ips_igmp_timer_lock); 5826 mutex_destroy(&ipst->ips_mld_timer_lock); 5827 mutex_destroy(&ipst->ips_igmp_slowtimeout_lock); 5828 mutex_destroy(&ipst->ips_mld_slowtimeout_lock); 5829 mutex_destroy(&ipst->ips_ip_addr_avail_lock); 5830 rw_destroy(&ipst->ips_ill_g_lock); 5831 5832 ipobs_fini(ipst); 5833 ip_ire_fini(ipst); 5834 ip6_asp_free(ipst); 5835 conn_drain_fini(ipst); 5836 ipcl_destroy(ipst); 5837 5838 mutex_destroy(&ipst->ips_ndp4->ndp_g_lock); 5839 mutex_destroy(&ipst->ips_ndp6->ndp_g_lock); 5840 kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t)); 5841 ipst->ips_ndp4 = NULL; 5842 kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t)); 5843 ipst->ips_ndp6 = NULL; 5844 5845 if (ipst->ips_loopback_ksp != NULL) { 5846 kstat_delete_netstack(ipst->ips_loopback_ksp, stackid); 5847 ipst->ips_loopback_ksp = NULL; 5848 } 5849 5850 kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t)); 5851 ipst->ips_phyint_g_list = NULL; 5852 
kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS); 5853 ipst->ips_ill_g_heads = NULL; 5854 5855 ldi_ident_release(ipst->ips_ldi_ident); 5856 kmem_free(ipst, sizeof (*ipst)); 5857 } 5858 5859 /* 5860 * This function is called from the TSD destructor, and is used to debug 5861 * reference count issues in IP. See block comment in <inet/ip_if.h> for 5862 * details. 5863 */ 5864 static void 5865 ip_thread_exit(void *phash) 5866 { 5867 th_hash_t *thh = phash; 5868 5869 rw_enter(&ip_thread_rwlock, RW_WRITER); 5870 list_remove(&ip_thread_list, thh); 5871 rw_exit(&ip_thread_rwlock); 5872 mod_hash_destroy_hash(thh->thh_hash); 5873 kmem_free(thh, sizeof (*thh)); 5874 } 5875 5876 /* 5877 * Called when the IP kernel module is loaded into the kernel 5878 */ 5879 void 5880 ip_ddi_init(void) 5881 { 5882 ip_squeue_flag = ip_squeue_switch(ip_squeue_enter); 5883 5884 /* 5885 * For IP and TCP the minor numbers should start from 2 since we have 4 5886 * initial devices: ip, ip6, tcp, tcp6. 5887 */ 5888 /* 5889 * If this is a 64-bit kernel, then create two separate arenas - 5890 * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the 5891 * other for socket apps in the range 2^^18 through 2^^32-1. 5892 */ 5893 ip_minor_arena_la = NULL; 5894 ip_minor_arena_sa = NULL; 5895 #if defined(_LP64) 5896 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa", 5897 INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) { 5898 cmn_err(CE_PANIC, 5899 "ip_ddi_init: ip_minor_arena_sa creation failed\n"); 5900 } 5901 if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la", 5902 MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) { 5903 cmn_err(CE_PANIC, 5904 "ip_ddi_init: ip_minor_arena_la creation failed\n"); 5905 } 5906 #else 5907 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa", 5908 INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) { 5909 cmn_err(CE_PANIC, 5910 "ip_ddi_init: ip_minor_arena_sa creation failed\n"); 5911 } 5912 #endif 5913 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms); 5914 5915 ipcl_g_init(); 5916 ip_ire_g_init(); 5917 ip_net_g_init(); 5918 5919 #ifdef DEBUG 5920 tsd_create(&ip_thread_data, ip_thread_exit); 5921 rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL); 5922 list_create(&ip_thread_list, sizeof (th_hash_t), 5923 offsetof(th_hash_t, thh_link)); 5924 #endif 5925 5926 /* 5927 * We want to be informed each time a stack is created or 5928 * destroyed in the kernel, so we can maintain the 5929 * set of ip_stack_t's. 5930 */ 5931 netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown, 5932 ip_stack_fini); 5933 5934 ipsec_policy_g_init(); 5935 tcp_ddi_g_init(); 5936 sctp_ddi_g_init(); 5937 5938 tnet_init(); 5939 5940 udp_ddi_g_init(); 5941 rts_ddi_g_init(); 5942 icmp_ddi_g_init(); 5943 } 5944 5945 /* 5946 * Initialize the IP stack instance.
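 * (Called back via the netstack_register(NS_IP, ...) above once per
 * netstack as it is created, and paired with ip_stack_shutdown() and
 * ip_stack_fini() when it goes away.)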
5947 */ 5948 static void * 5949 ip_stack_init(netstackid_t stackid, netstack_t *ns) 5950 { 5951 ip_stack_t *ipst; 5952 ipparam_t *pa; 5953 ipndp_t *na; 5954 major_t major; 5955 5956 #ifdef NS_DEBUG 5957 printf("ip_stack_init(stack %d)\n", stackid); 5958 #endif 5959 5960 ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP); 5961 ipst->ips_netstack = ns; 5962 5963 ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS, 5964 KM_SLEEP); 5965 ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t), 5966 KM_SLEEP); 5967 ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5968 ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5969 mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5970 mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5971 5972 rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL); 5973 mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5974 ipst->ips_igmp_deferred_next = INFINITY; 5975 mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5976 ipst->ips_mld_deferred_next = INFINITY; 5977 mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5978 mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5979 mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); 5980 mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); 5981 rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL); 5982 rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); 5983 rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); 5984 5985 ipcl_init(ipst); 5986 ip_ire_init(ipst); 5987 ip6_asp_init(ipst); 5988 ipif_init(ipst); 5989 conn_drain_init(ipst); 5990 ip_mrouter_stack_init(ipst); 5991 5992 ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT; 5993 ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; 5994 ipst->ips_ipv6_frag_timeout = IPV6_FRAG_TIMEOUT; 5995 ipst->ips_ipv6_frag_timo_ms = IPV6_FRAG_TIMEOUT * 1000; 5996 5997 ipst->ips_ip_multirt_log_interval = 1000; 5998 5999 ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT; 6000 ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT; 6001 ipst->ips_ill_index = 1; 6002 6003 ipst->ips_saved_ip_g_forward = -1; 6004 ipst->ips_reg_vif_num = ALL_VIFS; /* Index to Register vif */ 6005 6006 pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP); 6007 ipst->ips_param_arr = pa; 6008 bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr)); 6009 6010 na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP); 6011 ipst->ips_ndp_arr = na; 6012 bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 6013 ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data = 6014 (caddr_t)&ipst->ips_ip_g_forward; 6015 ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data = 6016 (caddr_t)&ipst->ips_ipv6_forward; 6017 ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name, 6018 "ip_cgtp_filter") == 0); 6019 ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = 6020 (caddr_t)&ipst->ips_ip_cgtp_filter; 6021 6022 (void) ip_param_register(&ipst->ips_ip_g_nd, 6023 ipst->ips_param_arr, A_CNT(lcl_param_arr), 6024 ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr)); 6025 6026 ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst); 6027 ipst->ips_icmp_mibkp = icmp_kstat_init(stackid); 6028 ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics); 6029 ipst->ips_ip6_kstat = 6030 ip6_kstat_init(stackid, &ipst->ips_ip6_statistics); 6031 6032 ipst->ips_ip_src_id = 1; 6033 
rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); 6034 6035 ipobs_init(ipst); 6036 ip_net_init(ipst, ns); 6037 ipv4_hook_init(ipst); 6038 ipv6_hook_init(ipst); 6039 ipmp_init(ipst); 6040 6041 /* 6042 * Create the taskq dispatcher thread and initialize related stuff. 6043 */ 6044 ipst->ips_capab_taskq_thread = thread_create(NULL, 0, 6045 ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri); 6046 mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL); 6047 cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL); 6048 6049 /* 6050 * Create the mcast_restart_timers_thread() worker thread. 6051 */ 6052 mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); 6053 cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); 6054 cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); 6055 ipst->ips_mrt_thread = thread_create(NULL, 0, 6056 mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); 6057 6058 major = mod_name_to_major(INET_NAME); 6059 (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); 6060 return (ipst); 6061 } 6062 6063 /* 6064 * Allocate and initialize a DLPI template of the specified length. (May be 6065 * called as writer.) 6066 */ 6067 mblk_t * 6068 ip_dlpi_alloc(size_t len, t_uscalar_t prim) 6069 { 6070 mblk_t *mp; 6071 6072 mp = allocb(len, BPRI_MED); 6073 if (!mp) 6074 return (NULL); 6075 6076 /* 6077 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter 6078 * of which we don't seem to use) are sent with M_PCPROTO, and 6079 * that other DLPI messages are M_PROTO. 6080 */ 6081 if (prim == DL_INFO_REQ) { 6082 mp->b_datap->db_type = M_PCPROTO; 6083 } else { 6084 mp->b_datap->db_type = M_PROTO; 6085 } 6086 6087 mp->b_wptr = mp->b_rptr + len; 6088 bzero(mp->b_rptr, len); 6089 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; 6090 return (mp); 6091 } 6092 6093 /* 6094 * Allocate and initialize a DLPI notification. (May be called as writer.) 6095 */ 6096 mblk_t * 6097 ip_dlnotify_alloc(uint_t notification, uint_t data) 6098 { 6099 dl_notify_ind_t *notifyp; 6100 mblk_t *mp; 6101 6102 if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL) 6103 return (NULL); 6104 6105 notifyp = (dl_notify_ind_t *)mp->b_rptr; 6106 notifyp->dl_notification = notification; 6107 notifyp->dl_data = data; 6108 return (mp); 6109 } 6110 6111 /* 6112 * Debug formatting routine. Returns a character string representation of the 6113 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 6114 * in the form of an ipaddr_t and calls ip_dot_saddr with a pointer. 6115 * 6116 * Once the ndd table-printing interfaces are removed, this can be changed to 6117 * standard dotted-decimal form. 6118 */ 6119 char * 6120 ip_dot_addr(ipaddr_t addr, char *buf) 6121 { 6122 uint8_t *ap = (uint8_t *)&addr; 6123 6124 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", 6125 ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF); 6126 return (buf); 6127 } 6128 6129 /* 6130 * Write the given MAC address as a printable string in the usual colon- 6131 * separated format. 6132 */ 6133 const char * 6134 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) 6135 { 6136 char *bp; 6137 6138 if (alen == 0 || buflen < 4) 6139 return ("?"); 6140 bp = buf; 6141 for (;;) { 6142 /* 6143 * If there are more MAC address bytes available, but we won't 6144 * have any room to print them, then add "..." to the string 6145 * instead. See below for the 'magic number' explanation.
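 * (Hedged example: a 6-byte address needs buflen >= 18 for the full
 * "xx:xx:xx:xx:xx:xx" form; with buflen == 8 the loop emits the first
 * pair and a colon and then truncates, yielding e.g. "00:...".)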
6146 */ 6147 if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) { 6148 (void) strcpy(bp, "..."); 6149 break; 6150 } 6151 (void) sprintf(bp, "%02x", *addr++); 6152 bp += 2; 6153 if (--alen == 0) 6154 break; 6155 *bp++ = ':'; 6156 buflen -= 3; 6157 /* 6158 * At this point, based on the first 'if' statement above, 6159 * either alen == 1 and buflen >= 3, or alen > 1 and 6160 * buflen >= 4. The first case leaves room for the final "xx" 6161 * number and trailing NUL byte. The second leaves room for at 6162 * least "...". Thus the apparently 'magic' numbers chosen for 6163 * that statement. 6164 */ 6165 } 6166 return (buf); 6167 } 6168 6169 /* 6170 * Send an ICMP error after patching up the packet appropriately. Returns 6171 * B_TRUE if the appropriate MIB should be bumped; B_FALSE otherwise. 6172 */ 6173 static boolean_t 6174 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, 6175 uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, 6176 zoneid_t zoneid, ip_stack_t *ipst) 6177 { 6178 ipha_t *ipha; 6179 mblk_t *first_mp; 6180 boolean_t secure; 6181 unsigned char db_type; 6182 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6183 6184 first_mp = mp; 6185 if (mctl_present) { 6186 mp = mp->b_cont; 6187 secure = ipsec_in_is_secure(first_mp); 6188 ASSERT(mp != NULL); 6189 } else { 6190 /* 6191 * If this is an ICMP error being reported - which goes 6192 * up as an M_CTL - we need to convert it to M_DATA until 6193 * we finish checking with global policy, because 6194 * ipsec_check_global_policy() assumes M_DATA is clear 6195 * and M_CTL is secure. 6196 */ 6197 db_type = DB_TYPE(mp); 6198 DB_TYPE(mp) = M_DATA; 6199 secure = B_FALSE; 6200 } 6201 /* 6202 * We are generating an icmp error for some inbound packet. 6203 * Called from all ip_fanout_(udp, tcp, proto) functions. 6204 * Before we generate an error, check with global policy 6205 * to see whether this is allowed to enter the system. As 6206 * there is no "conn", we are checking with global policy. 6207 */ 6208 ipha = (ipha_t *)mp->b_rptr; 6209 if (secure || ipss->ipsec_inbound_v4_policy_present) { 6210 first_mp = ipsec_check_global_policy(first_mp, NULL, 6211 ipha, NULL, mctl_present, ipst->ips_netstack); 6212 if (first_mp == NULL) 6213 return (B_FALSE); 6214 } 6215 6216 if (!mctl_present) 6217 DB_TYPE(mp) = db_type; 6218 6219 if (flags & IP_FF_SEND_ICMP) { 6220 if (flags & IP_FF_HDR_COMPLETE) { 6221 if (ip_hdr_complete(ipha, zoneid, ipst)) { 6222 freemsg(first_mp); 6223 return (B_TRUE); 6224 } 6225 } 6226 if (flags & IP_FF_CKSUM) { 6227 /* 6228 * We have to correct the checksum since 6229 * the packet might have been 6230 * fragmented and the reassembly code in ip_rput 6231 * does not restore the IP checksum. 6232 */ 6233 ipha->ipha_hdr_checksum = 0; 6234 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 6235 } 6236 switch (icmp_type) { 6237 case ICMP_DEST_UNREACHABLE: 6238 icmp_unreachable(WR(q), first_mp, icmp_code, zoneid, 6239 ipst); 6240 break; 6241 default: 6242 freemsg(first_mp); 6243 break; 6244 } 6245 } else { 6246 freemsg(first_mp); 6247 return (B_FALSE); 6248 } 6249 6250 return (B_TRUE); 6251 } 6252 6253 /* 6254 * Used to send an ICMP error message when a packet is received for 6255 * a protocol that is not supported. The mblk passed as argument 6256 * is consumed by this function.
6257 */ 6258 void 6259 ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid, 6260 ip_stack_t *ipst) 6261 { 6262 mblk_t *mp; 6263 ipha_t *ipha; 6264 ill_t *ill; 6265 ipsec_in_t *ii; 6266 6267 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 6268 ASSERT(ii->ipsec_in_type == IPSEC_IN); 6269 6270 mp = ipsec_mp->b_cont; 6271 ipsec_mp->b_cont = NULL; 6272 ipha = (ipha_t *)mp->b_rptr; 6273 /* Get ill from index in ipsec_in_t. */ 6274 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 6275 (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL, 6276 ipst); 6277 if (ill != NULL) { 6278 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 6279 if (ip_fanout_send_icmp(q, mp, flags, 6280 ICMP_DEST_UNREACHABLE, 6281 ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) { 6282 BUMP_MIB(ill->ill_ip_mib, 6283 ipIfStatsInUnknownProtos); 6284 } 6285 } else { 6286 if (ip_fanout_send_icmp_v6(q, mp, flags, 6287 ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, 6288 0, B_FALSE, zoneid, ipst)) { 6289 BUMP_MIB(ill->ill_ip_mib, 6290 ipIfStatsInUnknownProtos); 6291 } 6292 } 6293 ill_refrele(ill); 6294 } else { /* re-link for the freemsg() below. */ 6295 ipsec_mp->b_cont = mp; 6296 } 6297 6298 /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ 6299 freemsg(ipsec_mp); 6300 } 6301 6302 /* 6303 * See if the inbound datagram has had IPsec processing applied to it. 6304 */ 6305 boolean_t 6306 ipsec_in_is_secure(mblk_t *ipsec_mp) 6307 { 6308 ipsec_in_t *ii; 6309 6310 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 6311 ASSERT(ii->ipsec_in_type == IPSEC_IN); 6312 6313 if (ii->ipsec_in_loopback) { 6314 return (ii->ipsec_in_secure); 6315 } else { 6316 return (ii->ipsec_in_ah_sa != NULL || 6317 ii->ipsec_in_esp_sa != NULL || 6318 ii->ipsec_in_decaps); 6319 } 6320 } 6321 6322 /* 6323 * Handle protocols with which IP is less intimate. There 6324 * can be more than one stream bound to a particular 6325 * protocol. When this is the case, normally each one gets a copy 6326 * of any incoming packets. 6327 * 6328 * IPsec NOTE : 6329 * 6330 * Don't allow a secure packet going up a non-secure connection. 6331 * We don't allow this because 6332 * 6333 * 1) Reply might go out in clear which will be dropped at 6334 * the sending side. 6335 * 2) If the reply goes out in clear it will give the 6336 * adversary enough information for getting the key in 6337 * most of the cases. 6338 * 6339 * Moreover getting a secure packet when we expect clear 6340 * implies that SA's were added without checking for 6341 * policy on both ends. This should not happen once ISAKMP 6342 * is used to negotiate SAs as SAs will be added only after 6343 * verifying the policy. 6344 * 6345 * IPQoS Notes: 6346 * Once we have determined the client, invoke IPPF processing. 6347 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 6348 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 6349 * ip_policy will be false. 6350 * 6351 * Zones notes: 6352 * Currently only applications in the global zone can create raw sockets for 6353 * protocols other than ICMP. So unlike the broadcast / multicast case of 6354 * ip_fanout_udp(), we only send a copy of the packet to streams in the 6355 * specified zone. For ICMP, this is handled by the callers of icmp_inbound(). 
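 *
 * Delivery sketch (descriptive only): the first matching conn is
 * refheld; each later match is handed a dupmsg()/ip_copymsg() copy as
 * it is found, and the original message goes to that first match last.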
6356 */ 6357 static void 6358 ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, 6359 boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, 6360 zoneid_t zoneid) 6361 { 6362 queue_t *rq; 6363 mblk_t *mp1, *first_mp1; 6364 uint_t protocol = ipha->ipha_protocol; 6365 ipaddr_t dst; 6366 mblk_t *first_mp = mp; 6367 boolean_t secure; 6368 uint32_t ill_index; 6369 conn_t *connp, *first_connp, *next_connp; 6370 connf_t *connfp; 6371 boolean_t shared_addr; 6372 mib2_ipIfStatsEntry_t *mibptr; 6373 ip_stack_t *ipst = recv_ill->ill_ipst; 6374 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6375 6376 mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; 6377 if (mctl_present) { 6378 mp = first_mp->b_cont; 6379 secure = ipsec_in_is_secure(first_mp); 6380 ASSERT(mp != NULL); 6381 } else { 6382 secure = B_FALSE; 6383 } 6384 dst = ipha->ipha_dst; 6385 shared_addr = (zoneid == ALL_ZONES); 6386 if (shared_addr) { 6387 /* 6388 * We don't allow multilevel ports for raw IP, so no need to 6389 * check for that here. 6390 */ 6391 zoneid = tsol_packet_to_zoneid(mp); 6392 } 6393 6394 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 6395 mutex_enter(&connfp->connf_lock); 6396 connp = connfp->connf_head; 6397 for (connp = connfp->connf_head; connp != NULL; 6398 connp = connp->conn_next) { 6399 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags, 6400 zoneid) && 6401 (!is_system_labeled() || 6402 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6403 connp))) { 6404 break; 6405 } 6406 } 6407 6408 if (connp == NULL) { 6409 /* 6410 * No one bound to these addresses. Is 6411 * there a client that wants all 6412 * unclaimed datagrams? 6413 */ 6414 mutex_exit(&connfp->connf_lock); 6415 /* 6416 * Check for IPPROTO_ENCAP... 6417 */ 6418 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { 6419 /* 6420 * If an IPsec mblk is here on a multicast 6421 * tunnel (using ip_mroute stuff), check policy here, 6422 * THEN ship off to ip_mroute_decap(). 6423 * 6424 * BTW, If I match a configured IP-in-IP 6425 * tunnel, this path will not be reached, and 6426 * ip_mroute_decap will never be called. 6427 */ 6428 first_mp = ipsec_check_global_policy(first_mp, connp, 6429 ipha, NULL, mctl_present, ipst->ips_netstack); 6430 if (first_mp != NULL) { 6431 if (mctl_present) 6432 freeb(first_mp); 6433 ip_mroute_decap(q, mp, ill); 6434 } /* Else we already freed everything! */ 6435 } else { 6436 /* 6437 * Otherwise send an ICMP protocol unreachable. 6438 */ 6439 if (ip_fanout_send_icmp(q, first_mp, flags, 6440 ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE, 6441 mctl_present, zoneid, ipst)) { 6442 BUMP_MIB(mibptr, ipIfStatsInUnknownProtos); 6443 } 6444 } 6445 return; 6446 } 6447 6448 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 6449 6450 CONN_INC_REF(connp); 6451 first_connp = connp; 6452 connp = connp->conn_next; 6453 6454 for (;;) { 6455 while (connp != NULL) { 6456 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, 6457 flags, zoneid) && 6458 (!is_system_labeled() || 6459 tsol_receive_local(mp, &dst, IPV4_VERSION, 6460 shared_addr, connp))) 6461 break; 6462 connp = connp->conn_next; 6463 } 6464 6465 /* 6466 * Copy the packet. 
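 * dupmsg() shares the underlying data blocks; if it fails we fall
 * back to ip_copymsg(), which (as assumed here) makes an independent
 * copy, including any leading IPsec M_CTL.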
6467 */ 6468 if (connp == NULL || 6469 (((first_mp1 = dupmsg(first_mp)) == NULL) && 6470 ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { 6471 /* 6472 * No more interested clients, or memory 6473 * allocation failed. 6474 */ 6475 connp = first_connp; 6476 break; 6477 } 6478 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); 6479 mp1 = mctl_present ? first_mp1->b_cont : first_mp1; 6480 CONN_INC_REF(connp); 6481 mutex_exit(&connfp->connf_lock); 6482 rq = connp->conn_rq; 6483 6484 /* 6485 * Check flow control 6486 */ 6487 if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || 6488 (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { 6489 if (flags & IP_FF_RAWIP) { 6490 BUMP_MIB(mibptr, rawipIfStatsInOverflows); 6491 } else { 6492 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); 6493 } 6494 6495 freemsg(first_mp1); 6496 } else { 6497 /* 6498 * Enforce policy like any other conn_t. Note that 6499 * IP-in-IP packets don't come through here, but 6500 * through ip_iptun_input() or 6501 * icmp_inbound_iptun_fanout(). IPsec policy for such 6502 * packets is enforced in the iptun module. 6503 */ 6504 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 6505 secure) { 6506 first_mp1 = ipsec_check_inbound_policy 6507 (first_mp1, connp, ipha, NULL, 6508 mctl_present); 6509 } 6510 if (first_mp1 != NULL) { 6511 int in_flags = 0; 6512 /* 6513 * ip_fanout_proto also gets called from 6514 * icmp_inbound_error_fanout, in which case 6515 * the msg type is M_CTL. Don't add info 6516 * in this case for the time being. In the 6517 * future, when there is a need to know the 6518 * inbound iface index for ICMP error msgs, 6519 * this can be changed. 6520 */ 6521 if (connp->conn_recvif) 6522 in_flags = IPF_RECVIF; 6523 /* 6524 * The ULP may support IP_RECVPKTINFO for both 6525 * IPv4 and IPv6, so pass the appropriate argument 6526 * based on the conn IP version. 6527 */ 6528 if (connp->conn_ip_recvpktinfo) { 6529 if (connp->conn_af_isv6) { 6530 /* 6531 * V6 only needs index 6532 */ 6533 in_flags |= IPF_RECVIF; 6534 } else { 6535 /* 6536 * V4 needs index + 6537 * matching address. 6538 */ 6539 in_flags |= IPF_RECVADDR; 6540 } 6541 } 6542 if ((in_flags != 0) && 6543 (mp->b_datap->db_type != M_CTL)) { 6544 /* 6545 * On successful return of the 6546 * following call, the actual data 6547 * will be contained in b_cont; 6548 * otherwise the original mblk is 6549 * returned. 6550 */ 6551 ASSERT(recv_ill != NULL); 6552 mp1 = ip_add_info(mp1, recv_ill, 6553 in_flags, IPCL_ZONEID(connp), ipst); 6554 } 6555 BUMP_MIB(mibptr, ipIfStatsHCInDelivers); 6556 if (mctl_present) 6557 freeb(first_mp1); 6558 (connp->conn_recv)(connp, mp1, NULL); 6559 } 6560 } 6561 mutex_enter(&connfp->connf_lock); 6562 /* Follow the next pointer before releasing the conn. */ 6563 next_connp = connp->conn_next; 6564 CONN_DEC_REF(connp); 6565 connp = next_connp; 6566 } 6567 6568 /* Last one. Send it upstream. */ 6569 mutex_exit(&connfp->connf_lock); 6570 6571 /* 6572 * If this packet is coming from icmp_inbound_error_fanout, ip_policy 6573 * will be set to false.
6574 */ 6575 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6576 ill_index = ill->ill_phyint->phyint_ifindex; 6577 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6578 if (mp == NULL) { 6579 CONN_DEC_REF(connp); 6580 if (mctl_present) { 6581 freeb(first_mp); 6582 } 6583 return; 6584 } 6585 } 6586 6587 rq = connp->conn_rq; 6588 /* 6589 * Check flow control 6590 */ 6591 if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || 6592 (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { 6593 if (flags & IP_FF_RAWIP) { 6594 BUMP_MIB(mibptr, rawipIfStatsInOverflows); 6595 } else { 6596 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); 6597 } 6598 6599 freemsg(first_mp); 6600 } else { 6601 ASSERT(!IPCL_IS_IPTUN(connp)); 6602 6603 if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) { 6604 first_mp = ipsec_check_inbound_policy(first_mp, connp, 6605 ipha, NULL, mctl_present); 6606 } 6607 6608 if (first_mp != NULL) { 6609 int in_flags = 0; 6610 6611 /* 6612 * ip_fanout_proto also gets called 6613 * from icmp_inbound_error_fanout, in 6614 * which case the msg type is M_CTL. 6615 * Don't add info in this case for the 6616 * time being. In the future, when there 6617 * is a need to know the inbound iface 6618 * index for ICMP error msgs, this 6619 * can be changed. 6620 */ 6621 if (connp->conn_recvif) 6622 in_flags = IPF_RECVIF; 6623 if (connp->conn_ip_recvpktinfo) { 6624 if (connp->conn_af_isv6) { 6625 /* 6626 * V6 only needs index 6627 */ 6628 in_flags |= IPF_RECVIF; 6629 } else { 6630 /* 6631 * V4 needs index + 6632 * matching address. 6633 */ 6634 in_flags |= IPF_RECVADDR; 6635 } 6636 } 6637 if ((in_flags != 0) && 6638 (mp->b_datap->db_type != M_CTL)) { 6639 6640 /* 6641 * On successful return of the following 6642 * call, the actual data will be 6643 * contained in b_cont; otherwise the 6644 * original mblk is returned. 6645 */ 6646 ASSERT(recv_ill != NULL); 6647 mp = ip_add_info(mp, recv_ill, 6648 in_flags, IPCL_ZONEID(connp), ipst); 6649 } 6650 BUMP_MIB(mibptr, ipIfStatsHCInDelivers); 6651 (connp->conn_recv)(connp, mp, NULL); 6652 if (mctl_present) 6653 freeb(first_mp); 6654 } 6655 } 6656 CONN_DEC_REF(connp); 6657 } 6658 6659 /* 6660 * Serialize tcp resets by calling tcp_xmit_reset_serialize through 6661 * SQUEUE_ENTER_ONE(SQ_FILL). We do this to ensure the reset is handled on 6662 * the correct squeue, in this case the same squeue as a valid listener with 6663 * no current connection state for the packet we are processing. The function 6664 * is called for synchronizing both IPv4 and IPv6. 6665 */ 6666 void 6667 ip_xmit_reset_serialize(mblk_t *mp, int hdrlen, zoneid_t zoneid, 6668 tcp_stack_t *tcps, conn_t *connp) 6669 { 6670 mblk_t *rst_mp; 6671 tcp_xmit_reset_event_t *eventp; 6672 6673 rst_mp = allocb(sizeof (tcp_xmit_reset_event_t), BPRI_HI); 6674 6675 if (rst_mp == NULL) { 6676 freemsg(mp); 6677 return; 6678 } 6679 6680 rst_mp->b_datap->db_type = M_PROTO; 6681 rst_mp->b_wptr += sizeof (tcp_xmit_reset_event_t); 6682 6683 eventp = (tcp_xmit_reset_event_t *)rst_mp->b_rptr; 6684 eventp->tcp_xre_event = TCP_XRE_EVENT_IP_FANOUT_TCP; 6685 eventp->tcp_xre_iphdrlen = hdrlen; 6686 eventp->tcp_xre_zoneid = zoneid; 6687 eventp->tcp_xre_tcps = tcps; 6688 6689 rst_mp->b_cont = mp; 6690 mp = rst_mp; 6691 6692 /* 6693 * Increment the connref; this ref will be released by the squeue 6694 * framework.
6695 */ 6696 CONN_INC_REF(connp); 6697 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_xmit_reset, connp, 6698 SQ_FILL, SQTAG_XMIT_EARLY_RESET); 6699 } 6700 6701 /* 6702 * Fanout for TCP packets 6703 * The caller puts <fport, lport> in the ports parameter. 6704 * 6705 * IPQoS Notes 6706 * Before sending it to the client, invoke IPPF processing. 6707 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 6708 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 6709 * ip_policy is false. 6710 */ 6711 static void 6712 ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, 6713 uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) 6714 { 6715 mblk_t *first_mp; 6716 boolean_t secure; 6717 uint32_t ill_index; 6718 int ip_hdr_len; 6719 tcph_t *tcph; 6720 boolean_t syn_present = B_FALSE; 6721 conn_t *connp; 6722 ip_stack_t *ipst = recv_ill->ill_ipst; 6723 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6724 6725 ASSERT(recv_ill != NULL); 6726 6727 first_mp = mp; 6728 if (mctl_present) { 6729 ASSERT(first_mp->b_datap->db_type == M_CTL); 6730 mp = first_mp->b_cont; 6731 secure = ipsec_in_is_secure(first_mp); 6732 ASSERT(mp != NULL); 6733 } else { 6734 secure = B_FALSE; 6735 } 6736 6737 ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); 6738 6739 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 6740 zoneid, ipst)) == NULL) { 6741 /* 6742 * No connected connection or listener. Send a 6743 * TH_RST via tcp_xmit_listeners_reset. 6744 */ 6745 6746 /* Initiate IPPf processing, if needed. */ 6747 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 6748 uint32_t ill_index; 6749 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6750 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 6751 if (first_mp == NULL) 6752 return; 6753 } 6754 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6755 ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", 6756 zoneid)); 6757 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6758 ipst->ips_netstack->netstack_tcp, NULL); 6759 return; 6760 } 6761 6762 /* 6763 * Allocate the SYN for the TCP connection here itself 6764 */ 6765 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6766 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 6767 if (IPCL_IS_TCP(connp)) { 6768 squeue_t *sqp; 6769 6770 /* 6771 * If the queue belongs to a conn, and fused tcp 6772 * loopback is enabled, assign the eager's squeue 6773 * to be that of the active connect's. Note that 6774 * we don't check for IP_FF_LOOPBACK here since this 6775 * routine gets called only for loopback (unlike the 6776 * IPv6 counterpart). 
6777 */ 6778 if (do_tcp_fusion && 6779 CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) && 6780 !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && 6781 !secure && 6782 !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy) { 6783 ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); 6784 sqp = Q_TO_CONN(q)->conn_sqp; 6785 } else { 6786 sqp = IP_SQUEUE_GET(lbolt); 6787 } 6788 6789 mp->b_datap->db_struioflag |= STRUIO_EAGER; 6790 DB_CKSUMSTART(mp) = (intptr_t)sqp; 6791 syn_present = B_TRUE; 6792 } 6793 } 6794 6795 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 6796 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 6797 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6798 if ((flags & TH_RST) || (flags & TH_URG)) { 6799 CONN_DEC_REF(connp); 6800 freemsg(first_mp); 6801 return; 6802 } 6803 if (flags & TH_ACK) { 6804 ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, 6805 ipst->ips_netstack->netstack_tcp, connp); 6806 CONN_DEC_REF(connp); 6807 return; 6808 } 6809 6810 CONN_DEC_REF(connp); 6811 freemsg(first_mp); 6812 return; 6813 } 6814 6815 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 6816 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6817 NULL, mctl_present); 6818 if (first_mp == NULL) { 6819 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6820 CONN_DEC_REF(connp); 6821 return; 6822 } 6823 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 6824 ASSERT(syn_present); 6825 if (mctl_present) { 6826 ASSERT(first_mp != mp); 6827 first_mp->b_datap->db_struioflag |= 6828 STRUIO_POLICY; 6829 } else { 6830 ASSERT(first_mp == mp); 6831 mp->b_datap->db_struioflag &= 6832 ~STRUIO_EAGER; 6833 mp->b_datap->db_struioflag |= 6834 STRUIO_POLICY; 6835 } 6836 } else { 6837 /* 6838 * Discard first_mp early since we're dealing with a 6839 * fully-connected conn_t and tcp doesn't do policy in 6840 * this case. 6841 */ 6842 if (mctl_present) { 6843 freeb(first_mp); 6844 mctl_present = B_FALSE; 6845 } 6846 first_mp = mp; 6847 } 6848 } 6849 6850 /* 6851 * Initiate policy processing here if needed. If we get here from 6852 * icmp_inbound_error_fanout, ip_policy is false. 6853 */ 6854 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6855 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6856 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6857 if (mp == NULL) { 6858 CONN_DEC_REF(connp); 6859 if (mctl_present) 6860 freeb(first_mp); 6861 return; 6862 } else if (mctl_present) { 6863 ASSERT(first_mp != mp); 6864 first_mp->b_cont = mp; 6865 } else { 6866 first_mp = mp; 6867 } 6868 } 6869 6870 /* Handle socket options. */ 6871 if (!syn_present && 6872 connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 6873 /* Add header */ 6874 ASSERT(recv_ill != NULL); 6875 /* 6876 * Since tcp does not support IP_RECVPKTINFO for V4, only pass 6877 * IPF_RECVIF. 6878 */ 6879 mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), 6880 ipst); 6881 if (mp == NULL) { 6882 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6883 CONN_DEC_REF(connp); 6884 if (mctl_present) 6885 freeb(first_mp); 6886 return; 6887 } else if (mctl_present) { 6888 /* 6889 * ip_add_info might return a new mp. 
6890 */ 6891 ASSERT(first_mp != mp); 6892 first_mp->b_cont = mp; 6893 } else { 6894 first_mp = mp; 6895 } 6896 } 6897 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6898 if (IPCL_IS_TCP(connp)) { 6899 /* do not drain, certain use cases can blow the stack */ 6900 SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, 6901 connp, SQ_NODRAIN, SQTAG_IP_FANOUT_TCP); 6902 } else { 6903 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 6904 (connp->conn_recv)(connp, first_mp, NULL); 6905 CONN_DEC_REF(connp); 6906 } 6907 } 6908 6909 /* 6910 * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or 6911 * pass it along to ESP if the SPI is non-zero. Returns TRUE if the mblk 6912 * is not consumed. 6913 * 6914 * One of four things can happen, all of which affect the passed-in mblk: 6915 * 6916 * 1.) ICMP messages that go through here just get returned TRUE. 6917 * 6918 * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. 6919 * 6920 * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent 6921 * ESP packet, and is passed along to ESP for consumption. Return FALSE. 6922 * 6923 * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. 6924 */ 6925 static boolean_t 6926 zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, 6927 ipsec_stack_t *ipss) 6928 { 6929 int shift, plen, iph_len; 6930 ipha_t *ipha; 6931 udpha_t *udpha; 6932 uint32_t *spi; 6933 uint32_t esp_ports; 6934 uint8_t *orptr; 6935 boolean_t free_ire; 6936 6937 if (DB_TYPE(mp) == M_CTL) { 6938 /* 6939 * ICMP message with UDP inside. Don't bother stripping, just 6940 * send it up. 6941 * 6942 * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going 6943 * to ignore errors set by ICMP anyway ('cause they might be 6944 * forged), but that's the app's decision, not ours. 6945 */ 6946 6947 /* Bunch of reality checks for DEBUG kernels... */ 6948 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 6949 ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); 6950 6951 return (B_TRUE); 6952 } 6953 6954 ipha = (ipha_t *)mp->b_rptr; 6955 iph_len = IPH_HDR_LENGTH(ipha); 6956 plen = ntohs(ipha->ipha_length); 6957 6958 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { 6959 /* 6960 * Most likely a keepalive for the benefit of an intervening 6961 * NAT. These aren't for us, per se, so drop it. 6962 * 6963 * RFC 3947/8 doesn't say for sure what to do for 2-3 6964 * byte packets (keepalives are 1-byte), but we'll drop them 6965 * also. 6966 */ 6967 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6968 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); 6969 return (B_FALSE); 6970 } 6971 6972 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { 6973 /* might as well pull it all up - it might be ESP. */ 6974 if (!pullupmsg(mp, -1)) { 6975 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6976 DROPPER(ipss, ipds_esp_nomem), 6977 &ipss->ipsec_dropper); 6978 return (B_FALSE); 6979 } 6980 6981 ipha = (ipha_t *)mp->b_rptr; 6982 } 6983 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t)); 6984 if (*spi == 0) { 6985 /* UDP packet - remove 0-spi. */ 6986 shift = sizeof (uint32_t); 6987 } else { 6988 /* ESP-in-UDP packet - reduce to ESP. 
*/ 6989 ipha->ipha_protocol = IPPROTO_ESP; 6990 shift = sizeof (udpha_t); 6991 } 6992 6993 /* Fix IP header */ 6994 ipha->ipha_length = htons(plen - shift); 6995 ipha->ipha_hdr_checksum = 0; 6996 6997 orptr = mp->b_rptr; 6998 mp->b_rptr += shift; 6999 7000 udpha = (udpha_t *)(orptr + iph_len); 7001 if (*spi == 0) { 7002 ASSERT((uint8_t *)ipha == orptr); 7003 udpha->uha_length = htons(plen - shift - iph_len); 7004 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */ 7005 esp_ports = 0; 7006 } else { 7007 esp_ports = *((uint32_t *)udpha); 7008 ASSERT(esp_ports != 0); 7009 } 7010 ovbcopy(orptr, orptr + shift, iph_len); 7011 if (esp_ports != 0) /* Punt up for ESP processing. */ { 7012 ipha = (ipha_t *)(orptr + shift); 7013 7014 free_ire = (ire == NULL); 7015 if (free_ire) { 7016 /* Re-acquire ire. */ 7017 ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, 7018 ipss->ipsec_netstack->netstack_ip); 7019 if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { 7020 if (ire != NULL) 7021 ire_refrele(ire); 7022 /* 7023 * Do a regular freemsg(), as this is an IP 7024 * error (no local route) not an IPsec one. 7025 */ 7026 freemsg(mp); 7027 } 7028 } 7029 7030 ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports); 7031 if (free_ire) 7032 ire_refrele(ire); 7033 } 7034 7035 return (esp_ports == 0); 7036 } 7037 7038 /* 7039 * Deliver a udp packet to the given conn, possibly applying ipsec policy. 7040 * We are responsible for disposing of mp, such as by freemsg() or putnext() 7041 * Caller is responsible for dropping references to the conn, and freeing 7042 * first_mp. 7043 * 7044 * IPQoS Notes 7045 * Before sending it to the client, invoke IPPF processing. Policy processing 7046 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and 7047 * ip_policy is true. If we get here from icmp_inbound_error_fanout or 7048 * ip_wput_local, ip_policy is false. 7049 */ 7050 static void 7051 ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, 7052 boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, 7053 boolean_t ip_policy) 7054 { 7055 boolean_t mctl_present = (first_mp != NULL); 7056 uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ 7057 uint32_t ill_index; 7058 ip_stack_t *ipst = recv_ill->ill_ipst; 7059 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 7060 7061 ASSERT(ill != NULL); 7062 7063 if (mctl_present) 7064 first_mp->b_cont = mp; 7065 else 7066 first_mp = mp; 7067 7068 if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || 7069 (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { 7070 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 7071 freemsg(first_mp); 7072 return; 7073 } 7074 7075 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 7076 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 7077 NULL, mctl_present); 7078 /* Freed by ipsec_check_inbound_policy(). */ 7079 if (first_mp == NULL) { 7080 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 7081 return; 7082 } 7083 } 7084 if (mctl_present) 7085 freeb(first_mp); 7086 7087 /* Let's hope the compilers utter "branch, predict-not-taken..." ;) */ 7088 if (connp->conn_udp->udp_nat_t_endpoint) { 7089 if (mctl_present) { 7090 /* mctl_present *shouldn't* happen. */ 7091 ip_drop_packet(mp, B_TRUE, NULL, NULL, 7092 DROPPER(ipss, ipds_esp_nat_t_ipsec), 7093 &ipss->ipsec_dropper); 7094 return; 7095 } 7096 7097 if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) 7098 return; 7099 } 7100 7101 /* Handle options. 
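 * Map the conn's options to IPF_* flags for ip_add_info():
 * conn_recvif -> IPF_RECVIF, conn_ip_recvpktinfo -> IPF_RECVIF
 * (v6: index only) or IPF_RECVADDR (v4: index plus matching address),
 * and conn_recvslla -> IPF_RECVSLLA.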
*/ 7102 if (connp->conn_recvif) 7103 in_flags = IPF_RECVIF; 7104 /* 7105 * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag 7106 * passed to ip_add_info is based on IP version of connp. 7107 */ 7108 if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 7109 if (connp->conn_af_isv6) { 7110 /* 7111 * V6 only needs index 7112 */ 7113 in_flags |= IPF_RECVIF; 7114 } else { 7115 /* 7116 * V4 needs index + matching address. 7117 */ 7118 in_flags |= IPF_RECVADDR; 7119 } 7120 } 7121 7122 if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) 7123 in_flags |= IPF_RECVSLLA; 7124 7125 /* 7126 * Initiate IPPF processing here, if needed. Note first_mp won't be 7127 * freed if the packet is dropped. The caller will do so. 7128 */ 7129 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 7130 ill_index = recv_ill->ill_phyint->phyint_ifindex; 7131 ip_process(IPP_LOCAL_IN, &mp, ill_index); 7132 if (mp == NULL) { 7133 return; 7134 } 7135 } 7136 if ((in_flags != 0) && 7137 (mp->b_datap->db_type != M_CTL)) { 7138 /* 7139 * The actual data will be contained in b_cont 7140 * upon successful return of the following call 7141 * else original mblk is returned 7142 */ 7143 ASSERT(recv_ill != NULL); 7144 mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), 7145 ipst); 7146 } 7147 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 7148 /* Send it upstream */ 7149 (connp->conn_recv)(connp, mp, NULL); 7150 } 7151 7152 /* 7153 * Fanout for UDP packets. 7154 * The caller puts <fport, lport> in the ports parameter. 7155 * 7156 * If SO_REUSEADDR is set all multicast and broadcast packets 7157 * will be delivered to all streams bound to the same port. 7158 * 7159 * Zones notes: 7160 * Multicast and broadcast packets will be distributed to streams in all zones. 7161 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an 7162 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 7163 * packets. To maintain this behavior with multiple zones, the conns are grouped 7164 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in 7165 * each zone. If unset, all the following conns in the same zone are skipped. 
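 * For example, if zones A and B each have matching conns and only
 * zone A's first match has SO_REUSEADDR set, every matching conn in A
 * receives a copy of the packet but only the first matching conn in B
 * does.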
7166 */ 7167 static void 7168 ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 7169 uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, 7170 boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) 7171 { 7172 uint32_t dstport, srcport; 7173 ipaddr_t dst; 7174 mblk_t *first_mp; 7175 boolean_t secure; 7176 in6_addr_t v6src; 7177 conn_t *connp; 7178 connf_t *connfp; 7179 conn_t *first_connp; 7180 conn_t *next_connp; 7181 mblk_t *mp1, *first_mp1; 7182 ipaddr_t src; 7183 zoneid_t last_zoneid; 7184 boolean_t reuseaddr; 7185 boolean_t shared_addr; 7186 boolean_t unlabeled; 7187 ip_stack_t *ipst; 7188 7189 ASSERT(recv_ill != NULL); 7190 ipst = recv_ill->ill_ipst; 7191 7192 first_mp = mp; 7193 if (mctl_present) { 7194 mp = first_mp->b_cont; 7195 first_mp->b_cont = NULL; 7196 secure = ipsec_in_is_secure(first_mp); 7197 ASSERT(mp != NULL); 7198 } else { 7199 first_mp = NULL; 7200 secure = B_FALSE; 7201 } 7202 7203 /* Extract ports in net byte order */ 7204 dstport = htons(ntohl(ports) & 0xFFFF); 7205 srcport = htons(ntohl(ports) >> 16); 7206 dst = ipha->ipha_dst; 7207 src = ipha->ipha_src; 7208 7209 unlabeled = B_FALSE; 7210 if (is_system_labeled()) 7211 /* Cred cannot be null on IPv4 */ 7212 unlabeled = (msg_getlabel(mp)->tsl_flags & 7213 TSLF_UNLABELED) != 0; 7214 shared_addr = (zoneid == ALL_ZONES); 7215 if (shared_addr) { 7216 /* 7217 * No need to handle exclusive-stack zones since ALL_ZONES 7218 * only applies to the shared stack. 7219 */ 7220 zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); 7221 /* 7222 * If no shared MLP is found, tsol_mlp_findzone returns 7223 * ALL_ZONES. In that case, we assume it's SLP, and 7224 * search for the zone based on the packet label. 7225 * 7226 * If there is such a zone, we prefer to find a 7227 * connection in it. Otherwise, we look for a 7228 * MAC-exempt connection in any zone whose label 7229 * dominates the default label on the packet. 7230 */ 7231 if (zoneid == ALL_ZONES) 7232 zoneid = tsol_packet_to_zoneid(mp); 7233 else 7234 unlabeled = B_FALSE; 7235 } 7236 7237 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7238 mutex_enter(&connfp->connf_lock); 7239 connp = connfp->connf_head; 7240 if (!broadcast && !CLASSD(dst)) { 7241 /* 7242 * Not broadcast or multicast. Send to the one (first) 7243 * client we find. No need to check conn_wantpacket() 7244 * since IP_BOUND_IF/conn_incoming_ill does not apply to 7245 * IPv4 unicast packets. 7246 */ 7247 while ((connp != NULL) && 7248 (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) || 7249 (!IPCL_ZONE_MATCH(connp, zoneid) && 7250 !(unlabeled && connp->conn_mac_exempt && shared_addr)))) { 7251 /* 7252 * We keep searching since the conn did not match, 7253 * or its zone did not match and it is not either 7254 * an allzones conn or a mac exempt conn (if the 7255 * sender is unlabeled.) 
7256 */ 7257 connp = connp->conn_next; 7258 } 7259 7260 if (connp == NULL || 7261 !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) 7262 goto notfound; 7263 7264 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 7265 7266 if (is_system_labeled() && 7267 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7268 connp)) 7269 goto notfound; 7270 7271 CONN_INC_REF(connp); 7272 mutex_exit(&connfp->connf_lock); 7273 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7274 flags, recv_ill, ip_policy); 7275 IP_STAT(ipst, ip_udp_fannorm); 7276 CONN_DEC_REF(connp); 7277 return; 7278 } 7279 7280 /* 7281 * Broadcast and multicast case 7282 * 7283 * Need to check conn_wantpacket(). 7284 * If SO_REUSEADDR has been set on the first we send the 7285 * packet to all clients that have joined the group and 7286 * match the port. 7287 */ 7288 7289 while (connp != NULL) { 7290 if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && 7291 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7292 (!is_system_labeled() || 7293 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7294 connp))) 7295 break; 7296 connp = connp->conn_next; 7297 } 7298 7299 if (connp == NULL || 7300 !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) 7301 goto notfound; 7302 7303 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 7304 7305 first_connp = connp; 7306 /* 7307 * When SO_REUSEADDR is not set, send the packet only to the first 7308 * matching connection in its zone by keeping track of the zoneid. 7309 */ 7310 reuseaddr = first_connp->conn_reuseaddr; 7311 last_zoneid = first_connp->conn_zoneid; 7312 7313 CONN_INC_REF(connp); 7314 connp = connp->conn_next; 7315 for (;;) { 7316 while (connp != NULL) { 7317 if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && 7318 (reuseaddr || connp->conn_zoneid != last_zoneid) && 7319 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7320 (!is_system_labeled() || 7321 tsol_receive_local(mp, &dst, IPV4_VERSION, 7322 shared_addr, connp))) 7323 break; 7324 connp = connp->conn_next; 7325 } 7326 /* 7327 * Just copy the data part alone. The mctl part is 7328 * needed just for verifying policy and it is never 7329 * sent up. 7330 */ 7331 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7332 ((mp1 = copymsg(mp)) == NULL))) { 7333 /* 7334 * No more interested clients or memory 7335 * allocation failed 7336 */ 7337 connp = first_connp; 7338 break; 7339 } 7340 if (connp->conn_zoneid != last_zoneid) { 7341 /* 7342 * Update the zoneid so that the packet isn't sent to 7343 * any more conns in the same zone unless SO_REUSEADDR 7344 * is set. 7345 */ 7346 reuseaddr = connp->conn_reuseaddr; 7347 last_zoneid = connp->conn_zoneid; 7348 } 7349 if (first_mp != NULL) { 7350 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7351 ipsec_info_type == IPSEC_IN); 7352 first_mp1 = ipsec_in_tag(first_mp, NULL, 7353 ipst->ips_netstack); 7354 if (first_mp1 == NULL) { 7355 freemsg(mp1); 7356 connp = first_connp; 7357 break; 7358 } 7359 } else { 7360 first_mp1 = NULL; 7361 } 7362 CONN_INC_REF(connp); 7363 mutex_exit(&connfp->connf_lock); 7364 /* 7365 * IPQoS notes: We don't send the packet for policy 7366 * processing here, will do it for the last one (below). 7367 * i.e. we do it per-packet now, but if we do policy 7368 * processing per-conn, then we would need to do it 7369 * here too. 7370 */ 7371 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7372 ipha, flags, recv_ill, B_FALSE); 7373 mutex_enter(&connfp->connf_lock); 7374 /* Follow the next pointer before releasing the conn. 
*/ 7375 next_connp = connp->conn_next; 7376 IP_STAT(ipst, ip_udp_fanmb); 7377 CONN_DEC_REF(connp); 7378 connp = next_connp; 7379 } 7380 7381 /* Last one. Send it upstream. */ 7382 mutex_exit(&connfp->connf_lock); 7383 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7384 recv_ill, ip_policy); 7385 IP_STAT(ipst, ip_udp_fanmb); 7386 CONN_DEC_REF(connp); 7387 return; 7388 7389 notfound: 7390 7391 mutex_exit(&connfp->connf_lock); 7392 IP_STAT(ipst, ip_udp_fanothers); 7393 /* 7394 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses 7395 * have already been matched above, since they live in the IPv4 7396 * fanout tables. This implies we only need to 7397 * check for IPv6 in6addr_any endpoints here. 7398 * Thus we compare using ipv6_all_zeros instead of the destination 7399 * address, except for the multicast group membership lookup which 7400 * uses the IPv4 destination. 7401 */ 7402 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 7403 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7404 mutex_enter(&connfp->connf_lock); 7405 connp = connfp->connf_head; 7406 if (!broadcast && !CLASSD(dst)) { 7407 while (connp != NULL) { 7408 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7409 srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) && 7410 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7411 !connp->conn_ipv6_v6only) 7412 break; 7413 connp = connp->conn_next; 7414 } 7415 7416 if (connp != NULL && is_system_labeled() && 7417 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7418 connp)) 7419 connp = NULL; 7420 7421 if (connp == NULL || 7422 !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { 7423 /* 7424 * No one bound to this port. Is 7425 * there a client that wants all 7426 * unclaimed datagrams? 7427 */ 7428 mutex_exit(&connfp->connf_lock); 7429 7430 if (mctl_present) 7431 first_mp->b_cont = mp; 7432 else 7433 first_mp = mp; 7434 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP]. 7435 connf_head != NULL) { 7436 ip_fanout_proto(q, first_mp, ill, ipha, 7437 flags | IP_FF_RAWIP, mctl_present, 7438 ip_policy, recv_ill, zoneid); 7439 } else { 7440 if (ip_fanout_send_icmp(q, first_mp, flags, 7441 ICMP_DEST_UNREACHABLE, 7442 ICMP_PORT_UNREACHABLE, 7443 mctl_present, zoneid, ipst)) { 7444 BUMP_MIB(ill->ill_ip_mib, 7445 udpIfStatsNoPorts); 7446 } 7447 } 7448 return; 7449 } 7450 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 7451 7452 CONN_INC_REF(connp); 7453 mutex_exit(&connfp->connf_lock); 7454 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7455 flags, recv_ill, ip_policy); 7456 CONN_DEC_REF(connp); 7457 return; 7458 } 7459 /* 7460 * IPv4 multicast packet being delivered to an AF_INET6 7461 * in6addr_any endpoint. 7462 * Need to check conn_wantpacket(). Note that we use conn_wantpacket() 7463 * and not conn_wantpacket_v6() since any multicast membership is 7464 * for an IPv4-mapped multicast address. 7465 * The packet is sent to all clients in all zones that have joined the 7466 * group and match the port. 7467 */ 7468 while (connp != NULL) { 7469 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7470 srcport, v6src) && 7471 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7472 (!is_system_labeled() || 7473 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7474 connp))) 7475 break; 7476 connp = connp->conn_next; 7477 } 7478 7479 if (connp == NULL || 7480 !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { 7481 /* 7482 * No one bound to this port. 
Is 7483 * there a client that wants all 7484 * unclaimed datagrams? 7485 */ 7486 mutex_exit(&connfp->connf_lock); 7487 7488 if (mctl_present) 7489 first_mp->b_cont = mp; 7490 else 7491 first_mp = mp; 7492 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head != 7493 NULL) { 7494 ip_fanout_proto(q, first_mp, ill, ipha, 7495 flags | IP_FF_RAWIP, mctl_present, ip_policy, 7496 recv_ill, zoneid); 7497 } else { 7498 /* 7499 * We used to attempt to send an icmp error here, but 7500 * since this is known to be a multicast packet 7501 * and we don't send icmp errors in response to 7502 * multicast, just drop the packet and give up sooner. 7503 */ 7504 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); 7505 freemsg(first_mp); 7506 } 7507 return; 7508 } 7509 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 7510 7511 first_connp = connp; 7512 7513 CONN_INC_REF(connp); 7514 connp = connp->conn_next; 7515 for (;;) { 7516 while (connp != NULL) { 7517 if (IPCL_UDP_MATCH_V6(connp, dstport, 7518 ipv6_all_zeros, srcport, v6src) && 7519 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7520 (!is_system_labeled() || 7521 tsol_receive_local(mp, &dst, IPV4_VERSION, 7522 shared_addr, connp))) 7523 break; 7524 connp = connp->conn_next; 7525 } 7526 /* 7527 * Just copy the data part alone. The mctl part is 7528 * needed just for verifying policy and it is never 7529 * sent up. 7530 */ 7531 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7532 ((mp1 = copymsg(mp)) == NULL))) { 7533 /* 7534 * No more interested clients or memory 7535 * allocation failed 7536 */ 7537 connp = first_connp; 7538 break; 7539 } 7540 if (first_mp != NULL) { 7541 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7542 ipsec_info_type == IPSEC_IN); 7543 first_mp1 = ipsec_in_tag(first_mp, NULL, 7544 ipst->ips_netstack); 7545 if (first_mp1 == NULL) { 7546 freemsg(mp1); 7547 connp = first_connp; 7548 break; 7549 } 7550 } else { 7551 first_mp1 = NULL; 7552 } 7553 CONN_INC_REF(connp); 7554 mutex_exit(&connfp->connf_lock); 7555 /* 7556 * IPQoS notes: We don't send the packet for policy 7557 * processing here, will do it for the last one (below). 7558 * i.e. we do it per-packet now, but if we do policy 7559 * processing per-conn, then we would need to do it 7560 * here too. 7561 */ 7562 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7563 ipha, flags, recv_ill, B_FALSE); 7564 mutex_enter(&connfp->connf_lock); 7565 /* Follow the next pointer before releasing the conn. */ 7566 next_connp = connp->conn_next; 7567 CONN_DEC_REF(connp); 7568 connp = next_connp; 7569 } 7570 7571 /* Last one. Send it upstream. */ 7572 mutex_exit(&connfp->connf_lock); 7573 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7574 recv_ill, ip_policy); 7575 CONN_DEC_REF(connp); 7576 } 7577 7578 /* 7579 * Complete the ip_wput header so that it 7580 * is possible to generate ICMP 7581 * errors.
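 * A source address is picked from a local IRE if ipha_src is unset,
 * the TTL is reset to the stack default, and the header checksum is
 * recomputed; the checksum field is zeroed first since it is itself
 * covered by ip_csum_hdr().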
7582 */ 7583 int 7584 ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) 7585 { 7586 ire_t *ire; 7587 7588 if (ipha->ipha_src == INADDR_ANY) { 7589 ire = ire_lookup_local(zoneid, ipst); 7590 if (ire == NULL) { 7591 ip1dbg(("ip_hdr_complete: no source IRE\n")); 7592 return (1); 7593 } 7594 ipha->ipha_src = ire->ire_addr; 7595 ire_refrele(ire); 7596 } 7597 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 7598 ipha->ipha_hdr_checksum = 0; 7599 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 7600 return (0); 7601 } 7602 7603 /* 7604 * Nobody should be sending 7605 * packets up this stream 7606 */ 7607 static void 7608 ip_lrput(queue_t *q, mblk_t *mp) 7609 { 7610 mblk_t *mp1; 7611 7612 switch (mp->b_datap->db_type) { 7613 case M_FLUSH: 7614 /* Turn around */ 7615 if (*mp->b_rptr & FLUSHW) { 7616 *mp->b_rptr &= ~FLUSHR; 7617 qreply(q, mp); 7618 return; 7619 } 7620 break; 7621 } 7622 /* Could receive messages that passed through ar_rput */ 7623 for (mp1 = mp; mp1; mp1 = mp1->b_cont) 7624 mp1->b_prev = mp1->b_next = NULL; 7625 freemsg(mp); 7626 } 7627 7628 /* Nobody should be sending packets down this stream */ 7629 /* ARGSUSED */ 7630 void 7631 ip_lwput(queue_t *q, mblk_t *mp) 7632 { 7633 freemsg(mp); 7634 } 7635 7636 /* 7637 * Move the first hop in any source route to ipha_dst and remove that part of 7638 * the source route. Called by other protocols. Errors in option formatting 7639 * are ignored - will be handled by ip_wput_options Return the final 7640 * destination (either ipha_dst or the last entry in a source route.) 7641 */ 7642 ipaddr_t 7643 ip_massage_options(ipha_t *ipha, netstack_t *ns) 7644 { 7645 ipoptp_t opts; 7646 uchar_t *opt; 7647 uint8_t optval; 7648 uint8_t optlen; 7649 ipaddr_t dst; 7650 int i; 7651 ire_t *ire; 7652 ip_stack_t *ipst = ns->netstack_ip; 7653 7654 ip2dbg(("ip_massage_options\n")); 7655 dst = ipha->ipha_dst; 7656 for (optval = ipoptp_first(&opts, ipha); 7657 optval != IPOPT_EOL; 7658 optval = ipoptp_next(&opts)) { 7659 opt = opts.ipoptp_cur; 7660 switch (optval) { 7661 uint8_t off; 7662 case IPOPT_SSRR: 7663 case IPOPT_LSRR: 7664 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 7665 ip1dbg(("ip_massage_options: bad src route\n")); 7666 break; 7667 } 7668 optlen = opts.ipoptp_len; 7669 off = opt[IPOPT_OFFSET]; 7670 off--; 7671 redo_srr: 7672 if (optlen < IP_ADDR_LEN || 7673 off > optlen - IP_ADDR_LEN) { 7674 /* End of source route */ 7675 ip1dbg(("ip_massage_options: end of SR\n")); 7676 break; 7677 } 7678 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 7679 ip1dbg(("ip_massage_options: next hop 0x%x\n", 7680 ntohl(dst))); 7681 /* 7682 * Check if our address is present more than 7683 * once as consecutive hops in source route. 7684 * XXX verify per-interface ip_forwarding 7685 * for source route? 7686 */ 7687 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 7688 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7689 if (ire != NULL) { 7690 ire_refrele(ire); 7691 off += IP_ADDR_LEN; 7692 goto redo_srr; 7693 } 7694 if (dst == htonl(INADDR_LOOPBACK)) { 7695 ip1dbg(("ip_massage_options: loopback addr in " 7696 "source route!\n")); 7697 break; 7698 } 7699 /* 7700 * Update ipha_dst to be the first hop and remove the 7701 * first hop from the source route (by overwriting 7702 * part of the option with NOP options). 
7703 */ 7704 ipha->ipha_dst = dst; 7705 /* Put the last entry in dst */ 7706 off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) + 7707 3; 7708 bcopy(&opt[off], &dst, IP_ADDR_LEN); 7709 7710 ip1dbg(("ip_massage_options: last hop 0x%x\n", 7711 ntohl(dst))); 7712 /* Move down and overwrite */ 7713 opt[IP_ADDR_LEN] = opt[0]; 7714 opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN; 7715 opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET]; 7716 for (i = 0; i < IP_ADDR_LEN; i++) 7717 opt[i] = IPOPT_NOP; 7718 break; 7719 } 7720 } 7721 return (dst); 7722 } 7723 7724 /* 7725 * Return the network mask 7726 * associated with the specified address. 7727 */ 7728 ipaddr_t 7729 ip_net_mask(ipaddr_t addr) 7730 { 7731 uchar_t *up = (uchar_t *)&addr; 7732 ipaddr_t mask = 0; 7733 uchar_t *maskp = (uchar_t *)&mask; 7734 7735 #if defined(__i386) || defined(__amd64) 7736 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER 7737 #endif 7738 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER 7739 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0; 7740 #endif 7741 if (CLASSD(addr)) { 7742 maskp[0] = 0xF0; 7743 return (mask); 7744 } 7745 7746 /* We assume Class E default netmask to be 32 */ 7747 if (CLASSE(addr)) 7748 return (0xffffffffU); 7749 7750 if (addr == 0) 7751 return (0); 7752 maskp[0] = 0xFF; 7753 if ((up[0] & 0x80) == 0) 7754 return (mask); 7755 7756 maskp[1] = 0xFF; 7757 if ((up[0] & 0xC0) == 0x80) 7758 return (mask); 7759 7760 maskp[2] = 0xFF; 7761 if ((up[0] & 0xE0) == 0xC0) 7762 return (mask); 7763 7764 /* Otherwise return no mask */ 7765 return ((ipaddr_t)0); 7766 } 7767 7768 /* 7769 * Helper ill lookup function used by IPsec. 7770 */ 7771 ill_t * 7772 ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) 7773 { 7774 ill_t *ret_ill; 7775 7776 ASSERT(ifindex != 0); 7777 7778 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 7779 ipst); 7780 if (ret_ill == NULL) { 7781 if (isv6) { 7782 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 7783 ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", 7784 ifindex)); 7785 } else { 7786 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 7787 ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", 7788 ifindex)); 7789 } 7790 freemsg(first_mp); 7791 return (NULL); 7792 } 7793 return (ret_ill); 7794 } 7795 7796 /* 7797 * IPv4 - 7798 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 7799 * out a packet to a destination address for which we do not have specific 7800 * (or sufficient) routing information. 7801 * 7802 * NOTE : These are the scopes of some of the variables that point at IRE, 7803 * which needs to be followed while making any future modifications 7804 * to avoid memory leaks. 7805 * 7806 * - ire and sire are the entries looked up initially by 7807 * ire_ftable_lookup. 7808 * - ipif_ire is used to hold the interface ire associated with 7809 * the new cache ire. But it's scope is limited, so we always REFRELE 7810 * it before branching out to error paths. 7811 * - save_ire is initialized before ire_create, so that ire returned 7812 * by ire_create will not over-write the ire. We REFRELE save_ire 7813 * before breaking out of the switch. 7814 * 7815 * Thus on failures, we have to REFRELE only ire and sire, if they 7816 * are not NULL. 
7817 */ 7818 void 7819 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, 7820 zoneid_t zoneid, ip_stack_t *ipst) 7821 { 7822 areq_t *areq; 7823 ipaddr_t gw = 0; 7824 ire_t *ire = NULL; 7825 mblk_t *res_mp; 7826 ipaddr_t *addrp; 7827 ipaddr_t nexthop_addr; 7828 ipif_t *src_ipif = NULL; 7829 ill_t *dst_ill = NULL; 7830 ipha_t *ipha; 7831 ire_t *sire = NULL; 7832 mblk_t *first_mp; 7833 ire_t *save_ire; 7834 ushort_t ire_marks = 0; 7835 boolean_t mctl_present; 7836 ipsec_out_t *io; 7837 mblk_t *saved_mp; 7838 mblk_t *copy_mp = NULL; 7839 mblk_t *xmit_mp = NULL; 7840 ipaddr_t save_dst; 7841 uint32_t multirt_flags = 7842 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 7843 boolean_t multirt_is_resolvable; 7844 boolean_t multirt_resolve_next; 7845 boolean_t unspec_src; 7846 boolean_t ip_nexthop = B_FALSE; 7847 tsol_ire_gw_secattr_t *attrp = NULL; 7848 tsol_gcgrp_t *gcgrp = NULL; 7849 tsol_gcgrp_addr_t ga; 7850 int multirt_res_failures = 0; 7851 int multirt_res_attempts = 0; 7852 int multirt_already_resolved = 0; 7853 boolean_t multirt_no_icmp_error = B_FALSE; 7854 7855 if (ip_debug > 2) { 7856 /* ip1dbg */ 7857 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7858 } 7859 7860 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7861 if (mctl_present) { 7862 io = (ipsec_out_t *)first_mp->b_rptr; 7863 ASSERT(io->ipsec_out_type == IPSEC_OUT); 7864 ASSERT(zoneid == io->ipsec_out_zoneid); 7865 ASSERT(zoneid != ALL_ZONES); 7866 } 7867 7868 ipha = (ipha_t *)mp->b_rptr; 7869 7870 /* All multicast lookups come through ip_newroute_ipif() */ 7871 if (CLASSD(dst)) { 7872 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7873 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7874 freemsg(first_mp); 7875 return; 7876 } 7877 7878 if (mctl_present && io->ipsec_out_ip_nexthop) { 7879 ip_nexthop = B_TRUE; 7880 nexthop_addr = io->ipsec_out_nexthop_addr; 7881 } 7882 /* 7883 * If this IRE is created for forwarding or it is not for 7884 * traffic for congestion controlled protocols, mark it as temporary. 7885 */ 7886 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7887 ire_marks |= IRE_MARK_TEMPORARY; 7888 7889 /* 7890 * Get what we can from ire_ftable_lookup which will follow an IRE 7891 * chain until it gets the most specific information available. 7892 * For example, we know that there is no IRE_CACHE for this dest, 7893 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7894 * ire_ftable_lookup will look up the gateway, etc. 7895 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7896 * to the destination, of equal netmask length in the forward table, 7897 * will be recursively explored. If no information is available 7898 * for the final gateway of that route, we force the returned ire 7899 * to be equal to sire using MATCH_IRE_PARENT. 7900 * At least, in this case we have a starting point (in the buckets) 7901 * to look for other routes to the destination in the forward table. 7902 * This is actually used only for multirouting, where a list 7903 * of routes has to be processed in sequence. 7904 * 7905 * In the process of coming up with the most specific information, 7906 * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry 7907 * for the gateway (i.e., one for which the ire_nce->nce_state is 7908 * not yet ND_REACHABLE, and is in the middle of arp resolution). 
7909 * Two caveats when handling incomplete ire's in ip_newroute: 7910 * - we should be careful when accessing its ire_nce (specifically 7911 * the nce_res_mp) as it might change underneath our feet, and, 7912 * - not all legacy code path callers are prepared to handle 7913 * incomplete ire's, so we should not create/add incomplete 7914 * ire_cache entries here. (See discussion about temporary solution 7915 * further below). 7916 * 7917 * In order to minimize packet dropping, and to preserve existing 7918 * behavior, we treat this case as if there were no IRE_CACHE for the 7919 * gateway, and instead use the IF_RESOLVER ire to send out 7920 * another request to ARP (this is achieved by passing the 7921 * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the 7922 * arp response comes back in ip_wput_nondata, we will create 7923 * a per-dst ire_cache that has an ND_COMPLETE ire. 7924 * 7925 * Note that this is a temporary solution; the correct solution is 7926 * to create an incomplete per-dst ire_cache entry, and send the 7927 * packet out when the gw's nce is resolved. In order to achieve this, 7928 * all packet processing must have been completed prior to calling 7929 * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need 7930 * to be modified to accommodate this solution. 7931 */ 7932 if (ip_nexthop) { 7933 /* 7934 * The first time we come here, we look for an IRE_INTERFACE 7935 * entry for the specified nexthop, set the dst to be the 7936 * nexthop address and create an IRE_CACHE entry for the 7937 * nexthop. The next time around, we are able to find an 7938 * IRE_CACHE entry for the nexthop, set the gateway to be the 7939 * nexthop address and create an IRE_CACHE entry for the 7940 * destination address via the specified nexthop. 7941 */ 7942 ire = ire_cache_lookup(nexthop_addr, zoneid, 7943 msg_getlabel(mp), ipst); 7944 if (ire != NULL) { 7945 gw = nexthop_addr; 7946 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7947 } else { 7948 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7949 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7950 msg_getlabel(mp), 7951 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 7952 ipst); 7953 if (ire != NULL) { 7954 dst = nexthop_addr; 7955 } 7956 } 7957 } else { 7958 ire = ire_ftable_lookup(dst, 0, 0, 0, 7959 NULL, &sire, zoneid, 0, msg_getlabel(mp), 7960 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7961 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7962 MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, 7963 ipst); 7964 } 7965 7966 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7967 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7968 7969 /* 7970 * This loop is run only once in most cases. 7971 * We loop to resolve further routes only when the destination 7972 * can be reached through multiple RTF_MULTIRT-flagged ires. 7973 */ 7974 do { 7975 /* Clear the previous iteration's values */ 7976 if (src_ipif != NULL) { 7977 ipif_refrele(src_ipif); 7978 src_ipif = NULL; 7979 } 7980 if (dst_ill != NULL) { 7981 ill_refrele(dst_ill); 7982 dst_ill = NULL; 7983 } 7984 7985 multirt_resolve_next = B_FALSE; 7986 /* 7987 * We check if packets have to be multirouted. 7988 * In this case, given the current <ire, sire> couple, 7989 * we look for the next suitable <ire, sire>. 7990 * This check is done in ire_multirt_lookup(), 7991 * which applies various criteria to find the next route 7992 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7993 * unchanged if it detects it has not been tried yet.
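 * (This is the CGTP multirouting emission loop: each RTF_MULTIRT
 * route to the destination is resolved in turn so that a copy of the
 * packet can be sent over every reachable path.)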
7994 */ 7995 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7996 ip3dbg(("ip_newroute: starting next_resolution " 7997 "with first_mp %p, tag %d\n", 7998 (void *)first_mp, 7999 MULTIRT_DEBUG_TAGGED(first_mp))); 8000 8001 ASSERT(sire != NULL); 8002 multirt_is_resolvable = 8003 ire_multirt_lookup(&ire, &sire, multirt_flags, 8004 &multirt_already_resolved, msg_getlabel(mp), ipst); 8005 8006 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 8007 "multirt_already_resolved %d, " 8008 "multirt_res_attempts %d, multirt_res_failures %d, " 8009 "ire %p, sire %p\n", multirt_is_resolvable, 8010 multirt_already_resolved, multirt_res_attempts, 8011 multirt_res_failures, (void *)ire, (void *)sire)); 8012 8013 if (!multirt_is_resolvable) { 8014 /* 8015 * No more multirt route to resolve; give up 8016 * (all routes resolved or no more 8017 * resolvable routes). 8018 */ 8019 if (ire != NULL) { 8020 ire_refrele(ire); 8021 ire = NULL; 8022 } 8023 /* 8024 * Generate ICMP error only if all attempts to 8025 * resolve multirt route failed and there is no 8026 * already resolved one. Don't generate ICMP 8027 * error when: 8028 * 8029 * 1) there was no attempt to resolve 8030 * 2) at least one attempt passed 8031 * 3) a multirt route is already resolved 8032 * 8033 * Case 1) may occur due to multiple 8034 * resolution attempts during single 8035 * ip_multirt_resolution_interval. 8036 * 8037 * Case 2-3) means that CGTP destination is 8038 * reachable via one link so we don't want to 8039 * generate ICMP host unreachable error. 8040 */ 8041 if (multirt_res_attempts == 0 || 8042 multirt_res_failures < 8043 multirt_res_attempts || 8044 multirt_already_resolved > 0) 8045 multirt_no_icmp_error = B_TRUE; 8046 } else { 8047 ASSERT(sire != NULL); 8048 ASSERT(ire != NULL); 8049 8050 multirt_res_attempts++; 8051 } 8052 } 8053 8054 if (ire == NULL) { 8055 if (ip_debug > 3) { 8056 /* ip2dbg */ 8057 pr_addr_dbg("ip_newroute: " 8058 "can't resolve %s\n", AF_INET, &dst); 8059 } 8060 ip3dbg(("ip_newroute: " 8061 "ire %p, sire %p, multirt_no_icmp_error %d\n", 8062 (void *)ire, (void *)sire, 8063 (int)multirt_no_icmp_error)); 8064 8065 if (sire != NULL) { 8066 ire_refrele(sire); 8067 sire = NULL; 8068 } 8069 8070 if (multirt_no_icmp_error) { 8071 /* There is no need to report an ICMP error. */ 8072 MULTIRT_DEBUG_UNTAG(first_mp); 8073 freemsg(first_mp); 8074 return; 8075 } 8076 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 8077 RTA_DST, ipst); 8078 goto icmp_err_ret; 8079 } 8080 8081 /* 8082 * Verify that the returned IRE does not have either 8083 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 8084 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 8085 */ 8086 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 8087 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 8088 goto icmp_err_ret; 8089 } 8090 /* 8091 * Increment the ire_ob_pkt_count field for ire if it is an 8092 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 8093 * increment the same for the parent IRE, sire, if it is some 8094 * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) 8095 */ 8096 if ((ire->ire_type & IRE_INTERFACE) != 0) { 8097 UPDATE_OB_PKT_COUNT(ire); 8098 ire->ire_last_used_time = lbolt; 8099 } 8100 8101 if (sire != NULL) { 8102 gw = sire->ire_gateway_addr; 8103 ASSERT((sire->ire_type & (IRE_CACHETABLE | 8104 IRE_INTERFACE)) == 0); 8105 UPDATE_OB_PKT_COUNT(sire); 8106 sire->ire_last_used_time = lbolt; 8107 } 8108 /* 8109 * We have a route to reach the destination. 
Find the 8110 * appropriate ill, then get a source address using 8111 * ipif_select_source(). 8112 * 8113 * If we are here trying to create an IRE_CACHE for an offlink 8114 * destination and have an IRE_CACHE entry for VNI, then use 8115 * ire_stq instead since VNI's queue is a black hole. 8116 */ 8117 if ((ire->ire_type == IRE_CACHE) && 8118 IS_VNI(ire->ire_ipif->ipif_ill)) { 8119 dst_ill = ire->ire_stq->q_ptr; 8120 ill_refhold(dst_ill); 8121 } else { 8122 ill_t *ill = ire->ire_ipif->ipif_ill; 8123 8124 if (IS_IPMP(ill)) { 8125 dst_ill = 8126 ipmp_illgrp_hold_next_ill(ill->ill_grp); 8127 } else { 8128 dst_ill = ill; 8129 ill_refhold(dst_ill); 8130 } 8131 } 8132 8133 if (dst_ill == NULL) { 8134 if (ip_debug > 2) { 8135 pr_addr_dbg("ip_newroute: no dst " 8136 "ill for dst %s\n", AF_INET, &dst); 8137 } 8138 goto icmp_err_ret; 8139 } 8140 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 8141 8142 /* 8143 * Pick the best source address from dst_ill. 8144 * 8145 * 1) Try to pick the source address from the destination 8146 * route. Clustering assumes that when we have multiple 8147 * prefixes hosted on an interface, the prefix of the 8148 * source address matches the prefix of the destination 8149 * route. We do this only if the address is not 8150 * DEPRECATED. 8151 * 8152 * 2) If the conn is in a different zone than the ire, we 8153 * need to pick a source address from the right zone. 8154 */ 8155 ASSERT(src_ipif == NULL); 8156 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 8157 /* 8158 * The RTF_SETSRC flag is set in the parent ire (sire). 8159 * Check that the ipif matching the requested source 8160 * address still exists. 8161 */ 8162 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 8163 zoneid, NULL, NULL, NULL, NULL, ipst); 8164 } 8165 8166 unspec_src = (connp != NULL && connp->conn_unspec_src); 8167 8168 if (src_ipif == NULL && 8169 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 8170 ire_marks |= IRE_MARK_USESRC_CHECK; 8171 if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && 8172 IS_IPMP(ire->ire_ipif->ipif_ill) || 8173 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 8174 (connp != NULL && ire->ire_zoneid != zoneid && 8175 ire->ire_zoneid != ALL_ZONES) || 8176 (dst_ill->ill_usesrc_ifindex != 0)) { 8177 /* 8178 * If the destination is reachable via a 8179 * given gateway, the selected source address 8180 * should be in the same subnet as the gateway. 8181 * Otherwise, the destination is not reachable. 8182 * 8183 * If there are no interfaces on the same subnet 8184 * as the destination, ipif_select_source gives 8185 * first non-deprecated interface which might be 8186 * on a different subnet than the gateway. 8187 * This is not desirable. Hence pass the dst_ire 8188 * source address to ipif_select_source. 8189 * It is sure that the destination is reachable 8190 * with the dst_ire source address subnet. 8191 * So passing dst_ire source address to 8192 * ipif_select_source will make sure that the 8193 * selected source will be on the same subnet 8194 * as dst_ire source address. 8195 */ 8196 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 8197 8198 src_ipif = ipif_select_source(dst_ill, saddr, 8199 zoneid); 8200 if (src_ipif == NULL) { 8201 /* 8202 * In the case of multirouting, it may 8203 * happen that ipif_select_source fails 8204 * as DAD may disallow use of the 8205 * particular source interface. Anyway, 8206 * we need to continue and attempt to 8207 * resolve other multirt routes. 
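 * Such failures are counted in multirt_res_failures; an ICMP error
 * is generated only once every resolution attempt for the destination
 * has failed (see the multirt_no_icmp_error logic above).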
8208 */ 8209 if ((sire != NULL) && 8210 (sire->ire_flags & RTF_MULTIRT)) { 8211 ire_refrele(ire); 8212 ire = NULL; 8213 multirt_resolve_next = B_TRUE; 8214 multirt_res_failures++; 8215 continue; 8216 } 8217 8218 if (ip_debug > 2) { 8219 pr_addr_dbg("ip_newroute: " 8220 "no src for dst %s ", 8221 AF_INET, &dst); 8222 printf("on interface %s\n", 8223 dst_ill->ill_name); 8224 } 8225 goto icmp_err_ret; 8226 } 8227 } else { 8228 src_ipif = ire->ire_ipif; 8229 ASSERT(src_ipif != NULL); 8230 /* hold src_ipif for uniformity */ 8231 ipif_refhold(src_ipif); 8232 } 8233 } 8234 8235 /* 8236 * Assign a source address while we have the conn. 8237 * We can't have ip_wput_ire pick a source address when the 8238 * packet returns from arp since we need to look at 8239 * conn_unspec_src and conn_zoneid, and we lose the conn when 8240 * going through arp. 8241 * 8242 * NOTE : ip_newroute_v6 does not have this piece of code as 8243 * it uses ip6i to store this information. 8244 */ 8245 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 8246 ipha->ipha_src = src_ipif->ipif_src_addr; 8247 8248 if (ip_debug > 3) { 8249 /* ip2dbg */ 8250 pr_addr_dbg("ip_newroute: first hop %s\n", 8251 AF_INET, &gw); 8252 } 8253 ip2dbg(("\tire type %s (%d)\n", 8254 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 8255 8256 /* 8257 * The TTL of multirouted packets is bounded by the 8258 * ip_multirt_ttl ndd variable. 8259 */ 8260 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8261 /* Force TTL of multirouted packets */ 8262 if ((ipst->ips_ip_multirt_ttl > 0) && 8263 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 8264 ip2dbg(("ip_newroute: forcing multirt TTL " 8265 "to %d (was %d), dst 0x%08x\n", 8266 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 8267 ntohl(sire->ire_addr))); 8268 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 8269 } 8270 } 8271 /* 8272 * At this point in ip_newroute(), ire is either the 8273 * IRE_CACHE of the next-hop gateway for an off-subnet 8274 * destination or an IRE_INTERFACE type that should be used 8275 * to resolve an on-subnet destination or an on-subnet 8276 * next-hop gateway. 8277 * 8278 * In the IRE_CACHE case, we have the following : 8279 * 8280 * 1) src_ipif - used for getting a source address. 8281 * 8282 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8283 * means packets using this IRE_CACHE will go out on 8284 * dst_ill. 8285 * 8286 * 3) The IRE sire will point to the prefix that is the 8287 * longest matching route for the destination. These 8288 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. 8289 * 8290 * The newly created IRE_CACHE entry for the off-subnet 8291 * destination is tied to both the prefix route and the 8292 * interface route used to resolve the next-hop gateway 8293 * via the ire_phandle and ire_ihandle fields, 8294 * respectively. 8295 * 8296 * In the IRE_INTERFACE case, we have the following : 8297 * 8298 * 1) src_ipif - used for getting a source address. 8299 * 8300 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8301 * means packets using the IRE_CACHE that we will build 8302 * here will go out on dst_ill. 8303 * 8304 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 8305 * to be created will only be tied to the IRE_INTERFACE 8306 * that was derived from the ire_ihandle field. 8307 * 8308 * If sire is non-NULL, it means the destination is 8309 * off-link and we will first create the IRE_CACHE for the 8310 * gateway. 
Next time through ip_newroute, we will create 8311 * the IRE_CACHE for the final destination as described 8312 * above. 8313 * 8314 * In both cases, after the current resolution has been 8315 * completed (or possibly initialised, in the IRE_INTERFACE 8316 * case), the loop may be re-entered to attempt the resolution 8317 * of another RTF_MULTIRT route. 8318 * 8319 * When an IRE_CACHE entry for the off-subnet destination is 8320 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 8321 * for further processing in emission loops. 8322 */ 8323 save_ire = ire; 8324 switch (ire->ire_type) { 8325 case IRE_CACHE: { 8326 ire_t *ipif_ire; 8327 8328 ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); 8329 if (gw == 0) 8330 gw = ire->ire_gateway_addr; 8331 /* 8332 * We need 3 ire's to create a new cache ire for an 8333 * off-link destination from the cache ire of the 8334 * gateway. 8335 * 8336 * 1. The prefix ire 'sire' (Note that this does 8337 * not apply to the conn_nexthop_set case) 8338 * 2. The cache ire of the gateway 'ire' 8339 * 3. The interface ire 'ipif_ire' 8340 * 8341 * We have (1) and (2). We lookup (3) below. 8342 * 8343 * If there is no interface route to the gateway, 8344 * it is a race condition, where we found the cache 8345 * but the interface route has been deleted. 8346 */ 8347 if (ip_nexthop) { 8348 ipif_ire = ire_ihandle_lookup_onlink(ire); 8349 } else { 8350 ipif_ire = 8351 ire_ihandle_lookup_offlink(ire, sire); 8352 } 8353 if (ipif_ire == NULL) { 8354 ip1dbg(("ip_newroute: " 8355 "ire_ihandle_lookup_offlink failed\n")); 8356 goto icmp_err_ret; 8357 } 8358 8359 /* 8360 * Check cached gateway IRE for any security 8361 * attributes; if found, associate the gateway 8362 * credentials group to the destination IRE. 8363 */ 8364 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 8365 mutex_enter(&attrp->igsa_lock); 8366 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 8367 GCGRP_REFHOLD(gcgrp); 8368 mutex_exit(&attrp->igsa_lock); 8369 } 8370 8371 /* 8372 * XXX For the source of the resolver mp, 8373 * we are using the same DL_UNITDATA_REQ 8374 * (from save_ire->ire_nce->nce_res_mp) 8375 * though the save_ire is not pointing at the same ill. 8376 * This is incorrect. We need to send it up to the 8377 * resolver to get the right res_mp. For ethernets 8378 * this may be okay (ill_type == DL_ETHER). 8379 */ 8380 8381 ire = ire_create( 8382 (uchar_t *)&dst, /* dest address */ 8383 (uchar_t *)&ip_g_all_ones, /* mask */ 8384 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8385 (uchar_t *)&gw, /* gateway address */ 8386 &save_ire->ire_max_frag, 8387 save_ire->ire_nce, /* src nce */ 8388 dst_ill->ill_rq, /* recv-from queue */ 8389 dst_ill->ill_wq, /* send-to queue */ 8390 IRE_CACHE, /* IRE type */ 8391 src_ipif, 8392 (sire != NULL) ? 8393 sire->ire_mask : 0, /* Parent mask */ 8394 (sire != NULL) ? 8395 sire->ire_phandle : 0, /* Parent handle */ 8396 ipif_ire->ire_ihandle, /* Interface handle */ 8397 (sire != NULL) ? (sire->ire_flags & 8398 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 8399 (sire != NULL) ? 8400 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 8401 NULL, 8402 gcgrp, 8403 ipst); 8404 8405 if (ire == NULL) { 8406 if (gcgrp != NULL) { 8407 GCGRP_REFRELE(gcgrp); 8408 gcgrp = NULL; 8409 } 8410 ire_refrele(ipif_ire); 8411 ire_refrele(save_ire); 8412 break; 8413 } 8414 8415 /* reference now held by IRE */ 8416 gcgrp = NULL; 8417 8418 ire->ire_marks |= ire_marks; 8419 8420 /* 8421 * Prevent sire and ipif_ire from getting deleted. 
8422 * The newly created ire is tied to both of them via 8423 * the phandle and ihandle respectively. 8424 */ 8425 if (sire != NULL) { 8426 IRB_REFHOLD(sire->ire_bucket); 8427 /* Has it been removed already? */ 8428 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 8429 IRB_REFRELE(sire->ire_bucket); 8430 ire_refrele(ipif_ire); 8431 ire_refrele(save_ire); 8432 break; 8433 } 8434 } 8435 8436 IRB_REFHOLD(ipif_ire->ire_bucket); 8437 /* Has it been removed already? */ 8438 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 8439 IRB_REFRELE(ipif_ire->ire_bucket); 8440 if (sire != NULL) 8441 IRB_REFRELE(sire->ire_bucket); 8442 ire_refrele(ipif_ire); 8443 ire_refrele(save_ire); 8444 break; 8445 } 8446 8447 xmit_mp = first_mp; 8448 /* 8449 * In the case of multirouting, a copy 8450 * of the packet is made before it is sent. 8451 * The copy is used to attempt another 8452 * route resolution in the next loop. 8453 */ 8454 if (ire->ire_flags & RTF_MULTIRT) { 8455 copy_mp = copymsg(first_mp); 8456 if (copy_mp != NULL) { 8457 xmit_mp = copy_mp; 8458 MULTIRT_DEBUG_TAG(first_mp); 8459 } 8460 } 8461 8462 ire_add_then_send(q, ire, xmit_mp); 8463 ire_refrele(save_ire); 8464 8465 /* Assert that sire is not deleted yet. */ 8466 if (sire != NULL) { 8467 ASSERT(sire->ire_ptpn != NULL); 8468 IRB_REFRELE(sire->ire_bucket); 8469 } 8470 8471 /* Assert that ipif_ire is not deleted yet. */ 8472 ASSERT(ipif_ire->ire_ptpn != NULL); 8473 IRB_REFRELE(ipif_ire->ire_bucket); 8474 ire_refrele(ipif_ire); 8475 8476 /* 8477 * If copy_mp is not NULL, multirouting was 8478 * requested. We loop to initiate the next 8479 * route resolution attempt, starting from sire. 8480 */ 8481 if (copy_mp != NULL) { 8482 /* 8483 * Search for the next unresolved 8484 * multirt route. 8485 */ 8486 copy_mp = NULL; 8487 ipif_ire = NULL; 8488 ire = NULL; 8489 multirt_resolve_next = B_TRUE; 8490 continue; 8491 } 8492 if (sire != NULL) 8493 ire_refrele(sire); 8494 ipif_refrele(src_ipif); 8495 ill_refrele(dst_ill); 8496 return; 8497 } 8498 case IRE_IF_NORESOLVER: { 8499 if (dst_ill->ill_resolver_mp == NULL) { 8500 ip1dbg(("ip_newroute: dst_ill %p " 8501 "for IRE_IF_NORESOLVER ire %p has " 8502 "no ill_resolver_mp\n", 8503 (void *)dst_ill, (void *)ire)); 8504 break; 8505 } 8506 8507 /* 8508 * TSol note: We are creating the ire cache for the 8509 * destination 'dst'. If 'dst' is offlink, going 8510 * through the first hop 'gw', the security attributes 8511 * of 'dst' must be set to point to the gateway 8512 * credentials of gateway 'gw'. If 'dst' is onlink, it 8513 * is possible that 'dst' is a potential gateway that is 8514 * referenced by some route that has some security 8515 * attributes. Thus in the former case, we need to do a 8516 * gcgrp_lookup of 'gw' while in the latter case we 8517 * need to do gcgrp_lookup of 'dst' itself. 8518 */ 8519 ga.ga_af = AF_INET; 8520 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? gw : dst, 8521 &ga.ga_addr); 8522 gcgrp = gcgrp_lookup(&ga, B_FALSE); 8523 8524 ire = ire_create( 8525 (uchar_t *)&dst, /* dest address */ 8526 (uchar_t *)&ip_g_all_ones, /* mask */ 8527 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8528 (uchar_t *)&gw, /* gateway address */ 8529 &save_ire->ire_max_frag, 8530 NULL, /* no src nce */ 8531 dst_ill->ill_rq, /* recv-from queue */ 8532 dst_ill->ill_wq, /* send-to queue */ 8533 IRE_CACHE, 8534 src_ipif, 8535 save_ire->ire_mask, /* Parent mask */ 8536 (sire != NULL) ? /* Parent handle */ 8537 sire->ire_phandle : 0, 8538 save_ire->ire_ihandle, /* Interface handle */ 8539 (sire != NULL) ?
sire->ire_flags & 8540 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 8541 &(save_ire->ire_uinfo), 8542 NULL, 8543 gcgrp, 8544 ipst); 8545 8546 if (ire == NULL) { 8547 if (gcgrp != NULL) { 8548 GCGRP_REFRELE(gcgrp); 8549 gcgrp = NULL; 8550 } 8551 ire_refrele(save_ire); 8552 break; 8553 } 8554 8555 /* reference now held by IRE */ 8556 gcgrp = NULL; 8557 8558 ire->ire_marks |= ire_marks; 8559 8560 /* Prevent save_ire from getting deleted */ 8561 IRB_REFHOLD(save_ire->ire_bucket); 8562 /* Has it been removed already ? */ 8563 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8564 IRB_REFRELE(save_ire->ire_bucket); 8565 ire_refrele(save_ire); 8566 break; 8567 } 8568 8569 /* 8570 * In the case of multirouting, a copy 8571 * of the packet is made before it is sent. 8572 * The copy is used in the next 8573 * loop to attempt another resolution. 8574 */ 8575 xmit_mp = first_mp; 8576 if ((sire != NULL) && 8577 (sire->ire_flags & RTF_MULTIRT)) { 8578 copy_mp = copymsg(first_mp); 8579 if (copy_mp != NULL) { 8580 xmit_mp = copy_mp; 8581 MULTIRT_DEBUG_TAG(first_mp); 8582 } 8583 } 8584 ire_add_then_send(q, ire, xmit_mp); 8585 8586 /* Assert that it is not deleted yet. */ 8587 ASSERT(save_ire->ire_ptpn != NULL); 8588 IRB_REFRELE(save_ire->ire_bucket); 8589 ire_refrele(save_ire); 8590 8591 if (copy_mp != NULL) { 8592 /* 8593 * If we found a (no)resolver, we ignore any 8594 * trailing top priority IRE_CACHE in further 8595 * loops. This ensures that we do not omit any 8596 * (no)resolver. 8597 * This IRE_CACHE, if any, will be processed 8598 * by another thread entering ip_newroute(). 8599 * IRE_CACHE entries, if any, will be processed 8600 * by another thread entering ip_newroute(), 8601 * (upon resolver response, for instance). 8602 * This aims to force parallel multirt 8603 * resolutions as soon as a packet must be sent. 8604 * In the best case, after the tx of only one 8605 * packet, all reachable routes are resolved. 8606 * Otherwise, the resolution of all RTF_MULTIRT 8607 * routes would require several emissions. 8608 */ 8609 multirt_flags &= ~MULTIRT_CACHEGW; 8610 8611 /* 8612 * Search for the next unresolved multirt 8613 * route. 8614 */ 8615 copy_mp = NULL; 8616 save_ire = NULL; 8617 ire = NULL; 8618 multirt_resolve_next = B_TRUE; 8619 continue; 8620 } 8621 8622 /* 8623 * Don't need sire anymore 8624 */ 8625 if (sire != NULL) 8626 ire_refrele(sire); 8627 8628 ipif_refrele(src_ipif); 8629 ill_refrele(dst_ill); 8630 return; 8631 } 8632 case IRE_IF_RESOLVER: 8633 /* 8634 * We can't build an IRE_CACHE yet, but at least we 8635 * found a resolver that can help. 8636 */ 8637 res_mp = dst_ill->ill_resolver_mp; 8638 if (!OK_RESOLVER_MP(res_mp)) 8639 break; 8640 8641 /* 8642 * To be at this point in the code with a non-zero gw 8643 * means that dst is reachable through a gateway that 8644 * we have never resolved. By changing dst to the gw 8645 * addr we resolve the gateway first. 8646 * When ire_add_then_send() tries to put the IP dg 8647 * to dst, it will reenter ip_newroute() at which 8648 * time we will find the IRE_CACHE for the gw and 8649 * create another IRE_CACHE in case IRE_CACHE above. 8650 */ 8651 if (gw != INADDR_ANY) { 8652 /* 8653 * The source ipif that was determined above was 8654 * relative to the destination address, not the 8655 * gateway's. If src_ipif was not taken out of 8656 * the IRE_IF_RESOLVER entry, we'll need to call 8657 * ipif_select_source() again. 
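 * It is now the gateway, not the final destination, that is being
 * resolved, so the source address may have to be reselected relative
 * to gw.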
8658 */ 8659 if (src_ipif != ire->ire_ipif) { 8660 ipif_refrele(src_ipif); 8661 src_ipif = ipif_select_source(dst_ill, 8662 gw, zoneid); 8663 /* 8664 * In the case of multirouting, it may 8665 * happen that ipif_select_source fails 8666 * as DAD may disallow use of the 8667 * particular source interface. Anyway, 8668 * we need to continue and attempt to 8669 * resolve other multirt routes. 8670 */ 8671 if (src_ipif == NULL) { 8672 if (sire != NULL && 8673 (sire->ire_flags & 8674 RTF_MULTIRT)) { 8675 ire_refrele(ire); 8676 ire = NULL; 8677 multirt_resolve_next = 8678 B_TRUE; 8679 multirt_res_failures++; 8680 continue; 8681 } 8682 if (ip_debug > 2) { 8683 pr_addr_dbg( 8684 "ip_newroute: no " 8685 "src for gw %s ", 8686 AF_INET, &gw); 8687 printf("on " 8688 "interface %s\n", 8689 dst_ill->ill_name); 8690 } 8691 goto icmp_err_ret; 8692 } 8693 } 8694 save_dst = dst; 8695 dst = gw; 8696 gw = INADDR_ANY; 8697 } 8698 8699 /* 8700 * We obtain a partial IRE_CACHE which we will pass 8701 * along with the resolver query. When the response 8702 * comes back it will be there ready for us to add. 8703 * The ire_max_frag is atomically set under the 8704 * irebucket lock in ire_add_v[46]. 8705 */ 8706 8707 ire = ire_create_mp( 8708 (uchar_t *)&dst, /* dest address */ 8709 (uchar_t *)&ip_g_all_ones, /* mask */ 8710 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8711 (uchar_t *)&gw, /* gateway address */ 8712 NULL, /* ire_max_frag */ 8713 NULL, /* no src nce */ 8714 dst_ill->ill_rq, /* recv-from queue */ 8715 dst_ill->ill_wq, /* send-to queue */ 8716 IRE_CACHE, 8717 src_ipif, /* Interface ipif */ 8718 save_ire->ire_mask, /* Parent mask */ 8719 0, 8720 save_ire->ire_ihandle, /* Interface handle */ 8721 0, /* flags if any */ 8722 &(save_ire->ire_uinfo), 8723 NULL, 8724 NULL, 8725 ipst); 8726 8727 if (ire == NULL) { 8728 ire_refrele(save_ire); 8729 break; 8730 } 8731 8732 if ((sire != NULL) && 8733 (sire->ire_flags & RTF_MULTIRT)) { 8734 copy_mp = copymsg(first_mp); 8735 if (copy_mp != NULL) 8736 MULTIRT_DEBUG_TAG(copy_mp); 8737 } 8738 8739 ire->ire_marks |= ire_marks; 8740 8741 /* 8742 * Construct message chain for the resolver 8743 * of the form: 8744 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8745 * Packet could contain a IPSEC_OUT mp. 8746 * 8747 * NOTE : ire will be added later when the response 8748 * comes back from ARP. If the response does not 8749 * come back, ARP frees the packet. For this reason, 8750 * we can't REFHOLD the bucket of save_ire to prevent 8751 * deletions. We may not be able to REFRELE the bucket 8752 * if the response never comes back. Thus, before 8753 * adding the ire, ire_add_v4 will make sure that the 8754 * interface route does not get deleted. This is the 8755 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8756 * where we can always prevent deletions because of 8757 * the synchronous nature of adding IRES i.e 8758 * ire_add_then_send is called after creating the IRE. 8759 */ 8760 ASSERT(ire->ire_mp != NULL); 8761 ire->ire_mp->b_cont = first_mp; 8762 /* Have saved_mp handy, for cleanup if canput fails */ 8763 saved_mp = mp; 8764 mp = copyb(res_mp); 8765 if (mp == NULL) { 8766 /* Prepare for cleanup */ 8767 mp = saved_mp; /* pkt */ 8768 ire_delete(ire); /* ire_mp */ 8769 ire = NULL; 8770 ire_refrele(save_ire); 8771 if (copy_mp != NULL) { 8772 MULTIRT_DEBUG_UNTAG(copy_mp); 8773 freemsg(copy_mp); 8774 copy_mp = NULL; 8775 } 8776 break; 8777 } 8778 linkb(mp, ire->ire_mp); 8779 8780 /* 8781 * Fill in the source and dest addrs for the resolver. 
8782 * NOTE: this depends on memory layouts imposed by
8783 * ill_init().
8784 */
8785 areq = (areq_t *)mp->b_rptr;
8786 addrp = (ipaddr_t *)((char *)areq +
8787 areq->areq_sender_addr_offset);
8788 *addrp = save_ire->ire_src_addr;
8789
8790 ire_refrele(save_ire);
8791 addrp = (ipaddr_t *)((char *)areq +
8792 areq->areq_target_addr_offset);
8793 *addrp = dst;
8794 /* Up to the resolver. */
8795 if (canputnext(dst_ill->ill_rq) &&
8796 !(dst_ill->ill_arp_closing)) {
8797 putnext(dst_ill->ill_rq, mp);
8798 ire = NULL;
8799 if (copy_mp != NULL) {
8800 /*
8801 * If we found a resolver, we ignore
8802 * any trailing top priority IRE_CACHE
8803 * in further loops. This ensures
8804 * that we do not omit any resolver.
8805 * IRE_CACHE entries, if any, will be
8806 * processed next time we enter
8807 * ip_newroute().
8808 */
8809 multirt_flags &= ~MULTIRT_CACHEGW;
8810 /*
8811 * Search for the next unresolved
8812 * multirt route.
8813 */
8814 first_mp = copy_mp;
8815 copy_mp = NULL;
8816 /* Prepare the next resolution loop. */
8817 mp = first_mp;
8818 EXTRACT_PKT_MP(mp, first_mp,
8819 mctl_present);
8820 if (mctl_present)
8821 io = (ipsec_out_t *)
8822 first_mp->b_rptr;
8823 ipha = (ipha_t *)mp->b_rptr;
8824
8825 ASSERT(sire != NULL);
8826
8827 dst = save_dst;
8828 multirt_resolve_next = B_TRUE;
8829 continue;
8830 }
8831
8832 if (sire != NULL)
8833 ire_refrele(sire);
8834
8835 /*
8836 * The response will come back in ip_wput
8837 * with db_type IRE_DB_TYPE.
8838 */
8839 ipif_refrele(src_ipif);
8840 ill_refrele(dst_ill);
8841 return;
8842 } else {
8843 /* Prepare for cleanup */
8844 DTRACE_PROBE1(ip__newroute__drop, mblk_t *,
8845 mp);
8846 mp->b_cont = NULL;
8847 freeb(mp); /* areq */
8848 /*
8849 * This is an ire that was not added to the
8850 * cache; ire_freemblk will handle the release
8851 * of any resources associated with the ire.
8852 */
8853 ire_delete(ire); /* ire_mp */
8854 mp = saved_mp; /* pkt */
8855 ire = NULL;
8856 if (copy_mp != NULL) {
8857 MULTIRT_DEBUG_UNTAG(copy_mp);
8858 freemsg(copy_mp);
8859 copy_mp = NULL;
8860 }
8861 break;
8862 }
8863 default:
8864 break;
8865 }
8866 } while (multirt_resolve_next);
8867
8868 ip1dbg(("ip_newroute: dropped\n"));
8869 /* Did this packet originate externally? */
8870 if (mp->b_prev) {
8871 mp->b_next = NULL;
8872 mp->b_prev = NULL;
8873 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
8874 } else {
8875 if (dst_ill != NULL) {
8876 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
8877 } else {
8878 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
8879 }
8880 }
8881 ASSERT(copy_mp == NULL);
8882 MULTIRT_DEBUG_UNTAG(first_mp);
8883 freemsg(first_mp);
8884 if (ire != NULL)
8885 ire_refrele(ire);
8886 if (sire != NULL)
8887 ire_refrele(sire);
8888 if (src_ipif != NULL)
8889 ipif_refrele(src_ipif);
8890 if (dst_ill != NULL)
8891 ill_refrele(dst_ill);
8892 return;
8893
8894 icmp_err_ret:
8895 ip1dbg(("ip_newroute: no route\n"));
8896 if (src_ipif != NULL)
8897 ipif_refrele(src_ipif);
8898 if (dst_ill != NULL)
8899 ill_refrele(dst_ill);
8900 if (sire != NULL)
8901 ire_refrele(sire);
8902 /* Did this packet originate externally? */
8903 if (mp->b_prev) {
8904 mp->b_next = NULL;
8905 mp->b_prev = NULL;
8906 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes);
8907 q = WR(q);
8908 } else {
8909 /*
8910 * There is no outgoing ill, so just increment the
8911 * system MIB.
8912 */
8913 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
8914 /*
8915 * Since ip_wput() isn't close to finished, we fill
8916 * in enough of the header for credible error reporting.
8917 */
8918 if (ip_hdr_complete(ipha, zoneid, ipst)) {
8919 /* Failed */
8920 MULTIRT_DEBUG_UNTAG(first_mp);
8921 freemsg(first_mp);
8922 if (ire != NULL)
8923 ire_refrele(ire);
8924 return;
8925 }
8926 }
8927
8928 /*
8929 * At this point we will have ire only if RTF_BLACKHOLE
8930 * or RTF_REJECT flags are set on the IRE. It will not
8931 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
8932 */
8933 if (ire != NULL) {
8934 if (ire->ire_flags & RTF_BLACKHOLE) {
8935 ire_refrele(ire);
8936 MULTIRT_DEBUG_UNTAG(first_mp);
8937 freemsg(first_mp);
8938 return;
8939 }
8940 ire_refrele(ire);
8941 }
8942 if (ip_source_routed(ipha, ipst)) {
8943 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED,
8944 zoneid, ipst);
8945 return;
8946 }
8947 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst);
8948 }
8949
8950 ip_opt_info_t zero_info;
8951
8952 /*
8953 * IPv4 -
8954 * ip_newroute_ipif is called by ip_wput_multicast and
8955 * ip_rput_forward_multicast whenever we need to send
8956 * out a packet to a destination address for which we do not have specific
8957 * routing information. It is used when the packet will be sent out
8958 * on a specific interface. It is also called by ip_wput() when the
8959 * IP_BOUND_IF socket option is set, or when an ICMP error message needs
8960 * to go out on a particular interface for a unicast packet.
8961 *
8962 * In most cases, the destination address is resolved thanks to the ipif
8963 * intrinsic resolver. However, there are some cases where the call to
8964 * ip_newroute_ipif must take into account the potential presence of
8965 * RTF_SETSRC and/or RTF_MULTIRT flags in an IRE_OFFSUBNET ire
8966 * that uses the interface. This is specified through flags,
8967 * which can be a combination of:
8968 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC
8969 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address
8970 * and flags. Additionally, the packet source address has to be set to
8971 * the specified address. The caller is thus expected to set this flag
8972 * if the packet has no specific source address yet.
8973 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT
8974 * flag, the resulting ire will inherit the flag. All unresolved routes
8975 * to the destination must be explored in the same call to
8976 * ip_newroute_ipif().
8977 */
8978 static void
8979 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
8980 conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop)
8981 {
8982 areq_t *areq;
8983 ire_t *ire = NULL;
8984 mblk_t *res_mp;
8985 ipaddr_t *addrp;
8986 mblk_t *first_mp;
8987 ire_t *save_ire = NULL;
8988 ipif_t *src_ipif = NULL;
8989 ushort_t ire_marks = 0;
8990 ill_t *dst_ill = NULL;
8991 ipha_t *ipha;
8992 mblk_t *saved_mp;
8993 ire_t *fire = NULL;
8994 mblk_t *copy_mp = NULL;
8995 boolean_t multirt_resolve_next;
8996 boolean_t unspec_src;
8997 ipaddr_t ipha_dst;
8998 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
8999
9000 /*
9001 * CGTP runs in a loop that looks up a new ipif; do an ipif_refhold
9002 * here for uniformity.
9003 */
9004 ipif_refhold(ipif);
9005
9006 /*
9007 * This loop is run only once in most cases.
9008 * We loop to resolve further routes only when the destination
9009 * can be reached through multiple RTF_MULTIRT-flagged ires.
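 *
 * The control flow, reduced to its skeleton, looks like the sketch
 * below (illustrative only; the real loop body is the code that
 * follows):
 *
 *	do {
 *		multirt_resolve_next = B_FALSE;
 *		... look up or create an ire for the current dst ...
 *		if (copy_mp != NULL) {
 *			mp = copy_mp;	(another multirt route left)
 *			copy_mp = NULL;
 *			multirt_resolve_next = B_TRUE;
 *			continue;
 *		}
 *		return;			(common single-route case)
 *	} while (multirt_resolve_next);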
9010 */
9011 do {
9012 if (dst_ill != NULL) {
9013 ill_refrele(dst_ill);
9014 dst_ill = NULL;
9015 }
9016 if (src_ipif != NULL) {
9017 ipif_refrele(src_ipif);
9018 src_ipif = NULL;
9019 }
9020 multirt_resolve_next = B_FALSE;
9021
9022 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst),
9023 ipif->ipif_ill->ill_name));
9024
9025 first_mp = mp;
9026 if (DB_TYPE(mp) == M_CTL)
9027 mp = mp->b_cont;
9028 ipha = (ipha_t *)mp->b_rptr;
9029
9030 /*
9031 * Save the packet destination address; we may need it after
9032 * the packet has been consumed.
9033 */
9034 ipha_dst = ipha->ipha_dst;
9035
9036 /*
9037 * If the interface is a pt-pt interface we look for an
9038 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the
9039 * local_address and the pt-pt destination address. Otherwise
9040 * we just match the local address.
9041 * NOTE: dst could be different from ipha->ipha_dst in case
9042 * of sending igmp multicast packets over a point-to-point
9043 * connection.
9044 * Thus we must be careful to check that ipha_dst is a
9045 * multicast address; otherwise multicast packets will take
9046 * the xmit_if path, resulting in kernel stack overflow from
9047 * repeated calls to ip_newroute_ipif from ire_send().
9048 */
9049 if (CLASSD(ipha_dst) &&
9050 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) {
9051 goto err_ret;
9052 }
9053
9054 /*
9055 * We check if an IRE_OFFSUBNET ire exists for the addr that
9056 * goes through ipif. We need it to determine if the RTF_SETSRC
9057 * and/or RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET
9058 * ire may propagate its flags to the new ire.
9059 */
9060 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) {
9061 fire = ipif_lookup_multi_ire(ipif, ipha_dst);
9062 ip2dbg(("ip_newroute_ipif: "
9063 "ipif_lookup_multi_ire("
9064 "ipif %p, dst %08x) = fire %p\n",
9065 (void *)ipif, ntohl(dst), (void *)fire));
9066 }
9067
9068 /*
9069 * Note: While we pick a dst_ill we are really only
9070 * interested in the ill for load spreading. The source
9071 * ipif is determined by source address selection below.
9072 */
9073 if (IS_IPMP(ipif->ipif_ill)) {
9074 ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp;
9075
9076 if (CLASSD(ipha_dst))
9077 dst_ill = ipmp_illgrp_hold_cast_ill(illg);
9078 else
9079 dst_ill = ipmp_illgrp_hold_next_ill(illg);
9080 } else {
9081 dst_ill = ipif->ipif_ill;
9082 ill_refhold(dst_ill);
9083 }
9084
9085 if (dst_ill == NULL) {
9086 if (ip_debug > 2) {
9087 pr_addr_dbg("ip_newroute_ipif: no dst ill "
9088 "for dst %s\n", AF_INET, &dst);
9089 }
9090 goto err_ret;
9091 }
9092
9093 /*
9094 * Pick a source address preferring non-deprecated ones.
9095 * Unlike ip_newroute, we don't do any further source address
9096 * selection here, since for multicast it really does not help
9097 * in inbound load spreading as in the unicast case.
9098 */
9099 if ((flags & RTF_SETSRC) && (fire != NULL) &&
9100 (fire->ire_flags & RTF_SETSRC)) {
9101 /*
9102 * As requested by flags, an IRE_OFFSUBNET was looked up
9103 * on that interface. This ire has the RTF_SETSRC flag, so
9104 * the source address of the packet must be changed.
9105 * Check that the ipif matching the requested source
9106 * address still exists.
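 *
 * Such an IRE_OFFSUBNET typically comes from a CGTP-style route
 * configured with a source override, e.g. (a sketch, assuming
 * route(1M)'s -setsrc and -multirt syntax; addresses are
 * illustrative):
 *
 *	route add 224.0.0.0 -netmask 240.0.0.0 -iface 10.0.0.1 \
 *	    -setsrc 10.0.0.5 -multirt
 *
 * in which case fire->ire_src_addr below would be 10.0.0.5.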
9107 */
9108 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL,
9109 zoneid, NULL, NULL, NULL, NULL, ipst);
9110 }
9111
9112 unspec_src = (connp != NULL && connp->conn_unspec_src);
9113
9114 if (!IS_UNDER_IPMP(ipif->ipif_ill) &&
9115 (IS_IPMP(ipif->ipif_ill) ||
9116 (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
9117 (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP ||
9118 (connp != NULL && ipif->ipif_zoneid != zoneid &&
9119 ipif->ipif_zoneid != ALL_ZONES)) &&
9120 (src_ipif == NULL) &&
9121 (!unspec_src || ipha->ipha_src != INADDR_ANY)) {
9122 src_ipif = ipif_select_source(dst_ill, dst, zoneid);
9123 if (src_ipif == NULL) {
9124 if (ip_debug > 2) {
9125 /* ip1dbg */
9126 pr_addr_dbg("ip_newroute_ipif: "
9127 "no src for dst %s",
9128 AF_INET, &dst);
9129 }
9130 ip1dbg((" on interface %s\n",
9131 dst_ill->ill_name));
9132 goto err_ret;
9133 }
9134 ipif_refrele(ipif);
9135 ipif = src_ipif;
9136 ipif_refhold(ipif);
9137 }
9138 if (src_ipif == NULL) {
9139 src_ipif = ipif;
9140 ipif_refhold(src_ipif);
9141 }
9142
9143 /*
9144 * Assign a source address while we have the conn.
9145 * We can't have ip_wput_ire pick a source address when the
9146 * packet returns from arp since conn_unspec_src might be set
9147 * and we lose the conn when going through arp.
9148 */
9149 if (ipha->ipha_src == INADDR_ANY && !unspec_src)
9150 ipha->ipha_src = src_ipif->ipif_src_addr;
9151
9152 /*
9153 * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible
9154 * that the outgoing interface does not have an interface ire.
9155 */
9156 if (CLASSD(ipha_dst) && (connp == NULL ||
9157 connp->conn_outgoing_ill == NULL) &&
9158 infop->ip_opt_ill_index == 0) {
9159 /* ipif_to_ire returns a held ire */
9160 ire = ipif_to_ire(ipif);
9161 if (ire == NULL)
9162 goto err_ret;
9163 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
9164 goto err_ret;
9165 save_ire = ire;
9166
9167 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, "
9168 "flags %04x\n",
9169 (void *)ire, (void *)ipif, flags));
9170 if ((flags & RTF_MULTIRT) && (fire != NULL) &&
9171 (fire->ire_flags & RTF_MULTIRT)) {
9172 /*
9173 * As requested by flags, an IRE_OFFSUBNET was
9174 * looked up on that interface. This ire has the
9175 * RTF_MULTIRT flag, so the resolution loop will
9176 * be re-entered to resolve additional routes on
9177 * other interfaces. For that purpose, a copy of
9178 * the packet is made at this point.
9179 */
9180 fire->ire_last_used_time = lbolt;
9181 copy_mp = copymsg(first_mp);
9182 if (copy_mp) {
9183 MULTIRT_DEBUG_TAG(copy_mp);
9184 }
9185 }
9186 if ((flags & RTF_SETSRC) && (fire != NULL) &&
9187 (fire->ire_flags & RTF_SETSRC)) {
9188 /*
9189 * As requested by flags, an IRE_OFFSUBNET was
9190 * looked up on that interface. This ire has the
9191 * RTF_SETSRC flag, so the source address of the
9192 * packet must be changed.
9193 */
9194 ipha->ipha_src = fire->ire_src_addr;
9195 }
9196 } else {
9197 /*
9198 * The only ways we can come here are:
9199 * 1) IP_BOUND_IF socket option is set
9200 * 2) SO_DONTROUTE socket option is set
9201 * 3) IP_PKTINFO option is passed in as ancillary data.
9202 * In all cases, the new ire will not be added
9203 * into the cache table.
9204 */
9205 ASSERT(connp == NULL || connp->conn_dontroute ||
9206 connp->conn_outgoing_ill != NULL ||
9207 infop->ip_opt_ill_index != 0);
9208 ire_marks |= IRE_MARK_NOADD;
9209 }
9210
9211 switch (ipif->ipif_net_type) {
9212 case IRE_IF_NORESOLVER: {
9213 /* We have what we need to build an IRE_CACHE.
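 *
 * IRE_IF_NORESOLVER means no resolution protocol runs on this
 * link: ill_resolver_mp is already a complete DL_UNITDATA_REQ
 * template, so transmission is conceptually just (a sketch)
 *
 *	mp = copyb(dst_ill->ill_resolver_mp);
 *	linkb(mp, packet);
 *	putnext(dst_ill->ill_wq, mp);
 *
 * and the IRE_CACHE built here is usable at once, unlike the
 * IRE_IF_RESOLVER case below, which must first wait for ARP.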
*/ 9214 9215 if (dst_ill->ill_resolver_mp == NULL) { 9216 ip1dbg(("ip_newroute_ipif: dst_ill %p " 9217 "for IRE_IF_NORESOLVER ire %p has " 9218 "no ill_resolver_mp\n", 9219 (void *)dst_ill, (void *)ire)); 9220 break; 9221 } 9222 9223 /* 9224 * The new ire inherits the IRE_OFFSUBNET flags 9225 * and source address, if this was requested. 9226 */ 9227 ire = ire_create( 9228 (uchar_t *)&dst, /* dest address */ 9229 (uchar_t *)&ip_g_all_ones, /* mask */ 9230 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9231 NULL, /* gateway address */ 9232 &ipif->ipif_mtu, 9233 NULL, /* no src nce */ 9234 dst_ill->ill_rq, /* recv-from queue */ 9235 dst_ill->ill_wq, /* send-to queue */ 9236 IRE_CACHE, 9237 src_ipif, 9238 (save_ire != NULL ? save_ire->ire_mask : 0), 9239 (fire != NULL) ? /* Parent handle */ 9240 fire->ire_phandle : 0, 9241 (save_ire != NULL) ? /* Interface handle */ 9242 save_ire->ire_ihandle : 0, 9243 (fire != NULL) ? 9244 (fire->ire_flags & 9245 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9246 (save_ire == NULL ? &ire_uinfo_null : 9247 &save_ire->ire_uinfo), 9248 NULL, 9249 NULL, 9250 ipst); 9251 9252 if (ire == NULL) { 9253 if (save_ire != NULL) 9254 ire_refrele(save_ire); 9255 break; 9256 } 9257 9258 ire->ire_marks |= ire_marks; 9259 9260 /* 9261 * If IRE_MARK_NOADD is set then we need to convert 9262 * the max_fragp to a useable value now. This is 9263 * normally done in ire_add_v[46]. We also need to 9264 * associate the ire with an nce (normally would be 9265 * done in ip_wput_nondata()). 9266 * 9267 * Note that IRE_MARK_NOADD packets created here 9268 * do not have a non-null ire_mp pointer. The null 9269 * value of ire_bucket indicates that they were 9270 * never added. 9271 */ 9272 if (ire->ire_marks & IRE_MARK_NOADD) { 9273 uint_t max_frag; 9274 9275 max_frag = *ire->ire_max_fragp; 9276 ire->ire_max_fragp = NULL; 9277 ire->ire_max_frag = max_frag; 9278 9279 if ((ire->ire_nce = ndp_lookup_v4( 9280 ire_to_ill(ire), 9281 (ire->ire_gateway_addr != INADDR_ANY ? 9282 &ire->ire_gateway_addr : &ire->ire_addr), 9283 B_FALSE)) == NULL) { 9284 if (save_ire != NULL) 9285 ire_refrele(save_ire); 9286 break; 9287 } 9288 ASSERT(ire->ire_nce->nce_state == 9289 ND_REACHABLE); 9290 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 9291 } 9292 9293 /* Prevent save_ire from getting deleted */ 9294 if (save_ire != NULL) { 9295 IRB_REFHOLD(save_ire->ire_bucket); 9296 /* Has it been removed already ? */ 9297 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 9298 IRB_REFRELE(save_ire->ire_bucket); 9299 ire_refrele(save_ire); 9300 break; 9301 } 9302 } 9303 9304 ire_add_then_send(q, ire, first_mp); 9305 9306 /* Assert that save_ire is not deleted yet. */ 9307 if (save_ire != NULL) { 9308 ASSERT(save_ire->ire_ptpn != NULL); 9309 IRB_REFRELE(save_ire->ire_bucket); 9310 ire_refrele(save_ire); 9311 save_ire = NULL; 9312 } 9313 if (fire != NULL) { 9314 ire_refrele(fire); 9315 fire = NULL; 9316 } 9317 9318 /* 9319 * the resolution loop is re-entered if this 9320 * was requested through flags and if we 9321 * actually are in a multirouting case. 9322 */ 9323 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9324 boolean_t need_resolve = 9325 ire_multirt_need_resolve(ipha_dst, 9326 msg_getlabel(copy_mp), ipst); 9327 if (!need_resolve) { 9328 MULTIRT_DEBUG_UNTAG(copy_mp); 9329 freemsg(copy_mp); 9330 copy_mp = NULL; 9331 } else { 9332 /* 9333 * ipif_lookup_group() calls 9334 * ire_lookup_multi() that uses 9335 * ire_ftable_lookup() to find 9336 * an IRE_INTERFACE for the group. 
* In the multirt case,
9338 * ire_lookup_multi() then invokes
9339 * ire_multirt_lookup() to find
9340 * the next resolvable ire.
9341 * As a result, we obtain a new
9342 * interface, derived from the
9343 * next ire.
9344 */
9345 ipif_refrele(ipif);
9346 ipif = ipif_lookup_group(ipha_dst,
9347 zoneid, ipst);
9348 ip2dbg(("ip_newroute_ipif: "
9349 "multirt dst %08x, ipif %p\n",
9350 htonl(dst), (void *)ipif));
9351 if (ipif != NULL) {
9352 mp = copy_mp;
9353 copy_mp = NULL;
9354 multirt_resolve_next = B_TRUE;
9355 continue;
9356 } else {
9357 freemsg(copy_mp);
9358 }
9359 }
9360 }
9361 if (ipif != NULL)
9362 ipif_refrele(ipif);
9363 ill_refrele(dst_ill);
9364 ipif_refrele(src_ipif);
9365 return;
9366 }
9367 case IRE_IF_RESOLVER:
9368 /*
9369 * We can't build an IRE_CACHE yet, but at least
9370 * we found a resolver that can help.
9371 */
9372 res_mp = dst_ill->ill_resolver_mp;
9373 if (!OK_RESOLVER_MP(res_mp))
9374 break;
9375
9376 /*
9377 * We obtain a partial IRE_CACHE which we will pass
9378 * along with the resolver query. When the response
9379 * comes back it will be there ready for us to add.
9380 * The new ire inherits the IRE_OFFSUBNET flags
9381 * and source address, if this was requested.
9382 * The ire_max_frag is atomically set under the
9383 * irebucket lock in ire_add_v[46]. Only in the
9384 * IRE_MARK_NOADD case do we set it here ourselves.
9385 */
9386 ire = ire_create_mp(
9387 (uchar_t *)&dst, /* dest address */
9388 (uchar_t *)&ip_g_all_ones, /* mask */
9389 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
9390 NULL, /* gateway address */
9391 (ire_marks & IRE_MARK_NOADD) ?
9392 ipif->ipif_mtu : 0, /* max_frag */
9393 NULL, /* no src nce */
9394 dst_ill->ill_rq, /* recv-from queue */
9395 dst_ill->ill_wq, /* send-to queue */
9396 IRE_CACHE,
9397 src_ipif,
9398 (save_ire != NULL ? save_ire->ire_mask : 0),
9399 (fire != NULL) ? /* Parent handle */
9400 fire->ire_phandle : 0,
9401 (save_ire != NULL) ? /* Interface handle */
9402 save_ire->ire_ihandle : 0,
9403 (fire != NULL) ? /* flags if any */
9404 (fire->ire_flags &
9405 (RTF_SETSRC | RTF_MULTIRT)) : 0,
9406 (save_ire == NULL ? &ire_uinfo_null :
9407 &save_ire->ire_uinfo),
9408 NULL,
9409 NULL,
9410 ipst);
9411
9412 if (save_ire != NULL) {
9413 ire_refrele(save_ire);
9414 save_ire = NULL;
9415 }
9416 if (ire == NULL)
9417 break;
9418
9419 ire->ire_marks |= ire_marks;
9420 /*
9421 * Construct message chain for the resolver of the
9422 * form:
9423 * ARP_REQ_MBLK-->IRE_MBLK-->Packet
9424 *
9425 * NOTE: ire will be added later when the response
9426 * comes back from ARP. If the response does not
9427 * come back, ARP frees the packet. For this reason,
9428 * we can't REFHOLD the bucket of save_ire to prevent
9429 * deletions. We may not be able to REFRELE the
9430 * bucket if the response never comes back.
9431 * Thus, before adding the ire, ire_add_v4 will make
9432 * sure that the interface route does not get deleted.
9433 * This is the only such case; unlike ip_newroute_v6
9434 * and ip_newroute_ipif_v6, we cannot always prevent
9435 * deletions here, because the ire is not added until
9436 * the ARP response arrives.
9437 * If IRE_MARK_NOADD is set, then ire_add_then_send
9438 * does not add this IRE into the IRE CACHE.
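 *
 * The chain handed to ARP therefore looks like this (a sketch):
 *
 *	areq mblk, from copyb(res_mp)
 *	  b_cont -> ire_mp, the partial IRE_CACHE
 *	    b_cont -> first_mp, the packet (possibly M_CTL-led)
 *
 * which is exactly the ARP_REQ_MBLK-->IRE_MBLK-->Packet form
 * that the linkb() below constructs.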
9439 */ 9440 ASSERT(ire->ire_mp != NULL); 9441 ire->ire_mp->b_cont = first_mp; 9442 /* Have saved_mp handy, for cleanup if canput fails */ 9443 saved_mp = mp; 9444 mp = copyb(res_mp); 9445 if (mp == NULL) { 9446 /* Prepare for cleanup */ 9447 mp = saved_mp; /* pkt */ 9448 ire_delete(ire); /* ire_mp */ 9449 ire = NULL; 9450 if (copy_mp != NULL) { 9451 MULTIRT_DEBUG_UNTAG(copy_mp); 9452 freemsg(copy_mp); 9453 copy_mp = NULL; 9454 } 9455 break; 9456 } 9457 linkb(mp, ire->ire_mp); 9458 9459 /* 9460 * Fill in the source and dest addrs for the resolver. 9461 * NOTE: this depends on memory layouts imposed by 9462 * ill_init(). There are corner cases above where we 9463 * might've created the IRE with an INADDR_ANY source 9464 * address (e.g., if the zeroth ipif on an underlying 9465 * ill in an IPMP group is 0.0.0.0, but another ipif 9466 * on the ill has a usable test address). If so, tell 9467 * ARP to use ipha_src as its sender address. 9468 */ 9469 areq = (areq_t *)mp->b_rptr; 9470 addrp = (ipaddr_t *)((char *)areq + 9471 areq->areq_sender_addr_offset); 9472 if (ire->ire_src_addr != INADDR_ANY) 9473 *addrp = ire->ire_src_addr; 9474 else 9475 *addrp = ipha->ipha_src; 9476 addrp = (ipaddr_t *)((char *)areq + 9477 areq->areq_target_addr_offset); 9478 *addrp = dst; 9479 /* Up to the resolver. */ 9480 if (canputnext(dst_ill->ill_rq) && 9481 !(dst_ill->ill_arp_closing)) { 9482 putnext(dst_ill->ill_rq, mp); 9483 /* 9484 * The response will come back in ip_wput 9485 * with db_type IRE_DB_TYPE. 9486 */ 9487 } else { 9488 mp->b_cont = NULL; 9489 freeb(mp); /* areq */ 9490 ire_delete(ire); /* ire_mp */ 9491 saved_mp->b_next = NULL; 9492 saved_mp->b_prev = NULL; 9493 freemsg(first_mp); /* pkt */ 9494 ip2dbg(("ip_newroute_ipif: dropped\n")); 9495 } 9496 9497 if (fire != NULL) { 9498 ire_refrele(fire); 9499 fire = NULL; 9500 } 9501 9502 /* 9503 * The resolution loop is re-entered if this was 9504 * requested through flags and we actually are 9505 * in a multirouting case. 9506 */ 9507 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9508 boolean_t need_resolve = 9509 ire_multirt_need_resolve(ipha_dst, 9510 msg_getlabel(copy_mp), ipst); 9511 if (!need_resolve) { 9512 MULTIRT_DEBUG_UNTAG(copy_mp); 9513 freemsg(copy_mp); 9514 copy_mp = NULL; 9515 } else { 9516 /* 9517 * ipif_lookup_group() calls 9518 * ire_lookup_multi() that uses 9519 * ire_ftable_lookup() to find 9520 * an IRE_INTERFACE for the group. 9521 * In the multirt case, 9522 * ire_lookup_multi() then invokes 9523 * ire_multirt_lookup() to find 9524 * the next resolvable ire. 9525 * As a result, we obtain an new 9526 * interface, derived from the 9527 * next ire. 9528 */ 9529 ipif_refrele(ipif); 9530 ipif = ipif_lookup_group(ipha_dst, 9531 zoneid, ipst); 9532 if (ipif != NULL) { 9533 mp = copy_mp; 9534 copy_mp = NULL; 9535 multirt_resolve_next = B_TRUE; 9536 continue; 9537 } else { 9538 freemsg(copy_mp); 9539 } 9540 } 9541 } 9542 if (ipif != NULL) 9543 ipif_refrele(ipif); 9544 ill_refrele(dst_ill); 9545 ipif_refrele(src_ipif); 9546 return; 9547 default: 9548 break; 9549 } 9550 } while (multirt_resolve_next); 9551 9552 err_ret: 9553 ip2dbg(("ip_newroute_ipif: dropped\n")); 9554 if (fire != NULL) 9555 ire_refrele(fire); 9556 ipif_refrele(ipif); 9557 /* Did this packet originate externally? 
*/
9558 if (dst_ill != NULL)
9559 ill_refrele(dst_ill);
9560 if (src_ipif != NULL)
9561 ipif_refrele(src_ipif);
9562 if (mp->b_prev || mp->b_next) {
9563 mp->b_next = NULL;
9564 mp->b_prev = NULL;
9565 } else {
9566 /*
9567 * Since ip_wput() isn't close to finished, we fill
9568 * in enough of the header for credible error reporting.
9569 */
9570 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) {
9571 /* Failed */
9572 freemsg(first_mp);
9573 if (ire != NULL)
9574 ire_refrele(ire);
9575 return;
9576 }
9577 }
9578 /*
9579 * At this point we will have ire only if RTF_BLACKHOLE
9580 * or RTF_REJECT flags are set on the IRE. It will not
9581 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
9582 */
9583 if (ire != NULL) {
9584 if (ire->ire_flags & RTF_BLACKHOLE) {
9585 ire_refrele(ire);
9586 freemsg(first_mp);
9587 return;
9588 }
9589 ire_refrele(ire);
9590 }
9591 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst);
9592 }
9593
9594 /* Name/Value Table Lookup Routine */
9595 char *
9596 ip_nv_lookup(nv_t *nv, int value)
9597 {
9598 if (!nv)
9599 return (NULL);
9600 for (; nv->nv_name; nv++) {
9601 if (nv->nv_value == value)
9602 return (nv->nv_name);
9603 }
9604 return ("unknown");
9605 }
9606
9607 /*
9608 * This is a module open, i.e. this is a control stream for access
9609 * to a DLPI device. We allocate an ill_t as the instance data in
9610 * this case.
9611 */
9612 int
9613 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9614 {
9615 ill_t *ill;
9616 int err;
9617 zoneid_t zoneid;
9618 netstack_t *ns;
9619 ip_stack_t *ipst;
9620
9621 /*
9622 * Prevent unprivileged processes from pushing IP so that
9623 * they can't send raw IP.
9624 */
9625 if (secpolicy_net_rawaccess(credp) != 0)
9626 return (EPERM);
9627
9628 ns = netstack_find_by_cred(credp);
9629 ASSERT(ns != NULL);
9630 ipst = ns->netstack_ip;
9631 ASSERT(ipst != NULL);
9632
9633 /*
9634 * For exclusive stacks we set the zoneid to zero
9635 * to make IP operate as if in the global zone.
9636 */
9637 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
9638 zoneid = GLOBAL_ZONEID;
9639 else
9640 zoneid = crgetzoneid(credp);
9641
9642 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
9643 q->q_ptr = WR(q)->q_ptr = ill;
9644 ill->ill_ipst = ipst;
9645 ill->ill_zoneid = zoneid;
9646
9647 /*
9648 * ill_init initializes the ill fields and then sends
9649 * down a DL_INFO_REQ after calling qprocson.
9650 */
9651 err = ill_init(q, ill);
9652 if (err != 0) {
9653 mi_free(ill);
9654 netstack_rele(ipst->ips_netstack);
9655 q->q_ptr = NULL;
9656 WR(q)->q_ptr = NULL;
9657 return (err);
9658 }
9659
9660 /* ill_init initializes the ipsq marking this thread as writer */
9661 ipsq_exit(ill->ill_phyint->phyint_ipsq);
9662 /* Wait for the DL_INFO_ACK */
9663 mutex_enter(&ill->ill_lock);
9664 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
9665 /*
9666 * Return value of 0 indicates a pending signal.
9667 */
9668 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
9669 if (err == 0) {
9670 mutex_exit(&ill->ill_lock);
9671 (void) ip_close(q, 0);
9672 return (EINTR);
9673 }
9674 }
9675 mutex_exit(&ill->ill_lock);
9676
9677 /*
9678 * ip_rput_other could have set an error in ill_error on
9679 * receipt of M_ERROR.
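 *
 * The DL_INFO_ACK wait above follows the standard interruptible
 * cv pattern (a sketch):
 *
 *	mutex_enter(&lock);
 *	while (!condition) {
 *		if (cv_wait_sig(&cv, &lock) == 0) {
 *			mutex_exit(&lock);
 *			return (EINTR);		(signal arrived)
 *		}
 *	}
 *	mutex_exit(&lock);
 *
 * so a user-level open blocked here can be interrupted cleanly.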
9680 */ 9681 9682 err = ill->ill_error; 9683 if (err != 0) { 9684 (void) ip_close(q, 0); 9685 return (err); 9686 } 9687 9688 ill->ill_credp = credp; 9689 crhold(credp); 9690 9691 mutex_enter(&ipst->ips_ip_mi_lock); 9692 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, 9693 credp); 9694 mutex_exit(&ipst->ips_ip_mi_lock); 9695 if (err) { 9696 (void) ip_close(q, 0); 9697 return (err); 9698 } 9699 return (0); 9700 } 9701 9702 /* For /dev/ip aka AF_INET open */ 9703 int 9704 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9705 { 9706 return (ip_open(q, devp, flag, sflag, credp, B_FALSE)); 9707 } 9708 9709 /* For /dev/ip6 aka AF_INET6 open */ 9710 int 9711 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9712 { 9713 return (ip_open(q, devp, flag, sflag, credp, B_TRUE)); 9714 } 9715 9716 /* IP open routine. */ 9717 int 9718 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 9719 boolean_t isv6) 9720 { 9721 conn_t *connp; 9722 major_t maj; 9723 zoneid_t zoneid; 9724 netstack_t *ns; 9725 ip_stack_t *ipst; 9726 9727 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 9728 9729 /* Allow reopen. */ 9730 if (q->q_ptr != NULL) 9731 return (0); 9732 9733 if (sflag & MODOPEN) { 9734 /* This is a module open */ 9735 return (ip_modopen(q, devp, flag, sflag, credp)); 9736 } 9737 9738 if ((flag & ~(FKLYR)) == IP_HELPER_STR) { 9739 /* 9740 * Non streams based socket looking for a stream 9741 * to access IP 9742 */ 9743 return (ip_helper_stream_setup(q, devp, flag, sflag, 9744 credp, isv6)); 9745 } 9746 9747 ns = netstack_find_by_cred(credp); 9748 ASSERT(ns != NULL); 9749 ipst = ns->netstack_ip; 9750 ASSERT(ipst != NULL); 9751 9752 /* 9753 * For exclusive stacks we set the zoneid to zero 9754 * to make IP operate as if in the global zone. 9755 */ 9756 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9757 zoneid = GLOBAL_ZONEID; 9758 else 9759 zoneid = crgetzoneid(credp); 9760 9761 /* 9762 * We are opening as a device. This is an IP client stream, and we 9763 * allocate an conn_t as the instance data. 9764 */ 9765 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack); 9766 9767 /* 9768 * ipcl_conn_create did a netstack_hold. Undo the hold that was 9769 * done by netstack_find_by_cred() 9770 */ 9771 netstack_rele(ipst->ips_netstack); 9772 9773 connp->conn_zoneid = zoneid; 9774 connp->conn_sqp = NULL; 9775 connp->conn_initial_sqp = NULL; 9776 connp->conn_final_sqp = NULL; 9777 9778 connp->conn_upq = q; 9779 q->q_ptr = WR(q)->q_ptr = connp; 9780 9781 if (flag & SO_SOCKSTR) 9782 connp->conn_flags |= IPCL_SOCKET; 9783 9784 /* Minor tells us which /dev entry was opened */ 9785 if (isv6) { 9786 connp->conn_af_isv6 = B_TRUE; 9787 ip_setpktversion(connp, isv6, B_FALSE, ipst); 9788 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9789 } else { 9790 connp->conn_af_isv6 = B_FALSE; 9791 connp->conn_pkt_isv6 = B_FALSE; 9792 } 9793 9794 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 9795 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 9796 connp->conn_minor_arena = ip_minor_arena_la; 9797 } else { 9798 /* 9799 * Either minor numbers in the large arena were exhausted 9800 * or a non socket application is doing the open. 9801 * Try to allocate from the small arena. 
9802 */ 9803 if ((connp->conn_dev = 9804 inet_minor_alloc(ip_minor_arena_sa)) == 0) { 9805 /* CONN_DEC_REF takes care of netstack_rele() */ 9806 q->q_ptr = WR(q)->q_ptr = NULL; 9807 CONN_DEC_REF(connp); 9808 return (EBUSY); 9809 } 9810 connp->conn_minor_arena = ip_minor_arena_sa; 9811 } 9812 9813 maj = getemajor(*devp); 9814 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9815 9816 /* 9817 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9818 */ 9819 connp->conn_cred = credp; 9820 9821 /* 9822 * Handle IP_RTS_REQUEST and other ioctls which use conn_recv 9823 */ 9824 connp->conn_recv = ip_conn_input; 9825 9826 crhold(connp->conn_cred); 9827 9828 /* 9829 * If the caller has the process-wide flag set, then default to MAC 9830 * exempt mode. This allows read-down to unlabeled hosts. 9831 */ 9832 if (getpflags(NET_MAC_AWARE, credp) != 0) 9833 connp->conn_mac_exempt = B_TRUE; 9834 9835 connp->conn_rq = q; 9836 connp->conn_wq = WR(q); 9837 9838 /* Non-zero default values */ 9839 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9840 9841 /* 9842 * Make the conn globally visible to walkers 9843 */ 9844 ASSERT(connp->conn_ref == 1); 9845 mutex_enter(&connp->conn_lock); 9846 connp->conn_state_flags &= ~CONN_INCIPIENT; 9847 mutex_exit(&connp->conn_lock); 9848 9849 qprocson(q); 9850 9851 return (0); 9852 } 9853 9854 /* 9855 * Change the output format (IPv4 vs. IPv6) for a conn_t. 9856 * Note that there is no race since either ip_output function works - it 9857 * is just an optimization to enter the best ip_output routine directly. 9858 */ 9859 void 9860 ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, 9861 ip_stack_t *ipst) 9862 { 9863 if (isv6) { 9864 if (bump_mib) { 9865 BUMP_MIB(&ipst->ips_ip6_mib, 9866 ipIfStatsOutSwitchIPVersion); 9867 } 9868 connp->conn_send = ip_output_v6; 9869 connp->conn_pkt_isv6 = B_TRUE; 9870 } else { 9871 if (bump_mib) { 9872 BUMP_MIB(&ipst->ips_ip_mib, 9873 ipIfStatsOutSwitchIPVersion); 9874 } 9875 connp->conn_send = ip_output; 9876 connp->conn_pkt_isv6 = B_FALSE; 9877 } 9878 9879 } 9880 9881 /* 9882 * See if IPsec needs loading because of the options in mp. 9883 */ 9884 static boolean_t 9885 ipsec_opt_present(mblk_t *mp) 9886 { 9887 uint8_t *optcp, *next_optcp, *opt_endcp; 9888 struct opthdr *opt; 9889 struct T_opthdr *topt; 9890 int opthdr_len; 9891 t_uscalar_t optname, optlevel; 9892 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9893 ipsec_req_t *ipsr; 9894 9895 /* 9896 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9897 * return TRUE. 9898 */ 9899 9900 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9901 opt_endcp = optcp + tor->OPT_length; 9902 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9903 opthdr_len = sizeof (struct T_opthdr); 9904 } else { /* O_OPTMGMT_REQ */ 9905 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9906 opthdr_len = sizeof (struct opthdr); 9907 } 9908 for (; optcp < opt_endcp; optcp = next_optcp) { 9909 if (optcp + opthdr_len > opt_endcp) 9910 return (B_FALSE); /* Not enough option header. 
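 *
 * The buffer being walked is a sequence of (header, value)
 * pairs (a sketch, T_OPTMGMT_REQ flavor):
 *
 *	+------------------+
 *	| struct T_opthdr  |  level, name, len (len includes header)
 *	+------------------+
 *	| value bytes      |  len - sizeof (struct T_opthdr)
 *	+------------------+  <- next_optcp = optcp +
 *	| struct T_opthdr  |	_TPI_ALIGN_TOPT(len)
 *	| ...
 *
 * The checks that follow guard against wraparound and bad lengths.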
*/
9911 if (tor->PRIM_type == T_OPTMGMT_REQ) {
9912 topt = (struct T_opthdr *)optcp;
9913 optlevel = topt->level;
9914 optname = topt->name;
9915 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len);
9916 } else {
9917 opt = (struct opthdr *)optcp;
9918 optlevel = opt->level;
9919 optname = opt->name;
9920 next_optcp = optcp + opthdr_len +
9921 _TPI_ALIGN_OPT(opt->len);
9922 }
9923 if ((next_optcp < optcp) || /* wraparound pointer space */
9924 ((next_optcp >= opt_endcp) && /* last option bad len */
9925 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE)))
9926 return (B_FALSE); /* bad option buffer */
9927 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) ||
9928 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) {
9929 /*
9930 * Check to see if it's an all-bypass or all-zeroes
9931 * IPsec request. Don't bother loading IPsec if
9932 * the socket doesn't want to use it. (A good example
9933 * is a bypass request.)
9934 *
9935 * Basically, if any of the non-NEVER bits are set,
9936 * load IPsec.
9937 */
9938 ipsr = (ipsec_req_t *)(optcp + opthdr_len);
9939 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 ||
9940 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 ||
9941 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER)
9942 != 0)
9943 return (B_TRUE);
9944 }
9945 }
9946 return (B_FALSE);
9947 }
9948
9949 /*
9950 * If the conn is waiting for IPsec to finish loading, kick it.
9951 */
9952 /* ARGSUSED */
9953 static void
9954 conn_restart_ipsec_waiter(conn_t *connp, void *arg)
9955 {
9956 t_scalar_t optreq_prim;
9957 mblk_t *mp;
9958 cred_t *cr;
9959 int err = 0;
9960
9961 /*
9962 * This function is called after ipsec loading is complete.
9963 * Since IP checks exclusively and atomically (i.e. it prevents
9964 * the ipsec load from completing until ip_optcom_req completes)
9965 * whether the ipsec load is complete, there cannot be a race with IP
9966 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now.
9967 */
9968 mutex_enter(&connp->conn_lock);
9969 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) {
9970 ASSERT(connp->conn_ipsec_opt_mp != NULL);
9971 mp = connp->conn_ipsec_opt_mp;
9972 connp->conn_ipsec_opt_mp = NULL;
9973 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT;
9974 mutex_exit(&connp->conn_lock);
9975
9976 /*
9977 * All Solaris components should pass a db_credp
9978 * for this TPI message, hence we ASSERT.
9979 * But in case there is some other M_PROTO that looks
9980 * like a TPI message sent by some other kernel
9981 * component, we check and return an error.
9982 */
9983 cr = msg_getcred(mp, NULL);
9984 ASSERT(cr != NULL);
9985 if (cr == NULL) {
9986 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
9987 if (mp != NULL)
9988 qreply(connp->conn_wq, mp);
9989 return;
9990 }
9991
9992 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
9993
9994 optreq_prim = ((union T_primitives *)mp->b_rptr)->type;
9995 if (optreq_prim == T_OPTMGMT_REQ) {
9996 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr,
9997 &ip_opt_obj, B_FALSE);
9998 } else {
9999 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ);
10000 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr,
10001 &ip_opt_obj, B_FALSE);
10002 }
10003 if (err != EINPROGRESS)
10004 CONN_OPER_PENDING_DONE(connp);
10005 return;
10006 }
10007 mutex_exit(&connp->conn_lock);
10008 }
10009
10010 /*
10011 * Called from the ipsec_loader thread, outside any perimeter, to tell
10012 * ip to qenable any of the queues waiting for the ipsec loader to
10013 * complete
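 *
 * ipcl_walk() applies the callback to every conn in the given IP
 * stack, so the effect is, in outline (a sketch):
 *
 *	for each conn_t *connp in ns->netstack_ip:
 *		conn_restart_ipsec_waiter(connp, NULL);
 *
 * Only conns with CONN_IPSEC_LOAD_WAIT set actually do anything;
 * the rest return immediately.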
10014 */ 10015 void 10016 ip_ipsec_load_complete(ipsec_stack_t *ipss) 10017 { 10018 netstack_t *ns = ipss->ipsec_netstack; 10019 10020 ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); 10021 } 10022 10023 /* 10024 * Can't be used. Need to call svr4* -> optset directly. the leaf routine 10025 * determines the grp on which it has to become exclusive, queues the mp 10026 * and IPSQ draining restarts the optmgmt 10027 */ 10028 static boolean_t 10029 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 10030 { 10031 conn_t *connp = Q_TO_CONN(q); 10032 ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; 10033 10034 /* 10035 * Take IPsec requests and treat them special. 10036 */ 10037 if (ipsec_opt_present(mp)) { 10038 /* First check if IPsec is loaded. */ 10039 mutex_enter(&ipss->ipsec_loader_lock); 10040 if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { 10041 mutex_exit(&ipss->ipsec_loader_lock); 10042 return (B_FALSE); 10043 } 10044 mutex_enter(&connp->conn_lock); 10045 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 10046 10047 ASSERT(connp->conn_ipsec_opt_mp == NULL); 10048 connp->conn_ipsec_opt_mp = mp; 10049 mutex_exit(&connp->conn_lock); 10050 mutex_exit(&ipss->ipsec_loader_lock); 10051 10052 ipsec_loader_loadnow(ipss); 10053 return (B_TRUE); 10054 } 10055 return (B_FALSE); 10056 } 10057 10058 /* 10059 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 10060 * all of them are copied to the conn_t. If the req is "zero", the policy is 10061 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 10062 * fields. 10063 * We keep only the latest setting of the policy and thus policy setting 10064 * is not incremental/cumulative. 10065 * 10066 * Requests to set policies with multiple alternative actions will 10067 * go through a different API. 10068 */ 10069 int 10070 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 10071 { 10072 uint_t ah_req = 0; 10073 uint_t esp_req = 0; 10074 uint_t se_req = 0; 10075 ipsec_act_t *actp = NULL; 10076 uint_t nact; 10077 ipsec_policy_head_t *ph; 10078 boolean_t is_pol_reset, is_pol_inserted = B_FALSE; 10079 int error = 0; 10080 netstack_t *ns = connp->conn_netstack; 10081 ip_stack_t *ipst = ns->netstack_ip; 10082 ipsec_stack_t *ipss = ns->netstack_ipsec; 10083 10084 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 10085 10086 /* 10087 * The IP_SEC_OPT option does not allow variable length parameters, 10088 * hence a request cannot be NULL. 10089 */ 10090 if (req == NULL) 10091 return (EINVAL); 10092 10093 ah_req = req->ipsr_ah_req; 10094 esp_req = req->ipsr_esp_req; 10095 se_req = req->ipsr_self_encap_req; 10096 10097 /* Don't allow setting self-encap without one or more of AH/ESP. */ 10098 if (se_req != 0 && esp_req == 0 && ah_req == 0) 10099 return (EINVAL); 10100 10101 /* 10102 * Are we dealing with a request to reset the policy (i.e. 10103 * zero requests). 10104 */ 10105 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 10106 (esp_req & REQ_MASK) == 0 && 10107 (se_req & REQ_MASK) == 0); 10108 10109 if (!is_pol_reset) { 10110 /* 10111 * If we couldn't load IPsec, fail with "protocol 10112 * not supported". 10113 * IPsec may not have been loaded for a request with zero 10114 * policies, so we don't fail in this case. 
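 *
 * Examples of how requests are classified here (a sketch):
 *
 *	ipsec_req_t r;
 *
 *	bzero(&r, sizeof (r));			zero: policy reset
 *
 *	r.ipsr_esp_req = IPSEC_PREF_REQUIRED;	valid: require ESP
 *
 *	r.ipsr_ah_req = IPSEC_PREF_REQUIRED |
 *	    IPSEC_PREF_NEVER;			EINVAL: the two prefs
 *						are mutually exclusive
 *
 *	r.ipsr_self_encap_req = IPSEC_PREF_REQUIRED;
 *						EINVAL if alone: needs
 *						AH and/or ESP as well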
10115 */ 10116 mutex_enter(&ipss->ipsec_loader_lock); 10117 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 10118 mutex_exit(&ipss->ipsec_loader_lock); 10119 return (EPROTONOSUPPORT); 10120 } 10121 mutex_exit(&ipss->ipsec_loader_lock); 10122 10123 /* 10124 * Test for valid requests. Invalid algorithms 10125 * need to be tested by IPsec code because new 10126 * algorithms can be added dynamically. 10127 */ 10128 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10129 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10130 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 10131 return (EINVAL); 10132 } 10133 10134 /* 10135 * Only privileged users can issue these 10136 * requests. 10137 */ 10138 if (((ah_req & IPSEC_PREF_NEVER) || 10139 (esp_req & IPSEC_PREF_NEVER) || 10140 (se_req & IPSEC_PREF_NEVER)) && 10141 secpolicy_ip_config(cr, B_FALSE) != 0) { 10142 return (EPERM); 10143 } 10144 10145 /* 10146 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 10147 * are mutually exclusive. 10148 */ 10149 if (((ah_req & REQ_MASK) == REQ_MASK) || 10150 ((esp_req & REQ_MASK) == REQ_MASK) || 10151 ((se_req & REQ_MASK) == REQ_MASK)) { 10152 /* Both of them are set */ 10153 return (EINVAL); 10154 } 10155 } 10156 10157 mutex_enter(&connp->conn_lock); 10158 10159 /* 10160 * If we have already cached policies in ip_bind_connected*(), don't 10161 * let them change now. We cache policies for connections 10162 * whose src,dst [addr, port] is known. 10163 */ 10164 if (connp->conn_policy_cached) { 10165 mutex_exit(&connp->conn_lock); 10166 return (EINVAL); 10167 } 10168 10169 /* 10170 * We have a zero policies, reset the connection policy if already 10171 * set. This will cause the connection to inherit the 10172 * global policy, if any. 10173 */ 10174 if (is_pol_reset) { 10175 if (connp->conn_policy != NULL) { 10176 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); 10177 connp->conn_policy = NULL; 10178 } 10179 connp->conn_flags &= ~IPCL_CHECK_POLICY; 10180 connp->conn_in_enforce_policy = B_FALSE; 10181 connp->conn_out_enforce_policy = B_FALSE; 10182 mutex_exit(&connp->conn_lock); 10183 return (0); 10184 } 10185 10186 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy, 10187 ipst->ips_netstack); 10188 if (ph == NULL) 10189 goto enomem; 10190 10191 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack); 10192 if (actp == NULL) 10193 goto enomem; 10194 10195 /* 10196 * Always insert IPv4 policy entries, since they can also apply to 10197 * ipv6 sockets being used in ipv4-compat mode. 10198 */ 10199 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4, 10200 IPSEC_TYPE_INBOUND, ns)) 10201 goto enomem; 10202 is_pol_inserted = B_TRUE; 10203 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4, 10204 IPSEC_TYPE_OUTBOUND, ns)) 10205 goto enomem; 10206 10207 /* 10208 * We're looking at a v6 socket, also insert the v6-specific 10209 * entries. 10210 */ 10211 if (connp->conn_af_isv6) { 10212 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6, 10213 IPSEC_TYPE_INBOUND, ns)) 10214 goto enomem; 10215 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6, 10216 IPSEC_TYPE_OUTBOUND, ns)) 10217 goto enomem; 10218 } 10219 10220 ipsec_actvec_free(actp, nact); 10221 10222 /* 10223 * If the requests need security, set enforce_policy. 10224 * If the requests are IPSEC_PREF_NEVER, one should 10225 * still set conn_out_enforce_policy so that an ipsec_out 10226 * gets attached in ip_wput. 
This is needed so that 10227 * for connections that we don't cache policy in ip_bind, 10228 * if global policy matches in ip_wput_attach_policy, we 10229 * don't wrongly inherit global policy. Similarly, we need 10230 * to set conn_in_enforce_policy also so that we don't verify 10231 * policy wrongly. 10232 */ 10233 if ((ah_req & REQ_MASK) != 0 || 10234 (esp_req & REQ_MASK) != 0 || 10235 (se_req & REQ_MASK) != 0) { 10236 connp->conn_in_enforce_policy = B_TRUE; 10237 connp->conn_out_enforce_policy = B_TRUE; 10238 connp->conn_flags |= IPCL_CHECK_POLICY; 10239 } 10240 10241 mutex_exit(&connp->conn_lock); 10242 return (error); 10243 #undef REQ_MASK 10244 10245 /* 10246 * Common memory-allocation-failure exit path. 10247 */ 10248 enomem: 10249 mutex_exit(&connp->conn_lock); 10250 if (actp != NULL) 10251 ipsec_actvec_free(actp, nact); 10252 if (is_pol_inserted) 10253 ipsec_polhead_flush(ph, ns); 10254 return (ENOMEM); 10255 } 10256 10257 /* 10258 * Only for options that pass in an IP addr. Currently only V4 options 10259 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 10260 * So this function assumes level is IPPROTO_IP 10261 */ 10262 int 10263 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 10264 mblk_t *first_mp) 10265 { 10266 ipif_t *ipif = NULL; 10267 int error; 10268 ill_t *ill; 10269 int zoneid; 10270 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10271 10272 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 10273 10274 if (addr != INADDR_ANY || checkonly) { 10275 ASSERT(connp != NULL); 10276 zoneid = IPCL_ZONEID(connp); 10277 if (option == IP_NEXTHOP) { 10278 ipif = ipif_lookup_onlink_addr(addr, 10279 connp->conn_zoneid, ipst); 10280 } else { 10281 ipif = ipif_lookup_addr(addr, NULL, zoneid, 10282 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 10283 &error, ipst); 10284 } 10285 if (ipif == NULL) { 10286 if (error == EINPROGRESS) 10287 return (error); 10288 if ((option == IP_MULTICAST_IF) || 10289 (option == IP_NEXTHOP)) 10290 return (EHOSTUNREACH); 10291 else 10292 return (EINVAL); 10293 } else if (checkonly) { 10294 if (option == IP_MULTICAST_IF) { 10295 ill = ipif->ipif_ill; 10296 /* not supported by the virtual network iface */ 10297 if (IS_VNI(ill)) { 10298 ipif_refrele(ipif); 10299 return (EINVAL); 10300 } 10301 } 10302 ipif_refrele(ipif); 10303 return (0); 10304 } 10305 ill = ipif->ipif_ill; 10306 mutex_enter(&connp->conn_lock); 10307 mutex_enter(&ill->ill_lock); 10308 if ((ill->ill_state_flags & ILL_CONDEMNED) || 10309 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 10310 mutex_exit(&ill->ill_lock); 10311 mutex_exit(&connp->conn_lock); 10312 ipif_refrele(ipif); 10313 return (option == IP_MULTICAST_IF ? 
10314 EHOSTUNREACH : EINVAL);
10315 }
10316 } else {
10317 mutex_enter(&connp->conn_lock);
10318 }
10319
10320 /* None of the options below are supported on the VNI */
10321 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) {
10322 mutex_exit(&ill->ill_lock);
10323 mutex_exit(&connp->conn_lock);
10324 ipif_refrele(ipif);
10325 return (EINVAL);
10326 }
10327
10328 switch (option) {
10329 case IP_MULTICAST_IF:
10330 connp->conn_multicast_ipif = ipif;
10331 break;
10332 case IP_NEXTHOP:
10333 connp->conn_nexthop_v4 = addr;
10334 connp->conn_nexthop_set = B_TRUE;
10335 break;
10336 }
10337
10338 if (ipif != NULL) {
10339 mutex_exit(&ill->ill_lock);
10340 mutex_exit(&connp->conn_lock);
10341 ipif_refrele(ipif);
10342 return (0);
10343 }
10344 mutex_exit(&connp->conn_lock);
10345 /* We succeeded in clearing the option */
10346 return (0);
10347 }
10348
10349 /*
10350 * For options that pass in an ifindex specifying the ill. V6 options always
10351 * pass in an ifindex; some v4 options also pass in an ifindex specifying
10352 * the ill.
10353 */
10353 int
10354 ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
10355 int level, int option, mblk_t *first_mp)
10356 {
10357 ill_t *ill = NULL;
10358 int error = 0;
10359 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
10360
10361 ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex));
10362 if (ifindex != 0) {
10363 ASSERT(connp != NULL);
10364 ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp),
10365 first_mp, ip_restart_optmgmt, &error, ipst);
10366 if (ill != NULL) {
10367 if (checkonly) {
10368 /* not supported by the virtual network iface */
10369 if (IS_VNI(ill)) {
10370 ill_refrele(ill);
10371 return (EINVAL);
10372 }
10373 ill_refrele(ill);
10374 return (0);
10375 }
10376 if (!ipif_lookup_zoneid(ill, connp->conn_zoneid,
10377 0, NULL)) {
10378 ill_refrele(ill);
10379 ill = NULL;
10380 mutex_enter(&connp->conn_lock);
10381 goto setit;
10382 }
10383 mutex_enter(&connp->conn_lock);
10384 mutex_enter(&ill->ill_lock);
10385 if (ill->ill_state_flags & ILL_CONDEMNED) {
10386 mutex_exit(&ill->ill_lock);
10387 mutex_exit(&connp->conn_lock);
10388 ill_refrele(ill);
10389 ill = NULL;
10390 mutex_enter(&connp->conn_lock);
10391 }
10392 goto setit;
10393 } else if (error == EINPROGRESS) {
10394 return (error);
10395 } else {
10396 error = 0;
10397 }
10398 }
10399 mutex_enter(&connp->conn_lock);
10400 setit:
10401 ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6));
10402
10403 /*
10404 * The options below assume that the ILL (if any) transmits and/or
10405 * receives traffic, neither of which is true for the virtual network
10406 * interface, so fail setting these on a VNI.
10407 */
10408 if (IS_VNI(ill)) {
10409 ASSERT(ill != NULL);
10410 mutex_exit(&ill->ill_lock);
10411 mutex_exit(&connp->conn_lock);
10412 ill_refrele(ill);
10413 return (EINVAL);
10414 }
10415
10416 if (level == IPPROTO_IP) {
10417 switch (option) {
10418 case IP_BOUND_IF:
10419 connp->conn_incoming_ill = ill;
10420 connp->conn_outgoing_ill = ill;
10421 break;
10422
10423 case IP_MULTICAST_IF:
10424 /*
10425 * This option is an internal special. The socket
10426 * level IP_MULTICAST_IF specifies an 'ipaddr' and
10427 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF
10428 * specifies an ifindex and we try first on V6 ills.
10429 * If we don't find one, we then try V4 ills
10430 * internally, and we come here.
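 *
 * From the application's side this looks like (a sketch; the
 * interface name is illustrative):
 *
 *	uint_t ifindex = if_nametoindex("bge0");
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
 *	    &ifindex, sizeof (ifindex));
 *
 * If the ifindex names an ill with only IPv4 ipifs, the lookup
 * falls through to the v4 ill and reaches this case with
 * level == IPPROTO_IP.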
10431 */ 10432 if (!checkonly && ill != NULL) { 10433 ipif_t *ipif; 10434 ipif = ill->ill_ipif; 10435 10436 if (ipif->ipif_state_flags & IPIF_CONDEMNED) { 10437 mutex_exit(&ill->ill_lock); 10438 mutex_exit(&connp->conn_lock); 10439 ill_refrele(ill); 10440 ill = NULL; 10441 mutex_enter(&connp->conn_lock); 10442 } else { 10443 connp->conn_multicast_ipif = ipif; 10444 } 10445 } 10446 break; 10447 10448 case IP_DHCPINIT_IF: 10449 if (connp->conn_dhcpinit_ill != NULL) { 10450 /* 10451 * We've locked the conn so conn_cleanup_ill() 10452 * cannot clear conn_dhcpinit_ill -- so it's 10453 * safe to access the ill. 10454 */ 10455 ill_t *oill = connp->conn_dhcpinit_ill; 10456 10457 ASSERT(oill->ill_dhcpinit != 0); 10458 atomic_dec_32(&oill->ill_dhcpinit); 10459 connp->conn_dhcpinit_ill = NULL; 10460 } 10461 10462 if (ill != NULL) { 10463 connp->conn_dhcpinit_ill = ill; 10464 atomic_inc_32(&ill->ill_dhcpinit); 10465 } 10466 break; 10467 } 10468 } else { 10469 switch (option) { 10470 case IPV6_BOUND_IF: 10471 connp->conn_incoming_ill = ill; 10472 connp->conn_outgoing_ill = ill; 10473 break; 10474 10475 case IPV6_MULTICAST_IF: 10476 /* 10477 * Set conn_multicast_ill to be the IPv6 ill. 10478 * Set conn_multicast_ipif to be an IPv4 ipif 10479 * for ifindex to make IPv4 mapped addresses 10480 * on PF_INET6 sockets honor IPV6_MULTICAST_IF. 10481 * Even if no IPv6 ill exists for the ifindex 10482 * we need to check for an IPv4 ifindex in order 10483 * for this to work with mapped addresses. In that 10484 * case only set conn_multicast_ipif. 10485 */ 10486 if (!checkonly) { 10487 if (ifindex == 0) { 10488 connp->conn_multicast_ill = NULL; 10489 connp->conn_multicast_ipif = NULL; 10490 } else if (ill != NULL) { 10491 connp->conn_multicast_ill = ill; 10492 } 10493 } 10494 break; 10495 } 10496 } 10497 10498 if (ill != NULL) { 10499 mutex_exit(&ill->ill_lock); 10500 mutex_exit(&connp->conn_lock); 10501 ill_refrele(ill); 10502 return (0); 10503 } 10504 mutex_exit(&connp->conn_lock); 10505 /* 10506 * We succeeded in clearing the option (ifindex == 0) or failed to 10507 * locate the ill and could not set the option (ifindex != 0) 10508 */ 10509 return (ifindex == 0 ? 0 : EINVAL); 10510 } 10511 10512 /* This routine sets socket options. */ 10513 /* ARGSUSED */ 10514 int 10515 ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, 10516 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 10517 void *dummy, cred_t *cr, mblk_t *first_mp) 10518 { 10519 int *i1 = (int *)invalp; 10520 conn_t *connp = Q_TO_CONN(q); 10521 int error = 0; 10522 boolean_t checkonly; 10523 ire_t *ire; 10524 boolean_t found; 10525 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10526 10527 switch (optset_context) { 10528 10529 case SETFN_OPTCOM_CHECKONLY: 10530 checkonly = B_TRUE; 10531 /* 10532 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 10533 * inlen != 0 implies value supplied and 10534 * we have to "pretend" to set it. 10535 * inlen == 0 implies that there is no 10536 * value part in T_CHECK request and just validation 10537 * done elsewhere should be enough, we just return here. 
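 *
 * In outline, the contract is (a sketch):
 *
 *	T_CHECK, no value	-> name validated elsewhere; return 0
 *	T_CHECK, with value	-> checkonly = B_TRUE: the cases below
 *				   validate but must not change state
 *	T_NEGOTIATE		-> checkonly = B_FALSE: actually set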
10538 */ 10539 if (inlen == 0) { 10540 *outlenp = 0; 10541 return (0); 10542 } 10543 break; 10544 case SETFN_OPTCOM_NEGOTIATE: 10545 case SETFN_UD_NEGOTIATE: 10546 case SETFN_CONN_NEGOTIATE: 10547 checkonly = B_FALSE; 10548 break; 10549 default: 10550 /* 10551 * We should never get here 10552 */ 10553 *outlenp = 0; 10554 return (EINVAL); 10555 } 10556 10557 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 10558 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 10559 10560 /* 10561 * For fixed length options, no sanity check 10562 * of passed in length is done. It is assumed *_optcom_req() 10563 * routines do the right thing. 10564 */ 10565 10566 switch (level) { 10567 case SOL_SOCKET: 10568 /* 10569 * conn_lock protects the bitfields, and is used to 10570 * set the fields atomically. 10571 */ 10572 switch (name) { 10573 case SO_BROADCAST: 10574 if (!checkonly) { 10575 /* TODO: use value someplace? */ 10576 mutex_enter(&connp->conn_lock); 10577 connp->conn_broadcast = *i1 ? 1 : 0; 10578 mutex_exit(&connp->conn_lock); 10579 } 10580 break; /* goto sizeof (int) option return */ 10581 case SO_USELOOPBACK: 10582 if (!checkonly) { 10583 /* TODO: use value someplace? */ 10584 mutex_enter(&connp->conn_lock); 10585 connp->conn_loopback = *i1 ? 1 : 0; 10586 mutex_exit(&connp->conn_lock); 10587 } 10588 break; /* goto sizeof (int) option return */ 10589 case SO_DONTROUTE: 10590 if (!checkonly) { 10591 mutex_enter(&connp->conn_lock); 10592 connp->conn_dontroute = *i1 ? 1 : 0; 10593 mutex_exit(&connp->conn_lock); 10594 } 10595 break; /* goto sizeof (int) option return */ 10596 case SO_REUSEADDR: 10597 if (!checkonly) { 10598 mutex_enter(&connp->conn_lock); 10599 connp->conn_reuseaddr = *i1 ? 1 : 0; 10600 mutex_exit(&connp->conn_lock); 10601 } 10602 break; /* goto sizeof (int) option return */ 10603 case SO_PROTOTYPE: 10604 if (!checkonly) { 10605 mutex_enter(&connp->conn_lock); 10606 connp->conn_proto = *i1; 10607 mutex_exit(&connp->conn_lock); 10608 } 10609 break; /* goto sizeof (int) option return */ 10610 case SO_ALLZONES: 10611 if (!checkonly) { 10612 mutex_enter(&connp->conn_lock); 10613 if (IPCL_IS_BOUND(connp)) { 10614 mutex_exit(&connp->conn_lock); 10615 return (EINVAL); 10616 } 10617 connp->conn_allzones = *i1 != 0 ? 1 : 0; 10618 mutex_exit(&connp->conn_lock); 10619 } 10620 break; /* goto sizeof (int) option return */ 10621 case SO_ANON_MLP: 10622 if (!checkonly) { 10623 mutex_enter(&connp->conn_lock); 10624 connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; 10625 mutex_exit(&connp->conn_lock); 10626 } 10627 break; /* goto sizeof (int) option return */ 10628 case SO_MAC_EXEMPT: 10629 if (secpolicy_net_mac_aware(cr) != 0 || 10630 IPCL_IS_BOUND(connp)) 10631 return (EACCES); 10632 if (!checkonly) { 10633 mutex_enter(&connp->conn_lock); 10634 connp->conn_mac_exempt = *i1 != 0 ? 
1 : 0; 10635 mutex_exit(&connp->conn_lock); 10636 } 10637 break; /* goto sizeof (int) option return */ 10638 default: 10639 /* 10640 * "soft" error (negative) 10641 * option not handled at this level 10642 * Note: Do not modify *outlenp 10643 */ 10644 return (-EINVAL); 10645 } 10646 break; 10647 case IPPROTO_IP: 10648 switch (name) { 10649 case IP_NEXTHOP: 10650 if (secpolicy_ip_config(cr, B_FALSE) != 0) 10651 return (EPERM); 10652 /* FALLTHRU */ 10653 case IP_MULTICAST_IF: { 10654 ipaddr_t addr = *i1; 10655 10656 error = ip_opt_set_ipif(connp, addr, checkonly, name, 10657 first_mp); 10658 if (error != 0) 10659 return (error); 10660 break; /* goto sizeof (int) option return */ 10661 } 10662 10663 case IP_MULTICAST_TTL: 10664 /* Recorded in transport above IP */ 10665 *outvalp = *invalp; 10666 *outlenp = sizeof (uchar_t); 10667 return (0); 10668 case IP_MULTICAST_LOOP: 10669 if (!checkonly) { 10670 mutex_enter(&connp->conn_lock); 10671 connp->conn_multicast_loop = *invalp ? 1 : 0; 10672 mutex_exit(&connp->conn_lock); 10673 } 10674 *outvalp = *invalp; 10675 *outlenp = sizeof (uchar_t); 10676 return (0); 10677 case IP_ADD_MEMBERSHIP: 10678 case MCAST_JOIN_GROUP: 10679 case IP_DROP_MEMBERSHIP: 10680 case MCAST_LEAVE_GROUP: { 10681 struct ip_mreq *mreqp; 10682 struct group_req *greqp; 10683 ire_t *ire; 10684 boolean_t done = B_FALSE; 10685 ipaddr_t group, ifaddr; 10686 struct sockaddr_in *sin; 10687 uint32_t *ifindexp; 10688 boolean_t mcast_opt = B_TRUE; 10689 mcast_record_t fmode; 10690 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10691 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10692 10693 switch (name) { 10694 case IP_ADD_MEMBERSHIP: 10695 mcast_opt = B_FALSE; 10696 /* FALLTHRU */ 10697 case MCAST_JOIN_GROUP: 10698 fmode = MODE_IS_EXCLUDE; 10699 optfn = ip_opt_add_group; 10700 break; 10701 10702 case IP_DROP_MEMBERSHIP: 10703 mcast_opt = B_FALSE; 10704 /* FALLTHRU */ 10705 case MCAST_LEAVE_GROUP: 10706 fmode = MODE_IS_INCLUDE; 10707 optfn = ip_opt_delete_group; 10708 break; 10709 } 10710 10711 if (mcast_opt) { 10712 greqp = (struct group_req *)i1; 10713 sin = (struct sockaddr_in *)&greqp->gr_group; 10714 if (sin->sin_family != AF_INET) { 10715 *outlenp = 0; 10716 return (ENOPROTOOPT); 10717 } 10718 group = (ipaddr_t)sin->sin_addr.s_addr; 10719 ifaddr = INADDR_ANY; 10720 ifindexp = &greqp->gr_interface; 10721 } else { 10722 mreqp = (struct ip_mreq *)i1; 10723 group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; 10724 ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; 10725 ifindexp = NULL; 10726 } 10727 10728 /* 10729 * In the multirouting case, we need to replicate 10730 * the request on all interfaces that will take part 10731 * in replication. We do so because multirouting is 10732 * reflective, thus we will probably receive multi- 10733 * casts on those interfaces. 10734 * The ip_multirt_apply_membership() succeeds if the 10735 * operation succeeds on at least one interface. 
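 *
 * The request being replicated is an ordinary join, e.g.
 * (a sketch; addresses are illustrative):
 *
 *	struct ip_mreq mr;
 *	mr.imr_multiaddr.s_addr = inet_addr("224.1.2.3");
 *	mr.imr_interface.s_addr = INADDR_ANY;
 *	(void) setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 *	    &mr, sizeof (mr));
 *
 * With an RTF_MULTIRT route to 224.1.2.3, the join is applied on
 * every interface named by the multirt routes, not just one.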
10736 */ 10737 ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, 10738 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10739 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10740 if (ire != NULL) { 10741 if (ire->ire_flags & RTF_MULTIRT) { 10742 error = ip_multirt_apply_membership( 10743 optfn, ire, connp, checkonly, group, 10744 fmode, INADDR_ANY, first_mp); 10745 done = B_TRUE; 10746 } 10747 ire_refrele(ire); 10748 } 10749 if (!done) { 10750 error = optfn(connp, checkonly, group, ifaddr, 10751 ifindexp, fmode, INADDR_ANY, first_mp); 10752 } 10753 if (error) { 10754 /* 10755 * EINPROGRESS is a soft error, needs retry 10756 * so don't make *outlenp zero. 10757 */ 10758 if (error != EINPROGRESS) 10759 *outlenp = 0; 10760 return (error); 10761 } 10762 /* OK return - copy input buffer into output buffer */ 10763 if (invalp != outvalp) { 10764 /* don't trust bcopy for identical src/dst */ 10765 bcopy(invalp, outvalp, inlen); 10766 } 10767 *outlenp = inlen; 10768 return (0); 10769 } 10770 case IP_BLOCK_SOURCE: 10771 case IP_UNBLOCK_SOURCE: 10772 case IP_ADD_SOURCE_MEMBERSHIP: 10773 case IP_DROP_SOURCE_MEMBERSHIP: 10774 case MCAST_BLOCK_SOURCE: 10775 case MCAST_UNBLOCK_SOURCE: 10776 case MCAST_JOIN_SOURCE_GROUP: 10777 case MCAST_LEAVE_SOURCE_GROUP: { 10778 struct ip_mreq_source *imreqp; 10779 struct group_source_req *gsreqp; 10780 in_addr_t grp, src, ifaddr = INADDR_ANY; 10781 uint32_t ifindex = 0; 10782 mcast_record_t fmode; 10783 struct sockaddr_in *sin; 10784 ire_t *ire; 10785 boolean_t mcast_opt = B_TRUE, done = B_FALSE; 10786 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10787 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10788 10789 switch (name) { 10790 case IP_BLOCK_SOURCE: 10791 mcast_opt = B_FALSE; 10792 /* FALLTHRU */ 10793 case MCAST_BLOCK_SOURCE: 10794 fmode = MODE_IS_EXCLUDE; 10795 optfn = ip_opt_add_group; 10796 break; 10797 10798 case IP_UNBLOCK_SOURCE: 10799 mcast_opt = B_FALSE; 10800 /* FALLTHRU */ 10801 case MCAST_UNBLOCK_SOURCE: 10802 fmode = MODE_IS_EXCLUDE; 10803 optfn = ip_opt_delete_group; 10804 break; 10805 10806 case IP_ADD_SOURCE_MEMBERSHIP: 10807 mcast_opt = B_FALSE; 10808 /* FALLTHRU */ 10809 case MCAST_JOIN_SOURCE_GROUP: 10810 fmode = MODE_IS_INCLUDE; 10811 optfn = ip_opt_add_group; 10812 break; 10813 10814 case IP_DROP_SOURCE_MEMBERSHIP: 10815 mcast_opt = B_FALSE; 10816 /* FALLTHRU */ 10817 case MCAST_LEAVE_SOURCE_GROUP: 10818 fmode = MODE_IS_INCLUDE; 10819 optfn = ip_opt_delete_group; 10820 break; 10821 } 10822 10823 if (mcast_opt) { 10824 gsreqp = (struct group_source_req *)i1; 10825 if (gsreqp->gsr_group.ss_family != AF_INET) { 10826 *outlenp = 0; 10827 return (ENOPROTOOPT); 10828 } 10829 sin = (struct sockaddr_in *)&gsreqp->gsr_group; 10830 grp = (ipaddr_t)sin->sin_addr.s_addr; 10831 sin = (struct sockaddr_in *)&gsreqp->gsr_source; 10832 src = (ipaddr_t)sin->sin_addr.s_addr; 10833 ifindex = gsreqp->gsr_interface; 10834 } else { 10835 imreqp = (struct ip_mreq_source *)i1; 10836 grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; 10837 src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; 10838 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; 10839 } 10840 10841 /* 10842 * In the multirouting case, we need to replicate 10843 * the request as noted in the mcast cases above. 
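 *
 * A similarly hedged userland sketch for the source-filtered joins
 * handled above (all addresses are made-up examples); with
 * IP_ADD_SOURCE_MEMBERSHIP the request reaches ip_opt_add_group()
 * with MODE_IS_INCLUDE, admitting traffic only from the given source:
 *
 *	struct ip_mreq_source imr;
 *	imr.imr_multiaddr.s_addr = inet_addr("232.1.1.1");
 *	imr.imr_sourceaddr.s_addr = inet_addr("192.168.1.10");
 *	imr.imr_interface.s_addr = htonl(INADDR_ANY);
 *	(void) setsockopt(fd, IPPROTO_IP, IP_ADD_SOURCE_MEMBERSHIP,
 *	    (char *)&imr, sizeof (imr));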
10844 */ 10845 ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, 10846 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10847 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10848 if (ire != NULL) { 10849 if (ire->ire_flags & RTF_MULTIRT) { 10850 error = ip_multirt_apply_membership( 10851 optfn, ire, connp, checkonly, grp, 10852 fmode, src, first_mp); 10853 done = B_TRUE; 10854 } 10855 ire_refrele(ire); 10856 } 10857 if (!done) { 10858 error = optfn(connp, checkonly, grp, ifaddr, 10859 &ifindex, fmode, src, first_mp); 10860 } 10861 if (error != 0) { 10862 /* 10863 * EINPROGRESS is a soft error, needs retry 10864 * so don't make *outlenp zero. 10865 */ 10866 if (error != EINPROGRESS) 10867 *outlenp = 0; 10868 return (error); 10869 } 10870 /* OK return - copy input buffer into output buffer */ 10871 if (invalp != outvalp) { 10872 bcopy(invalp, outvalp, inlen); 10873 } 10874 *outlenp = inlen; 10875 return (0); 10876 } 10877 case IP_SEC_OPT: 10878 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 10879 if (error != 0) { 10880 *outlenp = 0; 10881 return (error); 10882 } 10883 break; 10884 case IP_HDRINCL: 10885 case IP_OPTIONS: 10886 case T_IP_OPTIONS: 10887 case IP_TOS: 10888 case T_IP_TOS: 10889 case IP_TTL: 10890 case IP_RECVDSTADDR: 10891 case IP_RECVOPTS: 10892 /* OK return - copy input buffer into output buffer */ 10893 if (invalp != outvalp) { 10894 /* don't trust bcopy for identical src/dst */ 10895 bcopy(invalp, outvalp, inlen); 10896 } 10897 *outlenp = inlen; 10898 return (0); 10899 case IP_RECVIF: 10900 /* Retrieve the inbound interface index */ 10901 if (!checkonly) { 10902 mutex_enter(&connp->conn_lock); 10903 connp->conn_recvif = *i1 ? 1 : 0; 10904 mutex_exit(&connp->conn_lock); 10905 } 10906 break; /* goto sizeof (int) option return */ 10907 case IP_RECVPKTINFO: 10908 if (!checkonly) { 10909 mutex_enter(&connp->conn_lock); 10910 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 10911 mutex_exit(&connp->conn_lock); 10912 } 10913 break; /* goto sizeof (int) option return */ 10914 case IP_RECVSLLA: 10915 /* Retrieve the source link layer address */ 10916 if (!checkonly) { 10917 mutex_enter(&connp->conn_lock); 10918 connp->conn_recvslla = *i1 ? 1 : 0; 10919 mutex_exit(&connp->conn_lock); 10920 } 10921 break; /* goto sizeof (int) option return */ 10922 case MRT_INIT: 10923 case MRT_DONE: 10924 case MRT_ADD_VIF: 10925 case MRT_DEL_VIF: 10926 case MRT_ADD_MFC: 10927 case MRT_DEL_MFC: 10928 case MRT_ASSERT: 10929 if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { 10930 *outlenp = 0; 10931 return (error); 10932 } 10933 error = ip_mrouter_set((int)name, q, checkonly, 10934 (uchar_t *)invalp, inlen, first_mp); 10935 if (error) { 10936 *outlenp = 0; 10937 return (error); 10938 } 10939 /* OK return - copy input buffer into output buffer */ 10940 if (invalp != outvalp) { 10941 /* don't trust bcopy for identical src/dst */ 10942 bcopy(invalp, outvalp, inlen); 10943 } 10944 *outlenp = inlen; 10945 return (0); 10946 case IP_BOUND_IF: 10947 case IP_DHCPINIT_IF: 10948 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 10949 level, name, first_mp); 10950 if (error != 0) 10951 return (error); 10952 break; /* goto sizeof (int) option return */ 10953 10954 case IP_UNSPEC_SRC: 10955 /* Allow sending with a zero source address */ 10956 if (!checkonly) { 10957 mutex_enter(&connp->conn_lock); 10958 connp->conn_unspec_src = *i1 ? 
1 : 0; 10959 mutex_exit(&connp->conn_lock); 10960 } 10961 break; /* goto sizeof (int) option return */ 10962 default: 10963 /* 10964 * "soft" error (negative) 10965 * option not handled at this level 10966 * Note: Do not modify *outlenp 10967 */ 10968 return (-EINVAL); 10969 } 10970 break; 10971 case IPPROTO_IPV6: 10972 switch (name) { 10973 case IPV6_BOUND_IF: 10974 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 10975 level, name, first_mp); 10976 if (error != 0) 10977 return (error); 10978 break; /* goto sizeof (int) option return */ 10979 10980 case IPV6_MULTICAST_IF: 10981 /* 10982 * The only possible errors are EINPROGRESS and 10983 * EINVAL. EINPROGRESS will be restarted and is not 10984 * a hard error. We call this option on both V4 and V6 10985 * If both return EINVAL, then this call returns 10986 * EINVAL. If at least one of them succeeds we 10987 * return success. 10988 */ 10989 found = B_FALSE; 10990 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 10991 level, name, first_mp); 10992 if (error == EINPROGRESS) 10993 return (error); 10994 if (error == 0) 10995 found = B_TRUE; 10996 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 10997 IPPROTO_IP, IP_MULTICAST_IF, first_mp); 10998 if (error == 0) 10999 found = B_TRUE; 11000 if (!found) 11001 return (error); 11002 break; /* goto sizeof (int) option return */ 11003 11004 case IPV6_MULTICAST_HOPS: 11005 /* Recorded in transport above IP */ 11006 break; /* goto sizeof (int) option return */ 11007 case IPV6_MULTICAST_LOOP: 11008 if (!checkonly) { 11009 mutex_enter(&connp->conn_lock); 11010 connp->conn_multicast_loop = *i1; 11011 mutex_exit(&connp->conn_lock); 11012 } 11013 break; /* goto sizeof (int) option return */ 11014 case IPV6_JOIN_GROUP: 11015 case MCAST_JOIN_GROUP: 11016 case IPV6_LEAVE_GROUP: 11017 case MCAST_LEAVE_GROUP: { 11018 struct ipv6_mreq *ip_mreqp; 11019 struct group_req *greqp; 11020 ire_t *ire; 11021 boolean_t done = B_FALSE; 11022 in6_addr_t groupv6; 11023 uint32_t ifindex; 11024 boolean_t mcast_opt = B_TRUE; 11025 mcast_record_t fmode; 11026 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11027 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11028 11029 switch (name) { 11030 case IPV6_JOIN_GROUP: 11031 mcast_opt = B_FALSE; 11032 /* FALLTHRU */ 11033 case MCAST_JOIN_GROUP: 11034 fmode = MODE_IS_EXCLUDE; 11035 optfn = ip_opt_add_group_v6; 11036 break; 11037 11038 case IPV6_LEAVE_GROUP: 11039 mcast_opt = B_FALSE; 11040 /* FALLTHRU */ 11041 case MCAST_LEAVE_GROUP: 11042 fmode = MODE_IS_INCLUDE; 11043 optfn = ip_opt_delete_group_v6; 11044 break; 11045 } 11046 11047 if (mcast_opt) { 11048 struct sockaddr_in *sin; 11049 struct sockaddr_in6 *sin6; 11050 greqp = (struct group_req *)i1; 11051 if (greqp->gr_group.ss_family == AF_INET) { 11052 sin = (struct sockaddr_in *) 11053 &(greqp->gr_group); 11054 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 11055 &groupv6); 11056 } else { 11057 sin6 = (struct sockaddr_in6 *) 11058 &(greqp->gr_group); 11059 groupv6 = sin6->sin6_addr; 11060 } 11061 ifindex = greqp->gr_interface; 11062 } else { 11063 ip_mreqp = (struct ipv6_mreq *)i1; 11064 groupv6 = ip_mreqp->ipv6mr_multiaddr; 11065 ifindex = ip_mreqp->ipv6mr_interface; 11066 } 11067 /* 11068 * In the multirouting case, we need to replicate 11069 * the request on all interfaces that will take part 11070 * in replication. We do so because multirouting is 11071 * reflective, thus we will probably receive multi- 11072 * casts on those interfaces. 
11073 * The ip_multirt_apply_membership_v6() succeeds if 11074 * the operation succeeds on at least one interface. 11075 */ 11076 ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, 11077 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11078 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11079 if (ire != NULL) { 11080 if (ire->ire_flags & RTF_MULTIRT) { 11081 error = ip_multirt_apply_membership_v6( 11082 optfn, ire, connp, checkonly, 11083 &groupv6, fmode, &ipv6_all_zeros, 11084 first_mp); 11085 done = B_TRUE; 11086 } 11087 ire_refrele(ire); 11088 } 11089 if (!done) { 11090 error = optfn(connp, checkonly, &groupv6, 11091 ifindex, fmode, &ipv6_all_zeros, first_mp); 11092 } 11093 if (error) { 11094 /* 11095 * EINPROGRESS is a soft error, needs retry 11096 * so don't make *outlenp zero. 11097 */ 11098 if (error != EINPROGRESS) 11099 *outlenp = 0; 11100 return (error); 11101 } 11102 /* OK return - copy input buffer into output buffer */ 11103 if (invalp != outvalp) { 11104 /* don't trust bcopy for identical src/dst */ 11105 bcopy(invalp, outvalp, inlen); 11106 } 11107 *outlenp = inlen; 11108 return (0); 11109 } 11110 case MCAST_BLOCK_SOURCE: 11111 case MCAST_UNBLOCK_SOURCE: 11112 case MCAST_JOIN_SOURCE_GROUP: 11113 case MCAST_LEAVE_SOURCE_GROUP: { 11114 struct group_source_req *gsreqp; 11115 in6_addr_t v6grp, v6src; 11116 uint32_t ifindex; 11117 mcast_record_t fmode; 11118 ire_t *ire; 11119 boolean_t done = B_FALSE; 11120 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11121 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11122 11123 switch (name) { 11124 case MCAST_BLOCK_SOURCE: 11125 fmode = MODE_IS_EXCLUDE; 11126 optfn = ip_opt_add_group_v6; 11127 break; 11128 case MCAST_UNBLOCK_SOURCE: 11129 fmode = MODE_IS_EXCLUDE; 11130 optfn = ip_opt_delete_group_v6; 11131 break; 11132 case MCAST_JOIN_SOURCE_GROUP: 11133 fmode = MODE_IS_INCLUDE; 11134 optfn = ip_opt_add_group_v6; 11135 break; 11136 case MCAST_LEAVE_SOURCE_GROUP: 11137 fmode = MODE_IS_INCLUDE; 11138 optfn = ip_opt_delete_group_v6; 11139 break; 11140 } 11141 11142 gsreqp = (struct group_source_req *)i1; 11143 ifindex = gsreqp->gsr_interface; 11144 if (gsreqp->gsr_group.ss_family == AF_INET) { 11145 struct sockaddr_in *s; 11146 s = (struct sockaddr_in *)&gsreqp->gsr_group; 11147 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); 11148 s = (struct sockaddr_in *)&gsreqp->gsr_source; 11149 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); 11150 } else { 11151 struct sockaddr_in6 *s6; 11152 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; 11153 v6grp = s6->sin6_addr; 11154 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; 11155 v6src = s6->sin6_addr; 11156 } 11157 11158 /* 11159 * In the multirouting case, we need to replicate 11160 * the request as noted in the mcast cases above. 11161 */ 11162 ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, 11163 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11164 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11165 if (ire != NULL) { 11166 if (ire->ire_flags & RTF_MULTIRT) { 11167 error = ip_multirt_apply_membership_v6( 11168 optfn, ire, connp, checkonly, 11169 &v6grp, fmode, &v6src, first_mp); 11170 done = B_TRUE; 11171 } 11172 ire_refrele(ire); 11173 } 11174 if (!done) { 11175 error = optfn(connp, checkonly, &v6grp, 11176 ifindex, fmode, &v6src, first_mp); 11177 } 11178 if (error != 0) { 11179 /* 11180 * EINPROGRESS is a soft error, needs retry 11181 * so don't make *outlenp zero. 
11182 */ 11183 if (error != EINPROGRESS) 11184 *outlenp = 0; 11185 return (error); 11186 } 11187 /* OK return - copy input buffer into output buffer */ 11188 if (invalp != outvalp) { 11189 bcopy(invalp, outvalp, inlen); 11190 } 11191 *outlenp = inlen; 11192 return (0); 11193 } 11194 case IPV6_UNICAST_HOPS: 11195 /* Recorded in transport above IP */ 11196 break; /* goto sizeof (int) option return */ 11197 case IPV6_UNSPEC_SRC: 11198 /* Allow sending with a zero source address */ 11199 if (!checkonly) { 11200 mutex_enter(&connp->conn_lock); 11201 connp->conn_unspec_src = *i1 ? 1 : 0; 11202 mutex_exit(&connp->conn_lock); 11203 } 11204 break; /* goto sizeof (int) option return */ 11205 case IPV6_RECVPKTINFO: 11206 if (!checkonly) { 11207 mutex_enter(&connp->conn_lock); 11208 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 11209 mutex_exit(&connp->conn_lock); 11210 } 11211 break; /* goto sizeof (int) option return */ 11212 case IPV6_RECVTCLASS: 11213 if (!checkonly) { 11214 if (*i1 < 0 || *i1 > 1) { 11215 return (EINVAL); 11216 } 11217 mutex_enter(&connp->conn_lock); 11218 connp->conn_ipv6_recvtclass = *i1; 11219 mutex_exit(&connp->conn_lock); 11220 } 11221 break; 11222 case IPV6_RECVPATHMTU: 11223 if (!checkonly) { 11224 if (*i1 < 0 || *i1 > 1) { 11225 return (EINVAL); 11226 } 11227 mutex_enter(&connp->conn_lock); 11228 connp->conn_ipv6_recvpathmtu = *i1; 11229 mutex_exit(&connp->conn_lock); 11230 } 11231 break; 11232 case IPV6_RECVHOPLIMIT: 11233 if (!checkonly) { 11234 mutex_enter(&connp->conn_lock); 11235 connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; 11236 mutex_exit(&connp->conn_lock); 11237 } 11238 break; /* goto sizeof (int) option return */ 11239 case IPV6_RECVHOPOPTS: 11240 if (!checkonly) { 11241 mutex_enter(&connp->conn_lock); 11242 connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; 11243 mutex_exit(&connp->conn_lock); 11244 } 11245 break; /* goto sizeof (int) option return */ 11246 case IPV6_RECVDSTOPTS: 11247 if (!checkonly) { 11248 mutex_enter(&connp->conn_lock); 11249 connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; 11250 mutex_exit(&connp->conn_lock); 11251 } 11252 break; /* goto sizeof (int) option return */ 11253 case IPV6_RECVRTHDR: 11254 if (!checkonly) { 11255 mutex_enter(&connp->conn_lock); 11256 connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; 11257 mutex_exit(&connp->conn_lock); 11258 } 11259 break; /* goto sizeof (int) option return */ 11260 case IPV6_RECVRTHDRDSTOPTS: 11261 if (!checkonly) { 11262 mutex_enter(&connp->conn_lock); 11263 connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; 11264 mutex_exit(&connp->conn_lock); 11265 } 11266 break; /* goto sizeof (int) option return */ 11267 case IPV6_PKTINFO: 11268 if (inlen == 0) 11269 return (-EINVAL); /* clearing option */ 11270 error = ip6_set_pktinfo(cr, connp, 11271 (struct in6_pktinfo *)invalp); 11272 if (error != 0) 11273 *outlenp = 0; 11274 else 11275 *outlenp = inlen; 11276 return (error); 11277 case IPV6_NEXTHOP: { 11278 struct sockaddr_in6 *sin6; 11279 11280 /* Verify that the nexthop is reachable */ 11281 if (inlen == 0) 11282 return (-EINVAL); /* clearing option */ 11283 11284 sin6 = (struct sockaddr_in6 *)invalp; 11285 ire = ire_route_lookup_v6(&sin6->sin6_addr, 11286 0, 0, 0, NULL, NULL, connp->conn_zoneid, 11287 NULL, MATCH_IRE_DEFAULT, ipst); 11288 11289 if (ire == NULL) { 11290 *outlenp = 0; 11291 return (EHOSTUNREACH); 11292 } 11293 ire_refrele(ire); 11294 return (-EINVAL); 11295 } 11296 case IPV6_SEC_OPT: 11297 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11298 if (error != 0) { 11299 *outlenp = 0; 11300 return (error); 11301 } 11302 break; 11303 case IPV6_SRC_PREFERENCES: { 11304 /* 11305 * This is implemented strictly in the ip module 11306 * (here and in tcp_opt_*() to accommodate tcp 11307 * sockets). Modules above ip pass this option 11308 * down here since ip is the only one that needs to 11309 * be aware of source address preferences. 11310 * 11311 * This socket option only affects connected 11312 * sockets that haven't already bound to a specific 11313 * IPv6 address. In other words, sockets that 11314 * don't call bind() with an address other than the 11315 * unspecified address and that call connect(). 11316 * ip_bind_connected_v6() passes these preferences 11317 * to the ipif_select_source_v6() function. 11318 */ 11319 if (inlen != sizeof (uint32_t)) 11320 return (EINVAL); 11321 error = ip6_set_src_preferences(connp, 11322 *(uint32_t *)invalp); 11323 if (error != 0) { 11324 *outlenp = 0; 11325 return (error); 11326 } else { 11327 *outlenp = sizeof (uint32_t); 11328 } 11329 break; 11330 } 11331 case IPV6_V6ONLY: 11332 if (*i1 < 0 || *i1 > 1) { 11333 return (EINVAL); 11334 } 11335 mutex_enter(&connp->conn_lock); 11336 connp->conn_ipv6_v6only = *i1; 11337 mutex_exit(&connp->conn_lock); 11338 break; 11339 default: 11340 return (-EINVAL); 11341 } 11342 break; 11343 default: 11344 /* 11345 * "soft" error (negative) 11346 * option not handled at this level 11347 * Note: Do not modify *outlenp 11348 */ 11349 return (-EINVAL); 11350 } 11351 /* 11352 * Common case of return from an option that is sizeof (int) 11353 */ 11354 *(int *)outvalp = *i1; 11355 *outlenp = sizeof (int); 11356 return (0); 11357 } 11358 11359 /* 11360 * This routine gets default values of certain options whose default 11361 * values are maintained by protocol-specific code 11362 */ 11363 /* ARGSUSED */ 11364 int 11365 ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 11366 { 11367 int *i1 = (int *)ptr; 11368 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11369 11370 switch (level) { 11371 case IPPROTO_IP: 11372 switch (name) { 11373 case IP_MULTICAST_TTL: 11374 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 11375 return (sizeof (uchar_t)); 11376 case IP_MULTICAST_LOOP: 11377 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 11378 return (sizeof (uchar_t)); 11379 default: 11380 return (-1); 11381 } 11382 case IPPROTO_IPV6: 11383 switch (name) { 11384 case IPV6_UNICAST_HOPS: 11385 *i1 = ipst->ips_ipv6_def_hops; 11386 return (sizeof (int)); 11387 case IPV6_MULTICAST_HOPS: 11388 *i1 = IP_DEFAULT_MULTICAST_TTL; 11389 return
(sizeof (int)); 11390 case IPV6_MULTICAST_LOOP: 11391 *i1 = IP_DEFAULT_MULTICAST_LOOP; 11392 return (sizeof (int)); 11393 case IPV6_V6ONLY: 11394 *i1 = 1; 11395 return (sizeof (int)); 11396 default: 11397 return (-1); 11398 } 11399 default: 11400 return (-1); 11401 } 11402 /* NOTREACHED */ 11403 } 11404 11405 /* 11406 * Given a destination address and a pointer to where to put the information 11407 * this routine fills in the mtuinfo. 11408 */ 11409 int 11410 ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, 11411 struct ip6_mtuinfo *mtuinfo, netstack_t *ns) 11412 { 11413 ire_t *ire; 11414 ip_stack_t *ipst = ns->netstack_ip; 11415 11416 if (IN6_IS_ADDR_UNSPECIFIED(in6)) 11417 return (-1); 11418 11419 bzero(mtuinfo, sizeof (*mtuinfo)); 11420 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 11421 mtuinfo->ip6m_addr.sin6_port = port; 11422 mtuinfo->ip6m_addr.sin6_addr = *in6; 11423 11424 ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); 11425 if (ire != NULL) { 11426 mtuinfo->ip6m_mtu = ire->ire_max_frag; 11427 ire_refrele(ire); 11428 } else { 11429 mtuinfo->ip6m_mtu = IPV6_MIN_MTU; 11430 } 11431 return (sizeof (struct ip6_mtuinfo)); 11432 } 11433 11434 /* 11435 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error 11436 * checking of cred and that ip_g_mrouter is set should be done and 11437 * isn't. This doesn't matter as the error checking is done properly for the 11438 * other MRT options coming in through ip_opt_set. 11439 */ 11440 int 11441 ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 11442 { 11443 conn_t *connp = Q_TO_CONN(q); 11444 ipsec_req_t *req = (ipsec_req_t *)ptr; 11445 11446 switch (level) { 11447 case IPPROTO_IP: 11448 switch (name) { 11449 case MRT_VERSION: 11450 case MRT_ASSERT: 11451 (void) ip_mrouter_get(name, q, ptr); 11452 return (sizeof (int)); 11453 case IP_SEC_OPT: 11454 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); 11455 case IP_NEXTHOP: 11456 if (connp->conn_nexthop_set) { 11457 *(ipaddr_t *)ptr = connp->conn_nexthop_v4; 11458 return (sizeof (ipaddr_t)); 11459 } else 11460 return (0); 11461 case IP_RECVPKTINFO: 11462 *(int *)ptr = connp->conn_ip_recvpktinfo ? 1: 0; 11463 return (sizeof (int)); 11464 default: 11465 break; 11466 } 11467 break; 11468 case IPPROTO_IPV6: 11469 switch (name) { 11470 case IPV6_SEC_OPT: 11471 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); 11472 case IPV6_SRC_PREFERENCES: { 11473 return (ip6_get_src_preferences(connp, 11474 (uint32_t *)ptr)); 11475 } 11476 case IPV6_V6ONLY: 11477 *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; 11478 return (sizeof (int)); 11479 case IPV6_PATHMTU: 11480 return (ip_fill_mtuinfo(&connp->conn_remv6, 0, 11481 (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); 11482 default: 11483 break; 11484 } 11485 break; 11486 default: 11487 break; 11488 } 11489 return (-1); 11490 } 11491 /* Named Dispatch routine to get a current value out of our parameter table. */ 11492 /* ARGSUSED */ 11493 static int 11494 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11495 { 11496 ipparam_t *ippa = (ipparam_t *)cp; 11497 11498 (void) mi_mpprintf(mp, "%d", ippa->ip_param_value); 11499 return (0); 11500 } 11501 11502 /* ARGSUSED */ 11503 static int 11504 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11505 { 11506 11507 (void) mi_mpprintf(mp, "%d", *(int *)cp); 11508 return (0); 11509 } 11510 11511 /* 11512 * Set ip{,6}_forwarding values. This means walking through all of the 11513 * ill's and toggling their forwarding values. 
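 *
 * For context, an illustrative usage note (not code): parameters
 * registered through the Named Dispatch tables are typically
 * exercised from userland with ndd(1M), e.g.
 *
 *	ndd -get /dev/ip ip_forwarding
 *	ndd -set /dev/ip ip_forwarding 1
 *
 * the set path arriving in the function below through the setf
 * routine registered via nd_load().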
11514 */ 11515 /* ARGSUSED */ 11516 static int 11517 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11518 { 11519 long new_value; 11520 int *forwarding_value = (int *)cp; 11521 ill_t *ill; 11522 boolean_t isv6; 11523 ill_walk_context_t ctx; 11524 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11525 11526 isv6 = (forwarding_value == &ipst->ips_ipv6_forward); 11527 11528 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11529 new_value < 0 || new_value > 1) { 11530 return (EINVAL); 11531 } 11532 11533 *forwarding_value = new_value; 11534 11535 /* 11536 * Regardless of the current value of ip_forwarding, set all per-ill 11537 * values of ip_forwarding to the value being set. 11538 * 11539 * Bring all the ill's up to date with the new global value. 11540 */ 11541 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11542 11543 if (isv6) 11544 ill = ILL_START_WALK_V6(&ctx, ipst); 11545 else 11546 ill = ILL_START_WALK_V4(&ctx, ipst); 11547 11548 for (; ill != NULL; ill = ill_next(&ctx, ill)) 11549 (void) ill_forward_set(ill, new_value != 0); 11550 11551 rw_exit(&ipst->ips_ill_g_lock); 11552 return (0); 11553 } 11554 11555 /* 11556 * Walk through the specified param array, registering each element with the 11557 * Named Dispatch handler. This is called only during init, so it is OK 11558 * not to acquire any locks. 11559 */ 11560 static boolean_t 11561 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt, 11562 ipndp_t *ipnd, size_t ipnd_cnt) 11563 { 11564 for (; ippa_cnt-- > 0; ippa++) { 11565 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 11566 if (!nd_load(ndp, ippa->ip_param_name, 11567 ip_param_get, ip_param_set, (caddr_t)ippa)) { 11568 nd_free(ndp); 11569 return (B_FALSE); 11570 } 11571 } 11572 } 11573 11574 for (; ipnd_cnt-- > 0; ipnd++) { 11575 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 11576 if (!nd_load(ndp, ipnd->ip_ndp_name, 11577 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 11578 ipnd->ip_ndp_data)) { 11579 nd_free(ndp); 11580 return (B_FALSE); 11581 } 11582 } 11583 } 11584 11585 return (B_TRUE); 11586 } 11587 11588 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 11589 /* ARGSUSED */ 11590 static int 11591 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11592 { 11593 long new_value; 11594 ipparam_t *ippa = (ipparam_t *)cp; 11595 11596 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11597 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 11598 return (EINVAL); 11599 } 11600 ippa->ip_param_value = new_value; 11601 return (0); 11602 } 11603 11604 /* 11605 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases. 11606 * When an ipf is passed here for the first time, if 11607 * we already have in-order fragments on the queue, we convert from the fast- 11608 * path reassembly scheme to the hard-case scheme. From then on, additional 11609 * fragments are reassembled here. We keep track of the start and end offsets 11610 * of each piece, and the number of holes in the chain. When the hole count 11611 * goes to zero, we are done! 11612 * 11613 * The ipf_count will be updated to account for any mblk(s) added (pointed to 11614 * by mp) or subtracted (freeb()ed dups); upon return the caller must update 11615 * ipfb_count and ill_frag_count by the difference of ipf_count before and 11616 * after the call to ip_reassemble().
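 *
 * To make the hole accounting concrete, here is a stand-alone,
 * illustrative sketch (not used by this file) that derives the hole
 * count from a sorted list of (start, end) fragment intervals; the
 * code below maintains the same quantity incrementally in
 * ipf_hole_cnt:
 *
 *	typedef struct frag {
 *		uint_t start, end;
 *		struct frag *next;
 *	} frag_t;
 *
 *	static uint_t
 *	count_holes(frag_t *list, boolean_t saw_last, uint_t total_len)
 *	{
 *		uint_t holes = 0, expected = 0;
 *		frag_t *f;
 *
 *		for (f = list; f != NULL; f = f->next) {
 *			if (f->start > expected)
 *				holes++;	(gap before this piece)
 *			expected = f->end;
 *		}
 *		if (!saw_last || expected < total_len)
 *			holes++;		(tail still missing)
 *		return (holes);
 *	}
 *
 * Reassembly completes exactly when this count reaches zero.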
11617 */ 11618 int 11619 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill, 11620 size_t msg_len) 11621 { 11622 uint_t end; 11623 mblk_t *next_mp; 11624 mblk_t *mp1; 11625 uint_t offset; 11626 boolean_t incr_dups = B_TRUE; 11627 boolean_t offset_zero_seen = B_FALSE; 11628 boolean_t pkt_boundary_checked = B_FALSE; 11629 11630 /* If start == 0 then ipf_nf_hdr_len has to be set. */ 11631 ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0); 11632 11633 /* Add in byte count */ 11634 ipf->ipf_count += msg_len; 11635 if (ipf->ipf_end) { 11636 /* 11637 * We were part way through in-order reassembly, but now there 11638 * is a hole. We walk through messages already queued, and 11639 * mark them for hard case reassembly. We know that up till 11640 * now they were in order starting from offset zero. 11641 */ 11642 offset = 0; 11643 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11644 IP_REASS_SET_START(mp1, offset); 11645 if (offset == 0) { 11646 ASSERT(ipf->ipf_nf_hdr_len != 0); 11647 offset = -ipf->ipf_nf_hdr_len; 11648 } 11649 offset += mp1->b_wptr - mp1->b_rptr; 11650 IP_REASS_SET_END(mp1, offset); 11651 } 11652 /* One hole at the end. */ 11653 ipf->ipf_hole_cnt = 1; 11654 /* Brand it as a hard case, forever. */ 11655 ipf->ipf_end = 0; 11656 } 11657 /* Walk through all the new pieces. */ 11658 do { 11659 end = start + (mp->b_wptr - mp->b_rptr); 11660 /* 11661 * If start is 0, decrease 'end' only for the first mblk of 11662 * the fragment. Otherwise 'end' can get a wrong value in the 11663 * second pass of the loop if the first mblk is exactly the 11664 * size of ipf_nf_hdr_len. 11665 */ 11666 if (start == 0 && !offset_zero_seen) { 11667 /* First segment */ 11668 ASSERT(ipf->ipf_nf_hdr_len != 0); 11669 end -= ipf->ipf_nf_hdr_len; 11670 offset_zero_seen = B_TRUE; 11671 } 11672 next_mp = mp->b_cont; 11673 /* 11674 * We are checking to see if there is any interesting data 11675 * to process. If there isn't and the mblk isn't the 11676 * one which carries the unfragmentable header then we 11677 * drop it. It's possible to have just the unfragmentable 11678 * header come through without any data. That needs to be 11679 * saved. 11680 * 11681 * If the assert at the top of this function holds then the 11682 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code 11683 * is infrequently traveled enough that the test is left in 11684 * to protect against future code changes which break that 11685 * invariant. 11686 */ 11687 if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) { 11688 /* Empty. Blast it. */ 11689 IP_REASS_SET_START(mp, 0); 11690 IP_REASS_SET_END(mp, 0); 11691 /* 11692 * If the ipf points to the mblk we are about to free, 11693 * update ipf to point to the next mblk (or NULL 11694 * if none). 11695 */ 11696 if (ipf->ipf_mp->b_cont == mp) 11697 ipf->ipf_mp->b_cont = next_mp; 11698 freeb(mp); 11699 continue; 11700 } 11701 mp->b_cont = NULL; 11702 IP_REASS_SET_START(mp, start); 11703 IP_REASS_SET_END(mp, end); 11704 if (!ipf->ipf_tail_mp) { 11705 ipf->ipf_tail_mp = mp; 11706 ipf->ipf_mp->b_cont = mp; 11707 if (start == 0 || !more) { 11708 ipf->ipf_hole_cnt = 1; 11709 /* 11710 * If the first fragment comes in more than one 11711 * mblk, this loop will be executed for each 11712 * mblk. Need to adjust hole count so exiting 11713 * this routine will leave hole count at 1.
11714 */ 11715 if (next_mp) 11716 ipf->ipf_hole_cnt++; 11717 } else 11718 ipf->ipf_hole_cnt = 2; 11719 continue; 11720 } else if (ipf->ipf_last_frag_seen && !more && 11721 !pkt_boundary_checked) { 11722 /* 11723 * We check datagram boundary only if this fragment 11724 * claims to be the last fragment and we have seen a 11725 * last fragment in the past too. We do this only 11726 * once for a given fragment. 11727 * 11728 * start cannot be 0 here as fragments with start=0 11729 * and MF=0 get handled as a complete packet. These 11730 * fragments should not reach here. 11731 */ 11732 11733 if (start + msgdsize(mp) != 11734 IP_REASS_END(ipf->ipf_tail_mp)) { 11735 /* 11736 * We have two fragments both of which claim 11737 * to be the last fragment but give conflicting 11738 * information about the whole datagram size. 11739 * Something fishy is going on. Drop the 11740 * fragment and free up the reassembly list. 11741 */ 11742 return (IP_REASS_FAILED); 11743 } 11744 11745 /* 11746 * We shouldn't come to this code block again for this 11747 * particular fragment. 11748 */ 11749 pkt_boundary_checked = B_TRUE; 11750 } 11751 11752 /* New stuff at or beyond tail? */ 11753 offset = IP_REASS_END(ipf->ipf_tail_mp); 11754 if (start >= offset) { 11755 if (ipf->ipf_last_frag_seen) { 11756 /* current fragment is beyond last fragment */ 11757 return (IP_REASS_FAILED); 11758 } 11759 /* Link it on end. */ 11760 ipf->ipf_tail_mp->b_cont = mp; 11761 ipf->ipf_tail_mp = mp; 11762 if (more) { 11763 if (start != offset) 11764 ipf->ipf_hole_cnt++; 11765 } else if (start == offset && next_mp == NULL) 11766 ipf->ipf_hole_cnt--; 11767 continue; 11768 } 11769 mp1 = ipf->ipf_mp->b_cont; 11770 offset = IP_REASS_START(mp1); 11771 /* New stuff at the front? */ 11772 if (start < offset) { 11773 if (start == 0) { 11774 if (end >= offset) { 11775 /* Nailed the hole at the beginning. */ 11776 ipf->ipf_hole_cnt--; 11777 } 11778 } else if (end < offset) { 11779 /* 11780 * A hole, stuff, and a hole where there used 11781 * to be just a hole. 11782 */ 11783 ipf->ipf_hole_cnt++; 11784 } 11785 mp->b_cont = mp1; 11786 /* Check for overlap. */ 11787 while (end > offset) { 11788 if (end < IP_REASS_END(mp1)) { 11789 mp->b_wptr -= end - offset; 11790 IP_REASS_SET_END(mp, offset); 11791 BUMP_MIB(ill->ill_ip_mib, 11792 ipIfStatsReasmPartDups); 11793 break; 11794 } 11795 /* Did we cover another hole? */ 11796 if ((mp1->b_cont && 11797 IP_REASS_END(mp1) != 11798 IP_REASS_START(mp1->b_cont) && 11799 end >= IP_REASS_START(mp1->b_cont)) || 11800 (!ipf->ipf_last_frag_seen && !more)) { 11801 ipf->ipf_hole_cnt--; 11802 } 11803 /* Clip out mp1. */ 11804 if ((mp->b_cont = mp1->b_cont) == NULL) { 11805 /* 11806 * After clipping out mp1, this guy 11807 * is now hanging off the end. 11808 */ 11809 ipf->ipf_tail_mp = mp; 11810 } 11811 IP_REASS_SET_START(mp1, 0); 11812 IP_REASS_SET_END(mp1, 0); 11813 /* Subtract byte count */ 11814 ipf->ipf_count -= mp1->b_datap->db_lim - 11815 mp1->b_datap->db_base; 11816 freeb(mp1); 11817 BUMP_MIB(ill->ill_ip_mib, 11818 ipIfStatsReasmPartDups); 11819 mp1 = mp->b_cont; 11820 if (!mp1) 11821 break; 11822 offset = IP_REASS_START(mp1); 11823 } 11824 ipf->ipf_mp->b_cont = mp; 11825 continue; 11826 } 11827 /* 11828 * The new piece starts somewhere after the start of the head 11829 * and before the end of the tail. 11830 */ 11831 for (; mp1; mp1 = mp1->b_cont) { 11832 offset = IP_REASS_END(mp1); 11833 if (start < offset) { 11834 if (end <= offset) { 11835 /* Nothing new.
*/ 11836 IP_REASS_SET_START(mp, 0); 11837 IP_REASS_SET_END(mp, 0); 11838 /* Subtract byte count */ 11839 ipf->ipf_count -= mp->b_datap->db_lim - 11840 mp->b_datap->db_base; 11841 if (incr_dups) { 11842 ipf->ipf_num_dups++; 11843 incr_dups = B_FALSE; 11844 } 11845 freeb(mp); 11846 BUMP_MIB(ill->ill_ip_mib, 11847 ipIfStatsReasmDuplicates); 11848 break; 11849 } 11850 /* 11851 * Trim redundant stuff off beginning of new 11852 * piece. 11853 */ 11854 IP_REASS_SET_START(mp, offset); 11855 mp->b_rptr += offset - start; 11856 BUMP_MIB(ill->ill_ip_mib, 11857 ipIfStatsReasmPartDups); 11858 start = offset; 11859 if (!mp1->b_cont) { 11860 /* 11861 * After trimming, this guy is now 11862 * hanging off the end. 11863 */ 11864 mp1->b_cont = mp; 11865 ipf->ipf_tail_mp = mp; 11866 if (!more) { 11867 ipf->ipf_hole_cnt--; 11868 } 11869 break; 11870 } 11871 } 11872 if (start >= IP_REASS_START(mp1->b_cont)) 11873 continue; 11874 /* Fill a hole */ 11875 if (start > offset) 11876 ipf->ipf_hole_cnt++; 11877 mp->b_cont = mp1->b_cont; 11878 mp1->b_cont = mp; 11879 mp1 = mp->b_cont; 11880 offset = IP_REASS_START(mp1); 11881 if (end >= offset) { 11882 ipf->ipf_hole_cnt--; 11883 /* Check for overlap. */ 11884 while (end > offset) { 11885 if (end < IP_REASS_END(mp1)) { 11886 mp->b_wptr -= end - offset; 11887 IP_REASS_SET_END(mp, offset); 11888 /* 11889 * TODO we might bump 11890 * this up twice if there is 11891 * overlap at both ends. 11892 */ 11893 BUMP_MIB(ill->ill_ip_mib, 11894 ipIfStatsReasmPartDups); 11895 break; 11896 } 11897 /* Did we cover another hole? */ 11898 if ((mp1->b_cont && 11899 IP_REASS_END(mp1) 11900 != IP_REASS_START(mp1->b_cont) && 11901 end >= 11902 IP_REASS_START(mp1->b_cont)) || 11903 (!ipf->ipf_last_frag_seen && 11904 !more)) { 11905 ipf->ipf_hole_cnt--; 11906 } 11907 /* Clip out mp1. */ 11908 if ((mp->b_cont = mp1->b_cont) == 11909 NULL) { 11910 /* 11911 * After clipping out mp1, 11912 * this guy is now hanging 11913 * off the end. 11914 */ 11915 ipf->ipf_tail_mp = mp; 11916 } 11917 IP_REASS_SET_START(mp1, 0); 11918 IP_REASS_SET_END(mp1, 0); 11919 /* Subtract byte count */ 11920 ipf->ipf_count -= 11921 mp1->b_datap->db_lim - 11922 mp1->b_datap->db_base; 11923 freeb(mp1); 11924 BUMP_MIB(ill->ill_ip_mib, 11925 ipIfStatsReasmPartDups); 11926 mp1 = mp->b_cont; 11927 if (!mp1) 11928 break; 11929 offset = IP_REASS_START(mp1); 11930 } 11931 } 11932 break; 11933 } 11934 } while (start = end, mp = next_mp); 11935 11936 /* Fragment just processed could be the last one. Remember this fact */ 11937 if (!more) 11938 ipf->ipf_last_frag_seen = B_TRUE; 11939 11940 /* Still got holes? */ 11941 if (ipf->ipf_hole_cnt) 11942 return (IP_REASS_PARTIAL); 11943 /* Clean up overloaded fields to avoid upstream disasters. */ 11944 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11945 IP_REASS_SET_START(mp1, 0); 11946 IP_REASS_SET_END(mp1, 0); 11947 } 11948 return (IP_REASS_COMPLETE); 11949 } 11950 11951 /* 11952 * ipsec processing for the fast path, used for input UDP Packets 11953 * Returns true if ready for passup to UDP. 11954 * Return false if packet is not passable to UDP (e.g. it failed IPsec policy, 11955 * was an ESP-in-UDP packet, etc.). 
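 *
 * Background for the udp_nat_t_endpoint handling in the body below,
 * per RFC 3948 (UDP encapsulation of ESP): the four bytes following
 * the UDP header are either all-zero (a "non-ESP marker", meaning the
 * datagram is IKE) or a non-zero ESP SPI. A hedged pseudo-C sketch of
 * that classification, with made-up names:
 *
 *	uint32_t spi;
 *	bcopy(udp_payload, &spi, sizeof (spi));
 *	if (spi == 0)
 *		strip the marker and pass the datagram up to IKE;
 *	else
 *		treat the payload as ESP and hand it to IPsec;
 *
 * which is roughly the decision zero_spi_check() makes for such an
 * endpoint.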
11956 */ 11957 static boolean_t 11958 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 11959 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) 11960 { 11961 uint32_t ill_index; 11962 uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ 11963 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 11964 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 11965 udp_t *udp = connp->conn_udp; 11966 11967 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 11968 /* The ill_index of the incoming ILL */ 11969 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 11970 11971 /* pass packet up to the transport */ 11972 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 11973 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 11974 NULL, mctl_present); 11975 if (*first_mpp == NULL) { 11976 return (B_FALSE); 11977 } 11978 } 11979 11980 /* Initiate IPPF processing for fastpath UDP */ 11981 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 11982 ip_process(IPP_LOCAL_IN, mpp, ill_index); 11983 if (*mpp == NULL) { 11984 ip2dbg(("ip_input_ipsec_process: UDP pkt " 11985 "deferred/dropped during IPPF processing\n")); 11986 return (B_FALSE); 11987 } 11988 } 11989 /* 11990 * Remove 0-spi if it's 0, or move everything behind 11991 * the UDP header over it and forward to ESP via 11992 * ip_proto_input(). 11993 */ 11994 if (udp->udp_nat_t_endpoint) { 11995 if (mctl_present) { 11996 /* mctl_present *shouldn't* happen. */ 11997 ip_drop_packet(*first_mpp, B_TRUE, NULL, 11998 NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), 11999 &ipss->ipsec_dropper); 12000 *first_mpp = NULL; 12001 return (B_FALSE); 12002 } 12003 12004 /* "ill" is "recv_ill" in actuality. */ 12005 if (!zero_spi_check(q, *mpp, ire, ill, ipss)) 12006 return (B_FALSE); 12007 12008 /* Else continue like a normal UDP packet. */ 12009 } 12010 12011 /* 12012 * We make the checks as below since we are in the fast path 12013 * and want to minimize the number of checks if the IP_RECVIF and/or 12014 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set 12015 */ 12016 if (connp->conn_recvif || connp->conn_recvslla || 12017 connp->conn_ip_recvpktinfo) { 12018 if (connp->conn_recvif) { 12019 in_flags = IPF_RECVIF; 12020 } 12021 /* 12022 * UDP supports IP_RECVPKTINFO option for both v4 and v6 12023 * so the flag passed to ip_add_info is based on IP version 12024 * of connp. 12025 */ 12026 if (connp->conn_ip_recvpktinfo) { 12027 if (connp->conn_af_isv6) { 12028 /* 12029 * V6 only needs index 12030 */ 12031 in_flags |= IPF_RECVIF; 12032 } else { 12033 /* 12034 * V4 needs index + matching address. 12035 */ 12036 in_flags |= IPF_RECVADDR; 12037 } 12038 } 12039 if (connp->conn_recvslla) { 12040 in_flags |= IPF_RECVSLLA; 12041 } 12042 /* 12043 * since in_flags are being set ill will be 12044 * referenced in ip_add_info, so it better not 12045 * be NULL. 12046 */ 12047 /* 12048 * the actual data will be contained in b_cont 12049 * upon successful return of the following call. 12050 * If the call fails then the original mblk is 12051 * returned. 12052 */ 12053 *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), 12054 ipst); 12055 } 12056 12057 return (B_TRUE); 12058 } 12059 12060 /* 12061 * Fragmentation reassembly. Each ILL has a hash table for 12062 * queuing packets undergoing reassembly for all IPIFs 12063 * associated with the ILL. The hash is based on the packet 12064 * IP ident field. The ILL frag hash table was allocated 12065 * as a timer block at the time the ILL was created. 
Whenever 12066 * there is anything on the reassembly queue, the timer will 12067 * be running. Returns B_TRUE if successful, else B_FALSE; 12068 * frees mp on failure. 12069 */ 12070 static boolean_t 12071 ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, 12072 uint32_t *cksum_val, uint16_t *cksum_flags) 12073 { 12074 uint32_t frag_offset_flags; 12075 mblk_t *mp = *mpp; 12076 mblk_t *t_mp; 12077 ipaddr_t dst; 12078 uint8_t proto = ipha->ipha_protocol; 12079 uint32_t sum_val; 12080 uint16_t sum_flags; 12081 ipf_t *ipf; 12082 ipf_t **ipfp; 12083 ipfb_t *ipfb; 12084 uint16_t ident; 12085 uint32_t offset; 12086 ipaddr_t src; 12087 uint_t hdr_length; 12088 uint32_t end; 12089 mblk_t *mp1; 12090 mblk_t *tail_mp; 12091 size_t count; 12092 size_t msg_len; 12093 uint8_t ecn_info = 0; 12094 uint32_t packet_size; 12095 boolean_t pruned = B_FALSE; 12096 ip_stack_t *ipst = ill->ill_ipst; 12097 12098 if (cksum_val != NULL) 12099 *cksum_val = 0; 12100 if (cksum_flags != NULL) 12101 *cksum_flags = 0; 12102 12103 /* 12104 * Drop the fragmented packet as early as possible if 12105 * we don't have the resources to reassemble it. 12106 */ 12107 if (ipst->ips_ip_reass_queue_bytes == 0) { 12108 freemsg(mp); 12109 return (B_FALSE); 12110 } 12111 12112 /* Check for fragmentation offset; return if there's none */ 12113 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & 12114 (IPH_MF | IPH_OFFSET)) == 0) 12115 return (B_TRUE); 12116 12117 /* 12118 * We utilize hardware-computed checksum info only for UDP since 12119 * IP fragmentation is a normal occurrence for the protocol. In 12120 * addition, checksum offload support for IP fragments carrying 12121 * UDP payload is commonly implemented across network adapters. 12122 */ 12123 ASSERT(recv_ill != NULL); 12124 if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && 12125 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { 12126 mblk_t *mp1 = mp->b_cont; 12127 int32_t len; 12128 12129 /* Record checksum information from the packet */ 12130 sum_val = (uint32_t)DB_CKSUM16(mp); 12131 sum_flags = DB_CKSUMFLAGS(mp); 12132 12133 /* IP payload offset from beginning of mblk */ 12134 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; 12135 12136 if ((sum_flags & HCK_PARTIALCKSUM) && 12137 (mp1 == NULL || mp1->b_cont == NULL) && 12138 offset >= DB_CKSUMSTART(mp) && 12139 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { 12140 uint32_t adj; 12141 /* 12142 * Partial checksum has been calculated by hardware 12143 * and attached to the packet; in addition, any 12144 * prepended extraneous data is even byte aligned. 12145 * If any such data exists, we adjust the checksum; 12146 * this would also handle any postpended data.
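 *
 * The adjustment below uses standard one's-complement checksum
 * arithmetic. As an illustrative, self-contained helper (not used
 * here), folding a 32-bit accumulator back into 16 bits looks like:
 *
 *	static uint16_t
 *	cksum_fold(uint32_t sum)
 *	{
 *		sum = (sum & 0xFFFF) + (sum >> 16);
 *		sum = (sum & 0xFFFF) + (sum >> 16);
 *		return ((uint16_t)sum);
 *	}
 *
 * two folds sufficing because the first fold can carry out at most
 * once.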
12147 */ 12148 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), 12149 mp, mp1, len, adj); 12150 12151 /* One's complement subtract extraneous checksum */ 12152 if (adj >= sum_val) 12153 sum_val = ~(adj - sum_val) & 0xFFFF; 12154 else 12155 sum_val -= adj; 12156 } 12157 } else { 12158 sum_val = 0; 12159 sum_flags = 0; 12160 } 12161 12162 /* Clear hardware checksumming flag */ 12163 DB_CKSUMFLAGS(mp) = 0; 12164 12165 ident = ipha->ipha_ident; 12166 offset = (frag_offset_flags << 3) & 0xFFFF; 12167 src = ipha->ipha_src; 12168 dst = ipha->ipha_dst; 12169 hdr_length = IPH_HDR_LENGTH(ipha); 12170 end = ntohs(ipha->ipha_length) - hdr_length; 12171 12172 /* If end == 0 then we have a packet with no data, so just free it */ 12173 if (end == 0) { 12174 freemsg(mp); 12175 return (B_FALSE); 12176 } 12177 12178 /* Record the ECN field info. */ 12179 ecn_info = (ipha->ipha_type_of_service & 0x3); 12180 if (offset != 0) { 12181 /* 12182 * If this isn't the first piece, strip the header, and 12183 * add the offset to the end value. 12184 */ 12185 mp->b_rptr += hdr_length; 12186 end += offset; 12187 } 12188 12189 msg_len = MBLKSIZE(mp); 12190 tail_mp = mp; 12191 while (tail_mp->b_cont != NULL) { 12192 tail_mp = tail_mp->b_cont; 12193 msg_len += MBLKSIZE(tail_mp); 12194 } 12195 12196 /* If the reassembly list for this ILL will get too big, prune it */ 12197 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= 12198 ipst->ips_ip_reass_queue_bytes) { 12199 ill_frag_prune(ill, 12200 (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : 12201 (ipst->ips_ip_reass_queue_bytes - msg_len)); 12202 pruned = B_TRUE; 12203 } 12204 12205 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; 12206 mutex_enter(&ipfb->ipfb_lock); 12207 12208 ipfp = &ipfb->ipfb_ipf; 12209 /* Try to find an existing fragment queue for this packet. */ 12210 for (;;) { 12211 ipf = ipfp[0]; 12212 if (ipf != NULL) { 12213 /* 12214 * It has to match on ident and src/dst address. 12215 */ 12216 if (ipf->ipf_ident == ident && 12217 ipf->ipf_src == src && 12218 ipf->ipf_dst == dst && 12219 ipf->ipf_protocol == proto) { 12220 /* 12221 * If we have received too many 12222 * duplicate fragments for this packet 12223 * free it. 12224 */ 12225 if (ipf->ipf_num_dups > ip_max_frag_dups) { 12226 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12227 freemsg(mp); 12228 mutex_exit(&ipfb->ipfb_lock); 12229 return (B_FALSE); 12230 } 12231 /* Found it. */ 12232 break; 12233 } 12234 ipfp = &ipf->ipf_hash_next; 12235 continue; 12236 } 12237 12238 /* 12239 * If we pruned the list, do we want to store this new 12240 * fragment?. We apply an optimization here based on the 12241 * fact that most fragments will be received in order. 12242 * So if the offset of this incoming fragment is zero, 12243 * it is the first fragment of a new packet. We will 12244 * keep it. Otherwise drop the fragment, as we have 12245 * probably pruned the packet already (since the 12246 * packet cannot be found). 12247 */ 12248 if (pruned && offset != 0) { 12249 mutex_exit(&ipfb->ipfb_lock); 12250 freemsg(mp); 12251 return (B_FALSE); 12252 } 12253 12254 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { 12255 /* 12256 * Too many fragmented packets in this hash 12257 * bucket. Free the oldest. 12258 */ 12259 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); 12260 } 12261 12262 /* New guy. Allocate a frag message. 
*/ 12263 mp1 = allocb(sizeof (*ipf), BPRI_MED); 12264 if (mp1 == NULL) { 12265 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12266 freemsg(mp); 12267 reass_done: 12268 mutex_exit(&ipfb->ipfb_lock); 12269 return (B_FALSE); 12270 } 12271 12272 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); 12273 mp1->b_cont = mp; 12274 12275 /* Initialize the fragment header. */ 12276 ipf = (ipf_t *)mp1->b_rptr; 12277 ipf->ipf_mp = mp1; 12278 ipf->ipf_ptphn = ipfp; 12279 ipfp[0] = ipf; 12280 ipf->ipf_hash_next = NULL; 12281 ipf->ipf_ident = ident; 12282 ipf->ipf_protocol = proto; 12283 ipf->ipf_src = src; 12284 ipf->ipf_dst = dst; 12285 ipf->ipf_nf_hdr_len = 0; 12286 /* Record reassembly start time. */ 12287 ipf->ipf_timestamp = gethrestime_sec(); 12288 /* Record ipf generation and account for frag header */ 12289 ipf->ipf_gen = ill->ill_ipf_gen++; 12290 ipf->ipf_count = MBLKSIZE(mp1); 12291 ipf->ipf_last_frag_seen = B_FALSE; 12292 ipf->ipf_ecn = ecn_info; 12293 ipf->ipf_num_dups = 0; 12294 ipfb->ipfb_frag_pkts++; 12295 ipf->ipf_checksum = 0; 12296 ipf->ipf_checksum_flags = 0; 12297 12298 /* Store checksum value in fragment header */ 12299 if (sum_flags != 0) { 12300 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12301 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12302 ipf->ipf_checksum = sum_val; 12303 ipf->ipf_checksum_flags = sum_flags; 12304 } 12305 12306 /* 12307 * We handle reassembly two ways. In the easy case, 12308 * where all the fragments show up in order, we do 12309 * minimal bookkeeping, and just clip new pieces on 12310 * the end. If we ever see a hole, then we go off 12311 * to ip_reassemble which has to mark the pieces and 12312 * keep track of the number of holes, etc. Obviously, 12313 * the point of having both mechanisms is so we can 12314 * handle the easy case as efficiently as possible. 12315 */ 12316 if (offset == 0) { 12317 /* Easy case, in-order reassembly so far. */ 12318 ipf->ipf_count += msg_len; 12319 ipf->ipf_tail_mp = tail_mp; 12320 /* 12321 * Keep track of next expected offset in 12322 * ipf_end. 12323 */ 12324 ipf->ipf_end = end; 12325 ipf->ipf_nf_hdr_len = hdr_length; 12326 } else { 12327 /* Hard case, hole at the beginning. */ 12328 ipf->ipf_tail_mp = NULL; 12329 /* 12330 * ipf_end == 0 means that we have given up 12331 * on easy reassembly. 12332 */ 12333 ipf->ipf_end = 0; 12334 12335 /* Forget checksum offload from now on */ 12336 ipf->ipf_checksum_flags = 0; 12337 12338 /* 12339 * ipf_hole_cnt is set by ip_reassemble. 12340 * ipf_count is updated by ip_reassemble. 12341 * No need to check for return value here 12342 * as we don't expect reassembly to complete 12343 * or fail for the first fragment itself. 12344 */ 12345 (void) ip_reassemble(mp, ipf, 12346 (frag_offset_flags & IPH_OFFSET) << 3, 12347 (frag_offset_flags & IPH_MF), ill, msg_len); 12348 } 12349 /* Update per ipfb and ill byte counts */ 12350 ipfb->ipfb_count += ipf->ipf_count; 12351 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12352 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count); 12353 /* If the frag timer wasn't already going, start it. */ 12354 mutex_enter(&ill->ill_lock); 12355 ill_frag_timer_start(ill); 12356 mutex_exit(&ill->ill_lock); 12357 goto reass_done; 12358 } 12359 12360 /* 12361 * If the packet's flag has changed (it could be coming up 12362 * from an interface different than the previous, therefore 12363 * possibly different checksum capability), then forget about 12364 * any stored checksum states. 
Otherwise add the value to 12365 * the existing one stored in the fragment header. 12366 */ 12367 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { 12368 sum_val += ipf->ipf_checksum; 12369 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12370 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12371 ipf->ipf_checksum = sum_val; 12372 } else if (ipf->ipf_checksum_flags != 0) { 12373 /* Forget checksum offload from now on */ 12374 ipf->ipf_checksum_flags = 0; 12375 } 12376 12377 /* 12378 * We have a new piece of a datagram which is already being 12379 * reassembled. Update the ECN info if all IP fragments 12380 * are ECN capable. If there is one which is not, clear 12381 * all the info. If there is at least one which has CE 12382 * code point, IP needs to report that up to transport. 12383 */ 12384 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { 12385 if (ecn_info == IPH_ECN_CE) 12386 ipf->ipf_ecn = IPH_ECN_CE; 12387 } else { 12388 ipf->ipf_ecn = IPH_ECN_NECT; 12389 } 12390 if (offset && ipf->ipf_end == offset) { 12391 /* The new fragment fits at the end */ 12392 ipf->ipf_tail_mp->b_cont = mp; 12393 /* Update the byte count */ 12394 ipf->ipf_count += msg_len; 12395 /* Update per ipfb and ill byte counts */ 12396 ipfb->ipfb_count += msg_len; 12397 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12398 atomic_add_32(&ill->ill_frag_count, msg_len); 12399 if (frag_offset_flags & IPH_MF) { 12400 /* More to come. */ 12401 ipf->ipf_end = end; 12402 ipf->ipf_tail_mp = tail_mp; 12403 goto reass_done; 12404 } 12405 } else { 12406 /* Go do the hard cases. */ 12407 int ret; 12408 12409 if (offset == 0) 12410 ipf->ipf_nf_hdr_len = hdr_length; 12411 12412 /* Save current byte count */ 12413 count = ipf->ipf_count; 12414 ret = ip_reassemble(mp, ipf, 12415 (frag_offset_flags & IPH_OFFSET) << 3, 12416 (frag_offset_flags & IPH_MF), ill, msg_len); 12417 /* Count of bytes added and subtracted (freeb()ed) */ 12418 count = ipf->ipf_count - count; 12419 if (count) { 12420 /* Update per ipfb and ill byte counts */ 12421 ipfb->ipfb_count += count; 12422 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12423 atomic_add_32(&ill->ill_frag_count, count); 12424 } 12425 if (ret == IP_REASS_PARTIAL) { 12426 goto reass_done; 12427 } else if (ret == IP_REASS_FAILED) { 12428 /* Reassembly failed. Free up all resources */ 12429 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12430 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { 12431 IP_REASS_SET_START(t_mp, 0); 12432 IP_REASS_SET_END(t_mp, 0); 12433 } 12434 freemsg(mp); 12435 goto reass_done; 12436 } 12437 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ 12438 } 12439 /* 12440 * We have completed reassembly. Unhook the frag header from 12441 * the reassembly list. 12442 * 12443 * Before we free the frag header, record the ECN info 12444 * to report back to the transport. 12445 */ 12446 ecn_info = ipf->ipf_ecn; 12447 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); 12448 ipfp = ipf->ipf_ptphn; 12449 12450 /* We need to supply these to caller */ 12451 if ((sum_flags = ipf->ipf_checksum_flags) != 0) 12452 sum_val = ipf->ipf_checksum; 12453 else 12454 sum_val = 0; 12455 12456 mp1 = ipf->ipf_mp; 12457 count = ipf->ipf_count; 12458 ipf = ipf->ipf_hash_next; 12459 if (ipf != NULL) 12460 ipf->ipf_ptphn = ipfp; 12461 ipfp[0] = ipf; 12462 atomic_add_32(&ill->ill_frag_count, -count); 12463 ASSERT(ipfb->ipfb_count >= count); 12464 ipfb->ipfb_count -= count; 12465 ipfb->ipfb_frag_pkts--; 12466 mutex_exit(&ipfb->ipfb_lock); 12467 /* Ditch the frag header. 
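 *
 * (Aside on the ECN merge performed a little earlier in this
 * function: the rule implemented above is that any Not-ECT fragment
 * makes the whole datagram Not-ECT, and otherwise a single CE
 * fragment marks the result CE. An illustrative two-fragment sketch
 * using the IPH_ECN_* values this file already uses; frag_a_tos and
 * frag_b_tos are hypothetical type-of-service bytes:
 *
 *	uint8_t a = frag_a_tos & 0x3, b = frag_b_tos & 0x3;
 *	uint8_t merged;
 *
 *	if (a == IPH_ECN_NECT || b == IPH_ECN_NECT)
 *		merged = IPH_ECN_NECT;
 *	else if (a == IPH_ECN_CE || b == IPH_ECN_CE)
 *		merged = IPH_ECN_CE;
 *	else
 *		merged = a;		(both ECT)
 * )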
*/ 12468 mp = mp1->b_cont; 12469 12470 freeb(mp1); 12471 12472 /* Restore original IP length in header. */ 12473 packet_size = (uint32_t)msgdsize(mp); 12474 if (packet_size > IP_MAXPACKET) { 12475 freemsg(mp); 12476 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 12477 return (B_FALSE); 12478 } 12479 12480 if (DB_REF(mp) > 1) { 12481 mblk_t *mp2 = copymsg(mp); 12482 12483 freemsg(mp); 12484 if (mp2 == NULL) { 12485 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12486 return (B_FALSE); 12487 } 12488 mp = mp2; 12489 } 12490 ipha = (ipha_t *)mp->b_rptr; 12491 12492 ipha->ipha_length = htons((uint16_t)packet_size); 12493 /* We're now complete, zip the frag state */ 12494 ipha->ipha_fragment_offset_and_flags = 0; 12495 /* Record the ECN info. */ 12496 ipha->ipha_type_of_service &= 0xFC; 12497 ipha->ipha_type_of_service |= ecn_info; 12498 *mpp = mp; 12499 12500 /* Reassembly is successful; return checksum information if needed */ 12501 if (cksum_val != NULL) 12502 *cksum_val = sum_val; 12503 if (cksum_flags != NULL) 12504 *cksum_flags = sum_flags; 12505 12506 return (B_TRUE); 12507 } 12508 12509 /* 12510 * Verify the IP header checksum and process any local IP options. 12511 * Return B_TRUE if all is well, else return B_FALSE and release 12512 * the mp. The caller is responsible for decrementing the ire ref cnt. 12513 */ 12514 static boolean_t 12515 ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12516 ip_stack_t *ipst) 12517 { 12518 mblk_t *first_mp; 12519 boolean_t mctl_present; 12520 uint16_t sum; 12521 12522 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12523 /* 12524 * Don't do the checksum if it has gone through AH/ESP 12525 * processing. 12526 */ 12527 if (!mctl_present) { 12528 sum = ip_csum_hdr(ipha); 12529 if (sum != 0) { 12530 if (ill != NULL) { 12531 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 12532 } else { 12533 BUMP_MIB(&ipst->ips_ip_mib, 12534 ipIfStatsInCksumErrs); 12535 } 12536 freemsg(first_mp); 12537 return (B_FALSE); 12538 } 12539 } 12540 12541 if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) { 12542 if (mctl_present) 12543 freeb(first_mp); 12544 return (B_FALSE); 12545 } 12546 12547 return (B_TRUE); 12548 } 12549 12550 /* 12551 * All UDP packets destined for the local host are delivered via this routine. 12552 */ 12553 void 12554 ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12555 ill_t *recv_ill) 12556 { 12557 uint32_t sum; 12558 uint32_t u1; 12559 boolean_t mctl_present; 12560 conn_t *connp; 12561 mblk_t *first_mp; 12562 uint16_t *up; 12563 ill_t *ill = (ill_t *)q->q_ptr; 12564 uint16_t reass_hck_flags = 0; 12565 ip_stack_t *ipst; 12566 12567 ASSERT(recv_ill != NULL); 12568 ipst = recv_ill->ill_ipst; 12569 12570 #define rptr ((uchar_t *)ipha) 12571 12572 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12573 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 12574 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12575 ASSERT(ill != NULL); 12576 12577 /* 12578 * FAST PATH for udp packets 12579 */ 12580 12581 /* u1 is # words of IP options */ 12582 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 12583 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12584 12585 /* IP options present */ 12586 if (u1 != 0) 12587 goto ipoptions; 12588 12589 /* Check the IP header checksum.
*/ 12590 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { 12591 /* Clear the IP header h/w cksum flag */ 12592 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12593 } else if (!mctl_present) { 12594 /* 12595 * Don't verify header checksum if this packet is coming 12596 * back from AH/ESP as we already did it. 12597 */ 12598 #define uph ((uint16_t *)ipha) 12599 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + 12600 uph[6] + uph[7] + uph[8] + uph[9]; 12601 #undef uph 12602 /* finish doing IP checksum */ 12603 sum = (sum & 0xFFFF) + (sum >> 16); 12604 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12605 if (sum != 0 && sum != 0xFFFF) { 12606 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 12607 freemsg(first_mp); 12608 return; 12609 } 12610 } 12611 12612 /* 12613 * Count for SNMP of inbound packets for ire. 12614 * if mctl is present this might be a secure packet and 12615 * has already been counted for in ip_proto_input(). 12616 */ 12617 if (!mctl_present) { 12618 UPDATE_IB_PKT_COUNT(ire); 12619 ire->ire_last_used_time = lbolt; 12620 } 12621 12622 /* packet part of fragmented IP packet? */ 12623 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12624 if (u1 & (IPH_MF | IPH_OFFSET)) { 12625 goto fragmented; 12626 } 12627 12628 /* u1 = IP header length (20 bytes) */ 12629 u1 = IP_SIMPLE_HDR_LENGTH; 12630 12631 /* packet does not contain complete IP & UDP headers */ 12632 if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) 12633 goto udppullup; 12634 12635 /* up points to UDP header */ 12636 up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); 12637 #define iphs ((uint16_t *)ipha) 12638 12639 /* if udp hdr cksum != 0, then need to checksum udp packet */ 12640 if (up[3] != 0) { 12641 mblk_t *mp1 = mp->b_cont; 12642 boolean_t cksum_err; 12643 uint16_t hck_flags = 0; 12644 12645 /* Pseudo-header checksum */ 12646 u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12647 iphs[9] + up[2]; 12648 12649 /* 12650 * Revert to software checksum calculation if the interface 12651 * isn't capable of checksum offload or if IPsec is present. 12652 */ 12653 if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) 12654 hck_flags = DB_CKSUMFLAGS(mp); 12655 12656 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12657 IP_STAT(ipst, ip_in_sw_cksum); 12658 12659 IP_CKSUM_RECV(hck_flags, u1, 12660 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 12661 (int32_t)((uchar_t *)up - rptr), 12662 mp, mp1, cksum_err); 12663 12664 if (cksum_err) { 12665 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 12666 if (hck_flags & HCK_FULLCKSUM) 12667 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 12668 else if (hck_flags & HCK_PARTIALCKSUM) 12669 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 12670 else 12671 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 12672 12673 freemsg(first_mp); 12674 return; 12675 } 12676 } 12677 12678 /* Non-fragmented broadcast or multicast packet? */ 12679 if (ire->ire_type == IRE_BROADCAST) 12680 goto udpslowpath; 12681 12682 if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, 12683 ire->ire_zoneid, ipst)) != NULL) { 12684 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); 12685 IP_STAT(ipst, ip_udp_fast_path); 12686 12687 if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || 12688 (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { 12689 freemsg(mp); 12690 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 12691 } else { 12692 if (!mctl_present) { 12693 BUMP_MIB(ill->ill_ip_mib, 12694 ipIfStatsHCInDelivers); 12695 } 12696 /* 12697 * mp and first_mp can change. 
12698 */ 12699 if (ip_udp_check(q, connp, recv_ill, 12700 ipha, &mp, &first_mp, mctl_present, ire)) { 12701 /* Send it upstream */ 12702 (connp->conn_recv)(connp, mp, NULL); 12703 } 12704 } 12705 /* 12706 * freeb() cannot deal with null mblk being passed 12707 * in and first_mp can be set to null in the call 12708 * ipsec_input_fast_proc()->ipsec_check_inbound_policy. 12709 */ 12710 if (mctl_present && first_mp != NULL) { 12711 freeb(first_mp); 12712 } 12713 CONN_DEC_REF(connp); 12714 return; 12715 } 12716 12717 /* 12718 * if we got here we know the packet is not fragmented and 12719 * has no options. The classifier could not find a conn_t and 12720 * most likely its an icmp packet so send it through slow path. 12721 */ 12722 12723 goto udpslowpath; 12724 12725 ipoptions: 12726 if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { 12727 goto slow_done; 12728 } 12729 12730 UPDATE_IB_PKT_COUNT(ire); 12731 ire->ire_last_used_time = lbolt; 12732 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12733 if (u1 & (IPH_MF | IPH_OFFSET)) { 12734 fragmented: 12735 /* 12736 * "sum" and "reass_hck_flags" are non-zero if the 12737 * reassembled packet has a valid hardware computed 12738 * checksum information associated with it. 12739 */ 12740 if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, 12741 &reass_hck_flags)) { 12742 goto slow_done; 12743 } 12744 12745 /* 12746 * Make sure that first_mp points back to mp as 12747 * the mp we came in with could have changed in 12748 * ip_rput_fragment(). 12749 */ 12750 ASSERT(!mctl_present); 12751 ipha = (ipha_t *)mp->b_rptr; 12752 first_mp = mp; 12753 } 12754 12755 /* Now we have a complete datagram, destined for this machine. */ 12756 u1 = IPH_HDR_LENGTH(ipha); 12757 /* Pull up the UDP header, if necessary. */ 12758 if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { 12759 udppullup: 12760 if (!pullupmsg(mp, u1 + UDPH_SIZE)) { 12761 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12762 freemsg(first_mp); 12763 goto slow_done; 12764 } 12765 ipha = (ipha_t *)mp->b_rptr; 12766 } 12767 12768 /* 12769 * Validate the checksum for the reassembled packet; for the 12770 * pullup case we calculate the payload checksum in software. 
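 * Editor's sketch of how the pseudo-header partial sum handed to
 * IP_CKSUM_RECV_REASS below is assembled.  The helper is hypothetical,
 * not part of this file:
 */
#if 0	/* illustrative sketch only; not compiled */
static uint32_t
udp_pseudo_sum_sketch(const uint16_t *iphs, const uint16_t *up)
{
	/*
	 * iphs[6], iphs[7]: source address halves
	 * iphs[8], iphs[9]: destination address halves
	 * up[2]:            UDP length field
	 * IP_UDP_CSUM_COMP: IPPROTO_UDP, byte-order adjusted
	 */
	return (IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
	    iphs[9] + up[2]);
}
#endif
/*
 * Locate the UDP header and validate: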
12771 */ 12772 up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); 12773 if (up[3] != 0) { 12774 boolean_t cksum_err; 12775 12776 if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12777 IP_STAT(ipst, ip_in_sw_cksum); 12778 12779 IP_CKSUM_RECV_REASS(reass_hck_flags, 12780 (int32_t)((uchar_t *)up - (uchar_t *)ipha), 12781 IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12782 iphs[9] + up[2], sum, cksum_err); 12783 12784 if (cksum_err) { 12785 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 12786 12787 if (reass_hck_flags & HCK_FULLCKSUM) 12788 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 12789 else if (reass_hck_flags & HCK_PARTIALCKSUM) 12790 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 12791 else 12792 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 12793 12794 freemsg(first_mp); 12795 goto slow_done; 12796 } 12797 } 12798 udpslowpath: 12799 12800 /* Clear hardware checksum flag to be safe */ 12801 DB_CKSUMFLAGS(mp) = 0; 12802 12803 ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, 12804 (ire->ire_type == IRE_BROADCAST), 12805 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO, 12806 mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); 12807 12808 slow_done: 12809 IP_STAT(ipst, ip_udp_slow_path); 12810 return; 12811 12812 #undef iphs 12813 #undef rptr 12814 } 12815 12816 static boolean_t 12817 ip_iptun_input(mblk_t *ipsec_mp, mblk_t *data_mp, ipha_t *ipha, ill_t *ill, 12818 ire_t *ire, ip_stack_t *ipst) 12819 { 12820 conn_t *connp; 12821 12822 ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp); 12823 12824 if ((connp = ipcl_classify_v4(data_mp, ipha->ipha_protocol, 12825 IP_SIMPLE_HDR_LENGTH, ire->ire_zoneid, ipst)) != NULL) { 12826 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 12827 connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp, 12828 NULL); 12829 CONN_DEC_REF(connp); 12830 return (B_TRUE); 12831 } 12832 return (B_FALSE); 12833 } 12834 12835 /* ARGSUSED */ 12836 static mblk_t * 12837 ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 12838 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, 12839 ill_rx_ring_t *ill_ring) 12840 { 12841 conn_t *connp; 12842 uint32_t sum; 12843 uint32_t u1; 12844 uint16_t *up; 12845 int offset; 12846 ssize_t len; 12847 mblk_t *mp1; 12848 boolean_t syn_present = B_FALSE; 12849 tcph_t *tcph; 12850 uint_t tcph_flags; 12851 uint_t ip_hdr_len; 12852 ill_t *ill = (ill_t *)q->q_ptr; 12853 zoneid_t zoneid = ire->ire_zoneid; 12854 boolean_t cksum_err; 12855 uint16_t hck_flags = 0; 12856 ip_stack_t *ipst = recv_ill->ill_ipst; 12857 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 12858 12859 #define rptr ((uchar_t *)ipha) 12860 12861 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12862 ASSERT(ill != NULL); 12863 12864 /* 12865 * FAST PATH for tcp packets 12866 */ 12867 12868 /* u1 is # words of IP options */ 12869 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12870 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12871 12872 /* IP options present */ 12873 if (u1) { 12874 goto ipoptions; 12875 } else if (!mctl_present) { 12876 /* Check the IP header checksum. */ 12877 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { 12878 /* Clear the IP header h/w cksum flag */ 12879 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12880 } else if (!mctl_present) { 12881 /* 12882 * Don't verify header checksum if this packet 12883 * is coming back from AH/ESP as we already did it. 
12884 */ 12885 #define uph ((uint16_t *)ipha) 12886 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 12887 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 12888 #undef uph 12889 /* finish doing IP checksum */ 12890 sum = (sum & 0xFFFF) + (sum >> 16); 12891 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12892 if (sum != 0 && sum != 0xFFFF) { 12893 BUMP_MIB(ill->ill_ip_mib, 12894 ipIfStatsInCksumErrs); 12895 goto error; 12896 } 12897 } 12898 } 12899 12900 if (!mctl_present) { 12901 UPDATE_IB_PKT_COUNT(ire); 12902 ire->ire_last_used_time = lbolt; 12903 } 12904 12905 /* packet part of fragmented IP packet? */ 12906 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12907 if (u1 & (IPH_MF | IPH_OFFSET)) { 12908 goto fragmented; 12909 } 12910 12911 /* u1 = IP header length (20 bytes) */ 12912 u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; 12913 12914 /* does packet contain IP+TCP headers? */ 12915 len = mp->b_wptr - rptr; 12916 if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { 12917 IP_STAT(ipst, ip_tcppullup); 12918 goto tcppullup; 12919 } 12920 12921 /* TCP options present? */ 12922 offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; 12923 12924 /* 12925 * If options need to be pulled up, then goto tcpoptions. 12926 * otherwise we are still in the fast path 12927 */ 12928 if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { 12929 IP_STAT(ipst, ip_tcpoptions); 12930 goto tcpoptions; 12931 } 12932 12933 /* multiple mblks of tcp data? */ 12934 if ((mp1 = mp->b_cont) != NULL) { 12935 IP_STAT(ipst, ip_multipkttcp); 12936 len += msgdsize(mp1); 12937 } 12938 12939 up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); 12940 12941 /* part of pseudo checksum */ 12942 12943 /* TCP datagram length */ 12944 u1 = len - IP_SIMPLE_HDR_LENGTH; 12945 12946 #define iphs ((uint16_t *)ipha) 12947 12948 #ifdef _BIG_ENDIAN 12949 u1 += IPPROTO_TCP; 12950 #else 12951 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 12952 #endif 12953 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 12954 12955 /* 12956 * Revert to software checksum calculation if the interface 12957 * isn't capable of checksum offload or if IPsec is present. 12958 */ 12959 if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) 12960 hck_flags = DB_CKSUMFLAGS(mp); 12961 12962 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12963 IP_STAT(ipst, ip_in_sw_cksum); 12964 12965 IP_CKSUM_RECV(hck_flags, u1, 12966 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 12967 (int32_t)((uchar_t *)up - rptr), 12968 mp, mp1, cksum_err); 12969 12970 if (cksum_err) { 12971 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 12972 12973 if (hck_flags & HCK_FULLCKSUM) 12974 IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); 12975 else if (hck_flags & HCK_PARTIALCKSUM) 12976 IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); 12977 else 12978 IP_STAT(ipst, ip_tcp_in_sw_cksum_err); 12979 12980 goto error; 12981 } 12982 12983 try_again: 12984 12985 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 12986 zoneid, ipst)) == NULL) { 12987 /* Send the TH_RST */ 12988 goto no_conn; 12989 } 12990 12991 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 12992 tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG); 12993 12994 /* 12995 * TCP FAST PATH for AF_INET socket. 12996 * 12997 * TCP fast path to avoid extra work. An AF_INET socket type 12998 * does not have facility to receive extra information via 12999 * ip_process or ip_add_info. 
Also, when the connection was 13000 * established, we made a check if this connection is impacted 13001 * by any global IPsec policy or per connection policy (a 13002 * policy that comes in effect later will not apply to this 13003 * connection). Since all this can be determined at the 13004 * connection establishment time, a quick check of flags 13005 * can avoid extra work. 13006 */ 13007 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 13008 !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13009 ASSERT(first_mp == mp); 13010 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13011 if (tcph_flags != (TH_SYN | TH_ACK)) { 13012 SET_SQUEUE(mp, tcp_rput_data, connp); 13013 return (mp); 13014 } 13015 mp->b_datap->db_struioflag |= STRUIO_CONNECT; 13016 DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); 13017 SET_SQUEUE(mp, tcp_input, connp); 13018 return (mp); 13019 } 13020 13021 if (tcph_flags == TH_SYN) { 13022 if (IPCL_IS_TCP(connp)) { 13023 mp->b_datap->db_struioflag |= STRUIO_EAGER; 13024 DB_CKSUMSTART(mp) = 13025 (intptr_t)ip_squeue_get(ill_ring); 13026 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 13027 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13028 BUMP_MIB(ill->ill_ip_mib, 13029 ipIfStatsHCInDelivers); 13030 SET_SQUEUE(mp, connp->conn_recv, connp); 13031 return (mp); 13032 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 13033 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13034 BUMP_MIB(ill->ill_ip_mib, 13035 ipIfStatsHCInDelivers); 13036 ip_squeue_enter_unbound++; 13037 SET_SQUEUE(mp, tcp_conn_request_unbound, 13038 connp); 13039 return (mp); 13040 } 13041 syn_present = B_TRUE; 13042 } 13043 } 13044 13045 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 13046 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 13047 13048 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13049 /* No need to send this packet to TCP */ 13050 if ((flags & TH_RST) || (flags & TH_URG)) { 13051 CONN_DEC_REF(connp); 13052 freemsg(first_mp); 13053 return (NULL); 13054 } 13055 if (flags & TH_ACK) { 13056 ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, 13057 ipst->ips_netstack->netstack_tcp, connp); 13058 CONN_DEC_REF(connp); 13059 return (NULL); 13060 } 13061 13062 CONN_DEC_REF(connp); 13063 freemsg(first_mp); 13064 return (NULL); 13065 } 13066 13067 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 13068 first_mp = ipsec_check_inbound_policy(first_mp, connp, 13069 ipha, NULL, mctl_present); 13070 if (first_mp == NULL) { 13071 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13072 CONN_DEC_REF(connp); 13073 return (NULL); 13074 } 13075 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 13076 ASSERT(syn_present); 13077 if (mctl_present) { 13078 ASSERT(first_mp != mp); 13079 first_mp->b_datap->db_struioflag |= 13080 STRUIO_POLICY; 13081 } else { 13082 ASSERT(first_mp == mp); 13083 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 13084 mp->b_datap->db_struioflag |= STRUIO_POLICY; 13085 } 13086 } else { 13087 /* 13088 * Discard first_mp early since we're dealing with a 13089 * fully-connected conn_t and tcp doesn't do policy in 13090 * this case. 
13091 */ 13092 if (mctl_present) { 13093 freeb(first_mp); 13094 mctl_present = B_FALSE; 13095 } 13096 first_mp = mp; 13097 } 13098 } 13099 13100 /* Initiate IPPF processing for fastpath */ 13101 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13102 uint32_t ill_index; 13103 13104 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13105 ip_process(IPP_LOCAL_IN, &mp, ill_index); 13106 if (mp == NULL) { 13107 ip2dbg(("ip_input_ipsec_process: TCP pkt " 13108 "deferred/dropped during IPPF processing\n")); 13109 CONN_DEC_REF(connp); 13110 if (mctl_present) 13111 freeb(first_mp); 13112 return (NULL); 13113 } else if (mctl_present) { 13114 /* 13115 * ip_process might return a new mp. 13116 */ 13117 ASSERT(first_mp != mp); 13118 first_mp->b_cont = mp; 13119 } else { 13120 first_mp = mp; 13121 } 13122 13123 } 13124 13125 if (!syn_present && connp->conn_ip_recvpktinfo) { 13126 /* 13127 * TCP does not support IP_RECVPKTINFO for v4 so lets 13128 * make sure IPF_RECVIF is passed to ip_add_info. 13129 */ 13130 mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, 13131 IPCL_ZONEID(connp), ipst); 13132 if (mp == NULL) { 13133 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13134 CONN_DEC_REF(connp); 13135 if (mctl_present) 13136 freeb(first_mp); 13137 return (NULL); 13138 } else if (mctl_present) { 13139 /* 13140 * ip_add_info might return a new mp. 13141 */ 13142 ASSERT(first_mp != mp); 13143 first_mp->b_cont = mp; 13144 } else { 13145 first_mp = mp; 13146 } 13147 } 13148 13149 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13150 if (IPCL_IS_TCP(connp)) { 13151 SET_SQUEUE(first_mp, connp->conn_recv, connp); 13152 return (first_mp); 13153 } else { 13154 /* SOCK_RAW, IPPROTO_TCP case */ 13155 (connp->conn_recv)(connp, first_mp, NULL); 13156 CONN_DEC_REF(connp); 13157 return (NULL); 13158 } 13159 13160 no_conn: 13161 /* Initiate IPPf processing, if needed. */ 13162 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13163 uint32_t ill_index; 13164 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13165 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 13166 if (first_mp == NULL) { 13167 return (NULL); 13168 } 13169 } 13170 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13171 13172 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, 13173 ipst->ips_netstack->netstack_tcp, NULL); 13174 return (NULL); 13175 ipoptions: 13176 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { 13177 goto slow_done; 13178 } 13179 13180 UPDATE_IB_PKT_COUNT(ire); 13181 ire->ire_last_used_time = lbolt; 13182 13183 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13184 if (u1 & (IPH_MF | IPH_OFFSET)) { 13185 fragmented: 13186 if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { 13187 if (mctl_present) 13188 freeb(first_mp); 13189 goto slow_done; 13190 } 13191 /* 13192 * Make sure that first_mp points back to mp as 13193 * the mp we came in with could have changed in 13194 * ip_rput_fragment(). 13195 */ 13196 ASSERT(!mctl_present); 13197 ipha = (ipha_t *)mp->b_rptr; 13198 first_mp = mp; 13199 } 13200 13201 /* Now we have a complete datagram, destined for this machine. */ 13202 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 13203 13204 len = mp->b_wptr - mp->b_rptr; 13205 /* Pull up a minimal TCP header, if necessary. */ 13206 if (len < (u1 + 20)) { 13207 tcppullup: 13208 if (!pullupmsg(mp, u1 + 20)) { 13209 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13210 goto error; 13211 } 13212 ipha = (ipha_t *)mp->b_rptr; 13213 len = mp->b_wptr - mp->b_rptr; 13214 } 13215 13216 /* 13217 * Extract the offset field from the TCP header. 
As usual, we 13218 * try to help the compiler more than the reader. 13219 */ 13220 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 13221 if (offset != 5) { 13222 tcpoptions: 13223 if (offset < 5) { 13224 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13225 goto error; 13226 } 13227 /* 13228 * There must be TCP options. 13229 * Make sure we can grab them. 13230 */ 13231 offset <<= 2; 13232 offset += u1; 13233 if (len < offset) { 13234 if (!pullupmsg(mp, offset)) { 13235 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13236 goto error; 13237 } 13238 ipha = (ipha_t *)mp->b_rptr; 13239 len = mp->b_wptr - rptr; 13240 } 13241 } 13242 13243 /* Get the total packet length in len, including headers. */ 13244 if (mp->b_cont) 13245 len = msgdsize(mp); 13246 13247 /* 13248 * Check the TCP checksum by pulling together the pseudo- 13249 * header checksum, and passing it to ip_csum to be added in 13250 * with the TCP datagram. 13251 * 13252 * Since we are not using the hwcksum if available we must 13253 * clear the flag. We may come here via tcppullup or tcpoptions. 13254 * If either of these fails along the way the mblk is freed. 13255 * If this logic ever changes and mblk is reused to say send 13256 * ICMP's back, then this flag may need to be cleared in 13257 * other places as well. 13258 */ 13259 DB_CKSUMFLAGS(mp) = 0; 13260 13261 up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); 13262 13263 u1 = (uint32_t)(len - u1); /* TCP datagram length. */ 13264 #ifdef _BIG_ENDIAN 13265 u1 += IPPROTO_TCP; 13266 #else 13267 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 13268 #endif 13269 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 13270 /* 13271 * Not M_DATA mblk or its a dup, so do the checksum now. 13272 */ 13273 IP_STAT(ipst, ip_in_sw_cksum); 13274 if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { 13275 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 13276 goto error; 13277 } 13278 13279 IP_STAT(ipst, ip_tcp_slow_path); 13280 goto try_again; 13281 #undef iphs 13282 #undef rptr 13283 13284 error: 13285 freemsg(first_mp); 13286 slow_done: 13287 return (NULL); 13288 } 13289 13290 /* ARGSUSED */ 13291 static void 13292 ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 13293 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) 13294 { 13295 conn_t *connp; 13296 uint32_t sum; 13297 uint32_t u1; 13298 ssize_t len; 13299 sctp_hdr_t *sctph; 13300 zoneid_t zoneid = ire->ire_zoneid; 13301 uint32_t pktsum; 13302 uint32_t calcsum; 13303 uint32_t ports; 13304 in6_addr_t map_src, map_dst; 13305 ill_t *ill = (ill_t *)q->q_ptr; 13306 ip_stack_t *ipst; 13307 sctp_stack_t *sctps; 13308 boolean_t sctp_csum_err = B_FALSE; 13309 13310 ASSERT(recv_ill != NULL); 13311 ipst = recv_ill->ill_ipst; 13312 sctps = ipst->ips_netstack->netstack_sctp; 13313 13314 #define rptr ((uchar_t *)ipha) 13315 13316 ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); 13317 ASSERT(ill != NULL); 13318 13319 /* u1 is # words of IP options */ 13320 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 13321 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 13322 13323 /* IP options present */ 13324 if (u1 > 0) { 13325 goto ipoptions; 13326 } else { 13327 /* Check the IP header checksum. 
*/
13328 	if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) &&
13329 	    !mctl_present) {
13330 #define	uph	((uint16_t *)ipha)
13331 		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
13332 		    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
13333 #undef	uph
13334 		/* finish doing IP checksum */
13335 		sum = (sum & 0xFFFF) + (sum >> 16);
13336 		sum = ~(sum + (sum >> 16)) & 0xFFFF;
13337 		/*
13338 		 * Don't verify header checksum if this packet
13339 		 * is coming back from AH/ESP as we already did it.
13340 		 */
13341 		if (sum != 0 && sum != 0xFFFF) {
13342 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
13343 			goto error;
13344 		}
13345 	}
13346 	/*
13347 	 * Since there is no SCTP h/w cksum support yet, just
13348 	 * clear the flag.
13349 	 */
13350 	DB_CKSUMFLAGS(mp) = 0;
13351 }
13352
13353 /*
13354  * Count the inbound packet for SNMP on the ire unless an mctl is
13355  * present; in that case it might be a secure packet that has
13356  * already been counted in ip_proto_input().
13357  */
13357 if (!mctl_present) {
13358 	UPDATE_IB_PKT_COUNT(ire);
13359 	ire->ire_last_used_time = lbolt;
13360 }
13361
13362 	/* packet part of fragmented IP packet? */
13363 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
13364 	if (u1 & (IPH_MF | IPH_OFFSET))
13365 		goto fragmented;
13366
13367 	/* u1 = IP header length (20 bytes) */
13368 	u1 = IP_SIMPLE_HDR_LENGTH;
13369
13370 find_sctp_client:
13371 	/* Pullup if we don't have the sctp common header. */
13372 	len = MBLKL(mp);
13373 	if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) {
13374 		if (mp->b_cont == NULL ||
13375 		    !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) {
13376 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
13377 			goto error;
13378 		}
13379 		ipha = (ipha_t *)mp->b_rptr;
13380 		len = MBLKL(mp);
13381 	}
13382
13383 	sctph = (sctp_hdr_t *)(rptr + u1);
13384 #ifdef DEBUG
13385 	if (!skip_sctp_cksum) {
13386 #endif
13387 		pktsum = sctph->sh_chksum;
13388 		sctph->sh_chksum = 0;
13389 		calcsum = sctp_cksum(mp, u1);
13390 		sctph->sh_chksum = pktsum;
13391 		if (calcsum != pktsum)
13392 			sctp_csum_err = B_TRUE;
13393 #ifdef DEBUG	/* skip_sctp_cksum */
13394 	}
13395 #endif
13396 	/* get the ports */
13397 	ports = *(uint32_t *)&sctph->sh_sport;
13398
13399 	IRE_REFRELE(ire);
13400 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
13401 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
13402 	if (sctp_csum_err) {
13403 		/*
13404 		 * Packets that fail the SCTP checksum are never handed
13405 		 * to the Sun SCTP stack; however, they might be Adler-32
13406 		 * summed packets that a userland stack bound to a raw IP
13407 		 * socket could reasonably use.  Note though that Adler-32
13408 		 * is a long-deprecated algorithm and customer SCTP
13409 		 * networks should eventually migrate to CRC-32, at
13410 		 * which time this facility should be removed.
13411 */ 13412 flags |= IP_FF_SCTP_CSUM_ERR; 13413 goto no_conn; 13414 } 13415 if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, 13416 sctps)) == NULL) { 13417 /* Check for raw socket or OOTB handling */ 13418 goto no_conn; 13419 } 13420 13421 /* Found a client; up it goes */ 13422 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13423 sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); 13424 return; 13425 13426 no_conn: 13427 ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, 13428 ports, mctl_present, flags, B_TRUE, zoneid); 13429 return; 13430 13431 ipoptions: 13432 DB_CKSUMFLAGS(mp) = 0; 13433 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) 13434 goto slow_done; 13435 13436 UPDATE_IB_PKT_COUNT(ire); 13437 ire->ire_last_used_time = lbolt; 13438 13439 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13440 if (u1 & (IPH_MF | IPH_OFFSET)) { 13441 fragmented: 13442 if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) 13443 goto slow_done; 13444 /* 13445 * Make sure that first_mp points back to mp as 13446 * the mp we came in with could have changed in 13447 * ip_rput_fragment(). 13448 */ 13449 ASSERT(!mctl_present); 13450 ipha = (ipha_t *)mp->b_rptr; 13451 first_mp = mp; 13452 } 13453 13454 /* Now we have a complete datagram, destined for this machine. */ 13455 u1 = IPH_HDR_LENGTH(ipha); 13456 goto find_sctp_client; 13457 #undef iphs 13458 #undef rptr 13459 13460 error: 13461 freemsg(first_mp); 13462 slow_done: 13463 IRE_REFRELE(ire); 13464 } 13465 13466 #define VER_BITS 0xF0 13467 #define VERSION_6 0x60 13468 13469 static boolean_t 13470 ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, 13471 ipaddr_t *dstp, ip_stack_t *ipst) 13472 { 13473 uint_t opt_len; 13474 ipha_t *ipha; 13475 ssize_t len; 13476 uint_t pkt_len; 13477 13478 ASSERT(ill != NULL); 13479 IP_STAT(ipst, ip_ipoptions); 13480 ipha = *iphapp; 13481 13482 #define rptr ((uchar_t *)ipha) 13483 /* Assume no IPv6 packets arrive over the IPv4 queue */ 13484 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 13485 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); 13486 freemsg(mp); 13487 return (B_FALSE); 13488 } 13489 13490 /* multiple mblk or too short */ 13491 pkt_len = ntohs(ipha->ipha_length); 13492 13493 /* Get the number of words of IP options in the IP header. */ 13494 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 13495 if (opt_len) { 13496 /* IP Options present! Validate and process. */ 13497 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 13498 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13499 goto done; 13500 } 13501 /* 13502 * Recompute complete header length and make sure we 13503 * have access to all of it. 13504 */ 13505 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 13506 if (len > (mp->b_wptr - rptr)) { 13507 if (len > pkt_len) { 13508 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13509 goto done; 13510 } 13511 if (!pullupmsg(mp, len)) { 13512 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13513 goto done; 13514 } 13515 ipha = (ipha_t *)mp->b_rptr; 13516 } 13517 /* 13518 * Go off to ip_rput_options which returns the next hop 13519 * destination address, which may have been affected 13520 * by source routing. 
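 * (Editor's note: the opt_len arithmetic above relies on the layout
 * of ipha_version_and_hdr_length: version in the high nibble, header
 * length in 32-bit words in the low nibble, so 0x45 is IPv4 with a
 * 20-byte header.  A sketch with hypothetical helpers:)
 */
#if 0	/* illustrative sketch only; not compiled */
static uint_t
ip_opt_words_sketch(const ipha_t *ipha)
{
	/* 0x46..0x4F yields 1..10 words (4..40 bytes) of options. */
	return (ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION);
}

static uint_t
ip_hdr_bytes_sketch(const ipha_t *ipha)
{
	/* What the IPH_HDR_LENGTH() macro computes. */
	return ((ipha->ipha_version_and_hdr_length & 0xF) << 2);
}
#endif
/*
 * Hand the options to ip_rput_options():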
13521 */ 13522 IP_STAT(ipst, ip_opt); 13523 if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { 13524 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13525 return (B_FALSE); 13526 } 13527 } 13528 *iphapp = ipha; 13529 return (B_TRUE); 13530 done: 13531 /* clear b_prev - used by ip_mroute_decap */ 13532 mp->b_prev = NULL; 13533 freemsg(mp); 13534 return (B_FALSE); 13535 #undef rptr 13536 } 13537 13538 /* 13539 * Deal with the fact that there is no ire for the destination. 13540 */ 13541 static ire_t * 13542 ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) 13543 { 13544 ipha_t *ipha; 13545 ill_t *ill; 13546 ire_t *ire; 13547 ip_stack_t *ipst; 13548 enum ire_forward_action ret_action; 13549 13550 ipha = (ipha_t *)mp->b_rptr; 13551 ill = (ill_t *)q->q_ptr; 13552 13553 ASSERT(ill != NULL); 13554 ipst = ill->ill_ipst; 13555 13556 /* 13557 * No IRE for this destination, so it can't be for us. 13558 * Unless we are forwarding, drop the packet. 13559 * We have to let source routed packets through 13560 * since we don't yet know if they are 'ping -l' 13561 * packets i.e. if they will go out over the 13562 * same interface as they came in on. 13563 */ 13564 if (ll_multicast) { 13565 freemsg(mp); 13566 return (NULL); 13567 } 13568 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 13569 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13570 freemsg(mp); 13571 return (NULL); 13572 } 13573 13574 /* 13575 * Mark this packet as having originated externally. 13576 * 13577 * For non-forwarding code path, ire_send later double 13578 * checks this interface to see if it is still exists 13579 * post-ARP resolution. 13580 * 13581 * Also, IPQOS uses this to differentiate between 13582 * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP 13583 * QOS packet processing in ip_wput_attach_llhdr(). 13584 * The QoS module can mark the b_band for a fastpath message 13585 * or the dl_priority field in a unitdata_req header for 13586 * CoS marking. This info can only be found in 13587 * ip_wput_attach_llhdr(). 13588 */ 13589 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 13590 /* 13591 * Clear the indication that this may have a hardware checksum 13592 * as we are not using it 13593 */ 13594 DB_CKSUMFLAGS(mp) = 0; 13595 13596 ire = ire_forward(dst, &ret_action, NULL, NULL, 13597 msg_getlabel(mp), ipst); 13598 13599 if (ire == NULL && ret_action == Forward_check_multirt) { 13600 /* Let ip_newroute handle CGTP */ 13601 ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); 13602 return (NULL); 13603 } 13604 13605 if (ire != NULL) 13606 return (ire); 13607 13608 mp->b_prev = mp->b_next = 0; 13609 13610 if (ret_action == Forward_blackhole) { 13611 freemsg(mp); 13612 return (NULL); 13613 } 13614 /* send icmp unreachable */ 13615 q = WR(q); 13616 /* Sent by forwarding path, and router is global zone */ 13617 if (ip_source_routed(ipha, ipst)) { 13618 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, 13619 GLOBAL_ZONEID, ipst); 13620 } else { 13621 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, 13622 ipst); 13623 } 13624 13625 return (NULL); 13626 13627 } 13628 13629 /* 13630 * check ip header length and align it. 
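 * The alignment test used in it amounts to requiring that the read
 * pointer sit on a 4-byte boundary; roughly (editor's sketch, the
 * real OK_32PTR() macro is the authoritative form):
 */
#if 0	/* illustrative sketch only; not compiled */
#define	OK_32PTR_SKETCH(p)	((((uintptr_t)(p)) & 0x3) == 0)
#endif
/*
 * The function itself: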
13631 */ 13632 static boolean_t 13633 ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) 13634 { 13635 ssize_t len; 13636 ill_t *ill; 13637 ipha_t *ipha; 13638 13639 len = MBLKL(mp); 13640 13641 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13642 ill = (ill_t *)q->q_ptr; 13643 13644 if (!OK_32PTR(mp->b_rptr)) 13645 IP_STAT(ipst, ip_notaligned1); 13646 else 13647 IP_STAT(ipst, ip_notaligned2); 13648 /* Guard against bogus device drivers */ 13649 if (len < 0) { 13650 /* clear b_prev - used by ip_mroute_decap */ 13651 mp->b_prev = NULL; 13652 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13653 freemsg(mp); 13654 return (B_FALSE); 13655 } 13656 13657 if (ip_rput_pullups++ == 0) { 13658 ipha = (ipha_t *)mp->b_rptr; 13659 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 13660 "ip_check_and_align_header: %s forced us to " 13661 " pullup pkt, hdr len %ld, hdr addr %p", 13662 ill->ill_name, len, (void *)ipha); 13663 } 13664 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 13665 /* clear b_prev - used by ip_mroute_decap */ 13666 mp->b_prev = NULL; 13667 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13668 freemsg(mp); 13669 return (B_FALSE); 13670 } 13671 } 13672 return (B_TRUE); 13673 } 13674 13675 /* 13676 * Handle the situation where a packet came in on `ill' but matched an IRE 13677 * whose ire_rfq doesn't match `ill'. We return the IRE that should be used 13678 * for interface statistics. 13679 */ 13680 ire_t * 13681 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) 13682 { 13683 ire_t *new_ire; 13684 ill_t *ire_ill; 13685 uint_t ifindex; 13686 ip_stack_t *ipst = ill->ill_ipst; 13687 boolean_t strict_check = B_FALSE; 13688 13689 /* 13690 * IPMP common case: if IRE and ILL are in the same group, there's no 13691 * issue (e.g. packet received on an underlying interface matched an 13692 * IRE_LOCAL on its associated group interface). 13693 */ 13694 if (ire->ire_rfq != NULL && 13695 IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { 13696 return (ire); 13697 } 13698 13699 /* 13700 * Do another ire lookup here, using the ingress ill, to see if the 13701 * interface is in a usesrc group. 13702 * As long as the ills belong to the same group, we don't consider 13703 * them to be arriving on the wrong interface. Thus, if the switch 13704 * is doing inbound load spreading, we won't drop packets when the 13705 * ip*_strict_dst_multihoming switch is on. 13706 * We also need to check for IPIF_UNNUMBERED point2point interfaces 13707 * where the local address may not be unique. In this case we were 13708 * at the mercy of the initial ire cache lookup and the IRE_LOCAL it 13709 * actually returned. The new lookup, which is more specific, should 13710 * only find the IRE_LOCAL associated with the ingress ill if one 13711 * exists. 13712 */ 13713 13714 if (ire->ire_ipversion == IPV4_VERSION) { 13715 if (ipst->ips_ip_strict_dst_multihoming) 13716 strict_check = B_TRUE; 13717 new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, 13718 ill->ill_ipif, ALL_ZONES, NULL, 13719 (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); 13720 } else { 13721 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); 13722 if (ipst->ips_ipv6_strict_dst_multihoming) 13723 strict_check = B_TRUE; 13724 new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, 13725 IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, 13726 (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); 13727 } 13728 /* 13729 * If the same ire that was returned in ip_input() is found then this 13730 * is an indication that usesrc groups are in use. 
The packet 13731 * arrived on a different ill in the group than the one associated with 13732 * the destination address. If a different ire was found then the same 13733 * IP address must be hosted on multiple ills. This is possible with 13734 * unnumbered point2point interfaces. We switch to use this new ire in 13735 * order to have accurate interface statistics. 13736 */ 13737 if (new_ire != NULL) { 13738 if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { 13739 ire_refrele(ire); 13740 ire = new_ire; 13741 } else { 13742 ire_refrele(new_ire); 13743 } 13744 return (ire); 13745 } else if ((ire->ire_rfq == NULL) && 13746 (ire->ire_ipversion == IPV4_VERSION)) { 13747 /* 13748 * The best match could have been the original ire which 13749 * was created against an IRE_LOCAL on lo0. In the IPv4 case 13750 * the strict multihoming checks are irrelevant as we consider 13751 * local addresses hosted on lo0 to be interface agnostic. We 13752 * only expect a null ire_rfq on IREs which are associated with 13753 * lo0 hence we can return now. 13754 */ 13755 return (ire); 13756 } 13757 13758 /* 13759 * Chase pointers once and store locally. 13760 */ 13761 ire_ill = (ire->ire_rfq == NULL) ? NULL : 13762 (ill_t *)(ire->ire_rfq->q_ptr); 13763 ifindex = ill->ill_usesrc_ifindex; 13764 13765 /* 13766 * Check if it's a legal address on the 'usesrc' interface. 13767 */ 13768 if ((ifindex != 0) && (ire_ill != NULL) && 13769 (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { 13770 return (ire); 13771 } 13772 13773 /* 13774 * If the ip*_strict_dst_multihoming switch is on then we can 13775 * only accept this packet if the interface is marked as routing. 13776 */ 13777 if (!(strict_check)) 13778 return (ire); 13779 13780 if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & 13781 ILLF_ROUTER) != 0) { 13782 return (ire); 13783 } 13784 13785 ire_refrele(ire); 13786 return (NULL); 13787 } 13788 13789 /* 13790 * 13791 * This is the fast forward path. If we are here, we dont need to 13792 * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup 13793 * needed to find the nexthop in this case is much simpler 13794 */ 13795 ire_t * 13796 ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) 13797 { 13798 ipha_t *ipha; 13799 ire_t *src_ire; 13800 ill_t *stq_ill; 13801 uint_t hlen; 13802 uint_t pkt_len; 13803 uint32_t sum; 13804 queue_t *dev_q; 13805 ip_stack_t *ipst = ill->ill_ipst; 13806 mblk_t *fpmp; 13807 enum ire_forward_action ret_action; 13808 13809 ipha = (ipha_t *)mp->b_rptr; 13810 13811 if (ire != NULL && 13812 ire->ire_zoneid != GLOBAL_ZONEID && 13813 ire->ire_zoneid != ALL_ZONES) { 13814 /* 13815 * Should only use IREs that are visible to the global 13816 * zone for forwarding. 13817 */ 13818 ire_refrele(ire); 13819 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); 13820 /* 13821 * ire_cache_lookup() can return ire of IRE_LOCAL in 13822 * transient cases. 
In such case, just drop the packet 13823 */ 13824 if (ire != NULL && ire->ire_type != IRE_CACHE) 13825 goto indiscard; 13826 } 13827 13828 /* 13829 * Martian Address Filtering [RFC 1812, Section 5.3.7] 13830 * The loopback address check for both src and dst has already 13831 * been checked in ip_input 13832 */ 13833 13834 if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) { 13835 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13836 goto drop; 13837 } 13838 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 13839 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 13840 13841 if (src_ire != NULL) { 13842 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13843 ire_refrele(src_ire); 13844 goto drop; 13845 } 13846 13847 /* No ire cache of nexthop. So first create one */ 13848 if (ire == NULL) { 13849 13850 ire = ire_forward_simple(dst, &ret_action, ipst); 13851 13852 /* 13853 * We only come to ip_fast_forward if ip_cgtp_filter 13854 * is not set. So ire_forward() should not return with 13855 * Forward_check_multirt as the next action. 13856 */ 13857 ASSERT(ret_action != Forward_check_multirt); 13858 if (ire == NULL) { 13859 /* An attempt was made to forward the packet */ 13860 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 13861 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13862 mp->b_prev = mp->b_next = 0; 13863 /* send icmp unreachable */ 13864 /* Sent by forwarding path, and router is global zone */ 13865 if (ret_action == Forward_ret_icmp_err) { 13866 if (ip_source_routed(ipha, ipst)) { 13867 icmp_unreachable(ill->ill_wq, mp, 13868 ICMP_SOURCE_ROUTE_FAILED, 13869 GLOBAL_ZONEID, ipst); 13870 } else { 13871 icmp_unreachable(ill->ill_wq, mp, 13872 ICMP_HOST_UNREACHABLE, 13873 GLOBAL_ZONEID, ipst); 13874 } 13875 } else { 13876 freemsg(mp); 13877 } 13878 return (NULL); 13879 } 13880 } 13881 13882 /* 13883 * Forwarding fastpath exception case: 13884 * If any of the following are true, we take the slowpath: 13885 * o forwarding is not enabled 13886 * o incoming and outgoing interface are the same, or in the same 13887 * IPMP group. 13888 * o corresponding ire is in incomplete state 13889 * o packet needs fragmentation 13890 * o ARP cache is not resolved 13891 * 13892 * The codeflow from here on is thus: 13893 * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4 13894 */ 13895 pkt_len = ntohs(ipha->ipha_length); 13896 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 13897 if (!(stq_ill->ill_flags & ILLF_ROUTER) || 13898 (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || 13899 (ire->ire_nce == NULL) || 13900 (pkt_len > ire->ire_max_frag) || 13901 ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || 13902 ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || 13903 ipha->ipha_ttl <= 1) { 13904 ip_rput_process_forward(ill->ill_rq, mp, ire, 13905 ipha, ill, B_FALSE, B_TRUE); 13906 return (ire); 13907 } 13908 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 13909 13910 DTRACE_PROBE4(ip4__forwarding__start, 13911 ill_t *, ill, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); 13912 13913 FW_HOOKS(ipst->ips_ip4_forwarding_event, 13914 ipst->ips_ipv4firewall_forwarding, 13915 ill, stq_ill, ipha, mp, mp, 0, ipst); 13916 13917 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 13918 13919 if (mp == NULL) 13920 goto drop; 13921 13922 mp->b_datap->db_struioun.cksum.flags = 0; 13923 /* Adjust the checksum to reflect the ttl decrement. 
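 * This is an RFC 1141-style incremental update: the TTL occupies one
 * byte of a single 16-bit header word, so dropping it by one changes
 * the ones-complement sum by a fixed amount (IP_HDR_CSUM_TTL_ADJUST,
 * the byte-order-adjusted value of 0x100), and only the carry needs
 * folding.  An editor's sketch of the same logic as a hypothetical
 * helper:
 */
#if 0	/* illustrative sketch only; not compiled */
static void
ip_ttl_dec_cksum_sketch(ipha_t *ipha)
{
	uint32_t sum = ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;

	/* Fold the end-around carry, then store the adjusted checksum. */
	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
	ipha->ipha_ttl--;
}
#endif
/*
 * Inline in the fast path: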
*/ 13924 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 13925 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 13926 ipha->ipha_ttl--; 13927 13928 /* 13929 * Write the link layer header. We can do this safely here, 13930 * because we have already tested to make sure that the IP 13931 * policy is not set, and that we have a fast path destination 13932 * header. 13933 */ 13934 mp->b_rptr -= hlen; 13935 bcopy(fpmp->b_rptr, mp->b_rptr, hlen); 13936 13937 UPDATE_IB_PKT_COUNT(ire); 13938 ire->ire_last_used_time = lbolt; 13939 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 13940 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 13941 UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); 13942 13943 if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) { 13944 dev_q = ire->ire_stq->q_next; 13945 if (DEV_Q_FLOW_BLOCKED(dev_q)) 13946 goto indiscard; 13947 } 13948 13949 DTRACE_PROBE4(ip4__physical__out__start, 13950 ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); 13951 FW_HOOKS(ipst->ips_ip4_physical_out_event, 13952 ipst->ips_ipv4firewall_physical_out, 13953 NULL, stq_ill, ipha, mp, mp, 0, ipst); 13954 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 13955 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 13956 ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, 13957 ip6_t *, NULL, int, 0); 13958 13959 if (mp != NULL) { 13960 if (ipst->ips_ipobs_enabled) { 13961 zoneid_t szone; 13962 13963 szone = ip_get_zoneid_v4(ipha->ipha_src, mp, 13964 ipst, ALL_ZONES); 13965 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, 13966 ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); 13967 } 13968 ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL); 13969 } 13970 return (ire); 13971 13972 indiscard: 13973 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13974 drop: 13975 if (mp != NULL) 13976 freemsg(mp); 13977 return (ire); 13978 13979 } 13980 13981 /* 13982 * This function is called in the forwarding slowpath, when 13983 * either the ire lacks the link-layer address, or the packet needs 13984 * further processing(eg. fragmentation), before transmission. 13985 */ 13986 13987 static void 13988 ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, 13989 ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) 13990 { 13991 queue_t *dev_q; 13992 ire_t *src_ire; 13993 ip_stack_t *ipst = ill->ill_ipst; 13994 boolean_t same_illgrp = B_FALSE; 13995 13996 ASSERT(ire->ire_stq != NULL); 13997 13998 mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ 13999 mp->b_next = NULL; /* ip_rput_noire sets dst here */ 14000 14001 /* 14002 * If the caller of this function is ip_fast_forward() skip the 14003 * next three checks as it does not apply. 14004 */ 14005 if (from_ip_fast_forward) 14006 goto skip; 14007 14008 if (ll_multicast != 0) { 14009 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14010 goto drop_pkt; 14011 } 14012 14013 /* 14014 * check if ipha_src is a broadcast address. Note that this 14015 * check is redundant when we get here from ip_fast_forward() 14016 * which has already done this check. 
However, since we can
14017  * also get here from ip_rput_process_broadcast() or, for
14018  * the slow path through ip_fast_forward(), we perform the
14019  * check again for code reusability.
14020  */
14021 	src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
14022 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
14023 	if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) {
14024 		if (src_ire != NULL)
14025 			ire_refrele(src_ire);
14026 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
14027 		ip2dbg(("ip_rput_process_forward: Received packet with"
14028 		    " bad src/dst address on %s\n", ill->ill_name));
14029 		goto drop_pkt;
14030 	}
14031
14032 	/*
14033 	 * Check if we want to forward this one at this time.
14034 	 * We allow source routed packets on a host provided that
14035 	 * they go out the same ill or illgrp as they came in on.
14036 	 *
14037 	 * XXX To be quicker, we may wish to not chase pointers to
14038 	 * get the ILLF_ROUTER flag and instead store the
14039 	 * forwarding policy in the ire.  An unfortunate
14040 	 * side-effect of that would be requiring an ire flush
14041 	 * whenever the ILLF_ROUTER flag changes.
14042 	 */
14043 skip:
14044 	same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr);
14045
14046 	if (((ill->ill_flags &
14047 	    ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) &&
14048 	    !(ip_source_routed(ipha, ipst) &&
14049 	    (ire->ire_rfq == q || same_illgrp))) {
14050 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
14051 		if (ip_source_routed(ipha, ipst)) {
14052 			q = WR(q);
14053 			/*
14054 			 * Clear the indication that this may have
14055 			 * hardware checksum as we are not using it.
14056 			 */
14057 			DB_CKSUMFLAGS(mp) = 0;
14058 			/* Sent by forwarding path, and router is global zone */
14059 			icmp_unreachable(q, mp,
14060 			    ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst);
14061 			return;
14062 		}
14063 		goto drop_pkt;
14064 	}
14065
14066 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
14067
14068 	/* Packet is being forwarded. Turning off hwcksum flag. */
14069 	DB_CKSUMFLAGS(mp) = 0;
14070 	if (ipst->ips_ip_g_send_redirects) {
14071 		/*
14072 		 * Check whether the incoming interface and the outgoing
14073 		 * interface are part of the same group.  If so,
14074 		 * send redirects.
14075 		 *
14076 		 * Check the source address to see if it originated
14077 		 * on the same logical subnet it is going back out on.
14078 		 * If so, we should be able to send it a redirect.
14079 		 * Avoid sending a redirect if the destination
14080 		 * is directly connected (i.e., ipha_dst is the same
14081 		 * as ire_gateway_addr or the ire_addr of the
14082 		 * nexthop IRE_CACHE), or if the packet was source
14083 		 * routed out this interface.
14084 		 */
14085 		ipaddr_t src, nhop;
14086 		mblk_t *mp1;
14087 		ire_t *nhop_ire = NULL;
14088
14089 		/*
14090 		 * Check whether ire_rfq and q are from the same ill or illgrp.
14091 		 * If so, send redirects.
14092 		 */
14093 		if ((ire->ire_rfq == q || same_illgrp) &&
14094 		    !ip_source_routed(ipha, ipst)) {
14095
14096 			nhop = (ire->ire_gateway_addr != 0 ?
14097 			    ire->ire_gateway_addr : ire->ire_addr);
14098
14099 			if (ipha->ipha_dst == nhop) {
14100 				/*
14101 				 * We avoid sending a redirect if the
14102 				 * destination is directly connected
14103 				 * because it is possible that multiple
14104 				 * IP subnets may have been configured on
14105 				 * the link, and the source may not
14106 				 * be on the same subnet as the IP destination,
14107 				 * even though they are on the same
14108 				 * physical link.
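 * Editor's sketch of the same-subnet test applied further below
 * before a redirect is sent (hypothetical helper, not part of this
 * file):
 */
#if 0	/* illustrative sketch only; not compiled */
static boolean_t
ip_same_subnet_sketch(ipaddr_t src, ipaddr_t nhop, ipaddr_t mask)
{
	/* A redirect helps only if src can reach nhop directly. */
	return ((src & mask) == (nhop & mask));
}
#endif
/*
 * The directly-connected destination case falls through to sendit: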
14109 */
14110 				goto sendit;
14111 			}
14112
14113 			src = ipha->ipha_src;
14114
14115 			/*
14116 			 * We look up the interface ire for the nexthop,
14117 			 * to see if ipha_src is in the same subnet
14118 			 * as the nexthop.
14119 			 *
14120 			 * Note that, if, in the future, IRE_CACHE entries
14121 			 * are obsoleted, this lookup will not be needed,
14122 			 * as the ire passed to this function will be the
14123 			 * same as the nhop_ire computed below.
14124 			 */
14125 			nhop_ire = ire_ftable_lookup(nhop, 0, 0,
14126 			    IRE_INTERFACE, NULL, NULL, ALL_ZONES,
14127 			    0, NULL, MATCH_IRE_TYPE, ipst);
14128
14129 			if (nhop_ire != NULL) {
14130 				if ((src & nhop_ire->ire_mask) ==
14131 				    (nhop & nhop_ire->ire_mask)) {
14132 					/*
14133 					 * The source is directly connected.
14134 					 * Just copy the ip header (which is
14135 					 * in the first mblk)
14136 					 */
14137 					mp1 = copyb(mp);
14138 					if (mp1 != NULL) {
14139 						icmp_send_redirect(WR(q), mp1,
14140 						    nhop, ipst);
14141 					}
14142 				}
14143 				ire_refrele(nhop_ire);
14144 			}
14145 		}
14146 	}
14147 sendit:
14148 	dev_q = ire->ire_stq->q_next;
14149 	if (DEV_Q_FLOW_BLOCKED(dev_q)) {
14150 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14151 		freemsg(mp);
14152 		return;
14153 	}
14154
14155 	ip_rput_forward(ire, ipha, mp, ill);
14156 	return;
14157
14158 drop_pkt:
14159 	ip2dbg(("ip_rput_process_forward: drop pkt\n"));
14160 	freemsg(mp);
14161 }
14162
14163 ire_t *
14164 ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
14165     ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast)
14166 {
14167 	queue_t *q;
14168 	uint16_t hcksumflags;
14169 	ip_stack_t *ipst = ill->ill_ipst;
14170
14171 	q = *qp;
14172
14173 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
14174
14175 	/*
14176 	 * Clear the indication that this may have hardware
14177 	 * checksum as we are not using it for forwarding.
14178 	 */
14179 	hcksumflags = DB_CKSUMFLAGS(mp);
14180 	DB_CKSUMFLAGS(mp) = 0;
14181
14182 	/*
14183 	 * Directed broadcast forwarding: if the packet came in over one
14184 	 * interface and is routed out over another, we can forward it.
14185 	 */
14186 	if (ipha->ipha_protocol == IPPROTO_TCP) {
14187 		ire_refrele(ire);
14188 		freemsg(mp);
14189 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14190 		return (NULL);
14191 	}
14192 	/*
14193 	 * For multicast we have set dst to be INADDR_BROADCAST
14194 	 * for delivering to all STREAMS.
14195 	 */
14196 	if (!CLASSD(ipha->ipha_dst)) {
14197 		ire_t *new_ire;
14198 		ipif_t *ipif;
14199
14200 		ipif = ipif_get_next_ipif(NULL, ill);
14201 		if (ipif == NULL) {
14202 discard:		ire_refrele(ire);
14203 			freemsg(mp);
14204 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14205 			return (NULL);
14206 		}
14207 		new_ire = ire_ctable_lookup(dst, 0, 0,
14208 		    ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst);
14209 		ipif_refrele(ipif);
14210
14211 		if (new_ire != NULL) {
14212 			/*
14213 			 * If the matching IRE_BROADCAST is part of an IPMP
14214 			 * group, then drop the packet unless our ill has been
14215 			 * nominated to receive for the group.
14216 			 */
14217 			if (IS_IPMP(new_ire->ire_ipif->ipif_ill) &&
14218 			    new_ire->ire_rfq != q) {
14219 				ire_refrele(new_ire);
14220 				goto discard;
14221 			}
14222
14223 			/*
14224 			 * In the special case of multirouted broadcast
14225 			 * packets, we unconditionally need to "gateway"
14226 			 * them to the appropriate interface here.
14227 			 * In the normal case, this cannot happen, because
14228 			 * there is no broadcast IRE tagged with the
14229 			 * RTF_MULTIRT flag.
14230 */
14231 			if (new_ire->ire_flags & RTF_MULTIRT) {
14232 				ire_refrele(new_ire);
14233 				if (ire->ire_rfq != NULL) {
14234 					q = ire->ire_rfq;
14235 					*qp = q;
14236 				}
14237 			} else {
14238 				ire_refrele(ire);
14239 				ire = new_ire;
14240 			}
14241 		} else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) {
14242 			if (!ipst->ips_ip_g_forward_directed_bcast) {
14243 				/*
14244 				 * Free the message if
14245 				 * ip_g_forward_directed_bcast is turned
14246 				 * off for non-local broadcast.
14247 				 */
14248 				ire_refrele(ire);
14249 				freemsg(mp);
14250 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14251 				return (NULL);
14252 			}
14253 		} else {
14254 			/*
14255 			 * This CGTP packet successfully passed the
14256 			 * CGTP filter, but the related CGTP
14257 			 * broadcast IRE has not been found,
14258 			 * meaning that the redundant ipif is
14259 			 * probably down. However, if we discarded
14260 			 * this packet, its duplicate would be
14261 			 * filtered out by the CGTP filter so none
14262 			 * of them would get through. So we keep
14263 			 * going with this one.
14264 			 */
14265 			ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM);
14266 			if (ire->ire_rfq != NULL) {
14267 				q = ire->ire_rfq;
14268 				*qp = q;
14269 			}
14270 		}
14271 	}
14272 	if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) {
14273 		/*
14274 		 * Verify that there is not more than one
14275 		 * IRE_BROADCAST with this broadcast address that
14276 		 * has ire_stq set.
14277 		 * TODO: simplify, loop over all IREs
14278 		 */
14279 		ire_t *ire1;
14280 		int num_stq = 0;
14281 		mblk_t *mp1;
14282
14283 		/* Find the first one with ire_stq set */
14284 		rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
14285 		for (ire1 = ire; ire1 &&
14286 		    !ire1->ire_stq && ire1->ire_addr == ire->ire_addr;
14287 		    ire1 = ire1->ire_next)
14288 			;
14289 		if (ire1) {
14290 			ire_refrele(ire);
14291 			ire = ire1;
14292 			IRE_REFHOLD(ire);
14293 		}
14294
14295 		/* Check if there are additional ones with stq set */
14296 		for (ire1 = ire; ire1; ire1 = ire1->ire_next) {
14297 			if (ire->ire_addr != ire1->ire_addr)
14298 				break;
14299 			if (ire1->ire_stq) {
14300 				num_stq++;
14301 				break;
14302 			}
14303 		}
14304 		rw_exit(&ire->ire_bucket->irb_lock);
14305 		if (num_stq == 1 && ire->ire_stq != NULL) {
14306 			ip1dbg(("ip_rput_process_broadcast: directed "
14307 			    "broadcast to 0x%x\n",
14308 			    ntohl(ire->ire_addr)));
14309 			mp1 = copymsg(mp);
14310 			if (mp1) {
14311 				switch (ipha->ipha_protocol) {
14312 				case IPPROTO_UDP:
14313 					ip_udp_input(q, mp1, ipha, ire, ill);
14314 					break;
14315 				default:
14316 					ip_proto_input(q, mp1, ipha, ire, ill,
14317 					    0);
14318 					break;
14319 				}
14320 			}
14321 			/*
14322 			 * Adjust ttl to 2 (1+1); the forward engine
14323 			 * will decrement it by one.
14324 */
14325 			if (ip_csum_hdr(ipha)) {
14326 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
14327 				ip2dbg(("ip_rput_broadcast:drop pkt\n"));
14328 				freemsg(mp);
14329 				ire_refrele(ire);
14330 				return (NULL);
14331 			}
14332 			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
14333 			ipha->ipha_hdr_checksum = 0;
14334 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
14335 			ip_rput_process_forward(q, mp, ire, ipha,
14336 			    ill, ll_multicast, B_FALSE);
14337 			ire_refrele(ire);
14338 			return (NULL);
14339 		}
14340 		ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n",
14341 		    ntohl(ire->ire_addr)));
14342 	}
14343
14344 	/* Restore any hardware checksum flags */
14345 	DB_CKSUMFLAGS(mp) = hcksumflags;
14346 	return (ire);
14347 }
14348
14349 /* ARGSUSED */
14350 static boolean_t
14351 ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
14352     int *ll_multicast, ipaddr_t *dstp)
14353 {
14354 	ip_stack_t *ipst = ill->ill_ipst;
14355
14356 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
14357 	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets,
14358 	    ntohs(ipha->ipha_length));
14359
14360 	/*
14361 	 * So that we don't end up with dups, only one ill in an IPMP group is
14362 	 * nominated to receive multicast traffic.
14363 	 */
14364 	if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast)
14365 		goto drop_pkt;
14366
14367 	/*
14368 	 * Forward packets only if we have joined the allmulti
14369 	 * group on this interface.
14370 	 */
14371 	if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) {
14372 		int retval;
14373
14374 		/*
14375 		 * Clear the indication that this may have hardware
14376 		 * checksum as we are not using it.
14377 		 */
14378 		DB_CKSUMFLAGS(mp) = 0;
14379 		retval = ip_mforward(ill, ipha, mp);
14380 		/* ip_mforward updates mib variables if needed */
14381 		/* clear b_prev - used by ip_mroute_decap */
14382 		mp->b_prev = NULL;
14383
14384 		switch (retval) {
14385 		case 0:
14386 			/*
14387 			 * pkt is okay and arrived on phyint.
14388 			 *
14389 			 * If we are running as a multicast router
14390 			 * we need to see all IGMP and/or PIM packets.
14391 			 */
14392 			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
14393 			    (ipha->ipha_protocol == IPPROTO_PIM)) {
14394 				goto done;
14395 			}
14396 			break;
14397 		case -1:
14398 			/* pkt is malformed, toss it */
14399 			goto drop_pkt;
14400 		case 1:
14401 			/* pkt is okay and arrived on a tunnel */
14402 			/*
14403 			 * If we are running as a multicast router
14404 			 * we need to see all igmp packets.
14405 			 */
14406 			if (ipha->ipha_protocol == IPPROTO_IGMP) {
14407 				*dstp = INADDR_BROADCAST;
14408 				*ll_multicast = 1;
14409 				return (B_FALSE);
14410 			}
14411
14412 			goto drop_pkt;
14413 		}
14414 	}
14415
14416 	if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
14417 		/*
14418 		 * This might just be caused by the fact that
14419 		 * multiple IP Multicast addresses map to the same
14420 		 * link layer multicast - no need to increment counter!
14421 		 */
14422 		freemsg(mp);
14423 		return (B_TRUE);
14424 	}
14425 done:
14426 	ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
14427 	/*
14428 	 * This assumes that we deliver to all streams for multicast
14429 	 * and broadcast packets.
14430 	 */
14431 	*dstp = INADDR_BROADCAST;
14432 	*ll_multicast = 1;
14433 	return (B_FALSE);
14434 drop_pkt:
14435 	ip2dbg(("ip_rput: drop pkt\n"));
14436 	freemsg(mp);
14437 	return (B_TRUE);
14438 }
14439
14440 /*
14441  * This function returns an indication of whether or not the packet
14442  * received is a non-unicast packet (by way of the DL_UNITDATA_IND)
14443  * and, in doing so, determines whether it is broadcast vs multicast.
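 * Editor's sketch: a hypothetical caller would classify a received
 * DL_UNITDATA_IND like this (fragment only, not part of this file):
 */
#if 0	/* illustrative sketch only; not compiled */
	switch (ip_get_dlpi_mbcast(ill, mb)) {
	case HPE_BROADCAST:
		/* link-layer destination matched ill_bcast_mp */
		break;
	case HPE_MULTICAST:
		/* group address set, but not the broadcast address */
		break;
	default:
		/* 0: ordinary unicast */
		break;
	}
#endif
/*
 * The test itself works as follows.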
14444 * For it to be a broadcast packet, we must have the appropriate mblk_t 14445 * hanging off the ill_t. If this is either not present or doesn't match 14446 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed 14447 * to be multicast. Thus NICs that have no broadcast address (or no 14448 * capability for one, such as point to point links) cannot return as 14449 * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as 14450 * the return values simplifies the current use of the return value of this 14451 * function, which is to pass through the multicast/broadcast characteristic 14452 * to consumers of the netinfo/pfhooks API. While this is not cast in stone, 14453 * changing the return value to some other symbol demands the appropriate 14454 * "translation" when hpe_flags is set prior to calling hook_run() for 14455 * packet events. 14456 */ 14457 int 14458 ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) 14459 { 14460 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; 14461 mblk_t *bmp; 14462 14463 if (ind->dl_group_address) { 14464 if (ind->dl_dest_addr_offset > sizeof (*ind) && 14465 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < 14466 MBLKL(mb) && 14467 (bmp = ill->ill_bcast_mp) != NULL) { 14468 dl_unitdata_req_t *dlur; 14469 uint8_t *bphys_addr; 14470 14471 dlur = (dl_unitdata_req_t *)bmp->b_rptr; 14472 if (ill->ill_sap_length < 0) 14473 bphys_addr = (uchar_t *)dlur + 14474 dlur->dl_dest_addr_offset; 14475 else 14476 bphys_addr = (uchar_t *)dlur + 14477 dlur->dl_dest_addr_offset + 14478 ill->ill_sap_length; 14479 14480 if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, 14481 bphys_addr, ind->dl_dest_addr_length) == 0) { 14482 return (HPE_BROADCAST); 14483 } 14484 return (HPE_MULTICAST); 14485 } 14486 return (HPE_MULTICAST); 14487 } 14488 return (0); 14489 } 14490 14491 static boolean_t 14492 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 14493 int *ll_multicast, mblk_t **mpp) 14494 { 14495 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 14496 boolean_t must_copy = B_FALSE; 14497 struct iocblk *iocp; 14498 ipha_t *ipha; 14499 ip_stack_t *ipst = ill->ill_ipst; 14500 14501 #define rptr ((uchar_t *)ipha) 14502 14503 first_mp = *first_mpp; 14504 mp = *mpp; 14505 14506 ASSERT(first_mp == mp); 14507 14508 /* 14509 * if db_ref > 1 then copymsg and free original. Packet may be 14510 * changed and do not want other entity who has a reference to this 14511 * message to trip over the changes. This is a blind change because 14512 * trying to catch all places that might change packet is too 14513 * difficult (since it may be a module above this one) 14514 * 14515 * This corresponds to the non-fast path case. We walk down the full 14516 * chain in this case, and check the db_ref count of all the dblks, 14517 * and do a copymsg if required. It is possible that the db_ref counts 14518 * of the data blocks in the mblk chain can be different. 14519 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 14520 * count of 1, followed by a M_DATA block with a ref count of 2, if 14521 * 'snoop' is running. 
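 * For reference, db_ref counts how many mblks share one data block;
 * an editor's sketch using the standard STREAMS primitives
 * (hypothetical fragment, not part of this file):
 */
#if 0	/* illustrative sketch only; not compiled */
	mblk_t *shared = dupb(mp);	/* shares mp's dblk; db_ref => 2 */
	mblk_t *priv = copyb(mp);	/* fresh dblk; its db_ref is 1 */

	/* A write through mp is visible via shared, but not via priv. */
	freeb(shared);
	freeb(priv);
#endif
/*
 * Walk the chain and take a private copy if any block is shared: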
14522 */ 14523 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 14524 if (mp1->b_datap->db_ref > 1) { 14525 must_copy = B_TRUE; 14526 break; 14527 } 14528 } 14529 14530 if (must_copy) { 14531 mp1 = copymsg(mp); 14532 if (mp1 == NULL) { 14533 for (mp1 = mp; mp1 != NULL; 14534 mp1 = mp1->b_cont) { 14535 mp1->b_next = NULL; 14536 mp1->b_prev = NULL; 14537 } 14538 freemsg(mp); 14539 if (ill != NULL) { 14540 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14541 } else { 14542 BUMP_MIB(&ipst->ips_ip_mib, 14543 ipIfStatsInDiscards); 14544 } 14545 return (B_TRUE); 14546 } 14547 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 14548 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 14549 /* Copy b_prev - used by ip_mroute_decap */ 14550 to_mp->b_prev = from_mp->b_prev; 14551 from_mp->b_prev = NULL; 14552 } 14553 *first_mpp = first_mp = mp1; 14554 freemsg(mp); 14555 mp = mp1; 14556 *mpp = mp1; 14557 } 14558 14559 ipha = (ipha_t *)mp->b_rptr; 14560 14561 /* 14562 * previous code has a case for M_DATA. 14563 * We want to check how that happens. 14564 */ 14565 ASSERT(first_mp->b_datap->db_type != M_DATA); 14566 switch (first_mp->b_datap->db_type) { 14567 case M_PROTO: 14568 case M_PCPROTO: 14569 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 14570 DL_UNITDATA_IND) { 14571 /* Go handle anything other than data elsewhere. */ 14572 ip_rput_dlpi(q, mp); 14573 return (B_TRUE); 14574 } 14575 14576 *ll_multicast = ip_get_dlpi_mbcast(ill, mp); 14577 /* Ditch the DLPI header. */ 14578 mp1 = mp->b_cont; 14579 ASSERT(first_mp == mp); 14580 *first_mpp = mp1; 14581 freeb(mp); 14582 *mpp = mp1; 14583 return (B_FALSE); 14584 case M_IOCACK: 14585 ip1dbg(("got iocack ")); 14586 iocp = (struct iocblk *)mp->b_rptr; 14587 switch (iocp->ioc_cmd) { 14588 case DL_IOC_HDR_INFO: 14589 ill = (ill_t *)q->q_ptr; 14590 ill_fastpath_ack(ill, mp); 14591 return (B_TRUE); 14592 default: 14593 putnext(q, mp); 14594 return (B_TRUE); 14595 } 14596 /* FALLTHRU */ 14597 case M_ERROR: 14598 case M_HANGUP: 14599 /* 14600 * Since this is on the ill stream we unconditionally 14601 * bump up the refcount 14602 */ 14603 ill_refhold(ill); 14604 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14605 return (B_TRUE); 14606 case M_CTL: 14607 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 14608 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 14609 IPHADA_M_CTL)) { 14610 /* 14611 * It's an IPsec accelerated packet. 14612 * Make sure that the ill from which we received the 14613 * packet has enabled IPsec hardware acceleration. 14614 */ 14615 if (!(ill->ill_capabilities & 14616 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 14617 /* IPsec kstats: bean counter */ 14618 freemsg(mp); 14619 return (B_TRUE); 14620 } 14621 14622 /* 14623 * Make mp point to the mblk following the M_CTL, 14624 * then process according to type of mp. 14625 * After this processing, first_mp will point to 14626 * the data-attributes and mp to the pkt following 14627 * the M_CTL. 14628 */ 14629 mp = first_mp->b_cont; 14630 if (mp == NULL) { 14631 freemsg(first_mp); 14632 return (B_TRUE); 14633 } 14634 /* 14635 * A Hardware Accelerated packet can only be M_DATA 14636 * ESP or AH packet. 
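 * Anything else is dropped in its entirety below: first the db_type is
 * checked, and then the IP protocol number is verified to be AH or ESP.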
14637 */ 14638 if (mp->b_datap->db_type != M_DATA) { 14639 /* non-M_DATA IPsec accelerated packet */ 14640 IPSECHW_DEBUG(IPSECHW_PKT, 14641 ("non-M_DATA IPsec accelerated pkt\n")); 14642 freemsg(first_mp); 14643 return (B_TRUE); 14644 } 14645 ipha = (ipha_t *)mp->b_rptr; 14646 if (ipha->ipha_protocol != IPPROTO_AH && 14647 ipha->ipha_protocol != IPPROTO_ESP) { 14648 IPSECHW_DEBUG(IPSECHW_PKT, 14649 ("non-M_DATA IPsec accelerated pkt\n")); 14650 freemsg(first_mp); 14651 return (B_TRUE); 14652 } 14653 *mpp = mp; 14654 return (B_FALSE); 14655 } 14656 putnext(q, mp); 14657 return (B_TRUE); 14658 case M_IOCNAK: 14659 ip1dbg(("got iocnak ")); 14660 iocp = (struct iocblk *)mp->b_rptr; 14661 switch (iocp->ioc_cmd) { 14662 case DL_IOC_HDR_INFO: 14663 ip_rput_other(NULL, q, mp, NULL); 14664 return (B_TRUE); 14665 default: 14666 break; 14667 } 14668 /* FALLTHRU */ 14669 default: 14670 putnext(q, mp); 14671 return (B_TRUE); 14672 } 14673 } 14674 14675 /* Read side put procedure. Packets coming from the wire arrive here. */ 14676 void 14677 ip_rput(queue_t *q, mblk_t *mp) 14678 { 14679 ill_t *ill; 14680 union DL_primitives *dl; 14681 14682 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); 14683 14684 ill = (ill_t *)q->q_ptr; 14685 14686 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { 14687 /* 14688 * If things are opening or closing, only accept high-priority 14689 * DLPI messages. (On open ill->ill_ipif has not yet been 14690 * created; on close, things hanging off the ill may have been 14691 * freed already.) 14692 */ 14693 dl = (union DL_primitives *)mp->b_rptr; 14694 if (DB_TYPE(mp) != M_PCPROTO || 14695 dl->dl_primitive == DL_UNITDATA_IND) { 14696 inet_freemsg(mp); 14697 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14698 "ip_rput_end: q %p (%S)", q, "uninit"); 14699 return; 14700 } 14701 } 14702 14703 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14704 "ip_rput_end: q %p (%S)", q, "end"); 14705 14706 ip_input(ill, NULL, mp, NULL); 14707 } 14708 14709 static mblk_t * 14710 ip_fix_dbref(ill_t *ill, mblk_t *mp) 14711 { 14712 mblk_t *mp1; 14713 boolean_t adjusted = B_FALSE; 14714 ip_stack_t *ipst = ill->ill_ipst; 14715 14716 IP_STAT(ipst, ip_db_ref); 14717 /* 14718 * The IP_RECVSLLA option depends on having the 14719 * link layer header. First check that: 14720 * a> the underlying device is of type ether, 14721 * since this option is currently supported only 14722 * over ethernet. 14723 * b> there is enough room to copy over the link 14724 * layer header. 14725 * 14726 * Once the checks are done, adjust rptr so that 14727 * the link layer header will be copied via 14728 * copymsg. Note that, IFT_ETHER may be returned 14729 * by some non-ethernet drivers but in this case 14730 * the second check will fail. 14731 */ 14732 if (ill->ill_type == IFT_ETHER && 14733 (mp->b_rptr - mp->b_datap->db_base) >= 14734 sizeof (struct ether_header)) { 14735 mp->b_rptr -= sizeof (struct ether_header); 14736 adjusted = B_TRUE; 14737 } 14738 mp1 = copymsg(mp); 14739 14740 if (mp1 == NULL) { 14741 mp->b_next = NULL; 14742 /* clear b_prev - used by ip_mroute_decap */ 14743 mp->b_prev = NULL; 14744 freemsg(mp); 14745 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14746 return (NULL); 14747 } 14748 14749 if (adjusted) { 14750 /* 14751 * Copy is done. 
Restore the pointer in 14752 * the _new_ mblk 14753 */ 14754 mp1->b_rptr += sizeof (struct ether_header); 14755 } 14756 14757 /* Copy b_prev - used by ip_mroute_decap */ 14758 mp1->b_prev = mp->b_prev; 14759 mp->b_prev = NULL; 14760 14761 /* preserve the hardware checksum flags and data, if present */ 14762 if (DB_CKSUMFLAGS(mp) != 0) { 14763 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 14764 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 14765 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 14766 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 14767 DB_CKSUM16(mp1) = DB_CKSUM16(mp); 14768 } 14769 14770 freemsg(mp); 14771 return (mp1); 14772 } 14773 14774 #define ADD_TO_CHAIN(head, tail, cnt, mp) { \ 14775 if (tail != NULL) \ 14776 tail->b_next = mp; \ 14777 else \ 14778 head = mp; \ 14779 tail = mp; \ 14780 cnt++; \ 14781 } 14782 14783 /* 14784 * Direct read side procedure capable of dealing with chains. GLDv3 based 14785 * drivers call this function directly with mblk chains while STREAMS 14786 * read side procedure ip_rput() calls this for single packet with ip_ring 14787 * set to NULL to process one packet at a time. 14788 * 14789 * The ill will always be valid if this function is called directly from 14790 * the driver. 14791 * 14792 * If ip_input() is called from GLDv3: 14793 * 14794 * - This must be a non-VLAN IP stream. 14795 * - 'mp' is either an untagged or a special priority-tagged packet. 14796 * - Any VLAN tag that was in the MAC header has been stripped. 14797 * 14798 * If the IP header in packet is not 32-bit aligned, every message in the 14799 * chain will be aligned before further operations. This is required on SPARC 14800 * platform. 14801 */ 14802 /* ARGSUSED */ 14803 void 14804 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 14805 struct mac_header_info_s *mhip) 14806 { 14807 ipaddr_t dst = NULL; 14808 ipaddr_t prev_dst; 14809 ire_t *ire = NULL; 14810 ipha_t *ipha; 14811 uint_t pkt_len; 14812 ssize_t len; 14813 uint_t opt_len; 14814 int ll_multicast; 14815 int cgtp_flt_pkt; 14816 queue_t *q = ill->ill_rq; 14817 squeue_t *curr_sqp = NULL; 14818 mblk_t *head = NULL; 14819 mblk_t *tail = NULL; 14820 mblk_t *first_mp; 14821 int cnt = 0; 14822 ip_stack_t *ipst = ill->ill_ipst; 14823 mblk_t *mp; 14824 mblk_t *dmp; 14825 uint8_t tag; 14826 14827 ASSERT(mp_chain != NULL); 14828 ASSERT(ill != NULL); 14829 14830 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); 14831 14832 tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT; 14833 14834 #define rptr ((uchar_t *)ipha) 14835 14836 while (mp_chain != NULL) { 14837 mp = mp_chain; 14838 mp_chain = mp_chain->b_next; 14839 mp->b_next = NULL; 14840 ll_multicast = 0; 14841 14842 /* 14843 * We do ire caching from one iteration to 14844 * another. In the event the packet chain contains 14845 * all packets from the same dst, this caching saves 14846 * an ire_cache_lookup for each of the succeeding 14847 * packets in a packet chain. 14848 */ 14849 prev_dst = dst; 14850 14851 /* 14852 * if db_ref > 1 then copymsg and free original. Packet 14853 * may be changed and we do not want the other entity 14854 * who has a reference to this message to trip over the 14855 * changes. This is a blind change because trying to 14856 * catch all places that might change the packet is too 14857 * difficult. 14858 * 14859 * This corresponds to the fast path case, where we have 14860 * a chain of M_DATA mblks. We check the db_ref count 14861 * of only the 1st data block in the mblk chain. 
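 * (Contrast ip_rput_process_notdata() above, which walks the whole chain
 * in the non-fast-path case.)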
There 14862 * doesn't seem to be a reason why a device driver would 14863 * send up data with varying db_ref counts in the mblk 14864 * chain. In any case the Fast path is a private 14865 * interface, and our drivers don't do such a thing. 14866 * Given the above assumption, there is no need to walk 14867 * down the entire mblk chain (which could have a 14868 * potential performance problem) 14869 * 14870 * The "(DB_REF(mp) > 1)" check was moved from ip_rput() 14871 * to here because of exclusive ip stacks and vnics. 14872 * Packets transmitted from exclusive stack over vnic 14873 * can have db_ref > 1 and when it gets looped back to 14874 * another vnic in a different zone, you have ip_input() 14875 * getting dblks with db_ref > 1. So if someone 14876 * complains of TCP performance under this scenario, 14877 * take a serious look here on the impact of copymsg(). 14878 */ 14879 14880 if (DB_REF(mp) > 1) { 14881 if ((mp = ip_fix_dbref(ill, mp)) == NULL) 14882 continue; 14883 } 14884 14885 /* 14886 * Check and align the IP header. 14887 */ 14888 first_mp = mp; 14889 if (DB_TYPE(mp) == M_DATA) { 14890 dmp = mp; 14891 } else if (DB_TYPE(mp) == M_PROTO && 14892 *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { 14893 dmp = mp->b_cont; 14894 } else { 14895 dmp = NULL; 14896 } 14897 if (dmp != NULL) { 14898 /* 14899 * IP header ptr not aligned? 14900 * OR IP header not complete in first mblk 14901 */ 14902 if (!OK_32PTR(dmp->b_rptr) || 14903 MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { 14904 if (!ip_check_and_align_header(q, dmp, ipst)) 14905 continue; 14906 } 14907 } 14908 14909 /* 14910 * ip_input fast path 14911 */ 14912 14913 /* mblk type is not M_DATA */ 14914 if (DB_TYPE(mp) != M_DATA) { 14915 if (ip_rput_process_notdata(q, &first_mp, ill, 14916 &ll_multicast, &mp)) 14917 continue; 14918 14919 /* 14920 * The only way we can get here is if we had a 14921 * packet that was either a DL_UNITDATA_IND or 14922 * an M_CTL for an IPsec accelerated packet. 14923 * 14924 * In either case, the first_mp will point to 14925 * the leading M_PROTO or M_CTL. 14926 */ 14927 ASSERT(first_mp != NULL); 14928 } else if (mhip != NULL) { 14929 /* 14930 * ll_multicast is set here so that it is ready 14931 * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast 14932 * manipulates ll_multicast in the same fashion when 14933 * called from ip_rput_process_notdata. 14934 */ 14935 switch (mhip->mhi_dsttype) { 14936 case MAC_ADDRTYPE_MULTICAST : 14937 ll_multicast = HPE_MULTICAST; 14938 break; 14939 case MAC_ADDRTYPE_BROADCAST : 14940 ll_multicast = HPE_BROADCAST; 14941 break; 14942 default : 14943 break; 14944 } 14945 } 14946 14947 /* Only M_DATA can come here and it is always aligned */ 14948 ASSERT(DB_TYPE(mp) == M_DATA); 14949 ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); 14950 14951 ipha = (ipha_t *)mp->b_rptr; 14952 len = mp->b_wptr - rptr; 14953 pkt_len = ntohs(ipha->ipha_length); 14954 14955 /* 14956 * We must count all incoming packets, even if they end 14957 * up being dropped later on. 14958 */ 14959 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 14960 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); 14961 14962 /* multiple mblk or too short */ 14963 len -= pkt_len; 14964 if (len != 0) { 14965 /* 14966 * Make sure we have data length consistent 14967 * with the IP header. 
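 * At this point len is (bytes in the first mblk) - ipha_length, so a
 * positive len means trailing bytes to trim (e.g. link-layer padding)
 * and a negative len means the message must either be completed by
 * b_cont data or dropped as truncated.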
14968 */ 14969 if (mp->b_cont == NULL) { 14970 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 14971 BUMP_MIB(ill->ill_ip_mib, 14972 ipIfStatsInHdrErrors); 14973 ip2dbg(("ip_input: drop pkt\n")); 14974 freemsg(mp); 14975 continue; 14976 } 14977 mp->b_wptr = rptr + pkt_len; 14978 } else if ((len += msgdsize(mp->b_cont)) != 0) { 14979 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 14980 BUMP_MIB(ill->ill_ip_mib, 14981 ipIfStatsInHdrErrors); 14982 ip2dbg(("ip_input: drop pkt\n")); 14983 freemsg(mp); 14984 continue; 14985 } 14986 (void) adjmsg(mp, -len); 14987 /* 14988 * adjmsg may have freed an mblk from the chain, 14989 * hence invalidate any hw checksum here. This 14990 * will force IP to calculate the checksum in 14991 * sw, but only for this packet. 14992 */ 14993 DB_CKSUMFLAGS(mp) = 0; 14994 IP_STAT(ipst, ip_multimblk3); 14995 } 14996 } 14997 14998 /* Obtain the dst of the current packet */ 14999 dst = ipha->ipha_dst; 15000 15001 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, 15002 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, 15003 ipha, ip6_t *, NULL, int, 0); 15004 15005 /* 15006 * The following test for loopback is faster than 15007 * IP_LOOPBACK_ADDR(), because it avoids any bitwise 15008 * operations. 15009 * Note that these addresses are always in network byte order 15010 */ 15011 if (((*(uchar_t *)&ipha->ipha_dst) == 127) || 15012 ((*(uchar_t *)&ipha->ipha_src) == 127)) { 15013 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 15014 freemsg(mp); 15015 continue; 15016 } 15017 15018 /* 15019 * The event for packets being received from a 'physical' 15020 * interface is placed after validation of the source and/or 15021 * destination address as being local so that packets can be 15022 * redirected to loopback addresses using ipnat. 15023 */ 15024 DTRACE_PROBE4(ip4__physical__in__start, 15025 ill_t *, ill, ill_t *, NULL, 15026 ipha_t *, ipha, mblk_t *, first_mp); 15027 15028 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15029 ipst->ips_ipv4firewall_physical_in, 15030 ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); 15031 15032 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); 15033 15034 if (first_mp == NULL) { 15035 continue; 15036 } 15037 dst = ipha->ipha_dst; 15038 /* 15039 * Attach any necessary label information to 15040 * this packet 15041 */ 15042 if (is_system_labeled() && 15043 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 15044 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 15045 freemsg(mp); 15046 continue; 15047 } 15048 15049 if (ipst->ips_ipobs_enabled) { 15050 zoneid_t dzone; 15051 15052 /* 15053 * On the inbound path the src zone will be unknown as 15054 * this packet has come from the wire. 15055 */ 15056 dzone = ip_get_zoneid_v4(dst, mp, ipst, ALL_ZONES); 15057 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, 15058 ill, IPV4_VERSION, 0, ipst); 15059 } 15060 15061 /* 15062 * Reuse the cached ire only if the ipha_dst of the previous 15063 * packet is the same as the current packet AND it is not 15064 * INADDR_ANY. 15065 */ 15066 if (!(dst == prev_dst && dst != INADDR_ANY) && 15067 (ire != NULL)) { 15068 ire_refrele(ire); 15069 ire = NULL; 15070 } 15071 15072 opt_len = ipha->ipha_version_and_hdr_length - 15073 IP_SIMPLE_HDR_VERSION; 15074 15075 /* 15076 * Check to see if we can take the fastpath. 
15077 * That is possible if the following conditions are met:
15078 * o Tsol disabled
15079 * o CGTP disabled
15080 * o ipp_action_count is 0
15081 * o no options in the packet
15082 * o not an RSVP packet
15083 * o not a multicast packet
15084 * o ill not in IP_DHCPINIT_IF mode
15085 */
15086 if (!is_system_labeled() &&
15087 !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 &&
15088 opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP &&
15089 !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) {
15090 if (ire == NULL)
15091 ire = ire_cache_lookup_simple(dst, ipst);
15092 /*
15093 * Unless forwarding is enabled, don't call
15094 * ip_fast_forward(); the incoming packet may be for forwarding.
15095 */
15096 if ((ill->ill_flags & ILLF_ROUTER) &&
15097 (ire == NULL || (ire->ire_type & IRE_CACHE))) {
15098 ire = ip_fast_forward(ire, dst, ill, mp);
15099 continue;
15100 }
15101 /* incoming packet is for local consumption */
15102 if ((ire != NULL) && (ire->ire_type & IRE_LOCAL))
15103 goto local;
15104 }
15105
15106 /*
15107 * Disable ire caching for anything more complex
15108 * than the simple fast path case we checked for above.
15109 */
15110 if (ire != NULL) {
15111 ire_refrele(ire);
15112 ire = NULL;
15113 }
15114
15115 /*
15116 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
15117 * server to unicast DHCP packets to a DHCP client using the
15118 * IP address it is offering to the client. This can be
15119 * disabled through the "broadcast bit", but not all DHCP
15120 * servers honor that bit. Therefore, to interoperate with as
15121 * many DHCP servers as possible, the DHCP client allows the
15122 * server to unicast, but we treat those packets as broadcast
15123 * here. Note that we don't rewrite the packet itself since
15124 * (a) that would mess up the checksums and (b) the DHCP
15125 * client conn is bound to INADDR_ANY so ip_fanout_udp() will
15126 * hand it the packet regardless.
15127 */
15128 if (ill->ill_dhcpinit != 0 &&
15129 IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP &&
15130 pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) {
15131 udpha_t *udpha;
15132
15133 /*
15134 * Reload ipha since pullupmsg() can change b_rptr.
15135 */
15136 ipha = (ipha_t *)mp->b_rptr;
15137 udpha = (udpha_t *)&ipha[1];
15138
15139 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
15140 DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
15141 mblk_t *, mp);
15142 dst = INADDR_BROADCAST;
15143 }
15144 }
15145
15146 /* Full-blown slow path */
15147 if (opt_len != 0) {
15148 if (len != 0)
15149 IP_STAT(ipst, ip_multimblk4);
15150 else
15151 IP_STAT(ipst, ip_ipoptions);
15152 if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha,
15153 &dst, ipst))
15154 continue;
15155 }
15156
15157 /*
15158 * Invoke the CGTP (multirouting) filtering module to process
15159 * the incoming packet. Packets identified as duplicates
15160 * must be discarded. Filtering is active only if
15161 * the ip_cgtp_filter ndd variable is non-zero.
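 * (With CGTP multirouting, copies of a packet can arrive over redundant
 * paths; cfo_filter() is what identifies the duplicate copies so that
 * only one copy is processed further.)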
15162 */ 15163 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 15164 if (ipst->ips_ip_cgtp_filter && 15165 ipst->ips_ip_cgtp_filter_ops != NULL) { 15166 netstackid_t stackid; 15167 15168 stackid = ipst->ips_netstack->netstack_stackid; 15169 cgtp_flt_pkt = 15170 ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 15171 ill->ill_phyint->phyint_ifindex, mp); 15172 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 15173 freemsg(first_mp); 15174 continue; 15175 } 15176 } 15177 15178 /* 15179 * If rsvpd is running, let RSVP daemon handle its processing 15180 * and forwarding of RSVP multicast/unicast packets. 15181 * If rsvpd is not running but mrouted is running, RSVP 15182 * multicast packets are forwarded as multicast traffic 15183 * and RSVP unicast packets are forwarded by unicast router. 15184 * If neither rsvpd nor mrouted is running, RSVP multicast 15185 * packets are not forwarded, but the unicast packets are 15186 * forwarded like unicast traffic. 15187 */ 15188 if (ipha->ipha_protocol == IPPROTO_RSVP && 15189 ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != 15190 NULL) { 15191 /* RSVP packet and rsvpd running. Treat as ours */ 15192 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 15193 /* 15194 * This assumes that we deliver to all streams for 15195 * multicast and broadcast packets. 15196 * We have to force ll_multicast to 1 to handle the 15197 * M_DATA messages passed in from ip_mroute_decap. 15198 */ 15199 dst = INADDR_BROADCAST; 15200 ll_multicast = 1; 15201 } else if (CLASSD(dst)) { 15202 /* packet is multicast */ 15203 mp->b_next = NULL; 15204 if (ip_rput_process_multicast(q, mp, ill, ipha, 15205 &ll_multicast, &dst)) 15206 continue; 15207 } 15208 15209 if (ire == NULL) { 15210 ire = ire_cache_lookup(dst, ALL_ZONES, 15211 msg_getlabel(mp), ipst); 15212 } 15213 15214 if (ire != NULL && ire->ire_stq != NULL && 15215 ire->ire_zoneid != GLOBAL_ZONEID && 15216 ire->ire_zoneid != ALL_ZONES) { 15217 /* 15218 * Should only use IREs that are visible from the 15219 * global zone for forwarding. 15220 */ 15221 ire_refrele(ire); 15222 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, 15223 msg_getlabel(mp), ipst); 15224 } 15225 15226 if (ire == NULL) { 15227 /* 15228 * No IRE for this destination, so it can't be for us. 15229 * Unless we are forwarding, drop the packet. 15230 * We have to let source routed packets through 15231 * since we don't yet know if they are 'ping -l' 15232 * packets i.e. if they will go out over the 15233 * same interface as they came in on. 15234 */ 15235 ire = ip_rput_noire(q, mp, ll_multicast, dst); 15236 if (ire == NULL) 15237 continue; 15238 } 15239 15240 /* 15241 * Broadcast IRE may indicate either broadcast or 15242 * multicast packet 15243 */ 15244 if (ire->ire_type == IRE_BROADCAST) { 15245 /* 15246 * Skip broadcast checks if packet is UDP multicast; 15247 * we'd rather not enter ip_rput_process_broadcast() 15248 * unless the packet is broadcast for real, since 15249 * that routine is a no-op for multicast. 15250 */ 15251 if (ipha->ipha_protocol != IPPROTO_UDP || 15252 !CLASSD(ipha->ipha_dst)) { 15253 ire = ip_rput_process_broadcast(&q, mp, 15254 ire, ipha, ill, dst, cgtp_flt_pkt, 15255 ll_multicast); 15256 if (ire == NULL) 15257 continue; 15258 } 15259 } else if (ire->ire_stq != NULL) { 15260 /* fowarding? 
*/
15261 ip_rput_process_forward(q, mp, ire, ipha, ill,
15262 ll_multicast, B_FALSE);
15263 /* ip_rput_process_forward consumed the packet */
15264 continue;
15265 }
15266
15267 local:
15268 /*
15269 * If the queue in the ire is different from the ingress queue
15270 * then we need to check to see if we can accept the packet.
15271 * Note that for multicast packets and broadcast packets sent
15272 * to a broadcast address which is shared between multiple
15273 * interfaces we should not do this since we just got a random
15274 * broadcast ire.
15275 */
15276 if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) {
15277 ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
15278 if (ire == NULL) {
15279 /* Drop packet */
15280 BUMP_MIB(ill->ill_ip_mib,
15281 ipIfStatsForwProhibits);
15282 freemsg(mp);
15283 continue;
15284 }
15285 if (ire->ire_rfq != NULL)
15286 q = ire->ire_rfq;
15287 }
15288
15289 switch (ipha->ipha_protocol) {
15290 case IPPROTO_TCP:
15291 ASSERT(first_mp == mp);
15292 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire,
15293 mp, 0, q, ip_ring)) != NULL) {
15294 if (curr_sqp == NULL) {
15295 curr_sqp = GET_SQUEUE(mp);
15296 ASSERT(cnt == 0);
15297 cnt++;
15298 head = tail = mp;
15299 } else if (curr_sqp == GET_SQUEUE(mp)) {
15300 ASSERT(tail != NULL);
15301 cnt++;
15302 tail->b_next = mp;
15303 tail = mp;
15304 } else {
15305 /*
15306 * A different squeue. Send the
15307 * chain for the previous squeue on
15308 * its way. This shouldn't happen
15309 * often unless interrupt binding
15310 * changes.
15311 */
15312 IP_STAT(ipst, ip_input_multi_squeue);
15313 SQUEUE_ENTER(curr_sqp, head,
15314 tail, cnt, SQ_PROCESS, tag);
15315 curr_sqp = GET_SQUEUE(mp);
15316 head = mp;
15317 tail = mp;
15318 cnt = 1;
15319 }
15320 }
15321 continue;
15322 case IPPROTO_UDP:
15323 ASSERT(first_mp == mp);
15324 ip_udp_input(q, mp, ipha, ire, ill);
15325 continue;
15326 case IPPROTO_SCTP:
15327 ASSERT(first_mp == mp);
15328 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0,
15329 q, dst);
15330 /* ire has been released by ip_sctp_input */
15331 ire = NULL;
15332 continue;
15333 case IPPROTO_ENCAP:
15334 case IPPROTO_IPV6:
15335 ASSERT(first_mp == mp);
15336 if (ip_iptun_input(NULL, mp, ipha, ill, ire, ipst))
15337 break;
15338 /*
15339 * If there was no IP tunnel data-link bound to
15340 * receive this packet, then we fall through to
15341 * allow potential raw sockets bound to either of
15342 * these protocols to pick it up.
15343 */
15344 default:
15345 ip_proto_input(q, first_mp, ipha, ire, ill, 0);
15346 continue;
15347 }
15348 }
15349
15350 if (ire != NULL)
15351 ire_refrele(ire);
15352
15353 if (head != NULL)
15354 SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag);
15355
15356 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
15357 "ip_input_end: q %p (%S)", q, "end");
15358 #undef rptr
15359 }
15360
15361 /*
15362 * ip_accept_tcp() - This function is called by the squeue when it retrieves
15363 * a chain of packets in the poll mode. The packets have gone through the
15364 * data link processing but not IP processing. For performance and latency
15365 * reasons, the squeue wants to process the chain in line instead of feeding
15366 * it back via the ip_input path.
15367 *
15368 * So this is a lightweight function which checks to see if the packets
15369 * retrieved are indeed TCP packets (the TCP squeue always polls the TCP
15370 * soft ring but we still do the paranoid check) meant for the local
15371 * machine and we don't have labels etc enabled.
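 * (Chaining detail: ahead/atail/acnt track the accepted packets that are
 * handed back to the squeue, while uhead/utail/ucnt collect everything
 * else, which is fed to ip_input() before this function returns.)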
Packets that meet the criteria are returned to
15372 * the squeue and processed inline while the rest go via the ip_input path.
15373 */
15374 /*ARGSUSED*/
15375 mblk_t *
15376 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
15377 mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
15378 {
15379 mblk_t *mp;
15380 ipaddr_t dst = NULL;
15381 ipaddr_t prev_dst;
15382 ire_t *ire = NULL;
15383 ipha_t *ipha;
15384 uint_t pkt_len;
15385 ssize_t len;
15386 uint_t opt_len;
15387 queue_t *q = ill->ill_rq;
15388 squeue_t *curr_sqp;
15389 mblk_t *ahead = NULL; /* Accepted head */
15390 mblk_t *atail = NULL; /* Accepted tail */
15391 uint_t acnt = 0; /* Accepted count */
15392 mblk_t *utail = NULL; /* Unaccepted tail */
15393 mblk_t *uhead = NULL; /* Unaccepted head */
15394 uint_t ucnt = 0; /* Unaccepted cnt */
15395 ip_stack_t *ipst = ill->ill_ipst;
15396
15397 *cnt = 0;
15398
15399 ASSERT(ill != NULL);
15400 ASSERT(ip_ring != NULL);
15401
15402 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q);
15403
15404 #define rptr ((uchar_t *)ipha)
15405
15406 while (mp_chain != NULL) {
15407 mp = mp_chain;
15408 mp_chain = mp_chain->b_next;
15409 mp->b_next = NULL;
15410
15411 /*
15412 * We do ire caching from one iteration to
15413 * another. In the event the packet chain contains
15414 * all packets from the same dst, this caching saves
15415 * an ire_cache_lookup for each of the succeeding
15416 * packets in a packet chain.
15417 */
15418 prev_dst = dst;
15419
15420 ipha = (ipha_t *)mp->b_rptr;
15421 len = mp->b_wptr - rptr;
15422
15423 ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha));
15424
15425 /*
15426 * If it is a non-TCP packet, or doesn't have H/W cksum,
15427 * or doesn't have min len, reject.
15428 */
15429 if ((ipha->ipha_protocol != IPPROTO_TCP) || (len <
15430 (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) {
15431 ADD_TO_CHAIN(uhead, utail, ucnt, mp);
15432 continue;
15433 }
15434
15435 pkt_len = ntohs(ipha->ipha_length);
15436 if (len != pkt_len) {
15437 if (len > pkt_len) {
15438 mp->b_wptr = rptr + pkt_len;
15439 } else {
15440 ADD_TO_CHAIN(uhead, utail, ucnt, mp);
15441 continue;
15442 }
15443 }
15444
15445 opt_len = ipha->ipha_version_and_hdr_length -
15446 IP_SIMPLE_HDR_VERSION;
15447 dst = ipha->ipha_dst;
15448
15449 /* IP version bad or there are IP options */
15450 if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill,
15451 mp, &ipha, &dst, ipst)))
15452 continue;
15453
15454 if (is_system_labeled() || (ill->ill_dhcpinit != 0) ||
15455 (ipst->ips_ip_cgtp_filter &&
15456 ipst->ips_ip_cgtp_filter_ops != NULL)) {
15457 ADD_TO_CHAIN(uhead, utail, ucnt, mp);
15458 continue;
15459 }
15460
15461 /*
15462 * Reuse the cached ire only if the ipha_dst of the previous
15463 * packet is the same as the current packet AND it is not
15464 * INADDR_ANY.
15465 */
15466 if (!(dst == prev_dst && dst != INADDR_ANY) &&
15467 (ire != NULL)) {
15468 ire_refrele(ire);
15469 ire = NULL;
15470 }
15471
15472 if (ire == NULL)
15473 ire = ire_cache_lookup_simple(dst, ipst);
15474
15475 /*
15476 * Unless forwarding is enabled, don't call
15477 * ip_fast_forward().
Incoming packet is for forwarding 15478 */ 15479 if ((ill->ill_flags & ILLF_ROUTER) && 15480 (ire == NULL || (ire->ire_type & IRE_CACHE))) { 15481 15482 DTRACE_PROBE4(ip4__physical__in__start, 15483 ill_t *, ill, ill_t *, NULL, 15484 ipha_t *, ipha, mblk_t *, mp); 15485 15486 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15487 ipst->ips_ipv4firewall_physical_in, 15488 ill, NULL, ipha, mp, mp, 0, ipst); 15489 15490 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); 15491 15492 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 15493 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, 15494 pkt_len); 15495 15496 if (mp != NULL) 15497 ire = ip_fast_forward(ire, dst, ill, mp); 15498 continue; 15499 } 15500 15501 /* incoming packet is for local consumption */ 15502 if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) 15503 goto local_accept; 15504 15505 /* 15506 * Disable ire caching for anything more complex 15507 * than the simple fast path case we checked for above. 15508 */ 15509 if (ire != NULL) { 15510 ire_refrele(ire); 15511 ire = NULL; 15512 } 15513 15514 ire = ire_cache_lookup(dst, ALL_ZONES, msg_getlabel(mp), 15515 ipst); 15516 if (ire == NULL || ire->ire_type == IRE_BROADCAST || 15517 ire->ire_stq != NULL) { 15518 ADD_TO_CHAIN(uhead, utail, ucnt, mp); 15519 if (ire != NULL) { 15520 ire_refrele(ire); 15521 ire = NULL; 15522 } 15523 continue; 15524 } 15525 15526 local_accept: 15527 15528 if (ire->ire_rfq != q) { 15529 ADD_TO_CHAIN(uhead, utail, ucnt, mp); 15530 if (ire != NULL) { 15531 ire_refrele(ire); 15532 ire = NULL; 15533 } 15534 continue; 15535 } 15536 15537 /* 15538 * The event for packets being received from a 'physical' 15539 * interface is placed after validation of the source and/or 15540 * destination address as being local so that packets can be 15541 * redirected to loopback addresses using ipnat. 15542 */ 15543 DTRACE_PROBE4(ip4__physical__in__start, 15544 ill_t *, ill, ill_t *, NULL, 15545 ipha_t *, ipha, mblk_t *, mp); 15546 15547 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15548 ipst->ips_ipv4firewall_physical_in, 15549 ill, NULL, ipha, mp, mp, 0, ipst); 15550 15551 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); 15552 15553 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 15554 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); 15555 15556 if (mp != NULL && 15557 (mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp, 15558 0, q, ip_ring)) != NULL) { 15559 if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) { 15560 ADD_TO_CHAIN(ahead, atail, acnt, mp); 15561 } else { 15562 SQUEUE_ENTER(curr_sqp, mp, mp, 1, 15563 SQ_FILL, SQTAG_IP_INPUT); 15564 } 15565 } 15566 } 15567 15568 if (ire != NULL) 15569 ire_refrele(ire); 15570 15571 if (uhead != NULL) 15572 ip_input(ill, ip_ring, uhead, NULL); 15573 15574 if (ahead != NULL) { 15575 *last = atail; 15576 *cnt = acnt; 15577 return (ahead); 15578 } 15579 15580 return (NULL); 15581 #undef rptr 15582 } 15583 15584 static void 15585 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, 15586 t_uscalar_t err) 15587 { 15588 if (dl_err == DL_SYSERR) { 15589 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15590 "%s: %s failed: DL_SYSERR (errno %u)\n", 15591 ill->ill_name, dl_primstr(prim), err); 15592 return; 15593 } 15594 15595 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15596 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim), 15597 dl_errstr(dl_err)); 15598 } 15599 15600 /* 15601 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other 15602 * than DL_UNITDATA_IND messages. 
If we need to process this message 15603 * exclusively, we call qwriter_ip, in which case we also need to call 15604 * ill_refhold before that, since qwriter_ip does an ill_refrele. 15605 */ 15606 void 15607 ip_rput_dlpi(queue_t *q, mblk_t *mp) 15608 { 15609 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15610 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15611 ill_t *ill = q->q_ptr; 15612 t_uscalar_t prim = dloa->dl_primitive; 15613 t_uscalar_t reqprim = DL_PRIM_INVAL; 15614 15615 ip1dbg(("ip_rput_dlpi")); 15616 15617 /* 15618 * If we received an ACK but didn't send a request for it, then it 15619 * can't be part of any pending operation; discard up-front. 15620 */ 15621 switch (prim) { 15622 case DL_ERROR_ACK: 15623 reqprim = dlea->dl_error_primitive; 15624 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s " 15625 "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim), 15626 reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno, 15627 dlea->dl_unix_errno)); 15628 break; 15629 case DL_OK_ACK: 15630 reqprim = dloa->dl_correct_primitive; 15631 break; 15632 case DL_INFO_ACK: 15633 reqprim = DL_INFO_REQ; 15634 break; 15635 case DL_BIND_ACK: 15636 reqprim = DL_BIND_REQ; 15637 break; 15638 case DL_PHYS_ADDR_ACK: 15639 reqprim = DL_PHYS_ADDR_REQ; 15640 break; 15641 case DL_NOTIFY_ACK: 15642 reqprim = DL_NOTIFY_REQ; 15643 break; 15644 case DL_CONTROL_ACK: 15645 reqprim = DL_CONTROL_REQ; 15646 break; 15647 case DL_CAPABILITY_ACK: 15648 reqprim = DL_CAPABILITY_REQ; 15649 break; 15650 } 15651 15652 if (prim != DL_NOTIFY_IND) { 15653 if (reqprim == DL_PRIM_INVAL || 15654 !ill_dlpi_pending(ill, reqprim)) { 15655 /* Not a DLPI message we support or expected */ 15656 freemsg(mp); 15657 return; 15658 } 15659 ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim), 15660 dl_primstr(reqprim))); 15661 } 15662 15663 switch (reqprim) { 15664 case DL_UNBIND_REQ: 15665 /* 15666 * NOTE: we mark the unbind as complete even if we got a 15667 * DL_ERROR_ACK, since there's not much else we can do. 15668 */ 15669 mutex_enter(&ill->ill_lock); 15670 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 15671 cv_signal(&ill->ill_cv); 15672 mutex_exit(&ill->ill_lock); 15673 break; 15674 15675 case DL_ENABMULTI_REQ: 15676 if (prim == DL_OK_ACK) { 15677 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15678 ill->ill_dlpi_multicast_state = IDS_OK; 15679 } 15680 break; 15681 } 15682 15683 /* 15684 * The message is one we're waiting for (or DL_NOTIFY_IND), but we 15685 * need to become writer to continue to process it. Because an 15686 * exclusive operation doesn't complete until replies to all queued 15687 * DLPI messages have been received, we know we're in the middle of an 15688 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND). 15689 * 15690 * As required by qwriter_ip(), we refhold the ill; it will refrele. 15691 * Since this is on the ill stream we unconditionally bump up the 15692 * refcount without doing ILL_CAN_LOOKUP(). 15693 */ 15694 ill_refhold(ill); 15695 if (prim == DL_NOTIFY_IND) 15696 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE); 15697 else 15698 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE); 15699 } 15700 15701 /* 15702 * Handling of DLPI messages that require exclusive access to the ipsq. 15703 * 15704 * Need to do ill_pending_mp_release on ioctl completion, which could 15705 * happen here. 
(along with mi_copy_done) 15706 */ 15707 /* ARGSUSED */ 15708 static void 15709 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15710 { 15711 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15712 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15713 int err = 0; 15714 ill_t *ill; 15715 ipif_t *ipif = NULL; 15716 mblk_t *mp1 = NULL; 15717 conn_t *connp = NULL; 15718 t_uscalar_t paddrreq; 15719 mblk_t *mp_hw; 15720 boolean_t success; 15721 boolean_t ioctl_aborted = B_FALSE; 15722 boolean_t log = B_TRUE; 15723 ip_stack_t *ipst; 15724 15725 ip1dbg(("ip_rput_dlpi_writer ..")); 15726 ill = (ill_t *)q->q_ptr; 15727 ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); 15728 ASSERT(IAM_WRITER_ILL(ill)); 15729 15730 ipst = ill->ill_ipst; 15731 15732 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 15733 /* 15734 * The current ioctl could have been aborted by the user and a new 15735 * ioctl to bring up another ill could have started. We could still 15736 * get a response from the driver later. 15737 */ 15738 if (ipif != NULL && ipif->ipif_ill != ill) 15739 ioctl_aborted = B_TRUE; 15740 15741 switch (dloa->dl_primitive) { 15742 case DL_ERROR_ACK: 15743 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n", 15744 dl_primstr(dlea->dl_error_primitive))); 15745 15746 switch (dlea->dl_error_primitive) { 15747 case DL_DISABMULTI_REQ: 15748 ill_dlpi_done(ill, dlea->dl_error_primitive); 15749 break; 15750 case DL_PROMISCON_REQ: 15751 case DL_PROMISCOFF_REQ: 15752 case DL_UNBIND_REQ: 15753 case DL_ATTACH_REQ: 15754 case DL_INFO_REQ: 15755 ill_dlpi_done(ill, dlea->dl_error_primitive); 15756 break; 15757 case DL_NOTIFY_REQ: 15758 ill_dlpi_done(ill, DL_NOTIFY_REQ); 15759 log = B_FALSE; 15760 break; 15761 case DL_PHYS_ADDR_REQ: 15762 /* 15763 * For IPv6 only, there are two additional 15764 * phys_addr_req's sent to the driver to get the 15765 * IPv6 token and lla. This allows IP to acquire 15766 * the hardware address format for a given interface 15767 * without having built in knowledge of the hardware 15768 * address. ill_phys_addr_pend keeps track of the last 15769 * DL_PAR sent so we know which response we are 15770 * dealing with. ill_dlpi_done will update 15771 * ill_phys_addr_pend when it sends the next req. 15772 * We don't complete the IOCTL until all three DL_PARs 15773 * have been attempted, so set *_len to 0 and break. 15774 */ 15775 paddrreq = ill->ill_phys_addr_pend; 15776 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 15777 if (paddrreq == DL_IPV6_TOKEN) { 15778 ill->ill_token_length = 0; 15779 log = B_FALSE; 15780 break; 15781 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) { 15782 ill->ill_nd_lla_len = 0; 15783 log = B_FALSE; 15784 break; 15785 } 15786 /* 15787 * Something went wrong with the DL_PHYS_ADDR_REQ. 15788 * We presumably have an IOCTL hanging out waiting 15789 * for completion. Find it and complete the IOCTL 15790 * with the error noted. 15791 * However, ill_dl_phys was called on an ill queue 15792 * (from SIOCSLIFNAME), thus conn_pending_ill is not 15793 * set. But the ioctl is known to be pending on ill_wq. 15794 */ 15795 if (!ill->ill_ifname_pending) 15796 break; 15797 ill->ill_ifname_pending = 0; 15798 if (!ioctl_aborted) 15799 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15800 if (mp1 != NULL) { 15801 /* 15802 * This operation (SIOCSLIFNAME) must have 15803 * happened on the ill. 
Assert there is no conn.
15804 */
15805 ASSERT(connp == NULL);
15806 q = ill->ill_wq;
15807 }
15808 break;
15809 case DL_BIND_REQ:
15810 ill_dlpi_done(ill, DL_BIND_REQ);
15811 if (ill->ill_ifname_pending)
15812 break;
15813 /*
15814 * Something went wrong with the bind. We presumably
15815 * have an IOCTL hanging out waiting for completion.
15816 * Find it, take down the interface that was coming
15817 * up, and complete the IOCTL with the error noted.
15818 */
15819 if (!ioctl_aborted)
15820 mp1 = ipsq_pending_mp_get(ipsq, &connp);
15821 if (mp1 != NULL) {
15822 /*
15823 * This might be a result of a DL_NOTE_REPLUMB
15824 * notification. In that case, connp is NULL.
15825 */
15826 if (connp != NULL)
15827 q = CONNP_TO_WQ(connp);
15828
15829 (void) ipif_down(ipif, NULL, NULL);
15830 /* error is set below the switch */
15831 }
15832 break;
15833 case DL_ENABMULTI_REQ:
15834 ill_dlpi_done(ill, DL_ENABMULTI_REQ);
15835
15836 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
15837 ill->ill_dlpi_multicast_state = IDS_FAILED;
15838 if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
15839 ipif_t *ipif;
15840
15841 printf("ip: joining multicasts failed (%d)"
15842 " on %s - will use link layer "
15843 "broadcasts for multicast\n",
15844 dlea->dl_errno, ill->ill_name);
15845
15846 /*
15847 * Set up the multicast mapping alone.
15848 * We are writer, so ok to access ill->ill_ipif
15849 * without any lock.
15850 */
15851 ipif = ill->ill_ipif;
15852 mutex_enter(&ill->ill_phyint->phyint_lock);
15853 ill->ill_phyint->phyint_flags |=
15854 PHYI_MULTI_BCAST;
15855 mutex_exit(&ill->ill_phyint->phyint_lock);
15856
15857 if (!ill->ill_isv6) {
15858 (void) ipif_arp_setup_multicast(ipif,
15859 NULL);
15860 } else {
15861 (void) ipif_ndp_setup_multicast(ipif,
15862 NULL);
15863 }
15864 }
15865 freemsg(mp); /* Don't want to pass this up */
15866 return;
15867 case DL_CONTROL_REQ:
15868 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
15869 "DL_CONTROL_REQ\n"));
15870 ill_dlpi_done(ill, dlea->dl_error_primitive);
15871 freemsg(mp);
15872 return;
15873 case DL_CAPABILITY_REQ:
15874 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
15875 "DL_CAPABILITY_REQ\n"));
15876 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
15877 ill->ill_dlpi_capab_state = IDCS_FAILED;
15878 ill_capability_done(ill);
15879 freemsg(mp);
15880 return;
15881 }
15882 /*
15883 * Note the error for IOCTL completion (mp1 is set when
15884 * ready to complete ioctl). If ill_ifname_pending_err is
15885 * set, an error occurred during plumbing (ill_ifname_pending),
15886 * so we want to report that error.
15887 *
15888 * NOTE: there are two additional DL_PHYS_ADDR_REQ's
15889 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
15890 * expected to get errack'd if the driver doesn't support
15891 * these flags (e.g. ethernet). log will be set to B_FALSE
15892 * if these error conditions are encountered.
15893 */
15894 if (mp1 != NULL) {
15895 if (ill->ill_ifname_pending_err != 0) {
15896 err = ill->ill_ifname_pending_err;
15897 ill->ill_ifname_pending_err = 0;
15898 } else {
15899 err = dlea->dl_unix_errno ?
15900 dlea->dl_unix_errno : ENXIO;
15901 }
15902 /*
15903 * If we're plumbing an interface and an error hasn't already
15904 * been saved, set ill_ifname_pending_err to the error passed
15905 * up. Ignore the error if log is B_FALSE (see comment above).
15906 */
15907 } else if (log && ill->ill_ifname_pending &&
15908 ill->ill_ifname_pending_err == 0) {
15909 ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
15910 dlea->dl_unix_errno : ENXIO; 15911 } 15912 15913 if (log) 15914 ip_dlpi_error(ill, dlea->dl_error_primitive, 15915 dlea->dl_errno, dlea->dl_unix_errno); 15916 break; 15917 case DL_CAPABILITY_ACK: 15918 ill_capability_ack(ill, mp); 15919 /* 15920 * The message has been handed off to ill_capability_ack 15921 * and must not be freed below 15922 */ 15923 mp = NULL; 15924 break; 15925 15926 case DL_CONTROL_ACK: 15927 /* We treat all of these as "fire and forget" */ 15928 ill_dlpi_done(ill, DL_CONTROL_REQ); 15929 break; 15930 case DL_INFO_ACK: 15931 /* Call a routine to handle this one. */ 15932 ill_dlpi_done(ill, DL_INFO_REQ); 15933 ip_ll_subnet_defaults(ill, mp); 15934 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 15935 return; 15936 case DL_BIND_ACK: 15937 /* 15938 * We should have an IOCTL waiting on this unless 15939 * sent by ill_dl_phys, in which case just return 15940 */ 15941 ill_dlpi_done(ill, DL_BIND_REQ); 15942 if (ill->ill_ifname_pending) 15943 break; 15944 15945 if (!ioctl_aborted) 15946 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15947 if (mp1 == NULL) 15948 break; 15949 /* 15950 * mp1 was added by ill_dl_up(). if that is a result of 15951 * a DL_NOTE_REPLUMB notification, connp could be NULL. 15952 */ 15953 if (connp != NULL) 15954 q = CONNP_TO_WQ(connp); 15955 15956 /* 15957 * We are exclusive. So nothing can change even after 15958 * we get the pending mp. If need be we can put it back 15959 * and restart, as in calling ipif_arp_up() below. 15960 */ 15961 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 15962 15963 mutex_enter(&ill->ill_lock); 15964 ill->ill_dl_up = 1; 15965 ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); 15966 mutex_exit(&ill->ill_lock); 15967 15968 /* 15969 * Now bring up the resolver; when that is complete, we'll 15970 * create IREs. Note that we intentionally mirror what 15971 * ipif_up() would have done, because we got here by way of 15972 * ill_dl_up(), which stopped ipif_up()'s processing. 15973 */ 15974 if (ill->ill_isv6) { 15975 if (ill->ill_flags & ILLF_XRESOLV) { 15976 if (connp != NULL) 15977 mutex_enter(&connp->conn_lock); 15978 mutex_enter(&ill->ill_lock); 15979 success = ipsq_pending_mp_add(connp, ipif, q, 15980 mp1, 0); 15981 mutex_exit(&ill->ill_lock); 15982 if (connp != NULL) 15983 mutex_exit(&connp->conn_lock); 15984 if (success) { 15985 err = ipif_resolver_up(ipif, 15986 Res_act_initial); 15987 if (err == EINPROGRESS) { 15988 freemsg(mp); 15989 return; 15990 } 15991 ASSERT(err != 0); 15992 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15993 ASSERT(mp1 != NULL); 15994 } else { 15995 /* conn has started closing */ 15996 err = EINTR; 15997 } 15998 } else { /* Non XRESOLV interface */ 15999 (void) ipif_resolver_up(ipif, Res_act_initial); 16000 if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) 16001 err = ipif_up_done_v6(ipif); 16002 } 16003 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 16004 /* 16005 * ARP and other v4 external resolvers. 16006 * Leave the pending mblk intact so that 16007 * the ioctl completes in ip_rput(). 
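 * If the conn starts closing before the pending mp can be re-added,
 * ipsq_pending_mp_add() fails and the ioctl is completed with EINTR
 * below.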
16008 */ 16009 if (connp != NULL) 16010 mutex_enter(&connp->conn_lock); 16011 mutex_enter(&ill->ill_lock); 16012 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 16013 mutex_exit(&ill->ill_lock); 16014 if (connp != NULL) 16015 mutex_exit(&connp->conn_lock); 16016 if (success) { 16017 err = ipif_resolver_up(ipif, Res_act_initial); 16018 if (err == EINPROGRESS) { 16019 freemsg(mp); 16020 return; 16021 } 16022 ASSERT(err != 0); 16023 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16024 } else { 16025 /* The conn has started closing */ 16026 err = EINTR; 16027 } 16028 } else { 16029 /* 16030 * This one is complete. Reply to pending ioctl. 16031 */ 16032 (void) ipif_resolver_up(ipif, Res_act_initial); 16033 err = ipif_up_done(ipif); 16034 } 16035 16036 if ((err == 0) && (ill->ill_up_ipifs)) { 16037 err = ill_up_ipifs(ill, q, mp1); 16038 if (err == EINPROGRESS) { 16039 freemsg(mp); 16040 return; 16041 } 16042 } 16043 16044 /* 16045 * If we have a moved ipif to bring up, and everything has 16046 * succeeded to this point, bring it up on the IPMP ill. 16047 * Otherwise, leave it down -- the admin can try to bring it 16048 * up by hand if need be. 16049 */ 16050 if (ill->ill_move_ipif != NULL) { 16051 if (err != 0) { 16052 ill->ill_move_ipif = NULL; 16053 } else { 16054 ipif = ill->ill_move_ipif; 16055 ill->ill_move_ipif = NULL; 16056 err = ipif_up(ipif, q, mp1); 16057 if (err == EINPROGRESS) { 16058 freemsg(mp); 16059 return; 16060 } 16061 } 16062 } 16063 break; 16064 16065 case DL_NOTIFY_IND: { 16066 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 16067 ire_t *ire; 16068 uint_t orig_mtu; 16069 boolean_t need_ire_walk_v4 = B_FALSE; 16070 boolean_t need_ire_walk_v6 = B_FALSE; 16071 16072 switch (notify->dl_notification) { 16073 case DL_NOTE_PHYS_ADDR: 16074 err = ill_set_phys_addr(ill, mp); 16075 break; 16076 16077 case DL_NOTE_REPLUMB: 16078 /* 16079 * Directly return after calling ill_replumb(). 16080 * Note that we should not free mp as it is reused 16081 * in the ill_replumb() function. 16082 */ 16083 err = ill_replumb(ill, mp); 16084 return; 16085 16086 case DL_NOTE_FASTPATH_FLUSH: 16087 ill_fastpath_flush(ill); 16088 break; 16089 16090 case DL_NOTE_SDU_SIZE: 16091 /* 16092 * Change the MTU size of the interface, of all 16093 * attached ipif's, and of all relevant ire's. The 16094 * new value's a uint32_t at notify->dl_data. 16095 * Mtu change Vs. new ire creation - protocol below. 16096 * 16097 * a Mark the ipif as IPIF_CHANGING. 16098 * b Set the new mtu in the ipif. 16099 * c Change the ire_max_frag on all affected ires 16100 * d Unmark the IPIF_CHANGING 16101 * 16102 * To see how the protocol works, assume an interface 16103 * route is also being added simultaneously by 16104 * ip_rt_add and let 'ipif' be the ipif referenced by 16105 * the ire. If the ire is created before step a, 16106 * it will be cleaned up by step c. If the ire is 16107 * created after step d, it will see the new value of 16108 * ipif_mtu. Any attempt to create the ire between 16109 * steps a to d will fail because of the IPIF_CHANGING 16110 * flag. Note that ire_create() is passed a pointer to 16111 * the ipif_mtu, and not the value. During ire_add 16112 * under the bucket lock, the ire_max_frag of the 16113 * new ire being created is set from the ipif/ire from 16114 * which it is being derived. 
16115 */ 16116 mutex_enter(&ill->ill_lock); 16117 16118 orig_mtu = ill->ill_max_mtu; 16119 ill->ill_max_frag = (uint_t)notify->dl_data; 16120 ill->ill_max_mtu = (uint_t)notify->dl_data; 16121 16122 /* 16123 * If ill_user_mtu was set (via SIOCSLIFLNKINFO), 16124 * clamp ill_max_mtu at it. 16125 */ 16126 if (ill->ill_user_mtu != 0 && 16127 ill->ill_user_mtu < ill->ill_max_mtu) 16128 ill->ill_max_mtu = ill->ill_user_mtu; 16129 16130 /* 16131 * If the MTU is unchanged, we're done. 16132 */ 16133 if (orig_mtu == ill->ill_max_mtu) { 16134 mutex_exit(&ill->ill_lock); 16135 break; 16136 } 16137 16138 if (ill->ill_isv6) { 16139 if (ill->ill_max_mtu < IPV6_MIN_MTU) 16140 ill->ill_max_mtu = IPV6_MIN_MTU; 16141 } else { 16142 if (ill->ill_max_mtu < IP_MIN_MTU) 16143 ill->ill_max_mtu = IP_MIN_MTU; 16144 } 16145 for (ipif = ill->ill_ipif; ipif != NULL; 16146 ipif = ipif->ipif_next) { 16147 /* 16148 * Don't override the mtu if the user 16149 * has explicitly set it. 16150 */ 16151 if (ipif->ipif_flags & IPIF_FIXEDMTU) 16152 continue; 16153 ipif->ipif_mtu = (uint_t)notify->dl_data; 16154 if (ipif->ipif_isv6) 16155 ire = ipif_to_ire_v6(ipif); 16156 else 16157 ire = ipif_to_ire(ipif); 16158 if (ire != NULL) { 16159 ire->ire_max_frag = ipif->ipif_mtu; 16160 ire_refrele(ire); 16161 } 16162 if (ipif->ipif_flags & IPIF_UP) { 16163 if (ill->ill_isv6) 16164 need_ire_walk_v6 = B_TRUE; 16165 else 16166 need_ire_walk_v4 = B_TRUE; 16167 } 16168 } 16169 mutex_exit(&ill->ill_lock); 16170 if (need_ire_walk_v4) 16171 ire_walk_v4(ill_mtu_change, (char *)ill, 16172 ALL_ZONES, ipst); 16173 if (need_ire_walk_v6) 16174 ire_walk_v6(ill_mtu_change, (char *)ill, 16175 ALL_ZONES, ipst); 16176 16177 /* 16178 * Refresh IPMP meta-interface MTU if necessary. 16179 */ 16180 if (IS_UNDER_IPMP(ill)) 16181 ipmp_illgrp_refresh_mtu(ill->ill_grp); 16182 break; 16183 16184 case DL_NOTE_LINK_UP: 16185 case DL_NOTE_LINK_DOWN: { 16186 /* 16187 * We are writer. ill / phyint / ipsq assocs stable. 16188 * The RUNNING flag reflects the state of the link. 16189 */ 16190 phyint_t *phyint = ill->ill_phyint; 16191 uint64_t new_phyint_flags; 16192 boolean_t changed = B_FALSE; 16193 boolean_t went_up; 16194 16195 went_up = notify->dl_notification == DL_NOTE_LINK_UP; 16196 mutex_enter(&phyint->phyint_lock); 16197 16198 new_phyint_flags = went_up ? 16199 phyint->phyint_flags | PHYI_RUNNING : 16200 phyint->phyint_flags & ~PHYI_RUNNING; 16201 16202 if (IS_IPMP(ill)) { 16203 new_phyint_flags = went_up ? 16204 new_phyint_flags & ~PHYI_FAILED : 16205 new_phyint_flags | PHYI_FAILED; 16206 } 16207 16208 if (new_phyint_flags != phyint->phyint_flags) { 16209 phyint->phyint_flags = new_phyint_flags; 16210 changed = B_TRUE; 16211 } 16212 mutex_exit(&phyint->phyint_lock); 16213 /* 16214 * ill_restart_dad handles the DAD restart and routing 16215 * socket notification logic. 
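 * Both the v4 and v6 ills are passed in below; if one of them is not
 * plumbed, the corresponding pointer is NULL, which ill_restart_dad()
 * presumably tolerates.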
16216 */ 16217 if (changed) { 16218 ill_restart_dad(phyint->phyint_illv4, went_up); 16219 ill_restart_dad(phyint->phyint_illv6, went_up); 16220 } 16221 break; 16222 } 16223 case DL_NOTE_PROMISC_ON_PHYS: { 16224 phyint_t *phyint = ill->ill_phyint; 16225 16226 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16227 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 16228 mutex_enter(&phyint->phyint_lock); 16229 phyint->phyint_flags |= PHYI_PROMISC; 16230 mutex_exit(&phyint->phyint_lock); 16231 break; 16232 } 16233 case DL_NOTE_PROMISC_OFF_PHYS: { 16234 phyint_t *phyint = ill->ill_phyint; 16235 16236 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16237 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 16238 mutex_enter(&phyint->phyint_lock); 16239 phyint->phyint_flags &= ~PHYI_PROMISC; 16240 mutex_exit(&phyint->phyint_lock); 16241 break; 16242 } 16243 case DL_NOTE_CAPAB_RENEG: 16244 /* 16245 * Something changed on the driver side. 16246 * It wants us to renegotiate the capabilities 16247 * on this ill. One possible cause is the aggregation 16248 * interface under us where a port got added or 16249 * went away. 16250 * 16251 * If the capability negotiation is already done 16252 * or is in progress, reset the capabilities and 16253 * mark the ill's ill_capab_reneg to be B_TRUE, 16254 * so that when the ack comes back, we can start 16255 * the renegotiation process. 16256 * 16257 * Note that if ill_capab_reneg is already B_TRUE 16258 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case), 16259 * the capability resetting request has been sent 16260 * and the renegotiation has not been started yet; 16261 * nothing needs to be done in this case. 16262 */ 16263 ipsq_current_start(ipsq, ill->ill_ipif, 0); 16264 ill_capability_reset(ill, B_TRUE); 16265 ipsq_current_finish(ipsq); 16266 break; 16267 default: 16268 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 16269 "type 0x%x for DL_NOTIFY_IND\n", 16270 notify->dl_notification)); 16271 break; 16272 } 16273 16274 /* 16275 * As this is an asynchronous operation, we 16276 * should not call ill_dlpi_done 16277 */ 16278 break; 16279 } 16280 case DL_NOTIFY_ACK: { 16281 dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr; 16282 16283 if (noteack->dl_notifications & DL_NOTE_LINK_UP) 16284 ill->ill_note_link = 1; 16285 ill_dlpi_done(ill, DL_NOTIFY_REQ); 16286 break; 16287 } 16288 case DL_PHYS_ADDR_ACK: { 16289 /* 16290 * As part of plumbing the interface via SIOCSLIFNAME, 16291 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs, 16292 * whose answers we receive here. As each answer is received, 16293 * we call ill_dlpi_done() to dispatch the next request as 16294 * we're processing the current one. Once all answers have 16295 * been received, we use ipsq_pending_mp_get() to dequeue the 16296 * outstanding IOCTL and reply to it. (Because ill_dl_phys() 16297 * is invoked from an ill queue, conn_oper_pending_ill is not 16298 * available, but we know the ioctl is pending on ill_wq.) 16299 */ 16300 uint_t paddrlen, paddroff; 16301 uint8_t *addr; 16302 16303 paddrreq = ill->ill_phys_addr_pend; 16304 paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; 16305 paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset; 16306 addr = mp->b_rptr + paddroff; 16307 16308 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 16309 if (paddrreq == DL_IPV6_TOKEN) { 16310 /* 16311 * bcopy to low-order bits of ill_token 16312 * 16313 * XXX Temporary hack - currently, all known tokens 16314 * are 64 bits, so I'll cheat for the moment. 
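 * (bcopy'ing at s6_addr32[2] places the paddrlen token bytes into the
 * low-order 64 bits of the 128-bit ill_token.)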
16315 */
16316 bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
16317 ill->ill_token_length = paddrlen;
16318 break;
16319 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
16320 ASSERT(ill->ill_nd_lla_mp == NULL);
16321 ill_set_ndmp(ill, mp, paddroff, paddrlen);
16322 mp = NULL;
16323 break;
16324 } else if (paddrreq == DL_CURR_DEST_ADDR) {
16325 ASSERT(ill->ill_dest_addr_mp == NULL);
16326 ill->ill_dest_addr_mp = mp;
16327 ill->ill_dest_addr = addr;
16328 mp = NULL;
16329 if (ill->ill_isv6) {
16330 ill_setdesttoken(ill);
16331 ipif_setdestlinklocal(ill->ill_ipif);
16332 }
16333 break;
16334 }
16335
16336 ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
16337 ASSERT(ill->ill_phys_addr_mp == NULL);
16338 if (!ill->ill_ifname_pending)
16339 break;
16340 ill->ill_ifname_pending = 0;
16341 if (!ioctl_aborted)
16342 mp1 = ipsq_pending_mp_get(ipsq, &connp);
16343 if (mp1 != NULL) {
16344 ASSERT(connp == NULL);
16345 q = ill->ill_wq;
16346 }
16347 /*
16348 * If any error acks were received during the plumbing sequence,
16349 * ill_ifname_pending_err will be set. Break out and send up
16350 * the error to the pending ioctl.
16351 */
16352 if (ill->ill_ifname_pending_err != 0) {
16353 err = ill->ill_ifname_pending_err;
16354 ill->ill_ifname_pending_err = 0;
16355 break;
16356 }
16357
16358 ill->ill_phys_addr_mp = mp;
16359 ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
16360 mp = NULL;
16361
16362 /*
16363 * If paddrlen or ill_phys_addr_length is zero, the DLPI
16364 * provider doesn't support physical addresses. We check both
16365 * paddrlen and ill_phys_addr_length because sppp (PPP) does
16366 * not have physical addresses, but historically advertises a
16367 * physical address length of 0 in its DL_INFO_ACK, but 6 in
16368 * its DL_PHYS_ADDR_ACK.
16369 */
16370 if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
16371 ill->ill_phys_addr = NULL;
16372 } else if (paddrlen != ill->ill_phys_addr_length) {
16373 ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
16374 paddrlen, ill->ill_phys_addr_length));
16375 err = EINVAL;
16376 break;
16377 }
16378
16379 if (ill->ill_nd_lla_mp == NULL) {
16380 if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
16381 err = ENOMEM;
16382 break;
16383 }
16384 ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
16385 }
16386
16387 if (ill->ill_isv6) {
16388 ill_setdefaulttoken(ill);
16389 ipif_setlinklocal(ill->ill_ipif);
16390 }
16391 break;
16392 }
16393 case DL_OK_ACK:
16394 ip2dbg(("DL_OK_ACK %s (0x%x)\n",
16395 dl_primstr((int)dloa->dl_correct_primitive),
16396 dloa->dl_correct_primitive));
16397 switch (dloa->dl_correct_primitive) {
16398 case DL_ENABMULTI_REQ:
16399 case DL_DISABMULTI_REQ:
16400 ill_dlpi_done(ill, dloa->dl_correct_primitive);
16401 break;
16402 case DL_PROMISCON_REQ:
16403 case DL_PROMISCOFF_REQ:
16404 case DL_UNBIND_REQ:
16405 case DL_ATTACH_REQ:
16406 ill_dlpi_done(ill, dloa->dl_correct_primitive);
16407 break;
16408 }
16409 break;
16410 default:
16411 break;
16412 }
16413
16414 freemsg(mp);
16415 if (mp1 == NULL)
16416 return;
16417
16418 /*
16419 * The operation must complete without EINPROGRESS since
16420 * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise,
16421 * the operation will be stuck forever inside the IPSQ.
16422 */ 16423 ASSERT(err != EINPROGRESS); 16424 16425 switch (ipsq->ipsq_xop->ipx_current_ioctl) { 16426 case 0: 16427 ipsq_current_finish(ipsq); 16428 break; 16429 16430 case SIOCSLIFNAME: 16431 case IF_UNITSEL: { 16432 ill_t *ill_other = ILL_OTHER(ill); 16433 16434 /* 16435 * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the 16436 * ill has a peer which is in an IPMP group, then place ill 16437 * into the same group. One catch: although ifconfig plumbs 16438 * the appropriate IPMP meta-interface prior to plumbing this 16439 * ill, it is possible for multiple ifconfig applications to 16440 * race (or for another application to adjust plumbing), in 16441 * which case the IPMP meta-interface we need will be missing. 16442 * If so, kick the phyint out of the group. 16443 */ 16444 if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) { 16445 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; 16446 ipmp_illgrp_t *illg; 16447 16448 illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4; 16449 if (illg == NULL) 16450 ipmp_phyint_leave_grp(ill->ill_phyint); 16451 else 16452 ipmp_ill_join_illgrp(ill, illg); 16453 } 16454 16455 if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL) 16456 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 16457 else 16458 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); 16459 break; 16460 } 16461 case SIOCLIFADDIF: 16462 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); 16463 break; 16464 16465 default: 16466 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 16467 break; 16468 } 16469 } 16470 16471 /* 16472 * ip_rput_other is called by ip_rput to handle messages modifying the global 16473 * state in IP. If 'ipsq' is non-NULL, caller is writer on it. 16474 */ 16475 /* ARGSUSED */ 16476 void 16477 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 16478 { 16479 ill_t *ill = q->q_ptr; 16480 struct iocblk *iocp; 16481 16482 ip1dbg(("ip_rput_other ")); 16483 if (ipsq != NULL) { 16484 ASSERT(IAM_WRITER_IPSQ(ipsq)); 16485 ASSERT(ipsq->ipsq_xop == 16486 ill->ill_phyint->phyint_ipsq->ipsq_xop); 16487 } 16488 16489 switch (mp->b_datap->db_type) { 16490 case M_ERROR: 16491 case M_HANGUP: 16492 /* 16493 * The device has a problem. We force the ILL down. It can 16494 * be brought up again manually using SIOCSIFFLAGS (via 16495 * ifconfig or equivalent). 16496 */ 16497 ASSERT(ipsq != NULL); 16498 if (mp->b_rptr < mp->b_wptr) 16499 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 16500 if (ill->ill_error == 0) 16501 ill->ill_error = ENXIO; 16502 if (!ill_down_start(q, mp)) 16503 return; 16504 ipif_all_down_tail(ipsq, q, mp, NULL); 16505 break; 16506 case M_IOCNAK: { 16507 iocp = (struct iocblk *)mp->b_rptr; 16508 16509 ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO); 16510 /* 16511 * If this was the first attempt, turn off the fastpath 16512 * probing. 16513 */ 16514 mutex_enter(&ill->ill_lock); 16515 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { 16516 ill->ill_dlpi_fastpath_state = IDS_FAILED; 16517 mutex_exit(&ill->ill_lock); 16518 ill_fastpath_nack(ill); 16519 ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n", 16520 ill->ill_name)); 16521 } else { 16522 mutex_exit(&ill->ill_lock); 16523 } 16524 freemsg(mp); 16525 break; 16526 } 16527 default: 16528 ASSERT(0); 16529 break; 16530 } 16531 } 16532 16533 /* 16534 * NOTE : This function does not ire_refrele the ire argument passed in. 
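 *
 * A minimal sketch of the resulting caller contract, mirroring the
 * actual call site in ip_rput_forward_multicast() below:
 *
 *	ire = ire_ctable_lookup(dst, ...);
 *	if (ire != NULL) {
 *		ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL);
 *		IRE_REFRELE(ire);	<- caller releases, not callee
 *	}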
16535 * 16536 * IPQoS notes 16537 * IP policy is invoked twice for a forwarded packet, once on the read side 16538 * and again on the write side if both IPP_FWD_IN and IPP_FWD_OUT are 16539 * enabled. An additional parameter, in_ill, has been added for this purpose. 16540 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 16541 * because ip_mroute drops this information. 16542 * 16543 */ 16544 void 16545 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 16546 { 16547 uint32_t old_pkt_len; 16548 uint32_t pkt_len; 16549 queue_t *q; 16550 uint32_t sum; 16551 #define rptr ((uchar_t *)ipha) 16552 uint32_t max_frag; 16553 uint32_t ill_index; 16554 ill_t *out_ill; 16555 mib2_ipIfStatsEntry_t *mibptr; 16556 ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; 16557 16558 /* Get the ill_index of the incoming ILL */ 16559 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 16560 mibptr = (in_ill != NULL) ? in_ill->ill_ip_mib : &ipst->ips_ip_mib; 16561 16562 /* Initiate Read side IPPF processing */ 16563 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 16564 ip_process(IPP_FWD_IN, &mp, ill_index); 16565 if (mp == NULL) { 16566 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 16567 "during IPPF processing\n")); 16568 return; 16569 } 16570 } 16571 16572 /* Adjust the checksum to reflect the ttl decrement. */ 16573 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 16574 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 16575 16576 if (ipha->ipha_ttl-- <= 1) { 16577 if (ip_csum_hdr(ipha)) { 16578 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16579 goto drop_pkt; 16580 } 16581 /* 16582 * Note: ire_stq will be NULL for multicast 16583 * datagrams using the long path through arp (the IRE 16584 * is not an IRE_CACHE). This should not cause 16585 * problems since we don't generate ICMP errors for 16586 * multicast packets.
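 *
 * As an aside on the checksum adjustment above: decrementing the TTL
 * lowers the 16-bit header word holding it by 0x100, so the header
 * checksum is repaired incrementally (RFC 1141 style) instead of
 * being recomputed. A sketch of the same update, assuming
 * IP_HDR_CSUM_TTL_ADJUST is the 0x100 compensation constant:
 *
 *	sum = (int)ipha->ipha_hdr_checksum + 0x100;
 *	sum += sum >> 16;	<- fold any carry back in
 *	ipha->ipha_hdr_checksum = (uint16_t)sum;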
16587 */ 16588 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16589 q = ire->ire_stq; 16590 if (q != NULL) { 16591 /* Sent by forwarding path, and router is global zone */ 16592 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, 16593 GLOBAL_ZONEID, ipst); 16594 } else 16595 freemsg(mp); 16596 return; 16597 } 16598 16599 /* 16600 * Don't forward if the interface is down 16601 */ 16602 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 16603 BUMP_MIB(mibptr, ipIfStatsInDiscards); 16604 ip2dbg(("ip_rput_forward:interface is down\n")); 16605 goto drop_pkt; 16606 } 16607 16608 /* Get the ill_index of the outgoing ILL */ 16609 out_ill = ire_to_ill(ire); 16610 ill_index = out_ill->ill_phyint->phyint_ifindex; 16611 16612 DTRACE_PROBE4(ip4__forwarding__start, 16613 ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16614 16615 FW_HOOKS(ipst->ips_ip4_forwarding_event, 16616 ipst->ips_ipv4firewall_forwarding, 16617 in_ill, out_ill, ipha, mp, mp, 0, ipst); 16618 16619 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 16620 16621 if (mp == NULL) 16622 return; 16623 old_pkt_len = pkt_len = ntohs(ipha->ipha_length); 16624 16625 if (is_system_labeled()) { 16626 mblk_t *mp1; 16627 16628 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 16629 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16630 goto drop_pkt; 16631 } 16632 /* Size may have changed */ 16633 mp = mp1; 16634 ipha = (ipha_t *)mp->b_rptr; 16635 pkt_len = ntohs(ipha->ipha_length); 16636 } 16637 16638 /* Check if there are options to update */ 16639 if (!IS_SIMPLE_IPH(ipha)) { 16640 if (ip_csum_hdr(ipha)) { 16641 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16642 goto drop_pkt; 16643 } 16644 if (ip_rput_forward_options(mp, ipha, ire, ipst)) { 16645 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16646 return; 16647 } 16648 16649 ipha->ipha_hdr_checksum = 0; 16650 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 16651 } 16652 max_frag = ire->ire_max_frag; 16653 if (pkt_len > max_frag) { 16654 /* 16655 * It needs fragging on its way out. We haven't 16656 * verified the header checksum yet. Since we 16657 * are going to put a surely good checksum in the 16658 * outgoing header, we have to make sure that it 16659 * was good coming in. 16660 */ 16661 if (ip_csum_hdr(ipha)) { 16662 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16663 goto drop_pkt; 16664 } 16665 /* Initiate Write side IPPF processing */ 16666 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 16667 ip_process(IPP_FWD_OUT, &mp, ill_index); 16668 if (mp == NULL) { 16669 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 16670 " during IPPF processing\n")); 16671 return; 16672 } 16673 } 16674 /* 16675 * Handle labeled packet resizing. 16676 * 16677 * If we have added a label, inform ip_wput_frag() of its 16678 * effect on the MTU for ICMP messages. 
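 *
 * Worked example with illustrative numbers: if tsol_ip_forward()
 * grew a 1480-byte datagram to 1492 bytes by inserting a 12-byte
 * label option, secopt_size below is 12 and an ire_max_frag of 1500
 * is reported to ip_wput_frag() as 1488, so the ICMP "fragmentation
 * needed" MTU leaves room for the label to be re-inserted.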
16679 */ 16680 if (pkt_len > old_pkt_len) { 16681 uint32_t secopt_size; 16682 16683 secopt_size = pkt_len - old_pkt_len; 16684 if (secopt_size < max_frag) 16685 max_frag -= secopt_size; 16686 } 16687 16688 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, 16689 GLOBAL_ZONEID, ipst, NULL); 16690 ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); 16691 return; 16692 } 16693 16694 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 16695 ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16696 FW_HOOKS(ipst->ips_ip4_physical_out_event, 16697 ipst->ips_ipv4firewall_physical_out, 16698 NULL, out_ill, ipha, mp, mp, 0, ipst); 16699 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 16700 if (mp == NULL) 16701 return; 16702 16703 mp->b_prev = (mblk_t *)IPP_FWD_OUT; 16704 ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); 16705 (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL); 16706 /* ip_xmit_v4 always consumes the packet */ 16707 return; 16708 16709 drop_pkt:; 16710 ip1dbg(("ip_rput_forward: drop pkt\n")); 16711 freemsg(mp); 16712 #undef rptr 16713 } 16714 16715 void 16716 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 16717 { 16718 ire_t *ire; 16719 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16720 16721 ASSERT(!ipif->ipif_isv6); 16722 /* 16723 * Find an IRE which matches the destination and the outgoing 16724 * queue in the cache table. All we need is an IRE_CACHE which 16725 * is pointing at ipif->ipif_ill. 16726 */ 16727 if (ipif->ipif_flags & IPIF_POINTOPOINT) 16728 dst = ipif->ipif_pp_dst_addr; 16729 16730 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, msg_getlabel(mp), 16731 MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); 16732 if (ire == NULL) { 16733 /* 16734 * Mark this packet so that it is delivered to 16735 * ip_rput_forward after the new ire has been 16736 * created.
16737 */ 16738 mp->b_prev = NULL; 16739 mp->b_next = mp; 16740 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 16741 NULL, 0, GLOBAL_ZONEID, &zero_info); 16742 } else { 16743 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 16744 IRE_REFRELE(ire); 16745 } 16746 } 16747 16748 /* Update any source route, record route or timestamp options */ 16749 static int 16750 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) 16751 { 16752 ipoptp_t opts; 16753 uchar_t *opt; 16754 uint8_t optval; 16755 uint8_t optlen; 16756 ipaddr_t dst; 16757 uint32_t ts; 16758 ire_t *dst_ire = NULL; 16759 ire_t *tmp_ire = NULL; 16760 timestruc_t now; 16761 16762 ip2dbg(("ip_rput_forward_options\n")); 16763 dst = ipha->ipha_dst; 16764 for (optval = ipoptp_first(&opts, ipha); 16765 optval != IPOPT_EOL; 16766 optval = ipoptp_next(&opts)) { 16767 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 16768 opt = opts.ipoptp_cur; 16769 optlen = opts.ipoptp_len; 16770 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 16771 optval, opts.ipoptp_len)); 16772 switch (optval) { 16773 uint32_t off; 16774 case IPOPT_SSRR: 16775 case IPOPT_LSRR: 16776 /* Check if administratively disabled */ 16777 if (!ipst->ips_ip_forward_src_routed) { 16778 if (ire->ire_stq != NULL) { 16779 /* 16780 * Sent by forwarding path, and router 16781 * is global zone 16782 */ 16783 icmp_unreachable(ire->ire_stq, mp, 16784 ICMP_SOURCE_ROUTE_FAILED, 16785 GLOBAL_ZONEID, ipst); 16786 } else { 16787 ip0dbg(("ip_rput_forward_options: " 16788 "unable to send unreach\n")); 16789 freemsg(mp); 16790 } 16791 return (-1); 16792 } 16793 16794 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16795 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16796 if (dst_ire == NULL) { 16797 /* 16798 * Must be partial since ip_rput_options 16799 * checked for strict. 16800 */ 16801 break; 16802 } 16803 off = opt[IPOPT_OFFSET]; 16804 off--; 16805 redo_srr: 16806 if (optlen < IP_ADDR_LEN || 16807 off > optlen - IP_ADDR_LEN) { 16808 /* End of source route */ 16809 ip1dbg(( 16810 "ip_rput_forward_options: end of SR\n")); 16811 ire_refrele(dst_ire); 16812 break; 16813 } 16814 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16815 bcopy(&ire->ire_src_addr, (char *)opt + off, 16816 IP_ADDR_LEN); 16817 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 16818 ntohl(dst))); 16819 16820 /* 16821 * Check if our address is present more than 16822 * once as consecutive hops in source route.
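 *
 * For reference, a walk-through of the layout handled here: an LSRR
 * option is { type, len, ptr, addr1, addr2, ... } with ptr 1-based,
 * so off = ptr - 1. With len 11 (two addresses) and ptr 4, off is 3:
 * addr1 supplies the next hop copied into dst, our own source
 * address is written back over it, and the pointer advances to 8.
 * Once ptr reaches 12, off (11) exceeds optlen - IP_ADDR_LEN (7)
 * and the source route is exhausted.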
16823 */ 16824 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16825 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16826 if (tmp_ire != NULL) { 16827 ire_refrele(tmp_ire); 16828 off += IP_ADDR_LEN; 16829 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16830 goto redo_srr; 16831 } 16832 ipha->ipha_dst = dst; 16833 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16834 ire_refrele(dst_ire); 16835 break; 16836 case IPOPT_RR: 16837 off = opt[IPOPT_OFFSET]; 16838 off--; 16839 if (optlen < IP_ADDR_LEN || 16840 off > optlen - IP_ADDR_LEN) { 16841 /* No more room - ignore */ 16842 ip1dbg(( 16843 "ip_rput_forward_options: end of RR\n")); 16844 break; 16845 } 16846 bcopy(&ire->ire_src_addr, (char *)opt + off, 16847 IP_ADDR_LEN); 16848 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16849 break; 16850 case IPOPT_TS: 16851 /* Insert timestamp if there is room */ 16852 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16853 case IPOPT_TS_TSONLY: 16854 off = IPOPT_TS_TIMELEN; 16855 break; 16856 case IPOPT_TS_PRESPEC: 16857 case IPOPT_TS_PRESPEC_RFC791: 16858 /* Verify that the address matched */ 16859 off = opt[IPOPT_OFFSET] - 1; 16860 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16861 dst_ire = ire_ctable_lookup(dst, 0, 16862 IRE_LOCAL, NULL, ALL_ZONES, NULL, 16863 MATCH_IRE_TYPE, ipst); 16864 if (dst_ire == NULL) { 16865 /* Not for us */ 16866 break; 16867 } 16868 ire_refrele(dst_ire); 16869 /* FALLTHRU */ 16870 case IPOPT_TS_TSANDADDR: 16871 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16872 break; 16873 default: 16874 /* 16875 * ip_*put_options should have already 16876 * dropped this packet. 16877 */ 16878 cmn_err(CE_PANIC, "ip_rput_forward_options: " 16879 "unknown IT - bug in ip_rput_options?\n"); 16880 return (0); /* Keep "lint" happy */ 16881 } 16882 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 16883 /* Increase overflow counter */ 16884 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 16885 opt[IPOPT_POS_OV_FLG] = 16886 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 16887 (off << 4)); 16888 break; 16889 } 16890 off = opt[IPOPT_OFFSET] - 1; 16891 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16892 case IPOPT_TS_PRESPEC: 16893 case IPOPT_TS_PRESPEC_RFC791: 16894 case IPOPT_TS_TSANDADDR: 16895 bcopy(&ire->ire_src_addr, 16896 (char *)opt + off, IP_ADDR_LEN); 16897 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16898 /* FALLTHRU */ 16899 case IPOPT_TS_TSONLY: 16900 off = opt[IPOPT_OFFSET] - 1; 16901 /* Compute # of milliseconds since midnight */ 16902 gethrestime(&now); 16903 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 16904 now.tv_nsec / (NANOSEC / MILLISEC); 16905 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 16906 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 16907 break; 16908 } 16909 break; 16910 } 16911 } 16912 return (0); 16913 } 16914 16915 /* 16916 * This is called after processing at least one of AH/ESP headers. 16917 * 16918 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 16919 * the actual, physical interface on which the packet was received, 16920 * but, when ip_strict_dst_multihoming is set to 1, could be the 16921 * interface which had the ipha_dst configured when the packet went 16922 * through ip_rput. The ill_index corresponding to the recv_ill 16923 * is saved in ipsec_in_rill_index 16924 * 16925 * NOTE2: The "ire" argument is only used in IPv4 cases. This function 16926 * cannot assume "ire" points to valid data for any IPv6 cases. 
16927 */ 16928 void 16929 ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) 16930 { 16931 mblk_t *mp; 16932 ipaddr_t dst; 16933 in6_addr_t *v6dstp; 16934 ipha_t *ipha; 16935 ip6_t *ip6h; 16936 ipsec_in_t *ii; 16937 boolean_t ill_need_rele = B_FALSE; 16938 boolean_t rill_need_rele = B_FALSE; 16939 boolean_t ire_need_rele = B_FALSE; 16940 netstack_t *ns; 16941 ip_stack_t *ipst; 16942 16943 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 16944 ASSERT(ii->ipsec_in_ill_index != 0); 16945 ns = ii->ipsec_in_ns; 16946 ASSERT(ii->ipsec_in_ns != NULL); 16947 ipst = ns->netstack_ip; 16948 16949 mp = ipsec_mp->b_cont; 16950 ASSERT(mp != NULL); 16951 16952 if (ill == NULL) { 16953 ASSERT(recv_ill == NULL); 16954 /* 16955 * We need to get the original queue on which ip_rput_local 16956 * or ip_rput_data_v6 was called. 16957 */ 16958 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 16959 !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst); 16960 ill_need_rele = B_TRUE; 16961 16962 if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) { 16963 recv_ill = ill_lookup_on_ifindex( 16964 ii->ipsec_in_rill_index, !ii->ipsec_in_v4, 16965 NULL, NULL, NULL, NULL, ipst); 16966 rill_need_rele = B_TRUE; 16967 } else { 16968 recv_ill = ill; 16969 } 16970 16971 if ((ill == NULL) || (recv_ill == NULL)) { 16972 ip0dbg(("ip_fanout_proto_again: interface " 16973 "disappeared\n")); 16974 if (ill != NULL) 16975 ill_refrele(ill); 16976 if (recv_ill != NULL) 16977 ill_refrele(recv_ill); 16978 freemsg(ipsec_mp); 16979 return; 16980 } 16981 } 16982 16983 ASSERT(ill != NULL && recv_ill != NULL); 16984 16985 if (mp->b_datap->db_type == M_CTL) { 16986 /* 16987 * AH/ESP is returning the ICMP message after 16988 * removing their headers. Fanout again till 16989 * it gets to the right protocol. 16990 */ 16991 if (ii->ipsec_in_v4) { 16992 icmph_t *icmph; 16993 int iph_hdr_length; 16994 int hdr_length; 16995 16996 ipha = (ipha_t *)mp->b_rptr; 16997 iph_hdr_length = IPH_HDR_LENGTH(ipha); 16998 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 16999 ipha = (ipha_t *)&icmph[1]; 17000 hdr_length = IPH_HDR_LENGTH(ipha); 17001 /* 17002 * icmp_inbound_error_fanout may need to do pullupmsg. 17003 * Reset the type to M_DATA. 17004 */ 17005 mp->b_datap->db_type = M_DATA; 17006 icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp, 17007 icmph, ipha, iph_hdr_length, hdr_length, B_TRUE, 17008 B_FALSE, ill, ii->ipsec_in_zoneid); 17009 } else { 17010 icmp6_t *icmp6; 17011 int hdr_length; 17012 17013 ip6h = (ip6_t *)mp->b_rptr; 17014 /* Don't call hdr_length_v6() unless you have to. */ 17015 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) 17016 hdr_length = ip_hdr_length_v6(mp, ip6h); 17017 else 17018 hdr_length = IPV6_HDR_LEN; 17019 17020 icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); 17021 /* 17022 * icmp_inbound_error_fanout_v6 may need to do 17023 * pullupmsg. Reset the type to M_DATA. 17024 */ 17025 mp->b_datap->db_type = M_DATA; 17026 icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, 17027 ip6h, icmp6, ill, recv_ill, B_TRUE, 17028 ii->ipsec_in_zoneid); 17029 } 17030 if (ill_need_rele) 17031 ill_refrele(ill); 17032 if (rill_need_rele) 17033 ill_refrele(recv_ill); 17034 return; 17035 } 17036 17037 if (ii->ipsec_in_v4) { 17038 ipha = (ipha_t *)mp->b_rptr; 17039 dst = ipha->ipha_dst; 17040 if (CLASSD(dst)) { 17041 /* 17042 * Multicast has to be delivered to all streams. 
17043 */ 17044 dst = INADDR_BROADCAST; 17045 } 17046 17047 if (ire == NULL) { 17048 ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid, 17049 msg_getlabel(mp), ipst); 17050 if (ire == NULL) { 17051 if (ill_need_rele) 17052 ill_refrele(ill); 17053 if (rill_need_rele) 17054 ill_refrele(recv_ill); 17055 ip1dbg(("ip_fanout_proto_again: " 17056 "IRE not found")); 17057 freemsg(ipsec_mp); 17058 return; 17059 } 17060 ire_need_rele = B_TRUE; 17061 } 17062 17063 switch (ipha->ipha_protocol) { 17064 case IPPROTO_UDP: 17065 ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, 17066 recv_ill); 17067 if (ire_need_rele) 17068 ire_refrele(ire); 17069 break; 17070 case IPPROTO_TCP: 17071 if (!ire_need_rele) 17072 IRE_REFHOLD(ire); 17073 mp = ip_tcp_input(mp, ipha, ill, B_TRUE, 17074 ire, ipsec_mp, 0, ill->ill_rq, NULL); 17075 IRE_REFRELE(ire); 17076 if (mp != NULL) { 17077 SQUEUE_ENTER(GET_SQUEUE(mp), mp, 17078 mp, 1, SQ_PROCESS, 17079 SQTAG_IP_PROTO_AGAIN); 17080 } 17081 break; 17082 case IPPROTO_SCTP: 17083 if (!ire_need_rele) 17084 IRE_REFHOLD(ire); 17085 ip_sctp_input(mp, ipha, ill, B_TRUE, ire, 17086 ipsec_mp, 0, ill->ill_rq, dst); 17087 break; 17088 case IPPROTO_ENCAP: 17089 case IPPROTO_IPV6: 17090 if (ip_iptun_input(ipsec_mp, mp, ipha, ill, ire, 17091 ill->ill_ipst)) { 17092 /* 17093 * If we made it here, we don't need to worry 17094 * about the raw-socket/protocol fanout. 17095 */ 17096 if (ire_need_rele) 17097 ire_refrele(ire); 17098 break; 17099 } 17100 /* else FALLTHRU */ 17101 default: 17102 ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, 17103 recv_ill, 0); 17104 if (ire_need_rele) 17105 ire_refrele(ire); 17106 break; 17107 } 17108 } else { 17109 uint32_t rput_flags = 0; 17110 17111 ip6h = (ip6_t *)mp->b_rptr; 17112 v6dstp = &ip6h->ip6_dst; 17113 /* 17114 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast 17115 * address. 17116 * 17117 * Currently, we don't store that state in the IPSEC_IN 17118 * message, and we may need to. 17119 */ 17120 rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? 17121 IP6_IN_LLMCAST : 0); 17122 ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, 17123 NULL, NULL); 17124 } 17125 if (ill_need_rele) 17126 ill_refrele(ill); 17127 if (rill_need_rele) 17128 ill_refrele(recv_ill); 17129 } 17130 17131 /* 17132 * Call ill_frag_timeout to do garbage collection. ill_frag_timeout 17133 * returns 'true' if there are still fragments left on the queue, in 17134 * which case we restart the timer. 17135 */ 17136 void 17137 ill_frag_timer(void *arg) 17138 { 17139 ill_t *ill = (ill_t *)arg; 17140 boolean_t frag_pending; 17141 ip_stack_t *ipst = ill->ill_ipst; 17142 time_t timeout; 17143 17144 mutex_enter(&ill->ill_lock); 17145 ASSERT(!ill->ill_fragtimer_executing); 17146 if (ill->ill_state_flags & ILL_CONDEMNED) { 17147 ill->ill_frag_timer_id = 0; 17148 mutex_exit(&ill->ill_lock); 17149 return; 17150 } 17151 ill->ill_fragtimer_executing = 1; 17152 mutex_exit(&ill->ill_lock); 17153 17154 if (ill->ill_isv6) 17155 timeout = ipst->ips_ipv6_frag_timeout; 17156 else 17157 timeout = ipst->ips_ip_g_frag_timeout; 17158 17159 frag_pending = ill_frag_timeout(ill, timeout); 17160 17161 /* 17162 * Restart the timer, if we have fragments pending or if someone 17163 * wanted us to be scheduled again. 
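 *
 * The ill_fragtimer_executing/ill_fragtimer_needrestart pair is a
 * small rendezvous so ill_frag_timer_start() never posts a second
 * timeout while the handler is mid-run. Condensed sketch of the
 * protocol (all flag updates under ill_lock):
 *
 *	handler:			starter:
 *	  executing = 1			  if (executing)
 *	  ... reap fragments ...	          needrestart = 1;
 *	  executing = 0			  else if (timer_id == 0)
 *	  if (pending || needrestart)	          timer_id = timeout(...);
 *	      ill_frag_timer_start()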
17164 */ 17165 mutex_enter(&ill->ill_lock); 17166 ill->ill_fragtimer_executing = 0; 17167 ill->ill_frag_timer_id = 0; 17168 if (frag_pending || ill->ill_fragtimer_needrestart) 17169 ill_frag_timer_start(ill); 17170 mutex_exit(&ill->ill_lock); 17171 } 17172 17173 void 17174 ill_frag_timer_start(ill_t *ill) 17175 { 17176 ip_stack_t *ipst = ill->ill_ipst; 17177 clock_t timeo_ms; 17178 17179 ASSERT(MUTEX_HELD(&ill->ill_lock)); 17180 17181 /* If the ill is closing or opening don't proceed */ 17182 if (ill->ill_state_flags & ILL_CONDEMNED) 17183 return; 17184 17185 if (ill->ill_fragtimer_executing) { 17186 /* 17187 * ill_frag_timer is currently executing. Just record the 17188 * fact that we want the timer to be restarted. 17189 * ill_frag_timer will post a timeout before it returns, 17190 * ensuring it will be called again. 17191 */ 17192 ill->ill_fragtimer_needrestart = 1; 17193 return; 17194 } 17195 17196 if (ill->ill_frag_timer_id == 0) { 17197 if (ill->ill_isv6) 17198 timeo_ms = ipst->ips_ipv6_frag_timo_ms; 17199 else 17200 timeo_ms = ipst->ips_ip_g_frag_timo_ms; 17201 /* 17202 * The timer is neither running nor is the timeout handler 17203 * executing. Post a timeout so that ill_frag_timer will be 17204 * called. 17205 */ 17206 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 17207 MSEC_TO_TICK(timeo_ms >> 1)); 17208 ill->ill_fragtimer_needrestart = 0; 17209 } 17210 } 17211 17212 /* 17213 * This routine is needed for loopback when forwarding multicasts. 17214 * 17215 * IPQoS Notes: 17216 * IPPF processing is done in fanout routines. 17217 * Policy processing is done only if IPP_LOCAL_IN is enabled. Further, 17218 * processing for IPsec packets is done when they come back in the clear. 17219 * NOTE : The callers of this function need to do the ire_refrele for the 17220 * ire that is being passed in. 17221 */ 17222 void 17223 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17224 ill_t *recv_ill, uint32_t esp_udp_ports) 17225 { 17226 boolean_t esp_in_udp_packet = (esp_udp_ports != 0); 17227 ill_t *ill = (ill_t *)q->q_ptr; 17228 uint32_t sum; 17229 uint32_t u1; 17230 uint32_t u2; 17231 int hdr_length; 17232 boolean_t mctl_present; 17233 mblk_t *first_mp = mp; 17234 mblk_t *hada_mp = NULL; 17235 ipha_t *inner_ipha; 17236 ip_stack_t *ipst; 17237 17238 ASSERT(recv_ill != NULL); 17239 ipst = recv_ill->ill_ipst; 17240 17241 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 17242 "ip_rput_locl_start: q %p", q); 17243 17244 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17245 ASSERT(ill != NULL); 17246 17247 #define rptr ((uchar_t *)ipha) 17248 #define iphs ((uint16_t *)ipha) 17249 17250 /* 17251 * No UDP or TCP packet should come here anymore. 17252 */ 17253 ASSERT(ipha->ipha_protocol != IPPROTO_TCP && 17254 ipha->ipha_protocol != IPPROTO_UDP); 17255 17256 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 17257 if (mctl_present && 17258 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 17259 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 17260 17261 /* 17262 * It's an IPsec accelerated packet. 17263 * Keep a pointer to the data attributes around until 17264 * we allocate the ipsec_info_t. 17265 */ 17266 IPSECHW_DEBUG(IPSECHW_PKT, 17267 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 17268 hada_mp = first_mp; 17269 hada_mp->b_cont = NULL; 17270 /* 17271 * Since it is accelerated, it comes directly from 17272 * the ill and the data attributes are followed by 17273 * the packet data.
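 *
 * A sketch of the two inbound message shapes this block tells apart
 * (IPSEC_IN is the ipsec_info_t M_CTL, da_ipsec_t the driver's
 * data-attribute block):
 *
 *	software IPsec:	M_CTL(IPSEC_IN) --b_cont--> M_DATA(packet)
 *	HW-accelerated:	M_CTL(da_ipsec_t) --b_cont--> M_DATA(packet)
 *
 * In the accelerated case the da_ipsec_t mblk is detached into
 * hada_mp and the packet is handled as if no M_CTL were present
 * until an IPSEC_IN is allocated for it further down.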
17274 */ 17275 ASSERT(mp->b_datap->db_type != M_CTL); 17276 first_mp = mp; 17277 mctl_present = B_FALSE; 17278 } 17279 17280 /* 17281 * If M_CTL is not present, then ipsec_in_is_secure 17282 * should return B_TRUE. There is a case where loopback 17283 * packets have an M_CTL in the front with all the 17284 * IPsec options set to IPSEC_PREF_NEVER - which means 17285 * ipsec_in_is_secure will return B_FALSE. As loopback 17286 * packets never come here, it is safe to ASSERT the 17287 * following. 17288 */ 17289 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 17290 17291 /* 17292 * Also, we should never have an mctl_present if this is an 17293 * ESP-in-UDP packet. 17294 */ 17295 ASSERT(!mctl_present || !esp_in_udp_packet); 17296 17297 /* u1 is # words of IP options */ 17298 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 17299 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 17300 17301 /* 17302 * Don't verify header checksum if we just removed UDP header or 17303 * packet is coming back from AH/ESP. 17304 */ 17305 if (!esp_in_udp_packet && !mctl_present) { 17306 if (u1) { 17307 if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { 17308 if (hada_mp != NULL) 17309 freemsg(hada_mp); 17310 return; 17311 } 17312 } else { 17313 /* Check the IP header checksum. */ 17314 #define uph ((uint16_t *)ipha) 17315 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 17316 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 17317 #undef uph 17318 /* finish doing IP checksum */ 17319 sum = (sum & 0xFFFF) + (sum >> 16); 17320 sum = ~(sum + (sum >> 16)) & 0xFFFF; 17321 if (sum && sum != 0xFFFF) { 17322 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 17323 goto drop_pkt; 17324 } 17325 } 17326 } 17327 17328 /* 17329 * Count for SNMP of inbound packets for ire. As ip_proto_input 17330 * might be called more than once for secure packets, count only 17331 * the first time. 17332 */ 17333 if (!mctl_present) { 17334 UPDATE_IB_PKT_COUNT(ire); 17335 ire->ire_last_used_time = lbolt; 17336 } 17337 17338 /* Check for fragmentation offset. */ 17339 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 17340 u1 = u2 & (IPH_MF | IPH_OFFSET); 17341 if (u1) { 17342 /* 17343 * We re-assemble fragments before we do the AH/ESP 17344 * processing. Thus, M_CTL should not be present 17345 * while we are re-assembling. 17346 */ 17347 ASSERT(!mctl_present); 17348 ASSERT(first_mp == mp); 17349 if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) 17350 return; 17351 17352 /* 17353 * Make sure that first_mp points back to mp as 17354 * the mp we came in with could have changed in 17355 * ip_rput_fragment(). 17356 */ 17357 ipha = (ipha_t *)mp->b_rptr; 17358 first_mp = mp; 17359 } 17360 17361 /* 17362 * Clear hardware checksumming flag as it is currently only 17363 * used by TCP and UDP. 17364 */ 17365 DB_CKSUMFLAGS(mp) = 0; 17366 17367 /* Now we have a complete datagram, destined for this machine. */ 17368 u1 = IPH_HDR_LENGTH(ipha); 17369 switch (ipha->ipha_protocol) { 17370 case IPPROTO_ICMP: { 17371 ire_t *ire_zone; 17372 ilm_t *ilm; 17373 mblk_t *mp1; 17374 zoneid_t last_zoneid; 17375 ilm_walker_t ilw; 17376 17377 if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { 17378 ASSERT(ire->ire_type == IRE_BROADCAST); 17379 17380 /* 17381 * In the multicast case, applications may have joined 17382 * the group from different zones, so we need to deliver 17383 * the packet to each of them.
Loop through the 17384 * multicast membership structures (ilm) on the receive 17385 * ill and send a copy of the packet up each matching 17386 * one. However, we don't do this for multicasts sent on 17387 * the loopback interface (PHYI_LOOPBACK flag set) as 17388 * they must stay in the sender's zone. 17389 * 17390 * ilm_add_v6() ensures that ilms in the same zone are 17391 * contiguous in the ill_ilm list. We use this property 17392 * to avoid sending duplicates when two 17393 * applications in the same zone join the same group on 17394 * different logical interfaces: we ignore the ilm if 17395 * its zoneid is the same as the last matching one. 17396 * In addition, the sending of the packet for 17397 * ire_zoneid is delayed until all of the other ilms 17398 * have been exhausted. 17399 */ 17400 last_zoneid = -1; 17401 ilm = ilm_walker_start(&ilw, recv_ill); 17402 for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { 17403 if (ipha->ipha_dst != ilm->ilm_addr || 17404 ilm->ilm_zoneid == last_zoneid || 17405 ilm->ilm_zoneid == ire->ire_zoneid || 17406 ilm->ilm_zoneid == ALL_ZONES || 17407 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 17408 continue; 17409 mp1 = ip_copymsg(first_mp); 17410 if (mp1 == NULL) 17411 continue; 17412 icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, 17413 0, sum, mctl_present, B_TRUE, 17414 recv_ill, ilm->ilm_zoneid); 17415 last_zoneid = ilm->ilm_zoneid; 17416 } 17417 ilm_walker_finish(&ilw); 17418 } else if (ire->ire_type == IRE_BROADCAST) { 17419 /* 17420 * In the broadcast case, there may be many zones 17421 * which need a copy of the packet delivered to them. 17422 * There is one IRE_BROADCAST per broadcast address 17423 * and per zone; we walk those using a helper function. 17424 * In addition, the sending of the packet for ire is 17425 * delayed until all of the other ires have been 17426 * processed. 17427 */ 17428 IRB_REFHOLD(ire->ire_bucket); 17429 ire_zone = NULL; 17430 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 17431 ire)) != NULL) { 17432 mp1 = ip_copymsg(first_mp); 17433 if (mp1 == NULL) 17434 continue; 17435 17436 UPDATE_IB_PKT_COUNT(ire_zone); 17437 ire_zone->ire_last_used_time = lbolt; 17438 icmp_inbound(q, mp1, B_TRUE, ill, 17439 0, sum, mctl_present, B_TRUE, 17440 recv_ill, ire_zone->ire_zoneid); 17441 } 17442 IRB_REFRELE(ire->ire_bucket); 17443 } 17444 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 17445 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 17446 ire->ire_zoneid); 17447 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17448 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 17449 return; 17450 } 17451 case IPPROTO_IGMP: 17452 /* 17453 * If we are not willing to accept IGMP packets in clear, 17454 * then check with global policy. 17455 */ 17456 if (ipst->ips_igmp_accept_clear_messages == 0) { 17457 first_mp = ipsec_check_global_policy(first_mp, NULL, 17458 ipha, NULL, mctl_present, ipst->ips_netstack); 17459 if (first_mp == NULL) 17460 return; 17461 } 17462 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17463 freemsg(first_mp); 17464 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 17465 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17466 return; 17467 } 17468 if ((mp = igmp_input(q, mp, ill)) == NULL) { 17469 /* Bad packet - discarded by igmp_input */ 17470 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17471 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 17472 if (mctl_present) 17473 freeb(first_mp); 17474 return; 17475 } 17476 /* 17477 * igmp_input() may have returned the pulled up message.
17478 * So first_mp and ipha need to be reinitialized. 17479 */ 17480 ipha = (ipha_t *)mp->b_rptr; 17481 if (mctl_present) 17482 first_mp->b_cont = mp; 17483 else 17484 first_mp = mp; 17485 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17486 connf_head == NULL) { 17487 /* No user-level listener for IGMP packets */ 17488 goto drop_pkt; 17489 } 17490 /* deliver to local raw users */ 17491 break; 17492 case IPPROTO_PIM: 17493 /* 17494 * If we are not willing to accept PIM packets in clear, 17495 * then check with global policy. 17496 */ 17497 if (ipst->ips_pim_accept_clear_messages == 0) { 17498 first_mp = ipsec_check_global_policy(first_mp, NULL, 17499 ipha, NULL, mctl_present, ipst->ips_netstack); 17500 if (first_mp == NULL) 17501 return; 17502 } 17503 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17504 freemsg(first_mp); 17505 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 17506 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17507 return; 17508 } 17509 if (pim_input(q, mp, ill) != 0) { 17510 /* Bad packet - discarded by pim_input */ 17511 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17512 "ip_rput_locl_end: q %p (%S)", q, "pim"); 17513 if (mctl_present) 17514 freeb(first_mp); 17515 return; 17516 } 17517 17518 /* 17519 * pim_input() may have pulled up the message so ipha needs to 17520 * be reinitialized. 17521 */ 17522 ipha = (ipha_t *)mp->b_rptr; 17523 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17524 connf_head == NULL) { 17525 /* No user-level listener for PIM packets */ 17526 goto drop_pkt; 17527 } 17528 /* deliver to local raw users */ 17529 break; 17530 case IPPROTO_ENCAP: 17531 /* 17532 * Handle self-encapsulated packets (IP-in-IP where 17533 * the inner addresses == the outer addresses). 17534 */ 17535 hdr_length = IPH_HDR_LENGTH(ipha); 17536 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 17537 mp->b_wptr) { 17538 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 17539 sizeof (ipha_t) - mp->b_rptr)) { 17540 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17541 freemsg(first_mp); 17542 return; 17543 } 17544 ipha = (ipha_t *)mp->b_rptr; 17545 } 17546 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 17547 /* 17548 * Check the sanity of the inner IP header. 17549 */ 17550 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 17551 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17552 freemsg(first_mp); 17553 return; 17554 } 17555 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 17556 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17557 freemsg(first_mp); 17558 return; 17559 } 17560 if (inner_ipha->ipha_src == ipha->ipha_src && 17561 inner_ipha->ipha_dst == ipha->ipha_dst) { 17562 ipsec_in_t *ii; 17563 17564 /* 17565 * Self-encapsulated tunnel packet. Remove 17566 * the outer IP header and fanout again. 17567 * We also need to make sure that the inner 17568 * header is pulled up through the options. 17569 */ 17570 mp->b_rptr = (uchar_t *)inner_ipha; 17571 ipha = inner_ipha; 17572 hdr_length = IPH_HDR_LENGTH(ipha); 17573 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 17574 if (!pullupmsg(mp, (uchar_t *)ipha + 17575 hdr_length - mp->b_rptr)) { 17576 freemsg(first_mp); 17577 return; 17578 } 17579 ipha = (ipha_t *)mp->b_rptr; 17580 } 17581 if (hdr_length > sizeof (ipha_t)) { 17582 /* We got options on the inner packet. */ 17583 ipaddr_t dst = ipha->ipha_dst; 17584 17585 if (ip_rput_options(q, mp, ipha, &dst, ipst) == 17586 -1) { 17587 /* Bad options!
*/ 17588 return; 17589 } 17590 if (dst != ipha->ipha_dst) { 17591 /* 17592 * Someone put a source-route in 17593 * the inside header of a self- 17594 * encapsulated packet. Drop it 17595 * with extreme prejudice and let 17596 * the sender know. 17597 */ 17598 icmp_unreachable(q, first_mp, 17599 ICMP_SOURCE_ROUTE_FAILED, 17600 recv_ill->ill_zoneid, ipst); 17601 return; 17602 } 17603 } 17604 if (!mctl_present) { 17605 ASSERT(first_mp == mp); 17606 /* 17607 * This means that somebody is sending 17608 * self-encapsulated packets without AH/ESP. 17609 * If AH/ESP was present, we would have already 17610 * allocated the first_mp. 17611 * 17612 * Send this packet to find a tunnel endpoint. 17613 * If I can't find one, an ICMP 17614 * PROTOCOL_UNREACHABLE will get sent. 17615 */ 17616 goto fanout; 17617 } 17618 /* 17619 * We generally store the ill_index if we need to 17620 * do IPsec processing as we lose the ill queue when 17621 * we come back. But in this case, we never should 17622 * have to store the ill_index here as it should have 17623 * been stored previously when we processed the 17624 * AH/ESP header in this routine; for non-ipsec 17625 * cases, we still have the queue. But for some bad 17626 * packets from the wire, we can get to IPsec after 17627 * this, so we'd better store the index for that case. 17628 */ 17629 ill = (ill_t *)q->q_ptr; 17630 ii = (ipsec_in_t *)first_mp->b_rptr; 17631 ii->ipsec_in_ill_index = 17632 ill->ill_phyint->phyint_ifindex; 17633 ii->ipsec_in_rill_index = 17634 recv_ill->ill_phyint->phyint_ifindex; 17635 if (ii->ipsec_in_decaps) { 17636 /* 17637 * This packet is self-encapsulated multiple 17638 * times. We don't want to recurse infinitely. 17639 * To keep it simple, drop the packet. 17640 */ 17641 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17642 freemsg(first_mp); 17643 return; 17644 } 17645 ii->ipsec_in_decaps = B_TRUE; 17646 ip_fanout_proto_again(first_mp, recv_ill, recv_ill, 17647 ire); 17648 return; 17649 } 17650 break; 17651 case IPPROTO_AH: 17652 case IPPROTO_ESP: { 17653 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 17654 17655 /* 17656 * Fast path for AH/ESP. If this is the first time 17657 * we are sending a datagram to AH/ESP, allocate 17658 * an IPSEC_IN message and prepend it. Otherwise, 17659 * just fanout. 17660 */ 17661 17662 int ipsec_rc; 17663 ipsec_in_t *ii; 17664 netstack_t *ns = ipst->ips_netstack; 17665 17666 IP_STAT(ipst, ipsec_proto_ahesp); 17667 if (!mctl_present) { 17668 ASSERT(first_mp == mp); 17669 first_mp = ipsec_in_alloc(B_TRUE, ns); 17670 if (first_mp == NULL) { 17671 ip1dbg(("ip_proto_input: IPSEC_IN " 17672 "allocation failure.\n")); 17673 freemsg(hada_mp); /* okay if NULL */ 17674 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17675 freemsg(mp); 17676 return; 17677 } 17678 /* 17679 * Store the ill_index so that when we come back 17680 * from IPsec we ride on the same queue. 17681 */ 17682 ill = (ill_t *)q->q_ptr; 17683 ii = (ipsec_in_t *)first_mp->b_rptr; 17684 ii->ipsec_in_ill_index = 17685 ill->ill_phyint->phyint_ifindex; 17686 ii->ipsec_in_rill_index = 17687 recv_ill->ill_phyint->phyint_ifindex; 17688 first_mp->b_cont = mp; 17689 /* 17690 * Cache hardware acceleration info.
17691 */ 17692 if (hada_mp != NULL) { 17693 IPSECHW_DEBUG(IPSECHW_PKT, 17694 ("ip_rput_local: caching data attr.\n")); 17695 ii->ipsec_in_accelerated = B_TRUE; 17696 ii->ipsec_in_da = hada_mp; 17697 hada_mp = NULL; 17698 } 17699 } else { 17700 ii = (ipsec_in_t *)first_mp->b_rptr; 17701 } 17702 17703 ii->ipsec_in_esp_udp_ports = esp_udp_ports; 17704 17705 if (!ipsec_loaded(ipss)) { 17706 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 17707 ire->ire_zoneid, ipst); 17708 return; 17709 } 17710 17711 ns = ipst->ips_netstack; 17712 /* select inbound SA and have IPsec process the pkt */ 17713 if (ipha->ipha_protocol == IPPROTO_ESP) { 17714 esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); 17715 boolean_t esp_in_udp_sa; 17716 if (esph == NULL) 17717 return; 17718 ASSERT(ii->ipsec_in_esp_sa != NULL); 17719 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 17720 esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & 17721 IPSA_F_NATT) != 0); 17722 /* 17723 * The following is a fancy, but quick, way of saying: 17724 * ESP-in-UDP SA and Raw ESP packet --> drop 17725 * OR 17726 * ESP SA and ESP-in-UDP packet --> drop 17727 */ 17728 if (esp_in_udp_sa != esp_in_udp_packet) { 17729 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17730 ip_drop_packet(first_mp, B_TRUE, ill, NULL, 17731 DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), 17732 &ns->netstack_ipsec->ipsec_dropper); 17733 return; 17734 } 17735 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 17736 first_mp, esph); 17737 } else { 17738 ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); 17739 if (ah == NULL) 17740 return; 17741 ASSERT(ii->ipsec_in_ah_sa != NULL); 17742 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 17743 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 17744 first_mp, ah); 17745 } 17746 17747 switch (ipsec_rc) { 17748 case IPSEC_STATUS_SUCCESS: 17749 break; 17750 case IPSEC_STATUS_FAILED: 17751 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17752 /* FALLTHRU */ 17753 case IPSEC_STATUS_PENDING: 17754 return; 17755 } 17756 /* we're done with IPsec processing, send it up */ 17757 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 17758 return; 17759 } 17760 default: 17761 break; 17762 } 17763 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 17764 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 17765 ire->ire_zoneid)); 17766 goto drop_pkt; 17767 } 17768 /* 17769 * Handle protocols with which IP is less intimate. There 17770 * can be more than one stream bound to a particular 17771 * protocol. When this is the case, each one gets a copy 17772 * of any incoming packets. 17773 */ 17774 fanout: 17775 ip_fanout_proto(q, first_mp, ill, ipha, 17776 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, 17777 B_TRUE, recv_ill, ire->ire_zoneid); 17778 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17779 "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); 17780 return; 17781 17782 drop_pkt: 17783 freemsg(first_mp); 17784 if (hada_mp != NULL) 17785 freeb(hada_mp); 17786 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17787 "ip_rput_locl_end: q %p (%S)", q, "droppkt"); 17788 #undef rptr 17789 #undef iphs 17790 17791 } 17792 17793 /* 17794 * Update any source route, record route or timestamp options. 17795 * Check that we are at end of strict source route. 17796 * The options have already been checked for sanity in ip_rput_options(). 
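 *
 * For reference, the timestamp option handled below is laid out as
 * { type, len, ptr, oflw/flg, data... }: the low nibble of the
 * oflw/flg octet selects the format (TSONLY stores 4-byte stamps;
 * TSANDADDR and the PRESPEC variants store 8-byte address+stamp
 * pairs) and the high nibble counts hosts that found no room. The
 * stamp is milliseconds since midnight UT per RFC 791, so e.g.
 * 01:00:00.250 UT is recorded as 3600 * 1000 + 250 = 3600250.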
17797 */ 17798 static boolean_t 17799 ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17800 ip_stack_t *ipst) 17801 { 17802 ipoptp_t opts; 17803 uchar_t *opt; 17804 uint8_t optval; 17805 uint8_t optlen; 17806 ipaddr_t dst; 17807 uint32_t ts; 17808 ire_t *dst_ire; 17809 timestruc_t now; 17810 zoneid_t zoneid; 17811 ill_t *ill; 17812 17813 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17814 17815 ip2dbg(("ip_rput_local_options\n")); 17816 17817 for (optval = ipoptp_first(&opts, ipha); 17818 optval != IPOPT_EOL; 17819 optval = ipoptp_next(&opts)) { 17820 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 17821 opt = opts.ipoptp_cur; 17822 optlen = opts.ipoptp_len; 17823 ip2dbg(("ip_rput_local_options: opt %d, len %d\n", 17824 optval, optlen)); 17825 switch (optval) { 17826 uint32_t off; 17827 case IPOPT_SSRR: 17828 case IPOPT_LSRR: 17829 off = opt[IPOPT_OFFSET]; 17830 off--; 17831 if (optlen < IP_ADDR_LEN || 17832 off > optlen - IP_ADDR_LEN) { 17833 /* End of source route */ 17834 ip1dbg(("ip_rput_local_options: end of SR\n")); 17835 break; 17836 } 17837 /* 17838 * This will only happen if two consecutive entries 17839 * in the source route contain our address or if 17840 * it is a packet with a loose source route which 17841 * reaches us before consuming the whole source route. 17842 */ 17843 ip1dbg(("ip_rput_local_options: not end of SR\n")); 17844 if (optval == IPOPT_SSRR) { 17845 goto bad_src_route; 17846 } 17847 /* 17848 * Hack: instead of dropping the packet truncate the 17849 * source route to what has been used by filling the 17850 * rest with IPOPT_NOP. 17851 */ 17852 opt[IPOPT_OLEN] = (uint8_t)off; 17853 while (off < optlen) { 17854 opt[off++] = IPOPT_NOP; 17855 } 17856 break; 17857 case IPOPT_RR: 17858 off = opt[IPOPT_OFFSET]; 17859 off--; 17860 if (optlen < IP_ADDR_LEN || 17861 off > optlen - IP_ADDR_LEN) { 17862 /* No more room - ignore */ 17863 ip1dbg(( 17864 "ip_rput_local_options: end of RR\n")); 17865 break; 17866 } 17867 bcopy(&ire->ire_src_addr, (char *)opt + off, 17868 IP_ADDR_LEN); 17869 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17870 break; 17871 case IPOPT_TS: 17872 /* Insert timestamp if there is room */ 17873 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17874 case IPOPT_TS_TSONLY: 17875 off = IPOPT_TS_TIMELEN; 17876 break; 17877 case IPOPT_TS_PRESPEC: 17878 case IPOPT_TS_PRESPEC_RFC791: 17879 /* Verify that the address matched */ 17880 off = opt[IPOPT_OFFSET] - 1; 17881 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17882 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 17883 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 17884 ipst); 17885 if (dst_ire == NULL) { 17886 /* Not for us */ 17887 break; 17888 } 17889 ire_refrele(dst_ire); 17890 /* FALLTHRU */ 17891 case IPOPT_TS_TSANDADDR: 17892 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 17893 break; 17894 default: 17895 /* 17896 * ip_*put_options should have already 17897 * dropped this packet.
17898 */ 17899 cmn_err(CE_PANIC, "ip_rput_local_options: " 17900 "unknown IT - bug in ip_rput_options?\n"); 17901 return (B_TRUE); /* Keep "lint" happy */ 17902 } 17903 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 17904 /* Increase overflow counter */ 17905 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 17906 opt[IPOPT_POS_OV_FLG] = 17907 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 17908 (off << 4)); 17909 break; 17910 } 17911 off = opt[IPOPT_OFFSET] - 1; 17912 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17913 case IPOPT_TS_PRESPEC: 17914 case IPOPT_TS_PRESPEC_RFC791: 17915 case IPOPT_TS_TSANDADDR: 17916 bcopy(&ire->ire_src_addr, (char *)opt + off, 17917 IP_ADDR_LEN); 17918 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17919 /* FALLTHRU */ 17920 case IPOPT_TS_TSONLY: 17921 off = opt[IPOPT_OFFSET] - 1; 17922 /* Compute # of milliseconds since midnight */ 17923 gethrestime(&now); 17924 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 17925 now.tv_nsec / (NANOSEC / MILLISEC); 17926 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 17927 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 17928 break; 17929 } 17930 break; 17931 } 17932 } 17933 return (B_TRUE); 17934 17935 bad_src_route: 17936 q = WR(q); 17937 if (q->q_next != NULL) 17938 ill = q->q_ptr; 17939 else 17940 ill = NULL; 17941 17942 /* make sure we clear any indication of a hardware checksum */ 17943 DB_CKSUMFLAGS(mp) = 0; 17944 zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); 17945 if (zoneid == ALL_ZONES) 17946 freemsg(mp); 17947 else 17948 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 17949 return (B_FALSE); 17950 17951 } 17952 17953 /* 17954 * Process IP options in an inbound packet. If an option affects the 17955 * effective destination address, return the next hop address via dstp. 17956 * Returns -1 if something fails, in which case an ICMP error has been sent 17957 * and mp freed. 17958 */ 17959 static int 17960 ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, 17961 ip_stack_t *ipst) 17962 { 17963 ipoptp_t opts; 17964 uchar_t *opt; 17965 uint8_t optval; 17966 uint8_t optlen; 17967 ipaddr_t dst; 17968 intptr_t code = 0; 17969 ire_t *ire = NULL; 17970 zoneid_t zoneid; 17971 ill_t *ill; 17972 17973 ip2dbg(("ip_rput_options\n")); 17974 dst = ipha->ipha_dst; 17975 for (optval = ipoptp_first(&opts, ipha); 17976 optval != IPOPT_EOL; 17977 optval = ipoptp_next(&opts)) { 17978 opt = opts.ipoptp_cur; 17979 optlen = opts.ipoptp_len; 17980 ip2dbg(("ip_rput_options: opt %d, len %d\n", 17981 optval, optlen)); 17982 /* 17983 * Note: we need to verify the checksum before we 17984 * modify anything; thus this routine only extracts the next 17985 * hop dst from any source route.
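 *
 * Condensed sketch of how callers in this file consume that
 * contract (see e.g. the IPPROTO_ENCAP handling above):
 *
 *	ipaddr_t dst = ipha->ipha_dst;
 *
 *	if (ip_rput_options(q, mp, ipha, &dst, ipst) == -1)
 *		return;		<- ICMP error sent, mp consumed
 *	if (dst != ipha->ipha_dst)
 *		... next hop comes from a source route ...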
17986 */ 17987 switch (optval) { 17988 uint32_t off; 17989 case IPOPT_SSRR: 17990 case IPOPT_LSRR: 17991 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 17992 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 17993 if (ire == NULL) { 17994 if (optval == IPOPT_SSRR) { 17995 ip1dbg(("ip_rput_options: not next" 17996 " strict source route 0x%x\n", 17997 ntohl(dst))); 17998 code = (char *)&ipha->ipha_dst - 17999 (char *)ipha; 18000 goto param_prob; /* RouterReq's */ 18001 } 18002 ip2dbg(("ip_rput_options: " 18003 "not next source route 0x%x\n", 18004 ntohl(dst))); 18005 break; 18006 } 18007 ire_refrele(ire); 18008 18009 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18010 ip1dbg(( 18011 "ip_rput_options: bad option offset\n")); 18012 code = (char *)&opt[IPOPT_OLEN] - 18013 (char *)ipha; 18014 goto param_prob; 18015 } 18016 off = opt[IPOPT_OFFSET]; 18017 off--; 18018 redo_srr: 18019 if (optlen < IP_ADDR_LEN || 18020 off > optlen - IP_ADDR_LEN) { 18021 /* End of source route */ 18022 ip1dbg(("ip_rput_options: end of SR\n")); 18023 break; 18024 } 18025 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 18026 ip1dbg(("ip_rput_options: next hop 0x%x\n", 18027 ntohl(dst))); 18028 18029 /* 18030 * Check if our address is present more than 18031 * once as consecutive hops in source route. 18032 * XXX verify per-interface ip_forwarding 18033 * for source route? 18034 */ 18035 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 18036 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 18037 18038 if (ire != NULL) { 18039 ire_refrele(ire); 18040 off += IP_ADDR_LEN; 18041 goto redo_srr; 18042 } 18043 18044 if (dst == htonl(INADDR_LOOPBACK)) { 18045 ip1dbg(("ip_rput_options: loopback addr in " 18046 "source route!\n")); 18047 goto bad_src_route; 18048 } 18049 /* 18050 * For strict: verify that dst is directly 18051 * reachable. 18052 */ 18053 if (optval == IPOPT_SSRR) { 18054 ire = ire_ftable_lookup(dst, 0, 0, 18055 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 18056 msg_getlabel(mp), 18057 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 18058 if (ire == NULL) { 18059 ip1dbg(("ip_rput_options: SSRR not " 18060 "directly reachable: 0x%x\n", 18061 ntohl(dst))); 18062 goto bad_src_route; 18063 } 18064 ire_refrele(ire); 18065 } 18066 /* 18067 * Defer update of the offset and the record route 18068 * until the packet is forwarded. 18069 */ 18070 break; 18071 case IPOPT_RR: 18072 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18073 ip1dbg(( 18074 "ip_rput_options: bad option offset\n")); 18075 code = (char *)&opt[IPOPT_OLEN] - 18076 (char *)ipha; 18077 goto param_prob; 18078 } 18079 break; 18080 case IPOPT_TS: 18081 /* 18082 * Verify that length >= 5 and that there is either 18083 * room for another timestamp or that the overflow 18084 * counter is not maxed out. 
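 *
 * Worked example: the overflow count lives in the high nibble of
 * opt[IPOPT_POS_OV_FLG], so 15 is its ceiling. A TSONLY option with
 * len 8 and ptr 9 has no room for another 4-byte stamp; such a
 * packet stays acceptable while the nibble is below 0xF0, and only
 * "no room and nibble already 0xF0" is flagged as a parameter
 * problem below.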
18085 */ 18086 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 18087 if (optlen < IPOPT_MINLEN_IT) { 18088 goto param_prob; 18089 } 18090 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18091 ip1dbg(( 18092 "ip_rput_options: bad option offset\n")); 18093 code = (char *)&opt[IPOPT_OFFSET] - 18094 (char *)ipha; 18095 goto param_prob; 18096 } 18097 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 18098 case IPOPT_TS_TSONLY: 18099 off = IPOPT_TS_TIMELEN; 18100 break; 18101 case IPOPT_TS_TSANDADDR: 18102 case IPOPT_TS_PRESPEC: 18103 case IPOPT_TS_PRESPEC_RFC791: 18104 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 18105 break; 18106 default: 18107 code = (char *)&opt[IPOPT_POS_OV_FLG] - 18108 (char *)ipha; 18109 goto param_prob; 18110 } 18111 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 18112 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 18113 /* 18114 * No room and the overflow counter is 15 18115 * already. 18116 */ 18117 goto param_prob; 18118 } 18119 break; 18120 } 18121 } 18122 18123 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { 18124 *dstp = dst; 18125 return (0); 18126 } 18127 18128 ip1dbg(("ip_rput_options: error processing IP options.")); 18129 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 18130 18131 param_prob: 18132 q = WR(q); 18133 if (q->q_next != NULL) 18134 ill = q->q_ptr; 18135 else 18136 ill = NULL; 18137 18138 /* make sure we clear any indication of a hardware checksum */ 18139 DB_CKSUMFLAGS(mp) = 0; 18140 /* Don't know whether this is for non-global or global/forwarding */ 18141 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18142 if (zoneid == ALL_ZONES) 18143 freemsg(mp); 18144 else 18145 icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst); 18146 return (-1); 18147 18148 bad_src_route: 18149 q = WR(q); 18150 if (q->q_next != NULL) 18151 ill = q->q_ptr; 18152 else 18153 ill = NULL; 18154 18155 /* make sure we clear any indication of a hardware checksum */ 18156 DB_CKSUMFLAGS(mp) = 0; 18157 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18158 if (zoneid == ALL_ZONES) 18159 freemsg(mp); 18160 else 18161 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 18162 return (-1); 18163 } 18164 18165 /* 18166 * IP & ICMP info in >=14 msg's ... 18167 * - ip fixed part (mib2_ip_t) 18168 * - icmp fixed part (mib2_icmp_t) 18169 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 18170 * - ipRouteEntryTable (ip 21) all IPv4 IREs 18171 * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] 18172 * - ipRouteAttributeTable (ip 102) labeled routes 18173 * - ip multicast membership (ip_member_t) 18174 * - ip multicast source filtering (ip_grpsrc_t) 18175 * - igmp fixed part (struct igmpstat) 18176 * - multicast routing stats (struct mrtstat) 18177 * - multicast routing vifs (array of struct vifctl) 18178 * - multicast routing routes (array of struct mfcctl) 18179 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 18180 * One per ill plus one generic 18181 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 18182 * One per ill plus one generic 18183 * - ipv6RouteEntry all IPv6 IREs 18184 * - ipv6RouteAttributeTable (ip6 102) labeled routes 18185 * - ipv6NetToMediaEntry all Neighbor Cache entries 18186 * - ipv6AddrEntry all IPv6 ipifs 18187 * - ipv6 multicast membership (ipv6_member_t) 18188 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 18189 * 18190 * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. 18191 * 18192 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 18193 * already filled in by the caller. 
18194 * Return value of 0 indicates that no messages were sent and caller 18195 * should free mpctl. 18196 */ 18197 int 18198 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) 18199 { 18200 ip_stack_t *ipst; 18201 sctp_stack_t *sctps; 18202 18203 if (q->q_next != NULL) { 18204 ipst = ILLQ_TO_IPST(q); 18205 } else { 18206 ipst = CONNQ_TO_IPST(q); 18207 } 18208 ASSERT(ipst != NULL); 18209 sctps = ipst->ips_netstack->netstack_sctp; 18210 18211 if (mpctl == NULL || mpctl->b_cont == NULL) { 18212 return (0); 18213 } 18214 18215 /* 18216 * For the purposes of the (broken) packet shell use 18217 * of the level we make sure MIB2_TCP/MIB2_UDP can be used 18218 * to make TCP and UDP appear first in the list of mib items. 18219 * TBD: We could expand this and use it in netstat so that 18220 * the kernel doesn't have to produce large tables (connections, 18221 * routes, etc) when netstat only wants the statistics or a particular 18222 * table. 18223 */ 18224 if (!(level == MIB2_TCP || level == MIB2_UDP)) { 18225 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) { 18226 return (1); 18227 } 18228 } 18229 18230 if (level != MIB2_TCP) { 18231 if ((mpctl = udp_snmp_get(q, mpctl)) == NULL) { 18232 return (1); 18233 } 18234 } 18235 18236 if (level != MIB2_UDP) { 18237 if ((mpctl = tcp_snmp_get(q, mpctl)) == NULL) { 18238 return (1); 18239 } 18240 } 18241 18242 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl, 18243 ipst)) == NULL) { 18244 return (1); 18245 } 18246 18247 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst)) == NULL) { 18248 return (1); 18249 } 18250 18251 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) { 18252 return (1); 18253 } 18254 18255 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) { 18256 return (1); 18257 } 18258 18259 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) { 18260 return (1); 18261 } 18262 18263 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) { 18264 return (1); 18265 } 18266 18267 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst)) == NULL) { 18268 return (1); 18269 } 18270 18271 if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst)) == NULL) { 18272 return (1); 18273 } 18274 18275 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) { 18276 return (1); 18277 } 18278 18279 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) { 18280 return (1); 18281 } 18282 18283 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) { 18284 return (1); 18285 } 18286 18287 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) { 18288 return (1); 18289 } 18290 18291 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) { 18292 return (1); 18293 } 18294 18295 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) { 18296 return (1); 18297 } 18298 18299 mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst); 18300 if (mpctl == NULL) 18301 return (1); 18302 18303 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst); 18304 if (mpctl == NULL) 18305 return (1); 18306 18307 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { 18308 return (1); 18309 } 18310 freemsg(mpctl); 18311 return (1); 18312 } 18313 18314 /* Get global (legacy) IPv4 statistics */ 18315 static mblk_t * 18316 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, 18317 ip_stack_t *ipst) 18318 { 18319 mib2_ip_t old_ip_mib; 18320 struct opthdr *optp; 18321 mblk_t *mp2ctl; 18322 18323 /* 18324 * make a copy of the original message 
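 *
 * (Every ip_snmp_get_mib2_* handler follows this same pattern: copy
 * the T_OPTMGMT_ACK control message first, append its own MIB
 * payload to the original with snmp_append_data(), qreply() that
 * upstream, and hand the fresh copy back so the next handler in the
 * ip_snmp_get() chain can repeat the dance; a NULL return means the
 * copymsg() failed and the chain stops.)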
18325 */ 18326 mp2ctl = copymsg(mpctl); 18327 18328 /* fixed length IP structure... */ 18329 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18330 optp->level = MIB2_IP; 18331 optp->name = 0; 18332 SET_MIB(old_ip_mib.ipForwarding, 18333 (WE_ARE_FORWARDING(ipst) ? 1 : 2)); 18334 SET_MIB(old_ip_mib.ipDefaultTTL, 18335 (uint32_t)ipst->ips_ip_def_ttl); 18336 SET_MIB(old_ip_mib.ipReasmTimeout, 18337 ipst->ips_ip_g_frag_timeout); 18338 SET_MIB(old_ip_mib.ipAddrEntrySize, 18339 sizeof (mib2_ipAddrEntry_t)); 18340 SET_MIB(old_ip_mib.ipRouteEntrySize, 18341 sizeof (mib2_ipRouteEntry_t)); 18342 SET_MIB(old_ip_mib.ipNetToMediaEntrySize, 18343 sizeof (mib2_ipNetToMediaEntry_t)); 18344 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t)); 18345 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t)); 18346 SET_MIB(old_ip_mib.ipRouteAttributeSize, 18347 sizeof (mib2_ipAttributeEntry_t)); 18348 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); 18349 18350 /* 18351 * Grab the statistics from the new IP MIB 18352 */ 18353 SET_MIB(old_ip_mib.ipInReceives, 18354 (uint32_t)ipmib->ipIfStatsHCInReceives); 18355 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors); 18356 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors); 18357 SET_MIB(old_ip_mib.ipForwDatagrams, 18358 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams); 18359 SET_MIB(old_ip_mib.ipInUnknownProtos, 18360 ipmib->ipIfStatsInUnknownProtos); 18361 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards); 18362 SET_MIB(old_ip_mib.ipInDelivers, 18363 (uint32_t)ipmib->ipIfStatsHCInDelivers); 18364 SET_MIB(old_ip_mib.ipOutRequests, 18365 (uint32_t)ipmib->ipIfStatsHCOutRequests); 18366 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards); 18367 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes); 18368 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds); 18369 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs); 18370 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails); 18371 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs); 18372 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails); 18373 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates); 18374 18375 /* ipRoutingDiscards is not being used */ 18376 SET_MIB(old_ip_mib.ipRoutingDiscards, 0); 18377 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs); 18378 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts); 18379 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs); 18380 SET_MIB(old_ip_mib.ipReasmDuplicates, 18381 ipmib->ipIfStatsReasmDuplicates); 18382 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups); 18383 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits); 18384 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs); 18385 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows); 18386 SET_MIB(old_ip_mib.rawipInOverflows, 18387 ipmib->rawipIfStatsInOverflows); 18388 18389 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded); 18390 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed); 18391 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion); 18392 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion); 18393 SET_MIB(old_ip_mib.ipOutSwitchIPv6, 18394 ipmib->ipIfStatsOutSwitchIPVersion); 18395 18396 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib, 18397 (int)sizeof (old_ip_mib))) { 18398 ip1dbg(("ip_snmp_get_mib2_ip: 
failed to allocate %u bytes\n", 18399 (uint_t)sizeof (old_ip_mib))); 18400 } 18401 18402 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18403 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n", 18404 (int)optp->level, (int)optp->name, (int)optp->len)); 18405 qreply(q, mpctl); 18406 return (mp2ctl); 18407 } 18408 18409 /* Per interface IPv4 statistics */ 18410 static mblk_t * 18411 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18412 { 18413 struct opthdr *optp; 18414 mblk_t *mp2ctl; 18415 ill_t *ill; 18416 ill_walk_context_t ctx; 18417 mblk_t *mp_tail = NULL; 18418 mib2_ipIfStatsEntry_t global_ip_mib; 18419 18420 /* 18421 * Make a copy of the original message 18422 */ 18423 mp2ctl = copymsg(mpctl); 18424 18425 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18426 optp->level = MIB2_IP; 18427 optp->name = MIB2_IP_TRAFFIC_STATS; 18428 /* Include "unknown interface" ip_mib */ 18429 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 18430 ipst->ips_ip_mib.ipIfStatsIfIndex = 18431 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 18432 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding, 18433 (ipst->ips_ip_g_forward ? 1 : 2)); 18434 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL, 18435 (uint32_t)ipst->ips_ip_def_ttl); 18436 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize, 18437 sizeof (mib2_ipIfStatsEntry_t)); 18438 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize, 18439 sizeof (mib2_ipAddrEntry_t)); 18440 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize, 18441 sizeof (mib2_ipRouteEntry_t)); 18442 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize, 18443 sizeof (mib2_ipNetToMediaEntry_t)); 18444 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize, 18445 sizeof (ip_member_t)); 18446 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize, 18447 sizeof (ip_grpsrc_t)); 18448 18449 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18450 (char *)&ipst->ips_ip_mib, (int)sizeof (ipst->ips_ip_mib))) { 18451 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18452 "failed to allocate %u bytes\n", 18453 (uint_t)sizeof (ipst->ips_ip_mib))); 18454 } 18455 18456 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib)); 18457 18458 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18459 ill = ILL_START_WALK_V4(&ctx, ipst); 18460 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18461 ill->ill_ip_mib->ipIfStatsIfIndex = 18462 ill->ill_phyint->phyint_ifindex; 18463 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 18464 (ipst->ips_ip_g_forward ? 
1 : 2)); 18465 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL, 18466 (uint32_t)ipst->ips_ip_def_ttl); 18467 18468 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib); 18469 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18470 (char *)ill->ill_ip_mib, 18471 (int)sizeof (*ill->ill_ip_mib))) { 18472 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18473 "failed to allocate %u bytes\n", 18474 (uint_t)sizeof (*ill->ill_ip_mib))); 18475 } 18476 } 18477 rw_exit(&ipst->ips_ill_g_lock); 18478 18479 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18480 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18481 "level %d, name %d, len %d\n", 18482 (int)optp->level, (int)optp->name, (int)optp->len)); 18483 qreply(q, mpctl); 18484 18485 if (mp2ctl == NULL) 18486 return (NULL); 18487 18488 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst)); 18489 } 18490 18491 /* Global IPv4 ICMP statistics */ 18492 static mblk_t * 18493 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18494 { 18495 struct opthdr *optp; 18496 mblk_t *mp2ctl; 18497 18498 /* 18499 * Make a copy of the original message 18500 */ 18501 mp2ctl = copymsg(mpctl); 18502 18503 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18504 optp->level = MIB2_ICMP; 18505 optp->name = 0; 18506 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib, 18507 (int)sizeof (ipst->ips_icmp_mib))) { 18508 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n", 18509 (uint_t)sizeof (ipst->ips_icmp_mib))); 18510 } 18511 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18512 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n", 18513 (int)optp->level, (int)optp->name, (int)optp->len)); 18514 qreply(q, mpctl); 18515 return (mp2ctl); 18516 } 18517 18518 /* Global IPv4 IGMP statistics */ 18519 static mblk_t * 18520 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18521 { 18522 struct opthdr *optp; 18523 mblk_t *mp2ctl; 18524 18525 /* 18526 * make a copy of the original message 18527 */ 18528 mp2ctl = copymsg(mpctl); 18529 18530 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18531 optp->level = EXPER_IGMP; 18532 optp->name = 0; 18533 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat, 18534 (int)sizeof (ipst->ips_igmpstat))) { 18535 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n", 18536 (uint_t)sizeof (ipst->ips_igmpstat))); 18537 } 18538 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18539 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n", 18540 (int)optp->level, (int)optp->name, (int)optp->len)); 18541 qreply(q, mpctl); 18542 return (mp2ctl); 18543 } 18544 18545 /* Global IPv4 Multicast Routing statistics */ 18546 static mblk_t * 18547 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18548 { 18549 struct opthdr *optp; 18550 mblk_t *mp2ctl; 18551 18552 /* 18553 * make a copy of the original message 18554 */ 18555 mp2ctl = copymsg(mpctl); 18556 18557 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18558 optp->level = EXPER_DVMRP; 18559 optp->name = 0; 18560 if (!ip_mroute_stats(mpctl->b_cont, ipst)) { 18561 ip0dbg(("ip_mroute_stats: failed\n")); 18562 } 18563 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18564 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n", 18565 (int)optp->level, (int)optp->name, (int)optp->len)); 18566 qreply(q, mpctl); 18567 return (mp2ctl); 18568 } 18569 18570 /* IPv4 address information */ 18571 static mblk_t * 18572 
ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18573 { 18574 struct opthdr *optp; 18575 mblk_t *mp2ctl; 18576 mblk_t *mp_tail = NULL; 18577 ill_t *ill; 18578 ipif_t *ipif; 18579 uint_t bitval; 18580 mib2_ipAddrEntry_t mae; 18581 zoneid_t zoneid; 18582 ill_walk_context_t ctx; 18583 18584 /* 18585 * make a copy of the original message 18586 */ 18587 mp2ctl = copymsg(mpctl); 18588 18589 /* ipAddrEntryTable */ 18590 18591 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18592 optp->level = MIB2_IP; 18593 optp->name = MIB2_IP_ADDR; 18594 zoneid = Q_TO_CONN(q)->conn_zoneid; 18595 18596 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18597 ill = ILL_START_WALK_V4(&ctx, ipst); 18598 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18599 for (ipif = ill->ill_ipif; ipif != NULL; 18600 ipif = ipif->ipif_next) { 18601 if (ipif->ipif_zoneid != zoneid && 18602 ipif->ipif_zoneid != ALL_ZONES) 18603 continue; 18604 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18605 mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18606 mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18607 18608 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, 18609 OCTET_LENGTH); 18610 mae.ipAdEntIfIndex.o_length = 18611 mi_strlen(mae.ipAdEntIfIndex.o_bytes); 18612 mae.ipAdEntAddr = ipif->ipif_lcl_addr; 18613 mae.ipAdEntNetMask = ipif->ipif_net_mask; 18614 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; 18615 mae.ipAdEntInfo.ae_subnet_len = 18616 ip_mask_to_plen(ipif->ipif_net_mask); 18617 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; 18618 for (bitval = 1; 18619 bitval && 18620 !(bitval & ipif->ipif_brd_addr); 18621 bitval <<= 1) 18622 noop; 18623 mae.ipAdEntBcastAddr = bitval; 18624 mae.ipAdEntReasmMaxSize = IP_MAXPACKET; 18625 mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; 18626 mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; 18627 mae.ipAdEntInfo.ae_broadcast_addr = 18628 ipif->ipif_brd_addr; 18629 mae.ipAdEntInfo.ae_pp_dst_addr = 18630 ipif->ipif_pp_dst_addr; 18631 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | 18632 ill->ill_flags | ill->ill_phyint->phyint_flags; 18633 mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; 18634 18635 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18636 (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { 18637 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to " 18638 "allocate %u bytes\n", 18639 (uint_t)sizeof (mib2_ipAddrEntry_t))); 18640 } 18641 } 18642 } 18643 rw_exit(&ipst->ips_ill_g_lock); 18644 18645 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18646 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n", 18647 (int)optp->level, (int)optp->name, (int)optp->len)); 18648 qreply(q, mpctl); 18649 return (mp2ctl); 18650 } 18651 18652 /* IPv6 address information */ 18653 static mblk_t * 18654 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18655 { 18656 struct opthdr *optp; 18657 mblk_t *mp2ctl; 18658 mblk_t *mp_tail = NULL; 18659 ill_t *ill; 18660 ipif_t *ipif; 18661 mib2_ipv6AddrEntry_t mae6; 18662 zoneid_t zoneid; 18663 ill_walk_context_t ctx; 18664 18665 /* 18666 * make a copy of the original message 18667 */ 18668 mp2ctl = copymsg(mpctl); 18669 18670 /* ipv6AddrEntryTable */ 18671 18672 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18673 optp->level = MIB2_IP6; 18674 optp->name = MIB2_IP6_ADDR; 18675 zoneid = Q_TO_CONN(q)->conn_zoneid; 18676 18677 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18678 ill = ILL_START_WALK_V6(&ctx, ipst); 18679 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 
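/*
 * Walk every logical interface (ipif) hanging off this ill; only
 * addresses visible to the caller's zone (or plumbed in ALL_ZONES)
 * are reported.
 */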
18680 for (ipif = ill->ill_ipif; ipif != NULL; 18681 ipif = ipif->ipif_next) { 18682 if (ipif->ipif_zoneid != zoneid && 18683 ipif->ipif_zoneid != ALL_ZONES) 18684 continue; 18685 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18686 mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18687 mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18688 18689 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, 18690 OCTET_LENGTH); 18691 mae6.ipv6AddrIfIndex.o_length = 18692 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes); 18693 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr; 18694 mae6.ipv6AddrPfxLength = 18695 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 18696 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; 18697 mae6.ipv6AddrInfo.ae_subnet_len = 18698 mae6.ipv6AddrPfxLength; 18699 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; 18700 18701 /* Type: stateless(1), stateful(2), unknown(3) */ 18702 if (ipif->ipif_flags & IPIF_ADDRCONF) 18703 mae6.ipv6AddrType = 1; 18704 else 18705 mae6.ipv6AddrType = 2; 18706 /* Anycast: true(1), false(2) */ 18707 if (ipif->ipif_flags & IPIF_ANYCAST) 18708 mae6.ipv6AddrAnycastFlag = 1; 18709 else 18710 mae6.ipv6AddrAnycastFlag = 2; 18711 18712 /* 18713 * Address status: preferred(1), deprecated(2), 18714 * invalid(3), inaccessible(4), unknown(5) 18715 */ 18716 if (ipif->ipif_flags & IPIF_NOLOCAL) 18717 mae6.ipv6AddrStatus = 3; 18718 else if (ipif->ipif_flags & IPIF_DEPRECATED) 18719 mae6.ipv6AddrStatus = 2; 18720 else 18721 mae6.ipv6AddrStatus = 1; 18722 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; 18723 mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; 18724 mae6.ipv6AddrInfo.ae_pp_dst_addr = 18725 ipif->ipif_v6pp_dst_addr; 18726 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags | 18727 ill->ill_flags | ill->ill_phyint->phyint_flags; 18728 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET; 18729 mae6.ipv6AddrIdentifier = ill->ill_token; 18730 mae6.ipv6AddrIdentifierLen = ill->ill_token_length; 18731 mae6.ipv6AddrReachableTime = ill->ill_reachable_time; 18732 mae6.ipv6AddrRetransmitTime = 18733 ill->ill_reachable_retrans_time; 18734 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18735 (char *)&mae6, 18736 (int)sizeof (mib2_ipv6AddrEntry_t))) { 18737 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to " 18738 "allocate %u bytes\n", 18739 (uint_t)sizeof (mib2_ipv6AddrEntry_t))); 18740 } 18741 } 18742 } 18743 rw_exit(&ipst->ips_ill_g_lock); 18744 18745 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18746 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n", 18747 (int)optp->level, (int)optp->name, (int)optp->len)); 18748 qreply(q, mpctl); 18749 return (mp2ctl); 18750 } 18751 18752 /* IPv4 multicast group membership. 
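One ip_member_t is emitted per (ipif, group) pair found by walking the ilms of each IPv4 ill; ills that sit under an IPMP group are skipped.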
*/ 18753 static mblk_t * 18754 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18755 { 18756 struct opthdr *optp; 18757 mblk_t *mp2ctl; 18758 ill_t *ill; 18759 ipif_t *ipif; 18760 ilm_t *ilm; 18761 ip_member_t ipm; 18762 mblk_t *mp_tail = NULL; 18763 ill_walk_context_t ctx; 18764 zoneid_t zoneid; 18765 ilm_walker_t ilw; 18766 18767 /* 18768 * make a copy of the original message 18769 */ 18770 mp2ctl = copymsg(mpctl); 18771 zoneid = Q_TO_CONN(q)->conn_zoneid; 18772 18773 /* ipGroupMember table */ 18774 optp = (struct opthdr *)&mpctl->b_rptr[ 18775 sizeof (struct T_optmgmt_ack)]; 18776 optp->level = MIB2_IP; 18777 optp->name = EXPER_IP_GROUP_MEMBERSHIP; 18778 18779 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18780 ill = ILL_START_WALK_V4(&ctx, ipst); 18781 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18782 if (IS_UNDER_IPMP(ill)) 18783 continue; 18784 18785 ilm = ilm_walker_start(&ilw, ill); 18786 for (ipif = ill->ill_ipif; ipif != NULL; 18787 ipif = ipif->ipif_next) { 18788 if (ipif->ipif_zoneid != zoneid && 18789 ipif->ipif_zoneid != ALL_ZONES) 18790 continue; /* not this zone */ 18791 ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, 18792 OCTET_LENGTH); 18793 ipm.ipGroupMemberIfIndex.o_length = 18794 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); 18795 for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { 18796 ASSERT(ilm->ilm_ipif != NULL); 18797 ASSERT(ilm->ilm_ill == NULL); 18798 if (ilm->ilm_ipif != ipif) 18799 continue; 18800 ipm.ipGroupMemberAddress = ilm->ilm_addr; 18801 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; 18802 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; 18803 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18804 (char *)&ipm, (int)sizeof (ipm))) { 18805 ip1dbg(("ip_snmp_get_mib2_ip_group: " 18806 "failed to allocate %u bytes\n", 18807 (uint_t)sizeof (ipm))); 18808 } 18809 } 18810 } 18811 ilm_walker_finish(&ilw); 18812 } 18813 rw_exit(&ipst->ips_ill_g_lock); 18814 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18815 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18816 (int)optp->level, (int)optp->name, (int)optp->len)); 18817 qreply(q, mpctl); 18818 return (mp2ctl); 18819 } 18820 18821 /* IPv6 multicast group membership. 
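Same idea as the IPv4 table above, except that IPv6 memberships hang off the ill rather than an ipif, so one ipv6_member_t is emitted per (ill, group) pair and filtering is done on ilm_zoneid.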
*/ 18822 static mblk_t * 18823 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18824 { 18825 struct opthdr *optp; 18826 mblk_t *mp2ctl; 18827 ill_t *ill; 18828 ilm_t *ilm; 18829 ipv6_member_t ipm6; 18830 mblk_t *mp_tail = NULL; 18831 ill_walk_context_t ctx; 18832 zoneid_t zoneid; 18833 ilm_walker_t ilw; 18834 18835 /* 18836 * make a copy of the original message 18837 */ 18838 mp2ctl = copymsg(mpctl); 18839 zoneid = Q_TO_CONN(q)->conn_zoneid; 18840 18841 /* ip6GroupMember table */ 18842 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18843 optp->level = MIB2_IP6; 18844 optp->name = EXPER_IP6_GROUP_MEMBERSHIP; 18845 18846 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18847 ill = ILL_START_WALK_V6(&ctx, ipst); 18848 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18849 if (IS_UNDER_IPMP(ill)) 18850 continue; 18851 18852 ilm = ilm_walker_start(&ilw, ill); 18853 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; 18854 for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { 18855 ASSERT(ilm->ilm_ipif == NULL); 18856 ASSERT(ilm->ilm_ill != NULL); 18857 if (ilm->ilm_zoneid != zoneid) 18858 continue; /* not this zone */ 18859 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; 18860 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; 18861 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode; 18862 if (!snmp_append_data2(mpctl->b_cont, 18863 &mp_tail, 18864 (char *)&ipm6, (int)sizeof (ipm6))) { 18865 ip1dbg(("ip_snmp_get_mib2_ip6_group: " 18866 "failed to allocate %u bytes\n", 18867 (uint_t)sizeof (ipm6))); 18868 } 18869 } 18870 ilm_walker_finish(&ilw); 18871 } 18872 rw_exit(&ipst->ips_ill_g_lock); 18873 18874 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18875 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18876 (int)optp->level, (int)optp->name, (int)optp->len)); 18877 qreply(q, mpctl); 18878 return (mp2ctl); 18879 } 18880 18881 /* IP multicast filtered sources */ 18882 static mblk_t * 18883 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18884 { 18885 struct opthdr *optp; 18886 mblk_t *mp2ctl; 18887 ill_t *ill; 18888 ipif_t *ipif; 18889 ilm_t *ilm; 18890 ip_grpsrc_t ips; 18891 mblk_t *mp_tail = NULL; 18892 ill_walk_context_t ctx; 18893 zoneid_t zoneid; 18894 int i; 18895 slist_t *sl; 18896 ilm_walker_t ilw; 18897 18898 /* 18899 * make a copy of the original message 18900 */ 18901 mp2ctl = copymsg(mpctl); 18902 zoneid = Q_TO_CONN(q)->conn_zoneid; 18903 18904 /* ipGroupSource table */ 18905 optp = (struct opthdr *)&mpctl->b_rptr[ 18906 sizeof (struct T_optmgmt_ack)]; 18907 optp->level = MIB2_IP; 18908 optp->name = EXPER_IP_GROUP_SOURCES; 18909 18910 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18911 ill = ILL_START_WALK_V4(&ctx, ipst); 18912 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18913 if (IS_UNDER_IPMP(ill)) 18914 continue; 18915 18916 ilm = ilm_walker_start(&ilw, ill); 18917 for (ipif = ill->ill_ipif; ipif != NULL; 18918 ipif = ipif->ipif_next) { 18919 if (ipif->ipif_zoneid != zoneid) 18920 continue; /* not this zone */ 18921 ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, 18922 OCTET_LENGTH); 18923 ips.ipGroupSourceIfIndex.o_length = 18924 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); 18925 for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { 18926 ASSERT(ilm->ilm_ipif != NULL); 18927 ASSERT(ilm->ilm_ill == NULL); 18928 sl = ilm->ilm_filter; 18929 if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) 18930 continue; 18931 ips.ipGroupSourceGroup = ilm->ilm_addr; 18932 for (i = 0; i < sl->sl_numsrc; i++) 
{ 18933 if (!IN6_IS_ADDR_V4MAPPED( 18934 &sl->sl_addr[i])) 18935 continue; 18936 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], 18937 ips.ipGroupSourceAddress); 18938 if (snmp_append_data2(mpctl->b_cont, 18939 &mp_tail, (char *)&ips, 18940 (int)sizeof (ips)) == 0) { 18941 ip1dbg(("ip_snmp_get_mib2_" 18942 "ip_group_src: failed to " 18943 "allocate %u bytes\n", 18944 (uint_t)sizeof (ips))); 18945 } 18946 } 18947 } 18948 } 18949 ilm_walker_finish(&ilw); 18950 } 18951 rw_exit(&ipst->ips_ill_g_lock); 18952 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18953 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18954 (int)optp->level, (int)optp->name, (int)optp->len)); 18955 qreply(q, mpctl); 18956 return (mp2ctl); 18957 } 18958 18959 /* IPv6 multicast filtered sources. */ 18960 static mblk_t * 18961 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18962 { 18963 struct opthdr *optp; 18964 mblk_t *mp2ctl; 18965 ill_t *ill; 18966 ilm_t *ilm; 18967 ipv6_grpsrc_t ips6; 18968 mblk_t *mp_tail = NULL; 18969 ill_walk_context_t ctx; 18970 zoneid_t zoneid; 18971 int i; 18972 slist_t *sl; 18973 ilm_walker_t ilw; 18974 18975 /* 18976 * make a copy of the original message 18977 */ 18978 mp2ctl = copymsg(mpctl); 18979 zoneid = Q_TO_CONN(q)->conn_zoneid; 18980 18981 /* ip6GroupSource table */ 18982 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18983 optp->level = MIB2_IP6; 18984 optp->name = EXPER_IP6_GROUP_SOURCES; 18985 18986 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18987 ill = ILL_START_WALK_V6(&ctx, ipst); 18988 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18989 if (IS_UNDER_IPMP(ill)) 18990 continue; 18991 18992 ilm = ilm_walker_start(&ilw, ill); 18993 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; 18994 for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { 18995 ASSERT(ilm->ilm_ipif == NULL); 18996 ASSERT(ilm->ilm_ill != NULL); 18997 sl = ilm->ilm_filter; 18998 if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) 18999 continue; 19000 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; 19001 for (i = 0; i < sl->sl_numsrc; i++) { 19002 ips6.ipv6GroupSourceAddress = sl->sl_addr[i]; 19003 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19004 (char *)&ips6, (int)sizeof (ips6))) { 19005 ip1dbg(("ip_snmp_get_mib2_ip6_" 19006 "group_src: failed to allocate " 19007 "%u bytes\n", 19008 (uint_t)sizeof (ips6))); 19009 } 19010 } 19011 } 19012 ilm_walker_finish(&ilw); 19013 } 19014 rw_exit(&ipst->ips_ill_g_lock); 19015 19016 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19017 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 19018 (int)optp->level, (int)optp->name, (int)optp->len)); 19019 qreply(q, mpctl); 19020 return (mp2ctl); 19021 } 19022 19023 /* Multicast routing virtual interface table.
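The encoding is delegated to ip_mroute_vif(), which appends one struct vifctl per configured VIF under EXPER_DVMRP/EXPER_DVMRP_VIF.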
*/ 19024 static mblk_t * 19025 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19026 { 19027 struct opthdr *optp; 19028 mblk_t *mp2ctl; 19029 19030 /* 19031 * make a copy of the original message 19032 */ 19033 mp2ctl = copymsg(mpctl); 19034 19035 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19036 optp->level = EXPER_DVMRP; 19037 optp->name = EXPER_DVMRP_VIF; 19038 if (!ip_mroute_vif(mpctl->b_cont, ipst)) { 19039 ip0dbg(("ip_mroute_vif: failed\n")); 19040 } 19041 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19042 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n", 19043 (int)optp->level, (int)optp->name, (int)optp->len)); 19044 qreply(q, mpctl); 19045 return (mp2ctl); 19046 } 19047 19048 /* Multicast routing table. */ 19049 static mblk_t * 19050 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19051 { 19052 struct opthdr *optp; 19053 mblk_t *mp2ctl; 19054 19055 /* 19056 * make a copy of the original message 19057 */ 19058 mp2ctl = copymsg(mpctl); 19059 19060 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19061 optp->level = EXPER_DVMRP; 19062 optp->name = EXPER_DVMRP_MRT; 19063 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) { 19064 ip0dbg(("ip_mroute_mrt: failed\n")); 19065 } 19066 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19067 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n", 19068 (int)optp->level, (int)optp->name, (int)optp->len)); 19069 qreply(q, mpctl); 19070 return (mp2ctl); 19071 } 19072 19073 /* 19074 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable 19075 * in one IRE walk. 19076 */ 19077 static mblk_t * 19078 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, 19079 ip_stack_t *ipst) 19080 { 19081 struct opthdr *optp; 19082 mblk_t *mp2ctl; /* Returned */ 19083 mblk_t *mp3ctl; /* nettomedia */ 19084 mblk_t *mp4ctl; /* routeattrs */ 19085 iproutedata_t ird; 19086 zoneid_t zoneid; 19087 19088 /* 19089 * make copies of the original message 19090 * - mp2ctl is returned unchanged to the caller for its own use 19091 * - mpctl is sent upstream as ipRouteEntryTable 19092 * - mp3ctl is sent upstream as ipNetToMediaEntryTable 19093 * - mp4ctl is sent upstream as ipRouteAttributeTable 19094 */ 19095 mp2ctl = copymsg(mpctl); 19096 mp3ctl = copymsg(mpctl); 19097 mp4ctl = copymsg(mpctl); 19098 if (mp3ctl == NULL || mp4ctl == NULL) { 19099 freemsg(mp4ctl); 19100 freemsg(mp3ctl); 19101 freemsg(mp2ctl); 19102 freemsg(mpctl); 19103 return (NULL); 19104 } 19105 19106 bzero(&ird, sizeof (ird)); 19107 19108 ird.ird_route.lp_head = mpctl->b_cont; 19109 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19110 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19111 /* 19112 * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN 19113 * value, then also include IRE_MARK_TESTHIDDEN IREs. This is 19114 * intended as a temporary solution until a proper MIB API is 19115 * provided that supports complete filtering/caller-opt-in.
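 * All of the route data is produced by the single ire_walk_v4() pass
 * below: ip_snmp_get2_v4() appends route entries and any labeled-route
 * attributes through the listptr_t heads set up above, while
 * ipNetToMediaEntryTable itself is filled in by the arp module (see
 * the NOTE above ip_snmp_get()).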
19116 */ 19117 if (level == EXPER_IP_AND_TESTHIDDEN) 19118 ird.ird_flags |= IRD_REPORT_TESTHIDDEN; 19119 19120 zoneid = Q_TO_CONN(q)->conn_zoneid; 19121 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); 19122 19123 /* ipRouteEntryTable in mpctl */ 19124 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19125 optp->level = MIB2_IP; 19126 optp->name = MIB2_IP_ROUTE; 19127 optp->len = msgdsize(ird.ird_route.lp_head); 19128 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19129 (int)optp->level, (int)optp->name, (int)optp->len)); 19130 qreply(q, mpctl); 19131 19132 /* ipNetToMediaEntryTable in mp3ctl */ 19133 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19134 optp->level = MIB2_IP; 19135 optp->name = MIB2_IP_MEDIA; 19136 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19137 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19138 (int)optp->level, (int)optp->name, (int)optp->len)); 19139 qreply(q, mp3ctl); 19140 19141 /* ipRouteAttributeTable in mp4ctl */ 19142 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19143 optp->level = MIB2_IP; 19144 optp->name = EXPER_IP_RTATTR; 19145 optp->len = msgdsize(ird.ird_attrs.lp_head); 19146 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19147 (int)optp->level, (int)optp->name, (int)optp->len)); 19148 if (optp->len == 0) 19149 freemsg(mp4ctl); 19150 else 19151 qreply(q, mp4ctl); 19152 19153 return (mp2ctl); 19154 } 19155 19156 /* 19157 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and 19158 * ipv6NetToMediaEntryTable in an NDP walk. 19159 */ 19160 static mblk_t * 19161 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, 19162 ip_stack_t *ipst) 19163 { 19164 struct opthdr *optp; 19165 mblk_t *mp2ctl; /* Returned */ 19166 mblk_t *mp3ctl; /* nettomedia */ 19167 mblk_t *mp4ctl; /* routeattrs */ 19168 iproutedata_t ird; 19169 zoneid_t zoneid; 19170 19171 /* 19172 * make copies of the original message 19173 * - mp2ctl is returned unchanged to the caller for its own use 19174 * - mpctl is sent upstream as ipv6RouteEntryTable 19175 * - mp3ctl is sent upstream as ipv6NetToMediaEntryTable 19176 * - mp4ctl is sent upstream as ipv6RouteAttributeTable 19177 */ 19178 mp2ctl = copymsg(mpctl); 19179 mp3ctl = copymsg(mpctl); 19180 mp4ctl = copymsg(mpctl); 19181 if (mp3ctl == NULL || mp4ctl == NULL) { 19182 freemsg(mp4ctl); 19183 freemsg(mp3ctl); 19184 freemsg(mp2ctl); 19185 freemsg(mpctl); 19186 return (NULL); 19187 } 19188 19189 bzero(&ird, sizeof (ird)); 19190 19191 ird.ird_route.lp_head = mpctl->b_cont; 19192 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19193 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19194 /* 19195 * If the level has been set to the special EXPER_IP_AND_TESTHIDDEN 19196 * value, then also include IRE_MARK_TESTHIDDEN IREs. This is 19197 * intended as a temporary solution until a proper MIB API is 19198 * provided that supports complete filtering/caller-opt-in.
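 * Unlike the IPv4 case, the v6 neighbor table cannot come out of the
 * IRE walk; ipv6NetToMediaEntryTable is built by the separate
 * ndp_walk() over the NCEs below.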
19199 */ 19200 if (level == EXPER_IP_AND_TESTHIDDEN) 19201 ird.ird_flags |= IRD_REPORT_TESTHIDDEN; 19202 19203 zoneid = Q_TO_CONN(q)->conn_zoneid; 19204 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); 19205 19206 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19207 optp->level = MIB2_IP6; 19208 optp->name = MIB2_IP6_ROUTE; 19209 optp->len = msgdsize(ird.ird_route.lp_head); 19210 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19211 (int)optp->level, (int)optp->name, (int)optp->len)); 19212 qreply(q, mpctl); 19213 19214 /* ipv6NetToMediaEntryTable in mp3ctl */ 19215 ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); 19216 19217 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19218 optp->level = MIB2_IP6; 19219 optp->name = MIB2_IP6_MEDIA; 19220 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19221 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19222 (int)optp->level, (int)optp->name, (int)optp->len)); 19223 qreply(q, mp3ctl); 19224 19225 /* ipv6RouteAttributeTable in mp4ctl */ 19226 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19227 optp->level = MIB2_IP6; 19228 optp->name = EXPER_IP_RTATTR; 19229 optp->len = msgdsize(ird.ird_attrs.lp_head); 19230 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19231 (int)optp->level, (int)optp->name, (int)optp->len)); 19232 if (optp->len == 0) 19233 freemsg(mp4ctl); 19234 else 19235 qreply(q, mp4ctl); 19236 19237 return (mp2ctl); 19238 } 19239 19240 /* 19241 * IPv6 mib: One per ill 19242 */ 19243 static mblk_t * 19244 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19245 { 19246 struct opthdr *optp; 19247 mblk_t *mp2ctl; 19248 ill_t *ill; 19249 ill_walk_context_t ctx; 19250 mblk_t *mp_tail = NULL; 19251 19252 /* 19253 * Make a copy of the original message 19254 */ 19255 mp2ctl = copymsg(mpctl); 19256 19257 /* fixed length IPv6 structure ... */ 19258 19259 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19260 optp->level = MIB2_IP6; 19261 optp->name = 0; 19262 /* Include "unknown interface" ip6_mib */ 19263 ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 19264 ipst->ips_ip6_mib.ipIfStatsIfIndex = 19265 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 19266 SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding, 19267 ipst->ips_ipv6_forward ? 
1 : 2); 19268 SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit, 19269 ipst->ips_ipv6_def_hops); 19270 SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize, 19271 sizeof (mib2_ipIfStatsEntry_t)); 19272 SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize, 19273 sizeof (mib2_ipv6AddrEntry_t)); 19274 SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize, 19275 sizeof (mib2_ipv6RouteEntry_t)); 19276 SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize, 19277 sizeof (mib2_ipv6NetToMediaEntry_t)); 19278 SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize, 19279 sizeof (ipv6_member_t)); 19280 SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize, 19281 sizeof (ipv6_grpsrc_t)); 19282 19283 /* 19284 * Synchronize 64- and 32-bit counters 19285 */ 19286 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives, 19287 ipIfStatsHCInReceives); 19288 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers, 19289 ipIfStatsHCInDelivers); 19290 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests, 19291 ipIfStatsHCOutRequests); 19292 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams, 19293 ipIfStatsHCOutForwDatagrams); 19294 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts, 19295 ipIfStatsHCOutMcastPkts); 19296 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts, 19297 ipIfStatsHCInMcastPkts); 19298 19299 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19300 (char *)&ipst->ips_ip6_mib, (int)sizeof (ipst->ips_ip6_mib))) { 19301 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n", 19302 (uint_t)sizeof (ipst->ips_ip6_mib))); 19303 } 19304 19305 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19306 ill = ILL_START_WALK_V6(&ctx, ipst); 19307 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19308 ill->ill_ip_mib->ipIfStatsIfIndex = 19309 ill->ill_phyint->phyint_ifindex; 19310 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 19311 ipst->ips_ipv6_forward ? 1 : 2); 19312 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit, 19313 ill->ill_max_hops); 19314 19315 /* 19316 * Synchronize 64- and 32-bit counters 19317 */ 19318 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives, 19319 ipIfStatsHCInReceives); 19320 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers, 19321 ipIfStatsHCInDelivers); 19322 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests, 19323 ipIfStatsHCOutRequests); 19324 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams, 19325 ipIfStatsHCOutForwDatagrams); 19326 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts, 19327 ipIfStatsHCOutMcastPkts); 19328 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts, 19329 ipIfStatsHCInMcastPkts); 19330 19331 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19332 (char *)ill->ill_ip_mib, 19333 (int)sizeof (*ill->ill_ip_mib))) { 19334 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate " 19335 "%u bytes\n", (uint_t)sizeof (*ill->ill_ip_mib))); 19336 } 19337 } 19338 rw_exit(&ipst->ips_ill_g_lock); 19339 19340 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19341 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n", 19342 (int)optp->level, (int)optp->name, (int)optp->len)); 19343 qreply(q, mpctl); 19344 return (mp2ctl); 19345 } 19346 19347 /* 19348 * ICMPv6 mib: One per ill 19349 */ 19350 static mblk_t * 19351 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19352 { 19353 struct opthdr *optp; 19354 mblk_t *mp2ctl; 19355 ill_t *ill; 19356 ill_walk_context_t ctx; 19357 mblk_t *mp_tail = NULL; 19358 /* 19359 * Make a copy of the original message 19360 */ 19361 mp2ctl = copymsg(mpctl); 19362 19363 /* fixed length ICMPv6 structure ... 
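one mib2_ipv6IfIcmpEntry_t per ill plus the "unknown interface" entry; unlike the IP MIB above there are no 64-bit ICMPv6 counters that need SYNC32_MIB treatment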
*/ 19364 19365 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19366 optp->level = MIB2_ICMP6; 19367 optp->name = 0; 19368 /* Include "unknown interface" icmp6_mib */ 19369 ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex = 19370 MIB2_UNKNOWN_INTERFACE; /* netstat flag */ 19371 ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize = 19372 sizeof (mib2_ipv6IfIcmpEntry_t); 19373 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19374 (char *)&ipst->ips_icmp6_mib, 19375 (int)sizeof (ipst->ips_icmp6_mib))) { 19376 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n", 19377 (uint_t)sizeof (ipst->ips_icmp6_mib))); 19378 } 19379 19380 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19381 ill = ILL_START_WALK_V6(&ctx, ipst); 19382 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19383 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 19384 ill->ill_phyint->phyint_ifindex; 19385 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19386 (char *)ill->ill_icmp6_mib, 19387 (int)sizeof (*ill->ill_icmp6_mib))) { 19388 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate " 19389 "%u bytes\n", 19390 (uint_t)sizeof (*ill->ill_icmp6_mib))); 19391 } 19392 } 19393 rw_exit(&ipst->ips_ill_g_lock); 19394 19395 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19396 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n", 19397 (int)optp->level, (int)optp->name, (int)optp->len)); 19398 qreply(q, mpctl); 19399 return (mp2ctl); 19400 } 19401 19402 /* 19403 * ire_walk routine to create both ipRouteEntryTable and 19404 * ipRouteAttributeTable in one IRE walk 19405 */ 19406 static void 19407 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) 19408 { 19409 ill_t *ill; 19410 ipif_t *ipif; 19411 mib2_ipRouteEntry_t *re; 19412 mib2_ipAttributeEntry_t *iae, *iaeptr; 19413 ipaddr_t gw_addr; 19414 tsol_ire_gw_secattr_t *attrp; 19415 tsol_gc_t *gc = NULL; 19416 tsol_gcgrp_t *gcgrp = NULL; 19417 uint_t sacnt = 0; 19418 int i; 19419 19420 ASSERT(ire->ire_ipversion == IPV4_VERSION); 19421 19422 if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && 19423 ire->ire_marks & IRE_MARK_TESTHIDDEN) { 19424 return; 19425 } 19426 19427 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19428 return; 19429 19430 if ((attrp = ire->ire_gw_secattr) != NULL) { 19431 mutex_enter(&attrp->igsa_lock); 19432 if ((gc = attrp->igsa_gc) != NULL) { 19433 gcgrp = gc->gc_grp; 19434 ASSERT(gcgrp != NULL); 19435 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19436 sacnt = 1; 19437 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19438 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19439 gc = gcgrp->gcgrp_head; 19440 sacnt = gcgrp->gcgrp_count; 19441 } 19442 mutex_exit(&attrp->igsa_lock); 19443 19444 /* do nothing if there's no gc to report */ 19445 if (gc == NULL) { 19446 ASSERT(sacnt == 0); 19447 if (gcgrp != NULL) { 19448 /* we might as well drop the lock now */ 19449 rw_exit(&gcgrp->gcgrp_rwlock); 19450 gcgrp = NULL; 19451 } 19452 attrp = NULL; 19453 } 19454 19455 ASSERT(gc == NULL || (gcgrp != NULL && 19456 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19457 } 19458 ASSERT(sacnt == 0 || gc != NULL); 19459 19460 if (sacnt != 0 && 19461 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19462 kmem_free(re, sizeof (*re)); 19463 rw_exit(&gcgrp->gcgrp_rwlock); 19464 return; 19465 } 19466 19467 /* 19468 * Return all IRE types for route table... 
let caller pick and choose 19469 */ 19470 re->ipRouteDest = ire->ire_addr; 19471 ipif = ire->ire_ipif; 19472 re->ipRouteIfIndex.o_length = 0; 19473 if (ire->ire_type == IRE_CACHE) { 19474 ill = (ill_t *)ire->ire_stq->q_ptr; 19475 re->ipRouteIfIndex.o_length = 19476 ill->ill_name_length == 0 ? 0 : 19477 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19478 bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes, 19479 re->ipRouteIfIndex.o_length); 19480 } else if (ipif != NULL) { 19481 ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); 19482 re->ipRouteIfIndex.o_length = 19483 mi_strlen(re->ipRouteIfIndex.o_bytes); 19484 } 19485 re->ipRouteMetric1 = -1; 19486 re->ipRouteMetric2 = -1; 19487 re->ipRouteMetric3 = -1; 19488 re->ipRouteMetric4 = -1; 19489 19490 gw_addr = ire->ire_gateway_addr; 19491 19492 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) 19493 re->ipRouteNextHop = ire->ire_src_addr; 19494 else 19495 re->ipRouteNextHop = gw_addr; 19496 /* indirect(4), direct(3), or invalid(2) */ 19497 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19498 re->ipRouteType = 2; 19499 else 19500 re->ipRouteType = (gw_addr != 0) ? 4 : 3; 19501 re->ipRouteProto = -1; 19502 re->ipRouteAge = gethrestime_sec() - ire->ire_create_time; 19503 re->ipRouteMask = ire->ire_mask; 19504 re->ipRouteMetric5 = -1; 19505 re->ipRouteInfo.re_max_frag = ire->ire_max_frag; 19506 re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag; 19507 re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19508 re->ipRouteInfo.re_ref = ire->ire_refcnt; 19509 re->ipRouteInfo.re_src_addr = ire->ire_src_addr; 19510 re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19511 re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19512 re->ipRouteInfo.re_flags = ire->ire_flags; 19513 19514 if (ire->ire_flags & RTF_DYNAMIC) { 19515 re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19516 } else { 19517 re->ipRouteInfo.re_ire_type = ire->ire_type; 19518 } 19519 19520 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19521 (char *)re, (int)sizeof (*re))) { 19522 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 19523 (uint_t)sizeof (*re))); 19524 } 19525 19526 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19527 iaeptr->iae_routeidx = ird->ird_idx; 19528 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19529 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19530 } 19531 19532 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19533 (char *)iae, sacnt * sizeof (*iae))) { 19534 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 19535 (unsigned)(sacnt * sizeof (*iae)))); 19536 } 19537 19538 /* bump route index for next pass */ 19539 ird->ird_idx++; 19540 19541 kmem_free(re, sizeof (*re)); 19542 if (sacnt != 0) 19543 kmem_free(iae, sacnt * sizeof (*iae)); 19544 19545 if (gcgrp != NULL) 19546 rw_exit(&gcgrp->gcgrp_rwlock); 19547 } 19548 19549 /* 19550 * ire_walk routine to create ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk.
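 * The structure mirrors ip_snmp_get2_v4() above, with the v6-specific
 * differences (a prefix length instead of a mask, and ipv6RouteIndex
 * to disambiguate multiple routes to the same dest/plen).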
19551 */ 19552 static void 19553 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) 19554 { 19555 ill_t *ill; 19556 ipif_t *ipif; 19557 mib2_ipv6RouteEntry_t *re; 19558 mib2_ipAttributeEntry_t *iae, *iaeptr; 19559 in6_addr_t gw_addr_v6; 19560 tsol_ire_gw_secattr_t *attrp; 19561 tsol_gc_t *gc = NULL; 19562 tsol_gcgrp_t *gcgrp = NULL; 19563 uint_t sacnt = 0; 19564 int i; 19565 19566 ASSERT(ire->ire_ipversion == IPV6_VERSION); 19567 19568 if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && 19569 ire->ire_marks & IRE_MARK_TESTHIDDEN) { 19570 return; 19571 } 19572 19573 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19574 return; 19575 19576 if ((attrp = ire->ire_gw_secattr) != NULL) { 19577 mutex_enter(&attrp->igsa_lock); 19578 if ((gc = attrp->igsa_gc) != NULL) { 19579 gcgrp = gc->gc_grp; 19580 ASSERT(gcgrp != NULL); 19581 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19582 sacnt = 1; 19583 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19584 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19585 gc = gcgrp->gcgrp_head; 19586 sacnt = gcgrp->gcgrp_count; 19587 } 19588 mutex_exit(&attrp->igsa_lock); 19589 19590 /* do nothing if there's no gc to report */ 19591 if (gc == NULL) { 19592 ASSERT(sacnt == 0); 19593 if (gcgrp != NULL) { 19594 /* we might as well drop the lock now */ 19595 rw_exit(&gcgrp->gcgrp_rwlock); 19596 gcgrp = NULL; 19597 } 19598 attrp = NULL; 19599 } 19600 19601 ASSERT(gc == NULL || (gcgrp != NULL && 19602 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19603 } 19604 ASSERT(sacnt == 0 || gc != NULL); 19605 19606 if (sacnt != 0 && 19607 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19608 kmem_free(re, sizeof (*re)); 19609 rw_exit(&gcgrp->gcgrp_rwlock); 19610 return; 19611 } 19612 19613 /* 19614 * Return all IRE types for route table... let caller pick and choose 19615 */ 19616 re->ipv6RouteDest = ire->ire_addr_v6; 19617 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); 19618 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ 19619 re->ipv6RouteIfIndex.o_length = 0; 19620 ipif = ire->ire_ipif; 19621 if (ire->ire_type == IRE_CACHE) { 19622 ill = (ill_t *)ire->ire_stq->q_ptr; 19623 re->ipv6RouteIfIndex.o_length = 19624 ill->ill_name_length == 0 ? 
0 : 19625 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19626 bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, 19627 re->ipv6RouteIfIndex.o_length); 19628 } else if (ipif != NULL) { 19629 ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); 19630 re->ipv6RouteIfIndex.o_length = 19631 mi_strlen(re->ipv6RouteIfIndex.o_bytes); 19632 } 19633 19634 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19635 19636 mutex_enter(&ire->ire_lock); 19637 gw_addr_v6 = ire->ire_gateway_addr_v6; 19638 mutex_exit(&ire->ire_lock); 19639 19640 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) 19641 re->ipv6RouteNextHop = ire->ire_src_addr_v6; 19642 else 19643 re->ipv6RouteNextHop = gw_addr_v6; 19644 19645 /* remote(4), local(3), or discard(2) */ 19646 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19647 re->ipv6RouteType = 2; 19648 else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) 19649 re->ipv6RouteType = 3; 19650 else 19651 re->ipv6RouteType = 4; 19652 19653 re->ipv6RouteProtocol = -1; 19654 re->ipv6RoutePolicy = 0; 19655 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time; 19656 re->ipv6RouteNextHopRDI = 0; 19657 re->ipv6RouteWeight = 0; 19658 re->ipv6RouteMetric = 0; 19659 re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; 19660 re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; 19661 re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19662 re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; 19663 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19664 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19665 re->ipv6RouteInfo.re_ref = ire->ire_refcnt; 19666 re->ipv6RouteInfo.re_flags = ire->ire_flags; 19667 19668 if (ire->ire_flags & RTF_DYNAMIC) { 19669 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19670 } else { 19671 re->ipv6RouteInfo.re_ire_type = ire->ire_type; 19672 } 19673 19674 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19675 (char *)re, (int)sizeof (*re))) { 19676 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19677 (uint_t)sizeof (*re))); 19678 } 19679 19680 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19681 iaeptr->iae_routeidx = ird->ird_idx; 19682 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19683 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19684 } 19685 19686 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19687 (char *)iae, sacnt * sizeof (*iae))) { 19688 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19689 (unsigned)(sacnt * sizeof (*iae)))); 19690 } 19691 19692 /* bump route index for next pass */ 19693 ird->ird_idx++; 19694 19695 kmem_free(re, sizeof (*re)); 19696 if (sacnt != 0) 19697 kmem_free(iae, sacnt * sizeof (*iae)); 19698 19699 if (gcgrp != NULL) 19700 rw_exit(&gcgrp->gcgrp_rwlock); 19701 } 19702 19703 /* 19704 * ndp_walk routine to create ipv6NetToMediaEntryTable 19705 */ 19706 static int 19707 ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) 19708 { 19709 ill_t *ill; 19710 mib2_ipv6NetToMediaEntry_t ntme; 19711 dl_unitdata_req_t *dl; 19712 19713 ill = nce->nce_ill; 19714 if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ 19715 return (0); 19716 19717 /* 19718 * Neighbor cache entry attached to IRE with on-link 19719 * destination. 
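 * For ills using an external resolver (ILLF_XRESOLV) the link-layer
 * address length is taken from the resolver's dl_unitdata_req;
 * otherwise the ill's physical address length is used.  Entries with
 * no nce_res_mp are reported with a zeroed physical address.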
19720 */ 19721 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; 19722 ntme.ipv6NetToMediaNetAddress = nce->nce_addr; 19723 if ((ill->ill_flags & ILLF_XRESOLV) && 19724 (nce->nce_res_mp != NULL)) { 19725 dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); 19726 ntme.ipv6NetToMediaPhysAddress.o_length = 19727 dl->dl_dest_addr_length; 19728 } else { 19729 ntme.ipv6NetToMediaPhysAddress.o_length = 19730 ill->ill_phys_addr_length; 19731 } 19732 if (nce->nce_res_mp != NULL) { 19733 bcopy((char *)nce->nce_res_mp->b_rptr + 19734 NCE_LL_ADDR_OFFSET(ill), 19735 ntme.ipv6NetToMediaPhysAddress.o_bytes, 19736 ntme.ipv6NetToMediaPhysAddress.o_length); 19737 } else { 19738 bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, 19739 ill->ill_phys_addr_length); 19740 } 19741 /* 19742 * Note: Returns ND_* states. Should be: 19743 * reachable(1), stale(2), delay(3), probe(4), 19744 * invalid(5), unknown(6) 19745 */ 19746 ntme.ipv6NetToMediaState = nce->nce_state; 19747 ntme.ipv6NetToMediaLastUpdated = 0; 19748 19749 /* other(1), dynamic(2), static(3), local(4) */ 19750 if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { 19751 ntme.ipv6NetToMediaType = 4; 19752 } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { 19753 ntme.ipv6NetToMediaType = 1; 19754 } else { 19755 ntme.ipv6NetToMediaType = 2; 19756 } 19757 19758 if (!snmp_append_data2(ird->ird_netmedia.lp_head, 19759 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { 19760 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n", 19761 (uint_t)sizeof (ntme))); 19762 } 19763 return (0); 19764 } 19765 19766 /* 19767 * Return 0 for an invalid set request, 1 otherwise (including non-TCP requests). 19768 */ 19769 /* ARGSUSED */ 19770 int 19771 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 19772 { 19773 switch (level) { 19774 case MIB2_IP: 19775 case MIB2_ICMP: 19776 switch (name) { 19777 default: 19778 break; 19779 } 19780 return (1); 19781 default: 19782 return (1); 19783 } 19784 } 19785 19786 /* 19787 * When there exists both a 64- and 32-bit counter of a particular type 19788 * (e.g., InReceives), only the 64-bit counters are added.
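 * The 32-bit legacy view of such a counter is not summed here; it is
 * regenerated from the 64-bit value via SYNC32_MIB at report time
 * (see ip_snmp_get_mib2_ip6() above).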
19789 */ 19790 void 19791 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2) 19792 { 19793 UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors); 19794 UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors); 19795 UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes); 19796 UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors); 19797 UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos); 19798 UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts); 19799 UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards); 19800 UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards); 19801 UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs); 19802 UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails); 19803 UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates); 19804 UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds); 19805 UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs); 19806 UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails); 19807 UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes); 19808 UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates); 19809 UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups); 19810 UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits); 19811 UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs); 19812 UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows); 19813 UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows); 19814 UPDATE_MIB(o1, ipIfStatsInWrongIPVersion, 19815 o2->ipIfStatsInWrongIPVersion); 19816 UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion, 19817 o2->ipIfStatsOutWrongIPVersion); 19818 UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion, 19819 o2->ipIfStatsOutSwitchIPVersion); 19820 UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives); 19821 UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets); 19822 UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams, 19823 o2->ipIfStatsHCInForwDatagrams); 19824 UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers); 19825 UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests); 19826 UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams, 19827 o2->ipIfStatsHCOutForwDatagrams); 19828 UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds); 19829 UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits); 19830 UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets); 19831 UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts); 19832 UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets); 19833 UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts); 19834 UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets, 19835 o2->ipIfStatsHCOutMcastOctets); 19836 UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts); 19837 UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts); 19838 UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded); 19839 UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed); 19840 UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs); 19841 UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs); 19842 UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts); 19843 } 19844 19845 void 19846 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2) 19847 { 19848 UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs); 19849 UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors); 19850 UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs,
o2->ipv6IfIcmpInDestUnreachs); 19851 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs); 19852 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds); 19853 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems); 19854 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs); 19855 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos); 19856 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies); 19857 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits, 19858 o2->ipv6IfIcmpInRouterSolicits); 19859 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements, 19860 o2->ipv6IfIcmpInRouterAdvertisements); 19861 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits, 19862 o2->ipv6IfIcmpInNeighborSolicits); 19863 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements, 19864 o2->ipv6IfIcmpInNeighborAdvertisements); 19865 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects); 19866 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries, 19867 o2->ipv6IfIcmpInGroupMembQueries); 19868 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses, 19869 o2->ipv6IfIcmpInGroupMembResponses); 19870 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions, 19871 o2->ipv6IfIcmpInGroupMembReductions); 19872 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs); 19873 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors); 19874 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs, 19875 o2->ipv6IfIcmpOutDestUnreachs); 19876 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs, 19877 o2->ipv6IfIcmpOutAdminProhibs); 19878 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds); 19879 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems, 19880 o2->ipv6IfIcmpOutParmProblems); 19881 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs); 19882 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos); 19883 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies); 19884 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits, 19885 o2->ipv6IfIcmpOutRouterSolicits); 19886 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements, 19887 o2->ipv6IfIcmpOutRouterAdvertisements); 19888 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits, 19889 o2->ipv6IfIcmpOutNeighborSolicits); 19890 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements, 19891 o2->ipv6IfIcmpOutNeighborAdvertisements); 19892 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects); 19893 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries, 19894 o2->ipv6IfIcmpOutGroupMembQueries); 19895 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses, 19896 o2->ipv6IfIcmpOutGroupMembResponses); 19897 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions, 19898 o2->ipv6IfIcmpOutGroupMembReductions); 19899 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows); 19900 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit); 19901 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements, 19902 o2->ipv6IfIcmpInBadNeighborAdvertisements); 19903 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations, 19904 o2->ipv6IfIcmpInBadNeighborSolicitations); 19905 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects); 19906 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal, 19907 o2->ipv6IfIcmpInGroupMembTotal); 19908 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries, 19909 o2->ipv6IfIcmpInGroupMembBadQueries); 19910 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports, 19911 o2->ipv6IfIcmpInGroupMembBadReports); 19912 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports, 19913 o2->ipv6IfIcmpInGroupMembOurReports); 19914 } 19915 19916 /* 19917 * Called before the options are updated to 
check if this packet will 19918 * be source routed from here. 19919 * This routine assumes that the options are well formed i.e. that they 19920 * have already been checked. 19921 */ 19922 static boolean_t 19923 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) 19924 { 19925 ipoptp_t opts; 19926 uchar_t *opt; 19927 uint8_t optval; 19928 uint8_t optlen; 19929 ipaddr_t dst; 19930 ire_t *ire; 19931 19932 if (IS_SIMPLE_IPH(ipha)) { 19933 ip2dbg(("not source routed\n")); 19934 return (B_FALSE); 19935 } 19936 dst = ipha->ipha_dst; 19937 for (optval = ipoptp_first(&opts, ipha); 19938 optval != IPOPT_EOL; 19939 optval = ipoptp_next(&opts)) { 19940 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 19941 opt = opts.ipoptp_cur; 19942 optlen = opts.ipoptp_len; 19943 ip2dbg(("ip_source_routed: opt %d, len %d\n", 19944 optval, optlen)); 19945 switch (optval) { 19946 uint32_t off; 19947 case IPOPT_SSRR: 19948 case IPOPT_LSRR: 19949 /* 19950 * If dst is one of our addresses and there are some 19951 * entries left in the source route return (true). 19952 */ 19953 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 19954 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19955 if (ire == NULL) { 19956 ip2dbg(("ip_source_routed: not next" 19957 " source route 0x%x\n", 19958 ntohl(dst))); 19959 return (B_FALSE); 19960 } 19961 ire_refrele(ire); 19962 off = opt[IPOPT_OFFSET]; 19963 off--; 19964 if (optlen < IP_ADDR_LEN || 19965 off > optlen - IP_ADDR_LEN) { 19966 /* End of source route */ 19967 ip1dbg(("ip_source_routed: end of SR\n")); 19968 return (B_FALSE); 19969 } 19970 return (B_TRUE); 19971 } 19972 } 19973 ip2dbg(("not source routed\n")); 19974 return (B_FALSE); 19975 } 19976 19977 /* 19978 * Check if the packet contains any source route. 19979 */ 19980 static boolean_t 19981 ip_source_route_included(ipha_t *ipha) 19982 { 19983 ipoptp_t opts; 19984 uint8_t optval; 19985 19986 if (IS_SIMPLE_IPH(ipha)) 19987 return (B_FALSE); 19988 for (optval = ipoptp_first(&opts, ipha); 19989 optval != IPOPT_EOL; 19990 optval = ipoptp_next(&opts)) { 19991 switch (optval) { 19992 case IPOPT_SSRR: 19993 case IPOPT_LSRR: 19994 return (B_TRUE); 19995 } 19996 } 19997 return (B_FALSE); 19998 } 19999 20000 /* 20001 * Called when the IRE expiration timer fires. 20002 */ 20003 void 20004 ip_trash_timer_expire(void *args) 20005 { 20006 int flush_flag = 0; 20007 ire_expire_arg_t iea; 20008 ip_stack_t *ipst = (ip_stack_t *)args; 20009 20010 iea.iea_ipst = ipst; /* No netstack_hold */ 20011 20012 /* 20013 * ip_ire_expire_id is protected by ip_trash_timer_lock. 20014 * This lock makes sure that a new invocation of this function 20015 * that occurs due to an almost immediate timer firing will not 20016 * progress beyond this point until the current invocation is done 20017 */ 20018 mutex_enter(&ipst->ips_ip_trash_timer_lock); 20019 ipst->ips_ip_ire_expire_id = 0; 20020 mutex_exit(&ipst->ips_ip_trash_timer_lock); 20021 20022 /* Periodic timer */ 20023 if (ipst->ips_ip_ire_arp_time_elapsed >= 20024 ipst->ips_ip_ire_arp_interval) { 20025 /* 20026 * Remove all IRE_CACHE entries since they might 20027 * contain arp information. 
20028 */ 20029 flush_flag |= FLUSH_ARP_TIME; 20030 ipst->ips_ip_ire_arp_time_elapsed = 0; 20031 IP_STAT(ipst, ip_ire_arp_timer_expired); 20032 } 20033 if (ipst->ips_ip_ire_rd_time_elapsed >= 20034 ipst->ips_ip_ire_redir_interval) { 20035 /* Remove all redirects */ 20036 flush_flag |= FLUSH_REDIRECT_TIME; 20037 ipst->ips_ip_ire_rd_time_elapsed = 0; 20038 IP_STAT(ipst, ip_ire_redirect_timer_expired); 20039 } 20040 if (ipst->ips_ip_ire_pmtu_time_elapsed >= 20041 ipst->ips_ip_ire_pathmtu_interval) { 20042 /* Increase path mtu */ 20043 flush_flag |= FLUSH_MTU_TIME; 20044 ipst->ips_ip_ire_pmtu_time_elapsed = 0; 20045 IP_STAT(ipst, ip_ire_pmtu_timer_expired); 20046 } 20047 20048 /* 20049 * Optimize for the case when there are no redirects in the 20050 * ftable, that is, no need to walk the ftable in that case. 20051 */ 20052 if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { 20053 iea.iea_flush_flag = flush_flag; 20054 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, 20055 (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, 20056 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 20057 NULL, ALL_ZONES, ipst); 20058 } 20059 if ((flush_flag & FLUSH_REDIRECT_TIME) && 20060 ipst->ips_ip_redirect_cnt > 0) { 20061 iea.iea_flush_flag = flush_flag; 20062 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, 20063 ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 20064 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); 20065 } 20066 if (flush_flag & FLUSH_MTU_TIME) { 20067 /* 20068 * Walk all IPv6 IRE's and update them 20069 * Note that ARP and redirect timers are not 20070 * needed since NUD handles stale entries. 20071 */ 20072 flush_flag = FLUSH_MTU_TIME; 20073 iea.iea_flush_flag = flush_flag; 20074 ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, 20075 ALL_ZONES, ipst); 20076 } 20077 20078 ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; 20079 ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; 20080 ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; 20081 20082 /* 20083 * Hold the lock to serialize timeout calls and prevent 20084 * stale values in ip_ire_expire_id. Otherwise it is possible 20085 * for the timer to fire and a new invocation of this function 20086 * to start before the return value of timeout has been stored 20087 * in ip_ire_expire_id by the current invocation. 20088 */ 20089 mutex_enter(&ipst->ips_ip_trash_timer_lock); 20090 ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, 20091 (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 20092 mutex_exit(&ipst->ips_ip_trash_timer_lock); 20093 } 20094 20095 /* 20096 * Called by the memory allocator subsystem directly, when the system 20097 * is running low on memory. 20098 */ 20099 /* ARGSUSED */ 20100 void 20101 ip_trash_ire_reclaim(void *args) 20102 { 20103 netstack_handle_t nh; 20104 netstack_t *ns; 20105 20106 netstack_next_init(&nh); 20107 while ((ns = netstack_next(&nh)) != NULL) { 20108 ip_trash_ire_reclaim_stack(ns->netstack_ip); 20109 netstack_rele(ns); 20110 } 20111 netstack_next_fini(&nh); 20112 } 20113 20114 static void 20115 ip_trash_ire_reclaim_stack(ip_stack_t *ipst) 20116 { 20117 ire_cache_count_t icc; 20118 ire_cache_reclaim_t icr; 20119 ncc_cache_count_t ncc; 20120 nce_cache_reclaim_t ncr; 20121 uint_t delete_cnt; 20122 /* 20123 * Memory reclaim call back. 20124 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. 
20125 * Then, with a target of freeing 1/Nth of IRE_CACHE 20126 * entries, determine what fraction to free for 20127 * each category of IRE_CACHE entries giving absolute priority 20128 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu 20129 * entry will be freed unless all offlink entries are freed). 20130 */ 20131 icc.icc_total = 0; 20132 icc.icc_unused = 0; 20133 icc.icc_offlink = 0; 20134 icc.icc_pmtu = 0; 20135 icc.icc_onlink = 0; 20136 ire_walk(ire_cache_count, (char *)&icc, ipst); 20137 20138 /* 20139 * Free NCEs for IPv6 like the onlink ires. 20140 */ 20141 ncc.ncc_total = 0; 20142 ncc.ncc_host = 0; 20143 ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); 20144 20145 ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + 20146 icc.icc_pmtu + icc.icc_onlink); 20147 delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; 20148 IP_STAT(ipst, ip_trash_ire_reclaim_calls); 20149 if (delete_cnt == 0) 20150 return; 20151 IP_STAT(ipst, ip_trash_ire_reclaim_success); 20152 /* Always delete all unused offlink entries */ 20153 icr.icr_ipst = ipst; 20154 icr.icr_unused = 1; 20155 if (delete_cnt <= icc.icc_unused) { 20156 /* 20157 * Only need to free unused entries. In other words, 20158 * there are enough unused entries to free to meet our 20159 * target number of freed ire cache entries. 20160 */ 20161 icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; 20162 ncr.ncr_host = 0; 20163 } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { 20164 /* 20165 * Only need to free unused entries, plus a fraction of offlink 20166 * entries. It follows from the first if statement that 20167 * icc_offlink is non-zero, and that delete_cnt != icc_unused. 20168 */ 20169 delete_cnt -= icc.icc_unused; 20170 /* Round up # deleted by truncating fraction */ 20171 icr.icr_offlink = icc.icc_offlink / delete_cnt; 20172 icr.icr_pmtu = icr.icr_onlink = 0; 20173 ncr.ncr_host = 0; 20174 } else if (delete_cnt <= 20175 icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { 20176 /* 20177 * Free all unused and offlink entries, plus a fraction of 20178 * pmtu entries. It follows from the previous if statement 20179 * that icc_pmtu is non-zero, and that 20180 * delete_cnt != icc_unused + icc_offlink. 20181 */ 20182 icr.icr_offlink = 1; 20183 delete_cnt -= icc.icc_unused + icc.icc_offlink; 20184 /* Round up # deleted by truncating fraction */ 20185 icr.icr_pmtu = icc.icc_pmtu / delete_cnt; 20186 icr.icr_onlink = 0; 20187 ncr.ncr_host = 0; 20188 } else { 20189 /* 20190 * Free all unused, offlink, and pmtu entries, plus a fraction 20191 * of onlink entries. If we're here, then we know that 20192 * icc_onlink is non-zero, and that 20193 * delete_cnt != icc_unused + icc_offlink + icc_pmtu. 
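		 *
		 * An illustrative example (hypothetical counts): with
		 * ips_ip_ire_reclaim_fraction == 3 and counts of
		 * total/unused/offlink/pmtu/onlink = 90/20/30/30/10, the
		 * target is delete_cnt = 90 / 3 = 30. All 20 unused entries
		 * are freed, leaving 10 to come from the 30 offlink entries,
		 * so the second branch above runs:
		 *
		 *	delete_cnt -= icc.icc_unused;			10
		 *	icr.icr_offlink = icc.icc_offlink / delete_cnt;	 3
		 *
		 * i.e. the reclaim walk then frees roughly one in every 3
		 * offlink entries and leaves pmtu and onlink entries alone.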
20194 */ 20195 icr.icr_offlink = icr.icr_pmtu = 1; 20196 delete_cnt -= icc.icc_unused + icc.icc_offlink + 20197 icc.icc_pmtu; 20198 /* Round up # deleted by truncating fraction */ 20199 icr.icr_onlink = icc.icc_onlink / delete_cnt; 20200 /* Using the same delete fraction as for onlink IREs */ 20201 ncr.ncr_host = ncc.ncc_host / delete_cnt; 20202 } 20203 #ifdef DEBUG 20204 ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " 20205 "fractions %d/%d/%d/%d\n", 20206 icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, 20207 icc.icc_unused, icc.icc_offlink, 20208 icc.icc_pmtu, icc.icc_onlink, 20209 icr.icr_unused, icr.icr_offlink, 20210 icr.icr_pmtu, icr.icr_onlink)); 20211 #endif 20212 ire_walk(ire_cache_reclaim, (char *)&icr, ipst); 20213 if (ncr.ncr_host != 0) 20214 ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, 20215 (uchar_t *)&ncr, ipst); 20216 #ifdef DEBUG 20217 icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; 20218 icc.icc_pmtu = 0; icc.icc_onlink = 0; 20219 ire_walk(ire_cache_count, (char *)&icc, ipst); 20220 ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", 20221 icc.icc_total, icc.icc_unused, icc.icc_offlink, 20222 icc.icc_pmtu, icc.icc_onlink)); 20223 #endif 20224 } 20225 20226 /* 20227 * ip_unbind is called when a copy of an unbind request is received from the 20228 * upper level protocol. We remove this conn from any fanout hash list it is 20229 * on, and zero out the bind information. No reply is expected up above. 20230 */ 20231 void 20232 ip_unbind(conn_t *connp) 20233 { 20234 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 20235 20236 if (is_system_labeled() && connp->conn_anon_port) { 20237 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 20238 connp->conn_mlp_type, connp->conn_ulp, 20239 ntohs(connp->conn_lport), B_FALSE); 20240 connp->conn_anon_port = 0; 20241 } 20242 connp->conn_mlp_type = mlptSingle; 20243 20244 ipcl_hash_remove(connp); 20245 } 20246 20247 /* 20248 * Write side put procedure. Outbound data, IOCTLs, responses from 20249 * resolvers, etc, come down through here. 20250 * 20251 * arg2 is always a queue_t *. 20252 * When that queue is an ill_t (i.e. q_next != NULL), then arg must be 20253 * the zoneid. 20254 * When that queue is not an ill_t, then arg must be a conn_t pointer. 20255 */ 20256 void 20257 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 20258 { 20259 ip_output_options(arg, mp, arg2, caller, &zero_info); 20260 } 20261 20262 void 20263 ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, 20264 ip_opt_info_t *infop) 20265 { 20266 conn_t *connp = NULL; 20267 queue_t *q = (queue_t *)arg2; 20268 ipha_t *ipha; 20269 #define rptr ((uchar_t *)ipha) 20270 ire_t *ire = NULL; 20271 ire_t *sctp_ire = NULL; 20272 uint32_t v_hlen_tos_len; 20273 ipaddr_t dst; 20274 mblk_t *first_mp = NULL; 20275 boolean_t mctl_present; 20276 ipsec_out_t *io; 20277 int match_flags; 20278 ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. 
*/ 20279 ipif_t *dst_ipif; 20280 boolean_t multirt_need_resolve = B_FALSE; 20281 mblk_t *copy_mp = NULL; 20282 int err = 0; 20283 zoneid_t zoneid; 20284 boolean_t need_decref = B_FALSE; 20285 boolean_t ignore_dontroute = B_FALSE; 20286 boolean_t ignore_nexthop = B_FALSE; 20287 boolean_t ip_nexthop = B_FALSE; 20288 ipaddr_t nexthop_addr; 20289 ip_stack_t *ipst; 20290 20291 #ifdef _BIG_ENDIAN 20292 #define V_HLEN (v_hlen_tos_len >> 24) 20293 #else 20294 #define V_HLEN (v_hlen_tos_len & 0xFF) 20295 #endif 20296 20297 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 20298 "ip_wput_start: q %p", q); 20299 20300 /* 20301 * ip_wput fast path 20302 */ 20303 20304 /* is packet from ARP ? */ 20305 if (q->q_next != NULL) { 20306 zoneid = (zoneid_t)(uintptr_t)arg; 20307 goto qnext; 20308 } 20309 20310 connp = (conn_t *)arg; 20311 ASSERT(connp != NULL); 20312 zoneid = connp->conn_zoneid; 20313 ipst = connp->conn_netstack->netstack_ip; 20314 ASSERT(ipst != NULL); 20315 20316 /* is queue flow controlled? */ 20317 if ((q->q_first != NULL || connp->conn_draining) && 20318 (caller == IP_WPUT)) { 20319 ASSERT(!need_decref); 20320 ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp)); 20321 (void) putq(q, mp); 20322 return; 20323 } 20324 20325 /* Multidata transmit? */ 20326 if (DB_TYPE(mp) == M_MULTIDATA) { 20327 /* 20328 * We should never get here, since all Multidata messages 20329 * originating from tcp should have been directed over to 20330 * tcp_multisend() in the first place. 20331 */ 20332 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20333 freemsg(mp); 20334 return; 20335 } else if (DB_TYPE(mp) != M_DATA) 20336 goto notdata; 20337 20338 if (mp->b_flag & MSGHASREF) { 20339 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20340 mp->b_flag &= ~MSGHASREF; 20341 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 20342 need_decref = B_TRUE; 20343 } 20344 ipha = (ipha_t *)mp->b_rptr; 20345 20346 /* is IP header non-aligned or mblk smaller than basic IP header */ 20347 #ifndef SAFETY_BEFORE_SPEED 20348 if (!OK_32PTR(rptr) || 20349 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 20350 goto hdrtoosmall; 20351 #endif 20352 20353 ASSERT(OK_32PTR(ipha)); 20354 20355 /* 20356 * This function assumes that mp points to an IPv4 packet. If it's the 20357 * wrong version, we'll catch it again in ip_output_v6. 20358 * 20359 * Note that this is *only* locally-generated output here, and never 20360 * forwarded data, and that we need to deal only with transports that 20361 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 20362 * label.) 20363 */ 20364 if (is_system_labeled() && 20365 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 20366 !connp->conn_ulp_labeled) { 20367 cred_t *credp; 20368 pid_t pid; 20369 20370 credp = BEST_CRED(mp, connp, &pid); 20371 err = tsol_check_label(credp, &mp, 20372 connp->conn_mac_exempt, ipst, pid); 20373 ipha = (ipha_t *)mp->b_rptr; 20374 if (err != 0) { 20375 first_mp = mp; 20376 if (err == EINVAL) 20377 goto icmp_parameter_problem; 20378 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 20379 goto discard_pkt; 20380 } 20381 } 20382 20383 ASSERT(infop != NULL); 20384 20385 if (infop->ip_opt_flags & IP_VERIFY_SRC) { 20386 /* 20387 * IP_PKTINFO ancillary option is present. 20388 * IPCL_ZONEID is used to honor IP_ALLZONES option which 20389 * allows using address of any zone as the source address. 
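		 *
		 * An illustrative userland sketch (assumes the usual
		 * in_pktinfo/CMSG ancillary interface; the address and
		 * descriptor are hypothetical) of how a sender reaches this
		 * verification path:
		 *
		 *	struct in_pktinfo pi;
		 *	struct msghdr msg;
		 *	struct cmsghdr *cmp;
		 *	char cbuf[CMSG_SPACE(sizeof (pi))];
		 *
		 *	bzero(&pi, sizeof (pi));
		 *	bzero(&msg, sizeof (msg));
		 *	pi.ipi_spec_dst.s_addr = inet_addr("10.1.2.3");
		 *	... point msg.msg_name at the destination ...
		 *	msg.msg_control = cbuf;
		 *	msg.msg_controllen = sizeof (cbuf);
		 *	cmp = CMSG_FIRSTHDR(&msg);
		 *	cmp->cmsg_level = IPPROTO_IP;
		 *	cmp->cmsg_type = IP_PKTINFO;
		 *	cmp->cmsg_len = CMSG_LEN(sizeof (pi));
		 *	bcopy(&pi, CMSG_DATA(cmp), sizeof (pi));
		 *	(void) sendmsg(fd, &msg, 0);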
20390 */ 20391 ire = ire_ctable_lookup(ipha->ipha_src, 0, 20392 (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), 20393 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 20394 if (ire == NULL) 20395 goto drop_pkt; 20396 ire_refrele(ire); 20397 ire = NULL; 20398 } 20399 20400 /* 20401 * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. 20402 */ 20403 if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { 20404 xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, 20405 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20406 20407 if (xmit_ill == NULL || IS_VNI(xmit_ill)) 20408 goto drop_pkt; 20409 /* 20410 * check that there is an ipif belonging 20411 * to our zone. IPCL_ZONEID is not used because 20412 * IP_ALLZONES option is valid only when the ill is 20413 * accessible from all zones i.e has a valid ipif in 20414 * all zones. 20415 */ 20416 if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { 20417 goto drop_pkt; 20418 } 20419 } 20420 20421 /* 20422 * If there is a policy, try to attach an ipsec_out in 20423 * the front. At the end, first_mp either points to a 20424 * M_DATA message or IPSEC_OUT message linked to a 20425 * M_DATA message. We have to do it now as we might 20426 * lose the "conn" if we go through ip_newroute. 20427 */ 20428 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 20429 if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, 20430 ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { 20431 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20432 if (need_decref) 20433 CONN_DEC_REF(connp); 20434 return; 20435 } else { 20436 ASSERT(mp->b_datap->db_type == M_CTL); 20437 first_mp = mp; 20438 mp = mp->b_cont; 20439 mctl_present = B_TRUE; 20440 } 20441 } else { 20442 first_mp = mp; 20443 mctl_present = B_FALSE; 20444 } 20445 20446 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20447 20448 /* is wrong version or IP options present */ 20449 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 20450 goto version_hdrlen_check; 20451 dst = ipha->ipha_dst; 20452 20453 /* If IP_BOUND_IF has been set, use that ill. */ 20454 if (connp->conn_outgoing_ill != NULL) { 20455 xmit_ill = conn_get_held_ill(connp, 20456 &connp->conn_outgoing_ill, &err); 20457 if (err == ILL_LOOKUP_FAILED) 20458 goto drop_pkt; 20459 20460 goto send_from_ill; 20461 } 20462 20463 /* is packet multicast? */ 20464 if (CLASSD(dst)) 20465 goto multicast; 20466 20467 /* 20468 * If xmit_ill is set above due to index passed in ip_pkt_info. It 20469 * takes precedence over conn_dontroute and conn_nexthop_set 20470 */ 20471 if (xmit_ill != NULL) 20472 goto send_from_ill; 20473 20474 if (connp->conn_dontroute || connp->conn_nexthop_set) { 20475 /* 20476 * If the destination is a broadcast, local, or loopback 20477 * address, SO_DONTROUTE and IP_NEXTHOP go through the 20478 * standard path. 20479 */ 20480 ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); 20481 if ((ire == NULL) || (ire->ire_type & 20482 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { 20483 if (ire != NULL) { 20484 ire_refrele(ire); 20485 /* No more access to ire */ 20486 ire = NULL; 20487 } 20488 /* 20489 * bypass routing checks and go directly to interface. 
20490 */ 20491 if (connp->conn_dontroute) 20492 goto dontroute; 20493 20494 ASSERT(connp->conn_nexthop_set); 20495 ip_nexthop = B_TRUE; 20496 nexthop_addr = connp->conn_nexthop_v4; 20497 goto send_from_ill; 20498 } 20499 20500 /* Must be a broadcast, a loopback or a local ire */ 20501 ire_refrele(ire); 20502 /* No more access to ire */ 20503 ire = NULL; 20504 } 20505 20506 /* 20507 * We cache IRE_CACHEs to avoid lookups. We don't do 20508 * this for the tcp global queue and listen end point 20509 * as it does not really have a real destination to 20510 * talk to. This is also true for SCTP. 20511 */ 20512 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 20513 !connp->conn_fully_bound) { 20514 ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); 20515 if (ire == NULL) 20516 goto noirefound; 20517 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20518 "ip_wput_end: q %p (%S)", q, "end"); 20519 20520 /* 20521 * Check if the ire has the RTF_MULTIRT flag, inherited 20522 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20523 */ 20524 if (ire->ire_flags & RTF_MULTIRT) { 20525 20526 /* 20527 * Force the TTL of multirouted packets if required. 20528 * The TTL of such packets is bounded by the 20529 * ip_multirt_ttl ndd variable. 20530 */ 20531 if ((ipst->ips_ip_multirt_ttl > 0) && 20532 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20533 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20534 "(was %d), dst 0x%08x\n", 20535 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20536 ntohl(ire->ire_addr))); 20537 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20538 } 20539 /* 20540 * We look at this point if there are pending 20541 * unresolved routes. ire_multirt_resolvable() 20542 * checks in O(n) that all IRE_OFFSUBNET ire 20543 * entries for the packet's destination and 20544 * flagged RTF_MULTIRT are currently resolved. 20545 * If some remain unresolved, we make a copy 20546 * of the current message. It will be used 20547 * to initiate additional route resolutions. 20548 */ 20549 multirt_need_resolve = 20550 ire_multirt_need_resolve(ire->ire_addr, 20551 msg_getlabel(first_mp), ipst); 20552 ip2dbg(("ip_wput[TCP]: ire %p, " 20553 "multirt_need_resolve %d, first_mp %p\n", 20554 (void *)ire, multirt_need_resolve, 20555 (void *)first_mp)); 20556 if (multirt_need_resolve) { 20557 copy_mp = copymsg(first_mp); 20558 if (copy_mp != NULL) { 20559 MULTIRT_DEBUG_TAG(copy_mp); 20560 } 20561 } 20562 } 20563 20564 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20565 20566 /* 20567 * Try to resolve another multiroute if 20568 * ire_multirt_need_resolve() deemed it necessary. 20569 */ 20570 if (copy_mp != NULL) 20571 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20572 if (need_decref) 20573 CONN_DEC_REF(connp); 20574 return; 20575 } 20576 20577 /* 20578 * Access to conn_ire_cache. (protected by conn_lock) 20579 * 20580 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 20581 * the ire bucket lock here to check for CONDEMNED as it is okay to 20582 * send a packet or two with the IRE_CACHE that is going away. 20583 * Access to the ire requires an ire refhold on the ire prior to 20584 * its use since an interface unplumb thread may delete the cached 20585 * ire and release the refhold at any time. 20586 * 20587 * Caching an ire in the conn_ire_cache 20588 * 20589 * o Caching an ire pointer in the conn requires a strict check for 20590 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 20591 * ires before cleaning up the conns. 
	 * So the caching of an ire pointer
	 * in the conn is done after making sure under the bucket lock that the
	 * ire has not yet been marked CONDEMNED. Otherwise we will end up
	 * caching an ire after the unplumb thread has cleaned up the conn.
	 * If the conn does not send a packet subsequently, the unplumb thread
	 * will hang waiting for the ire count to drop to zero.
	 *
	 * o We also need to atomically test for a null conn_ire_cache and
	 * set the conn_ire_cache under the protection of the conn_lock
	 * to avoid races among concurrent threads trying to simultaneously
	 * cache an ire in the conn_ire_cache.
	 */
	mutex_enter(&connp->conn_lock);
	ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache;

	if (ire != NULL && ire->ire_addr == dst &&
	    !(ire->ire_marks & IRE_MARK_CONDEMNED)) {

		IRE_REFHOLD(ire);
		mutex_exit(&connp->conn_lock);

	} else {
		boolean_t cached = B_FALSE;
		connp->conn_ire_cache = NULL;
		mutex_exit(&connp->conn_lock);
		/* Release the old ire */
		if (ire != NULL && sctp_ire == NULL)
			IRE_REFRELE_NOTR(ire);

		ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst);
		if (ire == NULL)
			goto noirefound;
		IRE_REFHOLD_NOTR(ire);

		mutex_enter(&connp->conn_lock);
		if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) {
			rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
			if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
				if (connp->conn_ulp == IPPROTO_TCP)
					TCP_CHECK_IREINFO(connp->conn_tcp, ire);
				connp->conn_ire_cache = ire;
				cached = B_TRUE;
			}
			rw_exit(&ire->ire_bucket->irb_lock);
		}
		mutex_exit(&connp->conn_lock);

		/*
		 * We can continue to use the ire but since it was
		 * not cached, we should drop the extra reference.
		 */
		if (!cached)
			IRE_REFRELE_NOTR(ire);
	}

	TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
	    "ip_wput_end: q %p (%S)", q, "end");

	/*
	 * Check if the ire has the RTF_MULTIRT flag, inherited
	 * from an IRE_OFFSUBNET ire entry in ip_newroute().
	 */
	if (ire->ire_flags & RTF_MULTIRT) {
		/*
		 * Force the TTL of multirouted packets if required.
		 * The TTL of such packets is bounded by the
		 * ip_multirt_ttl ndd variable.
		 */
		if ((ipst->ips_ip_multirt_ttl > 0) &&
		    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
			ip2dbg(("ip_wput: forcing multirt TTL to %d "
			    "(was %d), dst 0x%08x\n",
			    ipst->ips_ip_multirt_ttl, ipha->ipha_ttl,
			    ntohl(ire->ire_addr)));
			ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
		}

		/*
		 * At this point, we check to see if there are any pending
		 * unresolved routes. ire_multirt_resolvable()
		 * checks in O(n) that all IRE_OFFSUBNET ire
		 * entries for the packet's destination and
		 * flagged RTF_MULTIRT are currently resolved.
		 * If some remain unresolved, we make a copy
		 * of the current message. It will be used
		 * to initiate additional route resolutions.
20677 */ 20678 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 20679 msg_getlabel(first_mp), ipst); 20680 ip2dbg(("ip_wput[not TCP]: ire %p, " 20681 "multirt_need_resolve %d, first_mp %p\n", 20682 (void *)ire, multirt_need_resolve, (void *)first_mp)); 20683 if (multirt_need_resolve) { 20684 copy_mp = copymsg(first_mp); 20685 if (copy_mp != NULL) { 20686 MULTIRT_DEBUG_TAG(copy_mp); 20687 } 20688 } 20689 } 20690 20691 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20692 20693 /* 20694 * Try to resolve another multiroute if 20695 * ire_multirt_resolvable() deemed it necessary 20696 */ 20697 if (copy_mp != NULL) 20698 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20699 if (need_decref) 20700 CONN_DEC_REF(connp); 20701 return; 20702 20703 qnext: 20704 /* 20705 * Upper Level Protocols pass down complete IP datagrams 20706 * as M_DATA messages. Everything else is a sideshow. 20707 * 20708 * 1) We could be re-entering ip_wput because of ip_neworute 20709 * in which case we could have a IPSEC_OUT message. We 20710 * need to pass through ip_wput like other datagrams and 20711 * hence cannot branch to ip_wput_nondata. 20712 * 20713 * 2) ARP, AH, ESP, and other clients who are on the module 20714 * instance of IP stream, give us something to deal with. 20715 * We will handle AH and ESP here and rest in ip_wput_nondata. 20716 * 20717 * 3) ICMP replies also could come here. 20718 */ 20719 ipst = ILLQ_TO_IPST(q); 20720 20721 if (DB_TYPE(mp) != M_DATA) { 20722 notdata: 20723 if (DB_TYPE(mp) == M_CTL) { 20724 /* 20725 * M_CTL messages are used by ARP, AH and ESP to 20726 * communicate with IP. We deal with IPSEC_IN and 20727 * IPSEC_OUT here. ip_wput_nondata handles other 20728 * cases. 20729 */ 20730 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 20731 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 20732 first_mp = mp->b_cont; 20733 first_mp->b_flag &= ~MSGHASREF; 20734 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20735 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 20736 CONN_DEC_REF(connp); 20737 connp = NULL; 20738 } 20739 if (ii->ipsec_info_type == IPSEC_IN) { 20740 /* 20741 * Either this message goes back to 20742 * IPsec for further processing or to 20743 * ULP after policy checks. 20744 */ 20745 ip_fanout_proto_again(mp, NULL, NULL, NULL); 20746 return; 20747 } else if (ii->ipsec_info_type == IPSEC_OUT) { 20748 io = (ipsec_out_t *)ii; 20749 if (io->ipsec_out_proc_begin) { 20750 /* 20751 * IPsec processing has already started. 20752 * Complete it. 20753 * IPQoS notes: We don't care what is 20754 * in ipsec_out_ill_index since this 20755 * won't be processed for IPQoS policies 20756 * in ipsec_out_process. 20757 */ 20758 ipsec_out_process(q, mp, NULL, 20759 io->ipsec_out_ill_index); 20760 return; 20761 } else { 20762 connp = (q->q_next != NULL) ? 20763 NULL : Q_TO_CONN(q); 20764 first_mp = mp; 20765 mp = mp->b_cont; 20766 mctl_present = B_TRUE; 20767 } 20768 zoneid = io->ipsec_out_zoneid; 20769 ASSERT(zoneid != ALL_ZONES); 20770 } else if (ii->ipsec_info_type == IPSEC_CTL) { 20771 /* 20772 * It's an IPsec control message requesting 20773 * an SADB update to be sent to the IPsec 20774 * hardware acceleration capable ills. 
20775 */ 20776 ipsec_ctl_t *ipsec_ctl = 20777 (ipsec_ctl_t *)mp->b_rptr; 20778 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 20779 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 20780 mblk_t *cmp = mp->b_cont; 20781 20782 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 20783 ASSERT(cmp != NULL); 20784 20785 freeb(mp); 20786 ill_ipsec_capab_send_all(satype, cmp, sa, 20787 ipst->ips_netstack); 20788 return; 20789 } else { 20790 /* 20791 * This must be ARP or special TSOL signaling. 20792 */ 20793 ip_wput_nondata(NULL, q, mp, NULL); 20794 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20795 "ip_wput_end: q %p (%S)", q, "nondata"); 20796 return; 20797 } 20798 } else { 20799 /* 20800 * This must be non-(ARP/AH/ESP) messages. 20801 */ 20802 ASSERT(!need_decref); 20803 ip_wput_nondata(NULL, q, mp, NULL); 20804 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20805 "ip_wput_end: q %p (%S)", q, "nondata"); 20806 return; 20807 } 20808 } else { 20809 first_mp = mp; 20810 mctl_present = B_FALSE; 20811 } 20812 20813 ASSERT(first_mp != NULL); 20814 20815 if (mctl_present) { 20816 io = (ipsec_out_t *)first_mp->b_rptr; 20817 if (io->ipsec_out_ip_nexthop) { 20818 /* 20819 * We may have lost the conn context if we are 20820 * coming here from ip_newroute(). Copy the 20821 * nexthop information. 20822 */ 20823 ip_nexthop = B_TRUE; 20824 nexthop_addr = io->ipsec_out_nexthop_addr; 20825 20826 ipha = (ipha_t *)mp->b_rptr; 20827 dst = ipha->ipha_dst; 20828 goto send_from_ill; 20829 } 20830 } 20831 20832 ASSERT(xmit_ill == NULL); 20833 20834 /* We have a complete IP datagram heading outbound. */ 20835 ipha = (ipha_t *)mp->b_rptr; 20836 20837 #ifndef SPEED_BEFORE_SAFETY 20838 /* 20839 * Make sure we have a full-word aligned message and that at least 20840 * a simple IP header is accessible in the first message. If not, 20841 * try a pullup. For labeled systems we need to always take this 20842 * path as M_CTLs are "notdata" but have trailing data to process. 20843 */ 20844 if (!OK_32PTR(rptr) || 20845 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { 20846 hdrtoosmall: 20847 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 20848 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20849 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 20850 if (first_mp == NULL) 20851 first_mp = mp; 20852 goto discard_pkt; 20853 } 20854 20855 /* This function assumes that mp points to an IPv4 packet. */ 20856 if (is_system_labeled() && 20857 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 20858 (connp == NULL || !connp->conn_ulp_labeled)) { 20859 cred_t *credp; 20860 pid_t pid; 20861 20862 if (connp != NULL) { 20863 credp = BEST_CRED(mp, connp, &pid); 20864 err = tsol_check_label(credp, &mp, 20865 connp->conn_mac_exempt, ipst, pid); 20866 } else if ((credp = msg_getcred(mp, &pid)) != NULL) { 20867 err = tsol_check_label(credp, &mp, 20868 B_FALSE, ipst, pid); 20869 } 20870 ipha = (ipha_t *)mp->b_rptr; 20871 if (mctl_present) 20872 first_mp->b_cont = mp; 20873 else 20874 first_mp = mp; 20875 if (err != 0) { 20876 if (err == EINVAL) 20877 goto icmp_parameter_problem; 20878 ip2dbg(("ip_wput: label check failed (%d)\n", 20879 err)); 20880 goto discard_pkt; 20881 } 20882 } 20883 20884 ipha = (ipha_t *)mp->b_rptr; 20885 if (first_mp == NULL) { 20886 ASSERT(xmit_ill == NULL); 20887 /* 20888 * If we got here because of "goto hdrtoosmall" 20889 * We need to attach a IPSEC_OUT. 
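			 *
			 * For orientation, a sketch of the two message
			 * shapes that the mctl_present convention refers to
			 * throughout this function:
			 *
			 *	mctl_present == B_TRUE:
			 *		first_mp (M_CTL, ipsec_out_t)
			 *		  b_cont -> mp (M_DATA, IP header)
			 *
			 *	mctl_present == B_FALSE:
			 *		first_mp == mp (M_DATA, IP header)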
20890 */ 20891 if (connp->conn_out_enforce_policy) { 20892 if (((mp = ipsec_attach_ipsec_out(&mp, connp, 20893 NULL, ipha->ipha_protocol, 20894 ipst->ips_netstack)) == NULL)) { 20895 BUMP_MIB(&ipst->ips_ip_mib, 20896 ipIfStatsOutDiscards); 20897 if (need_decref) 20898 CONN_DEC_REF(connp); 20899 return; 20900 } else { 20901 ASSERT(mp->b_datap->db_type == M_CTL); 20902 first_mp = mp; 20903 mp = mp->b_cont; 20904 mctl_present = B_TRUE; 20905 } 20906 } else { 20907 first_mp = mp; 20908 mctl_present = B_FALSE; 20909 } 20910 } 20911 } 20912 #endif 20913 20914 /* Most of the code below is written for speed, not readability */ 20915 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20916 20917 /* 20918 * If ip_newroute() fails, we're going to need a full 20919 * header for the icmp wraparound. 20920 */ 20921 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 20922 uint_t v_hlen; 20923 version_hdrlen_check: 20924 ASSERT(first_mp != NULL); 20925 v_hlen = V_HLEN; 20926 /* 20927 * siphon off IPv6 packets coming down from transport 20928 * layer modules here. 20929 * Note: high-order bit carries NUD reachability confirmation 20930 */ 20931 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 20932 /* 20933 * FIXME: assume that callers of ip_output* call 20934 * the right version? 20935 */ 20936 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); 20937 ASSERT(xmit_ill == NULL); 20938 if (need_decref) 20939 mp->b_flag |= MSGHASREF; 20940 (void) ip_output_v6(arg, first_mp, arg2, caller); 20941 return; 20942 } 20943 20944 if ((v_hlen >> 4) != IP_VERSION) { 20945 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20946 "ip_wput_end: q %p (%S)", q, "badvers"); 20947 goto discard_pkt; 20948 } 20949 /* 20950 * Is the header length at least 20 bytes? 20951 * 20952 * Are there enough bytes accessible in the header? If 20953 * not, try a pullup. 20954 */ 20955 v_hlen &= 0xF; 20956 v_hlen <<= 2; 20957 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 20958 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20959 "ip_wput_end: q %p (%S)", q, "badlen"); 20960 goto discard_pkt; 20961 } 20962 if (v_hlen > (mp->b_wptr - rptr)) { 20963 if (!pullupmsg(mp, v_hlen)) { 20964 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20965 "ip_wput_end: q %p (%S)", q, "badpullup2"); 20966 goto discard_pkt; 20967 } 20968 ipha = (ipha_t *)mp->b_rptr; 20969 } 20970 /* 20971 * Move first entry from any source route into ipha_dst and 20972 * verify the options 20973 */ 20974 if (ip_wput_options(q, first_mp, ipha, mctl_present, 20975 zoneid, ipst)) { 20976 ASSERT(xmit_ill == NULL); 20977 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20978 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20979 "ip_wput_end: q %p (%S)", q, "badopts"); 20980 if (need_decref) 20981 CONN_DEC_REF(connp); 20982 return; 20983 } 20984 } 20985 dst = ipha->ipha_dst; 20986 20987 /* 20988 * Try to get an IRE_CACHE for the destination address. If we can't, 20989 * we have to run the packet through ip_newroute which will take 20990 * the appropriate action to arrange for an IRE_CACHE, such as querying 20991 * a resolver, or assigning a default gateway, etc. 20992 */ 20993 if (CLASSD(dst)) { 20994 ipif_t *ipif; 20995 uint32_t setsrc = 0; 20996 20997 multicast: 20998 ASSERT(first_mp != NULL); 20999 ip2dbg(("ip_wput: CLASSD\n")); 21000 if (connp == NULL) { 21001 /* 21002 * Use the first good ipif on the ill. 21003 * XXX Should this ever happen? (Appears 21004 * to show up with just ppp and no ethernet due 21005 * to in.rdisc.) 21006 * However, ire_send should be able to 21007 * call ip_wput_ire directly. 
21008 * 21009 * XXX Also, this can happen for ICMP and other packets 21010 * with multicast source addresses. Perhaps we should 21011 * fix things so that we drop the packet in question, 21012 * but for now, just run with it. 21013 */ 21014 ill_t *ill = (ill_t *)q->q_ptr; 21015 21016 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 21017 if (ipif == NULL) { 21018 if (need_decref) 21019 CONN_DEC_REF(connp); 21020 freemsg(first_mp); 21021 return; 21022 } 21023 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 21024 ntohl(dst), ill->ill_name)); 21025 } else { 21026 /* 21027 * The order of precedence is IP_BOUND_IF, IP_PKTINFO 21028 * and IP_MULTICAST_IF. The block comment above this 21029 * function explains the locking mechanism used here. 21030 */ 21031 if (xmit_ill == NULL) { 21032 xmit_ill = conn_get_held_ill(connp, 21033 &connp->conn_outgoing_ill, &err); 21034 if (err == ILL_LOOKUP_FAILED) { 21035 ip1dbg(("ip_wput: No ill for " 21036 "IP_BOUND_IF\n")); 21037 BUMP_MIB(&ipst->ips_ip_mib, 21038 ipIfStatsOutNoRoutes); 21039 goto drop_pkt; 21040 } 21041 } 21042 21043 if (xmit_ill == NULL) { 21044 ipif = conn_get_held_ipif(connp, 21045 &connp->conn_multicast_ipif, &err); 21046 if (err == IPIF_LOOKUP_FAILED) { 21047 ip1dbg(("ip_wput: No ipif for " 21048 "multicast\n")); 21049 BUMP_MIB(&ipst->ips_ip_mib, 21050 ipIfStatsOutNoRoutes); 21051 goto drop_pkt; 21052 } 21053 } 21054 if (xmit_ill != NULL) { 21055 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21056 if (ipif == NULL) { 21057 ip1dbg(("ip_wput: No ipif for " 21058 "xmit_ill\n")); 21059 BUMP_MIB(&ipst->ips_ip_mib, 21060 ipIfStatsOutNoRoutes); 21061 goto drop_pkt; 21062 } 21063 } else if (ipif == NULL || ipif->ipif_isv6) { 21064 /* 21065 * We must do this ipif determination here 21066 * else we could pass through ip_newroute 21067 * and come back here without the conn context. 21068 * 21069 * Note: we do late binding i.e. we bind to 21070 * the interface when the first packet is sent. 21071 * For performance reasons we do not rebind on 21072 * each packet but keep the binding until the 21073 * next IP_MULTICAST_IF option. 21074 * 21075 * conn_multicast_{ipif,ill} are shared between 21076 * IPv4 and IPv6 and AF_INET6 sockets can 21077 * send both IPv4 and IPv6 packets. Hence 21078 * we have to check that "isv6" matches above. 21079 */ 21080 if (ipif != NULL) 21081 ipif_refrele(ipif); 21082 ipif = ipif_lookup_group(dst, zoneid, ipst); 21083 if (ipif == NULL) { 21084 ip1dbg(("ip_wput: No ipif for " 21085 "multicast\n")); 21086 BUMP_MIB(&ipst->ips_ip_mib, 21087 ipIfStatsOutNoRoutes); 21088 goto drop_pkt; 21089 } 21090 err = conn_set_held_ipif(connp, 21091 &connp->conn_multicast_ipif, ipif); 21092 if (err == IPIF_LOOKUP_FAILED) { 21093 ipif_refrele(ipif); 21094 ip1dbg(("ip_wput: No ipif for " 21095 "multicast\n")); 21096 BUMP_MIB(&ipst->ips_ip_mib, 21097 ipIfStatsOutNoRoutes); 21098 goto drop_pkt; 21099 } 21100 } 21101 } 21102 ASSERT(!ipif->ipif_isv6); 21103 /* 21104 * As we may lose the conn by the time we reach ip_wput_ire, 21105 * we copy conn_multicast_loop and conn_dontroute on to an 21106 * ipsec_out. In case if this datagram goes out secure, 21107 * we need the ill_index also. Copy that also into the 21108 * ipsec_out. 
		 */
		if (mctl_present) {
			io = (ipsec_out_t *)first_mp->b_rptr;
			ASSERT(first_mp->b_datap->db_type == M_CTL);
			ASSERT(io->ipsec_out_type == IPSEC_OUT);
		} else {
			ASSERT(mp == first_mp);
			if ((first_mp = allocb(sizeof (ipsec_info_t),
			    BPRI_HI)) == NULL) {
				ipif_refrele(ipif);
				first_mp = mp;
				goto discard_pkt;
			}
			first_mp->b_datap->db_type = M_CTL;
			first_mp->b_wptr += sizeof (ipsec_info_t);
			/* ipsec_out_secure is B_FALSE now */
			bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
			io = (ipsec_out_t *)first_mp->b_rptr;
			io->ipsec_out_type = IPSEC_OUT;
			io->ipsec_out_len = sizeof (ipsec_out_t);
			io->ipsec_out_use_global_policy = B_TRUE;
			io->ipsec_out_ns = ipst->ips_netstack;
			first_mp->b_cont = mp;
			mctl_present = B_TRUE;
		}

		match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
		io->ipsec_out_ill_index =
		    ipif->ipif_ill->ill_phyint->phyint_ifindex;

		if (connp != NULL) {
			io->ipsec_out_multicast_loop =
			    connp->conn_multicast_loop;
			io->ipsec_out_dontroute = connp->conn_dontroute;
			io->ipsec_out_zoneid = connp->conn_zoneid;
		}
		/*
		 * If the application uses IP_MULTICAST_IF with
		 * different logical addresses of the same ILL, we
		 * need to make sure that the source address of
		 * the packet matches the logical IP address used
		 * in the option. We do it by initializing ipha_src
		 * here. This should keep IPsec also happy as
		 * when we return from IPsec processing, we don't
		 * have to worry about getting the right address on
		 * the packet. Thus it is sufficient to look for
		 * IRE_CACHE using MATCH_IRE_ILL rather than
		 * MATCH_IRE_IPIF.
		 *
		 * NOTE: We need to do it for the non-secure case also as
		 * this might go out secure if there is a global policy
		 * match in ip_wput_ire.
		 *
		 * As we do not have the ire yet, it is possible that
		 * we set the source address here and then later discover
		 * that the ire implies the source address to be assigned
		 * through the RTF_SETSRC flag.
		 * In that case, the setsrc variable will remind us
		 * that overwriting the source address by the one
		 * of the RTF_SETSRC-flagged ire is allowed.
		 */
		if (ipha->ipha_src == INADDR_ANY &&
		    (connp == NULL || !connp->conn_unspec_src)) {
			ipha->ipha_src = ipif->ipif_src_addr;
			setsrc = RTF_SETSRC;
		}
		/*
		 * Find an IRE which matches the destination and the outgoing
		 * queue (i.e. the outgoing interface.)
		 * For loopback use a unicast IP address for
		 * the ire lookup.
		 */
		if (IS_LOOPBACK(ipif->ipif_ill))
			dst = ipif->ipif_lcl_addr;

		/*
		 * If xmit_ill is set, we branch out to ip_newroute_ipif.
		 * We don't need to look up the ire in the ctable as the packet
		 * needs to be sent to the destination through the specified
		 * ill irrespective of ires in the cache table.
		 */
		ire = NULL;
		if (xmit_ill == NULL) {
			ire = ire_ctable_lookup(dst, 0, 0, ipif,
			    zoneid, msg_getlabel(mp), match_flags, ipst);
		}

		if (ire == NULL) {
			/*
			 * Multicast loopback and multicast forwarding is
			 * done in ip_wput_ire.
			 *
			 * Mark this packet to make it be delivered to
			 * ip_wput_ire after the new ire has been
			 * created.
21204 * 21205 * The call to ip_newroute_ipif takes into account 21206 * the setsrc reminder. In any case, we take care 21207 * of the RTF_MULTIRT flag. 21208 */ 21209 mp->b_prev = mp->b_next = NULL; 21210 if (xmit_ill == NULL || 21211 xmit_ill->ill_ipif_up_count > 0) { 21212 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 21213 setsrc | RTF_MULTIRT, zoneid, infop); 21214 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21215 "ip_wput_end: q %p (%S)", q, "noire"); 21216 } else { 21217 freemsg(first_mp); 21218 } 21219 ipif_refrele(ipif); 21220 if (xmit_ill != NULL) 21221 ill_refrele(xmit_ill); 21222 if (need_decref) 21223 CONN_DEC_REF(connp); 21224 return; 21225 } 21226 21227 ipif_refrele(ipif); 21228 ipif = NULL; 21229 ASSERT(xmit_ill == NULL); 21230 21231 /* 21232 * Honor the RTF_SETSRC flag for multicast packets, 21233 * if allowed by the setsrc reminder. 21234 */ 21235 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 21236 ipha->ipha_src = ire->ire_src_addr; 21237 } 21238 21239 /* 21240 * Unconditionally force the TTL to 1 for 21241 * multirouted multicast packets: 21242 * multirouted multicast should not cross 21243 * multicast routers. 21244 */ 21245 if (ire->ire_flags & RTF_MULTIRT) { 21246 if (ipha->ipha_ttl > 1) { 21247 ip2dbg(("ip_wput: forcing multicast " 21248 "multirt TTL to 1 (was %d), dst 0x%08x\n", 21249 ipha->ipha_ttl, ntohl(ire->ire_addr))); 21250 ipha->ipha_ttl = 1; 21251 } 21252 } 21253 } else { 21254 ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); 21255 if ((ire != NULL) && (ire->ire_type & 21256 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 21257 ignore_dontroute = B_TRUE; 21258 ignore_nexthop = B_TRUE; 21259 } 21260 if (ire != NULL) { 21261 ire_refrele(ire); 21262 ire = NULL; 21263 } 21264 /* 21265 * Guard against coming in from arp in which case conn is NULL. 21266 * Also guard against non M_DATA with dontroute set but 21267 * destined to local, loopback or broadcast addresses. 21268 */ 21269 if (connp != NULL && connp->conn_dontroute && 21270 !ignore_dontroute) { 21271 dontroute: 21272 /* 21273 * Set TTL to 1 if SO_DONTROUTE is set to prevent 21274 * routing protocols from seeing false direct 21275 * connectivity. 21276 */ 21277 ipha->ipha_ttl = 1; 21278 /* If suitable ipif not found, drop packet */ 21279 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); 21280 if (dst_ipif == NULL) { 21281 noroute: 21282 ip1dbg(("ip_wput: no route for dst using" 21283 " SO_DONTROUTE\n")); 21284 BUMP_MIB(&ipst->ips_ip_mib, 21285 ipIfStatsOutNoRoutes); 21286 mp->b_prev = mp->b_next = NULL; 21287 if (first_mp == NULL) 21288 first_mp = mp; 21289 goto drop_pkt; 21290 } else { 21291 /* 21292 * If suitable ipif has been found, set 21293 * xmit_ill to the corresponding 21294 * ipif_ill because we'll be using the 21295 * send_from_ill logic below. 21296 */ 21297 ASSERT(xmit_ill == NULL); 21298 xmit_ill = dst_ipif->ipif_ill; 21299 mutex_enter(&xmit_ill->ill_lock); 21300 if (!ILL_CAN_LOOKUP(xmit_ill)) { 21301 mutex_exit(&xmit_ill->ill_lock); 21302 xmit_ill = NULL; 21303 ipif_refrele(dst_ipif); 21304 goto noroute; 21305 } 21306 ill_refhold_locked(xmit_ill); 21307 mutex_exit(&xmit_ill->ill_lock); 21308 ipif_refrele(dst_ipif); 21309 } 21310 } 21311 21312 send_from_ill: 21313 if (xmit_ill != NULL) { 21314 ipif_t *ipif; 21315 21316 /* 21317 * Mark this packet as originated locally 21318 */ 21319 mp->b_prev = mp->b_next = NULL; 21320 21321 /* 21322 * Could be SO_DONTROUTE case also. 21323 * Verify that at least one ipif is up on the ill. 
21324 */ 21325 if (xmit_ill->ill_ipif_up_count == 0) { 21326 ip1dbg(("ip_output: xmit_ill %s is down\n", 21327 xmit_ill->ill_name)); 21328 goto drop_pkt; 21329 } 21330 21331 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21332 if (ipif == NULL) { 21333 ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", 21334 xmit_ill->ill_name)); 21335 goto drop_pkt; 21336 } 21337 21338 match_flags = 0; 21339 if (IS_UNDER_IPMP(xmit_ill)) 21340 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 21341 21342 /* 21343 * Look for a ire that is part of the group, 21344 * if found use it else call ip_newroute_ipif. 21345 * IPCL_ZONEID is not used for matching because 21346 * IP_ALLZONES option is valid only when the 21347 * ill is accessible from all zones i.e has a 21348 * valid ipif in all zones. 21349 */ 21350 match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21351 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 21352 msg_getlabel(mp), match_flags, ipst); 21353 /* 21354 * If an ire exists use it or else create 21355 * an ire but don't add it to the cache. 21356 * Adding an ire may cause issues with 21357 * asymmetric routing. 21358 * In case of multiroute always act as if 21359 * ire does not exist. 21360 */ 21361 if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { 21362 if (ire != NULL) 21363 ire_refrele(ire); 21364 ip_newroute_ipif(q, first_mp, ipif, 21365 dst, connp, 0, zoneid, infop); 21366 ipif_refrele(ipif); 21367 ip1dbg(("ip_output: xmit_ill via %s\n", 21368 xmit_ill->ill_name)); 21369 ill_refrele(xmit_ill); 21370 if (need_decref) 21371 CONN_DEC_REF(connp); 21372 return; 21373 } 21374 ipif_refrele(ipif); 21375 } else if (ip_nexthop || (connp != NULL && 21376 (connp->conn_nexthop_set)) && !ignore_nexthop) { 21377 if (!ip_nexthop) { 21378 ip_nexthop = B_TRUE; 21379 nexthop_addr = connp->conn_nexthop_v4; 21380 } 21381 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 21382 MATCH_IRE_GW; 21383 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 21384 NULL, zoneid, msg_getlabel(mp), match_flags, ipst); 21385 } else { 21386 ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), 21387 ipst); 21388 } 21389 if (!ire) { 21390 if (ip_nexthop && !ignore_nexthop) { 21391 if (mctl_present) { 21392 io = (ipsec_out_t *)first_mp->b_rptr; 21393 ASSERT(first_mp->b_datap->db_type == 21394 M_CTL); 21395 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21396 } else { 21397 ASSERT(mp == first_mp); 21398 first_mp = allocb( 21399 sizeof (ipsec_info_t), BPRI_HI); 21400 if (first_mp == NULL) { 21401 first_mp = mp; 21402 goto discard_pkt; 21403 } 21404 first_mp->b_datap->db_type = M_CTL; 21405 first_mp->b_wptr += 21406 sizeof (ipsec_info_t); 21407 /* ipsec_out_secure is B_FALSE now */ 21408 bzero(first_mp->b_rptr, 21409 sizeof (ipsec_info_t)); 21410 io = (ipsec_out_t *)first_mp->b_rptr; 21411 io->ipsec_out_type = IPSEC_OUT; 21412 io->ipsec_out_len = 21413 sizeof (ipsec_out_t); 21414 io->ipsec_out_use_global_policy = 21415 B_TRUE; 21416 io->ipsec_out_ns = ipst->ips_netstack; 21417 first_mp->b_cont = mp; 21418 mctl_present = B_TRUE; 21419 } 21420 io->ipsec_out_ip_nexthop = ip_nexthop; 21421 io->ipsec_out_nexthop_addr = nexthop_addr; 21422 } 21423 noirefound: 21424 /* 21425 * Mark this packet as having originated on 21426 * this machine. This will be noted in 21427 * ire_add_then_send, which needs to know 21428 * whether to run it back through ip_wput or 21429 * ip_rput following successful resolution. 
21430 */ 21431 mp->b_prev = NULL; 21432 mp->b_next = NULL; 21433 ip_newroute(q, first_mp, dst, connp, zoneid, ipst); 21434 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21435 "ip_wput_end: q %p (%S)", q, "newroute"); 21436 if (xmit_ill != NULL) 21437 ill_refrele(xmit_ill); 21438 if (need_decref) 21439 CONN_DEC_REF(connp); 21440 return; 21441 } 21442 } 21443 21444 /* We now know where we are going with it. */ 21445 21446 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21447 "ip_wput_end: q %p (%S)", q, "end"); 21448 21449 /* 21450 * Check if the ire has the RTF_MULTIRT flag, inherited 21451 * from an IRE_OFFSUBNET ire entry in ip_newroute. 21452 */ 21453 if (ire->ire_flags & RTF_MULTIRT) { 21454 /* 21455 * Force the TTL of multirouted packets if required. 21456 * The TTL of such packets is bounded by the 21457 * ip_multirt_ttl ndd variable. 21458 */ 21459 if ((ipst->ips_ip_multirt_ttl > 0) && 21460 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 21461 ip2dbg(("ip_wput: forcing multirt TTL to %d " 21462 "(was %d), dst 0x%08x\n", 21463 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 21464 ntohl(ire->ire_addr))); 21465 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 21466 } 21467 /* 21468 * At this point, we check to see if there are any pending 21469 * unresolved routes. ire_multirt_resolvable() 21470 * checks in O(n) that all IRE_OFFSUBNET ire 21471 * entries for the packet's destination and 21472 * flagged RTF_MULTIRT are currently resolved. 21473 * If some remain unresolved, we make a copy 21474 * of the current message. It will be used 21475 * to initiate additional route resolutions. 21476 */ 21477 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 21478 msg_getlabel(first_mp), ipst); 21479 ip2dbg(("ip_wput[noirefound]: ire %p, " 21480 "multirt_need_resolve %d, first_mp %p\n", 21481 (void *)ire, multirt_need_resolve, (void *)first_mp)); 21482 if (multirt_need_resolve) { 21483 copy_mp = copymsg(first_mp); 21484 if (copy_mp != NULL) { 21485 MULTIRT_DEBUG_TAG(copy_mp); 21486 } 21487 } 21488 } 21489 21490 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 21491 /* 21492 * Try to resolve another multiroute if 21493 * ire_multirt_resolvable() deemed it necessary. 21494 * At this point, we need to distinguish 21495 * multicasts from other packets. For multicasts, 21496 * we call ip_newroute_ipif() and request that both 21497 * multirouting and setsrc flags are checked. 
21498 */ 21499 if (copy_mp != NULL) { 21500 if (CLASSD(dst)) { 21501 ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); 21502 if (ipif) { 21503 ASSERT(infop->ip_opt_ill_index == 0); 21504 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 21505 RTF_SETSRC | RTF_MULTIRT, zoneid, infop); 21506 ipif_refrele(ipif); 21507 } else { 21508 MULTIRT_DEBUG_UNTAG(copy_mp); 21509 freemsg(copy_mp); 21510 copy_mp = NULL; 21511 } 21512 } else { 21513 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 21514 } 21515 } 21516 if (xmit_ill != NULL) 21517 ill_refrele(xmit_ill); 21518 if (need_decref) 21519 CONN_DEC_REF(connp); 21520 return; 21521 21522 icmp_parameter_problem: 21523 /* could not have originated externally */ 21524 ASSERT(mp->b_prev == NULL); 21525 if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { 21526 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 21527 /* it's the IP header length that's in trouble */ 21528 icmp_param_problem(q, first_mp, 0, zoneid, ipst); 21529 first_mp = NULL; 21530 } 21531 21532 discard_pkt: 21533 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 21534 drop_pkt: 21535 ip1dbg(("ip_wput: dropped packet\n")); 21536 if (ire != NULL) 21537 ire_refrele(ire); 21538 if (need_decref) 21539 CONN_DEC_REF(connp); 21540 freemsg(first_mp); 21541 if (xmit_ill != NULL) 21542 ill_refrele(xmit_ill); 21543 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21544 "ip_wput_end: q %p (%S)", q, "droppkt"); 21545 } 21546 21547 /* 21548 * If this is a conn_t queue, then we pass in the conn. This includes the 21549 * zoneid. 21550 * Otherwise, this is a message coming back from ARP or for an ill_t queue, 21551 * in which case we use the global zoneid since those are all part of 21552 * the global zone. 21553 */ 21554 void 21555 ip_wput(queue_t *q, mblk_t *mp) 21556 { 21557 if (CONN_Q(q)) 21558 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 21559 else 21560 ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); 21561 } 21562 21563 /* 21564 * 21565 * The following rules must be observed when accessing any ipif or ill 21566 * that has been cached in the conn. Typically conn_outgoing_ill, 21567 * conn_multicast_ipif and conn_multicast_ill. 21568 * 21569 * Access: The ipif or ill pointed to from the conn can be accessed under 21570 * the protection of the conn_lock or after it has been refheld under the 21571 * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or 21572 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 21573 * The reason for this is that a concurrent unplumb could actually be 21574 * cleaning up these cached pointers by walking the conns and might have 21575 * finished cleaning up the conn in question. The macros check that an 21576 * unplumb has not yet started on the ipif or ill. 21577 * 21578 * Caching: An ipif or ill pointer may be cached in the conn only after 21579 * making sure that an unplumb has not started. So the caching is done 21580 * while holding both the conn_lock and the ill_lock and after using the 21581 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 21582 * flag before starting the cleanup of conns. 21583 * 21584 * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock 21585 * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock 21586 * or a reference to the ipif or a reference to an ire that references the 21587 * ipif. An ipif only changes its ill when migrating from an underlying ill 21588 * to an IPMP ill in ipif_up(). 
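 *
 * A minimal caller-side sketch of these rules (illustrative only), as
 * implemented by the lookup functions below:
 *
 *	ill_t	*ill;
 *	int	err;
 *
 *	ill = conn_get_held_ill(connp, &connp->conn_outgoing_ill, &err);
 *	if (ill == NULL) {
 *		if (err == ILL_LOOKUP_FAILED)
 *			... an unplumb is in progress; bail out ...
 *	} else {
 *		... use ill; the refhold keeps it from going away ...
 *		ill_refrele(ill);
 *	}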
21589 */ 21590 ipif_t * 21591 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 21592 { 21593 ipif_t *ipif; 21594 ill_t *ill; 21595 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 21596 21597 *err = 0; 21598 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21599 mutex_enter(&connp->conn_lock); 21600 ipif = *ipifp; 21601 if (ipif != NULL) { 21602 ill = ipif->ipif_ill; 21603 mutex_enter(&ill->ill_lock); 21604 if (IPIF_CAN_LOOKUP(ipif)) { 21605 ipif_refhold_locked(ipif); 21606 mutex_exit(&ill->ill_lock); 21607 mutex_exit(&connp->conn_lock); 21608 rw_exit(&ipst->ips_ill_g_lock); 21609 return (ipif); 21610 } else { 21611 *err = IPIF_LOOKUP_FAILED; 21612 } 21613 mutex_exit(&ill->ill_lock); 21614 } 21615 mutex_exit(&connp->conn_lock); 21616 rw_exit(&ipst->ips_ill_g_lock); 21617 return (NULL); 21618 } 21619 21620 ill_t * 21621 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 21622 { 21623 ill_t *ill; 21624 21625 *err = 0; 21626 mutex_enter(&connp->conn_lock); 21627 ill = *illp; 21628 if (ill != NULL) { 21629 mutex_enter(&ill->ill_lock); 21630 if (ILL_CAN_LOOKUP(ill)) { 21631 ill_refhold_locked(ill); 21632 mutex_exit(&ill->ill_lock); 21633 mutex_exit(&connp->conn_lock); 21634 return (ill); 21635 } else { 21636 *err = ILL_LOOKUP_FAILED; 21637 } 21638 mutex_exit(&ill->ill_lock); 21639 } 21640 mutex_exit(&connp->conn_lock); 21641 return (NULL); 21642 } 21643 21644 static int 21645 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 21646 { 21647 ill_t *ill; 21648 21649 ill = ipif->ipif_ill; 21650 mutex_enter(&connp->conn_lock); 21651 mutex_enter(&ill->ill_lock); 21652 if (IPIF_CAN_LOOKUP(ipif)) { 21653 *ipifp = ipif; 21654 mutex_exit(&ill->ill_lock); 21655 mutex_exit(&connp->conn_lock); 21656 return (0); 21657 } 21658 mutex_exit(&ill->ill_lock); 21659 mutex_exit(&connp->conn_lock); 21660 return (IPIF_LOOKUP_FAILED); 21661 } 21662 21663 /* 21664 * This is called if the outbound datagram needs fragmentation. 21665 * 21666 * NOTE : This function does not ire_refrele the ire argument passed in. 21667 */ 21668 static void 21669 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, 21670 ip_stack_t *ipst, conn_t *connp) 21671 { 21672 ipha_t *ipha; 21673 mblk_t *mp; 21674 uint32_t v_hlen_tos_len; 21675 uint32_t max_frag; 21676 uint32_t frag_flag; 21677 boolean_t dont_use; 21678 21679 if (ipsec_mp->b_datap->db_type == M_CTL) { 21680 mp = ipsec_mp->b_cont; 21681 } else { 21682 mp = ipsec_mp; 21683 } 21684 21685 ipha = (ipha_t *)mp->b_rptr; 21686 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 21687 21688 #ifdef _BIG_ENDIAN 21689 #define V_HLEN (v_hlen_tos_len >> 24) 21690 #define LENGTH (v_hlen_tos_len & 0xFFFF) 21691 #else 21692 #define V_HLEN (v_hlen_tos_len & 0xFF) 21693 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 21694 #endif 21695 21696 #ifndef SPEED_BEFORE_SAFETY 21697 /* 21698 * Check that ipha_length is consistent with 21699 * the mblk length 21700 */ 21701 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 21702 ip0dbg(("Packet length mismatch: %d, %ld\n", 21703 LENGTH, msgdsize(mp))); 21704 freemsg(ipsec_mp); 21705 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21706 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 21707 "packet length mismatch"); 21708 return; 21709 } 21710 #endif 21711 /* 21712 * Don't use frag_flag if pre-built packet or source 21713 * routed or if multicast (since multicast packets do not solicit 21714 * ICMP "packet too big" messages). 
Get the values of 21715 * max_frag and frag_flag atomically by acquiring the 21716 * ire_lock. 21717 */ 21718 mutex_enter(&ire->ire_lock); 21719 max_frag = ire->ire_max_frag; 21720 frag_flag = ire->ire_frag_flag; 21721 mutex_exit(&ire->ire_lock); 21722 21723 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 21724 (V_HLEN != IP_SIMPLE_HDR_VERSION && 21725 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 21726 21727 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 21728 (dont_use ? 0 : frag_flag), zoneid, ipst, connp); 21729 } 21730 21731 /* 21732 * Used for deciding the MSS size for the upper layer. Thus 21733 * we need to check the outbound policy values in the conn. 21734 */ 21735 int 21736 conn_ipsec_length(conn_t *connp) 21737 { 21738 ipsec_latch_t *ipl; 21739 21740 ipl = connp->conn_latch; 21741 if (ipl == NULL) 21742 return (0); 21743 21744 if (ipl->ipl_out_policy == NULL) 21745 return (0); 21746 21747 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 21748 } 21749 21750 /* 21751 * Returns an estimate of the IPsec headers size. This is used if 21752 * we don't want to call into IPsec to get the exact size. 21753 */ 21754 int 21755 ipsec_out_extra_length(mblk_t *ipsec_mp) 21756 { 21757 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 21758 ipsec_action_t *a; 21759 21760 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21761 if (!io->ipsec_out_secure) 21762 return (0); 21763 21764 a = io->ipsec_out_act; 21765 21766 if (a == NULL) { 21767 ASSERT(io->ipsec_out_policy != NULL); 21768 a = io->ipsec_out_policy->ipsp_act; 21769 } 21770 ASSERT(a != NULL); 21771 21772 return (a->ipa_ovhd); 21773 } 21774 21775 /* 21776 * Returns an estimate of the IPsec headers size. This is used if 21777 * we don't want to call into IPsec to get the exact size. 21778 */ 21779 int 21780 ipsec_in_extra_length(mblk_t *ipsec_mp) 21781 { 21782 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21783 ipsec_action_t *a; 21784 21785 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21786 21787 a = ii->ipsec_in_action; 21788 return (a == NULL ? 0 : a->ipa_ovhd); 21789 } 21790 21791 /* 21792 * If there are any source route options, return the true final 21793 * destination. Otherwise, return the destination. 21794 */ 21795 ipaddr_t 21796 ip_get_dst(ipha_t *ipha) 21797 { 21798 ipoptp_t opts; 21799 uchar_t *opt; 21800 uint8_t optval; 21801 uint8_t optlen; 21802 ipaddr_t dst; 21803 uint32_t off; 21804 21805 dst = ipha->ipha_dst; 21806 21807 if (IS_SIMPLE_IPH(ipha)) 21808 return (dst); 21809 21810 for (optval = ipoptp_first(&opts, ipha); 21811 optval != IPOPT_EOL; 21812 optval = ipoptp_next(&opts)) { 21813 opt = opts.ipoptp_cur; 21814 optlen = opts.ipoptp_len; 21815 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 21816 switch (optval) { 21817 case IPOPT_SSRR: 21818 case IPOPT_LSRR: 21819 off = opt[IPOPT_OFFSET]; 21820 /* 21821 * If one of the conditions is true, it means 21822 * end of options and dst already has the right 21823 * value. 
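 *
 * To illustrate (example bytes, not taken from the code): an LSRR
 * option carrying two addresses is laid out as
 *
 *	opt[0] = IPOPT_LSRR, opt[1] = optlen (here 11),
 *	opt[2] = pointer, opt[3..6] = intermediate hop,
 *	opt[7..10] = true final destination,
 *
 * so when the route has not been exhausted we copy the last
 * IP_ADDR_LEN bytes of the option, &opt[optlen - IP_ADDR_LEN],
 * over dst below.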
21824 */ 21825 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 21826 off = optlen - IP_ADDR_LEN; 21827 bcopy(&opt[off], &dst, IP_ADDR_LEN); 21828 } 21829 return (dst); 21830 default: 21831 break; 21832 } 21833 } 21834 21835 return (dst); 21836 } 21837 21838 mblk_t * 21839 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 21840 conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) 21841 { 21842 ipsec_out_t *io; 21843 mblk_t *first_mp; 21844 boolean_t policy_present; 21845 ip_stack_t *ipst; 21846 ipsec_stack_t *ipss; 21847 21848 ASSERT(ire != NULL); 21849 ipst = ire->ire_ipst; 21850 ipss = ipst->ips_netstack->netstack_ipsec; 21851 21852 first_mp = mp; 21853 if (mp->b_datap->db_type == M_CTL) { 21854 io = (ipsec_out_t *)first_mp->b_rptr; 21855 /* 21856 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 21857 * 21858 * 1) There is per-socket policy (including cached global 21859 * policy) or a policy on the IP-in-IP tunnel. 21860 * 2) There is no per-socket policy, but it is 21861 * a multicast packet that needs to go out 21862 * on a specific interface. This is the case 21863 * where ip_wput and ip_wput_multicast attach 21864 * an IPSEC_OUT and set ipsec_out_secure to B_FALSE. 21865 * 21866 * In case (2) we check with global policy to 21867 * see if there is a match and set the ill_index 21868 * appropriately so that we can look up the ire 21869 * properly in ip_wput_ipsec_out. 21870 */ 21871 21872 /* 21873 * ipsec_out_use_global_policy is set to B_FALSE 21874 * in ipsec_in_to_out(). Refer to that function for 21875 * details. 21876 */ 21877 if ((io->ipsec_out_latch == NULL) && 21878 (io->ipsec_out_use_global_policy)) { 21879 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 21880 ire, connp, unspec_src, zoneid)); 21881 } 21882 if (!io->ipsec_out_secure) { 21883 /* 21884 * If this is not a secure packet, drop 21885 * the IPSEC_OUT mp and treat it as a clear 21886 * packet. This happens when we are sending 21887 * an ICMP reply back to a clear packet. See 21888 * ipsec_in_to_out() for details. 21889 */ 21890 mp = first_mp->b_cont; 21891 freeb(first_mp); 21892 } 21893 return (mp); 21894 } 21895 /* 21896 * See whether we need to attach a global policy here. We 21897 * don't depend on the conn (as it could be null) for deciding 21898 * what policy this datagram should go through because it 21899 * should have happened in ip_wput if there was some 21900 * policy. This normally happens for connections which are not 21901 * fully bound, preventing us from caching policies in 21902 * ip_bind. Packets coming from the TCP listener/global queue 21903 * - which are non-hard_bound - could also be affected by 21904 * applying policy here. 21905 * 21906 * If this packet is coming from the tcp global queue or listener, 21907 * we will be applying policy here. This may not be *right* 21908 * if these packets are coming from a detached connection, as 21909 * they could have gone out in clear before. This happens only if a 21910 * TCP connection started when there is no policy and somebody 21911 * added policy before it became detached. Thus packets of the 21912 * detached connection could go out secure and the other end 21913 * would drop them because it will be expecting them in clear. The 21914 * converse is not true, i.e., if somebody starts a TCP 21915 * connection and deletes the policy, all the packets will 21916 * still go out with the policy that existed before deleting, 21917 * because ip_unbind sends up policy information which is used 21918 * by TCP on subsequent ip_wputs.
The right solution is to fix 21919 * TCP to attach a dummy IPSEC_OUT and set 21920 * ipsec_out_use_global_policy to B_FALSE. As this might 21921 * affect performance for normal cases, we are not doing it. 21922 * Thus, set policy before starting any TCP connections. 21923 * 21924 * NOTE - We might apply policy even for a hard bound connection 21925 * - for which we cached policy in ip_bind - if somebody added 21926 * global policy after we inherited the policy in ip_bind. 21927 * This means that the packets that were going out in clear 21928 * previously would start going secure and hence get dropped 21929 * on the other side. To fix this, TCP attaches a dummy 21930 * ipsec_out and makes sure that we don't apply global policy. 21931 */ 21932 if (ipha != NULL) 21933 policy_present = ipss->ipsec_outbound_v4_policy_present; 21934 else 21935 policy_present = ipss->ipsec_outbound_v6_policy_present; 21936 if (!policy_present) 21937 return (mp); 21938 21939 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, 21940 zoneid)); 21941 } 21942 21943 /* 21944 * This function does the ire_refrele of the ire passed in as the 21945 * argument. As this function looks up more ires, i.e., broadcast ires, 21946 * it needs to REFRELE them. Currently, for simplicity we don't 21947 * differentiate the one passed in and the ones looked up here. We always 21948 * REFRELE. 21949 * IPQoS Notes: 21950 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 21951 * IPsec packets is done in ipsec_out_process. 21952 */ 21953 void 21954 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, 21955 zoneid_t zoneid) 21956 { 21957 ipha_t *ipha; 21958 #define rptr ((uchar_t *)ipha) 21959 queue_t *stq; 21960 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 21961 uint32_t v_hlen_tos_len; 21962 uint32_t ttl_protocol; 21963 ipaddr_t src; 21964 ipaddr_t dst; 21965 uint32_t cksum; 21966 ipaddr_t orig_src; 21967 ire_t *ire1; 21968 mblk_t *next_mp; 21969 uint_t hlen; 21970 uint16_t *up; 21971 uint32_t max_frag = ire->ire_max_frag; 21972 ill_t *ill = ire_to_ill(ire); 21973 int clusterwide; 21974 uint16_t ip_hdr_included; /* IP header included by ULP? */ 21975 int ipsec_len; 21976 mblk_t *first_mp; 21977 ipsec_out_t *io; 21978 boolean_t conn_dontroute; /* conn value for multicast */ 21979 boolean_t conn_multicast_loop; /* conn value for multicast */ 21980 boolean_t multicast_forward; /* Should we forward ? */ 21981 boolean_t unspec_src; 21982 ill_t *conn_outgoing_ill = NULL; 21983 ill_t *ire_ill; 21984 ill_t *ire1_ill; 21985 ill_t *out_ill; 21986 uint32_t ill_index = 0; 21987 boolean_t multirt_send = B_FALSE; 21988 int err; 21989 ipxmit_state_t pktxmit_state; 21990 ip_stack_t *ipst = ire->ire_ipst; 21991 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 21992 21993 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 21994 "ip_wput_ire_start: q %p", q); 21995 21996 multicast_forward = B_FALSE; 21997 unspec_src = (connp != NULL && connp->conn_unspec_src); 21998 21999 if (ire->ire_flags & RTF_MULTIRT) { 22000 /* 22001 * Multirouting case. The bucket where ire is stored 22002 * probably holds other RTF_MULTIRT flagged ires 22003 * to the destination. In this call to ip_wput_ire, 22004 * we attempt to send the packet through all 22005 * those ires. Thus, we first ensure that ire is the 22006 * first RTF_MULTIRT ire in the bucket, 22007 * before walking the ire list.
22008 */ 22009 ire_t *first_ire; 22010 irb_t *irb = ire->ire_bucket; 22011 ASSERT(irb != NULL); 22012 22013 /* Make sure we do not omit any multiroute ire. */ 22014 IRB_REFHOLD(irb); 22015 for (first_ire = irb->irb_ire; 22016 first_ire != NULL; 22017 first_ire = first_ire->ire_next) { 22018 if ((first_ire->ire_flags & RTF_MULTIRT) && 22019 (first_ire->ire_addr == ire->ire_addr) && 22020 !(first_ire->ire_marks & 22021 (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) 22022 break; 22023 } 22024 22025 if ((first_ire != NULL) && (first_ire != ire)) { 22026 IRE_REFHOLD(first_ire); 22027 ire_refrele(ire); 22028 ire = first_ire; 22029 ill = ire_to_ill(ire); 22030 } 22031 IRB_REFRELE(irb); 22032 } 22033 22034 /* 22035 * The conn_outgoing_ill variable is used only in the broadcast loop. 22036 * For performance we don't grab the mutexes in the fastpath. 22037 */ 22038 if (ire->ire_type == IRE_BROADCAST && connp != NULL && 22039 connp->conn_outgoing_ill != NULL) { 22040 conn_outgoing_ill = conn_get_held_ill(connp, 22041 &connp->conn_outgoing_ill, &err); 22042 if (err == ILL_LOOKUP_FAILED) { 22043 ire_refrele(ire); 22044 freemsg(mp); 22045 return; 22046 } 22047 } 22048 22049 if (mp->b_datap->db_type != M_CTL) { 22050 ipha = (ipha_t *)mp->b_rptr; 22051 } else { 22052 io = (ipsec_out_t *)mp->b_rptr; 22053 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22054 ASSERT(zoneid == io->ipsec_out_zoneid); 22055 ASSERT(zoneid != ALL_ZONES); 22056 ipha = (ipha_t *)mp->b_cont->b_rptr; 22057 dst = ipha->ipha_dst; 22058 /* 22059 * For the multicast case, ipsec_out carries conn_dontroute and 22060 * conn_multicast_loop as conn may not be available here. We 22061 * need this for multicast loopback and forwarding which is done 22062 * later in the code. 22063 */ 22064 if (CLASSD(dst)) { 22065 conn_dontroute = io->ipsec_out_dontroute; 22066 conn_multicast_loop = io->ipsec_out_multicast_loop; 22067 /* 22068 * If conn_dontroute is not set or conn_multicast_loop 22069 * is set, we need to do forwarding/loopback. For 22070 * datagrams from ip_wput_multicast, conn_dontroute is 22071 * set to B_TRUE and conn_multicast_loop is set to 22072 * B_FALSE so that we neither do forwarding nor 22073 * loopback. 22074 */ 22075 if (!conn_dontroute || conn_multicast_loop) 22076 multicast_forward = B_TRUE; 22077 } 22078 } 22079 22080 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 22081 ire->ire_zoneid != ALL_ZONES) { 22082 /* 22083 * When a zone sends a packet to another zone, we try to deliver 22084 * the packet under the same conditions as if the destination 22085 * was a real node on the network. To do so, we look for a 22086 * matching route in the forwarding table. 22087 * RTF_REJECT and RTF_BLACKHOLE are handled just like 22088 * ip_newroute() does. 22089 * Note that IRE_LOCAL are special, since they are used 22090 * when the zoneid doesn't match in some cases. This means that 22091 * we need to handle ipha_src differently since ire_src_addr 22092 * belongs to the receiving zone instead of the sending zone. 22093 * When ip_restrict_interzone_loopback is set, then 22094 * ire_cache_lookup() ensures that IRE_LOCAL are only used 22095 * for loopback between zones when the logical "Ethernet" would 22096 * have looped them back.
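 *
 * As an illustration (addresses invented), if zone A sends from
 * 10.0.0.1 to 10.0.0.2, an IRE_LOCAL belonging to zone B on the same
 * machine, the lookup below verifies that zone A has a usable route
 * to 10.0.0.2 (and supplies a source address from that route if none
 * was set) before the packet is short-circuited locally; an
 * RTF_REJECT route yields a host unreachable error and an
 * RTF_BLACKHOLE route drops the packet silently, just as a real
 * network would.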
22097 */ 22098 ire_t *src_ire; 22099 22100 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 22101 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 22102 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); 22103 if (src_ire != NULL && 22104 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && 22105 (!ipst->ips_ip_restrict_interzone_loopback || 22106 ire_local_same_lan(ire, src_ire))) { 22107 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 22108 ipha->ipha_src = src_ire->ire_src_addr; 22109 ire_refrele(src_ire); 22110 } else { 22111 ire_refrele(ire); 22112 if (conn_outgoing_ill != NULL) 22113 ill_refrele(conn_outgoing_ill); 22114 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 22115 if (src_ire != NULL) { 22116 if (src_ire->ire_flags & RTF_BLACKHOLE) { 22117 ire_refrele(src_ire); 22118 freemsg(mp); 22119 return; 22120 } 22121 ire_refrele(src_ire); 22122 } 22123 if (ip_hdr_complete(ipha, zoneid, ipst)) { 22124 /* Failed */ 22125 freemsg(mp); 22126 return; 22127 } 22128 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, 22129 ipst); 22130 return; 22131 } 22132 } 22133 22134 if (mp->b_datap->db_type == M_CTL || 22135 ipss->ipsec_outbound_v4_policy_present) { 22136 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 22137 unspec_src, zoneid); 22138 if (mp == NULL) { 22139 ire_refrele(ire); 22140 if (conn_outgoing_ill != NULL) 22141 ill_refrele(conn_outgoing_ill); 22142 return; 22143 } 22144 /* 22145 * Trusted Extensions supports all-zones interfaces, so 22146 * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to 22147 * the global zone. 22148 */ 22149 if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { 22150 io = (ipsec_out_t *)mp->b_rptr; 22151 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22152 zoneid = io->ipsec_out_zoneid; 22153 } 22154 } 22155 22156 first_mp = mp; 22157 ipsec_len = 0; 22158 22159 if (first_mp->b_datap->db_type == M_CTL) { 22160 io = (ipsec_out_t *)first_mp->b_rptr; 22161 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22162 mp = first_mp->b_cont; 22163 ipsec_len = ipsec_out_extra_length(first_mp); 22164 ASSERT(ipsec_len >= 0); 22165 /* We already picked up the zoneid from the M_CTL above */ 22166 ASSERT(zoneid == io->ipsec_out_zoneid); 22167 ASSERT(zoneid != ALL_ZONES); 22168 22169 /* 22170 * Drop M_CTL here if IPsec processing is not needed. 22171 * (Non-IPsec use of M_CTL extracted any information it 22172 * needed above). 22173 */ 22174 if (ipsec_len == 0) { 22175 freeb(first_mp); 22176 first_mp = mp; 22177 } 22178 } 22179 22180 /* 22181 * Fast path for ip_wput_ire 22182 */ 22183 22184 ipha = (ipha_t *)mp->b_rptr; 22185 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22186 dst = ipha->ipha_dst; 22187 22188 /* 22189 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 22190 * if the socket is a SOCK_RAW type. The transport checksum should 22191 * be provided in the pre-built packet, so we don't need to compute it. 22192 * Also, other application set flags, like DF, should not be altered. 22193 * Other transport MUST pass down zero. 
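 *
 * A minimal sketch of the convention from the ULP side (icmp/rawip is
 * the in-tree example): before passing a pre-built datagram down, the
 * raw module marks the header with
 *
 *	ipha->ipha_ident = IP_HDR_INCLUDED;
 *
 * ip_wput_ire() saves this value in ip_hdr_included and restores it
 * when the packet is requeued or replicated, so the mark also serves
 * as the "don't touch checksums or flags" indicator throughout this
 * function.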
22194 */ 22195 ip_hdr_included = ipha->ipha_ident; 22196 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 22197 22198 if (CLASSD(dst)) { 22199 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 22200 ntohl(dst), 22201 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 22202 ntohl(ire->ire_addr))); 22203 } 22204 22205 /* Macros to extract header fields from data already in registers */ 22206 #ifdef _BIG_ENDIAN 22207 #define V_HLEN (v_hlen_tos_len >> 24) 22208 #define LENGTH (v_hlen_tos_len & 0xFFFF) 22209 #define PROTO (ttl_protocol & 0xFF) 22210 #else 22211 #define V_HLEN (v_hlen_tos_len & 0xFF) 22212 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 22213 #define PROTO (ttl_protocol >> 8) 22214 #endif 22215 22216 orig_src = src = ipha->ipha_src; 22217 /* (The loop back to "another" is explained down below.) */ 22218 another:; 22219 /* 22220 * Assign an ident value for this packet. We assign idents on 22221 * a per destination basis out of the IRE. There could be 22222 * other threads targeting the same destination, so we have to 22223 * arrange for an atomic increment. Note that we use a 32-bit 22224 * atomic add because it has better performance than its 22225 * 16-bit sibling. 22226 * 22227 * If running in cluster mode and if the source address 22228 * belongs to a replicated service, then vector through 22229 * cl_inet_ipident to allocate the IP identifier. 22230 * NOTE: This is a contract private interface with the 22231 * clustering group. 22232 */ 22233 clusterwide = 0; 22234 if (cl_inet_ipident) { 22235 ASSERT(cl_inet_isclusterwide); 22236 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 22237 22238 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 22239 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 22240 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 22241 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 22242 (uint8_t *)(uintptr_t)dst, NULL); 22243 clusterwide = 1; 22244 } 22245 } 22246 if (!clusterwide) { 22247 ipha->ipha_ident = 22248 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 22249 } 22250 22251 #ifndef _BIG_ENDIAN 22252 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 22253 #endif 22254 22255 /* 22256 * Set source address unless sent on an ill or conn_unspec_src is set. 22257 * This is needed to obey conn_unspec_src when packets go through 22258 * ip_newroute + arp. 22259 * Assumes ip_newroute{,_multi} sets the source address as well. 22260 */ 22261 if (src == INADDR_ANY && !unspec_src) { 22262 /* 22263 * Assign the appropriate source address from the IRE if none 22264 * was specified. 22265 */ 22266 ASSERT(ire->ire_ipversion == IPV4_VERSION); 22267 22268 src = ire->ire_src_addr; 22269 if (connp == NULL) { 22270 ip1dbg(("ip_wput_ire: no connp and no src " 22271 "address for dst 0x%x, using src 0x%x\n", 22272 ntohl(dst), 22273 ntohl(src))); 22274 } 22275 ipha->ipha_src = src; 22276 } 22277 stq = ire->ire_stq; 22278 22279 /* 22280 * We only allow ire chains for broadcasts since there will 22281 * be multiple IRE_CACHE entries for the same multicast 22282 * address (one per ipif). 22283 */ 22284 next_mp = NULL; 22285 22286 /* broadcast packet */ 22287 if (ire->ire_type == IRE_BROADCAST) 22288 goto broadcast; 22289 22290 /* loopback ?
*/ 22291 if (stq == NULL) 22292 goto nullstq; 22293 22294 /* The ill_index for outbound ILL */ 22295 ill_index = Q_TO_INDEX(stq); 22296 22297 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 22298 ttl_protocol = ((uint16_t *)ipha)[4]; 22299 22300 /* pseudo checksum (do it in parts for IP header checksum) */ 22301 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 22302 22303 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 22304 queue_t *dev_q = stq->q_next; 22305 22306 /* 22307 * For DIRECT_CAPABLE, we do flow control at 22308 * the time of sending the packet. See 22309 * ILL_SEND_TX(). 22310 */ 22311 if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && 22312 (DEV_Q_FLOW_BLOCKED(dev_q))) 22313 goto blocked; 22314 22315 if ((PROTO == IPPROTO_UDP) && 22316 (ip_hdr_included != IP_HDR_INCLUDED)) { 22317 hlen = (V_HLEN & 0xF) << 2; 22318 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22319 if (*up != 0) { 22320 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 22321 hlen, LENGTH, max_frag, ipsec_len, cksum); 22322 /* Software checksum? */ 22323 if (DB_CKSUMFLAGS(mp) == 0) { 22324 IP_STAT(ipst, ip_out_sw_cksum); 22325 IP_STAT_UPDATE(ipst, 22326 ip_udp_out_sw_cksum_bytes, 22327 LENGTH - hlen); 22328 } 22329 } 22330 } 22331 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 22332 hlen = (V_HLEN & 0xF) << 2; 22333 if (PROTO == IPPROTO_TCP) { 22334 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22335 /* 22336 * The packet header is processed once and for all, even 22337 * in the multirouting case. We disable hardware 22338 * checksum if the packet is multirouted, as it will be 22339 * replicated via several interfaces, and not all of 22340 * them may have this capability. 22341 */ 22342 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 22343 LENGTH, max_frag, ipsec_len, cksum); 22344 /* Software checksum? */ 22345 if (DB_CKSUMFLAGS(mp) == 0) { 22346 IP_STAT(ipst, ip_out_sw_cksum); 22347 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22348 LENGTH - hlen); 22349 } 22350 } else { 22351 sctp_hdr_t *sctph; 22352 22353 ASSERT(PROTO == IPPROTO_SCTP); 22354 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22355 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22356 /* 22357 * Zero out the checksum field to ensure proper 22358 * checksum calculation. 22359 */ 22360 sctph->sh_chksum = 0; 22361 #ifdef DEBUG 22362 if (!skip_sctp_cksum) 22363 #endif 22364 sctph->sh_chksum = sctp_cksum(mp, hlen); 22365 } 22366 } 22367 22368 /* 22369 * If this is a multicast packet and originated from ip_wput 22370 * we need to do loopback and forwarding checks. If it comes 22371 * from ip_wput_multicast, we SHOULD not do this. 22372 */ 22373 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 22374 22375 /* checksum */ 22376 cksum += ttl_protocol; 22377 22378 /* fragment the packet */ 22379 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 22380 goto fragmentit; 22381 /* 22382 * Don't use frag_flag if packet is pre-built or source 22383 * routed or if multicast (since multicast packets do 22384 * not solicit ICMP "packet too big" messages). 
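 *
 * (ire_frag_flag is either 0 or IPH_DF; OR-ing htons(ire_frag_flag)
 * into ipha_fragment_offset_and_flags below thus just sets the Don't
 * Fragment bit, so that routers along the way send "fragmentation
 * needed" errors for path MTU discovery rather than fragmenting the
 * packet themselves.)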
22385 */ 22386 if ((ip_hdr_included != IP_HDR_INCLUDED) && 22387 (V_HLEN == IP_SIMPLE_HDR_VERSION || 22388 !ip_source_route_included(ipha)) && 22389 !CLASSD(ipha->ipha_dst)) 22390 ipha->ipha_fragment_offset_and_flags |= 22391 htons(ire->ire_frag_flag); 22392 22393 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 22394 /* calculate IP header checksum */ 22395 cksum += ipha->ipha_ident; 22396 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 22397 cksum += ipha->ipha_fragment_offset_and_flags; 22398 22399 /* IP options present */ 22400 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 22401 if (hlen) 22402 goto checksumoptions; 22403 22404 /* calculate hdr checksum */ 22405 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 22406 cksum = ~(cksum + (cksum >> 16)); 22407 ipha->ipha_hdr_checksum = (uint16_t)cksum; 22408 } 22409 if (ipsec_len != 0) { 22410 /* 22411 * We will do the rest of the processing after 22412 * we come back from IPsec in ip_wput_ipsec_out(). 22413 */ 22414 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 22415 22416 io = (ipsec_out_t *)first_mp->b_rptr; 22417 io->ipsec_out_ill_index = 22418 ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; 22419 ipsec_out_process(q, first_mp, ire, 0); 22420 ire_refrele(ire); 22421 if (conn_outgoing_ill != NULL) 22422 ill_refrele(conn_outgoing_ill); 22423 return; 22424 } 22425 22426 /* 22427 * In most cases, the emission loop below is entered only 22428 * once. Only in the case where the ire holds the 22429 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 22430 * flagged ires in the bucket, and send the packet 22431 * through all crossed RTF_MULTIRT routes. 22432 */ 22433 if (ire->ire_flags & RTF_MULTIRT) { 22434 multirt_send = B_TRUE; 22435 } 22436 do { 22437 if (multirt_send) { 22438 irb_t *irb; 22439 /* 22440 * We are in a multiple send case, need to get 22441 * the next ire and make a duplicate of the packet. 22442 * ire1 holds here the next ire to process in the 22443 * bucket. If multirouting is expected, 22444 * any non-RTF_MULTIRT ire that has the 22445 * right destination address is ignored. 22446 */ 22447 irb = ire->ire_bucket; 22448 ASSERT(irb != NULL); 22449 22450 IRB_REFHOLD(irb); 22451 for (ire1 = ire->ire_next; 22452 ire1 != NULL; 22453 ire1 = ire1->ire_next) { 22454 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22455 continue; 22456 if (ire1->ire_addr != ire->ire_addr) 22457 continue; 22458 if (ire1->ire_marks & 22459 (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) 22460 continue; 22461 22462 /* Got one */ 22463 IRE_REFHOLD(ire1); 22464 break; 22465 } 22466 IRB_REFRELE(irb); 22467 22468 if (ire1 != NULL) { 22469 next_mp = copyb(mp); 22470 if ((next_mp == NULL) || 22471 ((mp->b_cont != NULL) && 22472 ((next_mp->b_cont = 22473 dupmsg(mp->b_cont)) == NULL))) { 22474 freemsg(next_mp); 22475 next_mp = NULL; 22476 ire_refrele(ire1); 22477 ire1 = NULL; 22478 } 22479 } 22480 22481 /* Last multiroute ire; don't loop anymore. 
*/ 22482 if (ire1 == NULL) { 22483 multirt_send = B_FALSE; 22484 } 22485 } 22486 22487 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 22488 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, 22489 mblk_t *, mp); 22490 FW_HOOKS(ipst->ips_ip4_physical_out_event, 22491 ipst->ips_ipv4firewall_physical_out, 22492 NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); 22493 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 22494 22495 if (mp == NULL) 22496 goto release_ire_and_ill; 22497 22498 if (ipst->ips_ipobs_enabled) { 22499 zoneid_t szone; 22500 22501 /* 22502 * On the outbound path the destination zone will be 22503 * unknown as we're sending this packet out on the 22504 * wire. 22505 */ 22506 szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst, 22507 ALL_ZONES); 22508 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, 22509 ire->ire_ipif->ipif_ill, IPV4_VERSION, 0, ipst); 22510 } 22511 mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); 22512 DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); 22513 22514 pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp); 22515 22516 if ((pktxmit_state == SEND_FAILED) || 22517 (pktxmit_state == LLHDR_RESLV_FAILED)) { 22518 ip2dbg(("ip_wput_ire: ip_xmit_v4 failed " 22519 "- packet dropped\n")); 22520 release_ire_and_ill: 22521 ire_refrele(ire); 22522 if (next_mp != NULL) { 22523 freemsg(next_mp); 22524 ire_refrele(ire1); 22525 } 22526 if (conn_outgoing_ill != NULL) 22527 ill_refrele(conn_outgoing_ill); 22528 return; 22529 } 22530 22531 if (CLASSD(dst)) { 22532 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); 22533 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, 22534 LENGTH); 22535 } 22536 22537 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22538 "ip_wput_ire_end: q %p (%S)", 22539 q, "last copy out"); 22540 IRE_REFRELE(ire); 22541 22542 if (multirt_send) { 22543 ASSERT(ire1); 22544 /* 22545 * Proceed with the next RTF_MULTIRT ire. 22546 * Also set up the send-to queue accordingly. 22547 */ 22548 ire = ire1; 22549 ire1 = NULL; 22550 stq = ire->ire_stq; 22551 mp = next_mp; 22552 next_mp = NULL; 22553 ipha = (ipha_t *)mp->b_rptr; 22554 ill_index = Q_TO_INDEX(stq); 22555 ill = (ill_t *)stq->q_ptr; 22556 } 22557 } while (multirt_send); 22558 if (conn_outgoing_ill != NULL) 22559 ill_refrele(conn_outgoing_ill); 22560 return; 22561 22562 /* 22563 * ire->ire_type == IRE_BROADCAST (minimize diffs) 22564 */ 22565 broadcast: 22566 { 22567 /* 22568 * To avoid broadcast storms, we usually set the TTL to 1 for 22569 * broadcasts. However, if SO_DONTROUTE isn't set, this value 22570 * can be overridden stack-wide through the ip_broadcast_ttl 22571 * ndd tunable, or on a per-connection basis through the 22572 * IP_BROADCAST_TTL socket option. 22573 * 22574 * In the event that we are replying to incoming ICMP packets, 22575 * connp could be NULL. 22576 */ 22577 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 22578 if (connp != NULL) { 22579 if (connp->conn_dontroute) 22580 ipha->ipha_ttl = 1; 22581 else if (connp->conn_broadcast_ttl != 0) 22582 ipha->ipha_ttl = connp->conn_broadcast_ttl; 22583 } 22584 22585 /* 22586 * Note that we are not doing an IRB_REFHOLD here. 22587 * Actually we don't care if the list changes, i.e., 22588 * if somebody deletes an IRE from the list while 22589 * we drop the lock, the next time we come around 22590 * ire_next will be NULL and hence we won't send 22591 * out multiple copies which is fine.
22592 */ 22593 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 22594 ire1 = ire->ire_next; 22595 if (conn_outgoing_ill != NULL) { 22596 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 22597 ASSERT(ire1 == ire->ire_next); 22598 if (ire1 != NULL && ire1->ire_addr == dst) { 22599 ire_refrele(ire); 22600 ire = ire1; 22601 IRE_REFHOLD(ire); 22602 ire1 = ire->ire_next; 22603 continue; 22604 } 22605 rw_exit(&ire->ire_bucket->irb_lock); 22606 /* Did not find a matching ill */ 22607 ip1dbg(("ip_wput_ire: broadcast with no " 22608 "matching IP_BOUND_IF ill %s dst %x\n", 22609 conn_outgoing_ill->ill_name, dst)); 22610 freemsg(first_mp); 22611 if (ire != NULL) 22612 ire_refrele(ire); 22613 ill_refrele(conn_outgoing_ill); 22614 return; 22615 } 22616 } else if (ire1 != NULL && ire1->ire_addr == dst) { 22617 /* 22618 * If the next IRE has the same address and is not one 22619 * of the two copies that we need to send, try to see 22620 * whether this copy should be sent at all. This 22621 * assumes that we insert loopbacks first and then 22622 * non-loopbacks. This is achieved by always inserting 22623 * the loopback before the non-loopback. 22624 * This is used to send a single copy of a broadcast 22625 * packet out all physical interfaces that have a 22626 * matching IRE_BROADCAST while also looping 22627 * back one copy (to ip_wput_local) for each 22628 * matching physical interface. However, we avoid 22629 * sending packets out different logical interfaces that 22630 * match by having ipif_up/ipif_down suppress duplicate 22631 * IRE_BROADCASTS. 22632 * 22633 * This feature is currently used to get broadcasts 22634 * sent to multiple interfaces, when the broadcast 22635 * address being used applies to multiple interfaces. 22636 * For example, a whole net broadcast will be 22637 * replicated on every connected subnet of 22638 * the target net. 22639 * 22640 * Each zone has its own set of IRE_BROADCASTs, so that 22641 * we're able to distribute inbound packets to multiple 22642 * zones that share a broadcast address. We avoid looping 22643 * back outbound packets in different zones but on the 22644 * same ill, as the application would see duplicates. 22645 * 22646 * This logic assumes that ire_add_v4() groups the 22647 * IRE_BROADCAST entries so that those with the same 22648 * ire_addr are kept together. 22649 */ 22650 ire_ill = ire->ire_ipif->ipif_ill; 22651 if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { 22652 while (ire1 != NULL && ire1->ire_addr == dst) { 22653 ire1_ill = ire1->ire_ipif->ipif_ill; 22654 if (ire1_ill != ire_ill) 22655 break; 22656 ire1 = ire1->ire_next; 22657 } 22658 } 22659 } 22660 ASSERT(multirt_send == B_FALSE); 22661 if (ire1 != NULL && ire1->ire_addr == dst) { 22662 if ((ire->ire_flags & RTF_MULTIRT) && 22663 (ire1->ire_flags & RTF_MULTIRT)) { 22664 /* 22665 * We are in the multirouting case. 22666 * The message must be sent at least 22667 * on both ires. These ires have been 22668 * inserted AFTER the standard ones 22669 * in ip_rt_add(). There are thus no 22670 * other ire entries for the destination 22671 * address in the rest of the bucket 22672 * that do not have the RTF_MULTIRT 22673 * flag. We don't process a copy 22674 * of the message here. This will be 22675 * done in the final sending loop.
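 *
 * Pictorially, an illustrative bucket for one broadcast address
 * (interface names invented) might hold, in order:
 *
 *	10.0.0.255 loopback  (ire_stq == NULL)
 *	10.0.0.255 wire, hme0 (ire_stq != NULL)
 *	10.0.0.255 loopback, RTF_MULTIRT
 *	10.0.0.255 wire, RTF_MULTIRT
 *
 * The walk above relies on this adjacency: it stops at the first
 * address change and leaves all RTF_MULTIRT copies to the sending
 * loop further down.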
22676 */ 22677 multirt_send = B_TRUE; 22678 } else { 22679 next_mp = ip_copymsg(first_mp); 22680 if (next_mp != NULL) 22681 IRE_REFHOLD(ire1); 22682 } 22683 } 22684 rw_exit(&ire->ire_bucket->irb_lock); 22685 } 22686 22687 if (stq) { 22688 /* 22689 * A non-NULL send-to queue means this packet is going 22690 * out of this machine. 22691 */ 22692 out_ill = (ill_t *)stq->q_ptr; 22693 22694 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); 22695 ttl_protocol = ((uint16_t *)ipha)[4]; 22696 /* 22697 * We accumulate the pseudo header checksum in cksum. 22698 * This is pretty hairy code, so watch close. One 22699 * thing to keep in mind is that UDP and TCP have 22700 * stored their respective datagram lengths in their 22701 * checksum fields. This lines things up real nice. 22702 */ 22703 cksum = (dst >> 16) + (dst & 0xFFFF) + 22704 (src >> 16) + (src & 0xFFFF); 22705 /* 22706 * We assume the udp checksum field contains the 22707 * length, so to compute the pseudo header checksum, 22708 * all we need is the protocol number and src/dst. 22709 */ 22710 /* Provide the checksums for UDP and TCP. */ 22711 if ((PROTO == IPPROTO_TCP) && 22712 (ip_hdr_included != IP_HDR_INCLUDED)) { 22713 /* hlen gets the number of uchar_ts in the IP header */ 22714 hlen = (V_HLEN & 0xF) << 2; 22715 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22716 IP_STAT(ipst, ip_out_sw_cksum); 22717 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22718 LENGTH - hlen); 22719 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 22720 } else if (PROTO == IPPROTO_SCTP && 22721 (ip_hdr_included != IP_HDR_INCLUDED)) { 22722 sctp_hdr_t *sctph; 22723 22724 hlen = (V_HLEN & 0xF) << 2; 22725 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22726 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22727 sctph->sh_chksum = 0; 22728 #ifdef DEBUG 22729 if (!skip_sctp_cksum) 22730 #endif 22731 sctph->sh_chksum = sctp_cksum(mp, hlen); 22732 } else { 22733 queue_t *dev_q = stq->q_next; 22734 22735 if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && 22736 (DEV_Q_FLOW_BLOCKED(dev_q))) { 22737 blocked: 22738 ipha->ipha_ident = ip_hdr_included; 22739 /* 22740 * If we don't have a conn to apply 22741 * backpressure, free the message. 22742 * In the ire_send path, we don't know 22743 * the position to requeue the packet. Rather 22744 * than reorder packets, we just drop this 22745 * packet. 22746 */ 22747 if (ipst->ips_ip_output_queue && 22748 connp != NULL && 22749 caller != IRE_SEND) { 22750 if (caller == IP_WSRV) { 22751 idl_tx_list_t *idl_txl; 22752 22753 idl_txl = 22754 &ipst->ips_idl_tx_list[0]; 22755 connp->conn_did_putbq = 1; 22756 (void) putbq(connp->conn_wq, 22757 first_mp); 22758 conn_drain_insert(connp, 22759 idl_txl); 22760 /* 22761 * This is the service thread, 22762 * and the queue is already 22763 * noenabled. The check for 22764 * canput and the putbq is not 22765 * atomic. So we need to check 22766 * again. 22767 */ 22768 if (canput(stq->q_next)) 22769 connp->conn_did_putbq 22770 = 0; 22771 IP_STAT(ipst, ip_conn_flputbq); 22772 } else { 22773 /* 22774 * We are not the service proc. 22775 * ip_wsrv will be scheduled or 22776 * is already running. 
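 *
 * (Roughly, in both branches the blocked message ends up on conn_wq;
 * either we park the conn on an idl_tx_list drain list ourselves, or
 * ip_wsrv() will do so when it retries the send and finds the device
 * queue still flow-controlled. Once canput() succeeds, the drain list
 * is walked and transmission resumes in the original order.)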
22777 */ 22778 22779 (void) putq(connp->conn_wq, 22780 first_mp); 22781 } 22782 } else { 22783 out_ill = (ill_t *)stq->q_ptr; 22784 BUMP_MIB(out_ill->ill_ip_mib, 22785 ipIfStatsOutDiscards); 22786 freemsg(first_mp); 22787 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22788 "ip_wput_ire_end: q %p (%S)", 22789 q, "discard"); 22790 } 22791 ire_refrele(ire); 22792 if (next_mp) { 22793 ire_refrele(ire1); 22794 freemsg(next_mp); 22795 } 22796 if (conn_outgoing_ill != NULL) 22797 ill_refrele(conn_outgoing_ill); 22798 return; 22799 } 22800 if ((PROTO == IPPROTO_UDP) && 22801 (ip_hdr_included != IP_HDR_INCLUDED)) { 22802 /* 22803 * hlen gets the number of uchar_ts in the 22804 * IP header 22805 */ 22806 hlen = (V_HLEN & 0xF) << 2; 22807 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22808 max_frag = ire->ire_max_frag; 22809 if (*up != 0) { 22810 IP_CKSUM_XMIT(out_ill, ire, mp, ipha, 22811 up, PROTO, hlen, LENGTH, max_frag, 22812 ipsec_len, cksum); 22813 /* Software checksum? */ 22814 if (DB_CKSUMFLAGS(mp) == 0) { 22815 IP_STAT(ipst, ip_out_sw_cksum); 22816 IP_STAT_UPDATE(ipst, 22817 ip_udp_out_sw_cksum_bytes, 22818 LENGTH - hlen); 22819 } 22820 } 22821 } 22822 } 22823 /* 22824 * Need to do this even when fragmenting. The local 22825 * loopback can be done without computing checksums 22826 * but forwarding out other interfaces must be done 22827 * after the IP checksum (and ULP checksums) have been 22828 * computed. 22829 * 22830 * NOTE : multicast_forward is set only if this packet 22831 * originated from ip_wput. For packets originating from 22832 * ip_wput_multicast, it is not set. 22833 */ 22834 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 22835 multi_loopback: 22836 ip2dbg(("ip_wput: multicast, loop %d\n", 22837 conn_multicast_loop)); 22838 22839 /* Forget header checksum offload */ 22840 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 22841 22842 /* 22843 * Local loopback of multicasts? Check the 22844 * ill. 22845 * 22846 * Note that the loopback function will not come 22847 * in through ip_rput - it will only do the 22848 * client fanout; thus we need to do an mforward 22849 * as well. This is different from the BSD 22850 * logic. 22851 */ 22852 if (ill != NULL) { 22853 if (ilm_lookup_ill(ill, ipha->ipha_dst, 22854 ALL_ZONES) != NULL) { 22855 /* 22856 * Pass along the virtual output q. 22857 * ip_wput_local() will distribute the 22858 * packet to all the matching zones, 22859 * except the sending zone when 22860 * IP_MULTICAST_LOOP is false. 22861 */ 22862 ip_multicast_loopback(q, ill, first_mp, 22863 conn_multicast_loop ? 0 : 22864 IP_FF_NO_MCAST_LOOP, zoneid); 22865 } 22866 } 22867 if (ipha->ipha_ttl == 0) { 22868 /* 22869 * 0 => only to this host i.e. we are 22870 * done. We are also done if this was the 22871 * loopback interface since it is sufficient 22872 * to loopback one copy of a multicast packet. 22873 */ 22874 freemsg(first_mp); 22875 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22876 "ip_wput_ire_end: q %p (%S)", 22877 q, "loopback"); 22878 ire_refrele(ire); 22879 if (conn_outgoing_ill != NULL) 22880 ill_refrele(conn_outgoing_ill); 22881 return; 22882 } 22883 /* 22884 * ILLF_MULTICAST is checked in ip_newroute 22885 * i.e. we don't need to check it here since 22886 * all IRE_CACHEs come from ip_newroute. 22887 * For multicast traffic, SO_DONTROUTE is interpreted 22888 * to mean only send the packet out the interface 22889 * (optionally specified with IP_MULTICAST_IF) 22890 * and do not forward it out additional interfaces.
22891 * RSVP and the rsvp daemon are an example of a 22892 * protocol and user level process that 22893 * handle their own routing. Hence, they use the 22894 * SO_DONTROUTE option to accomplish this. 22895 */ 22896 22897 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 22898 ill != NULL) { 22899 /* Unconditionally redo the checksum */ 22900 ipha->ipha_hdr_checksum = 0; 22901 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22902 22903 /* 22904 * If this needs to go out secure, we need 22905 * to wait till we finish the IPsec 22906 * processing. 22907 */ 22908 if (ipsec_len == 0 && 22909 ip_mforward(ill, ipha, mp)) { 22910 freemsg(first_mp); 22911 ip1dbg(("ip_wput: mforward failed\n")); 22912 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22913 "ip_wput_ire_end: q %p (%S)", 22914 q, "mforward failed"); 22915 ire_refrele(ire); 22916 if (conn_outgoing_ill != NULL) 22917 ill_refrele(conn_outgoing_ill); 22918 return; 22919 } 22920 } 22921 } 22922 max_frag = ire->ire_max_frag; 22923 cksum += ttl_protocol; 22924 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 22925 /* No fragmentation required for this one. */ 22926 /* 22927 * Don't use frag_flag if packet is pre-built or source 22928 * routed or if multicast (since multicast packets do 22929 * not solicit ICMP "packet too big" messages). 22930 */ 22931 if ((ip_hdr_included != IP_HDR_INCLUDED) && 22932 (V_HLEN == IP_SIMPLE_HDR_VERSION || 22933 !ip_source_route_included(ipha)) && 22934 !CLASSD(ipha->ipha_dst)) 22935 ipha->ipha_fragment_offset_and_flags |= 22936 htons(ire->ire_frag_flag); 22937 22938 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 22939 /* Complete the IP header checksum. */ 22940 cksum += ipha->ipha_ident; 22941 cksum += (v_hlen_tos_len >> 16)+ 22942 (v_hlen_tos_len & 0xFFFF); 22943 cksum += ipha->ipha_fragment_offset_and_flags; 22944 hlen = (V_HLEN & 0xF) - 22945 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 22946 if (hlen) { 22947 checksumoptions: 22948 /* 22949 * Account for the IP Options in the IP 22950 * header checksum. 22951 */ 22952 up = (uint16_t *)(rptr+ 22953 IP_SIMPLE_HDR_LENGTH); 22954 do { 22955 cksum += up[0]; 22956 cksum += up[1]; 22957 up += 2; 22958 } while (--hlen); 22959 } 22960 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 22961 cksum = ~(cksum + (cksum >> 16)); 22962 ipha->ipha_hdr_checksum = (uint16_t)cksum; 22963 } 22964 if (ipsec_len != 0) { 22965 ipsec_out_process(q, first_mp, ire, ill_index); 22966 if (!next_mp) { 22967 ire_refrele(ire); 22968 if (conn_outgoing_ill != NULL) 22969 ill_refrele(conn_outgoing_ill); 22970 return; 22971 } 22972 goto next; 22973 } 22974 22975 /* 22976 * multirt_send has already been handled 22977 * for broadcast, but not yet for multicast 22978 * or IP options. 22979 */ 22980 if (next_mp == NULL) { 22981 if (ire->ire_flags & RTF_MULTIRT) { 22982 multirt_send = B_TRUE; 22983 } 22984 } 22985 22986 /* 22987 * In most cases, the emission loop below is 22988 * entered only once. Only in the case where 22989 * the ire holds the RTF_MULTIRT flag, do we loop 22990 * to process all RTF_MULTIRT ires in the bucket, 22991 * and send the packet through all crossed 22992 * RTF_MULTIRT routes. 22993 */ 22994 do { 22995 if (multirt_send) { 22996 irb_t *irb; 22997 22998 irb = ire->ire_bucket; 22999 ASSERT(irb != NULL); 23000 /* 23001 * We are in a multiple send case, 23002 * need to get the next IRE and make 23003 * a duplicate of the packet.
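 *
 * (Note the duplication strategy below: copyb() gives the duplicate
 * its own writable copy of the header mblk, since each RTF_MULTIRT
 * transmission rewrites IP header fields, while dupmsg() merely adds
 * references to the payload mblks, which go out unchanged.)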
23004 */ 23005 IRB_REFHOLD(irb); 23006 for (ire1 = ire->ire_next; 23007 ire1 != NULL; 23008 ire1 = ire1->ire_next) { 23009 if (!(ire1->ire_flags & 23010 RTF_MULTIRT)) 23011 continue; 23012 23013 if (ire1->ire_addr != 23014 ire->ire_addr) 23015 continue; 23016 23017 if (ire1->ire_marks & 23018 (IRE_MARK_CONDEMNED | 23019 IRE_MARK_TESTHIDDEN)) 23020 continue; 23021 23022 /* Got one */ 23023 IRE_REFHOLD(ire1); 23024 break; 23025 } 23026 IRB_REFRELE(irb); 23027 23028 if (ire1 != NULL) { 23029 next_mp = copyb(mp); 23030 if ((next_mp == NULL) || 23031 ((mp->b_cont != NULL) && 23032 ((next_mp->b_cont = 23033 dupmsg(mp->b_cont)) 23034 == NULL))) { 23035 freemsg(next_mp); 23036 next_mp = NULL; 23037 ire_refrele(ire1); 23038 ire1 = NULL; 23039 } 23040 } 23041 23042 /* 23043 * Last multiroute ire; don't loop 23044 * anymore. The emission is over 23045 * and next_mp is NULL. 23046 */ 23047 if (ire1 == NULL) { 23048 multirt_send = B_FALSE; 23049 } 23050 } 23051 23052 out_ill = ire_to_ill(ire); 23053 DTRACE_PROBE4(ip4__physical__out__start, 23054 ill_t *, NULL, 23055 ill_t *, out_ill, 23056 ipha_t *, ipha, mblk_t *, mp); 23057 FW_HOOKS(ipst->ips_ip4_physical_out_event, 23058 ipst->ips_ipv4firewall_physical_out, 23059 NULL, out_ill, ipha, mp, mp, 0, ipst); 23060 DTRACE_PROBE1(ip4__physical__out__end, 23061 mblk_t *, mp); 23062 if (mp == NULL) 23063 goto release_ire_and_ill_2; 23064 23065 ASSERT(ipsec_len == 0); 23066 mp->b_prev = 23067 SET_BPREV_FLAG(IPP_LOCAL_OUT); 23068 DTRACE_PROBE2(ip__xmit__2, 23069 mblk_t *, mp, ire_t *, ire); 23070 pktxmit_state = ip_xmit_v4(mp, ire, 23071 NULL, B_TRUE, connp); 23072 if ((pktxmit_state == SEND_FAILED) || 23073 (pktxmit_state == LLHDR_RESLV_FAILED)) { 23074 release_ire_and_ill_2: 23075 if (next_mp) { 23076 freemsg(next_mp); 23077 ire_refrele(ire1); 23078 } 23079 ire_refrele(ire); 23080 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23081 "ip_wput_ire_end: q %p (%S)", 23082 q, "discard MDATA"); 23083 if (conn_outgoing_ill != NULL) 23084 ill_refrele(conn_outgoing_ill); 23085 return; 23086 } 23087 23088 if (CLASSD(dst)) { 23089 BUMP_MIB(out_ill->ill_ip_mib, 23090 ipIfStatsHCOutMcastPkts); 23091 UPDATE_MIB(out_ill->ill_ip_mib, 23092 ipIfStatsHCOutMcastOctets, 23093 LENGTH); 23094 } else if (ire->ire_type == IRE_BROADCAST) { 23095 BUMP_MIB(out_ill->ill_ip_mib, 23096 ipIfStatsHCOutBcastPkts); 23097 } 23098 23099 if (multirt_send) { 23100 /* 23101 * We are in a multiple send case, 23102 * need to re-enter the sending loop 23103 * using the next ire. 23104 */ 23105 ire_refrele(ire); 23106 ire = ire1; 23107 stq = ire->ire_stq; 23108 mp = next_mp; 23109 next_mp = NULL; 23110 ipha = (ipha_t *)mp->b_rptr; 23111 ill_index = Q_TO_INDEX(stq); 23112 } 23113 } while (multirt_send); 23114 23115 if (!next_mp) { 23116 /* 23117 * Last copy going out (the ultra-common 23118 * case). Note that we intentionally replicate 23119 * the putnext rather than calling it before 23120 * the next_mp check in hopes of a little 23121 * tail-call action out of the compiler. 23122 */ 23123 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23124 "ip_wput_ire_end: q %p (%S)", 23125 q, "last copy out(1)"); 23126 ire_refrele(ire); 23127 if (conn_outgoing_ill != NULL) 23128 ill_refrele(conn_outgoing_ill); 23129 return; 23130 } 23131 /* More copies going out below. */ 23132 } else { 23133 int offset; 23134 fragmentit: 23135 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 23136 /* 23137 * If this would generate an icmp_frag_needed message, 23138 * we need to handle it before we do the IPsec 23139 * processing.
Otherwise, we need to strip the IPsec 23140 * headers before we send up the message to the ULPs, 23141 * which becomes messy and difficult. 23142 */ 23143 if (ipsec_len != 0) { 23144 if ((max_frag < (unsigned int)(LENGTH + 23145 ipsec_len)) && (offset & IPH_DF)) { 23146 out_ill = (ill_t *)stq->q_ptr; 23147 BUMP_MIB(out_ill->ill_ip_mib, 23148 ipIfStatsOutFragFails); 23149 BUMP_MIB(out_ill->ill_ip_mib, 23150 ipIfStatsOutFragReqds); 23151 ipha->ipha_hdr_checksum = 0; 23152 ipha->ipha_hdr_checksum = 23153 (uint16_t)ip_csum_hdr(ipha); 23154 icmp_frag_needed(ire->ire_stq, first_mp, 23155 max_frag, zoneid, ipst); 23156 if (!next_mp) { 23157 ire_refrele(ire); 23158 if (conn_outgoing_ill != NULL) { 23159 ill_refrele( 23160 conn_outgoing_ill); 23161 } 23162 return; 23163 } 23164 } else { 23165 /* 23166 * This won't cause an icmp_frag_needed 23167 * message to be generated. Send it on 23168 * the wire. Note that this could still 23169 * cause fragmentation, and all we 23170 * do is generate the message 23171 * to the ULP, if needed, before IPsec. 23172 */ 23173 if (!next_mp) { 23174 ipsec_out_process(q, first_mp, 23175 ire, ill_index); 23176 TRACE_2(TR_FAC_IP, 23177 TR_IP_WPUT_IRE_END, 23178 "ip_wput_ire_end: q %p " 23179 "(%S)", q, 23180 "last ipsec_out_process"); 23181 ire_refrele(ire); 23182 if (conn_outgoing_ill != NULL) { 23183 ill_refrele( 23184 conn_outgoing_ill); 23185 } 23186 return; 23187 } 23188 ipsec_out_process(q, first_mp, 23189 ire, ill_index); 23190 } 23191 } else { 23192 /* 23193 * Initiate IPPF processing. For 23194 * fragmentable packets we finish 23195 * all QOS packet processing before 23196 * calling: 23197 * ip_wput_ire_fragmentit->ip_wput_frag 23198 */ 23199 23200 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23201 ip_process(IPP_LOCAL_OUT, &mp, 23202 ill_index); 23203 if (mp == NULL) { 23204 out_ill = (ill_t *)stq->q_ptr; 23205 BUMP_MIB(out_ill->ill_ip_mib, 23206 ipIfStatsOutDiscards); 23207 if (next_mp != NULL) { 23208 freemsg(next_mp); 23209 ire_refrele(ire1); 23210 } 23211 ire_refrele(ire); 23212 TRACE_2(TR_FAC_IP, 23213 TR_IP_WPUT_IRE_END, 23214 "ip_wput_ire: q %p (%S)", 23215 q, "discard MDATA"); 23216 if (conn_outgoing_ill != NULL) { 23217 ill_refrele( 23218 conn_outgoing_ill); 23219 } 23220 return; 23221 } 23222 } 23223 if (!next_mp) { 23224 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23225 "ip_wput_ire_end: q %p (%S)", 23226 q, "last fragmentation"); 23227 ip_wput_ire_fragmentit(mp, ire, 23228 zoneid, ipst, connp); 23229 ire_refrele(ire); 23230 if (conn_outgoing_ill != NULL) 23231 ill_refrele(conn_outgoing_ill); 23232 return; 23233 } 23234 ip_wput_ire_fragmentit(mp, ire, 23235 zoneid, ipst, connp); 23236 } 23237 } 23238 } else { 23239 nullstq: 23240 /* A NULL stq means the destination address is local. */ 23241 UPDATE_OB_PKT_COUNT(ire); 23242 ire->ire_last_used_time = lbolt; 23243 ASSERT(ire->ire_ipif != NULL); 23244 if (!next_mp) { 23245 /* 23246 * Is there an "in" and "out" for traffic local 23247 * to a host (loopback)? The code in Solaris doesn't 23248 * explicitly draw a line in its code for in vs out, 23249 * so we've had to draw a line in the sand: ip_wput_ire 23250 * is considered to be the "output" side and 23251 * ip_wput_local to be the "input" side. 23252 */ 23253 out_ill = ire_to_ill(ire); 23254 23255 /* 23256 * DTrace this as ip:::send. A blocked packet will 23257 * fire the send probe, but not the receive probe.
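 *
 * (From userland these probes are visible through the DTrace ip
 * provider, e.g. "dtrace -n ip:::send", which is a handy way to
 * confirm that a packet reached this point even when the firewall
 * hooks below discard it.)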
23258 */ 23259 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23260 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23261 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23262 23263 DTRACE_PROBE4(ip4__loopback__out__start, 23264 ill_t *, NULL, ill_t *, out_ill, 23265 ipha_t *, ipha, mblk_t *, first_mp); 23266 23267 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23268 ipst->ips_ipv4firewall_loopback_out, 23269 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23270 23271 DTRACE_PROBE1(ip4__loopback__out_end, 23272 mblk_t *, first_mp); 23273 23274 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23275 "ip_wput_ire_end: q %p (%S)", 23276 q, "local address"); 23277 23278 if (first_mp != NULL) 23279 ip_wput_local(q, out_ill, ipha, 23280 first_mp, ire, 0, ire->ire_zoneid); 23281 ire_refrele(ire); 23282 if (conn_outgoing_ill != NULL) 23283 ill_refrele(conn_outgoing_ill); 23284 return; 23285 } 23286 23287 out_ill = ire_to_ill(ire); 23288 23289 /* 23290 * DTrace this as ip:::send. A blocked packet will fire the 23291 * send probe, but not the receive probe. 23292 */ 23293 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23294 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23295 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23296 23297 DTRACE_PROBE4(ip4__loopback__out__start, 23298 ill_t *, NULL, ill_t *, out_ill, 23299 ipha_t *, ipha, mblk_t *, first_mp); 23300 23301 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23302 ipst->ips_ipv4firewall_loopback_out, 23303 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23304 23305 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); 23306 23307 if (first_mp != NULL) 23308 ip_wput_local(q, out_ill, ipha, 23309 first_mp, ire, 0, ire->ire_zoneid); 23310 } 23311 next: 23312 /* 23313 * More copies going out to additional interfaces. 23314 * ire1 has already been held. We don't need the 23315 * "ire" anymore. 23316 */ 23317 ire_refrele(ire); 23318 ire = ire1; 23319 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 23320 mp = next_mp; 23321 ASSERT(ire->ire_ipversion == IPV4_VERSION); 23322 ill = ire_to_ill(ire); 23323 first_mp = mp; 23324 if (ipsec_len != 0) { 23325 ASSERT(first_mp->b_datap->db_type == M_CTL); 23326 mp = mp->b_cont; 23327 } 23328 dst = ire->ire_addr; 23329 ipha = (ipha_t *)mp->b_rptr; 23330 /* 23331 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 23332 * Restore ipha_ident "no checksum" flag. 23333 */ 23334 src = orig_src; 23335 ipha->ipha_ident = ip_hdr_included; 23336 goto another; 23337 23338 #undef rptr 23339 #undef Q_TO_INDEX 23340 } 23341 23342 /* 23343 * Routine to allocate a message that is used to notify the ULP about MDT. 23344 * The caller may provide a pointer to the link-layer MDT capabilities, 23345 * or NULL if MDT is to be disabled on the stream. 23346 */ 23347 mblk_t * 23348 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 23349 { 23350 mblk_t *mp; 23351 ip_mdt_info_t *mdti; 23352 ill_mdt_capab_t *idst; 23353 23354 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 23355 DB_TYPE(mp) = M_CTL; 23356 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 23357 mdti = (ip_mdt_info_t *)mp->b_rptr; 23358 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 23359 idst = &(mdti->mdt_capab); 23360 23361 /* 23362 * If the caller provides us with the capability, copy 23363 * it over into our notification message; otherwise 23364 * we zero out the capability portion. 
23365 */ 23366 if (isrc != NULL) 23367 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23368 else 23369 bzero((caddr_t)idst, sizeof (*idst)); 23370 } 23371 return (mp); 23372 } 23373 23374 /* 23375 * Routine which determines whether MDT can be enabled on the destination 23376 * IRE and IPC combination, and if so, allocates and returns the MDT 23377 * notification mblk that may be used by ULP. We also check if we need to 23378 * turn MDT back to 'on' when certain restrictions that prohibited 23379 * MDT usage in the past have been lifted. This gets called during IP 23380 * and ULP binding. 23381 */ 23382 mblk_t * 23383 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23384 ill_mdt_capab_t *mdt_cap) 23385 { 23386 mblk_t *mp; 23387 boolean_t rc = B_FALSE; 23388 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23389 23390 ASSERT(dst_ire != NULL); 23391 ASSERT(connp != NULL); 23392 ASSERT(mdt_cap != NULL); 23393 23394 /* 23395 * Currently, we only support simple TCP/{IPv4,IPv6} with 23396 * Multidata, which is handled in tcp_multisend(). This 23397 * is the reason why we do all these checks here, to ensure 23398 * that we don't enable Multidata for the cases which we 23399 * can't handle at the moment. 23400 */ 23401 do { 23402 /* Only do TCP at the moment */ 23403 if (connp->conn_ulp != IPPROTO_TCP) 23404 break; 23405 23406 /* 23407 * IPsec outbound policy present? Note that we get here 23408 * after calling ipsec_conn_cache_policy() where the global 23409 * policy checking is performed. conn_latch will be 23410 * non-NULL as long as there's a policy defined, 23411 * i.e., conn_out_enforce_policy may be NULL in the case 23412 * when the connection is non-secure, and hence we check 23413 * further whether the latch refers to an outbound policy. 23414 */ 23415 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 23416 break; 23417 23418 /* CGTP (multiroute) is enabled? */ 23419 if (dst_ire->ire_flags & RTF_MULTIRT) 23420 break; 23421 23422 /* Outbound IPQoS enabled? */ 23423 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23424 /* 23425 * In this case, we disable MDT for this and all 23426 * future connections going over the interface. 23427 */ 23428 mdt_cap->ill_mdt_on = 0; 23429 break; 23430 } 23431 23432 /* socket option(s) present? */ 23433 if (!CONN_IS_LSO_MD_FASTPATH(connp)) 23434 break; 23435 23436 rc = B_TRUE; 23437 /* CONSTCOND */ 23438 } while (0); 23439 23440 /* Remember the result */ 23441 connp->conn_mdt_ok = rc; 23442 23443 if (!rc) 23444 return (NULL); 23445 else if (!mdt_cap->ill_mdt_on) { 23446 /* 23447 * If MDT has previously been turned off, and we 23448 * can currently do MDT (due to IPQoS policy removal, etc.), 23449 * then enable it for this interface. 23450 */ 23451 mdt_cap->ill_mdt_on = 1; 23452 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 23453 "interface %s\n", ill_name)); 23454 } 23455 23456 /* Allocate the MDT info mblk */ 23457 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 23458 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 23459 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23460 return (NULL); 23461 } 23462 return (mp); 23463 } 23464 23465 /* 23466 * Routine to allocate a message that is used to notify the ULP about LSO. 23467 * The caller may provide a pointer to the link-layer LSO capabilities, 23468 * or NULL if LSO is to be disabled on the stream.
23469 */ 23470 mblk_t * 23471 ip_lsoinfo_alloc(ill_lso_capab_t *isrc) 23472 { 23473 mblk_t *mp; 23474 ip_lso_info_t *lsoi; 23475 ill_lso_capab_t *idst; 23476 23477 if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { 23478 DB_TYPE(mp) = M_CTL; 23479 mp->b_wptr = mp->b_rptr + sizeof (*lsoi); 23480 lsoi = (ip_lso_info_t *)mp->b_rptr; 23481 lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; 23482 idst = &(lsoi->lso_capab); 23483 23484 /* 23485 * If the caller provides us with the capability, copy 23486 * it over into our notification message; otherwise 23487 * we zero out the capability portion. 23488 */ 23489 if (isrc != NULL) 23490 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23491 else 23492 bzero((caddr_t)idst, sizeof (*idst)); 23493 } 23494 return (mp); 23495 } 23496 23497 /* 23498 * Routine which determines whether LSO can be enabled on the destination 23499 * IRE and IPC combination, and if so, allocates and returns the LSO 23500 * notification mblk that may be used by ULP. We also check if we need to 23501 * turn LSO back to 'on' when certain restrictions that prohibited 23502 * LSO usage in the past have been lifted. This gets called during IP 23503 * and ULP binding. 23504 */ 23505 mblk_t * 23506 ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23507 ill_lso_capab_t *lso_cap) 23508 { 23509 mblk_t *mp; 23510 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23511 23512 ASSERT(dst_ire != NULL); 23513 ASSERT(connp != NULL); 23514 ASSERT(lso_cap != NULL); 23515 23516 connp->conn_lso_ok = B_TRUE; 23517 23518 if ((connp->conn_ulp != IPPROTO_TCP) || 23519 CONN_IPSEC_OUT_ENCAPSULATED(connp) || 23520 (dst_ire->ire_flags & RTF_MULTIRT) || 23521 !CONN_IS_LSO_MD_FASTPATH(connp) || 23522 (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { 23523 connp->conn_lso_ok = B_FALSE; 23524 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23525 /* 23526 * Disable LSO for this and all future connections going 23527 * over the interface. 23528 */ 23529 lso_cap->ill_lso_on = 0; 23530 } 23531 } 23532 23533 if (!connp->conn_lso_ok) 23534 return (NULL); 23535 else if (!lso_cap->ill_lso_on) { 23536 /* 23537 * If LSO has previously been turned off, and we 23538 * can currently do LSO (due to IPQoS policy removal, etc.), 23539 * then enable it for this interface. 23540 */ 23541 lso_cap->ill_lso_on = 1; 23542 ip1dbg(("ip_lsoinfo_return: reenabling LSO for interface %s\n", 23543 ill_name)); 23544 } 23545 23546 /* Allocate the LSO info mblk */ 23547 if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) 23548 ip0dbg(("ip_lsoinfo_return: can't enable LSO for " 23549 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23550 23551 return (mp); 23552 } 23553 23554 /* 23555 * Create destination address attribute, and fill it with the physical 23556 * destination address and SAP taken from the template DL_UNITDATA_REQ 23557 * message block.
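 *
 * (The "- 1" in the attribute length computed below reflects that
 * pattr_addr_t ends in a one-byte addr[] placeholder, so
 * sizeof (pattr_addr_t) already counts the first byte of the
 * variable-length address; this is the usual trailing-array idiom.)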
/*
 * Create destination address attribute, and fill it with the physical
 * destination address and SAP taken from the template DL_UNITDATA_REQ
 * message block.
 */
boolean_t
ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp)
{
	dl_unitdata_req_t *dlurp;
	pattr_t *pa;
	pattrinfo_t pa_info;
	pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf;
	uint_t das_len, das_off;

	ASSERT(dlmp != NULL);

	dlurp = (dl_unitdata_req_t *)dlmp->b_rptr;
	das_len = dlurp->dl_dest_addr_length;
	das_off = dlurp->dl_dest_addr_offset;

	pa_info.type = PATTR_DSTADDRSAP;
	pa_info.len = sizeof (**das) + das_len - 1;

	/* create and associate the attribute */
	pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
	if (pa != NULL) {
		ASSERT(*das != NULL);
		(*das)->addr_is_group = 0;
		(*das)->addr_len = (uint8_t)das_len;
		bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len);
	}

	return (pa != NULL);
}

/*
 * Create hardware checksum attribute and fill it with the values passed.
 */
boolean_t
ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset,
    uint32_t stuff_offset, uint32_t end_offset, uint32_t flags)
{
	pattr_t *pa;
	pattrinfo_t pa_info;

	ASSERT(mmd != NULL);

	pa_info.type = PATTR_HCKSUM;
	pa_info.len = sizeof (pattr_hcksum_t);

	/* create and associate the attribute */
	pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
	if (pa != NULL) {
		pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf;

		hck->hcksum_start_offset = start_offset;
		hck->hcksum_stuff_offset = stuff_offset;
		hck->hcksum_end_offset = end_offset;
		hck->hcksum_flags = flags;
	}
	return (pa != NULL);
}

/*
 * Create zerocopy attribute and fill it with the specified flags
 */
boolean_t
ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags)
{
	pattr_t *pa;
	pattrinfo_t pa_info;

	ASSERT(mmd != NULL);
	pa_info.type = PATTR_ZCOPY;
	pa_info.len = sizeof (pattr_zcopy_t);

	/* create and associate the attribute */
	pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
	if (pa != NULL) {
		pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf;

		zcopy->zcopy_flags = flags;
	}
	return (pa != NULL);
}
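/*
 * Note that the three helpers above allocate with KM_NOSLEEP and signal
 * failure by returning B_FALSE, so callers must be prepared for the
 * attribute not being attached.
 */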
/*
 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
 * block chain.  We could rewrite to handle arbitrary message block chains but
 * that would make the code complicated and slow.  Right now there are three
 * restrictions:
 *
 *	1. The first message block must contain the complete IP header and
 *	   at least 1 byte of payload data.
 *	2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
 *	   so that we can use a single Multidata message.
 *	3. No frag must be distributed over two or more message blocks so
 *	   that we don't need more than two packet descriptors per frag.
 *
 * The above restrictions allow us to support userland applications (which
 * will send down a single message block) and NFS over UDP (which will
 * send down a chain of at most three message blocks).
 *
 * We also don't use MDT for payloads with less than or equal to
 * ip_wput_frag_mdt_min bytes because it would cause too much overhead.
 */
boolean_t
ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
{
	int blocks;
	ssize_t total, missing, size;

	ASSERT(mp != NULL);
	ASSERT(hdr_len > 0);

	size = MBLKL(mp) - hdr_len;
	if (size <= 0)
		return (B_FALSE);

	/* The first mblk contains the header and some payload. */
	blocks = 1;
	total = size;
	size %= len;
	missing = (size == 0) ? 0 : (len - size);
	mp = mp->b_cont;

	while (mp != NULL) {
		/*
		 * Give up if we encounter a zero length message block.
		 * In practice, this should rarely happen and therefore
		 * not worth the trouble of freeing and re-linking the
		 * mblk from the chain to handle such case.
		 */
		if ((size = MBLKL(mp)) == 0)
			return (B_FALSE);

		/* Too many payload buffers for a single Multidata message? */
		if (++blocks > MULTIDATA_MAX_PBUFS)
			return (B_FALSE);

		total += size;
		/* Is a frag distributed over two or more message blocks? */
		if (missing > size)
			return (B_FALSE);
		size -= missing;

		size %= len;
		missing = (size == 0) ? 0 : (len - size);

		mp = mp->b_cont;
	}

	return (total > ip_wput_frag_mdt_min);
}

/*
 * Outbound IPv4 fragmentation routine using MDT.
 */
static void
ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
    uint32_t frag_flag, int offset)
{
	ipha_t *ipha_orig;
	int i1, ip_data_end;
	uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
	mblk_t *hdr_mp, *md_mp = NULL;
	unsigned char *hdr_ptr, *pld_ptr;
	multidata_t *mmd;
	ip_pdescinfo_t pdi;
	ill_t *ill;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(MBLKL(mp) > sizeof (ipha_t));

	ill = ire_to_ill(ire);
	ASSERT(ill != NULL);

	ipha_orig = (ipha_t *)mp->b_rptr;
	mp->b_rptr += sizeof (ipha_t);

	/* Calculate how many packets we will send out */
	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
	pkts = (i1 + len - 1) / len;
	ASSERT(pkts > 1);

	/* Allocate a message block which will hold all the IP Headers. */
	wroff = ipst->ips_ip_wroff_extra;
	hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;

	i1 = pkts * hdr_chunk_len;
	/*
	 * Create the header buffer, Multidata and destination address
	 * and SAP attribute that should be associated with it.
	 */
	if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
	    ((hdr_mp->b_wptr += i1),
	    (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
	    !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
		freemsg(mp);
		if (md_mp == NULL) {
			freemsg(hdr_mp);
		} else {
free_mmd:
			IP_STAT(ipst, ip_frag_mdt_discarded);
			freemsg(md_mp);
		}
		IP_STAT(ipst, ip_frag_mdt_allocfail);
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
		return;
	}
	IP_STAT(ipst, ip_frag_mdt_allocd);
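	/*
	 * hdr_mp now has room for pkts headers, each occupying hdr_chunk_len
	 * bytes (wroff bytes of link-layer headroom followed by a simple IP
	 * header); the loop below fills them in one fragment at a time.
	 */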
	/*
	 * Add a payload buffer to the Multidata; this operation must not
	 * fail, or otherwise our logic in this routine is broken.  There
	 * is no memory allocation done by the routine, so any returned
	 * failure simply tells us that we've done something wrong.
	 *
	 * A failure tells us that either we're adding the same payload
	 * buffer more than once, or we're trying to add more buffers than
	 * allowed.  None of the above cases should happen, and we panic
	 * because there's either horrible heap corruption or a
	 * programming mistake.
	 */
	if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
		goto pbuf_panic;

	hdr_ptr = hdr_mp->b_rptr;
	pld_ptr = mp->b_rptr;

	/* Establish the ending byte offset, based on the starting offset. */
	offset <<= 3;
	ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
	    IP_SIMPLE_HDR_LENGTH;

	pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;

	while (pld_ptr < mp->b_wptr) {
		ipha_t *ipha;
		uint16_t offset_and_flags;
		uint16_t ip_len;
		int error;

		ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
		ipha = (ipha_t *)(hdr_ptr + wroff);
		ASSERT(OK_32PTR(ipha));
		*ipha = *ipha_orig;

		if (ip_data_end - offset > len) {
			offset_and_flags = IPH_MF;
		} else {
			/*
			 * Last frag.  Set len to the length of this last
			 * piece.
			 */
			len = ip_data_end - offset;
			/* A frag of a frag might have IPH_MF non-zero */
			offset_and_flags =
			    ntohs(ipha->ipha_fragment_offset_and_flags) &
			    IPH_MF;
		}
		offset_and_flags |= (uint16_t)(offset >> 3);
		offset_and_flags |= (uint16_t)frag_flag;
		/* Store the offset and flags in the IP header. */
		ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);

		/* Store the length in the IP header. */
		ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
		ipha->ipha_length = htons(ip_len);

		/*
		 * Set the IP header checksum.  Note that mp is just
		 * the header, so this is easy to pass to ip_csum.
		 */
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

		DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 0);

		/*
		 * Record offset and size of header and data of the next packet
		 * in the multidata message.
		 */
		PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
		PDESC_PLD_INIT(&pdi);
		i1 = MIN(mp->b_wptr - pld_ptr, len);
		ASSERT(i1 > 0);
		PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
		if (i1 == len) {
			pld_ptr += len;
		} else {
			i1 = len - i1;
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			ASSERT(MBLKL(mp) >= i1);
			/*
			 * Attach the next payload message block to the
			 * multidata message.
			 */
			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
				goto pbuf_panic;
			PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
			pld_ptr = mp->b_rptr + i1;
		}
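		/*
		 * At this point pdi describes one complete fragment: a header
		 * span in hdr_mp plus one or two payload spans (two when the
		 * fragment straddles a payload mblk boundary).  Hand it to
		 * the Multidata framework.
		 */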
		if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
		    KM_NOSLEEP)) == NULL) {
			/*
			 * Any failure other than ENOMEM indicates that we
			 * have passed in invalid pdesc info or parameters
			 * to mmd_addpdesc, which must not happen.
			 *
			 * EINVAL is a result of failure on boundary checks
			 * against the pdesc info contents.  It should not
			 * happen, and we panic because there's either
			 * horrible heap corruption or a programming mistake.
			 */
			if (error != ENOMEM) {
				cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
				    "pdesc logic error detected for "
				    "mmd %p pinfo %p (%d)\n",
				    (void *)mmd, (void *)&pdi, error);
				/* NOTREACHED */
			}
			IP_STAT(ipst, ip_frag_mdt_addpdescfail);
			/* Free unattached payload message blocks as well */
			md_mp->b_cont = mp->b_cont;
			goto free_mmd;
		}

		/* Advance fragment offset. */
		offset += len;

		/* Advance to location for next header in the buffer. */
		hdr_ptr += hdr_chunk_len;

		/* Did we reach the next payload message block? */
		if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
			mp = mp->b_cont;
			/*
			 * Attach the next message block with payload
			 * data to the multidata message.
			 */
			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
				goto pbuf_panic;
			pld_ptr = mp->b_rptr;
		}
	}

	ASSERT(hdr_mp->b_wptr == hdr_ptr);
	ASSERT(mp->b_wptr == pld_ptr);

	/* Update IP statistics */
	IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts);

	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);

	len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH;
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts);
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len);

	if (pkt_type == OB_PKT) {
		ire->ire_ob_pkt_count += pkts;
		if (ire->ire_ipif != NULL)
			atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
	} else {
		/* The type is IB_PKT in the forwarding path. */
		ire->ire_ib_pkt_count += pkts;
		ASSERT(!IRE_IS_LOCAL(ire));
		if (ire->ire_type & IRE_BROADCAST) {
			atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
		} else {
			UPDATE_MIB(ill->ill_ip_mib,
			    ipIfStatsHCOutForwDatagrams, pkts);
			atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
		}
	}
	ire->ire_last_used_time = lbolt;
	/* Send it down */
	putnext(ire->ire_stq, md_mp);
	return;

pbuf_panic:
	cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
	    "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
	    pbuf_idx);
	/* NOTREACHED */
}

/*
 * Outbound IP fragmentation routine.
 *
 * NOTE : This routine does not ire_refrele the ire that is passed in
 * as the argument.
 */
static void
ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
    uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp)
{
	int i1;
	mblk_t *ll_hdr_mp;
	int ll_hdr_len;
	int hdr_len;
	mblk_t *hdr_mp;
	ipha_t *ipha;
	int ip_data_end;
	int len;
	mblk_t *mp = mp_orig, *mp1;
	int offset;
	queue_t *q;
	uint32_t v_hlen_tos_len;
	mblk_t *first_mp;
	boolean_t mctl_present;
	ill_t *ill;
	ill_t *out_ill;
	mblk_t *xmit_mp;
	mblk_t *carve_mp;
	ire_t *ire1 = NULL;
	ire_t *save_ire = NULL;
	mblk_t *next_mp = NULL;
	boolean_t last_frag = B_FALSE;
	boolean_t multirt_send = B_FALSE;
	ire_t *first_ire = NULL;
	irb_t *irb = NULL;
	mib2_ipIfStatsEntry_t *mibptr = NULL;

	ill = ire_to_ill(ire);
	mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib;
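	/*
	 * All fragmentation failures below are counted against mibptr; when
	 * the ire has no ill the counters go to the stack-wide MIB instead.
	 */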
	BUMP_MIB(mibptr, ipIfStatsOutFragReqds);

	if (max_frag == 0) {
		ip1dbg(("ip_wput_frag: ire frag size is 0"
		    " - dropping packet\n"));
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		freemsg(mp);
		return;
	}

	/*
	 * IPsec does not allow hw accelerated packets to be fragmented.
	 * This check is made in ip_wput_ipsec_out prior to coming here
	 * via ip_wput_ire_fragmentit.
	 *
	 * If at this point we have an ire whose ARP request has not
	 * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger
	 * sending of ARP query and change ire's state to ND_INCOMPLETE.
	 * This packet and all fragmentable packets for this ire will
	 * continue to get dropped while ire_nce->nce_state remains in
	 * ND_INCOMPLETE.  Post-ARP resolution, after ire's nce_state changes
	 * to ND_REACHABLE, all subsequent large packets for this ire will
	 * get fragmented and sent out by this function.
	 */
	if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) {
		/* If nce_state is ND_INITIAL, trigger ARP query */
		(void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
		ip1dbg(("ip_wput_frag: mac address for ire is unresolved"
		    " - dropping packet\n"));
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		freemsg(mp);
		return;
	}

	TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
	    "ip_wput_frag_start:");

	if (mp->b_datap->db_type == M_CTL) {
		first_mp = mp;
		mp_orig = mp = mp->b_cont;
		mctl_present = B_TRUE;
	} else {
		first_mp = mp;
		mctl_present = B_FALSE;
	}

	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
	ipha = (ipha_t *)mp->b_rptr;

	/*
	 * If the Don't Fragment flag is on, generate an ICMP destination
	 * unreachable, fragmentation needed.
	 */
	offset = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (offset & IPH_DF) {
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		if (is_system_labeled()) {
			max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag,
			    ire->ire_max_frag - max_frag, AF_INET);
		}
		/*
		 * Need to compute hdr checksum if called from ip_wput_ire.
		 * Note that ip_rput_forward verifies the checksum before
		 * calling this routine so in that case this is a noop.
		 */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid,
		    ipst);
		TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
		    "ip_wput_frag_end:(%S)",
		    "don't fragment");
		return;
	}
	/*
	 * Labeled systems adjust max_frag if they add a label
	 * to send the correct path mtu.  We need the real mtu since we
	 * are fragmenting the packet after label adjustment.
	 */
	if (is_system_labeled())
		max_frag = ire->ire_max_frag;
	if (mctl_present)
		freeb(first_mp);
	/*
	 * Establish the starting offset.  May not be zero if we are fragging
	 * a fragment that is being forwarded.
	 */
	offset = offset & IPH_OFFSET;

	/* TODO why is this test needed? */
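	/*
	 * v_hlen_tos_len caches the first 32-bit word of the IP header; the
	 * V_HLEN and LENGTH macros used below decode the header length and
	 * total datagram length from this cached word.
	 */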
	v_hlen_tos_len = ((uint32_t *)ipha)[0];
	if (((max_frag - LENGTH) & ~7) < 8) {
		/* TODO: notify ulp somehow */
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		freemsg(mp);
		TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
		    "ip_wput_frag_end:(%S)",
		    "len < 8");
		return;
	}

	hdr_len = (V_HLEN & 0xF) << 2;

	ipha->ipha_hdr_checksum = 0;

	/*
	 * Establish the number of bytes maximum per frag, after putting
	 * in the header.
	 */
	len = (max_frag - hdr_len) & ~7;

	/* Check if we can use MDT to send out the frags. */
	ASSERT(!IRE_IS_LOCAL(ire));
	if (hdr_len == IP_SIMPLE_HDR_LENGTH &&
	    ipst->ips_ip_multidata_outbound &&
	    !(ire->ire_flags & RTF_MULTIRT) &&
	    !IPP_ENABLED(IPP_LOCAL_OUT, ipst) &&
	    ill != NULL && ILL_MDT_CAPABLE(ill) &&
	    IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
		ASSERT(ill->ill_mdt_capab != NULL);
		if (!ill->ill_mdt_capab->ill_mdt_on) {
			/*
			 * If MDT has been previously turned off in the past,
			 * and we currently can do MDT (due to IPQoS policy
			 * removal, etc.) then enable it for this interface.
			 */
			ill->ill_mdt_capab->ill_mdt_on = 1;
			ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
			    ill->ill_name));
		}
		ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
		    offset);
		return;
	}

	/* Get a copy of the header for the trailing frags */
	hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
	    mp);
	if (!hdr_mp) {
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		freemsg(mp);
		TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
		    "ip_wput_frag_end:(%S)",
		    "couldn't copy hdr");
		return;
	}

	/* Store the starting offset, with the MoreFrags flag. */
	i1 = offset | IPH_MF | frag_flag;
	ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);

	/* Establish the ending byte offset, based on the starting offset. */
	offset <<= 3;
	ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;

	/* Store the length of the first fragment in the IP header. */
	i1 = len + hdr_len;
	ASSERT(i1 <= IP_MAXPACKET);
	ipha->ipha_length = htons((uint16_t)i1);

	/*
	 * Compute the IP header checksum for the first frag.  We have to
	 * watch out that we stop at the end of the header.
	 */
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

	/*
	 * Now carve off the first frag.  Note that this will include the
	 * original IP header.
	 */
	if (!(mp = ip_carve_mp(&mp_orig, i1))) {
		BUMP_MIB(mibptr, ipIfStatsOutFragFails);
		freeb(hdr_mp);
		freemsg(mp_orig);
		TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
		    "ip_wput_frag_end:(%S)",
		    "couldn't carve first");
		return;
	}

	/*
	 * Multirouting case.  Each fragment is replicated
	 * via all non-condemned RTF_MULTIRT routes
	 * currently resolved.
	 * We ensure that first_ire is the first RTF_MULTIRT
	 * ire in the bucket.
	 */
	if (ire->ire_flags & RTF_MULTIRT) {
		irb = ire->ire_bucket;
		ASSERT(irb != NULL);

		multirt_send = B_TRUE;

		/* Make sure we do not omit any multiroute ire. */
		IRB_REFHOLD(irb);
		for (first_ire = irb->irb_ire;
		    first_ire != NULL;
		    first_ire = first_ire->ire_next) {
			if ((first_ire->ire_flags & RTF_MULTIRT) &&
			    (first_ire->ire_addr == ire->ire_addr) &&
			    !(first_ire->ire_marks &
			    (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
				break;
		}

		if (first_ire != NULL) {
			if (first_ire != ire) {
				IRE_REFHOLD(first_ire);
				/*
				 * Do not release the ire passed in
				 * as the argument.
				 */
				ire = first_ire;
			} else {
				first_ire = NULL;
			}
		}
		IRB_REFRELE(irb);

		/*
		 * Save the first ire; we will need to restore it
		 * for the trailing frags.
		 * We REFHOLD save_ire, as each iterated ire will be
		 * REFRELEd.
		 */
		save_ire = ire;
		IRE_REFHOLD(save_ire);
	}

	/*
	 * First fragment emission loop.
	 * In most cases, the emission loop below is entered only
	 * once.  Only in the case where the ire holds the RTF_MULTIRT
	 * flag, do we loop to process all RTF_MULTIRT ires in the
	 * bucket, and send the fragment through all crossed
	 * RTF_MULTIRT routes.
	 */
	do {
		if (ire->ire_flags & RTF_MULTIRT) {
			/*
			 * We are in a multiple send case, need to get
			 * the next ire and make a copy of the packet.
			 * ire1 holds here the next ire to process in the
			 * bucket.  If multirouting is expected,
			 * any non-RTF_MULTIRT ire that has the
			 * right destination address is ignored.
			 *
			 * We have to take into account the MTU of
			 * each walked ire.  max_frag is set by the
			 * caller and generally refers to
			 * the primary ire entry.  Here we ensure that
			 * no route with a lower MTU will be used, as
			 * fragments are carved once for all ires,
			 * then replicated.
			 */
			ASSERT(irb != NULL);
			IRB_REFHOLD(irb);
			for (ire1 = ire->ire_next;
			    ire1 != NULL;
			    ire1 = ire1->ire_next) {
				if ((ire1->ire_flags & RTF_MULTIRT) == 0)
					continue;
				if (ire1->ire_addr != ire->ire_addr)
					continue;
				if (ire1->ire_marks &
				    (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
					continue;
				/*
				 * Ensure we do not exceed the MTU
				 * of the next route.
				 */
				if (ire1->ire_max_frag < max_frag) {
					ip_multirt_bad_mtu(ire1, max_frag);
					continue;
				}

				/* Got one. */
				IRE_REFHOLD(ire1);
				break;
			}
			IRB_REFRELE(irb);

			if (ire1 != NULL) {
				next_mp = copyb(mp);
				if ((next_mp == NULL) ||
				    ((mp->b_cont != NULL) &&
				    ((next_mp->b_cont =
				    dupmsg(mp->b_cont)) == NULL))) {
					freemsg(next_mp);
					next_mp = NULL;
					ire_refrele(ire1);
					ire1 = NULL;
				}
			}

			/* Last multiroute ire; don't loop anymore. */
			if (ire1 == NULL) {
				multirt_send = B_FALSE;
			}
		}

		ll_hdr_len = 0;
		LOCK_IRE_FP_MP(ire);
		ll_hdr_mp = ire->ire_nce->nce_fp_mp;
		if (ll_hdr_mp != NULL) {
			ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA);
			ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr;
		} else {
			ll_hdr_mp = ire->ire_nce->nce_res_mp;
		}

		/* If there is a transmit header, get a copy for this frag. */
		/*
		 * TODO: should check db_ref before calling ip_carve_mp since
		 * it might give us a dup.
		 */
		if (!ll_hdr_mp) {
			/* No xmit header. */
			xmit_mp = mp;

		/* We have a link-layer header that can fit in our mblk. */
		} else if (mp->b_datap->db_ref == 1 &&
		    ll_hdr_len != 0 &&
		    ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) {
			/* M_DATA fastpath */
			mp->b_rptr -= ll_hdr_len;
			bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len);
			xmit_mp = mp;

		/* Corner case if copyb has failed */
		} else if (!(xmit_mp = copyb(ll_hdr_mp))) {
			UNLOCK_IRE_FP_MP(ire);
			BUMP_MIB(mibptr, ipIfStatsOutFragFails);
			freeb(hdr_mp);
			freemsg(mp);
			freemsg(mp_orig);
			TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
			    "ip_wput_frag_end:(%S)",
			    "discard");

			if (multirt_send) {
				ASSERT(ire1);
				ASSERT(next_mp);

				freemsg(next_mp);
				ire_refrele(ire1);
			}
			if (save_ire != NULL)
				IRE_REFRELE(save_ire);

			if (first_ire != NULL)
				ire_refrele(first_ire);
			return;

		/*
		 * Case of res_mp OR the fastpath mp can't fit
		 * in the mblk
		 */
		} else {
			xmit_mp->b_cont = mp;

			/*
			 * Get priority marking, if any.
			 * We propagate the CoS marking from the
			 * original packet that went to QoS processing
			 * in ip_wput_ire to the newly carved mp.
			 */
			if (DB_TYPE(xmit_mp) == M_DATA)
				xmit_mp->b_band = mp->b_band;
		}
		UNLOCK_IRE_FP_MP(ire);

		q = ire->ire_stq;
		out_ill = (ill_t *)q->q_ptr;

		BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates);

		DTRACE_PROBE4(ip4__physical__out__start,
		    ill_t *, NULL, ill_t *, out_ill,
		    ipha_t *, ipha, mblk_t *, xmit_mp);

		FW_HOOKS(ipst->ips_ip4_physical_out_event,
		    ipst->ips_ipv4firewall_physical_out,
		    NULL, out_ill, ipha, xmit_mp, mp, 0, ipst);

		DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp);
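		/*
		 * The physical-out firewall hook may have consumed the
		 * fragment, in which case FW_HOOKS left xmit_mp NULL and
		 * transmission is skipped below.
		 */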
		if (xmit_mp != NULL) {
			DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL,
			    void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
			    ipha_t *, ipha, ip6_t *, NULL, int, 0);

			ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp);

			BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
			UPDATE_MIB(out_ill->ill_ip_mib,
			    ipIfStatsHCOutOctets, i1);

			if (pkt_type != OB_PKT) {
				/*
				 * Update the packet count and MIB stats
				 * of trailing RTF_MULTIRT ires.
				 */
				UPDATE_OB_PKT_COUNT(ire);
				BUMP_MIB(out_ill->ill_ip_mib,
				    ipIfStatsOutFragReqds);
			}
		}

		if (multirt_send) {
			/*
			 * We are in a multiple send case; look for
			 * the next ire and re-enter the loop.
			 */
			ASSERT(ire1);
			ASSERT(next_mp);
			/* REFRELE the current ire before looping */
			ire_refrele(ire);
			ire = ire1;
			ire1 = NULL;
			mp = next_mp;
			next_mp = NULL;
		}
	} while (multirt_send);

	ASSERT(ire1 == NULL);

	/* Restore the original ire; we need it for the trailing frags */
	if (save_ire != NULL) {
		/* REFRELE the last iterated ire */
		ire_refrele(ire);
		/* save_ire has been REFHOLDed */
		ire = save_ire;
		save_ire = NULL;
		q = ire->ire_stq;
	}

	if (pkt_type == OB_PKT) {
		UPDATE_OB_PKT_COUNT(ire);
	} else {
		out_ill = (ill_t *)q->q_ptr;
		BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
		UPDATE_IB_PKT_COUNT(ire);
	}

	/* Advance the offset to the second frag starting point. */
	offset += len;
	/*
	 * Update hdr_len from the copied header - there might be less options
	 * in the later fragments.
	 */
	hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
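	/*
	 * Trailing-fragment loop: each pass carves up to len payload bytes
	 * from mp_orig, prepends a copy of the saved header from hdr_mp,
	 * fixes up the offset/flags, length and checksum, and replicates the
	 * fragment across RTF_MULTIRT routes just like the first fragment.
	 */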
	/* Loop until done. */
	for (;;) {
		uint16_t offset_and_flags;
		uint16_t ip_len;

		if (ip_data_end - offset > len) {
			/*
			 * Carve off the appropriate amount from the original
			 * datagram.
			 */
			if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
				mp = NULL;
				break;
			}
			/*
			 * More frags after this one.  Get another copy
			 * of the header.
			 */
			if (carve_mp->b_datap->db_ref == 1 &&
			    hdr_mp->b_wptr - hdr_mp->b_rptr <
			    carve_mp->b_rptr - carve_mp->b_datap->db_base) {
				/* Inline IP header */
				carve_mp->b_rptr -= hdr_mp->b_wptr -
				    hdr_mp->b_rptr;
				bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
				    hdr_mp->b_wptr - hdr_mp->b_rptr);
				mp = carve_mp;
			} else {
				if (!(mp = copyb(hdr_mp))) {
					freemsg(carve_mp);
					break;
				}
				/* Get priority marking, if any. */
				mp->b_band = carve_mp->b_band;
				mp->b_cont = carve_mp;
			}
			ipha = (ipha_t *)mp->b_rptr;
			offset_and_flags = IPH_MF;
		} else {
			/*
			 * Last frag.  Consume the header.  Set len to
			 * the length of this last piece.
			 */
			len = ip_data_end - offset;

			/*
			 * Carve off the appropriate amount from the original
			 * datagram.
			 */
			if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
				mp = NULL;
				break;
			}
			if (carve_mp->b_datap->db_ref == 1 &&
			    hdr_mp->b_wptr - hdr_mp->b_rptr <
			    carve_mp->b_rptr - carve_mp->b_datap->db_base) {
				/* Inline IP header */
				carve_mp->b_rptr -= hdr_mp->b_wptr -
				    hdr_mp->b_rptr;
				bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
				    hdr_mp->b_wptr - hdr_mp->b_rptr);
				mp = carve_mp;
				freeb(hdr_mp);
				hdr_mp = mp;
			} else {
				mp = hdr_mp;
				/* Get priority marking, if any. */
				mp->b_band = carve_mp->b_band;
				mp->b_cont = carve_mp;
			}
			ipha = (ipha_t *)mp->b_rptr;
			/* A frag of a frag might have IPH_MF non-zero */
			offset_and_flags =
			    ntohs(ipha->ipha_fragment_offset_and_flags) &
			    IPH_MF;
		}
		offset_and_flags |= (uint16_t)(offset >> 3);
		offset_and_flags |= (uint16_t)frag_flag;
		/* Store the offset and flags in the IP header. */
		ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);

		/* Store the length in the IP header. */
		ip_len = (uint16_t)(len + hdr_len);
		ipha->ipha_length = htons(ip_len);

		/*
		 * Set the IP header checksum.  Note that mp is just
		 * the header, so this is easy to pass to ip_csum.
		 */
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

		/* Attach a transmit header, if any, and ship it. */
		if (pkt_type == OB_PKT) {
			UPDATE_OB_PKT_COUNT(ire);
		} else {
			out_ill = (ill_t *)q->q_ptr;
			BUMP_MIB(out_ill->ill_ip_mib,
			    ipIfStatsHCOutForwDatagrams);
			UPDATE_IB_PKT_COUNT(ire);
		}

		if (ire->ire_flags & RTF_MULTIRT) {
			irb = ire->ire_bucket;
			ASSERT(irb != NULL);

			multirt_send = B_TRUE;

			/*
			 * Save the original ire; we will need to restore it
			 * for the trailing frags.
			 */
			save_ire = ire;
			IRE_REFHOLD(save_ire);
		}
		/*
		 * Emission loop for this fragment, similar
		 * to what is done for the first fragment.
		 */
		do {
			if (multirt_send) {
				/*
				 * We are in a multiple send case, need to get
				 * the next ire and make a copy of the packet.
				 */
				ASSERT(irb != NULL);
				IRB_REFHOLD(irb);
				for (ire1 = ire->ire_next;
				    ire1 != NULL;
				    ire1 = ire1->ire_next) {
					if (!(ire1->ire_flags & RTF_MULTIRT))
						continue;
					if (ire1->ire_addr != ire->ire_addr)
						continue;
					if (ire1->ire_marks &
					    (IRE_MARK_CONDEMNED |
					    IRE_MARK_TESTHIDDEN))
						continue;
					/*
					 * Ensure we do not exceed the MTU
					 * of the next route.
					 */
					if (ire1->ire_max_frag < max_frag) {
						ip_multirt_bad_mtu(ire1,
						    max_frag);
						continue;
					}

					/* Got one. */
					IRE_REFHOLD(ire1);
					break;
				}
				IRB_REFRELE(irb);

				if (ire1 != NULL) {
					next_mp = copyb(mp);
					if ((next_mp == NULL) ||
					    ((mp->b_cont != NULL) &&
					    ((next_mp->b_cont =
					    dupmsg(mp->b_cont)) == NULL))) {
						freemsg(next_mp);
						next_mp = NULL;
						ire_refrele(ire1);
						ire1 = NULL;
					}
				}

				/* Last multiroute ire; don't loop anymore. */
				if (ire1 == NULL) {
					multirt_send = B_FALSE;
				}
			}

			/* Update transmit header */
			ll_hdr_len = 0;
			LOCK_IRE_FP_MP(ire);
			ll_hdr_mp = ire->ire_nce->nce_fp_mp;
			if (ll_hdr_mp != NULL) {
				ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA);
				ll_hdr_len = MBLKL(ll_hdr_mp);
			} else {
				ll_hdr_mp = ire->ire_nce->nce_res_mp;
			}

			if (!ll_hdr_mp) {
				xmit_mp = mp;

			/*
			 * We have link-layer header that can fit in
			 * our mblk.
			 */
			} else if (mp->b_datap->db_ref == 1 &&
			    ll_hdr_len != 0 &&
			    ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) {
				/* M_DATA fastpath */
				mp->b_rptr -= ll_hdr_len;
				bcopy(ll_hdr_mp->b_rptr, mp->b_rptr,
				    ll_hdr_len);
				xmit_mp = mp;

			/*
			 * Case of res_mp OR the fastpath mp can't fit
			 * in the mblk
			 */
			} else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) {
				xmit_mp->b_cont = mp;
				/* Get priority marking, if any. */
				if (DB_TYPE(xmit_mp) == M_DATA)
					xmit_mp->b_band = mp->b_band;

			/* Corner case if copyb failed */
			} else {
				/*
				 * Exit both the replication and
				 * fragmentation loops.
				 */
				UNLOCK_IRE_FP_MP(ire);
				goto drop_pkt;
			}
			UNLOCK_IRE_FP_MP(ire);

			mp1 = mp;
			out_ill = (ill_t *)q->q_ptr;

			BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates);

			DTRACE_PROBE4(ip4__physical__out__start,
			    ill_t *, NULL, ill_t *, out_ill,
			    ipha_t *, ipha, mblk_t *, xmit_mp);

			FW_HOOKS(ipst->ips_ip4_physical_out_event,
			    ipst->ips_ipv4firewall_physical_out,
			    NULL, out_ill, ipha, xmit_mp, mp, 0, ipst);

			DTRACE_PROBE1(ip4__physical__out__end,
			    mblk_t *, xmit_mp);

			if (mp != mp1 && hdr_mp == mp1)
				hdr_mp = mp;
			if (mp != mp1 && mp_orig == mp1)
				mp_orig = mp;

			if (xmit_mp != NULL) {
				DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *,
				    NULL, void_ip_t *, ipha,
				    __dtrace_ipsr_ill_t *, out_ill, ipha_t *,
				    ipha, ip6_t *, NULL, int, 0);

				ILL_SEND_TX(out_ill, ire, connp,
				    xmit_mp, 0, connp);

				BUMP_MIB(out_ill->ill_ip_mib,
				    ipIfStatsHCOutTransmits);
				UPDATE_MIB(out_ill->ill_ip_mib,
				    ipIfStatsHCOutOctets, ip_len);

				if (pkt_type != OB_PKT) {
					/*
					 * Update the packet count of trailing
					 * RTF_MULTIRT ires.
					 */
					UPDATE_OB_PKT_COUNT(ire);
				}
			}

			/* All done if we just consumed the hdr_mp. */
			if (mp == hdr_mp) {
				last_frag = B_TRUE;
				BUMP_MIB(out_ill->ill_ip_mib,
				    ipIfStatsOutFragOKs);
			}

			if (multirt_send) {
				/*
				 * We are in a multiple send case; look for
				 * the next ire and re-enter the loop.
				 */
				ASSERT(ire1);
				ASSERT(next_mp);
				/* REFRELE the current ire before looping */
				ire_refrele(ire);
				ire = ire1;
				ire1 = NULL;
				q = ire->ire_stq;
				mp = next_mp;
				next_mp = NULL;
			}
		} while (multirt_send);
		/*
		 * Restore the original ire; we need it for the
		 * trailing frags
		 */
		if (save_ire != NULL) {
			ASSERT(ire1 == NULL);
			/* REFRELE the last iterated ire */
			ire_refrele(ire);
			/* save_ire has been REFHOLDed */
			ire = save_ire;
			q = ire->ire_stq;
			save_ire = NULL;
		}

		if (last_frag) {
			TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
			    "ip_wput_frag_end:(%S)",
			    "consumed hdr_mp");

			if (first_ire != NULL)
				ire_refrele(first_ire);
			return;
		}
		/* Otherwise, advance and loop. */
		offset += len;
	}

drop_pkt:
	/* Clean up following allocation failure. */
	BUMP_MIB(mibptr, ipIfStatsOutFragFails);
	freemsg(mp);
	if (mp != hdr_mp)
		freeb(hdr_mp);
	if (mp != mp_orig)
		freemsg(mp_orig);

	if (save_ire != NULL)
		IRE_REFRELE(save_ire);
	if (first_ire != NULL)
		ire_refrele(first_ire);

	TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
	    "ip_wput_frag_end:(%S)",
	    "end--alloc failure");
}

/*
 * Copy the header plus those options which have the copy bit set.
 * src is the template to make sure we preserve the cred for TX purposes.
 */
24773 */ 24774 static mblk_t * 24775 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, 24776 mblk_t *src) 24777 { 24778 mblk_t *mp; 24779 uchar_t *up; 24780 24781 /* 24782 * Quick check if we need to look for options without the copy bit 24783 * set 24784 */ 24785 mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src); 24786 if (!mp) 24787 return (mp); 24788 mp->b_rptr += ipst->ips_ip_wroff_extra; 24789 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 24790 bcopy(rptr, mp->b_rptr, hdr_len); 24791 mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra; 24792 return (mp); 24793 } 24794 up = mp->b_rptr; 24795 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 24796 up += IP_SIMPLE_HDR_LENGTH; 24797 rptr += IP_SIMPLE_HDR_LENGTH; 24798 hdr_len -= IP_SIMPLE_HDR_LENGTH; 24799 while (hdr_len > 0) { 24800 uint32_t optval; 24801 uint32_t optlen; 24802 24803 optval = *rptr; 24804 if (optval == IPOPT_EOL) 24805 break; 24806 if (optval == IPOPT_NOP) 24807 optlen = 1; 24808 else 24809 optlen = rptr[1]; 24810 if (optval & IPOPT_COPY) { 24811 bcopy(rptr, up, optlen); 24812 up += optlen; 24813 } 24814 rptr += optlen; 24815 hdr_len -= optlen; 24816 } 24817 /* 24818 * Make sure that we drop an even number of words by filling 24819 * with EOL to the next word boundary. 24820 */ 24821 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH); 24822 hdr_len & 0x3; hdr_len++) 24823 *up++ = IPOPT_EOL; 24824 mp->b_wptr = up; 24825 /* Update header length */ 24826 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2)); 24827 return (mp); 24828 } 24829 24830 /* 24831 * Delivery to local recipients including fanout to multiple recipients. 24832 * Does not do checksumming of UDP/TCP. 24833 * Note: q should be the read side queue for either the ill or conn. 24834 * Note: rq should be the read side q for the lower (ill) stream. 24835 * We don't send packets to IPPF processing, thus the last argument 24836 * to all the fanout calls are B_FALSE. 24837 */ 24838 void 24839 ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, 24840 int fanout_flags, zoneid_t zoneid) 24841 { 24842 uint32_t protocol; 24843 mblk_t *first_mp; 24844 boolean_t mctl_present; 24845 int ire_type; 24846 #define rptr ((uchar_t *)ipha) 24847 ip_stack_t *ipst = ill->ill_ipst; 24848 24849 TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, 24850 "ip_wput_local_start: q %p", q); 24851 24852 if (ire != NULL) { 24853 ire_type = ire->ire_type; 24854 } else { 24855 /* 24856 * Only ip_multicast_loopback() calls us with a NULL ire. If the 24857 * packet is not multicast, we can't tell the ire type. 24858 */ 24859 ASSERT(CLASSD(ipha->ipha_dst)); 24860 ire_type = IRE_BROADCAST; 24861 } 24862 24863 first_mp = mp; 24864 if (first_mp->b_datap->db_type == M_CTL) { 24865 ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; 24866 if (!io->ipsec_out_secure) { 24867 /* 24868 * This ipsec_out_t was allocated in ip_wput 24869 * for multicast packets to store the ill_index. 24870 * As this is being delivered locally, we don't 24871 * need this anymore. 24872 */ 24873 mp = first_mp->b_cont; 24874 freeb(first_mp); 24875 first_mp = mp; 24876 mctl_present = B_FALSE; 24877 } else { 24878 /* 24879 * Convert IPSEC_OUT to IPSEC_IN, preserving all 24880 * security properties for the looped-back packet. 
	DTRACE_PROBE4(ip4__loopback__in__start,
	    ill_t *, ill, ill_t *, NULL,
	    ipha_t *, ipha, mblk_t *, first_mp);

	FW_HOOKS(ipst->ips_ip4_loopback_in_event,
	    ipst->ips_ipv4firewall_loopback_in,
	    ill, NULL, ipha, first_mp, mp, 0, ipst);

	DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp);

	if (first_mp == NULL)
		return;

	if (ipst->ips_ipobs_enabled) {
		zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES;
		zoneid_t stackzoneid = netstackid_to_zoneid(
		    ipst->ips_netstack->netstack_stackid);

		dzone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid;
		/*
		 * 127.0.0.1 is special, as we cannot lookup its zoneid by
		 * address.  Restrict the lookup below to the destination
		 * zone.
		 */
		if (ipha->ipha_src == ntohl(INADDR_LOOPBACK))
			lookup_zoneid = zoneid;
		szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst,
		    lookup_zoneid);
		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
		    IPV4_VERSION, 0, ipst);
	}

	DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *,
	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
	    int, 1);

	ipst->ips_loopback_packets++;

	ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n",
	    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid));
	if (!IS_SIMPLE_IPH(ipha)) {
		ip_wput_local_options(ipha, ipst);
	}

	protocol = ipha->ipha_protocol;
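	/*
	 * Fan the packet out by protocol.  ICMP, IGMP, UDP, TCP and SCTP get
	 * dedicated handling below; everything else falls through to
	 * ip_fanout_proto() after the switch.
	 */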
	switch (protocol) {
	case IPPROTO_ICMP: {
		ire_t *ire_zone;
		ilm_t *ilm;
		mblk_t *mp1;
		zoneid_t last_zoneid;
		ilm_walker_t ilw;

		if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) {
			ASSERT(ire_type == IRE_BROADCAST);
			/*
			 * In the multicast case, applications may have joined
			 * the group from different zones, so we need to
			 * deliver the packet to each of them.  Loop through
			 * the multicast memberships structures (ilm) on the
			 * receive ill and send a copy of the packet up each
			 * matching one.  However, we don't do this for
			 * multicasts sent on the loopback interface
			 * (PHYI_LOOPBACK flag set) as they must stay in the
			 * sender's zone.
			 *
			 * ilm_add_v6() ensures that ilms in the same zone are
			 * contiguous in the ill_ilm list.  We use this
			 * property to avoid sending duplicates needed when
			 * two applications in the same zone join the same
			 * group on different logical interfaces: we ignore
			 * the ilm if its zoneid is the same as the last
			 * matching one.  In addition, the sending of the
			 * packet for ire_zoneid is delayed until all of the
			 * other ilms have been exhausted.
			 */
			last_zoneid = -1;
			ilm = ilm_walker_start(&ilw, ill);
			for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
				if (ipha->ipha_dst != ilm->ilm_addr ||
				    ilm->ilm_zoneid == last_zoneid ||
				    ilm->ilm_zoneid == zoneid ||
				    !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
					continue;
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;
				icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
				    0, 0, mctl_present, B_FALSE, ill,
				    ilm->ilm_zoneid);
				last_zoneid = ilm->ilm_zoneid;
			}
			ilm_walker_finish(&ilw);
			/*
			 * Loopback case: the sending endpoint has
			 * IP_MULTICAST_LOOP disabled, therefore we don't
			 * dispatch the multicast packet to the sending zone.
			 */
			if (fanout_flags & IP_FF_NO_MCAST_LOOP) {
				freemsg(first_mp);
				return;
			}
		} else if (ire_type == IRE_BROADCAST) {
			/*
			 * In the broadcast case, there may be many zones
			 * which need a copy of the packet delivered to them.
			 * There is one IRE_BROADCAST per broadcast address
			 * and per zone; we walk those using a helper
			 * function.  In addition, the sending of the packet
			 * for zoneid is delayed until all of the other ires
			 * have been processed.
			 */
			IRB_REFHOLD(ire->ire_bucket);
			ire_zone = NULL;
			while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
			    ire)) != NULL) {
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;

				UPDATE_IB_PKT_COUNT(ire_zone);
				ire_zone->ire_last_used_time = lbolt;
				icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
				    mctl_present, B_FALSE, ill,
				    ire_zone->ire_zoneid);
			}
			IRB_REFRELE(ire->ire_bucket);
		}
		icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0,
		    0, mctl_present, B_FALSE, ill, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)",
		    q, "icmp");
		return;
	}
	case IPPROTO_IGMP:
		if ((mp = igmp_input(q, mp, ill)) == NULL) {
			/* Bad packet - discarded by igmp_input */
			TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
			    "ip_wput_local_end: q %p (%S)",
			    q, "igmp_input--bad packet");
			if (mctl_present)
				freeb(first_mp);
			return;
		}
		/*
		 * igmp_input() may have returned the pulled up message.
		 * So first_mp and ipha need to be reinitialized.
		 */
		ipha = (ipha_t *)mp->b_rptr;
		if (mctl_present)
			first_mp->b_cont = mp;
		else
			first_mp = mp;
		/* deliver to local raw users */
		break;
	case IPPROTO_ENCAP:
		/*
		 * This case is covered by either ip_fanout_proto, or by
		 * the above security processing for self-tunneled packets.
		 */
		break;
	case IPPROTO_UDP: {
		uint16_t *up;
		uint32_t ports;

		up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) +
		    UDP_PORTS_OFFSET);
		/* Force a 'valid' checksum. */
		up[3] = 0;

		ports = *(uint32_t *)up;
		ip_fanout_udp(q, first_mp, ill, ipha, ports,
		    (ire_type == IRE_BROADCAST),
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE,
		    ill, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp");
		return;
	}
	case IPPROTO_TCP: {

		/*
		 * For TCP, discard broadcast packets.
		 */
		if ((ushort_t)ire_type == IRE_BROADCAST) {
			freemsg(first_mp);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip2dbg(("ip_wput_local: discard broadcast\n"));
			return;
		}

		if (mp->b_datap->db_type == M_DATA) {
			/*
			 * M_DATA mblk, so init mblk (chain) for no struio().
			 */
			mblk_t *mp1 = mp;

			do {
				mp1->b_datap->db_struioflag = 0;
			} while ((mp1 = mp1->b_cont) != NULL);
		}
		ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4)
		    <= mp->b_wptr);
		ip_fanout_tcp(q, first_mp, ill, ipha,
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_SYN_ADDIRE | IP_FF_IPINFO,
		    mctl_present, B_FALSE, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp");
		return;
	}
	case IPPROTO_SCTP:
	{
		uint32_t ports;

		bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports));
		ip_fanout_sctp(first_mp, ill, ipha, ports,
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_IPINFO, mctl_present, B_FALSE, zoneid);
		return;
	}

	default:
		break;
	}
	/*
	 * Find a client for some other protocol.  We give
	 * copies to multiple clients, if more than one is
	 * bound.
	 */
	ip_fanout_proto(q, first_mp, ill, ipha,
	    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP,
	    mctl_present, B_FALSE, ill, zoneid);
	TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
	    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto");
#undef	rptr
}

/*
 * Update any source route, record route, or timestamp options.
 * Check that we are at end of strict source route.
 * The options have been sanity checked by ip_wput_options().
 */
static void
ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
{
	ipoptp_t opts;
	uchar_t *opt;
	uint8_t optval;
	uint8_t optlen;
	ipaddr_t dst;
	uint32_t ts;
	ire_t *ire;
	timestruc_t now;

	ip2dbg(("ip_wput_local_options\n"));
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		switch (optval) {
		uint32_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			off = opt[IPOPT_OFFSET];
			off--;
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				break;
			}
			/*
			 * This will only happen if two consecutive entries
			 * in the source route contain our address or if
			 * it is a packet with a loose source route which
			 * reaches us before consuming the whole source route
			 */
			ip1dbg(("ip_wput_local_options: not end of SR\n"));
			if (optval == IPOPT_SSRR) {
				return;
			}
			/*
			 * Hack: instead of dropping the packet truncate the
			 * source route to what has been used by filling the
			 * rest with IPOPT_NOP.
			 */
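			/*
			 * Shrinking opt[IPOPT_OLEN] to off and NOP-filling
			 * the remainder leaves the overall IP header length
			 * unchanged.
			 */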
25178 */ 25179 opt[IPOPT_OLEN] = (uint8_t)off; 25180 while (off < optlen) { 25181 opt[off++] = IPOPT_NOP; 25182 } 25183 break; 25184 case IPOPT_RR: 25185 off = opt[IPOPT_OFFSET]; 25186 off--; 25187 if (optlen < IP_ADDR_LEN || 25188 off > optlen - IP_ADDR_LEN) { 25189 /* No more room - ignore */ 25190 ip1dbg(( 25191 "ip_wput_forward_options: end of RR\n")); 25192 break; 25193 } 25194 dst = htonl(INADDR_LOOPBACK); 25195 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25196 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25197 break; 25198 case IPOPT_TS: 25199 /* Insert timestamp if there is romm */ 25200 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25201 case IPOPT_TS_TSONLY: 25202 off = IPOPT_TS_TIMELEN; 25203 break; 25204 case IPOPT_TS_PRESPEC: 25205 case IPOPT_TS_PRESPEC_RFC791: 25206 /* Verify that the address matched */ 25207 off = opt[IPOPT_OFFSET] - 1; 25208 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 25209 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 25210 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 25211 ipst); 25212 if (ire == NULL) { 25213 /* Not for us */ 25214 break; 25215 } 25216 ire_refrele(ire); 25217 /* FALLTHRU */ 25218 case IPOPT_TS_TSANDADDR: 25219 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 25220 break; 25221 default: 25222 /* 25223 * ip_*put_options should have already 25224 * dropped this packet. 25225 */ 25226 cmn_err(CE_PANIC, "ip_wput_local_options: " 25227 "unknown IT - bug in ip_wput_options?\n"); 25228 return; /* Keep "lint" happy */ 25229 } 25230 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 25231 /* Increase overflow counter */ 25232 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 25233 opt[IPOPT_POS_OV_FLG] = (uint8_t) 25234 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 25235 (off << 4); 25236 break; 25237 } 25238 off = opt[IPOPT_OFFSET] - 1; 25239 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25240 case IPOPT_TS_PRESPEC: 25241 case IPOPT_TS_PRESPEC_RFC791: 25242 case IPOPT_TS_TSANDADDR: 25243 dst = htonl(INADDR_LOOPBACK); 25244 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25245 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25246 /* FALLTHRU */ 25247 case IPOPT_TS_TSONLY: 25248 off = opt[IPOPT_OFFSET] - 1; 25249 /* Compute # of milliseconds since midnight */ 25250 gethrestime(&now); 25251 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 25252 now.tv_nsec / (NANOSEC / MILLISEC); 25253 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 25254 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 25255 break; 25256 } 25257 break; 25258 } 25259 } 25260 } 25261 25262 /* 25263 * Send out a multicast packet on interface ipif. 25264 * The sender does not have an conn. 25265 * Caller verifies that this isn't a PHYI_LOOPBACK. 25266 */ 25267 void 25268 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) 25269 { 25270 ipha_t *ipha; 25271 ire_t *ire; 25272 ipaddr_t dst; 25273 mblk_t *first_mp; 25274 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 25275 25276 /* igmp_sendpkt always allocates a ipsec_out_t */ 25277 ASSERT(mp->b_datap->db_type == M_CTL); 25278 ASSERT(!ipif->ipif_isv6); 25279 ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); 25280 25281 first_mp = mp; 25282 mp = first_mp->b_cont; 25283 ASSERT(mp->b_datap->db_type == M_DATA); 25284 ipha = (ipha_t *)mp->b_rptr; 25285 25286 /* 25287 * Find an IRE which matches the destination and the outgoing 25288 * queue (i.e. the outgoing interface.) 
25289 */ 25290 if (ipif->ipif_flags & IPIF_POINTOPOINT) 25291 dst = ipif->ipif_pp_dst_addr; 25292 else 25293 dst = ipha->ipha_dst; 25294 /* 25295 * The source address has already been initialized by the 25296 * caller and hence matching on ILL (MATCH_IRE_ILL) would 25297 * be sufficient rather than MATCH_IRE_IPIF. 25298 * 25299 * This function is used for sending IGMP packets. For IPMP, 25300 * we sidestep IGMP snooping issues by sending all multicast 25301 * traffic on a single interface in the IPMP group. 25302 */ 25303 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, 25304 MATCH_IRE_ILL, ipst); 25305 if (!ire) { 25306 /* 25307 * Mark this packet to make it be delivered to 25308 * ip_wput_ire after the new ire has been 25309 * created. 25310 */ 25311 mp->b_prev = NULL; 25312 mp->b_next = NULL; 25313 ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, 25314 zoneid, &zero_info); 25315 return; 25316 } 25317 25318 /* 25319 * Honor the RTF_SETSRC flag; this is the only case 25320 * where we force this addr whatever the current src addr is, 25321 * because this address is set by igmp_sendpkt(), and 25322 * cannot be specified by any user. 25323 */ 25324 if (ire->ire_flags & RTF_SETSRC) { 25325 ipha->ipha_src = ire->ire_src_addr; 25326 } 25327 25328 ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); 25329 } 25330 25331 /* 25332 * NOTE : This function does not ire_refrele the ire argument passed in. 25333 * 25334 * Copy the link layer header and do IPQoS if needed. Frees the mblk on 25335 * failure. The nce_fp_mp can vanish any time in the case of 25336 * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold 25337 * the ire_lock to access the nce_fp_mp in this case. 25338 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are 25339 * prepending a fastpath message IPQoS processing must precede it, we also set 25340 * the b_band of the fastpath message to that of the mblk returned by IPQoS 25341 * (IPQoS might have set the b_band for CoS marking). 25342 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing 25343 * must follow it so that IPQoS can mark the dl_priority field for CoS 25344 * marking, if needed. 
25345 */ 25346 static mblk_t * 25347 ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, 25348 uint32_t ill_index, ipha_t **iphap) 25349 { 25350 uint_t hlen; 25351 ipha_t *ipha; 25352 mblk_t *mp1; 25353 boolean_t qos_done = B_FALSE; 25354 uchar_t *ll_hdr; 25355 ip_stack_t *ipst = ire->ire_ipst; 25356 25357 #define rptr ((uchar_t *)ipha) 25358 25359 ipha = (ipha_t *)mp->b_rptr; 25360 hlen = 0; 25361 LOCK_IRE_FP_MP(ire); 25362 if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { 25363 ASSERT(DB_TYPE(mp1) == M_DATA); 25364 /* Initiate IPPF processing */ 25365 if ((proc != 0) && IPP_ENABLED(proc, ipst)) { 25366 UNLOCK_IRE_FP_MP(ire); 25367 ip_process(proc, &mp, ill_index); 25368 if (mp == NULL) 25369 return (NULL); 25370 25371 ipha = (ipha_t *)mp->b_rptr; 25372 LOCK_IRE_FP_MP(ire); 25373 if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { 25374 qos_done = B_TRUE; 25375 goto no_fp_mp; 25376 } 25377 ASSERT(DB_TYPE(mp1) == M_DATA); 25378 } 25379 hlen = MBLKL(mp1); 25380 /* 25381 * Check if we have enough room to prepend fastpath 25382 * header 25383 */ 25384 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { 25385 ll_hdr = rptr - hlen; 25386 bcopy(mp1->b_rptr, ll_hdr, hlen); 25387 /* 25388 * Set the b_rptr to the start of the link layer 25389 * header 25390 */ 25391 mp->b_rptr = ll_hdr; 25392 mp1 = mp; 25393 } else { 25394 mp1 = copyb(mp1); 25395 if (mp1 == NULL) 25396 goto unlock_err; 25397 mp1->b_band = mp->b_band; 25398 mp1->b_cont = mp; 25399 /* 25400 * XXX disable ICK_VALID and compute checksum 25401 * here; can happen if nce_fp_mp changes and 25402 * it can't be copied now due to insufficient 25403 * space. (unlikely, fp mp can change, but it 25404 * does not increase in length) 25405 */ 25406 } 25407 UNLOCK_IRE_FP_MP(ire); 25408 } else { 25409 no_fp_mp: 25410 mp1 = copyb(ire->ire_nce->nce_res_mp); 25411 if (mp1 == NULL) { 25412 unlock_err: 25413 UNLOCK_IRE_FP_MP(ire); 25414 freemsg(mp); 25415 return (NULL); 25416 } 25417 UNLOCK_IRE_FP_MP(ire); 25418 mp1->b_cont = mp; 25419 if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { 25420 ip_process(proc, &mp1, ill_index); 25421 if (mp1 == NULL) 25422 return (NULL); 25423 25424 if (mp1->b_cont == NULL) 25425 ipha = NULL; 25426 else 25427 ipha = (ipha_t *)mp1->b_cont->b_rptr; 25428 } 25429 } 25430 25431 *iphap = ipha; 25432 return (mp1); 25433 #undef rptr 25434 } 25435 25436 /* 25437 * Finish the outbound IPsec processing for an IPv6 packet. This function 25438 * is called from ipsec_out_process() if the IPsec packet was processed 25439 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25440 * asynchronously. 
25441 */ 25442 void 25443 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 25444 ire_t *ire_arg) 25445 { 25446 in6_addr_t *v6dstp; 25447 ire_t *ire; 25448 mblk_t *mp; 25449 ip6_t *ip6h1; 25450 uint_t ill_index; 25451 ipsec_out_t *io; 25452 boolean_t hwaccel; 25453 uint32_t flags = IP6_NO_IPPOLICY; 25454 int match_flags; 25455 zoneid_t zoneid; 25456 boolean_t ill_need_rele = B_FALSE; 25457 boolean_t ire_need_rele = B_FALSE; 25458 ip_stack_t *ipst; 25459 25460 mp = ipsec_mp->b_cont; 25461 ip6h1 = (ip6_t *)mp->b_rptr; 25462 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25463 ASSERT(io->ipsec_out_ns != NULL); 25464 ipst = io->ipsec_out_ns->netstack_ip; 25465 ill_index = io->ipsec_out_ill_index; 25466 if (io->ipsec_out_reachable) { 25467 flags |= IPV6_REACHABILITY_CONFIRMATION; 25468 } 25469 hwaccel = io->ipsec_out_accelerated; 25470 zoneid = io->ipsec_out_zoneid; 25471 ASSERT(zoneid != ALL_ZONES); 25472 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 25473 /* Multicast addresses should have non-zero ill_index. */ 25474 v6dstp = &ip6h->ip6_dst; 25475 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 25476 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 25477 25478 if (ill == NULL && ill_index != 0) { 25479 ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); 25480 /* Failure case frees things for us. */ 25481 if (ill == NULL) 25482 return; 25483 25484 ill_need_rele = B_TRUE; 25485 } 25486 ASSERT(mp != NULL); 25487 25488 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 25489 boolean_t unspec_src; 25490 ipif_t *ipif; 25491 25492 /* 25493 * Use the ill_index to get the right ill. 25494 */ 25495 unspec_src = io->ipsec_out_unspec_src; 25496 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 25497 if (ipif == NULL) { 25498 if (ill_need_rele) 25499 ill_refrele(ill); 25500 freemsg(ipsec_mp); 25501 return; 25502 } 25503 25504 if (ire_arg != NULL) { 25505 ire = ire_arg; 25506 } else { 25507 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25508 zoneid, msg_getlabel(mp), match_flags, ipst); 25509 ire_need_rele = B_TRUE; 25510 } 25511 if (ire != NULL) { 25512 ipif_refrele(ipif); 25513 /* 25514 * XXX Do the multicast forwarding now, as the IPsec 25515 * processing has been done. 25516 */ 25517 goto send; 25518 } 25519 25520 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 25521 mp->b_prev = NULL; 25522 mp->b_next = NULL; 25523 25524 /* 25525 * If the IPsec packet was processed asynchronously, 25526 * drop it now. 25527 */ 25528 if (q == NULL) { 25529 if (ill_need_rele) 25530 ill_refrele(ill); 25531 freemsg(ipsec_mp); 25532 return; 25533 } 25534 25535 ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, 25536 unspec_src, zoneid); 25537 ipif_refrele(ipif); 25538 } else { 25539 if (ire_arg != NULL) { 25540 ire = ire_arg; 25541 } else { 25542 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); 25543 ire_need_rele = B_TRUE; 25544 } 25545 if (ire != NULL) 25546 goto send; 25547 /* 25548 * ire disappeared underneath. 25549 * 25550 * What we need to do here is the ip_newroute 25551 * logic to get the ire without doing the IPsec 25552 * processing. Follow the same old path. But this 25553 * time, ip_wput or ire_add_then_send will call us 25554 * directly as all the IPsec operations are done. 25555 */ 25556 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 25557 mp->b_prev = NULL; 25558 mp->b_next = NULL; 25559 25560 /* 25561 * If the IPsec packet was processed asynchronously, 25562 * drop it now. 
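		 * (An asynchronous caller has no queue to hand us, which
		 * is what the q == NULL test below detects; there is then
		 * nowhere to restart the ip_newroute_v6() resolution from,
		 * so the packet must be freed.)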
25563 */ 25564 if (q == NULL) { 25565 if (ill_need_rele) 25566 ill_refrele(ill); 25567 freemsg(ipsec_mp); 25568 return; 25569 } 25570 25571 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 25572 zoneid, ipst); 25573 } 25574 if (ill != NULL && ill_need_rele) 25575 ill_refrele(ill); 25576 return; 25577 send: 25578 if (ill != NULL && ill_need_rele) 25579 ill_refrele(ill); 25580 25581 /* Local delivery */ 25582 if (ire->ire_stq == NULL) { 25583 ill_t *out_ill; 25584 ASSERT(q != NULL); 25585 25586 /* PFHooks: LOOPBACK_OUT */ 25587 out_ill = ire_to_ill(ire); 25588 25589 /* 25590 * DTrace this as ip:::send. A blocked packet will fire the 25591 * send probe, but not the receive probe. 25592 */ 25593 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 25594 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, 25595 ipha_t *, NULL, ip6_t *, ip6h, int, 1); 25596 25597 DTRACE_PROBE4(ip6__loopback__out__start, 25598 ill_t *, NULL, ill_t *, out_ill, 25599 ip6_t *, ip6h1, mblk_t *, ipsec_mp); 25600 25601 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 25602 ipst->ips_ipv6firewall_loopback_out, 25603 NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); 25604 25605 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); 25606 25607 if (ipsec_mp != NULL) { 25608 ip_wput_local_v6(RD(q), out_ill, 25609 ip6h, ipsec_mp, ire, 0, zoneid); 25610 } 25611 if (ire_need_rele) 25612 ire_refrele(ire); 25613 return; 25614 } 25615 /* 25616 * Everything is done. Send it out on the wire. 25617 * We force the insertion of a fragment header using the 25618 * IPH_FRAG_HDR flag in two cases: 25619 * - after reception of an ICMPv6 "packet too big" message 25620 * with a MTU < 1280 (cf. RFC 2460 section 5) 25621 * - for multirouted IPv6 packets, so that the receiver can 25622 * discard duplicates according to their fragment identifier 25623 */ 25624 /* XXX fix flow control problems. */ 25625 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 25626 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 25627 if (hwaccel) { 25628 /* 25629 * hardware acceleration does not handle these 25630 * "slow path" cases. 25631 */ 25632 /* IPsec KSTATS: should bump bean counter here. */ 25633 if (ire_need_rele) 25634 ire_refrele(ire); 25635 freemsg(ipsec_mp); 25636 return; 25637 } 25638 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 25639 (mp->b_cont ? msgdsize(mp) : 25640 mp->b_wptr - (uchar_t *)ip6h)) { 25641 /* IPsec KSTATS: should bump bean counter here. */ 25642 ip0dbg(("Packet length mismatch: %d, %ld\n", 25643 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 25644 msgdsize(mp))); 25645 if (ire_need_rele) 25646 ire_refrele(ire); 25647 freemsg(ipsec_mp); 25648 return; 25649 } 25650 ASSERT(mp->b_prev == NULL); 25651 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 25652 ntohs(ip6h->ip6_plen) + 25653 IPV6_HDR_LEN, ire->ire_max_frag)); 25654 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 25655 ire->ire_max_frag); 25656 } else { 25657 UPDATE_OB_PKT_COUNT(ire); 25658 ire->ire_last_used_time = lbolt; 25659 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? 
io : NULL); 25660 } 25661 if (ire_need_rele) 25662 ire_refrele(ire); 25663 freeb(ipsec_mp); 25664 } 25665 25666 void 25667 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 25668 { 25669 mblk_t *hada_mp; /* attributes M_CTL mblk */ 25670 da_ipsec_t *hada; /* data attributes */ 25671 ill_t *ill = (ill_t *)q->q_ptr; 25672 25673 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 25674 25675 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 25676 /* IPsec KSTATS: Bump lose counter here! */ 25677 freemsg(mp); 25678 return; 25679 } 25680 25681 /* 25682 * It's an IPsec packet that must be 25683 * accelerated by the Provider, and the 25684 * outbound ill is IPsec acceleration capable. 25685 * Prepends the mblk with an IPHADA_M_CTL, and ship it 25686 * to the ill. 25687 * IPsec KSTATS: should bump packet counter here. 25688 */ 25689 25690 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 25691 if (hada_mp == NULL) { 25692 /* IPsec KSTATS: should bump packet counter here. */ 25693 freemsg(mp); 25694 return; 25695 } 25696 25697 hada_mp->b_datap->db_type = M_CTL; 25698 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 25699 hada_mp->b_cont = mp; 25700 25701 hada = (da_ipsec_t *)hada_mp->b_rptr; 25702 bzero(hada, sizeof (da_ipsec_t)); 25703 hada->da_type = IPHADA_M_CTL; 25704 25705 putnext(q, hada_mp); 25706 } 25707 25708 /* 25709 * Finish the outbound IPsec processing. This function is called from 25710 * ipsec_out_process() if the IPsec packet was processed 25711 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25712 * asynchronously. 25713 */ 25714 void 25715 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 25716 ire_t *ire_arg) 25717 { 25718 uint32_t v_hlen_tos_len; 25719 ipaddr_t dst; 25720 ipif_t *ipif = NULL; 25721 ire_t *ire; 25722 ire_t *ire1 = NULL; 25723 mblk_t *next_mp = NULL; 25724 uint32_t max_frag; 25725 boolean_t multirt_send = B_FALSE; 25726 mblk_t *mp; 25727 ipha_t *ipha1; 25728 uint_t ill_index; 25729 ipsec_out_t *io; 25730 int match_flags; 25731 irb_t *irb = NULL; 25732 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 25733 zoneid_t zoneid; 25734 ipxmit_state_t pktxmit_state; 25735 ip_stack_t *ipst; 25736 25737 #ifdef _BIG_ENDIAN 25738 #define LENGTH (v_hlen_tos_len & 0xFFFF) 25739 #else 25740 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 25741 #endif 25742 25743 mp = ipsec_mp->b_cont; 25744 ipha1 = (ipha_t *)mp->b_rptr; 25745 ASSERT(mp != NULL); 25746 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 25747 dst = ipha->ipha_dst; 25748 25749 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25750 ill_index = io->ipsec_out_ill_index; 25751 zoneid = io->ipsec_out_zoneid; 25752 ASSERT(zoneid != ALL_ZONES); 25753 ipst = io->ipsec_out_ns->netstack_ip; 25754 ASSERT(io->ipsec_out_ns != NULL); 25755 25756 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 25757 if (ill == NULL && ill_index != 0) { 25758 ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); 25759 /* Failure case frees things for us. */ 25760 if (ill == NULL) 25761 return; 25762 25763 ill_need_rele = B_TRUE; 25764 } 25765 25766 if (CLASSD(dst)) { 25767 boolean_t conn_dontroute; 25768 /* 25769 * Use the ill_index to get the right ipif. 
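		 * (A zero ill_index means no interface was specified; the
		 * code below then falls back to ipif_lookup_group() on the
		 * multicast destination itself.)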
		 */
		conn_dontroute = io->ipsec_out_dontroute;
		if (ill_index == 0)
			ipif = ipif_lookup_group(dst, zoneid, ipst);
		else
			(void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif);
		if (ipif == NULL) {
			ip1dbg(("ip_wput_ipsec_out: No ipif for"
			    " multicast\n"));
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
			freemsg(ipsec_mp);
			goto done;
		}
		/*
		 * ipha_src has already been initialized with the
		 * value of the ipif in ip_wput. All we need now is
		 * an ire to send this downstream.
		 */
		ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid,
		    msg_getlabel(mp), match_flags, ipst);
		if (ire != NULL) {
			ill_t *ill1;
			/*
			 * Do the multicast forwarding now, as the IPsec
			 * processing has been done.
			 */
			if (ipst->ips_ip_g_mrouter && !conn_dontroute &&
			    (ill1 = ire_to_ill(ire))) {
				if (ip_mforward(ill1, ipha, mp)) {
					freemsg(ipsec_mp);
					ip1dbg(("ip_wput_ipsec_out: mforward "
					    "failed\n"));
					ire_refrele(ire);
					goto done;
				}
			}
			goto send;
		}

		ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n"));
		mp->b_prev = NULL;
		mp->b_next = NULL;

		/*
		 * If the IPsec packet was processed asynchronously,
		 * drop it now.
		 */
		if (q == NULL) {
			freemsg(ipsec_mp);
			goto done;
		}

		/*
		 * We may be using a wrong ipif to create the ire.
		 * But it is okay as the source address is assigned
		 * for the packet already. The next outbound packet
		 * would create the IRE with the right IPIF in ip_wput.
		 *
		 * Also handle RTF_MULTIRT routes.
		 */
		ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT,
		    zoneid, &zero_info);
	} else {
		if (ire_arg != NULL) {
			ire = ire_arg;
			ire_need_rele = B_FALSE;
		} else {
			ire = ire_cache_lookup(dst, zoneid,
			    msg_getlabel(mp), ipst);
		}
		if (ire != NULL) {
			goto send;
		}

		/*
		 * The ire disappeared underneath us.
		 *
		 * What we need to do here is the ip_newroute
		 * logic to get the ire without doing the IPsec
		 * processing. Follow the same old path. But this
		 * time, ip_wput or ire_add_then_send will call us
		 * directly as all the IPsec operations are done.
		 */
		ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n"));
		mp->b_prev = NULL;
		mp->b_next = NULL;

		/*
		 * If the IPsec packet was processed asynchronously,
		 * drop it now.
		 */
		if (q == NULL) {
			freemsg(ipsec_mp);
			goto done;
		}

		/*
		 * Since we're going through ip_newroute() again, we
		 * need to make sure we don't:
		 *
		 * 1.) Trigger the ASSERT() with the ipha_ident
		 *     overloading.
		 * 2.) Redo transport-layer checksumming, since we've
		 *     already done all that to get this far.
		 *
		 * The easiest way to avoid doing either of the above
		 * is to set the ipha_ident field to IP_HDR_INCLUDED.
		 */
		ipha->ipha_ident = IP_HDR_INCLUDED;
		ip_newroute(q, ipsec_mp, dst,
		    (CONN_Q(q) ? Q_TO_CONN(q) : NULL), zoneid, ipst);
	}
	goto done;
send:
	if (ire->ire_stq == NULL) {
		ill_t *out_ill;
		/*
		 * Loopbacks go through ip_wput_local except for one case.
		 * We come here if we generate an icmp_frag_needed message
		 * after IPsec processing is over.
When this function calls 25890 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 25891 * icmp_frag_needed. The message generated comes back here 25892 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 25893 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 25894 * source address as it is usually set in ip_wput_ire. As 25895 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 25896 * and we end up here. We can't enter ip_wput_ire once the 25897 * IPsec processing is over and hence we need to do it here. 25898 */ 25899 ASSERT(q != NULL); 25900 UPDATE_OB_PKT_COUNT(ire); 25901 ire->ire_last_used_time = lbolt; 25902 if (ipha->ipha_src == 0) 25903 ipha->ipha_src = ire->ire_src_addr; 25904 25905 /* PFHooks: LOOPBACK_OUT */ 25906 out_ill = ire_to_ill(ire); 25907 25908 /* 25909 * DTrace this as ip:::send. A blocked packet will fire the 25910 * send probe, but not the receive probe. 25911 */ 25912 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 25913 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 25914 ipha_t *, ipha, ip6_t *, NULL, int, 1); 25915 25916 DTRACE_PROBE4(ip4__loopback__out__start, 25917 ill_t *, NULL, ill_t *, out_ill, 25918 ipha_t *, ipha1, mblk_t *, ipsec_mp); 25919 25920 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 25921 ipst->ips_ipv4firewall_loopback_out, 25922 NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); 25923 25924 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); 25925 25926 if (ipsec_mp != NULL) 25927 ip_wput_local(RD(q), out_ill, 25928 ipha, ipsec_mp, ire, 0, zoneid); 25929 if (ire_need_rele) 25930 ire_refrele(ire); 25931 goto done; 25932 } 25933 25934 if (ire->ire_max_frag < (unsigned int)LENGTH) { 25935 /* 25936 * We are through with IPsec processing. 25937 * Fragment this and send it on the wire. 25938 */ 25939 if (io->ipsec_out_accelerated) { 25940 /* 25941 * The packet has been accelerated but must 25942 * be fragmented. This should not happen 25943 * since AH and ESP must not accelerate 25944 * packets that need fragmentation, however 25945 * the configuration could have changed 25946 * since the AH or ESP processing. 25947 * Drop packet. 25948 * IPsec KSTATS: bump bean counter here. 25949 */ 25950 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 25951 "fragmented accelerated packet!\n")); 25952 freemsg(ipsec_mp); 25953 } else { 25954 ip_wput_ire_fragmentit(ipsec_mp, ire, 25955 zoneid, ipst, NULL); 25956 } 25957 if (ire_need_rele) 25958 ire_refrele(ire); 25959 goto done; 25960 } 25961 25962 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 25963 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 25964 (void *)ire->ire_ipif, (void *)ipif)); 25965 25966 /* 25967 * Multiroute the secured packet. 25968 */ 25969 if (ire->ire_flags & RTF_MULTIRT) { 25970 ire_t *first_ire; 25971 irb = ire->ire_bucket; 25972 ASSERT(irb != NULL); 25973 /* 25974 * This ire has been looked up as the one that 25975 * goes through the given ipif; 25976 * make sure we do not omit any other multiroute ire 25977 * that may be present in the bucket before this one. 
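		 * (The walk below takes the first ire in the bucket that
		 * has RTF_MULTIRT set, shares our ire_addr, and is neither
		 * condemned nor test-hidden, and switches to it only if it
		 * can carry this packet without fragmentation.)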
25978 */ 25979 IRB_REFHOLD(irb); 25980 for (first_ire = irb->irb_ire; 25981 first_ire != NULL; 25982 first_ire = first_ire->ire_next) { 25983 if ((first_ire->ire_flags & RTF_MULTIRT) && 25984 (first_ire->ire_addr == ire->ire_addr) && 25985 !(first_ire->ire_marks & 25986 (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) 25987 break; 25988 } 25989 25990 if ((first_ire != NULL) && (first_ire != ire)) { 25991 /* 25992 * Don't change the ire if the packet must 25993 * be fragmented if sent via this new one. 25994 */ 25995 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 25996 IRE_REFHOLD(first_ire); 25997 if (ire_need_rele) 25998 ire_refrele(ire); 25999 else 26000 ire_need_rele = B_TRUE; 26001 ire = first_ire; 26002 } 26003 } 26004 IRB_REFRELE(irb); 26005 26006 multirt_send = B_TRUE; 26007 max_frag = ire->ire_max_frag; 26008 } 26009 26010 /* 26011 * In most cases, the emission loop below is entered only once. 26012 * Only in the case where the ire holds the RTF_MULTIRT 26013 * flag, we loop to process all RTF_MULTIRT ires in the 26014 * bucket, and send the packet through all crossed 26015 * RTF_MULTIRT routes. 26016 */ 26017 do { 26018 if (multirt_send) { 26019 /* 26020 * ire1 holds here the next ire to process in the 26021 * bucket. If multirouting is expected, 26022 * any non-RTF_MULTIRT ire that has the 26023 * right destination address is ignored. 26024 */ 26025 ASSERT(irb != NULL); 26026 IRB_REFHOLD(irb); 26027 for (ire1 = ire->ire_next; 26028 ire1 != NULL; 26029 ire1 = ire1->ire_next) { 26030 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 26031 continue; 26032 if (ire1->ire_addr != ire->ire_addr) 26033 continue; 26034 if (ire1->ire_marks & 26035 (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) 26036 continue; 26037 /* No loopback here */ 26038 if (ire1->ire_stq == NULL) 26039 continue; 26040 /* 26041 * Ensure we do not exceed the MTU 26042 * of the next route. 26043 */ 26044 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 26045 ip_multirt_bad_mtu(ire1, max_frag); 26046 continue; 26047 } 26048 26049 IRE_REFHOLD(ire1); 26050 break; 26051 } 26052 IRB_REFRELE(irb); 26053 if (ire1 != NULL) { 26054 /* 26055 * We are in a multiple send case, need to 26056 * make a copy of the packet. 26057 */ 26058 next_mp = copymsg(ipsec_mp); 26059 if (next_mp == NULL) { 26060 ire_refrele(ire1); 26061 ire1 = NULL; 26062 } 26063 } 26064 } 26065 /* 26066 * Everything is done. Send it out on the wire 26067 * 26068 * ip_xmit_v4 will call ip_wput_attach_llhdr and then 26069 * either send it on the wire or, in the case of 26070 * HW acceleration, call ipsec_hw_putnext. 26071 */ 26072 if (ire->ire_nce && 26073 ire->ire_nce->nce_state != ND_REACHABLE) { 26074 DTRACE_PROBE2(ip__wput__ipsec__bail, 26075 (ire_t *), ire, (mblk_t *), ipsec_mp); 26076 /* 26077 * If ire's link-layer is unresolved (this 26078 * would only happen if the incomplete ire 26079 * was added to cachetable via forwarding path) 26080 * don't bother going to ip_xmit_v4. Just drop the 26081 * packet. 26082 * There is a slight risk here, in that, if we 26083 * have the forwarding path create an incomplete 26084 * IRE, then until the IRE is completed, any 26085 * transmitted IPsec packets will be dropped 26086 * instead of being queued waiting for resolution. 26087 * 26088 * But the likelihood of a forwarding packet and a wput 26089 * packet sending to the same dst at the same time 26090 * and there not yet be an ARP entry for it is small. 
			 * Furthermore, if this actually happens, it is
			 * likely that wput would generate multiple
			 * packets (and forwarding would also have a train
			 * of packets) for that destination. If this is
			 * the case, some of them would have been dropped
			 * anyway, since ARP only queues a few packets while
			 * waiting for resolution.
			 *
			 * NOTE: We should really call ip_xmit_v4,
			 * and let it queue the packet and send the
			 * ARP query and have ARP come back thus:
			 * <ARP> ip_wput->ip_output->ip_wput_nondata->
			 * ip_xmit_v4->ip_wput_attach_llhdr + ipsec
			 * hw accel work. But it's too complex to get
			 * the IPsec hw acceleration approach to fit
			 * well with ip_xmit_v4 doing ARP without
			 * doing IPsec simplification. For now, we just
			 * poke ip_xmit_v4 to trigger the arp resolve, so
			 * that we can continue with the send on the next
			 * attempt.
			 *
			 * XXX This should be revisited when
			 * the IPsec/IP interaction is cleaned up.
			 */
			ip1dbg(("ip_wput_ipsec_out: ire is incomplete"
			    " - dropping packet\n"));
			freemsg(ipsec_mp);
			/*
			 * Call ip_xmit_v4() to trigger ARP query
			 * in case the nce_state is ND_INITIAL.
			 */
			(void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
			goto drop_pkt;
		}

		DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL,
		    ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1,
		    mblk_t *, ipsec_mp);
		FW_HOOKS(ipst->ips_ip4_physical_out_event,
		    ipst->ips_ipv4firewall_physical_out, NULL,
		    ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst);
		DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp);
		if (ipsec_mp == NULL)
			goto drop_pkt;

		ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n"));
		pktxmit_state = ip_xmit_v4(mp, ire,
		    (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL);

		if ((pktxmit_state == SEND_FAILED) ||
		    (pktxmit_state == LLHDR_RESLV_FAILED)) {

			freeb(ipsec_mp);	/* ip_xmit_v4 frees the mp */
drop_pkt:
			BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib,
			    ipIfStatsOutDiscards);
			if (ire_need_rele)
				ire_refrele(ire);
			if (ire1 != NULL) {
				ire_refrele(ire1);
				freemsg(next_mp);
			}
			goto done;
		}

		freeb(ipsec_mp);
		if (ire_need_rele)
			ire_refrele(ire);

		if (ire1 != NULL) {
			ire = ire1;
			ire_need_rele = B_TRUE;
			ASSERT(next_mp);
			ipsec_mp = next_mp;
			mp = ipsec_mp->b_cont;
			ire1 = NULL;
			next_mp = NULL;
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			multirt_send = B_FALSE;
		}
	} while (multirt_send);
done:
	if (ill != NULL && ill_need_rele)
		ill_refrele(ill);
	if (ipif != NULL)
		ipif_refrele(ipif);
}

/*
 * Get the ill corresponding to the specified ire, and compare its
 * capabilities with the protocol and algorithms specified by the
 * SA obtained from ipsec_out. If they match, annotate the
 * ipsec_out structure to indicate that the packet needs acceleration.
 *
 * A packet is eligible for outbound hardware acceleration if the
 * following conditions are satisfied:
 *
 * 1. the packet will not be fragmented
 * 2. the provider supports the algorithm
 * 3. there is no pending control message being exchanged
 * 4. snoop is not attached
 * 5. the destination address is not a broadcast or multicast address.
 *
 * Rationale:
 *	- Hardware drivers do not support fragmentation with
 *	  the current interface.
 *	- snoop, multicast, and broadcast may result in exposure of
 *	  a cleartext datagram.
 * We check all five of these conditions here.
 *
 * XXX would like to nuke "ire_t *" parameter here; problem is that
 * IRE is the only way to figure out if a v4 address is a broadcast and
 * thus ineligible for acceleration...
 */
static void
ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire)
{
	ipsec_out_t *io;
	mblk_t *data_mp;
	uint_t plen, overhead;
	ip_stack_t *ipst;
	phyint_t *phyint;

	if ((sa->ipsa_flags & IPSA_F_HW) == 0)
		return;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	phyint = ill->ill_phyint;

	/*
	 * Destination address is a broadcast or multicast. Punt.
	 */
	if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK|
	    IRE_LOCAL)))
		return;

	data_mp = ipsec_mp->b_cont;

	if (ill->ill_isv6) {
		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;

		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
			return;

		plen = ip6h->ip6_plen;
	} else {
		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;

		if (CLASSD(ipha->ipha_dst))
			return;

		plen = ipha->ipha_length;
	}
	/*
	 * Is there a pending DLPI control message being exchanged
	 * between IP/IPsec and the DLS Provider? If there is, it
	 * could be a SADB update, and the state of the DLS Provider
	 * SADB might not be in sync with the SADB maintained by
	 * IPsec. To avoid dropping packets or using the wrong keying
	 * material, we do not accelerate this packet.
	 */
	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_is_accelerated: "
		    "ill_dlpi_pending! don't accelerate packet\n"));
		return;
	}

	/*
	 * Is the Provider in promiscuous mode? If it is, we don't
	 * accelerate the packet since it will bounce back up to the
	 * listeners in the clear.
	 */
	if (phyint->phyint_flags & PHYI_PROMISC) {
		IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_is_accelerated: "
		    "ill in promiscuous mode, don't accelerate packet\n"));
		return;
	}

	/*
	 * Will the packet require fragmentation?
	 */

	/*
	 * IPsec ESP note: this is a pessimistic estimate, but the same
	 * as is used elsewhere.
	 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1)
	 * + 2-byte trailer
	 */
	overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE :
	    IPSEC_BASE_ESP_HDR_SIZE(sa);

	if ((plen + overhead) > ill->ill_max_mtu)
		return;

	io = (ipsec_out_t *)ipsec_mp->b_rptr;

	/*
	 * Can the ill accelerate this IPsec protocol and algorithm
	 * specified by the SA?
	 */
	if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index,
	    ill->ill_isv6, sa, ipst->ips_netstack)) {
		return;
	}

	/*
	 * Tell AH or ESP that the outbound ill is capable of
	 * accelerating this packet.
	 */
	io->ipsec_out_is_capab_ill = B_TRUE;
}

/*
 * Select which AH & ESP SA's to use (if any) for the outbound packet.
26309 * 26310 * If this function returns B_TRUE, the requested SA's have been filled 26311 * into the ipsec_out_*_sa pointers. 26312 * 26313 * If the function returns B_FALSE, the packet has been "consumed", most 26314 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 26315 * 26316 * The SA references created by the protocol-specific "select" 26317 * function will be released when the ipsec_mp is freed, thanks to the 26318 * ipsec_out_free destructor -- see spd.c. 26319 */ 26320 static boolean_t 26321 ipsec_out_select_sa(mblk_t *ipsec_mp) 26322 { 26323 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 26324 ipsec_out_t *io; 26325 ipsec_policy_t *pp; 26326 ipsec_action_t *ap; 26327 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26328 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26329 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26330 26331 if (!io->ipsec_out_secure) { 26332 /* 26333 * We came here by mistake. 26334 * Don't bother with ipsec processing 26335 * We should "discourage" this path in the future. 26336 */ 26337 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26338 return (B_FALSE); 26339 } 26340 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26341 ASSERT((io->ipsec_out_policy != NULL) || 26342 (io->ipsec_out_act != NULL)); 26343 26344 ASSERT(io->ipsec_out_failed == B_FALSE); 26345 26346 /* 26347 * IPsec processing has started. 26348 */ 26349 io->ipsec_out_proc_begin = B_TRUE; 26350 ap = io->ipsec_out_act; 26351 if (ap == NULL) { 26352 pp = io->ipsec_out_policy; 26353 ASSERT(pp != NULL); 26354 ap = pp->ipsp_act; 26355 ASSERT(ap != NULL); 26356 } 26357 26358 /* 26359 * We have an action. now, let's select SA's. 26360 * (In the future, we can cache this in the conn_t..) 26361 */ 26362 if (ap->ipa_want_esp) { 26363 if (io->ipsec_out_esp_sa == NULL) { 26364 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 26365 IPPROTO_ESP); 26366 } 26367 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 26368 } 26369 26370 if (ap->ipa_want_ah) { 26371 if (io->ipsec_out_ah_sa == NULL) { 26372 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 26373 IPPROTO_AH); 26374 } 26375 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 26376 /* 26377 * The ESP and AH processing order needs to be preserved 26378 * when both protocols are required (ESP should be applied 26379 * before AH for an outbound packet). Force an ESP ACQUIRE 26380 * when both ESP and AH are required, and an AH ACQUIRE 26381 * is needed. 26382 */ 26383 if (ap->ipa_want_esp && need_ah_acquire) 26384 need_esp_acquire = B_TRUE; 26385 } 26386 26387 /* 26388 * Send an ACQUIRE (extended, regular, or both) if we need one. 26389 * Release SAs that got referenced, but will not be used until we 26390 * acquire _all_ of the SAs we need. 26391 */ 26392 if (need_ah_acquire || need_esp_acquire) { 26393 if (io->ipsec_out_ah_sa != NULL) { 26394 IPSA_REFRELE(io->ipsec_out_ah_sa); 26395 io->ipsec_out_ah_sa = NULL; 26396 } 26397 if (io->ipsec_out_esp_sa != NULL) { 26398 IPSA_REFRELE(io->ipsec_out_esp_sa); 26399 io->ipsec_out_esp_sa = NULL; 26400 } 26401 26402 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 26403 return (B_FALSE); 26404 } 26405 26406 return (B_TRUE); 26407 } 26408 26409 /* 26410 * Process an IPSEC_OUT message and see what you can 26411 * do with it. 26412 * IPQoS Notes: 26413 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 26414 * IPsec. 26415 * XXX would like to nuke ire_t. 
26416 * XXX ill_index better be "real" 26417 */ 26418 void 26419 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 26420 { 26421 ipsec_out_t *io; 26422 ipsec_policy_t *pp; 26423 ipsec_action_t *ap; 26424 ipha_t *ipha; 26425 ip6_t *ip6h; 26426 mblk_t *mp; 26427 ill_t *ill; 26428 zoneid_t zoneid; 26429 ipsec_status_t ipsec_rc; 26430 boolean_t ill_need_rele = B_FALSE; 26431 ip_stack_t *ipst; 26432 ipsec_stack_t *ipss; 26433 26434 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26435 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26436 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26437 ipst = io->ipsec_out_ns->netstack_ip; 26438 mp = ipsec_mp->b_cont; 26439 26440 /* 26441 * Initiate IPPF processing. We do it here to account for packets 26442 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 26443 * We can check for ipsec_out_proc_begin even for such packets, as 26444 * they will always be false (asserted below). 26445 */ 26446 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { 26447 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? 26448 io->ipsec_out_ill_index : ill_index); 26449 if (mp == NULL) { 26450 ip2dbg(("ipsec_out_process: packet dropped "\ 26451 "during IPPF processing\n")); 26452 freeb(ipsec_mp); 26453 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26454 return; 26455 } 26456 } 26457 26458 if (!io->ipsec_out_secure) { 26459 /* 26460 * We came here by mistake. 26461 * Don't bother with ipsec processing 26462 * Should "discourage" this path in the future. 26463 */ 26464 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26465 goto done; 26466 } 26467 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26468 ASSERT((io->ipsec_out_policy != NULL) || 26469 (io->ipsec_out_act != NULL)); 26470 ASSERT(io->ipsec_out_failed == B_FALSE); 26471 26472 ipss = ipst->ips_netstack->netstack_ipsec; 26473 if (!ipsec_loaded(ipss)) { 26474 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 26475 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26476 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26477 } else { 26478 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 26479 } 26480 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 26481 DROPPER(ipss, ipds_ip_ipsec_not_loaded), 26482 &ipss->ipsec_dropper); 26483 return; 26484 } 26485 26486 /* 26487 * IPsec processing has started. 26488 */ 26489 io->ipsec_out_proc_begin = B_TRUE; 26490 ap = io->ipsec_out_act; 26491 if (ap == NULL) { 26492 pp = io->ipsec_out_policy; 26493 ASSERT(pp != NULL); 26494 ap = pp->ipsp_act; 26495 ASSERT(ap != NULL); 26496 } 26497 26498 /* 26499 * Save the outbound ill index. When the packet comes back 26500 * from IPsec, we make sure the ill hasn't changed or disappeared 26501 * before sending it the accelerated packet. 26502 */ 26503 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 26504 ill = ire_to_ill(ire); 26505 io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; 26506 } 26507 26508 /* 26509 * The order of processing is first insert a IP header if needed. 26510 * Then insert the ESP header and then the AH header. 26511 */ 26512 if ((io->ipsec_out_se_done == B_FALSE) && 26513 (ap->ipa_want_se)) { 26514 /* 26515 * First get the outer IP header before sending 26516 * it to ESP. 
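		 * The resulting chain is, roughly:
		 *
		 *   ipsec_mp -> [outer ipha, IPPROTO_ENCAP] ->
		 *	[inner ipha | data]
		 *
		 * with the outer header copied from the inner one and its
		 * length and checksum recomputed below.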
		 */
		ipha_t *oipha, *iipha;
		mblk_t *outer_mp, *inner_mp;

		if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
			(void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE,
			    "ipsec_out_process: "
			    "Self-Encapsulation failed: Out of memory\n");
			freemsg(ipsec_mp);
			if (ill != NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			} else {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
			}
			return;
		}
		inner_mp = ipsec_mp->b_cont;
		ASSERT(inner_mp->b_datap->db_type == M_DATA);
		oipha = (ipha_t *)outer_mp->b_rptr;
		iipha = (ipha_t *)inner_mp->b_rptr;
		*oipha = *iipha;
		outer_mp->b_wptr += sizeof (ipha_t);
		oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
		    sizeof (ipha_t));
		oipha->ipha_protocol = IPPROTO_ENCAP;
		oipha->ipha_version_and_hdr_length =
		    IP_SIMPLE_HDR_VERSION;
		oipha->ipha_hdr_checksum = 0;
		oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
		outer_mp->b_cont = inner_mp;
		ipsec_mp->b_cont = outer_mp;

		io->ipsec_out_se_done = B_TRUE;
		io->ipsec_out_tunnel = B_TRUE;
	}

	if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) ||
	    (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) &&
	    !ipsec_out_select_sa(ipsec_mp))
		return;

	/*
	 * By now, we know what SA's to use. Toss over to ESP & AH
	 * to do the heavy lifting.
	 */
	zoneid = io->ipsec_out_zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) {
		ASSERT(io->ipsec_out_esp_sa != NULL);
		io->ipsec_out_esp_done = B_TRUE;
		/*
		 * Note that since hw accel can only apply one transform,
		 * not two, we skip hw accel for ESP if we also have AH.
		 * This is a design limitation of the interface
		 * which should be revisited.
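		 * Hence the ipsec_out_ah_sa check below: ESP hardware
		 * acceleration is only attempted when no AH SA is
		 * required as well.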
26573 */ 26574 ASSERT(ire != NULL); 26575 if (io->ipsec_out_ah_sa == NULL) { 26576 ill = (ill_t *)ire->ire_stq->q_ptr; 26577 ipsec_out_is_accelerated(ipsec_mp, 26578 io->ipsec_out_esp_sa, ill, ire); 26579 } 26580 26581 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 26582 switch (ipsec_rc) { 26583 case IPSEC_STATUS_SUCCESS: 26584 break; 26585 case IPSEC_STATUS_FAILED: 26586 if (ill != NULL) { 26587 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26588 } else { 26589 BUMP_MIB(&ipst->ips_ip_mib, 26590 ipIfStatsOutDiscards); 26591 } 26592 /* FALLTHRU */ 26593 case IPSEC_STATUS_PENDING: 26594 return; 26595 } 26596 } 26597 26598 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 26599 ASSERT(io->ipsec_out_ah_sa != NULL); 26600 io->ipsec_out_ah_done = B_TRUE; 26601 if (ire == NULL) { 26602 int idx = io->ipsec_out_capab_ill_index; 26603 ill = ill_lookup_on_ifindex(idx, B_FALSE, 26604 NULL, NULL, NULL, NULL, ipst); 26605 ill_need_rele = B_TRUE; 26606 } else { 26607 ill = (ill_t *)ire->ire_stq->q_ptr; 26608 } 26609 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 26610 ire); 26611 26612 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 26613 switch (ipsec_rc) { 26614 case IPSEC_STATUS_SUCCESS: 26615 break; 26616 case IPSEC_STATUS_FAILED: 26617 if (ill != NULL) { 26618 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26619 } else { 26620 BUMP_MIB(&ipst->ips_ip_mib, 26621 ipIfStatsOutDiscards); 26622 } 26623 /* FALLTHRU */ 26624 case IPSEC_STATUS_PENDING: 26625 if (ill != NULL && ill_need_rele) 26626 ill_refrele(ill); 26627 return; 26628 } 26629 } 26630 /* 26631 * We are done with IPsec processing. Send it over the wire. 26632 */ 26633 done: 26634 mp = ipsec_mp->b_cont; 26635 ipha = (ipha_t *)mp->b_rptr; 26636 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26637 ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, 26638 ire); 26639 } else { 26640 ip6h = (ip6_t *)ipha; 26641 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, 26642 ire); 26643 } 26644 if (ill != NULL && ill_need_rele) 26645 ill_refrele(ill); 26646 } 26647 26648 /* ARGSUSED */ 26649 void 26650 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 26651 { 26652 opt_restart_t *or; 26653 int err; 26654 conn_t *connp; 26655 cred_t *cr; 26656 26657 ASSERT(CONN_Q(q)); 26658 connp = Q_TO_CONN(q); 26659 26660 ASSERT(first_mp->b_datap->db_type == M_CTL); 26661 or = (opt_restart_t *)first_mp->b_rptr; 26662 /* 26663 * We checked for a db_credp the first time svr4_optcom_req 26664 * was called (from ip_wput_nondata). So we can just ASSERT here. 26665 */ 26666 cr = msg_getcred(first_mp, NULL); 26667 ASSERT(cr != NULL); 26668 26669 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 26670 err = svr4_optcom_req(q, first_mp, cr, 26671 &ip_opt_obj, B_FALSE); 26672 } else { 26673 ASSERT(or->or_type == T_OPTMGMT_REQ); 26674 err = tpi_optcom_req(q, first_mp, cr, 26675 &ip_opt_obj, B_FALSE); 26676 } 26677 if (err != EINPROGRESS) { 26678 /* operation is done */ 26679 CONN_OPER_PENDING_DONE(connp); 26680 } 26681 } 26682 26683 /* 26684 * ioctls that go through a down/up sequence may need to wait for the down 26685 * to complete. This involves waiting for the ire and ipif refcnts to go down 26686 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
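 * For example, an address change entails: ipif_down() -> wait for the
 * ire/ipif refcnts to drain -> set the address -> ipif_up(), with this
 * function restarting the ioctl after the wait.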
 */
/* ARGSUSED */
void
ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	struct iocblk *iocp;
	mblk_t *mp1;
	ip_ioctl_cmd_t *ipip;
	int err;
	sin_t *sin;
	struct lifreq *lifr;
	struct ifreq *ifr;

	iocp = (struct iocblk *)mp->b_rptr;
	ASSERT(ipsq != NULL);
	/* Existence of mp1 verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
		/*
		 * Special case where ipx_current_ipif is not set:
		 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
		 * We are here because we were not able to complete the
		 * operation in ipif_set_values, as we could not become
		 * exclusive on the new ipsq.
		 */
		ill_t *ill = q->q_ptr;
		ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
	}
	ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);

	if (ipip->ipi_cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		sin = (sin_t *)&ifr->ifr_addr;
	} else if (ipip->ipi_cmd_type == LIF_CMD) {
		/* This is a new style SIOC[GS]LIF* command */
		lifr = (struct lifreq *)mp1->b_rptr;
		sin = (sin_t *)&lifr->lifr_addr;
	} else {
		sin = NULL;
	}

	err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
	    q, mp, ipip, mp1->b_rptr);

	ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
}

/*
 * ioctl processing
 *
 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
 * the ioctl command in the ioctl tables, determines the copyin data size
 * from the ipi_copyin_size field, and does an mi_copyin() of that size.
 *
 * ioctl processing then continues when the M_IOCDATA makes its way down to
 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its
 * associated 'conn' is refheld till the end of the ioctl and the general
 * ioctl processing function ip_process_ioctl() is called to extract the
 * arguments and process the ioctl. To simplify extraction, ioctl commands
 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
 * is used to extract the ioctl's arguments.
 *
 * ip_process_ioctl determines if the ioctl needs to be serialized, and if
 * so goes thru the serialization primitive ipsq_try_enter. Then the
 * appropriate function to handle the ioctl is called based on the entry in
 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish,
 * which also refreleases the 'conn' that was refheld at the start of the
 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
 *
 * Many exclusive ioctls go thru an internal down/up sequence as part of
 * the operation. For example, an attempt to change the IP address of an
 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
 * does all the cleanup such as deleting all ires that use this address.
 * Then we need to wait till all references to the interface go away.
26764 */ 26765 void 26766 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 26767 { 26768 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 26769 ip_ioctl_cmd_t *ipip = arg; 26770 ip_extract_func_t *extract_funcp; 26771 cmd_info_t ci; 26772 int err; 26773 boolean_t entered_ipsq = B_FALSE; 26774 26775 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); 26776 26777 if (ipip == NULL) 26778 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 26779 26780 /* 26781 * SIOCLIFADDIF needs to go thru a special path since the 26782 * ill may not exist yet. This happens in the case of lo0 26783 * which is created using this ioctl. 26784 */ 26785 if (ipip->ipi_cmd == SIOCLIFADDIF) { 26786 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); 26787 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 26788 return; 26789 } 26790 26791 ci.ci_ipif = NULL; 26792 if (ipip->ipi_cmd_type == MISC_CMD) { 26793 /* 26794 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 26795 */ 26796 if (ipip->ipi_cmd == IF_UNITSEL) { 26797 /* ioctl comes down the ill */ 26798 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif; 26799 ipif_refhold(ci.ci_ipif); 26800 } 26801 err = 0; 26802 ci.ci_sin = NULL; 26803 ci.ci_sin6 = NULL; 26804 ci.ci_lifr = NULL; 26805 } else { 26806 switch (ipip->ipi_cmd_type) { 26807 case IF_CMD: 26808 case LIF_CMD: 26809 extract_funcp = ip_extract_lifreq; 26810 break; 26811 26812 case ARP_CMD: 26813 case XARP_CMD: 26814 extract_funcp = ip_extract_arpreq; 26815 break; 26816 26817 case MSFILT_CMD: 26818 extract_funcp = ip_extract_msfilter; 26819 break; 26820 26821 default: 26822 ASSERT(0); 26823 } 26824 26825 err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); 26826 if (err != 0) { 26827 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 26828 return; 26829 } 26830 26831 /* 26832 * All of the extraction functions return a refheld ipif. 26833 */ 26834 ASSERT(ci.ci_ipif != NULL); 26835 } 26836 26837 if (!(ipip->ipi_flags & IPI_WR)) { 26838 /* 26839 * A return value of EINPROGRESS means the ioctl is 26840 * either queued and waiting for some reason or has 26841 * already completed. 26842 */ 26843 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 26844 ci.ci_lifr); 26845 if (ci.ci_ipif != NULL) 26846 ipif_refrele(ci.ci_ipif); 26847 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 26848 return; 26849 } 26850 26851 ASSERT(ci.ci_ipif != NULL); 26852 26853 /* 26854 * If ipsq is non-NULL, we are already being called exclusively. 26855 */ 26856 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 26857 if (ipsq == NULL) { 26858 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, 26859 NEW_OP, B_TRUE); 26860 if (ipsq == NULL) { 26861 ipif_refrele(ci.ci_ipif); 26862 return; 26863 } 26864 entered_ipsq = B_TRUE; 26865 } 26866 26867 /* 26868 * Release the ipif so that ipif_down and friends that wait for 26869 * references to go away are not misled about the current ipif_refcnt 26870 * values. We are writer so we can access the ipif even after releasing 26871 * the ipif. 26872 */ 26873 ipif_refrele(ci.ci_ipif); 26874 26875 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); 26876 26877 /* 26878 * A return value of EINPROGRESS means the ioctl is 26879 * either queued and waiting for some reason or has 26880 * already completed. 26881 */ 26882 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); 26883 26884 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 26885 26886 if (entered_ipsq) 26887 ipsq_exit(ipsq); 26888 } 26889 26890 /* 26891 * Complete the ioctl. 
Typically ioctls use the mi package and need to 26892 * do mi_copyout/mi_copy_done. 26893 */ 26894 void 26895 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) 26896 { 26897 conn_t *connp = NULL; 26898 26899 if (err == EINPROGRESS) 26900 return; 26901 26902 if (CONN_Q(q)) { 26903 connp = Q_TO_CONN(q); 26904 ASSERT(connp->conn_ref >= 2); 26905 } 26906 26907 switch (mode) { 26908 case COPYOUT: 26909 if (err == 0) 26910 mi_copyout(q, mp); 26911 else 26912 mi_copy_done(q, mp, err); 26913 break; 26914 26915 case NO_COPYOUT: 26916 mi_copy_done(q, mp, err); 26917 break; 26918 26919 default: 26920 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */ 26921 break; 26922 } 26923 26924 /* 26925 * The refhold placed at the start of the ioctl is released here. 26926 */ 26927 if (connp != NULL) 26928 CONN_OPER_PENDING_DONE(connp); 26929 26930 if (ipsq != NULL) 26931 ipsq_current_finish(ipsq); 26932 } 26933 26934 /* Called from ip_wput for all non data messages */ 26935 /* ARGSUSED */ 26936 void 26937 ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 26938 { 26939 mblk_t *mp1; 26940 ire_t *ire, *fake_ire; 26941 ill_t *ill; 26942 struct iocblk *iocp; 26943 ip_ioctl_cmd_t *ipip; 26944 cred_t *cr; 26945 conn_t *connp; 26946 int err; 26947 nce_t *nce; 26948 ipif_t *ipif; 26949 ip_stack_t *ipst; 26950 char *proto_str; 26951 26952 if (CONN_Q(q)) { 26953 connp = Q_TO_CONN(q); 26954 ipst = connp->conn_netstack->netstack_ip; 26955 } else { 26956 connp = NULL; 26957 ipst = ILLQ_TO_IPST(q); 26958 } 26959 26960 switch (DB_TYPE(mp)) { 26961 case M_IOCTL: 26962 /* 26963 * IOCTL processing begins in ip_sioctl_copyin_setup which 26964 * will arrange to copy in associated control structures. 26965 */ 26966 ip_sioctl_copyin_setup(q, mp); 26967 return; 26968 case M_IOCDATA: 26969 /* 26970 * Ensure that this is associated with one of our trans- 26971 * parent ioctls. If it's not ours, discard it if we're 26972 * running as a driver, or pass it on if we're a module. 26973 */ 26974 iocp = (struct iocblk *)mp->b_rptr; 26975 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 26976 if (ipip == NULL) { 26977 if (q->q_next == NULL) { 26978 goto nak; 26979 } else { 26980 putnext(q, mp); 26981 } 26982 return; 26983 } 26984 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 26985 /* 26986 * the ioctl is one we recognise, but is not 26987 * consumed by IP as a module, pass M_IOCDATA 26988 * for processing downstream, but only for 26989 * common Streams ioctls. 26990 */ 26991 if (ipip->ipi_flags & IPI_PASS_DOWN) { 26992 putnext(q, mp); 26993 return; 26994 } else { 26995 goto nak; 26996 } 26997 } 26998 26999 /* IOCTL continuation following copyin or copyout. */ 27000 if (mi_copy_state(q, mp, NULL) == -1) { 27001 /* 27002 * The copy operation failed. mi_copy_state already 27003 * cleaned up, so we're out of here. 27004 */ 27005 return; 27006 } 27007 /* 27008 * If we just completed a copy in, we become writer and 27009 * continue processing in ip_sioctl_copyin_done. If it 27010 * was a copy out, we call mi_copyout again. If there is 27011 * nothing more to copy out, it will complete the IOCTL. 27012 */ 27013 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) { 27014 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { 27015 mi_copy_done(q, mp, EPROTO); 27016 return; 27017 } 27018 /* 27019 * Check for cases that need more copying. A return 27020 * value of 0 means a second copyin has been started, 27021 * so we return; a return value of 1 means no more 27022 * copying is needed, so we continue. 
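			 * (ip_copyin_msfilter() below follows the same
			 * convention: zero means it started another copyin,
			 * nonzero means copying is done.)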
27023 */ 27024 if (ipip->ipi_cmd_type == MSFILT_CMD && 27025 MI_COPY_COUNT(mp) == 1) { 27026 if (ip_copyin_msfilter(q, mp) == 0) 27027 return; 27028 } 27029 /* 27030 * Refhold the conn, till the ioctl completes. This is 27031 * needed in case the ioctl ends up in the pending mp 27032 * list. Every mp in the ill_pending_mp list and 27033 * the ipx_pending_mp must have a refhold on the conn 27034 * to resume processing. The refhold is released when 27035 * the ioctl completes. (normally or abnormally) 27036 * In all cases ip_ioctl_finish is called to finish 27037 * the ioctl. 27038 */ 27039 if (connp != NULL) { 27040 /* This is not a reentry */ 27041 ASSERT(ipsq == NULL); 27042 CONN_INC_REF(connp); 27043 } else { 27044 if (!(ipip->ipi_flags & IPI_MODOK)) { 27045 mi_copy_done(q, mp, EINVAL); 27046 return; 27047 } 27048 } 27049 27050 ip_process_ioctl(ipsq, q, mp, ipip); 27051 27052 } else { 27053 mi_copyout(q, mp); 27054 } 27055 return; 27056 nak: 27057 iocp->ioc_error = EINVAL; 27058 mp->b_datap->db_type = M_IOCNAK; 27059 iocp->ioc_count = 0; 27060 qreply(q, mp); 27061 return; 27062 27063 case M_IOCNAK: 27064 /* 27065 * The only way we could get here is if a resolver didn't like 27066 * an IOCTL we sent it. This shouldn't happen. 27067 */ 27068 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 27069 "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", 27070 ((struct iocblk *)mp->b_rptr)->ioc_cmd); 27071 freemsg(mp); 27072 return; 27073 case M_IOCACK: 27074 /* /dev/ip shouldn't see this */ 27075 if (CONN_Q(q)) 27076 goto nak; 27077 27078 /* 27079 * Finish socket ioctls passed through to ARP. We use the 27080 * ioc_cmd values we set in ip_sioctl_arp() to decide whether 27081 * we need to become writer before calling ip_sioctl_iocack(). 27082 * Note that qwriter_ip() will release the refhold, and that a 27083 * refhold is OK without ILL_CAN_LOOKUP() since we're on the 27084 * ill stream. 27085 */ 27086 iocp = (struct iocblk *)mp->b_rptr; 27087 if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { 27088 ip_sioctl_iocack(NULL, q, mp, NULL); 27089 return; 27090 } 27091 27092 ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || 27093 iocp->ioc_cmd == AR_ENTRY_ADD); 27094 ill = q->q_ptr; 27095 ill_refhold(ill); 27096 qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); 27097 return; 27098 case M_FLUSH: 27099 if (*mp->b_rptr & FLUSHW) 27100 flushq(q, FLUSHALL); 27101 if (q->q_next) { 27102 putnext(q, mp); 27103 return; 27104 } 27105 if (*mp->b_rptr & FLUSHR) { 27106 *mp->b_rptr &= ~FLUSHW; 27107 qreply(q, mp); 27108 return; 27109 } 27110 freemsg(mp); 27111 return; 27112 case IRE_DB_REQ_TYPE: 27113 if (connp == NULL) { 27114 proto_str = "IRE_DB_REQ_TYPE"; 27115 goto protonak; 27116 } 27117 /* An Upper Level Protocol wants a copy of an IRE. */ 27118 ip_ire_req(q, mp); 27119 return; 27120 case M_CTL: 27121 if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) 27122 break; 27123 27124 /* M_CTL messages are used by ARP to tell us things. */ 27125 if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) 27126 break; 27127 switch (((arc_t *)mp->b_rptr)->arc_cmd) { 27128 case AR_ENTRY_SQUERY: 27129 putnext(q, mp); 27130 return; 27131 case AR_CLIENT_NOTIFY: 27132 ip_arp_news(q, mp); 27133 return; 27134 case AR_DLPIOP_DONE: 27135 ASSERT(q->q_next != NULL); 27136 ill = (ill_t *)q->q_ptr; 27137 /* qwriter_ip releases the refhold */ 27138 /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ 27139 ill_refhold(ill); 27140 qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); 27141 return; 27142 case AR_ARP_CLOSING: 27143 /* 27144 * ARP (above us) is closing. 
			 * If no ARP bringup is currently pending, ack the
			 * message so that ARP can complete its close. Also
			 * mark ill_arp_closing so that new ARP bringups
			 * will fail. If any ARP bringup is currently in
			 * progress, we will ack this when the current ARP
			 * bringup completes.
			 */
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			mutex_enter(&ill->ill_lock);
			ill->ill_arp_closing = 1;
			if (!ill->ill_arp_bringup_pending) {
				mutex_exit(&ill->ill_lock);
				qreply(q, mp);
			} else {
				mutex_exit(&ill->ill_lock);
				freemsg(mp);
			}
			return;
		case AR_ARP_EXTEND:
			/*
			 * The ARP module above us is capable of duplicate
			 * address detection. Old ATM drivers will not send
			 * this message.
			 */
			ASSERT(q->q_next != NULL);
			ill = (ill_t *)q->q_ptr;
			ill->ill_arp_extend = B_TRUE;
			freemsg(mp);
			return;
		default:
			break;
		}
		break;
	case M_PROTO:
	case M_PCPROTO:
		/*
		 * The only PROTO messages we expect are copies of option
		 * negotiation acknowledgements; AH and ESP bind requests
		 * are also expected.
		 */
		switch (((union T_primitives *)mp->b_rptr)->type) {
		case O_T_BIND_REQ:
		case T_BIND_REQ: {
			/* Request can get queued in bind */
			if (connp == NULL) {
				proto_str = "O_T_BIND_REQ/T_BIND_REQ";
				goto protonak;
			}
			/*
			 * The transports except SCTP call ip_bind_{v4,v6}()
			 * directly instead of a putnext. SCTP doesn't
			 * generate any T_BIND_REQ since it has its own
			 * fanout data structures. However, ESP and AH
			 * come in for regular binds; all other cases are
			 * bind retries.
			 */
			ASSERT(!IPCL_IS_SCTP(connp));

			/* Don't increment refcnt if this is a re-entry */
			if (ipsq == NULL)
				CONN_INC_REF(connp);

			mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
			    connp, NULL) : ip_bind_v4(q, mp, connp);
			ASSERT(mp != NULL);

			ASSERT(!IPCL_IS_TCP(connp));
			ASSERT(!IPCL_IS_UDP(connp));
			ASSERT(!IPCL_IS_RAWIP(connp));
			ASSERT(!IPCL_IS_IPTUN(connp));

			/* The case of AH and ESP */
			qreply(q, mp);
			CONN_OPER_PENDING_DONE(connp);
			return;
		}
		case T_SVR4_OPTMGMT_REQ:
			ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n",
			    ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));

			if (connp == NULL) {
				proto_str = "T_SVR4_OPTMGMT_REQ";
				goto protonak;
			}

			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
				if (mp != NULL)
					qreply(q, mp);
				return;
			}

			if (!snmpcom_req(q, mp, ip_snmp_set,
			    ip_snmp_get, cr)) {
				/*
				 * Call svr4_optcom_req so that it can
				 * generate the ack. We don't come here
				 * if this operation is being restarted;
				 * ip_restart_optmgmt will drop the conn ref.
				 * In the case of an ipsec option, after the
				 * ipsec load is complete,
				 * conn_restart_ipsec_waiter drops the
				 * conn ref.
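				 * (A B_TRUE return from
				 * ip_check_for_ipsec_opt() below appears to
				 * mean the request has been queued until the
				 * IPsec modules finish loading; the waiter
				 * then owns the conn ref, as described.)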
27256 */ 27257 ASSERT(ipsq == NULL); 27258 CONN_INC_REF(connp); 27259 if (ip_check_for_ipsec_opt(q, mp)) 27260 return; 27261 err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, 27262 B_FALSE); 27263 if (err != EINPROGRESS) { 27264 /* Operation is done */ 27265 CONN_OPER_PENDING_DONE(connp); 27266 } 27267 } 27268 return; 27269 case T_OPTMGMT_REQ: 27270 ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); 27271 /* 27272 * Note: No snmpcom_req support through new 27273 * T_OPTMGMT_REQ. 27274 * Call tpi_optcom_req so that it can 27275 * generate the ack. 27276 */ 27277 if (connp == NULL) { 27278 proto_str = "T_OPTMGMT_REQ"; 27279 goto protonak; 27280 } 27281 27282 /* 27283 * All Solaris components should pass a db_credp 27284 * for this TPI message, hence we ASSERT. 27285 * But in case there is some other M_PROTO that looks 27286 * like a TPI message sent by some other kernel 27287 * component, we check and return an error. 27288 */ 27289 cr = msg_getcred(mp, NULL); 27290 ASSERT(cr != NULL); 27291 if (cr == NULL) { 27292 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 27293 if (mp != NULL) 27294 qreply(q, mp); 27295 return; 27296 } 27297 ASSERT(ipsq == NULL); 27298 /* 27299 * We don't come here for restart. ip_restart_optmgmt 27300 * will drop the conn ref. In the case of ipsec option 27301 * after the ipsec load is complete 27302 * conn_restart_ipsec_waiter drops the conn ref. 27303 */ 27304 CONN_INC_REF(connp); 27305 if (ip_check_for_ipsec_opt(q, mp)) 27306 return; 27307 err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); 27308 if (err != EINPROGRESS) { 27309 /* Operation is done */ 27310 CONN_OPER_PENDING_DONE(connp); 27311 } 27312 return; 27313 case T_UNBIND_REQ: 27314 if (connp == NULL) { 27315 proto_str = "T_UNBIND_REQ"; 27316 goto protonak; 27317 } 27318 ip_unbind(Q_TO_CONN(q)); 27319 mp = mi_tpi_ok_ack_alloc(mp); 27320 qreply(q, mp); 27321 return; 27322 default: 27323 /* 27324 * Have to drop any DLPI messages coming down from 27325 * arp (such as an info_req which would cause ip 27326 * to receive an extra info_ack if it was passed 27327 * through. 27328 */ 27329 ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", 27330 (int)*(uint_t *)mp->b_rptr)); 27331 freemsg(mp); 27332 return; 27333 } 27334 /* NOTREACHED */ 27335 case IRE_DB_TYPE: { 27336 nce_t *nce; 27337 ill_t *ill; 27338 in6_addr_t gw_addr_v6; 27339 27340 /* 27341 * This is a response back from a resolver. It 27342 * consists of a message chain containing: 27343 * IRE_MBLK-->LL_HDR_MBLK->pkt 27344 * The IRE_MBLK is the one we allocated in ip_newroute. 27345 * The LL_HDR_MBLK is the DLPI header to use to get 27346 * the attached packet, and subsequent ones for the 27347 * same destination, transmitted. 27348 */ 27349 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ 27350 break; 27351 /* 27352 * First, check to make sure the resolution succeeded. 27353 * If it failed, the second mblk will be empty. 27354 * If it is, free the chain, dropping the packet. 27355 * (We must ire_delete the ire; that frees the ire mblk) 27356 * We're doing this now to support PVCs for ATM; it's 27357 * a partial xresolv implementation. When we fully implement 27358 * xresolv interfaces, instead of freeing everything here 27359 * we'll initiate neighbor discovery. 27360 * 27361 * For v4 (ARP and other external resolvers) the resolver 27362 * frees the message, so no check is needed. This check 27363 * is required, though, for a full xresolve implementation. 
27364 * Including this code here now both shows how external 27365 * resolvers can NACK a resolution request using an 27366 * existing design that has no specific provisions for NACKs, 27367 * and also takes into account that the current non-ARP 27368 * external resolver has been coded to use this method of 27369 * NACKing for all IPv6 (xresolv) cases, 27370 * whether our xresolv implementation is complete or not. 27371 * 27372 */ 27373 ire = (ire_t *)mp->b_rptr; 27374 ill = ire_to_ill(ire); 27375 mp1 = mp->b_cont; /* dl_unitdata_req */ 27376 if (mp1->b_rptr == mp1->b_wptr) { 27377 if (ire->ire_ipversion == IPV6_VERSION) { 27378 /* 27379 * XRESOLV interface. 27380 */ 27381 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27382 mutex_enter(&ire->ire_lock); 27383 gw_addr_v6 = ire->ire_gateway_addr_v6; 27384 mutex_exit(&ire->ire_lock); 27385 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27386 nce = ndp_lookup_v6(ill, B_FALSE, 27387 &ire->ire_addr_v6, B_FALSE); 27388 } else { 27389 nce = ndp_lookup_v6(ill, B_FALSE, 27390 &gw_addr_v6, B_FALSE); 27391 } 27392 if (nce != NULL) { 27393 nce_resolv_failed(nce); 27394 ndp_delete(nce); 27395 NCE_REFRELE(nce); 27396 } 27397 } 27398 mp->b_cont = NULL; 27399 freemsg(mp1); /* frees the pkt as well */ 27400 ASSERT(ire->ire_nce == NULL); 27401 ire_delete((ire_t *)mp->b_rptr); 27402 return; 27403 } 27404 27405 /* 27406 * Split them into IRE_MBLK and pkt and feed it into 27407 * ire_add_then_send. Then in ire_add_then_send 27408 * the IRE will be added, and then the packet will be 27409 * run back through ip_wput. This time it will make 27410 * it to the wire. 27411 */ 27412 mp->b_cont = NULL; 27413 mp = mp1->b_cont; /* now, mp points to pkt */ 27414 mp1->b_cont = NULL; 27415 ip1dbg(("ip_wput_nondata: reply from external resolver \n")); 27416 if (ire->ire_ipversion == IPV6_VERSION) { 27417 /* 27418 * XRESOLV interface. Find the nce and put a copy 27419 * of the dl_unitdata_req in nce_res_mp 27420 */ 27421 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27422 mutex_enter(&ire->ire_lock); 27423 gw_addr_v6 = ire->ire_gateway_addr_v6; 27424 mutex_exit(&ire->ire_lock); 27425 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27426 nce = ndp_lookup_v6(ill, B_FALSE, 27427 &ire->ire_addr_v6, B_FALSE); 27428 } else { 27429 nce = ndp_lookup_v6(ill, B_FALSE, 27430 &gw_addr_v6, B_FALSE); 27431 } 27432 if (nce != NULL) { 27433 /* 27434 * We have to protect nce_res_mp here 27435 * from being accessed by other threads 27436 * while we change the mblk pointer. 27437 * Other functions will also lock the nce when 27438 * accessing nce_res_mp. 27439 * 27440 * The reason we change the mblk pointer 27441 * here rather than copying the resolved address 27442 * into the template is that, unlike with 27443 * ethernet, we have no guarantee that the 27444 * resolved address length will be 27445 * smaller than or equal to the lla length 27446 * with which the template was allocated, 27447 * (for ethernet, they're equal) 27448 * so we have to use the actual resolved 27449 * address mblk - which holds the real 27450 * dl_unitdata_req with the resolved address. 27451 * 27452 * Doing this is the same behavior as was 27453 * previously used in the v4 ARP case. 27454 */ 27455 mutex_enter(&nce->nce_lock); 27456 if (nce->nce_res_mp != NULL) 27457 freemsg(nce->nce_res_mp); 27458 nce->nce_res_mp = mp1; 27459 mutex_exit(&nce->nce_lock); 27460 /* 27461 * We do a fastpath probe here because 27462 * we have resolved the address without 27463 * using Neighbor Discovery. 
27464 * In the non-XRESOLV v6 case, the fastpath 27465 * probe is done right after neighbor 27466 * discovery completes. 27467 */ 27468 if (nce->nce_res_mp != NULL) { 27469 int res; 27470 nce_fastpath_list_add(nce); 27471 res = ill_fastpath_probe(ill, 27472 nce->nce_res_mp); 27473 if (res != 0 && res != EAGAIN) 27474 nce_fastpath_list_delete(nce); 27475 } 27476 27477 ire_add_then_send(q, ire, mp); 27478 /* 27479 * Now we have to clean out any packets 27480 * that may have been queued on the nce 27481 * while it was waiting for address resolution 27482 * to complete. 27483 */ 27484 mutex_enter(&nce->nce_lock); 27485 mp1 = nce->nce_qd_mp; 27486 nce->nce_qd_mp = NULL; 27487 mutex_exit(&nce->nce_lock); 27488 while (mp1 != NULL) { 27489 mblk_t *nxt_mp; 27490 queue_t *fwdq = NULL; 27491 ill_t *inbound_ill; 27492 uint_t ifindex; 27493 27494 nxt_mp = mp1->b_next; 27495 mp1->b_next = NULL; 27496 /* 27497 * Retrieve ifindex stored in 27498 * ip_rput_data_v6() 27499 */ 27500 ifindex = 27501 (uint_t)(uintptr_t)mp1->b_prev; 27502 inbound_ill = 27503 ill_lookup_on_ifindex(ifindex, 27504 B_TRUE, NULL, NULL, NULL, 27505 NULL, ipst); 27506 mp1->b_prev = NULL; 27507 if (inbound_ill != NULL) 27508 fwdq = inbound_ill->ill_rq; 27509 27510 if (fwdq != NULL) { 27511 put(fwdq, mp1); 27512 ill_refrele(inbound_ill); 27513 } else 27514 put(WR(ill->ill_rq), mp1); 27515 mp1 = nxt_mp; 27516 } 27517 NCE_REFRELE(nce); 27518 } else { /* nce is NULL; clean up */ 27519 ire_delete(ire); 27520 freemsg(mp); 27521 freemsg(mp1); 27522 return; 27523 } 27524 } else { 27525 nce_t *arpce; 27526 /* 27527 * Link layer resolution succeeded. Recompute the 27528 * ire_nce. 27529 */ 27530 ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); 27531 if ((arpce = ndp_lookup_v4(ill, 27532 (ire->ire_gateway_addr != INADDR_ANY ? 27533 &ire->ire_gateway_addr : &ire->ire_addr), 27534 B_FALSE)) == NULL) { 27535 freeb(ire->ire_mp); 27536 freeb(mp1); 27537 freemsg(mp); 27538 return; 27539 } 27540 mutex_enter(&arpce->nce_lock); 27541 arpce->nce_last = TICK_TO_MSEC(lbolt64); 27542 if (arpce->nce_state == ND_REACHABLE) { 27543 /* 27544 * Someone resolved this before us; 27545 * cleanup the res_mp. Since ire has 27546 * not been added yet, the call to ire_add_v4 27547 * from ire_add_then_send (when a dup is 27548 * detected) will clean up the ire. 27549 */ 27550 freeb(mp1); 27551 } else { 27552 ASSERT(arpce->nce_res_mp == NULL); 27553 arpce->nce_res_mp = mp1; 27554 arpce->nce_state = ND_REACHABLE; 27555 } 27556 mutex_exit(&arpce->nce_lock); 27557 if (ire->ire_marks & IRE_MARK_NOADD) { 27558 /* 27559 * this ire will not be added to the ire 27560 * cache table, so we can set the ire_nce 27561 * here, as there are no atomicity constraints. 27562 */ 27563 ire->ire_nce = arpce; 27564 /* 27565 * We are associating this nce with the ire 27566 * so change the nce ref taken in 27567 * ndp_lookup_v4() from 27568 * NCE_REFHOLD to NCE_REFHOLD_NOTR 27569 */ 27570 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 27571 } else { 27572 NCE_REFRELE(arpce); 27573 } 27574 ire_add_then_send(q, ire, mp); 27575 } 27576 return; /* All is well, the packet has been sent. */ 27577 } 27578 case IRE_ARPRESOLVE_TYPE: { 27579 27580 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ 27581 break; 27582 mp1 = mp->b_cont; /* dl_unitdata_req */ 27583 mp->b_cont = NULL; 27584 /* 27585 * First, check to make sure the resolution succeeded. 27586 * If it failed, the second mblk will be empty. 
27587 */ 27588 if (mp1->b_rptr == mp1->b_wptr) { 27589 /* cleanup the incomplete ire, free queued packets */ 27590 freemsg(mp); /* fake ire */ 27591 freeb(mp1); /* dl_unitdata response */ 27592 return; 27593 } 27594 27595 /* 27596 * Update any incomplete nce_t found. We search the ctable 27597 * and find the nce from the ire->ire_nce because we need 27598 * to pass the ire to ip_xmit_v4 later, and can find both 27599 * ire and nce in one lookup. 27600 */ 27601 fake_ire = (ire_t *)mp->b_rptr; 27602 27603 /* 27604 * By the time we come back here from ARP, the logical outgoing 27605 * interface of the incomplete ire we added in ire_forward() 27606 * could have disappeared, causing the incomplete ire to also 27607 * disappear. So we need to retrieve the proper ipif for the 27608 * ire before looking in ctable. In the case of IPMP, the 27609 * ipif may be on the IPMP ill, so look it up based on the 27610 * ire_ipif_ifindex we stashed back in ire_init_common(). 27611 * Then, we can verify that ire_ipif_seqid still exists. 27612 */ 27613 ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, 27614 NULL, NULL, NULL, NULL, ipst); 27615 if (ill == NULL) { 27616 ip1dbg(("ill for incomplete ire vanished\n")); 27617 freemsg(mp); /* fake ire */ 27618 freeb(mp1); /* dl_unitdata response */ 27619 return; 27620 } 27621 27622 /* Get the outgoing ipif */ 27623 mutex_enter(&ill->ill_lock); 27624 ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); 27625 if (ipif == NULL) { 27626 mutex_exit(&ill->ill_lock); 27627 ill_refrele(ill); 27628 ip1dbg(("logical intrf to incomplete ire vanished\n")); 27629 freemsg(mp); /* fake_ire */ 27630 freeb(mp1); /* dl_unitdata response */ 27631 return; 27632 } 27633 27634 ipif_refhold_locked(ipif); 27635 mutex_exit(&ill->ill_lock); 27636 ill_refrele(ill); 27637 ire = ire_arpresolve_lookup(fake_ire->ire_addr, 27638 fake_ire->ire_gateway_addr, ipif, fake_ire->ire_zoneid, 27639 ipst, ((ill_t *)q->q_ptr)->ill_wq); 27640 ipif_refrele(ipif); 27641 if (ire == NULL) { 27642 /* 27643 * No ire was found; check if there is an nce 27644 * for this lookup. If no ires point at the nce, 27645 * clean it up. 27646 */ 27647 if ((nce = ndp_lookup_v4(q->q_ptr, 27648 (fake_ire->ire_gateway_addr != INADDR_ANY ? 27649 &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), 27650 B_FALSE)) != NULL) { 27651 /* 27652 * cleanup: 27653 * We check for refcnt 2 (one for the nce 27654 * hash list + 1 for the ref taken by 27655 * ndp_lookup_v4) to verify that there are 27656 * no ires pointing at the nce. 27657 */ 27658 if (nce->nce_refcnt == 2) 27659 ndp_delete(nce); 27660 NCE_REFRELE(nce); 27661 } 27662 freeb(mp1); /* dl_unitdata response */ 27663 freemsg(mp); /* fake ire */ 27664 return; 27665 } 27666 27667 nce = ire->ire_nce; 27668 DTRACE_PROBE2(ire__arpresolve__type, 27669 ire_t *, ire, nce_t *, nce); 27670 mutex_enter(&nce->nce_lock); 27671 nce->nce_last = TICK_TO_MSEC(lbolt64); 27672 if (nce->nce_state == ND_REACHABLE) { 27673 /* 27674 * Someone resolved this before us; 27675 * our response is not needed any more. 27676 */ 27677 mutex_exit(&nce->nce_lock); 27678 freeb(mp1); /* dl_unitdata response */ 27679 } else { 27680 ASSERT(nce->nce_res_mp == NULL); 27681 nce->nce_res_mp = mp1; 27682 nce->nce_state = ND_REACHABLE; 27683 mutex_exit(&nce->nce_lock); 27684 nce_fastpath(nce); 27685 } 27686 /* 27687 * The cached nce_t has been updated to be reachable; 27688 * clear the IRE_MARK_UNCACHED flag and free the fake_ire.
27689 */ 27690 fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; 27691 freemsg(mp); 27692 /* 27693 * send out queued packets. 27694 */ 27695 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); 27696 27697 IRE_REFRELE(ire); 27698 return; 27699 } 27700 default: 27701 break; 27702 } 27703 if (q->q_next) { 27704 putnext(q, mp); 27705 } else 27706 freemsg(mp); 27707 return; 27708 27709 protonak: 27710 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); 27711 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) 27712 qreply(q, mp); 27713 } 27714 27715 /* 27716 * Process IP options in an outbound packet. Modify the destination if there 27717 * is a source route option. 27718 * Returns non-zero if something fails in which case an ICMP error has been 27719 * sent and mp freed. 27720 */ 27721 static int 27722 ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, 27723 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 27724 { 27725 ipoptp_t opts; 27726 uchar_t *opt; 27727 uint8_t optval; 27728 uint8_t optlen; 27729 ipaddr_t dst; 27730 intptr_t code = 0; 27731 mblk_t *mp; 27732 ire_t *ire = NULL; 27733 27734 ip2dbg(("ip_wput_options\n")); 27735 mp = ipsec_mp; 27736 if (mctl_present) { 27737 mp = ipsec_mp->b_cont; 27738 } 27739 27740 dst = ipha->ipha_dst; 27741 for (optval = ipoptp_first(&opts, ipha); 27742 optval != IPOPT_EOL; 27743 optval = ipoptp_next(&opts)) { 27744 opt = opts.ipoptp_cur; 27745 optlen = opts.ipoptp_len; 27746 ip2dbg(("ip_wput_options: opt %d, len %d\n", 27747 optval, optlen)); 27748 switch (optval) { 27749 uint32_t off; 27750 case IPOPT_SSRR: 27751 case IPOPT_LSRR: 27752 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 27753 ip1dbg(( 27754 "ip_wput_options: bad option offset\n")); 27755 code = (char *)&opt[IPOPT_OLEN] - 27756 (char *)ipha; 27757 goto param_prob; 27758 } 27759 off = opt[IPOPT_OFFSET]; 27760 ip1dbg(("ip_wput_options: next hop 0x%x\n", 27761 ntohl(dst))); 27762 /* 27763 * For strict: verify that dst is directly 27764 * reachable. 27765 */ 27766 if (optval == IPOPT_SSRR) { 27767 ire = ire_ftable_lookup(dst, 0, 0, 27768 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 27769 msg_getlabel(mp), 27770 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 27771 if (ire == NULL) { 27772 ip1dbg(("ip_wput_options: SSRR not" 27773 " directly reachable: 0x%x\n", 27774 ntohl(dst))); 27775 goto bad_src_route; 27776 } 27777 ire_refrele(ire); 27778 } 27779 break; 27780 case IPOPT_RR: 27781 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 27782 ip1dbg(( 27783 "ip_wput_options: bad option offset\n")); 27784 code = (char *)&opt[IPOPT_OLEN] - 27785 (char *)ipha; 27786 goto param_prob; 27787 } 27788 break; 27789 case IPOPT_TS: 27790 /* 27791 * Verify that length >=5 and that there is either 27792 * room for another timestamp or that the overflow 27793 * counter is not maxed out. 
27794 */ 27795 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 27796 if (optlen < IPOPT_MINLEN_IT) { 27797 goto param_prob; 27798 } 27799 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 27800 ip1dbg(( 27801 "ip_wput_options: bad option offset\n")); 27802 code = (char *)&opt[IPOPT_OFFSET] - 27803 (char *)ipha; 27804 goto param_prob; 27805 } 27806 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 27807 case IPOPT_TS_TSONLY: 27808 off = IPOPT_TS_TIMELEN; 27809 break; 27810 case IPOPT_TS_TSANDADDR: 27811 case IPOPT_TS_PRESPEC: 27812 case IPOPT_TS_PRESPEC_RFC791: 27813 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 27814 break; 27815 default: 27816 code = (char *)&opt[IPOPT_POS_OV_FLG] - 27817 (char *)ipha; 27818 goto param_prob; 27819 } 27820 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 27821 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 27822 /* 27823 * No room and the overflow counter is 15 27824 * already. 27825 */ 27826 goto param_prob; 27827 } 27828 break; 27829 } 27830 } 27831 27832 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) 27833 return (0); 27834 27835 ip1dbg(("ip_wput_options: error processing IP options.")); 27836 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 27837 27838 param_prob: 27839 /* 27840 * Since ip_wput() isn't close to finished, we fill 27841 * in enough of the header for credible error reporting. 27842 */ 27843 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 27844 /* Failed */ 27845 freemsg(ipsec_mp); 27846 return (-1); 27847 } 27848 icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); 27849 return (-1); 27850 27851 bad_src_route: 27852 /* 27853 * Since ip_wput() isn't close to finished, we fill 27854 * in enough of the header for credible error reporting. 27855 */ 27856 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 27857 /* Failed */ 27858 freemsg(ipsec_mp); 27859 return (-1); 27860 } 27861 icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 27862 return (-1); 27863 } 27864 27865 /* 27866 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT. 27867 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads 27868 * thru /etc/system. 27869 */ 27870 #define CONN_MAXDRAINCNT 64 27871 27872 static void 27873 conn_drain_init(ip_stack_t *ipst) 27874 { 27875 int i, j; 27876 idl_tx_list_t *itl_tx; 27877 27878 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; 27879 27880 if ((ipst->ips_conn_drain_list_cnt == 0) || 27881 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) { 27882 /* 27883 * Default value of the number of drainers is the 27884 * number of cpus, subject to maximum of 8 drainers. 
27885 */ 27886 if (boot_max_ncpus != -1) 27887 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8); 27888 else 27889 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); 27890 } 27891 27892 ipst->ips_idl_tx_list = 27893 kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP); 27894 for (i = 0; i < TX_FANOUT_SIZE; i++) { 27895 itl_tx = &ipst->ips_idl_tx_list[i]; 27896 itl_tx->txl_drain_list = 27897 kmem_zalloc(ipst->ips_conn_drain_list_cnt * 27898 sizeof (idl_t), KM_SLEEP); 27899 mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL); 27900 for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) { 27901 mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL, 27902 MUTEX_DEFAULT, NULL); 27903 itl_tx->txl_drain_list[j].idl_itl = itl_tx; 27904 } 27905 } 27906 } 27907 27908 static void 27909 conn_drain_fini(ip_stack_t *ipst) 27910 { 27911 int i; 27912 idl_tx_list_t *itl_tx; 27913 27914 for (i = 0; i < TX_FANOUT_SIZE; i++) { 27915 itl_tx = &ipst->ips_idl_tx_list[i]; 27916 kmem_free(itl_tx->txl_drain_list, 27917 ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); 27918 } 27919 kmem_free(ipst->ips_idl_tx_list, 27920 TX_FANOUT_SIZE * sizeof (idl_tx_list_t)); 27921 ipst->ips_idl_tx_list = NULL; 27922 } 27923 27924 /* 27925 * Note: For an overview of how flowcontrol is handled in IP please see the 27926 * IP Flowcontrol notes at the top of this file. 27927 * 27928 * Flow control has blocked us from proceeding. Insert the given conn in one 27929 * of the conn drain lists. These conn wq's will be qenabled later on when 27930 * STREAMS flow control does a backenable. conn_walk_drain will enable 27931 * the first conn in each of these drain lists. Each of these qenabled conns 27932 * in turn enables the next in the list, after it runs, or when it closes, 27933 * thus sustaining the drain process. 27934 */ 27935 void 27936 conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list) 27937 { 27938 idl_t *idl = tx_list->txl_drain_list; 27939 uint_t index; 27940 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 27941 27942 mutex_enter(&connp->conn_lock); 27943 if (connp->conn_state_flags & CONN_CLOSING) { 27944 /* 27945 * The conn is closing as a result of which CONN_CLOSING 27946 * is set. Return. 27947 */ 27948 mutex_exit(&connp->conn_lock); 27949 return; 27950 } else if (connp->conn_idl == NULL) { 27951 /* 27952 * Assign the next drain list round robin. We don't use 27953 * a lock, and thus it may not be strictly round robin. 27954 * Atomicity of load/stores is enough to make sure that 27955 * conn_drain_list_index is always within bounds. 27956 */ 27957 index = tx_list->txl_drain_index; 27958 ASSERT(index < ipst->ips_conn_drain_list_cnt); 27959 connp->conn_idl = &tx_list->txl_drain_list[index]; 27960 index++; 27961 if (index == ipst->ips_conn_drain_list_cnt) 27962 index = 0; 27963 tx_list->txl_drain_index = index; 27964 } 27965 mutex_exit(&connp->conn_lock); 27966
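/*
 * A minimal sketch of the lockless round-robin assignment just above,
 * with hypothetical names (illustrative, not part of the build). Two
 * racing threads may read the same index and pick the same drain list;
 * that only skews fairness, never correctness, because a single
 * aligned load can never observe an out-of-bounds value:
 */
static uint_t
drain_index_next(uint_t *idxp, uint_t cnt)
{
	uint_t index = *idxp;		/* one load; always within bounds */

	*idxp = (index + 1 == cnt) ? 0 : index + 1;	/* wrap around */
	return (index);
}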
27967 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 27968 if ((connp->conn_drain_prev != NULL) || 27969 (connp->conn_state_flags & CONN_CLOSING)) { 27970 /* 27971 * The conn is already in the drain list, OR 27972 * the conn is closing. We need to check for 27973 * the closing case again since close can happen 27974 * after we drop the conn_lock, and before we 27975 * acquire the CONN_DRAIN_LIST_LOCK. 27976 */ 27977 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 27978 return; 27979 } else { 27980 idl = connp->conn_idl; 27981 } 27982 27983 /* 27984 * The conn is not in the drain list. Insert it at the 27985 * tail of the drain list. The drain list is circular 27986 * and doubly linked. idl_conn points to the 1st element 27987 * in the list. 27988 */ 27989 if (idl->idl_conn == NULL) { 27990 idl->idl_conn = connp; 27991 connp->conn_drain_next = connp; 27992 connp->conn_drain_prev = connp; 27993 } else { 27994 conn_t *head = idl->idl_conn; 27995 27996 connp->conn_drain_next = head; 27997 connp->conn_drain_prev = head->conn_drain_prev; 27998 head->conn_drain_prev->conn_drain_next = connp; 27999 head->conn_drain_prev = connp; 28000 }
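/*
 * The insertion above preserves the circular, doubly linked shape:
 * idl_conn is the head, the head's conn_drain_prev is the tail, and a
 * walk terminates when it returns to the head rather than on NULL. A
 * sketch of such a traversal with a hypothetical callback
 * (illustrative, not part of the build):
 */
static void
drain_list_walk(conn_t *head, void (*cb)(conn_t *))
{
	conn_t *connp = head;

	if (head == NULL)
		return;
	do {
		cb(connp);
		connp = connp->conn_drain_next;
	} while (connp != head);
}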
28001 /* 28002 * For non-STREAMS sockets, assert flow control. 28003 */ 28004 if (IPCL_IS_NONSTR(connp)) { 28005 DTRACE_PROBE1(su__txq__full, conn_t *, connp); 28006 (*connp->conn_upcalls->su_txq_full) 28007 (connp->conn_upper_handle, B_TRUE); 28008 } else { 28009 conn_setqfull(connp); 28010 noenable(connp->conn_wq); 28011 } 28012 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28013 } 28014 28015 /* 28016 * This conn is closing, and we are called from ip_close. OR 28017 * This conn has been serviced by ip_wsrv, and we need to do the tail 28018 * processing. 28019 * If this conn is part of the drain list, we may need to sustain the drain 28020 * process by qenabling the next conn in the drain list. We may also need to 28021 * remove this conn from the list, if it is done. 28022 */ 28023 static void 28024 conn_drain_tail(conn_t *connp, boolean_t closing) 28025 { 28026 idl_t *idl; 28027 28028 /* 28029 * connp->conn_idl is stable at this point, and no lock is needed 28030 * to check it. If we are called from ip_close, close has already 28031 * set CONN_CLOSING, thus freezing the value of conn_idl, and 28032 * called us only because conn_idl is non-null. If we are called thru 28033 * service, conn_idl could be null, but it cannot change because 28034 * service is single-threaded per queue, and there cannot be another 28035 * instance of service trying to call conn_drain_insert on this conn 28036 * now. 28037 */ 28038 ASSERT(!closing || (connp->conn_idl != NULL)); 28039 28040 /* 28041 * If connp->conn_idl is null, the conn has not been inserted into any 28042 * drain list even once since creation of the conn. Just return. 28043 */ 28044 if (connp->conn_idl == NULL) 28045 return; 28046 28047 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28048 28049 if (connp->conn_drain_prev == NULL) { 28050 /* This conn is currently not in the drain list. */ 28051 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28052 return; 28053 } 28054 idl = connp->conn_idl; 28055 if (idl->idl_conn_draining == connp) { 28056 /* 28057 * This conn is the current drainer. If this is the last conn 28058 * in the drain list, we need to do more checks, in the 'if' 28059 * below. Otherwise we need to just qenable the next conn 28060 * to sustain the draining, which is handled in the 'else' 28061 * below. 28062 */ 28063 if (connp->conn_drain_next == idl->idl_conn) { 28064 /* 28065 * This conn is the last in this list. This round 28066 * of draining is complete. If idl_repeat is set, 28067 * it means another flow enabling has happened from 28068 * the driver/streams and we need another round 28069 * of draining. 28070 * If there are more than 2 conns in the drain list, 28071 * do a left rotate by 1, so that all conns except the 28072 * conn at the head move towards the head by 1, and 28073 * the conn at the head goes to the tail. This attempts 28074 * a more even share for all queues that are being 28075 * drained. 28076 */ 28077 if ((connp->conn_drain_next != connp) && 28078 (idl->idl_conn->conn_drain_next != connp)) { 28079 idl->idl_conn = idl->idl_conn->conn_drain_next; 28080 } 28081 if (idl->idl_repeat) { 28082 qenable(idl->idl_conn->conn_wq); 28083 idl->idl_conn_draining = idl->idl_conn; 28084 idl->idl_repeat = 0; 28085 } else { 28086 idl->idl_conn_draining = NULL; 28087 } 28088 } else { 28089 /* 28090 * If the next queue that we are now qenable'ing 28091 * is closing, it will remove itself from this list 28092 * and qenable the subsequent queue in ip_close(). 28093 * Serialization is achieved thru idl_lock. 28094 */ 28095 qenable(connp->conn_drain_next->conn_wq); 28096 idl->idl_conn_draining = connp->conn_drain_next; 28097 } 28098 } 28099 if (!connp->conn_did_putbq || closing) { 28100 /* 28101 * Remove ourselves from the drain list if we did not do 28102 * a putbq, or if the conn is closing. 28103 * Note: It is possible that q->q_first is non-null. It means 28104 * that these messages landed after we did an enableok() in 28105 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to 28106 * service them. 28107 */ 28108 if (connp->conn_drain_next == connp) { 28109 /* Singleton in the list */ 28110 ASSERT(connp->conn_drain_prev == connp); 28111 idl->idl_conn = NULL; 28112 idl->idl_conn_draining = NULL; 28113 } else { 28114 connp->conn_drain_prev->conn_drain_next = 28115 connp->conn_drain_next; 28116 connp->conn_drain_next->conn_drain_prev = 28117 connp->conn_drain_prev; 28118 if (idl->idl_conn == connp) 28119 idl->idl_conn = connp->conn_drain_next; 28120 ASSERT(idl->idl_conn_draining != connp); 28121 28122 } 28123 connp->conn_drain_next = NULL; 28124 connp->conn_drain_prev = NULL; 28125 28126 /* 28127 * For non-STREAMS sockets, open up flow control. 28128 */ 28129 if (IPCL_IS_NONSTR(connp)) { 28130 (*connp->conn_upcalls->su_txq_full) 28131 (connp->conn_upper_handle, B_FALSE); 28132 } else { 28133 conn_clrqfull(connp); 28134 enableok(connp->conn_wq); 28135 } 28136 } 28137 28138 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28139 } 28140 28141 /* 28142 * Write service routine. Shared perimeter entry point. 28143 * ip_wsrv can be called in any of the following ways. 28144 * 1. The device queue's messages have fallen below the low water mark 28145 * and STREAMS has backenabled the ill_wq. We walk thru all the 28146 * drain lists and backenable the first conn in each list. 28147 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the 28148 * qenabled non-tcp upper layers. We start dequeuing messages and call 28149 * ip_wput for each message. 28150 */ 28151 28152 void 28153 ip_wsrv(queue_t *q) 28154 { 28155 conn_t *connp; 28156 ill_t *ill; 28157 mblk_t *mp; 28158 28159 if (q->q_next) { 28160 ill = (ill_t *)q->q_ptr; 28161 if (ill->ill_state_flags == 0) { 28162 ip_stack_t *ipst = ill->ill_ipst; 28163 28164 /* 28165 * The device flow control has opened up. 28166 * Walk through conn drain lists and qenable the 28167 * first conn in each list. This makes sense only 28168 * if the stream is fully plumbed and setup. 28169 * Hence the if check above. 28170 */ 28171 ip1dbg(("ip_wsrv: walking\n")); 28172 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); 28173 } 28174 return; 28175 } 28176 28177 connp = Q_TO_CONN(q); 28178 ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); 28179 28180 /* 28181 * 1. Set conn_draining flag to signal that service is active. 28182 * 28183 * 2. ip_output determines whether it has been called from service, 28184 * based on the last parameter. If it is IP_WSRV, it concludes it 28185 * has been called from service. 28186 * 28187 * 3. Message ordering is preserved by the following logic. 28188 * i. A directly called ip_output (i.e. not thru service) will queue 28189 * the message at the tail, if conn_draining is set (i.e. service 28190 * is running) or if q->q_first is non-null. 28191 * 28192 * ii. If ip_output is called from service, and if ip_output cannot 28193 * putnext due to flow control, it does a putbq. 28194 * 28195 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable 28196 * (causing an infinite loop). 28197 */
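/*
 * For contrast, a plain STREAMS write service routine without the
 * conn_draining/noenable bookkeeping described above has the canonical
 * shape below; ip_wsrv is this pattern plus the ordering rules from
 * the comment. A generic sketch (not IP-specific, not part of the
 * build):
 */
static void
generic_wsrv(queue_t *q)
{
	mblk_t *mp;

	while ((mp = getq(q)) != NULL) {
		if (!canputnext(q)) {
			(void) putbq(q, mp);	/* requeue; STREAMS will */
			return;			/* backenable us later */
		}
		putnext(q, mp);
	}
}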
28198 ASSERT(!connp->conn_did_putbq); 28199 28200 while ((q->q_first != NULL) && !connp->conn_did_putbq) { 28201 connp->conn_draining = 1; 28202 noenable(q); 28203 while ((mp = getq(q)) != NULL) { 28204 ASSERT(CONN_Q(q)); 28205 28206 DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp); 28207 ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); 28208 if (connp->conn_did_putbq) { 28209 /* ip_wput did a putbq */ 28210 break; 28211 } 28212 } 28213 /* 28214 * At this point, a thread coming down from top, calling 28215 * ip_wput, may end up queueing the message. We have not yet 28216 * enabled the queue, so ip_wsrv won't be called again. 28217 * To avoid this race, check q->q_first again (in the loop). 28218 * If the other thread queued the message before we call 28219 * enableok(), we will catch it in the q->q_first check. 28220 * If the other thread queues the message after we call 28221 * enableok(), ip_wsrv will be called again by STREAMS. 28222 */ 28223 connp->conn_draining = 0; 28224 enableok(q); 28225 } 28226 28227 /* Enable the next conn for draining */ 28228 conn_drain_tail(connp, B_FALSE); 28229 28230 /* 28231 * conn_direct_blocked is used to indicate blocked 28232 * condition for direct path (ILL_DIRECT_CAPABLE()). 28233 * This is the only place where it is set without 28234 * checking for ILL_DIRECT_CAPABLE() and setting it 28235 * to 0 is ok even if it is not ILL_DIRECT_CAPABLE(). 28236 */ 28237 if (!connp->conn_did_putbq && connp->conn_direct_blocked) { 28238 DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp); 28239 connp->conn_direct_blocked = B_FALSE; 28240 } 28241 28242 connp->conn_did_putbq = 0; 28243 } 28244 28245 /* 28246 * Callback to disable flow control in IP. 28247 * 28248 * This is a mac client callback added when the DLD_CAPAB_DIRECT capability 28249 * is enabled. 28250 * 28251 * When MAC_TX() is not able to send any more packets, dld sets its queue 28252 * to QFULL and enables the STREAMS flow control. Later, when the underlying 28253 * driver is able to continue to send packets, it calls the 28254 * mac_tx_(ring_)update() function and wakes up the corresponding mac worker 28255 * threads, which in turn call this callback function and disable flow control. 28256 */ 28257 void 28258 ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie) 28259 { 28260 ill_t *ill = (ill_t *)arg; 28261 ip_stack_t *ipst = ill->ill_ipst; 28262 idl_tx_list_t *idl_txl; 28263 28264 idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; 28265 mutex_enter(&idl_txl->txl_lock); 28266 /* add code to set a flag to indicate idl_txl is enabled */ 28267 conn_walk_drain(ipst, idl_txl); 28268 mutex_exit(&idl_txl->txl_lock); 28269 } 28270
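/*
 * IDLHASHINDEX() spreads tx cookies across the TX_FANOUT_SIZE tx lists
 * so that unrelated blocked paths do not all contend on one drain
 * list. A plausible sketch of such a mapping, purely illustrative:
 * the real macro is defined elsewhere, and this sketch additionally
 * assumes TX_FANOUT_SIZE is a power of two:
 */
static uint_t
idl_hash_sketch(uintptr_t cookie)
{
	/* drop low bits that are identical for aligned cookies */
	return ((cookie >> 6) & (TX_FANOUT_SIZE - 1));
}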
28271 /* 28272 * Flow control has been relieved, and STREAMS has backenabled us. For each list 28273 * of conns that need to be drained, check if drain is already in progress. 28274 * If so, set the idl_repeat bit, indicating that the last conn in the list 28275 * needs to reinitiate the drain once again for the list. If drain is not 28276 * in progress for the list, initiate the draining by qenabling the 1st 28277 * conn in the list. The drain is self-sustaining: each qenabled conn will 28278 * in turn qenable the next conn when it is done/blocked/closing. 28279 */ 28280 static void 28281 conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) 28282 { 28283 int i; 28284 idl_t *idl; 28285 28286 IP_STAT(ipst, ip_conn_walk_drain); 28287 28288 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28289 idl = &tx_list->txl_drain_list[i]; 28290 mutex_enter(&idl->idl_lock); 28291 if (idl->idl_conn == NULL) { 28292 mutex_exit(&idl->idl_lock); 28293 continue; 28294 } 28295 /* 28296 * If this list is not being drained currently by 28297 * an ip_wsrv thread, start the process. 28298 */ 28299 if (idl->idl_conn_draining == NULL) { 28300 ASSERT(idl->idl_repeat == 0); 28301 qenable(idl->idl_conn->conn_wq); 28302 idl->idl_conn_draining = idl->idl_conn; 28303 } else { 28304 idl->idl_repeat = 1; 28305 } 28306 mutex_exit(&idl->idl_lock); 28307 } 28308 } 28309 28310 /* 28311 * Determine if the ill and multicast aspects of the packet 28312 * "match" the conn. 28313 */ 28314 boolean_t 28315 conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, 28316 zoneid_t zoneid) 28317 { 28318 ill_t *bound_ill; 28319 boolean_t found; 28320 ipif_t *ipif; 28321 ire_t *ire; 28322 ipaddr_t dst, src; 28323 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28324 28325 dst = ipha->ipha_dst; 28326 src = ipha->ipha_src; 28327 28328 /* 28329 * conn_incoming_ill is set by IP_BOUND_IF which limits 28330 * unicast, broadcast and multicast reception to 28331 * conn_incoming_ill. conn_wantpacket itself is called 28332 * only for BROADCAST and multicast. 28333 */ 28334 bound_ill = connp->conn_incoming_ill; 28335 if (bound_ill != NULL) { 28336 if (IS_IPMP(bound_ill)) { 28337 if (bound_ill->ill_grp != ill->ill_grp) 28338 return (B_FALSE); 28339 } else { 28340 if (bound_ill != ill) 28341 return (B_FALSE); 28342 } 28343 } 28344 28345 if (!CLASSD(dst)) { 28346 if (IPCL_ZONE_MATCH(connp, zoneid)) 28347 return (B_TRUE); 28348 /* 28349 * The conn is in a different zone; we need to check that this 28350 * broadcast address is configured in the application's zone. 28351 */ 28352 ipif = ipif_get_next_ipif(NULL, ill); 28353 if (ipif == NULL) 28354 return (B_FALSE); 28355 ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, 28356 connp->conn_zoneid, NULL, 28357 (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 28358 ipif_refrele(ipif); 28359 if (ire != NULL) { 28360 ire_refrele(ire); 28361 return (B_TRUE); 28362 } else { 28363 return (B_FALSE); 28364 } 28365 } 28366 28367 if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && 28368 connp->conn_zoneid == zoneid) { 28369 /* 28370 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP 28371 * disabled, therefore we don't dispatch the multicast packet to 28372 * the sending zone. 28373 */ 28374 return (B_FALSE); 28375 } 28376 28377 if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { 28378 /* 28379 * Multicast packet on the loopback interface: we only match 28380 * conns that joined the group in the specified zone.
28381 */ 28382 return (B_FALSE); 28383 } 28384 28385 if (connp->conn_multi_router) { 28386 /* multicast packet and multicast router socket: send up */ 28387 return (B_TRUE); 28388 } 28389 28390 mutex_enter(&connp->conn_lock); 28391 found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); 28392 mutex_exit(&connp->conn_lock); 28393 return (found); 28394 } 28395 28396 static void 28397 conn_setqfull(conn_t *connp) 28398 { 28399 queue_t *q = connp->conn_wq; 28400 28401 if (!(q->q_flag & QFULL)) { 28402 mutex_enter(QLOCK(q)); 28403 if (!(q->q_flag & QFULL)) { 28404 /* still need to set QFULL */ 28405 q->q_flag |= QFULL; 28406 mutex_exit(QLOCK(q)); 28407 } else { 28408 mutex_exit(QLOCK(q)); 28409 } 28410 } 28411 } 28412 28413 static void 28414 conn_clrqfull(conn_t *connp) 28415 { 28416 queue_t *q = connp->conn_wq; 28417 28418 if (q->q_flag & QFULL) { 28419 mutex_enter(QLOCK(q)); 28420 if (q->q_flag & QFULL) { 28421 q->q_flag &= ~QFULL; 28422 mutex_exit(QLOCK(q)); 28423 if (q->q_flag & QWANTW) 28424 qbackenable(q, 0); 28425 } else { 28426 mutex_exit(QLOCK(q)); 28427 } 28428 } 28429 } 28430 28431 /* 28432 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 28433 */ 28434 /* ARGSUSED */ 28435 static void 28436 ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) 28437 { 28438 ill_t *ill = (ill_t *)q->q_ptr; 28439 mblk_t *mp1, *mp2; 28440 ipif_t *ipif; 28441 int err = 0; 28442 conn_t *connp = NULL; 28443 ipsq_t *ipsq; 28444 arc_t *arc; 28445 28446 ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); 28447 28448 ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); 28449 ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); 28450 28451 ASSERT(IAM_WRITER_ILL(ill)); 28452 mp2 = mp->b_cont; 28453 mp->b_cont = NULL; 28454 28455 /* 28456 * We have now received the arp bringup completion message 28457 * from ARP. Mark the arp bringup as done. Also if the arp 28458 * stream has already started closing, send up the AR_ARP_CLOSING 28459 * ack now since ARP is waiting in close for this ack. 28460 */ 28461 mutex_enter(&ill->ill_lock); 28462 ill->ill_arp_bringup_pending = 0; 28463 if (ill->ill_arp_closing) { 28464 mutex_exit(&ill->ill_lock); 28465 /* Let's reuse the mp for sending the ack */ 28466 arc = (arc_t *)mp->b_rptr; 28467 mp->b_wptr = mp->b_rptr + sizeof (arc_t); 28468 arc->arc_cmd = AR_ARP_CLOSING; 28469 qreply(q, mp); 28470 } else { 28471 mutex_exit(&ill->ill_lock); 28472 freeb(mp); 28473 } 28474 28475 ipsq = ill->ill_phyint->phyint_ipsq; 28476 ipif = ipsq->ipsq_xop->ipx_pending_ipif; 28477 mp1 = ipsq_pending_mp_get(ipsq, &connp); 28478 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 28479 if (mp1 == NULL) { 28480 /* bringup was aborted by the user */ 28481 freemsg(mp2); 28482 return; 28483 } 28484 28485 /* 28486 * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we 28487 * must have an associated conn_t. Otherwise, we're bringing this 28488 * interface back up as part of handling an asynchronous event (e.g., 28489 * physical address change). 28490 */ 28491 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { 28492 ASSERT(connp != NULL); 28493 q = CONNP_TO_WQ(connp); 28494 } else { 28495 ASSERT(connp == NULL); 28496 q = ill->ill_rq; 28497 } 28498 28499 /* 28500 * If the DL_BIND_REQ fails, it is noted 28501 * in arc_name_offset. 
28502 */ 28503 err = *((int *)mp2->b_rptr); 28504 if (err == 0) { 28505 if (ipif->ipif_isv6) { 28506 if ((err = ipif_up_done_v6(ipif)) != 0) 28507 ip0dbg(("ip_arp_done: init failed\n")); 28508 } else { 28509 if ((err = ipif_up_done(ipif)) != 0) 28510 ip0dbg(("ip_arp_done: init failed\n")); 28511 } 28512 } else { 28513 ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); 28514 } 28515 28516 freemsg(mp2); 28517 28518 if ((err == 0) && (ill->ill_up_ipifs)) { 28519 err = ill_up_ipifs(ill, q, mp1); 28520 if (err == EINPROGRESS) 28521 return; 28522 } 28523 28524 /* 28525 * If we have a moved ipif to bring up, and everything has succeeded 28526 * to this point, bring it up on the IPMP ill. Otherwise, leave it 28527 * down -- the admin can try to bring it up by hand if need be. 28528 */ 28529 if (ill->ill_move_ipif != NULL) { 28530 ipif = ill->ill_move_ipif; 28531 ill->ill_move_ipif = NULL; 28532 if (err == 0) { 28533 err = ipif_up(ipif, q, mp1); 28534 if (err == EINPROGRESS) 28535 return; 28536 } 28537 } 28538 28539 /* 28540 * The operation must complete without EINPROGRESS since 28541 * ipsq_pending_mp_get() has removed the mblk. Otherwise, the 28542 * operation will be stuck forever in the ipsq. 28543 */ 28544 ASSERT(err != EINPROGRESS); 28545 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) 28546 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 28547 else 28548 ipsq_current_finish(ipsq); 28549 } 28550 28551 /* Allocate the private structure */ 28552 static int 28553 ip_priv_alloc(void **bufp) 28554 { 28555 void *buf; 28556 28557 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL) 28558 return (ENOMEM); 28559 28560 *bufp = buf; 28561 return (0); 28562 } 28563 28564 /* Function to delete the private structure */ 28565 void 28566 ip_priv_free(void *buf) 28567 { 28568 ASSERT(buf != NULL); 28569 kmem_free(buf, sizeof (ip_priv_t)); 28570 } 28571 28572 /* 28573 * The entry point for IPPF processing. 28574 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the 28575 * routine just returns. 28576 * 28577 * When called, ip_process generates an ipp_packet_t structure 28578 * which holds the state information for this packet and invokes the 28579 * classifier (via ipp_packet_process). The classification, depending on 28580 * configured filters, results in a list of actions for this packet. Invoking 28581 * an action may cause the packet to be dropped, in which case the resulting 28582 * mblk (*mpp) is NULL. proc indicates the callout position for 28583 * this packet and ill_index is the interface this packet arrived on or will 28584 * leave on (inbound and outbound, resp.). 28585 */ 28586 void 28587 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 28588 { 28589 mblk_t *mp; 28590 ip_priv_t *priv; 28591 ipp_action_id_t aid; 28592 int rc = 0; 28593 ipp_packet_t *pp; 28594 #define IP_CLASS "ip" 28595 28596 /* If the classifier is not loaded, return */ 28597 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 28598 return; 28599 } 28600 28601 mp = *mpp; 28602 ASSERT(mp != NULL); 28603 28604 /* Allocate the packet structure */ 28605 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 28606 if (rc != 0) { 28607 *mpp = NULL; 28608 freemsg(mp); 28609 return; 28610 } 28611 28612 /* Allocate the private structure */ 28613 rc = ip_priv_alloc((void **)&priv); 28614 if (rc != 0) { 28615 *mpp = NULL; 28616 freemsg(mp); 28617 ipp_packet_free(pp); 28618 return; 28619 } 28620 priv->proc = proc; 28621 priv->ill_index = ill_index; 28622 ipp_packet_set_private(pp, priv, ip_priv_free); 28623 ipp_packet_set_data(pp, mp); 28624 28625 /* Invoke the classifier */ 28626 rc = ipp_packet_process(&pp); 28627 if (pp != NULL) { 28628 mp = ipp_packet_get_data(pp); 28629 ipp_packet_free(pp); 28630 if (rc != 0) { 28631 freemsg(mp); 28632 *mpp = NULL; 28633 } 28634 } else { 28635 *mpp = NULL; 28636 } 28637 #undef IP_CLASS 28638 }
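/*
 * A usage sketch for the callout above, with a hypothetical caller
 * (illustrative, not part of the build). The essential contract is
 * that *mpp may come back NULL when a policy action consumes or drops
 * the packet, so the caller must re-check it; IPP_LOCAL_OUT here
 * stands for whichever ip_proc_t callout position applies:
 */
static void
ipp_callout_example(mblk_t **mpp, uint32_t ifindex)
{
	ip_process(IPP_LOCAL_OUT, mpp, ifindex);
	if (*mpp == NULL)
		return;		/* dropped by an action; nothing to send */
	/* otherwise continue output processing with *mpp */
}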
28639 28640 /* 28641 * Propagate a multicast group membership operation (add/drop) on 28642 * all the interfaces crossed by the related multirt routes. 28643 * The call is considered successful if the operation succeeds 28644 * on at least one interface. 28645 */ 28646 static int 28647 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 28648 uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, 28649 boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, 28650 mblk_t *first_mp) 28651 { 28652 ire_t *ire_gw; 28653 irb_t *irb; 28654 int error = 0; 28655 opt_restart_t *or; 28656 ip_stack_t *ipst = ire->ire_ipst; 28657 28658 irb = ire->ire_bucket; 28659 ASSERT(irb != NULL); 28660 28661 ASSERT(DB_TYPE(first_mp) == M_CTL); 28662 28663 or = (opt_restart_t *)first_mp->b_rptr; 28664 IRB_REFHOLD(irb); 28665 for (; ire != NULL; ire = ire->ire_next) { 28666 if ((ire->ire_flags & RTF_MULTIRT) == 0) 28667 continue; 28668 if (ire->ire_addr != group) 28669 continue; 28670 28671 ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, 28672 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, 28673 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); 28674 /* No resolver exists for the gateway; skip this ire. */ 28675 if (ire_gw == NULL) 28676 continue; 28677 28678 /* 28679 * This function can return EINPROGRESS. If so, the operation 28680 * will be restarted from ip_restart_optmgmt which will 28681 * call ip_opt_set and option processing will restart for 28682 * this option. So we may end up calling 'fn' more than once. 28683 * This requires that 'fn' is idempotent except for the 28684 * return value. The operation is considered a success if 28685 * it succeeds at least once on any one interface. 28686 */ 28687 error = fn(connp, checkonly, group, ire_gw->ire_src_addr, 28688 NULL, fmode, src, first_mp); 28689 if (error == 0) 28690 or->or_private = CGTP_MCAST_SUCCESS; 28691 28692 if (ip_debug > 0) { 28693 ulong_t off; 28694 char *ksym; 28695 ksym = kobj_getsymname((uintptr_t)fn, &off); 28696 ip2dbg(("ip_multirt_apply_membership: " 28697 "called %s, multirt group 0x%08x via itf 0x%08x, " 28698 "error %d [success %u]\n", 28699 ksym ? ksym : "?", 28700 ntohl(group), ntohl(ire_gw->ire_src_addr), 28701 error, or->or_private)); 28702 } 28703 28704 ire_refrele(ire_gw); 28705 if (error == EINPROGRESS) { 28706 IRB_REFRELE(irb); 28707 return (error); 28708 } 28709 } 28710 IRB_REFRELE(irb); 28711 /* 28712 * Consider the call as successful if we succeeded on at least 28713 * one interface. Otherwise, return the last encountered error. 28714 */ 28715 return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); 28716 } 28717 28718 /* 28719 * Issue a warning regarding a route crossing an interface with an 28720 * incorrect MTU. At most one message is logged every 28721 * 'ip_multirt_log_interval' milliseconds. 28722 */ 28723 static void 28724 ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) 28725 { 28726 hrtime_t current = gethrtime(); 28727 char buf[INET_ADDRSTRLEN]; 28728 ip_stack_t *ipst = ire->ire_ipst; 28729 28730 /* Convert interval in ms to hrtime in ns */ 28731 if (ipst->ips_multirt_bad_mtu_last_time + 28732 ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= 28733 current) { 28734 cmn_err(CE_WARN, "ip: ignoring multiroute " 28735 "to %s, incorrect MTU %u (expected %u)\n", 28736 ip_dot_addr(ire->ire_addr, buf), 28737 ire->ire_max_frag, max_frag); 28738 28739 ipst->ips_multirt_bad_mtu_last_time = current; 28740 } 28741 } 28742 28743 /* 28744 * Get the CGTP (multirouting) filtering status. 28745 * If 0, the CGTP hooks are transparent. 28746 */ 28747 /* ARGSUSED */ 28748 static int 28749 ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 28750 { 28751 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 28752 28753 (void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value); 28754 return (0); 28755 } 28756 28757 /* 28758 * Set the CGTP (multirouting) filtering status. 28759 * If the status is changed from active to transparent 28760 * or from transparent to active, forward the new status 28761 * to the filtering module (if loaded). 28762 */ 28763 /* ARGSUSED */ 28764 static int 28765 ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 28766 cred_t *ioc_cr) 28767 { 28768 long new_value; 28769 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 28770 ip_stack_t *ipst = CONNQ_TO_IPST(q); 28771 28772 if (secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 28773 return (EPERM); 28774 28775 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 28776 new_value < 0 || new_value > 1) { 28777 return (EINVAL); 28778 } 28779 28780 if ((!*ip_cgtp_filter_value) && new_value) { 28781 cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s", 28782 ipst->ips_ip_cgtp_filter_ops == NULL ? 28783 " (module not loaded)" : ""); 28784 } 28785 if (*ip_cgtp_filter_value && (!new_value)) { 28786 cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s", 28787 ipst->ips_ip_cgtp_filter_ops == NULL ? 28788 " (module not loaded)" : ""); 28789 } 28790 28791 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 28792 int res; 28793 netstackid_t stackid; 28794 28795 stackid = ipst->ips_netstack->netstack_stackid; 28796 res = ipst->ips_ip_cgtp_filter_ops->cfo_change_state(stackid, 28797 new_value); 28798 if (res) 28799 return (res); 28800 } 28801 28802 *ip_cgtp_filter_value = (boolean_t)new_value; 28803 28804 return (0); 28805 } 28806 28807 /* 28808 * Return the expected CGTP hooks version number. 28809 */ 28810 int 28811 ip_cgtp_filter_supported(void) 28812 { 28813 return (ip_cgtp_filter_rev); 28814 } 28815
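/*
 * A sketch of the registration handshake from a filtering module's
 * point of view (hypothetical wrapper, not part of the build). The
 * ops vector's cfo_filter_rev must equal CGTP_FILTER_REV or the call
 * below fails with ENOTSUP; a second registration on the same stack
 * fails with EALREADY:
 */
static int
cgtp_hooks_attach(netstackid_t stackid, cgtp_filter_ops_t *ops)
{
	int err = ip_cgtp_filter_register(stackid, ops);

	if (err != 0)
		return (err);	/* ENOTSUP, EINVAL or EALREADY */
	/* hooks are live; detach later via ip_cgtp_filter_unregister() */
	return (0);
}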
28816 /* 28817 * CGTP hooks can be registered by invoking this function. 28818 * Checks that the version number matches. 28819 */ 28820 int 28821 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) 28822 { 28823 netstack_t *ns; 28824 ip_stack_t *ipst; 28825 28826 if (ops->cfo_filter_rev != CGTP_FILTER_REV) 28827 return (ENOTSUP); 28828 28829 ns = netstack_find_by_stackid(stackid); 28830 if (ns == NULL) 28831 return (EINVAL); 28832 ipst = ns->netstack_ip; 28833 ASSERT(ipst != NULL); 28834 28835 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 28836 netstack_rele(ns); 28837 return (EALREADY); 28838 } 28839 28840 ipst->ips_ip_cgtp_filter_ops = ops; 28841 netstack_rele(ns); 28842 return (0); 28843 } 28844 28845 /* 28846 * CGTP hooks can be unregistered by invoking this function. 28847 * Returns ENXIO if there was no registration. 28848 * Returns EBUSY if the ndd variable has not been turned off. 28849 */ 28850 int 28851 ip_cgtp_filter_unregister(netstackid_t stackid) 28852 { 28853 netstack_t *ns; 28854 ip_stack_t *ipst; 28855 28856 ns = netstack_find_by_stackid(stackid); 28857 if (ns == NULL) 28858 return (EINVAL); 28859 ipst = ns->netstack_ip; 28860 ASSERT(ipst != NULL); 28861 28862 if (ipst->ips_ip_cgtp_filter) { 28863 netstack_rele(ns); 28864 return (EBUSY); 28865 } 28866 28867 if (ipst->ips_ip_cgtp_filter_ops == NULL) { 28868 netstack_rele(ns); 28869 return (ENXIO); 28870 } 28871 ipst->ips_ip_cgtp_filter_ops = NULL; 28872 netstack_rele(ns); 28873 return (0); 28874 } 28875 28876 /* 28877 * Check whether there is a CGTP filter registration. 28878 * Returns non-zero if there is a registration, otherwise returns zero. 28879 * Note: returns zero if the stackid is bad. 28880 */ 28881 int 28882 ip_cgtp_filter_is_registered(netstackid_t stackid) 28883 { 28884 netstack_t *ns; 28885 ip_stack_t *ipst; 28886 int ret; 28887 28888 ns = netstack_find_by_stackid(stackid); 28889 if (ns == NULL) 28890 return (0); 28891 ipst = ns->netstack_ip; 28892 ASSERT(ipst != NULL); 28893 28894 if (ipst->ips_ip_cgtp_filter_ops != NULL) 28895 ret = 1; 28896 else 28897 ret = 0; 28898 28899 netstack_rele(ns); 28900 return (ret); 28901 } 28902 28903 static int 28904 ip_squeue_switch(int val) 28905 { 28906 int rval = SQ_FILL; 28907 28908 switch (val) { 28909 case IP_SQUEUE_ENTER_NODRAIN: 28910 rval = SQ_NODRAIN; 28911 break; 28912 case IP_SQUEUE_ENTER: 28913 rval = SQ_PROCESS; 28914 break; 28915 default: 28916 break; 28917 } 28918 return (rval); 28919 } 28920 28921 /* ARGSUSED */ 28922 static int 28923 ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 28924 caddr_t addr, cred_t *cr) 28925 { 28926 int *v = (int *)addr; 28927 long new_value; 28928 28929 if (secpolicy_net_config(cr, B_FALSE) != 0) 28930 return (EPERM); 28931 28932 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 28933 return (EINVAL); 28934 28935 ip_squeue_flag = ip_squeue_switch(new_value); 28936 *v = new_value; 28937 return (0); 28938 } 28939 28940 /* 28941 * Handle ndd set of variables which require PRIV_SYS_NET_CONFIG such as 28942 * ip_debug.
28943 */ 28944 /* ARGSUSED */ 28945 static int 28946 ip_int_set(queue_t *q, mblk_t *mp, char *value, 28947 caddr_t addr, cred_t *cr) 28948 { 28949 int *v = (int *)addr; 28950 long new_value; 28951 28952 if (secpolicy_net_config(cr, B_FALSE) != 0) 28953 return (EPERM); 28954 28955 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 28956 return (EINVAL); 28957 28958 *v = new_value; 28959 return (0); 28960 } 28961 28962 static void * 28963 ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) 28964 { 28965 kstat_t *ksp; 28966 28967 ip_stat_t template = { 28968 { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, 28969 { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, 28970 { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, 28971 { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, 28972 { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, 28973 { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, 28974 { "ip_udp_input_err", KSTAT_DATA_UINT64 }, 28975 { "ip_tcppullup", KSTAT_DATA_UINT64 }, 28976 { "ip_tcpoptions", KSTAT_DATA_UINT64 }, 28977 { "ip_multipkttcp", KSTAT_DATA_UINT64 }, 28978 { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, 28979 { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, 28980 { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, 28981 { "ip_db_ref", KSTAT_DATA_UINT64 }, 28982 { "ip_notaligned1", KSTAT_DATA_UINT64 }, 28983 { "ip_notaligned2", KSTAT_DATA_UINT64 }, 28984 { "ip_multimblk3", KSTAT_DATA_UINT64 }, 28985 { "ip_multimblk4", KSTAT_DATA_UINT64 }, 28986 { "ip_ipoptions", KSTAT_DATA_UINT64 }, 28987 { "ip_classify_fail", KSTAT_DATA_UINT64 }, 28988 { "ip_opt", KSTAT_DATA_UINT64 }, 28989 { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, 28990 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, 28991 { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, 28992 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, 28993 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, 28994 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, 28995 { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, 28996 { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, 28997 { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, 28998 { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, 28999 { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, 29000 { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, 29001 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 29002 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 29003 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 29004 { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 29005 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 29006 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 29007 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 29008 { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 29009 { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, 29010 { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, 29011 { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, 29012 { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 29013 { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, 29014 }; 29015 29016 ksp = kstat_create_netstack("ip", 0, "ipstat", "net", 29017 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 29018 KSTAT_FLAG_VIRTUAL, stackid); 29019 29020 if (ksp == NULL) 29021 return (NULL); 29022 29023 bcopy(&template, ip_statisticsp, sizeof (template)); 29024 ksp->ks_data = (void *)ip_statisticsp; 29025 ksp->ks_private = (void *)(uintptr_t)stackid; 29026 29027 kstat_install(ksp); 29028 return (ksp); 29029 } 29030 29031 static void 29032 ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp) 29033 { 29034 if (ksp != NULL) { 29035 ASSERT(stackid == 
(netstackid_t)(uintptr_t)ksp->ks_private); 29036 kstat_delete_netstack(ksp, stackid); 29037 } 29038 } 29039 29040 static void * 29041 ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst) 29042 { 29043 kstat_t *ksp; 29044 29045 ip_named_kstat_t template = { 29046 { "forwarding", KSTAT_DATA_UINT32, 0 }, 29047 { "defaultTTL", KSTAT_DATA_UINT32, 0 }, 29048 { "inReceives", KSTAT_DATA_UINT64, 0 }, 29049 { "inHdrErrors", KSTAT_DATA_UINT32, 0 }, 29050 { "inAddrErrors", KSTAT_DATA_UINT32, 0 }, 29051 { "forwDatagrams", KSTAT_DATA_UINT64, 0 }, 29052 { "inUnknownProtos", KSTAT_DATA_UINT32, 0 }, 29053 { "inDiscards", KSTAT_DATA_UINT32, 0 }, 29054 { "inDelivers", KSTAT_DATA_UINT64, 0 }, 29055 { "outRequests", KSTAT_DATA_UINT64, 0 }, 29056 { "outDiscards", KSTAT_DATA_UINT32, 0 }, 29057 { "outNoRoutes", KSTAT_DATA_UINT32, 0 }, 29058 { "reasmTimeout", KSTAT_DATA_UINT32, 0 }, 29059 { "reasmReqds", KSTAT_DATA_UINT32, 0 }, 29060 { "reasmOKs", KSTAT_DATA_UINT32, 0 }, 29061 { "reasmFails", KSTAT_DATA_UINT32, 0 }, 29062 { "fragOKs", KSTAT_DATA_UINT32, 0 }, 29063 { "fragFails", KSTAT_DATA_UINT32, 0 }, 29064 { "fragCreates", KSTAT_DATA_UINT32, 0 }, 29065 { "addrEntrySize", KSTAT_DATA_INT32, 0 }, 29066 { "routeEntrySize", KSTAT_DATA_INT32, 0 }, 29067 { "netToMediaEntrySize", KSTAT_DATA_INT32, 0 }, 29068 { "routingDiscards", KSTAT_DATA_UINT32, 0 }, 29069 { "inErrs", KSTAT_DATA_UINT32, 0 }, 29070 { "noPorts", KSTAT_DATA_UINT32, 0 }, 29071 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 29072 { "reasmDuplicates", KSTAT_DATA_UINT32, 0 }, 29073 { "reasmPartDups", KSTAT_DATA_UINT32, 0 }, 29074 { "forwProhibits", KSTAT_DATA_UINT32, 0 }, 29075 { "udpInCksumErrs", KSTAT_DATA_UINT32, 0 }, 29076 { "udpInOverflows", KSTAT_DATA_UINT32, 0 }, 29077 { "rawipInOverflows", KSTAT_DATA_UINT32, 0 }, 29078 { "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 }, 29079 { "ipsecInFailed", KSTAT_DATA_INT32, 0 }, 29080 { "memberEntrySize", KSTAT_DATA_INT32, 0 }, 29081 { "inIPv6", KSTAT_DATA_UINT32, 0 }, 29082 { "outIPv6", KSTAT_DATA_UINT32, 0 }, 29083 { "outSwitchIPv6", KSTAT_DATA_UINT32, 0 }, 29084 }; 29085 29086 ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED, 29087 NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid); 29088 if (ksp == NULL || ksp->ks_data == NULL) 29089 return (NULL); 29090 29091 template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 
1:2; 29092 template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl; 29093 template.reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout; 29094 template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t); 29095 template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t); 29096 29097 template.netToMediaEntrySize.value.i32 = 29098 sizeof (mib2_ipNetToMediaEntry_t); 29099 29100 template.memberEntrySize.value.i32 = sizeof (ipv6_member_t); 29101 29102 bcopy(&template, ksp->ks_data, sizeof (template)); 29103 ksp->ks_update = ip_kstat_update; 29104 ksp->ks_private = (void *)(uintptr_t)stackid; 29105 29106 kstat_install(ksp); 29107 return (ksp); 29108 } 29109 29110 static void 29111 ip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 29112 { 29113 if (ksp != NULL) { 29114 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 29115 kstat_delete_netstack(ksp, stackid); 29116 } 29117 } 29118 29119 static int 29120 ip_kstat_update(kstat_t *kp, int rw) 29121 { 29122 ip_named_kstat_t *ipkp; 29123 mib2_ipIfStatsEntry_t ipmib; 29124 ill_walk_context_t ctx; 29125 ill_t *ill; 29126 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private; 29127 netstack_t *ns; 29128 ip_stack_t *ipst; 29129 29130 if (kp == NULL || kp->ks_data == NULL) 29131 return (EIO); 29132 29133 if (rw == KSTAT_WRITE) 29134 return (EACCES); 29135 29136 ns = netstack_find_by_stackid(stackid); 29137 if (ns == NULL) 29138 return (-1); 29139 ipst = ns->netstack_ip; 29140 if (ipst == NULL) { 29141 netstack_rele(ns); 29142 return (-1); 29143 } 29144 ipkp = (ip_named_kstat_t *)kp->ks_data; 29145 29146 bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib)); 29147 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 29148 ill = ILL_START_WALK_V4(&ctx, ipst); 29149 for (; ill != NULL; ill = ill_next(&ctx, ill)) 29150 ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib); 29151 rw_exit(&ipst->ips_ill_g_lock); 29152 29153 ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding; 29154 ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL; 29155 ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives; 29156 ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors; 29157 ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors; 29158 ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams; 29159 ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos; 29160 ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards; 29161 ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers; 29162 ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests; 29163 ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards; 29164 ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes; 29165 ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout; 29166 ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds; 29167 ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs; 29168 ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails; 29169 ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs; 29170 ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails; 29171 ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates; 29172 29173 ipkp->routingDiscards.value.ui32 = 0; 29174 ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs; 29175 ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts; 29176 ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs; 29177 ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates; 29178 ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups; 29179 ipkp->forwProhibits.value.ui32 = 
static void *
icmp_kstat_init(netstackid_t stackid)
{
	kstat_t	*ksp;

	icmp_named_kstat_t template = {
		{ "inMsgs",		KSTAT_DATA_UINT32 },
		{ "inErrors",		KSTAT_DATA_UINT32 },
		{ "inDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "inTimeExcds",	KSTAT_DATA_UINT32 },
		{ "inParmProbs",	KSTAT_DATA_UINT32 },
		{ "inSrcQuenchs",	KSTAT_DATA_UINT32 },
		{ "inRedirects",	KSTAT_DATA_UINT32 },
		{ "inEchos",		KSTAT_DATA_UINT32 },
		{ "inEchoReps",		KSTAT_DATA_UINT32 },
		{ "inTimestamps",	KSTAT_DATA_UINT32 },
		{ "inTimestampReps",	KSTAT_DATA_UINT32 },
		{ "inAddrMasks",	KSTAT_DATA_UINT32 },
		{ "inAddrMaskReps",	KSTAT_DATA_UINT32 },
		{ "outMsgs",		KSTAT_DATA_UINT32 },
		{ "outErrors",		KSTAT_DATA_UINT32 },
		{ "outDestUnreachs",	KSTAT_DATA_UINT32 },
		{ "outTimeExcds",	KSTAT_DATA_UINT32 },
		{ "outParmProbs",	KSTAT_DATA_UINT32 },
		{ "outSrcQuenchs",	KSTAT_DATA_UINT32 },
		{ "outRedirects",	KSTAT_DATA_UINT32 },
		{ "outEchos",		KSTAT_DATA_UINT32 },
		{ "outEchoReps",	KSTAT_DATA_UINT32 },
		{ "outTimestamps",	KSTAT_DATA_UINT32 },
		{ "outTimestampReps",	KSTAT_DATA_UINT32 },
		{ "outAddrMasks",	KSTAT_DATA_UINT32 },
		{ "outAddrMaskReps",	KSTAT_DATA_UINT32 },
		{ "inChksumErrs",	KSTAT_DATA_UINT32 },
		{ "inUnknowns",		KSTAT_DATA_UINT32 },
		{ "inFragNeeded",	KSTAT_DATA_UINT32 },
		{ "outFragNeeded",	KSTAT_DATA_UINT32 },
		{ "outDrops",		KSTAT_DATA_UINT32 },
		{ "inOverFlows",	KSTAT_DATA_UINT32 },
		{ "inBadRedirects",	KSTAT_DATA_UINT32 },
	};

	ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
	if (ksp == NULL || ksp->ks_data == NULL)
		return (NULL);

	bcopy(&template, ksp->ks_data, sizeof (template));

	ksp->ks_update = icmp_kstat_update;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}

/*
 * ks_update callback for the "icmp" mib2 kstat.
 */
static int
icmp_kstat_update(kstat_t *kp, int rw)
{
	icmp_named_kstat_t *icmpkp;
	netstackid_t	stackid;
	netstack_t	*ns;
	ip_stack_t	*ipst;

	if (kp == NULL || kp->ks_data == NULL)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	stackid = (netstackid_t)(uintptr_t)kp->ks_private;
	ns = netstack_find_by_stackid(stackid);
	if (ns == NULL)
		return (-1);
	ipst = ns->netstack_ip;
	if (ipst == NULL) {
		netstack_rele(ns);
		return (-1);
	}
	icmpkp = (icmp_named_kstat_t *)kp->ks_data;

	icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs;
	icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors;
	icmpkp->inDestUnreachs.value.ui32 =
	    ipst->ips_icmp_mib.icmpInDestUnreachs;
	icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds;
	icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs;
	icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs;
	icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects;
	icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos;
	icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps;
	icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps;
	icmpkp->inTimestampReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpInTimestampReps;
	icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks;
	icmpkp->inAddrMaskReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpInAddrMaskReps;
	icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs;
	icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors;
	icmpkp->outDestUnreachs.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutDestUnreachs;
	icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds;
	icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs;
	icmpkp->outSrcQuenchs.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutSrcQuenchs;
	icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects;
	icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos;
	icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps;
	icmpkp->outTimestamps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutTimestamps;
	icmpkp->outTimestampReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutTimestampReps;
	icmpkp->outAddrMasks.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutAddrMasks;
	icmpkp->outAddrMaskReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutAddrMaskReps;
	icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs;
	icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns;
	icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded;
	icmpkp->outFragNeeded.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutFragNeeded;
	icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops;
	icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows;
	icmpkp->inBadRedirects.value.ui32 =
	    ipst->ips_icmp_mib.icmpInBadRedirects;

	netstack_rele(ns);
	return (0);
}

/*
 * This is the fanout function for raw socket opened for SCTP. Note
 * that it is called after SCTP checks that there is no socket which
 * wants a packet. Then before SCTP handles this out of the blue packet,
 * this function is called to see if there is any raw socket for SCTP.
 * If there is and it is bound to the correct address, the packet will
 * be sent to that socket. Note that only one raw socket can be bound to
 * a port. This is assured in ipcl_sctp_hash_insert().
 */
void
ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4,
    uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy,
    zoneid_t zoneid)
{
	conn_t		*connp;
	queue_t		*rq;
	mblk_t		*first_mp;
	boolean_t	secure;
	ip6_t		*ip6h;
	ip_stack_t	*ipst = recv_ill->ill_ipst;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
	sctp_stack_t	*sctps = ipst->ips_netstack->netstack_sctp;
	boolean_t	sctp_csum_err = B_FALSE;

	if (flags & IP_FF_SCTP_CSUM_ERR) {
		sctp_csum_err = B_TRUE;
		flags &= ~IP_FF_SCTP_CSUM_ERR;
	}

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	ip6h = (isv4) ? NULL : (ip6_t *)ipha;

	connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst);
	if (connp == NULL) {
		/*
		 * Although raw sctp is not summed, OOB chunks must be.
		 * Drop the packet here if the sctp checksum failed.
		 */
		if (sctp_csum_err) {
			BUMP_MIB(&sctps->sctps_mib, sctpChecksumError);
			freemsg(first_mp);
			return;
		}
		sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present);
		return;
	}
	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		CONN_DEC_REF(connp);
		BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows);
		freemsg(first_mp);
		return;
	}
	if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp,
		    (isv4 ? ipha : NULL), ip6h, mctl_present);
		if (first_mp == NULL) {
			BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
			CONN_DEC_REF(connp);
			return;
		}
	}
	/*
	 * We probably should not send M_CTL message up to
	 * raw socket.
	 */
	if (mctl_present)
		freeb(first_mp);

	/* Initiate IPPF processing here if needed. */
	if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) ||
	    (!isv4 && IP6_IN_IPP(flags, ipst))) {
		ip_process(IPP_LOCAL_IN, &mp,
		    recv_ill->ill_phyint->phyint_ifindex);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
	}

	if (connp->conn_recvif || connp->conn_recvslla ||
	    ((connp->conn_ip_recvpktinfo ||
	    (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) &&
	    (flags & IP_FF_IPINFO))) {
		int in_flags = 0;

		/*
		 * Since sctp does not support IP_RECVPKTINFO for v4, only pass
		 * IPF_RECVIF.
		 */
		if (connp->conn_recvif || connp->conn_ip_recvpktinfo) {
			in_flags = IPF_RECVIF;
		}
		if (connp->conn_recvslla) {
			in_flags |= IPF_RECVSLLA;
		}
		if (isv4) {
			mp = ip_add_info(mp, recv_ill, in_flags,
			    IPCL_ZONEID(connp), ipst);
		} else {
			mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst);
			if (mp == NULL) {
				BUMP_MIB(recv_ill->ill_ip_mib,
				    ipIfStatsInDiscards);
				CONN_DEC_REF(connp);
				return;
			}
		}
	}

	BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
	/*
	 * We are sending the IPSEC_IN message also up. Refer
	 * to comments above this function.
	 * This is the SOCK_RAW, IPPROTO_SCTP case.
	 */
	(connp->conn_recv)(connp, mp, NULL);
	CONN_DEC_REF(connp);
}

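/*
 * Illustrative sketch (not part of this file): the raw SCTP socket that
 * this fanout delivers to would be created from userland with the
 * standard sockets API, roughly as below. Privilege requirements and
 * error handling are omitted, and the port number is an arbitrary
 * example.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_SCTP);
 *	struct sockaddr_in sin;
 *
 *	(void) memset(&sin, 0, sizeof (sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(9999);
 *	sin.sin_addr.s_addr = INADDR_ANY;
 *	(void) bind(fd, (struct sockaddr *)&sin, sizeof (sin));
 */
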
#define	UPDATE_IP_MIB_OB_COUNTERS(ill, len)				\
{									\
	BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits);		\
	UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len));	\
}

/*
 * This function should be called only if all packet processing
 * including fragmentation is complete. Callers of this function
 * must set mp->b_prev to one of these values:
 *	{0, IPP_FWD_OUT, IPP_LOCAL_OUT}
 * prior to handing over the mp as first argument to this function.
 *
 * If the ire passed by caller is incomplete, this function
 * queues the packet and if necessary, sends ARP request and bails.
 * If the ire passed is fully resolved, we simply prepend
 * the link-layer header to the packet, do ipsec hw acceleration
 * work if necessary, and send the packet out on the wire.
 *
 * NOTE: IPsec will only call this function with fully resolved
 * ires if hw acceleration is involved.
 * TODO list :
 *	a Handle M_MULTIDATA so that
 *	  tcp_multisend->tcp_multisend_data can
 *	  call ip_xmit_v4 directly
 *	b Handle post-ARP work for fragments so that
 *	  ip_wput_frag can call this function.
 */
ipxmit_state_t
ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io,
    boolean_t flow_ctl_enabled, conn_t *connp)
{
	nce_t		*arpce;
	ipha_t		*ipha;
	queue_t		*q;
	int		ill_index;
	mblk_t		*nxt_mp, *first_mp;
	boolean_t	xmit_drop = B_FALSE;
	ip_proc_t	proc;
	ill_t		*out_ill;
	int		pkt_len;

	arpce = ire->ire_nce;
	ASSERT(arpce != NULL);

	DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire,  nce_t *, arpce);

	mutex_enter(&arpce->nce_lock);
	switch (arpce->nce_state) {
	case ND_REACHABLE:
		/* If there are other queued packets, queue this packet */
		if (arpce->nce_qd_mp != NULL) {
			if (mp != NULL)
				nce_queue_mp_common(arpce, mp, B_FALSE);
			mp = arpce->nce_qd_mp;
		}
		arpce->nce_qd_mp = NULL;
		mutex_exit(&arpce->nce_lock);

		/*
		 * Flush the queue. In the common case, where the
		 * ARP is already resolved, it will go through the
		 * while loop only once.
		 */
		while (mp != NULL) {

			nxt_mp = mp->b_next;
			mp->b_next = NULL;
			ASSERT(mp->b_datap->db_type != M_CTL);
			pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length);
			/*
			 * This info is needed for IPQOS to do COS marking
			 * in ip_wput_attach_llhdr->ip_process.
			 */
			proc = (ip_proc_t)(uintptr_t)mp->b_prev;
			mp->b_prev = NULL;

			/* set up ill index for outbound qos processing */
			out_ill = ire_to_ill(ire);
			ill_index = out_ill->ill_phyint->phyint_ifindex;
			first_mp = ip_wput_attach_llhdr(mp, ire, proc,
			    ill_index, &ipha);
			if (first_mp == NULL) {
				xmit_drop = B_TRUE;
				BUMP_MIB(out_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				goto next_mp;
			}

			/* non-ipsec hw accel case */
			if (io == NULL || !io->ipsec_out_accelerated) {
				/* send it */
				q = ire->ire_stq;
				if (proc == IPP_FWD_OUT) {
					UPDATE_IB_PKT_COUNT(ire);
				} else {
					UPDATE_OB_PKT_COUNT(ire);
				}
				ire->ire_last_used_time = lbolt;

				if (flow_ctl_enabled || canputnext(q)) {
					if (proc == IPP_FWD_OUT) {

						BUMP_MIB(out_ill->ill_ip_mib,
						    ipIfStatsHCOutForwDatagrams);

					}
					UPDATE_IP_MIB_OB_COUNTERS(out_ill,
					    pkt_len);

					DTRACE_IP7(send, mblk_t *, first_mp,
					    conn_t *, NULL, void_ip_t *, ipha,
					    __dtrace_ipsr_ill_t *, out_ill,
					    ipha_t *, ipha, ip6_t *, NULL, int,
					    0);

					ILL_SEND_TX(out_ill,
					    ire, connp, first_mp, 0, connp);
				} else {
					BUMP_MIB(out_ill->ill_ip_mib,
					    ipIfStatsOutDiscards);
					xmit_drop = B_TRUE;
					freemsg(first_mp);
				}
			} else {
				/*
				 * Safety Pup says: make sure this
				 * is going to the right interface!
				 */
				ill_t *ill1 =
				    (ill_t *)ire->ire_stq->q_ptr;
				int ifindex =
				    ill1->ill_phyint->phyint_ifindex;
				if (ifindex !=
				    io->ipsec_out_capab_ill_index) {
					xmit_drop = B_TRUE;
					freemsg(mp);
				} else {
					UPDATE_IP_MIB_OB_COUNTERS(ill1,
					    pkt_len);

					DTRACE_IP7(send, mblk_t *, first_mp,
					    conn_t *, NULL, void_ip_t *, ipha,
					    __dtrace_ipsr_ill_t *, ill1,
					    ipha_t *, ipha, ip6_t *, NULL,
					    int, 0);

					ipsec_hw_putnext(ire->ire_stq, mp);
				}
			}
next_mp:
			mp = nxt_mp;
		} /* while (mp != NULL) */
		if (xmit_drop)
			return (SEND_FAILED);
		else
			return (SEND_PASSED);

	case ND_INITIAL:
	case ND_INCOMPLETE:

		/*
		 * While we do send off packets to dests that
		 * use fully-resolved CGTP routes, we do not
		 * handle unresolved CGTP routes.
		 */
		ASSERT(!(ire->ire_flags & RTF_MULTIRT));
		ASSERT(io == NULL || !io->ipsec_out_accelerated);

		if (mp != NULL) {
			/* queue the packet */
			nce_queue_mp_common(arpce, mp, B_FALSE);
		}

		if (arpce->nce_state == ND_INCOMPLETE) {
			mutex_exit(&arpce->nce_lock);
			DTRACE_PROBE3(ip__xmit__incomplete,
			    (ire_t *), ire, (mblk_t *), mp,
			    (ipsec_out_t *), io);
			return (LOOKUP_IN_PROGRESS);
		}

		arpce->nce_state = ND_INCOMPLETE;
		mutex_exit(&arpce->nce_lock);

		/*
		 * Note that ire_add() (called from ire_forward())
		 * holds a ref on the ire until ARP is completed.
		 */
		ire_arpresolve(ire);
		return (LOOKUP_IN_PROGRESS);
	default:
		ASSERT(0);
		mutex_exit(&arpce->nce_lock);
		return (LLHDR_RESLV_FAILED);
	}
}

#undef	UPDATE_IP_MIB_OB_COUNTERS

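/*
 * Illustrative sketch (not part of this file): a caller of ip_xmit_v4()
 * is expected to dispatch on the ipxmit_state_t return value roughly as
 * below. SEND_PASSED means all packets were handed to the wire,
 * LOOKUP_IN_PROGRESS means the packet was queued pending ARP resolution,
 * and SEND_FAILED/LLHDR_RESLV_FAILED mean at least one packet was
 * dropped. "handle_xmit_failure" is a hypothetical helper.
 *
 *	switch (ip_xmit_v4(mp, ire, io, B_FALSE, connp)) {
 *	case SEND_PASSED:
 *		break;
 *	case LOOKUP_IN_PROGRESS:
 *		break;
 *	case SEND_FAILED:
 *	case LLHDR_RESLV_FAILED:
 *		handle_xmit_failure(ire);
 *		break;
 *	}
 */
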
/*
 * Return B_TRUE if the buffers differ in length or content.
 * This is used for comparing extension header buffers.
 * Note that an extension header would be declared different
 * even if all that changed was the next header value in that header,
 * i.e., what really changed is the next extension header.
 */
boolean_t
ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
    uint_t blen)
{
	if (!b_valid)
		blen = 0;

	if (alen != blen)
		return (B_TRUE);
	if (alen == 0)
		return (B_FALSE);	/* Both zero length */
	return (bcmp(abuf, bbuf, alen));
}

/*
 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
 * Return B_FALSE if memory allocation fails - don't change any state!
 */
boolean_t
ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	void *dst;

	if (!src_valid)
		srclen = 0;

	ASSERT(*dstlenp == 0);
	if (src != NULL && srclen != 0) {
		dst = mi_alloc(srclen, BPRI_MED);
		if (dst == NULL)
			return (B_FALSE);
	} else {
		dst = NULL;
	}
	if (*dstp != NULL)
		mi_free(*dstp);
	*dstp = dst;
	*dstlenp = dst == NULL ? 0 : srclen;
	return (B_TRUE);
}

/*
 * Replace what is in *dst, *dstlen with the source.
 * Assumes ip_allocbuf has already been called.
 */
void
ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	if (!src_valid)
		srclen = 0;

	ASSERT(*dstlenp == srclen);
	if (src != NULL && srclen != 0)
		bcopy(src, *dstp, srclen);
}

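/*
 * Illustrative sketch (not part of this file): ip_cmpbuf(), ip_allocbuf()
 * and ip_savebuf() are meant to be used together, so that a buffer is only
 * replaced when the contents actually changed, and an allocation failure
 * leaves the old state untouched. "newbuf" and "newlen" are hypothetical
 * names for the incoming data, in a caller that returns an errno.
 *
 *	if (ip_cmpbuf(oldbuf, oldlen, B_TRUE, newbuf, newlen)) {
 *		void *tmp = NULL;
 *		uint_t tmplen = 0;
 *
 *		if (!ip_allocbuf(&tmp, &tmplen, B_TRUE, newbuf, newlen))
 *			return (ENOMEM);
 *		ip_savebuf(&tmp, &tmplen, B_TRUE, newbuf, newlen);
 *	}
 */
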
/*
 * Free the storage pointed to by the members of an ip6_pkt_t.
 */
void
ip6_pkt_free(ip6_pkt_t *ipp)
{
	ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU));

	if (ipp->ipp_fields & IPPF_HOPOPTS) {
		kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
		ipp->ipp_hopopts = NULL;
		ipp->ipp_hopoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
		kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
		ipp->ipp_rtdstopts = NULL;
		ipp->ipp_rtdstoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_DSTOPTS) {
		kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
		ipp->ipp_dstopts = NULL;
		ipp->ipp_dstoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_RTHDR) {
		kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
		ipp->ipp_rthdr = NULL;
		ipp->ipp_rthdrlen = 0;
	}
	ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
	    IPPF_RTHDR);
}

zoneid_t
ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst,
    zoneid_t lookup_zoneid)
{
	ire_t		*ire;
	int		ire_flags = MATCH_IRE_TYPE;
	zoneid_t	zoneid = ALL_ZONES;

	if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE))
		return (ALL_ZONES);

	if (lookup_zoneid != ALL_ZONES)
		ire_flags |= MATCH_IRE_ZONEONLY;
	ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, NULL,
	    lookup_zoneid, NULL, ire_flags, ipst);
	if (ire != NULL) {
		zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
		ire_refrele(ire);
	}
	return (zoneid);
}

zoneid_t
ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
    ip_stack_t *ipst, zoneid_t lookup_zoneid)
{
	ire_t		*ire;
	int		ire_flags = MATCH_IRE_TYPE;
	zoneid_t	zoneid = ALL_ZONES;
	ipif_t		*ipif_arg = NULL;

	if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE))
		return (ALL_ZONES);

	if (IN6_IS_ADDR_LINKLOCAL(addr)) {
		ire_flags |= MATCH_IRE_ILL;
		ipif_arg = ill->ill_ipif;
	}
	if (lookup_zoneid != ALL_ZONES)
		ire_flags |= MATCH_IRE_ZONEONLY;
	ire = ire_ctable_lookup_v6(addr, NULL, IRE_LOCAL | IRE_LOOPBACK,
	    ipif_arg, lookup_zoneid, NULL, ire_flags, ipst);
	if (ire != NULL) {
		zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
		ire_refrele(ire);
	}
	return (zoneid);
}

/*
 * IP observability hook support functions.
 */

static void
ipobs_init(ip_stack_t *ipst)
{
	ipst->ips_ipobs_enabled = B_FALSE;
	list_create(&ipst->ips_ipobs_cb_list, sizeof (ipobs_cb_t),
	    offsetof(ipobs_cb_t, ipobs_cbnext));
	mutex_init(&ipst->ips_ipobs_cb_lock, NULL, MUTEX_DEFAULT, NULL);
	ipst->ips_ipobs_cb_nwalkers = 0;
	cv_init(&ipst->ips_ipobs_cb_cv, NULL, CV_DRIVER, NULL);
}

static void
ipobs_fini(ip_stack_t *ipst)
{
	ipobs_cb_t	*cb;

	mutex_enter(&ipst->ips_ipobs_cb_lock);
	while (ipst->ips_ipobs_cb_nwalkers != 0)
		cv_wait(&ipst->ips_ipobs_cb_cv, &ipst->ips_ipobs_cb_lock);

	while ((cb = list_head(&ipst->ips_ipobs_cb_list)) != NULL) {
		list_remove(&ipst->ips_ipobs_cb_list, cb);
		kmem_free(cb, sizeof (*cb));
	}
	list_destroy(&ipst->ips_ipobs_cb_list);
	mutex_exit(&ipst->ips_ipobs_cb_lock);
	mutex_destroy(&ipst->ips_ipobs_cb_lock);
	cv_destroy(&ipst->ips_ipobs_cb_cv);
}

void
ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
    const ill_t *ill, int ipver, uint32_t hlen, ip_stack_t *ipst)
{
	mblk_t			*mp2;
	ipobs_cb_t		*ipobs_cb;
	ipobs_hook_data_t	*ihd;
	uint64_t		grifindex = 0;

	ASSERT(DB_TYPE(mp) == M_DATA);

	if (IS_UNDER_IPMP(ill))
		grifindex = ipmp_ill_get_ipmp_ifindex(ill);

	mutex_enter(&ipst->ips_ipobs_cb_lock);
	ipst->ips_ipobs_cb_nwalkers++;
	mutex_exit(&ipst->ips_ipobs_cb_lock);
	for (ipobs_cb = list_head(&ipst->ips_ipobs_cb_list); ipobs_cb != NULL;
	    ipobs_cb = list_next(&ipst->ips_ipobs_cb_list, ipobs_cb)) {
		mp2 = allocb(sizeof (ipobs_hook_data_t), BPRI_HI);
		if (mp2 != NULL) {
			ihd = (ipobs_hook_data_t *)mp2->b_rptr;
			if (((ihd->ihd_mp = dupmsg(mp)) == NULL) &&
			    ((ihd->ihd_mp = copymsg(mp)) == NULL)) {
				freemsg(mp2);
				continue;
			}
			ihd->ihd_mp->b_rptr += hlen;
			ihd->ihd_htype = htype;
			ihd->ihd_ipver = ipver;
			ihd->ihd_zsrc = zsrc;
			ihd->ihd_zdst = zdst;
			ihd->ihd_ifindex = ill->ill_phyint->phyint_ifindex;
			ihd->ihd_grifindex = grifindex;
			ihd->ihd_stack = ipst->ips_netstack;
			mp2->b_wptr += sizeof (*ihd);
			ipobs_cb->ipobs_cbfunc(mp2);
		}
	}
	mutex_enter(&ipst->ips_ipobs_cb_lock);
	ipst->ips_ipobs_cb_nwalkers--;
	if (ipst->ips_ipobs_cb_nwalkers == 0)
		cv_broadcast(&ipst->ips_ipobs_cb_cv);
	mutex_exit(&ipst->ips_ipobs_cb_lock);
}

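/*
 * Note on synchronization (editorial summary of the code above and below):
 * ipobs_hook() advertises itself as a walker by bumping
 * ips_ipobs_cb_nwalkers, then traverses the callback list without holding
 * ips_ipobs_cb_lock. Registration, unregistration and teardown wait on
 * ips_ipobs_cb_cv until the walker count drops to zero before mutating
 * the list; the last walker to finish broadcasts on the cv.
 */
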
void
ipobs_register_hook(netstack_t *ns, pfv_t func)
{
	ipobs_cb_t	*cb;
	ip_stack_t	*ipst = ns->netstack_ip;

	cb = kmem_alloc(sizeof (*cb), KM_SLEEP);

	mutex_enter(&ipst->ips_ipobs_cb_lock);
	while (ipst->ips_ipobs_cb_nwalkers != 0)
		cv_wait(&ipst->ips_ipobs_cb_cv, &ipst->ips_ipobs_cb_lock);
	ASSERT(ipst->ips_ipobs_cb_nwalkers == 0);

	cb->ipobs_cbfunc = func;
	list_insert_head(&ipst->ips_ipobs_cb_list, cb);
	ipst->ips_ipobs_enabled = B_TRUE;
	mutex_exit(&ipst->ips_ipobs_cb_lock);
}

void
ipobs_unregister_hook(netstack_t *ns, pfv_t func)
{
	ipobs_cb_t	*curcb;
	ip_stack_t	*ipst = ns->netstack_ip;

	mutex_enter(&ipst->ips_ipobs_cb_lock);
	while (ipst->ips_ipobs_cb_nwalkers != 0)
		cv_wait(&ipst->ips_ipobs_cb_cv, &ipst->ips_ipobs_cb_lock);

	for (curcb = list_head(&ipst->ips_ipobs_cb_list); curcb != NULL;
	    curcb = list_next(&ipst->ips_ipobs_cb_list, curcb)) {
		if (func == curcb->ipobs_cbfunc) {
			list_remove(&ipst->ips_ipobs_cb_list, curcb);
			kmem_free(curcb, sizeof (*curcb));
			break;
		}
	}
	if (list_is_empty(&ipst->ips_ipobs_cb_list))
		ipst->ips_ipobs_enabled = B_FALSE;
	mutex_exit(&ipst->ips_ipobs_cb_lock);
}

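/*
 * Illustrative sketch (not part of this file): a kernel consumer would
 * register an observability callback roughly as follows, where "ns" is
 * the consumer's netstack reference and "my_obs_cb" is a hypothetical
 * name. ipobs_hook() hands the callback ownership of an M_DATA mblk
 * whose b_rptr points at an ipobs_hook_data_t carrying a duplicate of
 * the observed packet in ihd_mp, so the callback must free both.
 *
 *	static void
 *	my_obs_cb(mblk_t *mp)
 *	{
 *		ipobs_hook_data_t *ihd = (ipobs_hook_data_t *)mp->b_rptr;
 *
 *		freemsg(ihd->ihd_mp);
 *		freemsg(mp);
 *	}
 *
 *	ipobs_register_hook(ns, (pfv_t)my_obs_cb);
 *	...
 *	ipobs_unregister_hook(ns, (pfv_t)my_obs_cb);
 */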