/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
#include <sys/modctl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/priv.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/snmpcom.h>
#include <inet/optcom.h>
#include <inet/kstatcom.h>

#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/sctp.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
#include <inet/tun.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>

#include <sys/ethernet.h>
#include <net/if_types.h>
#include <sys/cpuvar.h>

#include <ipp/ipp.h>
#include <ipp/ipp_impl.h>
#include <ipp/ipgpc/ipgpc.h>

#include <sys/multidata.h>
#include <sys/pattr.h>

#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>

/*
 * Values for squeue switch:
 *	IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain
 *	IP_SQUEUE_ENTER: squeue_enter
 *	IP_SQUEUE_FILL: squeue_fill
 */
int ip_squeue_enter = 2;	/* Settable in /etc/system */

squeue_func_t ip_input_proc;
#define	SET_BPREV_FLAG(x)	((mblk_t *)(uintptr_t)(x))

/*
 * Settable in /etc/system
 */
int ip_poll_normal_ms = 100;
int ip_poll_normal_ticks = 0;
int ip_modclose_ackwait_ms = 3000;

/*
 * It would be nice to have these present only in DEBUG systems, but the
 * current design of the global symbol checking logic requires them to be
 * unconditionally present.
 */
uint_t ip_thread_data;			/* TSD key for debug support */
krwlock_t ip_thread_rwlock;
list_t	ip_thread_list;

/*
 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
 */
struct listptr_s {
	mblk_t	*lp_head;	/* pointer to the head of the list */
	mblk_t	*lp_tail;	/* pointer to the tail of the list */
};

typedef struct listptr_s listptr_t;

/*
 * This is used by ip_snmp_get_mib2_ip_route_media and
 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
 */
typedef struct iproutedata_s {
	uint_t		ird_idx;
	listptr_t	ird_route;	/* ipRouteEntryTable */
	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
} iproutedata_t;

/*
 * Cluster specific hooks. These should be NULL when booted as a non-cluster.
 */

/*
 * Hook functions to enable cluster networking.
 * On non-clustered systems these vectors must always be NULL.
 *
 * Hook function to check whether a specified IP address is a shared IP
 * address in the cluster.
 */
int (*cl_inet_isclusterwide)(uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp) = NULL;

/*
 * Hook function to generate a cluster-wide IP fragment identifier.
 */
uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
    uint8_t *laddrp, uint8_t *faddrp) = NULL;
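/*
 * For illustration, a caller consumes such an optional hook by NULL-checking
 * the vector before the indirect call, along these lines (a sketch only;
 * the argument values shown are placeholders):
 *
 *	boolean_t shared = B_FALSE;
 *
 *	if (cl_inet_isclusterwide != NULL) {
 *		shared = ((*cl_inet_isclusterwide)(IPPROTO_TCP,
 *		    AF_INET, (uint8_t *)&ipha->ipha_src) != 0);
 *	}
 */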
/*
 * Synchronization notes:
 *
 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
 * MT level protection given by STREAMS. IP uses a combination of its own
 * internal serialization mechanism and standard Solaris locking techniques.
 * The internal serialization is per phyint (no IPMP) or per IPMP group.
 * This is used to serialize plumbing operations, IPMP operations, certain
 * multicast operations, most set ioctls, igmp/mld timers, etc.
 *
 * Plumbing is a long sequence of operations involving message
 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
 * involved in plumbing operations. A natural model is to serialize these
 * ioctls one per ill. For example, plumbing of hme0 and qfe0 can go on in
 * parallel without any interference. But various set ioctls on hme0 are best
 * serialized. However, if the system uses IPMP, the operations are easier if
 * they are serialized on a per IPMP group basis since IPMP operations
 * happen across ills of a group. Thus the lowest common denominator is to
 * serialize most set ioctls, multicast join/leave operations, IPMP
 * operations, igmp/mld timer operations, and processing of DLPI control
 * messages received from drivers on a per IPMP group basis. If the system
 * does not employ IPMP the serialization is on a per phyint basis. This
 * serialization is provided by the ipsq_t and primitives operating on this.
 * Details can be found in ip_if.c above the core primitives operating on
 * ipsq_t.
 *
 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
 * In addition, ipifs and ills referenced by the ire are also indirectly
 * refheld. Thus no ipif or ill can vanish, nor can critical parameters like
 * the ipif's address or netmask change, as long as an ipif is refheld
 * directly or indirectly. For example, an SIOCSLIFADDR ioctl that changes
 * the address of an ipif has to go through the ipsq_t. This ensures that
 * only one such exclusive operation proceeds at any time on the ipif. It
 * then deletes all ires associated with this ipif, and waits for all
 * refcnts associated with this ipif to come down to zero. The address is
 * changed only after the ipif has been quiesced. Then the ipif is brought
 * up again. More details are described above the comment in
 * ip_sioctl_flags.
 *
 * Packet processing is based mostly on IREs and is fully multi-threaded
 * using standard Solaris MT techniques.
 *
 * There are explicit locks in IP to handle:
 * - The ip_g_head list maintained by mi_open_link() and friends.
 *
 * - The reassembly data structures (one lock per hash bucket)
 *
 * - conn_lock is meant to protect conn_t fields. The fields actually
 *   protected by conn_lock are documented in the conn_t definition.
 *
 * - ire_lock to protect some of the fields of the ire, IRE tables
 *   (one lock per hash bucket). Refer to ip_ire.c for details.
 *
 * - ndp_g_lock and nce_lock for protecting NCEs.
 *
 * - ill_lock protects fields of the ill and ipif. Details in ip.h
 *
 * - ill_g_lock: This is a global reader/writer lock. Protects the following
 *	* The AVL tree based global multi list of all ills.
 *	* The linked list of all ipifs of an ill
 *	* The <ill-ipsq> mapping
 *	* The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next
 *	* The illgroup list threaded by ill_group_next.
 *	* <ill-phyint> association
 *   Insertion/deletion of an ill in the system, insertion/deletion of an
 *   ipif into an ill, changing the <ill-ipsq> mapping of an ill,
 *   insertion/deletion of an ill into the illgrp list, and changing the
 *   <ill-phyint> assoc of an ill all have to hold the ill_g_lock as writer
 *   for the actual duration of the insertion/deletion/change. More details
 *   about the <ill-ipsq> mapping may be found in the IPMP section.
 *
 * - ill_lock: This is a per ill mutex.
 *   It protects some members of the ill and is documented below.
 *   It also protects the <ill-ipsq> mapping
 *   It also protects the illgroup list threaded by ill_group_next.
 *   It also protects the <ill-phyint> assoc.
 *   It also protects the list of ipifs hanging off the ill.
 *
 * - ipsq_lock: This is a per ipsq_t mutex lock.
 *   This protects all the other members of the ipsq struct except
 *   ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock
 *
 * - illgrp_lock: This is a per ill_group mutex lock.
 *   The only thing it protects is the illgrp_ill_schednext member of
 *   ill_group, which dictates which is the next ill in an ill_group that is
 *   to be chosen for sending outgoing packets, through creation of an
 *   IRE_CACHE that references this ill.
 *
 * - phyint_lock: This is a per phyint mutex lock. Protects just the
 *   phyint_flags
 *
 * - ip_g_nd_lock: This is a global reader/writer lock.
 *   Any call to nd_load to load a new parameter to the ND table must hold
 *   the lock as writer. ND_GET/ND_SET routines that read the ND table hold
 *   the lock as reader.
 *
 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP
 *   addresses. This lock is held in ipif_up_done so that the ipif is marked
 *   IPIF_UP and the uniqueness check is done atomically.
 *
 * - ipsec_capab_ills_lock: This readers/writer lock protects the global
 *   lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
 *   as a writer when adding or deleting elements from these lists, and
 *   as a reader when walking these lists to send a SADB update to the
 *   IPsec capable ills.
 *
 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
 *   group list linked by ill_usesrc_grp_next. It also protects the
 *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
 *   group is being added or deleted. This lock is taken as a reader when
 *   walking the list/group (e.g. to get the number of members in a usesrc
 *   group). Note that it is only necessary to take this lock if the
 *   ill_usesrc_grp_next field is changing state, i.e. from NULL to non-NULL
 *   or vice-versa. For example, it is not necessary to take this lock in
 *   the initial portion of ip_sioctl_slifusesrc or at all in
 *   ip_sioctl_groupname and ip_sioctl_flags, since these operations are
 *   executed exclusively and that ensures that the "usesrc group state"
 *   cannot change. The "usesrc group state" change can happen only in the
 *   latter part of ip_sioctl_slifusesrc and in ill_delete.
 *
 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations:
 *
 * To change the <ill-phyint> association, the ill_g_lock must be held
 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
 * must be held.
 *
 * To change the <ill-ipsq> association, the ill_g_lock must be held as
 * writer and the ill_lock of the ill in question must be held.
 *
 * To change the <ill-illgroup> association, the ill_g_lock must be held as
 * writer and the ill_lock of the ill in question must be held.
 *
 * To add or delete an ipif from the list of ipifs hanging off the ill,
 * ill_g_lock (writer) and ill_lock must be held and the thread must be
 * a writer on the associated ipsq.
 *
 * To add or delete an ill to the system, the ill_g_lock must be held as
 * writer and the thread must be a writer on the associated ipsq.
 *
 * To add or delete an ilm to an ill, the ill_lock must be held and the
 * thread must be a writer on the associated ipsq.
 */
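/*
 * For illustration, the <ill-ipsq> rule above reduces to the following
 * sketch (the per-stack spelling of ill_g_lock is an assumption here; see
 * ip_stack_t for the authoritative layout):
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
 *	mutex_enter(&ill->ill_lock);
 *	... update the <ill-ipsq> mapping of this ill ...
 *	mutex_exit(&ill->ill_lock);
 *	rw_exit(&ipst->ips_ill_g_lock);
 */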
/*
 * Lock hierarchy
 *
 * Some lock hierarchy scenarios are listed below.
 *
 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
 * ill_g_lock -> illgrp_lock -> ill_lock
 * ill_g_lock -> ill_lock(s) -> phyint_lock
 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
 * ill_g_lock -> ip_addr_avail_lock
 * conn_lock -> irb_lock -> ill_lock -> ire_lock
 * ill_g_lock -> ip_g_nd_lock
 *
 * When more than one ill lock needs to be held, all ill lock addresses
 * are sorted on address and locked starting from the highest addressed
 * lock downward.
 *
 * IPsec scenarios
 *
 * ipsa_lock -> ill_g_lock -> ill_lock
 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
 * ipsec_capab_ills_lock -> ipsa_lock
 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
 *
 * Trusted Solaris scenarios
 *
 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
 * igsa_lock -> gcdb_lock
 * gcgrp_rwlock -> ire_lock
 * gcgrp_rwlock -> gcdb_lock
 *
 * Routing/forwarding table locking notes:
 *
 * Lock acquisition order: Radix tree lock, irb_lock.
 * Requirements:
 * i.   Walker must not hold any locks during the walker callback.
 * ii.  Walker must not see a truncated tree during the walk because of any
 *      node deletion.
 * iii. Existing code assumes ire_bucket is valid if it is non-null and is
 *      used in many places in the code to walk the irb list. Thus even if
 *      all the ires in a bucket have been deleted, we still can't free the
 *      radix node until the ires have actually been inactive'd (freed).
 *
 * Tree traversal - Need to hold the global tree lock in read mode.
 * Before dropping the global tree lock, need to increment the ire_refcnt
 * to ensure that the radix node can't be deleted.
 *
 * Tree add - Need to hold the global tree lock in write mode to add a
 * radix node. To prevent the node from being deleted, increment the
 * irb_refcnt after the node is added to the tree. The ire itself is
 * added later while holding the irb_lock, but not the tree lock.
 *
 * Tree delete - Need to hold the global tree lock and irb_lock in write
 * mode. All associated ires must be inactive (i.e. freed), and irb_refcnt
 * must be zero.
 *
 * Walker - Increment irb_refcnt before calling the walker callback. Hold
 * the global tree lock (read mode) for traversal.
 */
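/*
 * For illustration, the "sort by address" rule above for taking two ill
 * locks at once might look like this sketch (the helper name is
 * hypothetical; the real code open-codes the comparison where needed):
 *
 *	static void
 *	ill_lock_pair(ill_t *a, ill_t *b)
 *	{
 *		if (&a->ill_lock > &b->ill_lock) {
 *			mutex_enter(&a->ill_lock);
 *			mutex_enter(&b->ill_lock);
 *		} else {
 *			mutex_enter(&b->ill_lock);
 *			mutex_enter(&a->ill_lock);
 *		}
 *	}
 */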
/*
 * IPsec notes:
 *
 * IP interacts with the IPsec code (AH/ESP) by tagging an M_CTL message
 * in front of the actual packet. For outbound datagrams, the M_CTL
 * contains an ipsec_out_t (defined in ipsec_info.h), which has the
 * information used by the IPsec code for applying the right level of
 * protection. The information initialized by IP in the ipsec_out_t
 * is determined by the per-socket policy or global policy in the system.
 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in
 * ipsec_info.h) which starts out with nothing in it. It gets filled
 * with the right information if it goes through the AH/ESP code, which
 * happens if the incoming packet is secure. The information initialized
 * by AH/ESP is later used by IP (during fanouts to ULP) to see whether
 * the policy requirements of per-socket policy or global policy
 * are met or not.
 *
 * If there is both per-socket policy (set using setsockopt) and there
 * is also a global policy match for the 5-tuple of the socket,
 * ipsec_override_policy() makes the decision of which one to use.
 *
 * For fully connected sockets, i.e. where dst and src [addr, port] are
 * known, conn_policy_cached is set, indicating that policy has been cached.
 * conn_in_enforce_policy may or may not be set depending on whether
 * there is a global policy match or per-socket policy match.
 * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
 * Once the right policy is set on the conn_t, policy cannot change for
 * this socket. This makes life simpler for TCP (UDP?) where
 * re-transmissions go out with the same policy. For symmetry, policy
 * is cached for fully connected UDP sockets also. Thus if policy is cached,
 * it also implies that policy is latched, i.e. policy cannot change
 * on these sockets. As we have the right policy on the conn, we don't
 * have to look up global policy for every outbound and inbound datagram,
 * which serves as an optimization. Note that a global policy change
 * does not affect fully connected sockets if they have policy. If fully
 * connected sockets did not have any policy associated with them, a global
 * policy change may affect them.
 */
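/*
 * For illustration, consumers of this convention typically peel off the
 * M_CTL with the EXTRACT_PKT_MP() macro from ipsec_info.h, as
 * icmp_frag_needed() does below; a minimal sketch:
 *
 *	mblk_t		*first_mp;
 *	boolean_t	mctl_present;
 *
 *	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
 *
 * Afterwards first_mp points at the whole chain (M_CTL first, if one was
 * present), mp points at the IP datagram itself, and mctl_present records
 * whether an ipsec_in_t/ipsec_out_t M_CTL was found.
 */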
/*
 * IP Flow control notes:
 *
 * Non-TCP streams are flow controlled by IP. On the send side, if the
 * packet cannot be sent down to the driver by IP, because of a canput
 * failure, IP does a putq on the conn_wq. This will cause ip_wsrv to run
 * on the conn_wq. ip_wsrv, in turn, inserts the conn in a list of conns
 * that need to be drained when the flow-control condition subsides.
 * Ultimately STREAMS backenables ip_wsrv on the IP module, which in turn
 * does a qenable of the conn_wq of the first conn in the list of conns to
 * be drained. ip_wsrv on this conn drains the queued messages, and removes
 * the conn from the drain list if all messages were drained. It also
 * qenables the next conn in the drain list to continue the drain process.
 *
 * In reality the drain list is not a single list, but a configurable number
 * of lists. ip_wsrv on the IP module qenables the first conn in each
 * list. If the ip_wsrv of the next qenabled conn does not run, because the
 * stream closes, ip_close takes responsibility to qenable the next conn in
 * the drain list. The directly called ip_wput path always does a putq if
 * it cannot putnext. Thus synchronization problems are handled between
 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
 * functions that manipulate this drain list. Furthermore, conn_drain_insert
 * is called only from ip_wsrv, and there can be only one instance of
 * ip_wsrv running on a queue at any time. conn_drain_tail can be
 * simultaneously called from both ip_wsrv and ip_close.
 */
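/*
 * For illustration, the send-side pattern described above reduces to the
 * classic STREAMS idiom (a sketch only; the real path handles many more
 * cases):
 *
 *	if (canputnext(ill->ill_wq)) {
 *		putnext(ill->ill_wq, mp);
 *	} else {
 *		(void) putq(connp->conn_wq, mp);
 *	}
 *
 * The queued message stays on conn_wq until the stream is backenabled and
 * ip_wsrv drains it.
 */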
/*
 * IPQoS notes:
 *
 * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
 * and IPQoS modules. IPPF includes hooks in IP at different control points
 * (callout positions) which direct packets to IPQoS modules for policy
 * processing. Policies, if present, are global.
 *
 * The callout positions are located in the following paths:
 *	o local_in (packets destined for this host)
 *	o local_out (packets originating from this host)
 *	o fwd_in (packets forwarded by this m/c - inbound)
 *	o fwd_out (packets forwarded by this m/c - outbound)
 * Hooks at these callout points can be enabled/disabled using the ndd
 * variable ip_policy_mask (a bit mask with the 4 LSB indicating the callout
 * positions). By default all the callout positions are enabled.
 *
 * Outbound (local_out)
 * Hooks are placed in ip_wput_ire and ipsec_out_process.
 *
 * Inbound (local_in)
 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
 * the TCP and UDP fanout routines.
 *
 * Forwarding (in and out)
 * Hooks are placed in ip_rput_forward.
 *
 * IP Policy Framework processing (IPPF processing)
 * Policy processing for a packet is initiated by ip_process, which
 * ascertains that the classifier (ipgpc) is loaded and configured, failing
 * which the packet resumes normal processing in IP. If the classifier is
 * present, the packet is acted upon by one or more IPQoS modules (action
 * instances), per filters configured in ipgpc, and resumes normal IP
 * processing thereafter. An action instance can drop a packet in the course
 * of its processing.
 *
 * A boolean variable, ip_policy, is used in all the fanout routines that
 * can invoke ip_process for a packet. This variable indicates if the packet
 * should be sent for policy processing. The variable is set to B_TRUE by
 * default, i.e. when the routines are invoked in the normal ip processing
 * path for a packet. The two exceptions are ip_wput_local and
 * icmp_inbound_error_fanout; ip_policy is set to B_FALSE for all the
 * routines called in these two functions because, in the former case, we
 * don't process loopback traffic currently, while in the latter, the
 * packets have already been processed in icmp_inbound.
 */
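/*
 * For illustration, gating a callout position on ip_policy_mask might look
 * like the sketch below. The IPP_POS_* bit names are hypothetical (invented
 * for this example), and the sense of the test assumes that a set bit
 * disables a position, consistent with the default of 0 meaning all
 * positions enabled:
 *
 *	#define	IPP_POS_LOCAL_IN	0x1
 *	#define	IPP_POS_LOCAL_OUT	0x2
 *	#define	IPP_POS_FWD_IN		0x4
 *	#define	IPP_POS_FWD_OUT		0x8
 *
 *	if (ip_policy && !(ip_policy_mask & IPP_POS_LOCAL_IN))
 *		... invoke ip_process(IPP_LOCAL_IN, ...) ...
 */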
/*
 * Zones notes:
 *
 * The partitioning rules for networking are as follows:
 * 1) Packets coming from a zone must have a source address belonging to
 *    that zone.
 * 2) Packets coming from a zone can only be sent on a physical interface on
 *    which the zone has an IP address.
 * 3) Between two zones on the same machine, packet delivery is only allowed
 *    if there's a matching route for the destination and zone in the
 *    forwarding table.
 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
 *    different zones can bind to the same port with the wildcard address
 *    (INADDR_ANY).
 *
 * The granularity of interface partitioning is at the logical interface
 * level. Therefore, every zone has its own IP addresses, and incoming
 * packets can be attributed to a zone unambiguously. A logical interface is
 * placed into a zone using the SIOCSLIFZONE ioctl; this sets the
 * ipif_zoneid field in the ipif_t structure. Rule (1) is implemented by
 * modifying the source address selection algorithm so that the list of
 * eligible addresses is filtered based on the sending process zone.
 *
 * The Internet Routing Entries (IREs) are either exclusive to a zone or
 * shared across all zones, depending on their type. Here is the break-up:
 *
 * IRE type				Shared/exclusive
 * --------				----------------
 * IRE_BROADCAST			Exclusive
 * IRE_DEFAULT (default routes)		Shared (*)
 * IRE_LOCAL				Exclusive (x)
 * IRE_LOOPBACK				Exclusive
 * IRE_PREFIX (net routes)		Shared (*)
 * IRE_CACHE				Exclusive
 * IRE_IF_NORESOLVER (interface routes)	Exclusive
 * IRE_IF_RESOLVER (interface routes)	Exclusive
 * IRE_HOST (host routes)		Shared (*)
 *
 * (*) A zone can only use a default or off-subnet route if the gateway is
 * directly reachable from the zone, that is, if the gateway's address
 * matches one of the zone's logical interfaces.
 *
 * (x) IRE_LOCAL entries are handled a bit differently: for all other
 * entries in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used
 * as the source address when sending packets using the IRE; for IRE_LOCAL,
 * ire_src_addr is the IP address of the zone itself (the destination).
 * Since IRE_LOCAL is used for communication between zones, ip_wput_ire has
 * special logic to set the right source address when sending using an
 * IRE_LOCAL.
 *
 * Furthermore, when ip_restrict_interzone_loopback is set (the default),
 * ire_cache_lookup restricts loopback using an IRE_LOCAL between zones to
 * the case when L2 would have conceptually looped the packet back, i.e. the
 * loopback which is required since neither Ethernet drivers nor Ethernet
 * hardware loops packets back. This is the case when the normal routes
 * (ignoring IREs with different zoneids) would send out the packet on the
 * same ill (or ill group) as the ill with which the IRE_LOCAL is
 * associated.
 *
 * Multiple zones can share a common broadcast address; typically all zones
 * share the 255.255.255.255 address. Incoming as well as locally originated
 * broadcast packets must be dispatched to all the zones on the broadcast
 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
 * since some zones may not be on the 10.16.72/24 network. To handle this,
 * each zone has its own set of IRE_BROADCAST entries; then, broadcast
 * packets are sent to every zone that has an IRE_BROADCAST entry for the
 * destination address on the input ill, see conn_wantpacket().
 *
 * Applications in different zones can join the same multicast group
 * address. For IPv4, group memberships are per-logical interface, so
 * they're already inherently part of a zone. For IPv6, group memberships
 * are per-physical interface, so we distinguish IPv6 group memberships
 * based on group address, interface and zoneid. In both cases, received
 * multicast packets are sent to every zone for which a group membership
 * entry exists. On IPv6 we need to check that the target zone still has an
 * address on the receiving physical interface; it could have been removed
 * since the application issued the IPV6_JOIN_GROUP.
 */
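/*
 * For illustration, attributing an inbound unicast packet to a zone
 * reduces to finding the ipif that owns the destination address and reading
 * its zoneid. This is a sketch only (the helper name is hypothetical; the
 * real lookup goes through the IRE tables and handles shared addresses):
 *
 *	zoneid_t
 *	zone_for_dst(ill_t *ill, ipaddr_t dst)
 *	{
 *		ipif_t *ipif;
 *
 *		for (ipif = ill->ill_ipif; ipif != NULL;
 *		    ipif = ipif->ipif_next) {
 *			if (ipif->ipif_lcl_addr == dst)
 *				return (ipif->ipif_zoneid);
 *		}
 *		return (ALL_ZONES);
 *	}
 */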
/*
 * Squeue Fanout flags:
 *	0: No fanout.
 *	1: Fanout across all squeues
 */
boolean_t	ip_squeue_fanout = 0;

/*
 * Maximum dups allowed per packet.
 */
uint_t ip_max_frag_dups = 10;

#define	IS_SIMPLE_IPH(ipha)						\
	((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)

/* RFC 1122 Conformance */
#define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER

#define	ILL_MAX_NAMELEN			LIFNAMSIZ
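/*
 * For illustration, IS_SIMPLE_IPH above is the usual fast-path test for "no
 * IP options present", since a simple header encodes version 4 and a 5-word
 * (20 byte) header length in a single byte:
 *
 *	ipha_t *ipha = (ipha_t *)mp->b_rptr;
 *
 *	if (IS_SIMPLE_IPH(ipha)) {
 *		... 20-byte header, no options: fast path ...
 *	} else {
 *		... options present: take the option-parsing path ...
 *	}
 */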
static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);

static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp, boolean_t isv6);
static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t,
		    ipha_t **);

static void	icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
		    ip_stack_t *);
static void	icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
		    uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static ipaddr_t	icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
		    mblk_t *, int, ip_stack_t *);
static void	icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
		    icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
		    ill_t *, zoneid_t);
static void	icmp_options_update(ipha_t *);
static void	icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
		    ip_stack_t *);
static void	icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
		    zoneid_t zoneid, ip_stack_t *);
static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
static void	icmp_redirect(ill_t *, mblk_t *);
static void	icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
		    ip_stack_t *);

static void	ip_arp_news(queue_t *, mblk_t *);
static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *,
		    ip_stack_t *);
mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
char		*ip_dot_addr(ipaddr_t, char *);
mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
int		ip_close(queue_t *, int);
static char	*ip_dot_saddr(uchar_t *, char *);
static void	ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *,
		    uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static void	ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
		    boolean_t, boolean_t, zoneid_t);
static void	ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *,
		    uint32_t, boolean_t, uint_t, boolean_t, boolean_t,
		    ill_t *, zoneid_t);
static void	ip_lrput(queue_t *, mblk_t *);
ipaddr_t	ip_net_mask(ipaddr_t);
void		ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
		    ip_stack_t *);
static void	ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
		    conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
char		*ip_nv_lookup(nv_t *, int);
static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *);
static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t,
		    ipndp_t *, size_t);
static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
void		ip_rput(queue_t *, mblk_t *);
static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);
void		ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
static int	ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
		    ip_stack_t *);
static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
		    ire_t *, ip_stack_t *);
static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
		    mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
		    ip_stack_t *);
static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
		    uint16_t *);
int		ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
		    ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static boolean_t ip_source_routed(ipha_t *, ip_stack_t *);
static boolean_t ip_source_route_included(ipha_t *);
static void	ip_trash_ire_reclaim_stack(ip_stack_t *);

static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
		    zoneid_t, ip_stack_t *);
static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *);
static void	ip_wput_local_options(ipha_t *, ip_stack_t *);
static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
		    zoneid_t, ip_stack_t *);

static void	conn_drain_init(ip_stack_t *);
static void	conn_drain_fini(ip_stack_t *);
static void	conn_drain_tail(conn_t *connp, boolean_t closing);

static void	conn_walk_drain(ip_stack_t *);
static void	conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
		    zoneid_t);

static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
static void	ip_stack_fini(netstackid_t stackid, void *arg);

static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
		    zoneid_t);
static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);

static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t,
		    cred_t *);

static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
    conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
static void	ip_multirt_bad_mtu(ire_t *, uint32_t);

static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
		    caddr_t, cred_t *);
extern int	ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
extern int	ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
		    cred_t *);
static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
		    cred_t *);
static int	ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t,
		    cred_t *);
static squeue_func_t ip_squeue_switch(int);

static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
static void	ip_kstat_fini(netstackid_t, kstat_t *);
static int	ip_kstat_update(kstat_t *kp, int rw);
static void	*icmp_kstat_init(netstackid_t);
static void	icmp_kstat_fini(netstackid_t, kstat_t *);
static int	icmp_kstat_update(kstat_t *kp, int rw);
static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
static void	ip_kstat2_fini(netstackid_t, kstat_t *);

static int	ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *);

static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
		    ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);

static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
		    ipha_t *, ill_t *, boolean_t);

ipaddr_t	ip_g_all_ones = IP_HOST_MASK;

/* How long, in seconds, we allow frags to hang around. */
#define	IP_FRAG_TIMEOUT	60

/*
 * Threshold which determines whether MDT should be used when
 * generating IP fragments; payload size must be greater than
 * this threshold for MDT to take place.
 */
#define	IP_WPUT_FRAG_MDT_MIN	32768

/* Settable in /etc/system only */
int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;

static long ip_rput_pullups;
int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */

vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^18-1 */
vmem_t *ip_minor_arena_la; /* for minor nos. from 2^18 thru 2^32-1 */

int	ip_debug;

#ifdef DEBUG
uint32_t ipsechw_debug = 0;
#endif

/*
 * Multirouting/CGTP stuff
 */
int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */

/*
 * XXX The following really should only be in a header. Would need more
 * header and .c clean up first.
 */
extern optdb_obj_t	ip_opt_obj;

ulong_t ip_squeue_enter_unbound = 0;
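/*
 * The parameters in the tables below are exposed through the Named Dispatch
 * interface; from user level they can be inspected and changed with ndd(1M),
 * for example (values shown are examples only; see the min/max columns
 * below for the legal ranges):
 *
 *	ndd -get /dev/ip ip_forwarding
 *	ndd -set /dev/ip ip_def_ttl 64
 */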
/*
 * Named Dispatch Parameter Table.
 * All of these are alterable, within the min/max values given, at run time.
 */
static ipparam_t	lcl_param_arr[] = {
	/* min	max	value	name */
	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
	{  0,	1,	0,	"ip_respond_to_timestamp"},
	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
	{  0,	1,	1,	"ip_send_redirects"},
	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
	{  0,	10,	0,	"ip_mrtdebug"},
	{  5000, 999999999,	60000, "ip_ire_timer_interval" },
	{  60000, 999999999,	1200000, "ip_ire_arp_interval" },
	{  60000, 999999999,	60000, "ip_ire_redirect_interval" },
	{  1,	255,	255,	"ip_def_ttl" },
	{  0,	1,	0,	"ip_forward_src_routed"},
	{  0,	256,	32,	"ip_wroff_extra" },
	{  5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
	{  0,	1,	1,	"ip_path_mtu_discovery" },
	{  0,	240,	30,	"ip_ignore_delete_time" },
	{  0,	1,	0,	"ip_ignore_redirect" },
	{  0,	1,	1,	"ip_output_queue" },
	{  1,	254,	1,	"ip_broadcast_ttl" },
	{  0,	99999,	100,	"ip_icmp_err_interval" },
	{  1,	99999,	10,	"ip_icmp_err_burst" },
	{  0,	999999999, 1000000, "ip_reass_queue_bytes" },
	{  0,	1,	0,	"ip_strict_dst_multihoming" },
	{  1,	MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
	{  0,	1,	0,	"ipsec_override_persocket_policy" },
	{  0,	1,	1,	"icmp_accept_clear_messages" },
	{  0,	1,	1,	"igmp_accept_clear_messages" },
	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
				"ip_ndp_delay_first_probe_time"},
	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
				"ip_ndp_max_unicast_solicit"},
	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
	{  0,	1,	0,	"ip6_forward_src_routed"},
	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
	{  0,	1,	1,	"ip6_send_redirects"},
	{  0,	1,	0,	"ip6_ignore_redirect" },
	{  0,	1,	0,	"ip6_strict_dst_multihoming" },

	{  1,	8,	3,	"ip_ire_reclaim_fraction" },

	{  0,	999999,	1000,	"ipsec_policy_log_interval" },

	{  0,	1,	1,	"pim_accept_clear_messages" },
	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
	{  0,	15,	0,	"ip_policy_mask" },
	{  1000, 60000, 1000,	"ip_multirt_resolution_interval" },
	{  0,	255,	1,	"ip_multirt_ttl" },
	{  0,	1,	1,	"ip_multidata_outbound" },
	{  0,	3600000, 300000, "ip_ndp_defense_interval" },
	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
	{  0,	1000,	1,	"ip_max_temp_defend" },
	{  0,	1000,	3,	"ip_max_defend" },
	{  0,	999999,	30,	"ip_defend_interval" },
	{  0,	3600000, 300000, "ip_dup_recovery" },
	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
	{  0,	1,	1,	"ip_lso_outbound" },
	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
#ifdef DEBUG
	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
#else
	{  0,	0,	0,	"" },
#endif
};
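/*
 * Each row above is a (min, max, value, name) tuple per the usual ipparam_t
 * layout (see ip.h for the authoritative field names). For example,
 * { 1, 255, 255, "ip_def_ttl" } means ip_def_ttl defaults to 255 and may be
 * set, via ndd or /etc/system, to any value in the range [1, 255].
 */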
/*
 * Extended NDP table
 * The addresses for the first two are filled in to be ips_ip_g_forward
 * and ips_ipv6_forward at init time.
 */
static ipndp_t	lcl_ndp_arr[] = {
	/* getf			setf		data		name */
#define	IPNDP_IP_FORWARDING_OFFSET	0
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip_forwarding" },
#define	IPNDP_IP6_FORWARDING_OFFSET	1
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip6_forwarding" },
	{ ip_ill_report,	NULL,		NULL,
	    "ip_ill_status" },
	{ ip_ipif_report,	NULL,		NULL,
	    "ip_ipif_status" },
	{ ip_conn_report,	NULL,		NULL,
	    "ip_conn_status" },
	{ nd_get_long,		nd_set_long,	(caddr_t)&ip_rput_pullups,
	    "ip_rput_pullups" },
	{ ip_srcid_report,	NULL,		NULL,
	    "ip_srcid_status" },
	{ ip_param_generic_get, ip_squeue_profile_set,
	    (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
	{ ip_param_generic_get, ip_squeue_bind_set,
	    (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
	{ ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
#define	IPNDP_CGTP_FILTER_OFFSET	11
	{ ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
	    "ip_cgtp_filter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" },
#define	IPNDP_IPMP_HOOK_OFFSET	13
	{ ip_param_generic_get, ipmp_hook_emulation_set, NULL,
	    "ipmp_hook_emulation" },
	{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
	    "ip_debug" },
};

/*
 * Table of IP ioctls encoding the various properties of the ioctl and
 * indexed based on the last byte of the ioctl command. Occasionally there
 * is a clash, and there is more than one ioctl with the same last byte.
 * In such a case one ioctl is encoded in the ndx table and the remaining
 * ioctls are encoded in the misc table. An entry in the ndx table is
 * retrieved by indexing on the last byte of the ioctl command and comparing
 * the ioctl command with the value in the ndx table. In the event of a
 * mismatch the misc table is then searched sequentially for the desired
 * ioctl command.
 *
 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 */
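/*
 * For illustration, the two-level lookup described above amounts to the
 * following sketch (the helper name is hypothetical, and the field names
 * follow the usual ip_ioctl_cmd_t layout in ip.h):
 *
 *	ip_ioctl_cmd_t *
 *	ip_ioctl_lookup(int cmd)
 *	{
 *		int ndx = cmd & 0xff;
 *		int i;
 *
 *		if (ndx < ip_ndx_ioctl_count &&
 *		    ip_ndx_ioctl_table[ndx].ipi_cmd == cmd)
 *			return (&ip_ndx_ioctl_table[ndx]);
 *		for (i = 0; i < ip_misc_ioctl_count; i++) {
 *			if (ip_misc_ioctl_table[i].ipi_cmd == cmd)
 *				return (&ip_misc_ioctl_table[i]);
 *		}
 *		return (NULL);
 *	}
 */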
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
	    MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
	    MISC_CMD, ip_siocdelrt, NULL },

	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
	    IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_addr, NULL },

	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
	    IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
	    IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_dstaddr, NULL },

	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
	    IPI_MODOK | IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_flags, NULL },

	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
	    MISC_CMD, ip_sioctl_get_ifconf, NULL },

	/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
	    IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
	    IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
	    IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
	    IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
	    IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
	    IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
	    IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
	    ARP_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
	    ARP_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
	    ARP_CMD, ip_sioctl_arp, NULL },

	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
	    MISC_CMD, if_unitsel, if_unitsel_restart },

	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
	    IPI_PRIV | IPI_WR | IPI_MODOK,
	    IF_CMD, ip_sioctl_sifname, NULL },

	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
	    MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    IF_CMD, ip_sioctl_muxid, NULL },

	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
	    IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    IF_CMD, ip_sioctl_slifindex, NULL },
	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
	    MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_removeif,
	    ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_addif, NULL },
#define	SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_MODOK | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_flags, NULL },

	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
	    ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
	    LIF_CMD, ip_sioctl_slifname,
	    ip_sioctl_slifname_restart },
	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
	    MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_lifindex, 0 },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_slifindex, 0 },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_lnkinfo, NULL },

	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
	    LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
	    LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
	    LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
	    MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
	    MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
	    MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
	    TUN_CMD, ip_sioctl_tunparam, NULL },
	/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
	    IPI_PRIV | IPI_WR,
	    TUN_CMD, ip_sioctl_tunparam, NULL },

	/* IPsec ioctls handled in ip_sioctl_copyin_setup itself */
	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },

	/* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_groupname, NULL },
	/* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_oindex, NULL },

	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_slifoindex, NULL },

	/* These are handled in ip_sioctl_copyin_setup itself */
	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
	    MISC_CMD, NULL, NULL },
	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
	    MISC_CMD, NULL, NULL },
	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },

	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
	    ip_sioctl_get_lifconf, NULL },

	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
	    XARP_CMD, ip_sioctl_arp, NULL },
	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
	    XARP_CMD, ip_sioctl_arp, NULL },
	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
	    XARP_CMD, ip_sioctl_arp, NULL },

	/* SIOCPOPSOCKFS is not handled by IP */
	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },

	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
	    IPI_GET_CMD | IPI_REPL,
	    LIF_CMD, ip_sioctl_get_lifzone, NULL },
	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR | IPI_REPL,
	    LIF_CMD, ip_sioctl_slifzone,
	    ip_sioctl_slifzone_restart },
	/* 172-174 are SCTP ioctls and not handled by IP */
	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
	    IPI_GET_CMD, LIF_CMD,
	    ip_sioctl_get_lifusesrc, 0 },
	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
	    IPI_PRIV | IPI_WR,
	    LIF_CMD, ip_sioctl_slifusesrc,
	    NULL },
	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
	    ip_sioctl_get_lifsrcof, NULL },
	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
	    MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
	    MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
	    MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
	    MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
	    ip_sioctl_set_ipmpfailback, NULL },
	/* SIOCSENABLESDP is handled by SDP */
	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
};

int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);

ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
	    IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
	    TUN_CMD, ip_sioctl_tunparam, NULL },
	{ I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ IP_IOCTL, 0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl},
mrt_ioctl},
1316 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
1317 MISC_CMD, mrt_ioctl},
1318 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
1319 MISC_CMD, mrt_ioctl}
1320 }; 1321
1322 int ip_misc_ioctl_count =
1323 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1324
1325 int conn_drain_nthreads; /* Number of drainers reqd. */
1326 /* Settable in /etc/system */
1327 /* Defined in ip_ire.c */
1328 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1329 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1330 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1331
1332 static nv_t ire_nv_arr[] = {
1333 { IRE_BROADCAST, "BROADCAST" },
1334 { IRE_LOCAL, "LOCAL" },
1335 { IRE_LOOPBACK, "LOOPBACK" },
1336 { IRE_CACHE, "CACHE" },
1337 { IRE_DEFAULT, "DEFAULT" },
1338 { IRE_PREFIX, "PREFIX" },
1339 { IRE_IF_NORESOLVER, "IF_NORESOL" },
1340 { IRE_IF_RESOLVER, "IF_RESOLV" },
1341 { IRE_HOST, "HOST" },
1342 { 0 }
1343 }; 1344
1345 nv_t *ire_nv_tbl = ire_nv_arr; 1346
1347 /* Simple ICMP IP Header Template */
1348 static ipha_t icmp_ipha = {
1349 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1350 }; 1351
1352 struct module_info ip_mod_info = {
1353 IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
1354 }; 1355
1356 /*
1357 * Duplicate static symbols within a module confuse mdb, so we avoid the
1358 * problem by making the symbols here distinct from those in udp.c.
1359 */ 1360
1361 /*
1362 * Entry points for IP as a device and as a module.
1363 * FIXME: down the road we might want a separate module and driver qinit.
1364 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
1365 */
1366 static struct qinit iprinitv4 = {
1367 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
1368 &ip_mod_info
1369 }; 1370
1371 struct qinit iprinitv6 = {
1372 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
1373 &ip_mod_info
1374 }; 1375
1376 static struct qinit ipwinitv4 = {
1377 (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL,
1378 &ip_mod_info
1379 }; 1380
1381 struct qinit ipwinitv6 = {
1382 (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL,
1383 &ip_mod_info
1384 }; 1385
1386 static struct qinit iplrinit = {
1387 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
1388 &ip_mod_info
1389 }; 1390
1391 static struct qinit iplwinit = {
1392 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
1393 &ip_mod_info
1394 }; 1395
1396 /* For AF_INET aka /dev/ip */
1397 struct streamtab ipinfov4 = {
1398 &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit
1399 }; 1400
1401 /* For AF_INET6 aka /dev/ip6 */
1402 struct streamtab ipinfov6 = {
1403 &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit
1404 }; 1405
1406 #ifdef DEBUG
1407 static boolean_t skip_sctp_cksum = B_FALSE;
1408 #endif 1409
1410 /*
1411 * Prepend the zoneid using an ipsec_out_t for later use by functions like
1412 * ip_rput_v6(), ip_output(), etc. If the message
1413 * block already has an M_CTL at the front of it, then simply set the zoneid
1414 * appropriately.
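 *
 * A minimal usage sketch (illustrative only; mp, zoneid and ipst are
 * whatever the caller already holds):
 *
 *	mp = ip_prepend_zoneid(mp, zoneid, ipst);
 *	if (mp == NULL)
 *		return;		(ipsec_out_t allocation failed; drop)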
1415 */
1416 mblk_t *
1417 ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1418 {
1419 mblk_t *first_mp;
1420 ipsec_out_t *io; 1421
1422 ASSERT(zoneid != ALL_ZONES);
1423 if (mp->b_datap->db_type == M_CTL) {
1424 io = (ipsec_out_t *)mp->b_rptr;
1425 ASSERT(io->ipsec_out_type == IPSEC_OUT);
1426 io->ipsec_out_zoneid = zoneid;
1427 return (mp);
1428 } 1429
1430 first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack);
1431 if (first_mp == NULL)
1432 return (NULL);
1433 io = (ipsec_out_t *)first_mp->b_rptr;
1434 /* This is not a secure packet */
1435 io->ipsec_out_secure = B_FALSE;
1436 io->ipsec_out_zoneid = zoneid;
1437 first_mp->b_cont = mp;
1438 return (first_mp);
1439 } 1440
1441 /*
1442 * Copy an M_CTL-tagged message, preserving reference counts appropriately.
1443 */
1444 mblk_t *
1445 ip_copymsg(mblk_t *mp)
1446 {
1447 mblk_t *nmp;
1448 ipsec_info_t *in; 1449
1450 if (mp->b_datap->db_type != M_CTL)
1451 return (copymsg(mp)); 1452
1453 in = (ipsec_info_t *)mp->b_rptr; 1454
1455 /*
1456 * Note that M_CTL is also used for delivering ICMP error messages
1457 * upstream to transport layers.
1458 */
1459 if (in->ipsec_info_type != IPSEC_OUT &&
1460 in->ipsec_info_type != IPSEC_IN)
1461 return (copymsg(mp)); 1462
1463 nmp = copymsg(mp->b_cont); 1464
1465 if (in->ipsec_info_type == IPSEC_OUT) {
1466 return (ipsec_out_tag(mp, nmp,
1467 ((ipsec_out_t *)in)->ipsec_out_ns));
1468 } else {
1469 return (ipsec_in_tag(mp, nmp,
1470 ((ipsec_in_t *)in)->ipsec_in_ns));
1471 }
1472 } 1473
1474 /* Generate an ICMP fragmentation needed message. */
1475 static void
1476 icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
1477 ip_stack_t *ipst)
1478 {
1479 icmph_t icmph;
1480 mblk_t *first_mp;
1481 boolean_t mctl_present; 1482
1483 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 1484
1485 if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
1486 if (mctl_present)
1487 freeb(first_mp);
1488 return;
1489 } 1490
1491 bzero(&icmph, sizeof (icmph_t));
1492 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1493 icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1494 icmph.icmph_du_mtu = htons((uint16_t)mtu);
1495 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1496 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
1497 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
1498 ipst);
1499 } 1500
1501 /*
1502 * icmp_inbound deals with ICMP messages in the following ways.
1503 *
1504 * 1) It needs to send a reply back and possibly deliver it
1505 * to the "interested" upper clients.
1506 * 2) It needs to send it to the upper clients only.
1507 * 3) It needs to change some values in IP only.
1508 * 4) It needs to change some values in IP and upper layers, e.g. TCP.
1509 *
1510 * We need to accommodate icmp messages coming in clear until we get
1511 * everything secure from the wire. If icmp_accept_clear_messages
1512 * is zero we check with the global policy and act accordingly. If
1513 * it is non-zero, we accept the message without any checks. But
1514 * *this does not mean* that this will be delivered to the upper
1515 * clients. By accepting we might send replies back, change our MTU
1516 * value etc. but delivery to the ULP/clients depends on their policy
1517 * dispositions.
1518 *
1519 * We handle the above 4 cases in the context of IPsec in the
1520 * following way :
1521 *
1522 * 1) Send the reply back in the same way as the request came in.
1523 * If it came in encrypted, it goes out encrypted. If it came in
1524 * clear, it goes out in clear. Thus, this will prevent a chosen
1525 * plaintext attack.
1526 * 2) The client may or may not expect things to come in secure.
1527 * If it comes in secure, the policy constraints are checked
1528 * before delivering it to the upper layers. If it comes in
1529 * clear, ipsec_inbound_accept_clear will decide whether to
1530 * accept this in clear or not. In both the cases, if the returned
1531 * message (IP header + 8 bytes) that caused the icmp message has
1532 * AH/ESP headers, it is sent up to AH/ESP for validation before
1533 * sending up. If there are only 8 bytes of returned message, then
1534 * the upper client will not be notified.
1535 * 3) Check with global policy to see whether it matches the constraints.
1536 * But this will be done only if icmp_accept_clear_messages is
1537 * zero.
1538 * 4) If we need to change both in IP and ULP, then the decision taken
1539 * while affecting the values in IP and while delivering up to TCP
1540 * should be the same.
1541 *
1542 * There are two cases.
1543 *
1544 * a) If we reject data at the IP layer (ipsec_check_global_policy()
1545 * failed), we will not deliver it to the ULP, even though they
1546 * are *willing* to accept in *clear*. This is fine as our global
1547 * disposition to icmp messages asks us to reject the datagram.
1548 *
1549 * b) If we accept data at the IP layer (ipsec_check_global_policy()
1550 * succeeded or icmp_accept_clear_messages is 1), but are not able
1551 * to deliver it to the ULP (policy failed), it can lead to
1552 * consistency problems. The cases known at this time are
1553 * ICMP_DESTINATION_UNREACHABLE messages with the following code
1554 * values :
1555 *
1556 * - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
1557 * and the Upper layer rejects it. The communication will then
1558 * come to a stop. This is solved by making similar decisions
1559 * at both levels. Currently, when we are unable to deliver
1560 * to the Upper Layer (due to policy failures) while IP has
1561 * adjusted ire_max_frag, the next outbound datagram would
1562 * generate a local ICMP_FRAGMENTATION_NEEDED message - which
1563 * will have the right level of protection. Thus the right
1564 * value will be communicated even if we were not able to
1565 * communicate it when we first got it from the wire. But this
1566 * assumes there would be at least one outbound datagram after
1567 * IP has adjusted its ire_max_frag value. To make things
1568 * simpler, we accept in clear after the validation of
1569 * AH/ESP headers.
1570 *
1571 * - Other ICMP ERRORS : We may not be able to deliver it to the
1572 * upper layer depending on the level of protection the upper
1573 * layer expects and the disposition in ipsec_inbound_accept_clear().
1574 * ipsec_inbound_accept_clear() decides whether a given ICMP error
1575 * should be accepted in clear when the Upper layer expects secure.
1576 * Thus the communication may get aborted by some bad ICMP
1577 * packets.
1578 *
1579 * IPQoS Notes:
1580 * The only instance when a packet is sent for processing is when there
1581 * isn't an ICMP client and we are interested in it.
1582 * If there is a client, IPPF processing will take place in the
1583 * ip_fanout_proto routine.
1584 *
1585 * Zones notes:
1586 * The packet is only processed in the context of the specified zone: typically
1587 * only this zone will reply to an echo request, and only interested clients in
1588 * this zone will receive a copy of the packet. This means that the caller must
1589 * call icmp_inbound() for each relevant zone.
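 *
 * In rough pseudo-code, such a caller does (an illustrative sketch,
 * not the actual fanout code):
 *
 *	for (each zone z with an interest in the packet)
 *		icmp_inbound(q, copy-of-mp, broadcast, ill, ..., z);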
1590 */ 1591 static void 1592 icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, 1593 int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, 1594 ill_t *recv_ill, zoneid_t zoneid) 1595 { 1596 icmph_t *icmph; 1597 ipha_t *ipha; 1598 int iph_hdr_length; 1599 int hdr_length; 1600 boolean_t interested; 1601 uint32_t ts; 1602 uchar_t *wptr; 1603 ipif_t *ipif; 1604 mblk_t *first_mp; 1605 ipsec_in_t *ii; 1606 ire_t *src_ire; 1607 boolean_t onlink; 1608 timestruc_t now; 1609 uint32_t ill_index; 1610 ip_stack_t *ipst; 1611 1612 ASSERT(ill != NULL); 1613 ipst = ill->ill_ipst; 1614 1615 first_mp = mp; 1616 if (mctl_present) { 1617 mp = first_mp->b_cont; 1618 ASSERT(mp != NULL); 1619 } 1620 1621 ipha = (ipha_t *)mp->b_rptr; 1622 if (ipst->ips_icmp_accept_clear_messages == 0) { 1623 first_mp = ipsec_check_global_policy(first_mp, NULL, 1624 ipha, NULL, mctl_present, ipst->ips_netstack); 1625 if (first_mp == NULL) 1626 return; 1627 } 1628 1629 /* 1630 * On a labeled system, we have to check whether the zone itself is 1631 * permitted to receive raw traffic. 1632 */ 1633 if (is_system_labeled()) { 1634 if (zoneid == ALL_ZONES) 1635 zoneid = tsol_packet_to_zoneid(mp); 1636 if (!tsol_can_accept_raw(mp, B_FALSE)) { 1637 ip1dbg(("icmp_inbound: zone %d can't receive raw", 1638 zoneid)); 1639 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1640 freemsg(first_mp); 1641 return; 1642 } 1643 } 1644 1645 /* 1646 * We have accepted the ICMP message. It means that we will 1647 * respond to the packet if needed. It may not be delivered 1648 * to the upper client depending on the policy constraints 1649 * and the disposition in ipsec_inbound_accept_clear. 1650 */ 1651 1652 ASSERT(ill != NULL); 1653 1654 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); 1655 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1656 if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { 1657 /* Last chance to get real. */ 1658 if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { 1659 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1660 freemsg(first_mp); 1661 return; 1662 } 1663 /* Refresh iph following the pullup. */ 1664 ipha = (ipha_t *)mp->b_rptr; 1665 } 1666 /* ICMP header checksum, including checksum field, should be zero. */ 1667 if (sum_valid ? 
(sum != 0 && sum != 0xFFFF) : 1668 IP_CSUM(mp, iph_hdr_length, 0)) { 1669 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 1670 freemsg(first_mp); 1671 return; 1672 } 1673 /* The IP header will always be a multiple of four bytes */ 1674 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1675 ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, 1676 icmph->icmph_code)); 1677 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1678 /* We will set "interested" to "true" if we want a copy */ 1679 interested = B_FALSE; 1680 switch (icmph->icmph_type) { 1681 case ICMP_ECHO_REPLY: 1682 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps); 1683 break; 1684 case ICMP_DEST_UNREACHABLE: 1685 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) 1686 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded); 1687 interested = B_TRUE; /* Pass up to transport */ 1688 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs); 1689 break; 1690 case ICMP_SOURCE_QUENCH: 1691 interested = B_TRUE; /* Pass up to transport */ 1692 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs); 1693 break; 1694 case ICMP_REDIRECT: 1695 if (!ipst->ips_ip_ignore_redirect) 1696 interested = B_TRUE; 1697 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects); 1698 break; 1699 case ICMP_ECHO_REQUEST: 1700 /* 1701 * Whether to respond to echo requests that come in as IP 1702 * broadcasts or as IP multicast is subject to debate 1703 * (what isn't?). We aim to please, you pick it. 1704 * Default is do it. 1705 */ 1706 if (!broadcast && !CLASSD(ipha->ipha_dst)) { 1707 /* unicast: always respond */ 1708 interested = B_TRUE; 1709 } else if (CLASSD(ipha->ipha_dst)) { 1710 /* multicast: respond based on tunable */ 1711 interested = ipst->ips_ip_g_resp_to_echo_mcast; 1712 } else if (broadcast) { 1713 /* broadcast: respond based on tunable */ 1714 interested = ipst->ips_ip_g_resp_to_echo_bcast; 1715 } 1716 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); 1717 break; 1718 case ICMP_ROUTER_ADVERTISEMENT: 1719 case ICMP_ROUTER_SOLICITATION: 1720 break; 1721 case ICMP_TIME_EXCEEDED: 1722 interested = B_TRUE; /* Pass up to transport */ 1723 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds); 1724 break; 1725 case ICMP_PARAM_PROBLEM: 1726 interested = B_TRUE; /* Pass up to transport */ 1727 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs); 1728 break; 1729 case ICMP_TIME_STAMP_REQUEST: 1730 /* Response to Time Stamp Requests is local policy. */ 1731 if (ipst->ips_ip_g_resp_to_timestamp && 1732 /* So is whether to respond if it was an IP broadcast. */ 1733 (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { 1734 int tstamp_len = 3 * sizeof (uint32_t); 1735 1736 if (wptr + tstamp_len > mp->b_wptr) { 1737 if (!pullupmsg(mp, wptr + tstamp_len - 1738 mp->b_rptr)) { 1739 BUMP_MIB(ill->ill_ip_mib, 1740 ipIfStatsInDiscards); 1741 freemsg(first_mp); 1742 return; 1743 } 1744 /* Refresh ipha following the pullup. */ 1745 ipha = (ipha_t *)mp->b_rptr; 1746 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1747 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1748 } 1749 interested = B_TRUE; 1750 } 1751 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); 1752 break; 1753 case ICMP_TIME_STAMP_REPLY: 1754 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); 1755 break; 1756 case ICMP_INFO_REQUEST: 1757 /* Per RFC 1122 3.2.2.7, ignore this. */ 1758 case ICMP_INFO_REPLY: 1759 break; 1760 case ICMP_ADDRESS_MASK_REQUEST: 1761 if ((ipst->ips_ip_respond_to_address_mask_broadcast || 1762 !broadcast) && 1763 /* TODO m_pullup of complete header? 
*/ 1764 (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) {
1765 interested = B_TRUE;
1766 }
1767 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1768 break;
1769 case ICMP_ADDRESS_MASK_REPLY:
1770 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1771 break;
1772 default:
1773 interested = B_TRUE; /* Pass up to transport */
1774 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
1775 break;
1776 }
1777 /* See if there is an ICMP client. */
1778 if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) {
1779 /* If there is an ICMP client and we want one too, copy it. */
1780 mblk_t *first_mp1; 1781
1782 if (!interested) {
1783 ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
1784 ip_policy, recv_ill, zoneid);
1785 return;
1786 }
1787 first_mp1 = ip_copymsg(first_mp);
1788 if (first_mp1 != NULL) {
1789 ip_fanout_proto(q, first_mp1, ill, ipha,
1790 0, mctl_present, ip_policy, recv_ill, zoneid);
1791 }
1792 } else if (!interested) {
1793 freemsg(first_mp);
1794 return;
1795 } else {
1796 /*
1797 * Initiate policy processing for this packet if ip_policy
1798 * is true.
1799 */
1800 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
1801 ill_index = ill->ill_phyint->phyint_ifindex;
1802 ip_process(IPP_LOCAL_IN, &mp, ill_index);
1803 if (mp == NULL) {
1804 if (mctl_present) {
1805 freeb(first_mp);
1806 }
1807 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1808 return;
1809 }
1810 }
1811 }
1812 /* We want to do something with it. */
1813 /* Check db_ref to make sure we can modify the packet. */
1814 if (mp->b_datap->db_ref > 1) {
1815 mblk_t *first_mp1; 1816
1817 first_mp1 = ip_copymsg(first_mp);
1818 freemsg(first_mp);
1819 if (!first_mp1) {
1820 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1821 return;
1822 }
1823 first_mp = first_mp1;
1824 if (mctl_present) {
1825 mp = first_mp->b_cont;
1826 ASSERT(mp != NULL);
1827 } else {
1828 mp = first_mp;
1829 }
1830 ipha = (ipha_t *)mp->b_rptr;
1831 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1832 wptr = (uchar_t *)icmph + ICMPH_SIZE;
1833 }
1834 switch (icmph->icmph_type) {
1835 case ICMP_ADDRESS_MASK_REQUEST:
1836 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1837 if (ipif == NULL) {
1838 freemsg(first_mp);
1839 return;
1840 }
1841 /*
1842 * outgoing interface must be IPv4
1843 */
1844 ASSERT(ipif != NULL && !ipif->ipif_isv6);
1845 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
1846 bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
1847 ipif_refrele(ipif);
1848 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1849 break;
1850 case ICMP_ECHO_REQUEST:
1851 icmph->icmph_type = ICMP_ECHO_REPLY;
1852 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1853 break;
1854 case ICMP_TIME_STAMP_REQUEST: {
1855 uint32_t *tsp; 1856
1857 icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
1858 tsp = (uint32_t *)wptr;
1859 tsp++; /* Skip past 'originate time' */
1860 /* Compute # of milliseconds since midnight */
1861 gethrestime(&now);
1862 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
1863 now.tv_nsec / (NANOSEC / MILLISEC);
1864 *tsp++ = htonl(ts); /* Lay in 'receive time' */
1865 *tsp++ = htonl(ts); /* Lay in 'send time' */
1866 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
1867 break;
1868 }
1869 default:
1870 ipha = (ipha_t *)&icmph[1];
1871 if ((uchar_t *)&ipha[1] > mp->b_wptr) {
1872 if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
1873 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1874 freemsg(first_mp);
1875 return;
1876 }
1877 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1878 ipha = (ipha_t *)&icmph[1];
1879 }
1880 if
((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { 1881 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1882 freemsg(first_mp); 1883 return; 1884 } 1885 hdr_length = IPH_HDR_LENGTH(ipha); 1886 if (hdr_length < sizeof (ipha_t)) { 1887 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1888 freemsg(first_mp); 1889 return; 1890 } 1891 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 1892 if (!pullupmsg(mp, 1893 (uchar_t *)ipha + hdr_length - mp->b_rptr)) { 1894 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1895 freemsg(first_mp); 1896 return; 1897 } 1898 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1899 ipha = (ipha_t *)&icmph[1]; 1900 } 1901 switch (icmph->icmph_type) { 1902 case ICMP_REDIRECT: 1903 /* 1904 * As there is no upper client to deliver, we don't 1905 * need the first_mp any more. 1906 */ 1907 if (mctl_present) { 1908 freeb(first_mp); 1909 } 1910 icmp_redirect(ill, mp); 1911 return; 1912 case ICMP_DEST_UNREACHABLE: 1913 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { 1914 if (!icmp_inbound_too_big(icmph, ipha, ill, 1915 zoneid, mp, iph_hdr_length, ipst)) { 1916 freemsg(first_mp); 1917 return; 1918 } 1919 /* 1920 * icmp_inbound_too_big() may alter mp. 1921 * Resynch ipha and icmph accordingly. 1922 */ 1923 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1924 ipha = (ipha_t *)&icmph[1]; 1925 } 1926 /* FALLTHRU */ 1927 default : 1928 /* 1929 * IPQoS notes: Since we have already done IPQoS 1930 * processing we don't want to do it again in 1931 * the fanout routines called by 1932 * icmp_inbound_error_fanout, hence the last 1933 * argument, ip_policy, is B_FALSE. 1934 */ 1935 icmp_inbound_error_fanout(q, ill, first_mp, icmph, 1936 ipha, iph_hdr_length, hdr_length, mctl_present, 1937 B_FALSE, recv_ill, zoneid); 1938 } 1939 return; 1940 } 1941 /* Send out an ICMP packet */ 1942 icmph->icmph_checksum = 0; 1943 icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); 1944 if (broadcast || CLASSD(ipha->ipha_dst)) { 1945 ipif_t *ipif_chosen; 1946 /* 1947 * Make it look like it was directed to us, so we don't look 1948 * like a fool with a broadcast or multicast source address. 1949 */ 1950 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1951 /* 1952 * Make sure that we haven't grabbed an interface that's DOWN. 1953 */ 1954 if (ipif != NULL) { 1955 ipif_chosen = ipif_select_source(ipif->ipif_ill, 1956 ipha->ipha_src, zoneid); 1957 if (ipif_chosen != NULL) { 1958 ipif_refrele(ipif); 1959 ipif = ipif_chosen; 1960 } 1961 } 1962 if (ipif == NULL) { 1963 ip0dbg(("icmp_inbound: " 1964 "No source for broadcast/multicast:\n" 1965 "\tsrc 0x%x dst 0x%x ill %p " 1966 "ipif_lcl_addr 0x%x\n", 1967 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1968 (void *)ill, 1969 ill->ill_ipif->ipif_lcl_addr)); 1970 freemsg(first_mp); 1971 return; 1972 } 1973 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1974 ipha->ipha_dst = ipif->ipif_src_addr; 1975 ipif_refrele(ipif); 1976 } 1977 /* Reset time to live. */ 1978 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 1979 { 1980 /* Swap source and destination addresses */ 1981 ipaddr_t tmp; 1982 1983 tmp = ipha->ipha_src; 1984 ipha->ipha_src = ipha->ipha_dst; 1985 ipha->ipha_dst = tmp; 1986 } 1987 ipha->ipha_ident = 0; 1988 if (!IS_SIMPLE_IPH(ipha)) 1989 icmp_options_update(ipha); 1990 1991 /* 1992 * ICMP echo replies should go out on the same interface 1993 * the request came on as probes used by in.mpathd for detecting 1994 * NIC failures are ECHO packets. 
We turn off load spreading
1995 * by setting ipsec_in_attach_if to B_TRUE, which is copied
1996 * to ipsec_out_attach_if by ipsec_in_to_out called later in this
1997 * function. This is in turn handled by ip_wput and ip_newroute
1998 * to make sure that the packet goes out on the interface it came
1999 * in on. If we don't turn off load spreading, the packets might get
2000 * dropped if there are no non-FAILED/INACTIVE interfaces for them
2001 * to go out on, and in.mpathd would wrongly detect a failure, or
2002 * mistake a link failure for a NIC failure. As load spreading
2003 * can happen only if ill_group is not NULL, we do this only for
2004 * that case, and it does not affect the normal case.
2005 *
2006 * We turn off load spreading only on echo packets that came from
2007 * on-link hosts. If the interface route has been deleted, this will
2008 * not be enforced as we can't do much. For off-link hosts, as the
2009 * default routes in IPv4 do not typically have an ire_ipif
2010 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
2011 * Moreover, expecting a default route through this interface may
2012 * not be correct. We use ipha_dst because of the swap above.
2013 */
2014 onlink = B_FALSE;
2015 if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
2016 /*
2017 * First, we need to make sure that it is not one of our
2018 * local addresses. If we set onlink when it is one of
2019 * our local addresses, we will end up creating IRE_CACHES
2020 * for one of our local addresses. Then, we will never
2021 * accept packets for them afterwards.
2022 */
2023 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
2024 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
2025 if (src_ire == NULL) {
2026 ipif = ipif_get_next_ipif(NULL, ill);
2027 if (ipif == NULL) {
2028 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2029 freemsg(mp);
2030 return;
2031 }
2032 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
2033 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
2034 NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst);
2035 ipif_refrele(ipif);
2036 if (src_ire != NULL) {
2037 onlink = B_TRUE;
2038 ire_refrele(src_ire);
2039 }
2040 } else {
2041 ire_refrele(src_ire);
2042 }
2043 }
2044 if (!mctl_present) {
2045 /*
2046 * This packet should go out the same way as it
2047 * came in, i.e. in clear. To make sure that global
2048 * policy will not be applied to this in ip_wput_ire,
2049 * we attach an IPSEC_IN mp and clear ipsec_in_secure.
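 *
 * (Sketch of the effect: ipsec_in_to_out() below converts this
 * IPSEC_IN into an IPSEC_OUT that likewise carries the "not
 * secure" disposition, so ip_wput_ire treats the reply as already
 * checked and does not apply global policy to it.)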
2050 */ 2051 ASSERT(first_mp == mp); 2052 first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 2053 if (first_mp == NULL) { 2054 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2055 freemsg(mp); 2056 return; 2057 } 2058 ii = (ipsec_in_t *)first_mp->b_rptr; 2059 2060 /* This is not a secure packet */ 2061 ii->ipsec_in_secure = B_FALSE; 2062 if (onlink) { 2063 ii->ipsec_in_attach_if = B_TRUE; 2064 ii->ipsec_in_ill_index = 2065 ill->ill_phyint->phyint_ifindex; 2066 ii->ipsec_in_rill_index = 2067 recv_ill->ill_phyint->phyint_ifindex; 2068 } 2069 first_mp->b_cont = mp; 2070 } else if (onlink) { 2071 ii = (ipsec_in_t *)first_mp->b_rptr; 2072 ii->ipsec_in_attach_if = B_TRUE; 2073 ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; 2074 ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; 2075 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2076 } else { 2077 ii = (ipsec_in_t *)first_mp->b_rptr; 2078 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2079 } 2080 ii->ipsec_in_zoneid = zoneid; 2081 ASSERT(zoneid != ALL_ZONES); 2082 if (!ipsec_in_to_out(first_mp, ipha, NULL)) { 2083 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2084 return; 2085 } 2086 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 2087 put(WR(q), first_mp); 2088 } 2089 2090 static ipaddr_t 2091 icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) 2092 { 2093 conn_t *connp; 2094 connf_t *connfp; 2095 ipaddr_t nexthop_addr = INADDR_ANY; 2096 int hdr_length = IPH_HDR_LENGTH(ipha); 2097 uint16_t *up; 2098 uint32_t ports; 2099 ip_stack_t *ipst = ill->ill_ipst; 2100 2101 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2102 switch (ipha->ipha_protocol) { 2103 case IPPROTO_TCP: 2104 { 2105 tcph_t *tcph; 2106 2107 /* do a reverse lookup */ 2108 tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); 2109 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, 2110 TCPS_LISTEN, ipst); 2111 break; 2112 } 2113 case IPPROTO_UDP: 2114 { 2115 uint32_t dstport, srcport; 2116 2117 ((uint16_t *)&ports)[0] = up[1]; 2118 ((uint16_t *)&ports)[1] = up[0]; 2119 2120 /* Extract ports in net byte order */ 2121 dstport = htons(ntohl(ports) & 0xFFFF); 2122 srcport = htons(ntohl(ports) >> 16); 2123 2124 connfp = &ipst->ips_ipcl_udp_fanout[ 2125 IPCL_UDP_HASH(dstport, ipst)]; 2126 mutex_enter(&connfp->connf_lock); 2127 connp = connfp->connf_head; 2128 2129 /* do a reverse lookup */ 2130 while ((connp != NULL) && 2131 (!IPCL_UDP_MATCH(connp, dstport, 2132 ipha->ipha_src, srcport, ipha->ipha_dst) || 2133 !IPCL_ZONE_MATCH(connp, zoneid))) { 2134 connp = connp->conn_next; 2135 } 2136 if (connp != NULL) 2137 CONN_INC_REF(connp); 2138 mutex_exit(&connfp->connf_lock); 2139 break; 2140 } 2141 case IPPROTO_SCTP: 2142 { 2143 in6_addr_t map_src, map_dst; 2144 2145 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); 2146 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); 2147 ((uint16_t *)&ports)[0] = up[1]; 2148 ((uint16_t *)&ports)[1] = up[0]; 2149 2150 connp = sctp_find_conn(&map_src, &map_dst, ports, 2151 zoneid, ipst->ips_netstack->netstack_sctp); 2152 if (connp == NULL) { 2153 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, 2154 zoneid, ports, ipha, ipst); 2155 } else { 2156 CONN_INC_REF(connp); 2157 SCTP_REFRELE(CONN2SCTP(connp)); 2158 } 2159 break; 2160 } 2161 default: 2162 { 2163 ipha_t ripha; 2164 2165 ripha.ipha_src = ipha->ipha_dst; 2166 ripha.ipha_dst = ipha->ipha_src; 2167 ripha.ipha_protocol = ipha->ipha_protocol; 2168 2169 connfp = &ipst->ips_ipcl_proto_fanout[ 2170 ipha->ipha_protocol]; 2171 
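			/*
			 * Walk this protocol's fanout bucket for a
			 * conn_t bound in the given zone; ripha is
			 * reversed, so the match below is done as if
			 * against the original outbound packet.
			 */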
mutex_enter(&connfp->connf_lock);
2172 connp = connfp->connf_head;
2173 for (connp = connfp->connf_head; connp != NULL;
2174 connp = connp->conn_next) {
2175 if (IPCL_PROTO_MATCH(connp,
2176 ipha->ipha_protocol, &ripha, ill,
2177 0, zoneid)) {
2178 CONN_INC_REF(connp);
2179 break;
2180 }
2181 }
2182 mutex_exit(&connfp->connf_lock);
2183 }
2184 }
2185 if (connp != NULL) {
2186 if (connp->conn_nexthop_set)
2187 nexthop_addr = connp->conn_nexthop_v4;
2188 CONN_DEC_REF(connp);
2189 }
2190 return (nexthop_addr);
2191 } 2192
2193 /* Table from RFC 1191 */
2194 static int icmp_frag_size_table[] =
2195 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; 2196
2197 /*
2198 * Process received ICMP Packet too big.
2199 * After updating any IRE it does the fanout to any matching transport streams.
2200 * Assumes the message has been pulled up to the IP header that caused
2201 * the error.
2202 *
2203 * Returns B_FALSE on failure and B_TRUE on success.
2204 */
2205 static boolean_t
2206 icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
2207 zoneid_t zoneid, mblk_t *mp, int iph_hdr_length,
2208 ip_stack_t *ipst)
2209 {
2210 ire_t *ire, *first_ire;
2211 int mtu;
2212 int hdr_length;
2213 ipaddr_t nexthop_addr; 2214
2215 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
2216 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
2217 ASSERT(ill != NULL); 2218
2219 hdr_length = IPH_HDR_LENGTH(ipha); 2220
2221 /* Drop if the original packet contained a source route */
2222 if (ip_source_route_included(ipha)) {
2223 return (B_FALSE);
2224 }
2225 /*
2226 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of transport
2227 * header.
2228 */
2229 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
2230 mp->b_wptr) {
2231 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
2232 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
2233 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2234 ip1dbg(("icmp_inbound_too_big: insufficient hdr\n"));
2235 return (B_FALSE);
2236 }
2237 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
2238 ipha = (ipha_t *)&icmph[1];
2239 }
2240 nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp);
2241 if (nexthop_addr != INADDR_ANY) {
2242 /* nexthop set */
2243 first_ire = ire_ctable_lookup(ipha->ipha_dst,
2244 nexthop_addr, 0, NULL, ALL_ZONES, MBLK_GETLABEL(mp),
2245 MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst);
2246 } else {
2247 /* nexthop not set */
2248 first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE,
2249 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
2250 } 2251
2252 if (!first_ire) {
2253 ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
2254 ntohl(ipha->ipha_dst)));
2255 return (B_FALSE);
2256 }
2257 /* Check for MTU discovery advice as described in RFC 1191 */
2258 mtu = ntohs(icmph->icmph_du_mtu);
2259 rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
2260 for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
2261 ire = ire->ire_next) {
2262 /*
2263 * Look for the connection to which this ICMP message is
2264 * directed. If it has the IP_NEXTHOP option set, then the
2265 * search is limited to IREs with the MATCH_IRE_PRIVATE
2266 * option. Else the search is limited to regular IREs.
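 *
 * Concretely, the test below skips an IRE unless either it is
 * marked IRE_MARK_PRIVATE_ADDR and its gateway equals the
 * connection's nexthop, or it carries no such mark and no
 * nexthop was set (nexthop_addr == INADDR_ANY).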
2267 */ 2268 if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2269 (nexthop_addr != ire->ire_gateway_addr)) || 2270 (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2271 (nexthop_addr != INADDR_ANY))) 2272 continue; 2273 2274 mutex_enter(&ire->ire_lock); 2275 if (icmph->icmph_du_zero == 0 && mtu > 68) { 2276 /* Reduce the IRE max frag value as advised. */ 2277 ip1dbg(("Received mtu from router: %d (was %d)\n", 2278 mtu, ire->ire_max_frag)); 2279 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2280 } else { 2281 uint32_t length; 2282 int i; 2283 2284 /* 2285 * Use the table from RFC 1191 to figure out 2286 * the next "plateau" based on the length in 2287 * the original IP packet. 2288 */ 2289 length = ntohs(ipha->ipha_length); 2290 if (ire->ire_max_frag <= length && 2291 ire->ire_max_frag >= length - hdr_length) { 2292 /* 2293 * Handle broken BSD 4.2 systems that 2294 * return the wrong iph_length in ICMP 2295 * errors. 2296 */ 2297 ip1dbg(("Wrong mtu: sent %d, ire %d\n", 2298 length, ire->ire_max_frag)); 2299 length -= hdr_length; 2300 } 2301 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { 2302 if (length > icmp_frag_size_table[i]) 2303 break; 2304 } 2305 if (i == A_CNT(icmp_frag_size_table)) { 2306 /* Smaller than 68! */ 2307 ip1dbg(("Too big for packet size %d\n", 2308 length)); 2309 ire->ire_max_frag = MIN(ire->ire_max_frag, 576); 2310 ire->ire_frag_flag = 0; 2311 } else { 2312 mtu = icmp_frag_size_table[i]; 2313 ip1dbg(("Calculated mtu %d, packet size %d, " 2314 "before %d", mtu, length, 2315 ire->ire_max_frag)); 2316 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2317 ip1dbg((", after %d\n", ire->ire_max_frag)); 2318 } 2319 /* Record the new max frag size for the ULP. */ 2320 icmph->icmph_du_zero = 0; 2321 icmph->icmph_du_mtu = 2322 htons((uint16_t)ire->ire_max_frag); 2323 } 2324 mutex_exit(&ire->ire_lock); 2325 } 2326 rw_exit(&first_ire->ire_bucket->irb_lock); 2327 ire_refrele(first_ire); 2328 return (B_TRUE); 2329 } 2330 2331 /* 2332 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout 2333 * calls this function. 2334 */ 2335 static mblk_t * 2336 icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) 2337 { 2338 ipha_t *ipha; 2339 icmph_t *icmph; 2340 ipha_t *in_ipha; 2341 int length; 2342 2343 ASSERT(mp->b_datap->db_type == M_DATA); 2344 2345 /* 2346 * For Self-encapsulated packets, we added an extra IP header 2347 * without the options. Inner IP header is the one from which 2348 * the outer IP header was formed. Thus, we need to remove the 2349 * outer IP header. To do this, we pullup the whole message 2350 * and overlay whatever follows the outer IP header over the 2351 * outer IP header. 2352 */ 2353 2354 if (!pullupmsg(mp, -1)) 2355 return (NULL); 2356 2357 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2358 ipha = (ipha_t *)&icmph[1]; 2359 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2360 2361 /* 2362 * The length that we want to overlay is following the inner 2363 * IP header. Subtracting the IP header + icmp header + outer 2364 * IP header's length should give us the length that we want to 2365 * overlay. 2366 */ 2367 length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - 2368 hdr_length; 2369 /* 2370 * Overlay whatever follows the inner header over the 2371 * outer header. 
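 *
 * Schematically, with "outer" denoting the header of the packet
 * in error:
 *
 *	before:	[IP][ICMP][outer IP][inner IP][data ...]
 *	after:	[IP][ICMP][inner IP][data ...]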
2372 */ 2373 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); 2374 2375 /* Set the wptr to account for the outer header */ 2376 mp->b_wptr -= hdr_length; 2377 return (mp); 2378 } 2379 2380 /* 2381 * Try to pass the ICMP message upstream in case the ULP cares. 2382 * 2383 * If the packet that caused the ICMP error is secure, we send 2384 * it to AH/ESP to make sure that the attached packet has a 2385 * valid association. ipha in the code below points to the 2386 * IP header of the packet that caused the error. 2387 * 2388 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently 2389 * in the context of IPsec. Normally we tell the upper layer 2390 * whenever we send the ire (including ip_bind), the IPsec header 2391 * length in ire_ipsec_overhead. TCP can deduce the MSS as it 2392 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead. 2393 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the 2394 * same thing. As TCP has the IPsec options size that needs to be 2395 * adjusted, we just pass the MTU unchanged. 2396 * 2397 * IFN could have been generated locally or by some router. 2398 * 2399 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. 2400 * This happens because IP adjusted its value of MTU on an 2401 * earlier IFN message and could not tell the upper layer, 2402 * the new adjusted value of MTU e.g. Packet was encrypted 2403 * or there was not enough information to fanout to upper 2404 * layers. Thus on the next outbound datagram, ip_wput_ire 2405 * generates the IFN, where IPsec processing has *not* been 2406 * done. 2407 * 2408 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2409 * could have generated this. This happens because ire_max_frag 2410 * value in IP was set to a new value, while the IPsec processing 2411 * was being done and after we made the fragmentation check in 2412 * ip_wput_ire. Thus on return from IPsec processing, 2413 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2414 * and generates the IFN. As IPsec processing is over, we fanout 2415 * to AH/ESP to remove the header. 2416 * 2417 * In both these cases, ipsec_in_loopback will be set indicating 2418 * that IFN was generated locally. 2419 * 2420 * ROUTER : IFN could be secure or non-secure. 2421 * 2422 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2423 * packet in error has AH/ESP headers to validate the AH/ESP 2424 * headers. AH/ESP will verify whether there is a valid SA or 2425 * not and send it back. We will fanout again if we have more 2426 * data in the packet. 2427 * 2428 * If the packet in error does not have AH/ESP, we handle it 2429 * like any other case. 2430 * 2431 * * NON_SECURE : If the packet in error has AH/ESP headers, 2432 * we attach a dummy ipsec_in and send it up to AH/ESP 2433 * for validation. AH/ESP will verify whether there is a 2434 * valid SA or not and send it back. We will fanout again if 2435 * we have more data in the packet. 2436 * 2437 * If the packet in error does not have AH/ESP, we handle it 2438 * like any other case. 
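 *
 * To summarize the dispositions above (sketch):
 *
 *	LOCAL IFN:			ipsec_in_loopback is set; fan
 *					out to AH/ESP to strip the headers.
 *	ROUTER IFN with AH/ESP:		fan out to AH/ESP for validation,
 *					then fan out again if more data
 *					remains.
 *	ROUTER IFN without AH/ESP:	handled like any other ICMP error.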
2439 */ 2440 static void 2441 icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, 2442 icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length, 2443 boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, 2444 zoneid_t zoneid) 2445 { 2446 uint16_t *up; /* Pointer to ports in ULP header */ 2447 uint32_t ports; /* reversed ports for fanout */ 2448 ipha_t ripha; /* With reversed addresses */ 2449 mblk_t *first_mp; 2450 ipsec_in_t *ii; 2451 tcph_t *tcph; 2452 conn_t *connp; 2453 ip_stack_t *ipst; 2454 2455 ASSERT(ill != NULL); 2456 2457 ASSERT(recv_ill != NULL); 2458 ipst = recv_ill->ill_ipst; 2459 2460 first_mp = mp; 2461 if (mctl_present) { 2462 mp = first_mp->b_cont; 2463 ASSERT(mp != NULL); 2464 2465 ii = (ipsec_in_t *)first_mp->b_rptr; 2466 ASSERT(ii->ipsec_in_type == IPSEC_IN); 2467 } else { 2468 ii = NULL; 2469 } 2470 2471 switch (ipha->ipha_protocol) { 2472 case IPPROTO_UDP: 2473 /* 2474 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2475 * transport header. 2476 */ 2477 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2478 mp->b_wptr) { 2479 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2480 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2481 goto discard_pkt; 2482 } 2483 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2484 ipha = (ipha_t *)&icmph[1]; 2485 } 2486 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2487 2488 /* 2489 * Attempt to find a client stream based on port. 2490 * Note that we do a reverse lookup since the header is 2491 * in the form we sent it out. 2492 * The ripha header is only used for the IP_UDP_MATCH and we 2493 * only set the src and dst addresses and protocol. 2494 */ 2495 ripha.ipha_src = ipha->ipha_dst; 2496 ripha.ipha_dst = ipha->ipha_src; 2497 ripha.ipha_protocol = ipha->ipha_protocol; 2498 ((uint16_t *)&ports)[0] = up[1]; 2499 ((uint16_t *)&ports)[1] = up[0]; 2500 ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n", 2501 ntohl(ipha->ipha_src), ntohs(up[0]), 2502 ntohl(ipha->ipha_dst), ntohs(up[1]), 2503 icmph->icmph_type, icmph->icmph_code)); 2504 2505 /* Have to change db_type after any pullupmsg */ 2506 DB_TYPE(mp) = M_CTL; 2507 2508 ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0, 2509 mctl_present, ip_policy, recv_ill, zoneid); 2510 return; 2511 2512 case IPPROTO_TCP: 2513 /* 2514 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2515 * transport header. 2516 */ 2517 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2518 mp->b_wptr) { 2519 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2520 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2521 goto discard_pkt; 2522 } 2523 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2524 ipha = (ipha_t *)&icmph[1]; 2525 } 2526 /* 2527 * Find a TCP client stream for this packet. 2528 * Note that we do a reverse lookup since the header is 2529 * in the form we sent it out. 2530 */ 2531 tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); 2532 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN, 2533 ipst); 2534 if (connp == NULL) 2535 goto discard_pkt; 2536 2537 /* Have to change db_type after any pullupmsg */ 2538 DB_TYPE(mp) = M_CTL; 2539 squeue_fill(connp->conn_sqp, first_mp, tcp_input, 2540 connp, SQTAG_TCP_INPUT_ICMP_ERR); 2541 return; 2542 2543 case IPPROTO_SCTP: 2544 /* 2545 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2546 * transport header. 
2547 */ 2548 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2549 mp->b_wptr) { 2550 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2551 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2552 goto discard_pkt; 2553 } 2554 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2555 ipha = (ipha_t *)&icmph[1]; 2556 } 2557 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2558 /* 2559 * Find a SCTP client stream for this packet. 2560 * Note that we do a reverse lookup since the header is 2561 * in the form we sent it out. 2562 * The ripha header is only used for the matching and we 2563 * only set the src and dst addresses, protocol, and version. 2564 */ 2565 ripha.ipha_src = ipha->ipha_dst; 2566 ripha.ipha_dst = ipha->ipha_src; 2567 ripha.ipha_protocol = ipha->ipha_protocol; 2568 ripha.ipha_version_and_hdr_length = 2569 ipha->ipha_version_and_hdr_length; 2570 ((uint16_t *)&ports)[0] = up[1]; 2571 ((uint16_t *)&ports)[1] = up[0]; 2572 2573 /* Have to change db_type after any pullupmsg */ 2574 DB_TYPE(mp) = M_CTL; 2575 ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0, 2576 mctl_present, ip_policy, zoneid); 2577 return; 2578 2579 case IPPROTO_ESP: 2580 case IPPROTO_AH: { 2581 int ipsec_rc; 2582 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 2583 2584 /* 2585 * We need a IPSEC_IN in the front to fanout to AH/ESP. 2586 * We will re-use the IPSEC_IN if it is already present as 2587 * AH/ESP will not affect any fields in the IPSEC_IN for 2588 * ICMP errors. If there is no IPSEC_IN, allocate a new 2589 * one and attach it in the front. 2590 */ 2591 if (ii != NULL) { 2592 /* 2593 * ip_fanout_proto_again converts the ICMP errors 2594 * that come back from AH/ESP to M_DATA so that 2595 * if it is non-AH/ESP and we do a pullupmsg in 2596 * this function, it would work. Convert it back 2597 * to M_CTL before we send up as this is a ICMP 2598 * error. This could have been generated locally or 2599 * by some router. Validate the inner IPsec 2600 * headers. 2601 * 2602 * NOTE : ill_index is used by ip_fanout_proto_again 2603 * to locate the ill. 2604 */ 2605 ASSERT(ill != NULL); 2606 ii->ipsec_in_ill_index = 2607 ill->ill_phyint->phyint_ifindex; 2608 ii->ipsec_in_rill_index = 2609 recv_ill->ill_phyint->phyint_ifindex; 2610 DB_TYPE(first_mp->b_cont) = M_CTL; 2611 } else { 2612 /* 2613 * IPSEC_IN is not present. We attach a ipsec_in 2614 * message and send up to IPsec for validating 2615 * and removing the IPsec headers. Clear 2616 * ipsec_in_secure so that when we return 2617 * from IPsec, we don't mistakenly think that this 2618 * is a secure packet came from the network. 2619 * 2620 * NOTE : ill_index is used by ip_fanout_proto_again 2621 * to locate the ill. 
2622 */ 2623 ASSERT(first_mp == mp); 2624 first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 2625 if (first_mp == NULL) { 2626 freemsg(mp); 2627 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2628 return; 2629 } 2630 ii = (ipsec_in_t *)first_mp->b_rptr; 2631 2632 /* This is not a secure packet */ 2633 ii->ipsec_in_secure = B_FALSE; 2634 first_mp->b_cont = mp; 2635 DB_TYPE(mp) = M_CTL; 2636 ASSERT(ill != NULL); 2637 ii->ipsec_in_ill_index = 2638 ill->ill_phyint->phyint_ifindex; 2639 ii->ipsec_in_rill_index = 2640 recv_ill->ill_phyint->phyint_ifindex; 2641 } 2642 ip2dbg(("icmp_inbound_error: ipsec\n")); 2643 2644 if (!ipsec_loaded(ipss)) { 2645 ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); 2646 return; 2647 } 2648 2649 if (ipha->ipha_protocol == IPPROTO_ESP) 2650 ipsec_rc = ipsecesp_icmp_error(first_mp); 2651 else 2652 ipsec_rc = ipsecah_icmp_error(first_mp); 2653 if (ipsec_rc == IPSEC_STATUS_FAILED) 2654 return; 2655 2656 ip_fanout_proto_again(first_mp, ill, recv_ill, NULL); 2657 return; 2658 } 2659 default: 2660 /* 2661 * The ripha header is only used for the lookup and we 2662 * only set the src and dst addresses and protocol. 2663 */ 2664 ripha.ipha_src = ipha->ipha_dst; 2665 ripha.ipha_dst = ipha->ipha_src; 2666 ripha.ipha_protocol = ipha->ipha_protocol; 2667 ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n", 2668 ripha.ipha_protocol, ntohl(ipha->ipha_src), 2669 ntohl(ipha->ipha_dst), 2670 icmph->icmph_type, icmph->icmph_code)); 2671 if (ipha->ipha_protocol == IPPROTO_ENCAP) { 2672 ipha_t *in_ipha; 2673 2674 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 2675 mp->b_wptr) { 2676 if (!pullupmsg(mp, (uchar_t *)ipha + 2677 hdr_length + sizeof (ipha_t) - 2678 mp->b_rptr)) { 2679 goto discard_pkt; 2680 } 2681 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2682 ipha = (ipha_t *)&icmph[1]; 2683 } 2684 /* 2685 * Caller has verified that length has to be 2686 * at least the size of IP header. 2687 */ 2688 ASSERT(hdr_length >= sizeof (ipha_t)); 2689 /* 2690 * Check the sanity of the inner IP header like 2691 * we did for the outer header. 2692 */ 2693 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2694 if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { 2695 goto discard_pkt; 2696 } 2697 if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { 2698 goto discard_pkt; 2699 } 2700 /* Check for Self-encapsulated tunnels */ 2701 if (in_ipha->ipha_src == ipha->ipha_src && 2702 in_ipha->ipha_dst == ipha->ipha_dst) { 2703 2704 mp = icmp_inbound_self_encap_error(mp, 2705 iph_hdr_length, hdr_length); 2706 if (mp == NULL) 2707 goto discard_pkt; 2708 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2709 ipha = (ipha_t *)&icmph[1]; 2710 hdr_length = IPH_HDR_LENGTH(ipha); 2711 /* 2712 * The packet in error is self-encapsualted. 2713 * And we are finding it further encapsulated 2714 * which we could not have possibly generated. 
2715 */ 2716 if (ipha->ipha_protocol == IPPROTO_ENCAP) { 2717 goto discard_pkt; 2718 } 2719 icmp_inbound_error_fanout(q, ill, first_mp, 2720 icmph, ipha, iph_hdr_length, hdr_length, 2721 mctl_present, ip_policy, recv_ill, zoneid); 2722 return; 2723 } 2724 } 2725 if ((ipha->ipha_protocol == IPPROTO_ENCAP || 2726 ipha->ipha_protocol == IPPROTO_IPV6) && 2727 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && 2728 ii != NULL && 2729 ii->ipsec_in_loopback && 2730 ii->ipsec_in_secure) { 2731 /* 2732 * For IP tunnels that get a looped-back 2733 * ICMP_FRAGMENTATION_NEEDED message, adjust the 2734 * reported new MTU to take into account the IPsec 2735 * headers protecting this configured tunnel. 2736 * 2737 * This allows the tunnel module (tun.c) to blindly 2738 * accept the MTU reported in an ICMP "too big" 2739 * message. 2740 * 2741 * Non-looped back ICMP messages will just be 2742 * handled by the security protocols (if needed), 2743 * and the first subsequent packet will hit this 2744 * path. 2745 */ 2746 icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) - 2747 ipsec_in_extra_length(first_mp)); 2748 } 2749 /* Have to change db_type after any pullupmsg */ 2750 DB_TYPE(mp) = M_CTL; 2751 2752 ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present, 2753 ip_policy, recv_ill, zoneid); 2754 return; 2755 } 2756 /* NOTREACHED */ 2757 discard_pkt: 2758 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2759 drop_pkt:; 2760 ip1dbg(("icmp_inbound_error_fanout: drop pkt\n")); 2761 freemsg(first_mp); 2762 } 2763 2764 /* 2765 * Common IP options parser. 2766 * 2767 * Setup routine: fill in *optp with options-parsing state, then 2768 * tail-call ipoptp_next to return the first option. 2769 */ 2770 uint8_t 2771 ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2772 { 2773 uint32_t totallen; /* total length of all options */ 2774 2775 totallen = ipha->ipha_version_and_hdr_length - 2776 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2777 totallen <<= 2; 2778 optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2779 optp->ipoptp_end = optp->ipoptp_next + totallen; 2780 optp->ipoptp_flags = 0; 2781 return (ipoptp_next(optp)); 2782 } 2783 2784 /* 2785 * Common IP options parser: extract next option. 2786 */ 2787 uint8_t 2788 ipoptp_next(ipoptp_t *optp) 2789 { 2790 uint8_t *end = optp->ipoptp_end; 2791 uint8_t *cur = optp->ipoptp_next; 2792 uint8_t opt, len, pointer; 2793 2794 /* 2795 * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2796 * has been corrupted. 2797 */ 2798 ASSERT(cur <= end); 2799 2800 if (cur == end) 2801 return (IPOPT_EOL); 2802 2803 opt = cur[IPOPT_OPTVAL]; 2804 2805 /* 2806 * Skip any NOP options. 2807 */ 2808 while (opt == IPOPT_NOP) { 2809 cur++; 2810 if (cur == end) 2811 return (IPOPT_EOL); 2812 opt = cur[IPOPT_OPTVAL]; 2813 } 2814 2815 if (opt == IPOPT_EOL) 2816 return (IPOPT_EOL); 2817 2818 /* 2819 * Option requiring a length. 2820 */ 2821 if ((cur + 1) >= end) { 2822 optp->ipoptp_flags |= IPOPTP_ERROR; 2823 return (IPOPT_EOL); 2824 } 2825 len = cur[IPOPT_OLEN]; 2826 if (len < 2) { 2827 optp->ipoptp_flags |= IPOPTP_ERROR; 2828 return (IPOPT_EOL); 2829 } 2830 optp->ipoptp_cur = cur; 2831 optp->ipoptp_len = len; 2832 optp->ipoptp_next = cur + len; 2833 if (cur + len > end) { 2834 optp->ipoptp_flags |= IPOPTP_ERROR; 2835 return (IPOPT_EOL); 2836 } 2837 2838 /* 2839 * For the options which require a pointer field, make sure 2840 * its there, and make sure it points to either something 2841 * inside this option, or the end of the option. 
2842 */ 2843 switch (opt) { 2844 case IPOPT_RR: 2845 case IPOPT_TS: 2846 case IPOPT_LSRR: 2847 case IPOPT_SSRR: 2848 if (len <= IPOPT_OFFSET) { 2849 optp->ipoptp_flags |= IPOPTP_ERROR; 2850 return (opt); 2851 } 2852 pointer = cur[IPOPT_OFFSET]; 2853 if (pointer - 1 > len) { 2854 optp->ipoptp_flags |= IPOPTP_ERROR; 2855 return (opt); 2856 } 2857 break; 2858 } 2859 2860 /* 2861 * Sanity check the pointer field based on the type of the 2862 * option. 2863 */ 2864 switch (opt) { 2865 case IPOPT_RR: 2866 case IPOPT_SSRR: 2867 case IPOPT_LSRR: 2868 if (pointer < IPOPT_MINOFF_SR) 2869 optp->ipoptp_flags |= IPOPTP_ERROR; 2870 break; 2871 case IPOPT_TS: 2872 if (pointer < IPOPT_MINOFF_IT) 2873 optp->ipoptp_flags |= IPOPTP_ERROR; 2874 /* 2875 * Note that the Internet Timestamp option also 2876 * contains two four bit fields (the Overflow field, 2877 * and the Flag field), which follow the pointer 2878 * field. We don't need to check that these fields 2879 * fall within the length of the option because this 2880 * was implicitely done above. We've checked that the 2881 * pointer value is at least IPOPT_MINOFF_IT, and that 2882 * it falls within the option. Since IPOPT_MINOFF_IT > 2883 * IPOPT_POS_OV_FLG, we don't need the explicit check. 2884 */ 2885 ASSERT(len > IPOPT_POS_OV_FLG); 2886 break; 2887 } 2888 2889 return (opt); 2890 } 2891 2892 /* 2893 * Use the outgoing IP header to create an IP_OPTIONS option the way 2894 * it was passed down from the application. 2895 */ 2896 int 2897 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) 2898 { 2899 ipoptp_t opts; 2900 const uchar_t *opt; 2901 uint8_t optval; 2902 uint8_t optlen; 2903 uint32_t len = 0; 2904 uchar_t *buf1 = buf; 2905 2906 buf += IP_ADDR_LEN; /* Leave room for final destination */ 2907 len += IP_ADDR_LEN; 2908 bzero(buf1, IP_ADDR_LEN); 2909 2910 /* 2911 * OK to cast away const here, as we don't store through the returned 2912 * opts.ipoptp_cur pointer. 2913 */ 2914 for (optval = ipoptp_first(&opts, (ipha_t *)ipha); 2915 optval != IPOPT_EOL; 2916 optval = ipoptp_next(&opts)) { 2917 int off; 2918 2919 opt = opts.ipoptp_cur; 2920 optlen = opts.ipoptp_len; 2921 switch (optval) { 2922 case IPOPT_SSRR: 2923 case IPOPT_LSRR: 2924 2925 /* 2926 * Insert ipha_dst as the first entry in the source 2927 * route and move down the entries on step. 2928 * The last entry gets placed at buf1. 2929 */ 2930 buf[IPOPT_OPTVAL] = optval; 2931 buf[IPOPT_OLEN] = optlen; 2932 buf[IPOPT_OFFSET] = optlen; 2933 2934 off = optlen - IP_ADDR_LEN; 2935 if (off < 0) { 2936 /* No entries in source route */ 2937 break; 2938 } 2939 /* Last entry in source route */ 2940 bcopy(opt + off, buf1, IP_ADDR_LEN); 2941 off -= IP_ADDR_LEN; 2942 2943 while (off > 0) { 2944 bcopy(opt + off, 2945 buf + off + IP_ADDR_LEN, 2946 IP_ADDR_LEN); 2947 off -= IP_ADDR_LEN; 2948 } 2949 /* ipha_dst into first slot */ 2950 bcopy(&ipha->ipha_dst, 2951 buf + off + IP_ADDR_LEN, 2952 IP_ADDR_LEN); 2953 buf += optlen; 2954 len += optlen; 2955 break; 2956 2957 case IPOPT_COMSEC: 2958 case IPOPT_SECURITY: 2959 /* if passing up a label is not ok, then remove */ 2960 if (is_system_labeled()) 2961 break; 2962 /* FALLTHROUGH */ 2963 default: 2964 bcopy(opt, buf, optlen); 2965 buf += optlen; 2966 len += optlen; 2967 break; 2968 } 2969 } 2970 done: 2971 /* Pad the resulting options */ 2972 while (len & 0x3) { 2973 *buf++ = IPOPT_EOL; 2974 len++; 2975 } 2976 return (len); 2977 } 2978 2979 /* 2980 * Update any record route or timestamp options to include this host. 2981 * Reverse any source route option. 
2982 * This routine assumes that the options are well formed i.e. that they 2983 * have already been checked. 2984 */ 2985 static void 2986 icmp_options_update(ipha_t *ipha) 2987 { 2988 ipoptp_t opts; 2989 uchar_t *opt; 2990 uint8_t optval; 2991 ipaddr_t src; /* Our local address */ 2992 ipaddr_t dst; 2993 2994 ip2dbg(("icmp_options_update\n")); 2995 src = ipha->ipha_src; 2996 dst = ipha->ipha_dst; 2997 2998 for (optval = ipoptp_first(&opts, ipha); 2999 optval != IPOPT_EOL; 3000 optval = ipoptp_next(&opts)) { 3001 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 3002 opt = opts.ipoptp_cur; 3003 ip2dbg(("icmp_options_update: opt %d, len %d\n", 3004 optval, opts.ipoptp_len)); 3005 switch (optval) { 3006 int off1, off2; 3007 case IPOPT_SSRR: 3008 case IPOPT_LSRR: 3009 /* 3010 * Reverse the source route. The first entry 3011 * should be the next to last one in the current 3012 * source route (the last entry is our address). 3013 * The last entry should be the final destination. 3014 */ 3015 off1 = IPOPT_MINOFF_SR - 1; 3016 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 3017 if (off2 < 0) { 3018 /* No entries in source route */ 3019 ip1dbg(( 3020 "icmp_options_update: bad src route\n")); 3021 break; 3022 } 3023 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN); 3024 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN); 3025 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN); 3026 off2 -= IP_ADDR_LEN; 3027 3028 while (off1 < off2) { 3029 bcopy((char *)opt + off1, &src, IP_ADDR_LEN); 3030 bcopy((char *)opt + off2, (char *)opt + off1, 3031 IP_ADDR_LEN); 3032 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN); 3033 off1 += IP_ADDR_LEN; 3034 off2 -= IP_ADDR_LEN; 3035 } 3036 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 3037 break; 3038 } 3039 } 3040 } 3041 3042 /* 3043 * Process received ICMP Redirect messages. 3044 */ 3045 static void 3046 icmp_redirect(ill_t *ill, mblk_t *mp) 3047 { 3048 ipha_t *ipha; 3049 int iph_hdr_length; 3050 icmph_t *icmph; 3051 ipha_t *ipha_err; 3052 ire_t *ire; 3053 ire_t *prev_ire; 3054 ire_t *save_ire; 3055 ipaddr_t src, dst, gateway; 3056 iulp_t ulp_info = { 0 }; 3057 int error; 3058 ip_stack_t *ipst; 3059 3060 ASSERT(ill != NULL); 3061 ipst = ill->ill_ipst; 3062 3063 ipha = (ipha_t *)mp->b_rptr; 3064 iph_hdr_length = IPH_HDR_LENGTH(ipha); 3065 if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < 3066 sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { 3067 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3068 freemsg(mp); 3069 return; 3070 } 3071 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 3072 ipha_err = (ipha_t *)&icmph[1]; 3073 src = ipha->ipha_src; 3074 dst = ipha_err->ipha_dst; 3075 gateway = icmph->icmph_rd_gateway; 3076 /* Make sure the new gateway is reachable somehow. */ 3077 ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, 3078 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3079 /* 3080 * Make sure we had a route for the dest in question and that 3081 * that route was pointing to the old gateway (the source of the 3082 * redirect packet.) 
 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW, ipst);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway? From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition. If we do not do that, then new
	 * connections going through the new gateway will have no route
	 * metrics, which is counter-intuitive to the user. From a network
	 * point of view, this may or may not make sense even though the
	 * new gateway is still directly connected to us, so the route
	 * metrics should not change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway. There may
	 * be a route to that. If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
		    ipst);
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		freemsg(mp);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association, so that we remember the gateway
	 * that a router we believed told us to use.
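 *
 * The effect resembles what the (illustrative) command
 *	route add -host <dst> <new-gateway>
 * would install, except that the IRE is flagged RTF_DYNAMIC so it can
 * later be aged out or replaced by a subsequent redirect.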
 */
	save_ire = ire;
	ire = ire_create(
	    (uchar_t *)&dst,			/* dest addr */
	    (uchar_t *)&ip_g_all_ones,		/* mask */
	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
	    (uchar_t *)&gateway,		/* gateway addr */
	    &save_ire->ire_max_frag,		/* max frag */
	    NULL,				/* no src nce */
	    NULL,				/* no rfq */
	    NULL,				/* no stq */
	    IRE_HOST,
	    NULL,				/* ipif */
	    0,					/* cmask */
	    0,					/* phandle */
	    0,					/* ihandle */
	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
	    &ulp_info,
	    NULL,				/* tsol_gc_t */
	    NULL,				/* gcgrp */
	    ipst);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
	ire_refrele(save_ire);
	atomic_inc_32(&ipst->ips_ip_redirect_cnt);

	if (error == 0) {
		ire_refrele(ire);		/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
	}

	/*
	 * Delete any existing IRE_HOST type redirect ires for this
	 * destination. This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
	if (prev_ire != NULL) {
		if (prev_ire->ire_flags & RTF_DYNAMIC)
			ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}

/*
 * Generate an ICMP parameter problem message.
 */
static void
icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_PARAM_PROBLEM;
	icmph.icmph_pp_ptr = ptr;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * Build and ship an IPv4 ICMP message using the packet data in mp, and
 * the ICMP header pointed to by "stuff". (May be called as writer.)
 * Note: assumes that icmp_pkt_err_ok has been called to verify that
 * an icmp error packet can be sent.
 * Assigns an appropriate source address to the packet. If ipha_dst is
 * one of our addresses, use it for the source. Otherwise pick a source
 * based on a route lookup back to ipha_src.
 * Note that ipha_src must be set here since the
 * packet is likely to arrive on an ill queue in ip_wput() which will
 * not set a source address.
 */
static void
icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
    boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
{
	ipaddr_t dst;
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*mp1;
	ipaddr_t src;
	ire_t	*ire;
	mblk_t	*ipsec_mp;
	ipsec_out_t	*io = NULL;

	if (mctl_present) {
		/*
		 * If it is:
		 *
		 * 1) an IPSEC_OUT, then this was caused by an outbound
		 * datagram originating on this host. IPsec processing
		 * may or may not have been done.
		 * Refer to comments above icmp_inbound_error_fanout for
		 * details.
		 *
		 * 2) an IPSEC_IN, if we are generating an ICMP message
		 * for an incoming datagram destined for us, i.e., called
		 * from ip_fanout_send_icmp.
		 */
		ipsec_info_t *in;
		ipsec_mp = mp;
		mp = ipsec_mp->b_cont;

		in = (ipsec_info_t *)ipsec_mp->b_rptr;
		ipha = (ipha_t *)mp->b_rptr;

		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
		    in->ipsec_info_type == IPSEC_IN);

		if (in->ipsec_info_type == IPSEC_IN) {
			/*
			 * Convert the IPSEC_IN to IPSEC_OUT.
			 */
			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
				return;
			}
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			ASSERT(in->ipsec_info_type == IPSEC_OUT);
			io = (ipsec_out_t *)in;
			/*
			 * Clear out ipsec_out_proc_begin, so we do a fresh
			 * ire lookup.
			 */
			io->ipsec_out_proc_begin = B_FALSE;
		}
		ASSERT(zoneid == io->ipsec_out_zoneid);
		ASSERT(zoneid != ALL_ZONES);
	} else {
		/*
		 * This is in clear. The icmp message we are building
		 * here should go out in clear.
		 *
		 * Pardon the convolution of it all, but it's easier to
		 * allocate a "use cleartext" IPSEC_IN message and convert
		 * it than it is to allocate a new one.
		 */
		ipsec_in_t *ii;
		ASSERT(DB_TYPE(mp) == M_DATA);
		ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (ipsec_mp == NULL) {
			freemsg(mp);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		ii = (ipsec_in_t *)ipsec_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		/*
		 * For trusted extensions using a shared IP address we can
		 * send using any zoneid.
		 */
		if (zoneid == ALL_ZONES)
			ii->ipsec_in_zoneid = GLOBAL_ZONEID;
		else
			ii->ipsec_in_zoneid = zoneid;
		ipsec_mp->b_cont = mp;
		ipha = (ipha_t *)mp->b_rptr;
		/*
		 * Convert the IPSEC_IN to IPSEC_OUT.
		 */
		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
	}

	/* Remember our eventual destination */
	dst = ipha->ipha_src;

	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
	if (ire != NULL &&
	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
		src = ipha->ipha_dst;
	} else {
		if (ire != NULL)
			ire_refrele(ire);
		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
		    ipst);
		if (ire == NULL) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ire->ire_src_addr;
	}

	if (ire != NULL)
		ire_refrele(ire);

	/*
	 * Check if we can send back more than 8 bytes in addition to
	 * the IP header. We try to send 64 bytes of data and the internal
	 * header in the special cases of IPv4-encapsulated IPv4 or IPv6.
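	 *
	 * Worked example with illustrative values: for a datagram with a
	 * plain 20-byte header and ips_ip_icmp_return at its customary 64,
	 * len_needed becomes 20 + 64 = 84; for IPv4-in-IPv4 the inner
	 * 20-byte header is counted too, so 20 + 20 + 64 = 104 bytes of
	 * the offending packet are kept before adjmsg() trims the rest.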
	 */
	len_needed = IPH_HDR_LENGTH(ipha);
	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
	    ipha->ipha_protocol == IPPROTO_IPV6) {

		if (!pullupmsg(mp, -1)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			freemsg(ipsec_mp);
			return;
		}
		ipha = (ipha_t *)mp->b_rptr;

		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
			    len_needed));
		} else {
			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);

			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
			len_needed += ip_hdr_length_v6(mp, ip6h);
		}
	}
	len_needed += ipst->ips_ip_icmp_return;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}
	mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
	if (mp1 == NULL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
		freemsg(ipsec_mp);
		return;
	}
	mp1->b_cont = mp;
	mp = mp1;
	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
	    ipsec_mp->b_rptr == (uint8_t *)io &&
	    io->ipsec_out_type == IPSEC_OUT);
	ipsec_mp->b_cont = mp;

	/*
	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for ipsec_out_icmp_loopback).
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	*ipha = icmp_ipha;
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}

/*
 * Determine if an ICMP error packet can be sent given the rate limit.
 * The limit consists of an average frequency (icmp_pkt_err_interval measured
 * in milliseconds) and a burst size. Burst size number of packets can
 * be sent arbitrarily closely spaced.
 * The state is tracked using two variables to implement an approximate
 * token bucket filter:
 *	icmp_pkt_err_last - lbolt value when the last burst started
 *	icmp_pkt_err_sent - number of packets sent in current burst
 */
boolean_t
icmp_err_rate_limit(ip_stack_t *ipst)
{
	clock_t now = TICK_TO_MSEC(lbolt);
	uint_t refilled; /* Number of packets refilled in tbf since last */
	/* Guard against changes by loading into local variable */
	uint_t err_interval = ipst->ips_ip_icmp_err_interval;

	if (err_interval == 0)
		return (B_FALSE);

	if (ipst->ips_icmp_pkt_err_last > now) {
		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
		ipst->ips_icmp_pkt_err_last = 0;
		ipst->ips_icmp_pkt_err_sent = 0;
	}
	/*
	 * If we are in a burst, update the token bucket filter.
	 * Update the "last" time to be close to "now" but make sure
	 * we don't lose precision.
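	 *
	 * Worked example with the illustrative defaults of a 100 ms
	 * err_interval and a burst of 10: after a quiet period the first
	 * 10 errors go out back to back. If 250 ms then pass, refilled is
	 * 250 / 100 = 2, so err_sent drops by 2 while err_last advances by
	 * exactly 2 * 100 ms; the leftover 50 ms is not lost, it simply
	 * counts toward the next refill.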
3489 */ 3490 if (ipst->ips_icmp_pkt_err_sent != 0) { 3491 refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval; 3492 if (refilled > ipst->ips_icmp_pkt_err_sent) { 3493 ipst->ips_icmp_pkt_err_sent = 0; 3494 } else { 3495 ipst->ips_icmp_pkt_err_sent -= refilled; 3496 ipst->ips_icmp_pkt_err_last += refilled * err_interval; 3497 } 3498 } 3499 if (ipst->ips_icmp_pkt_err_sent == 0) { 3500 /* Start of new burst */ 3501 ipst->ips_icmp_pkt_err_last = now; 3502 } 3503 if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) { 3504 ipst->ips_icmp_pkt_err_sent++; 3505 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n", 3506 ipst->ips_icmp_pkt_err_sent)); 3507 return (B_FALSE); 3508 } 3509 ip1dbg(("icmp_err_rate_limit: dropped\n")); 3510 return (B_TRUE); 3511 } 3512 3513 /* 3514 * Check if it is ok to send an IPv4 ICMP error packet in 3515 * response to the IPv4 packet in mp. 3516 * Free the message and return null if no 3517 * ICMP error packet should be sent. 3518 */ 3519 static mblk_t * 3520 icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) 3521 { 3522 icmph_t *icmph; 3523 ipha_t *ipha; 3524 uint_t len_needed; 3525 ire_t *src_ire; 3526 ire_t *dst_ire; 3527 3528 if (!mp) 3529 return (NULL); 3530 ipha = (ipha_t *)mp->b_rptr; 3531 if (ip_csum_hdr(ipha)) { 3532 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs); 3533 freemsg(mp); 3534 return (NULL); 3535 } 3536 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, 3537 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3538 dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, 3539 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3540 if (src_ire != NULL || dst_ire != NULL || 3541 CLASSD(ipha->ipha_dst) || 3542 CLASSD(ipha->ipha_src) || 3543 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { 3544 /* Note: only errors to the fragment with offset 0 */ 3545 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3546 freemsg(mp); 3547 if (src_ire != NULL) 3548 ire_refrele(src_ire); 3549 if (dst_ire != NULL) 3550 ire_refrele(dst_ire); 3551 return (NULL); 3552 } 3553 if (ipha->ipha_protocol == IPPROTO_ICMP) { 3554 /* 3555 * Check the ICMP type. RFC 1122 sez: don't send ICMP 3556 * errors in response to any ICMP errors. 3557 */ 3558 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE; 3559 if (mp->b_wptr - mp->b_rptr < len_needed) { 3560 if (!pullupmsg(mp, len_needed)) { 3561 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3562 freemsg(mp); 3563 return (NULL); 3564 } 3565 ipha = (ipha_t *)mp->b_rptr; 3566 } 3567 icmph = (icmph_t *) 3568 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]); 3569 switch (icmph->icmph_type) { 3570 case ICMP_DEST_UNREACHABLE: 3571 case ICMP_SOURCE_QUENCH: 3572 case ICMP_TIME_EXCEEDED: 3573 case ICMP_PARAM_PROBLEM: 3574 case ICMP_REDIRECT: 3575 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3576 freemsg(mp); 3577 return (NULL); 3578 default: 3579 break; 3580 } 3581 } 3582 /* 3583 * If this is a labeled system, then check to see if we're allowed to 3584 * send a response to this particular sender. If not, then just drop. 3585 */ 3586 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 3587 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); 3588 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3589 freemsg(mp); 3590 return (NULL); 3591 } 3592 if (icmp_err_rate_limit(ipst)) { 3593 /* 3594 * Only send ICMP error packets every so often. 3595 * This should be done on a per port/source basis, 3596 * but for now this will suffice. 
		 */
		freemsg(mp);
		return (NULL);
	}
	return (mp);
}

/*
 * Generate an ICMP redirect message.
 */
static void
icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst)
{
	icmph_t	icmph;

	/*
	 * We are called from ip_rput where we could
	 * not have attached an IPSEC_IN.
	 */
	ASSERT(mp->b_datap->db_type == M_DATA);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_REDIRECT;
	icmph.icmph_code = 1;
	icmph.icmph_rd_gateway = gateway;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
	/* Redirects sent by router, and router is global zone */
	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst);
}

/*
 * Generate an ICMP time exceeded message.
 */
void
icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_TIME_EXCEEDED;
	icmph.icmph_code = code;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * Generate an ICMP unreachable message.
 */
void
icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t	icmph;
	mblk_t *first_mp;
	boolean_t mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	icmph.icmph_code = code;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
	ip2dbg(("send icmp destination unreachable code %d\n", code));
	icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present,
	    zoneid, ipst);
}

/*
 * Attempt to start recovery of an IPv4 interface that's been shut down as a
 * duplicate. As long as someone else holds the address, the interface will
 * stay down. When that conflict goes away, the interface is brought back up.
 * This is done so that accidental shutdowns of addresses aren't made
 * permanent; the interface recovers on its own once the conflict is gone.
 *
 * For DHCP, recovery is not done in the kernel. Instead, it's handled by a
 * user space process (dhcpagent).
 *
 * Recovery completes if ARP reports that the address is now ours (via
 * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation.
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
static void
ipif_dup_recovery(void *arg)
{
	ipif_t *ipif = arg;
	ill_t *ill = ipif->ipif_ill;
	mblk_t *arp_add_mp;
	mblk_t *arp_del_mp;
	area_t *area;
	ip_stack_t *ipst = ill->ill_ipst;

	ipif->ipif_recovery_id = 0;

	/*
	 * No lock needed for moving or condemned check, as this is just an
	 * optimization.
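	 * A stale answer here merely wastes (or skips) one retry; the
	 * paths that matter take ill_lock, as the allocation-failure
	 * case below does before rearming the timer.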
3716 */ 3717 if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || 3718 (ipif->ipif_flags & IPIF_POINTOPOINT) || 3719 (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { 3720 /* No reason to try to bring this address back. */ 3721 return; 3722 } 3723 3724 if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) 3725 goto alloc_fail; 3726 3727 if (ipif->ipif_arp_del_mp == NULL) { 3728 if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) 3729 goto alloc_fail; 3730 ipif->ipif_arp_del_mp = arp_del_mp; 3731 } 3732 3733 /* Setting the 'unverified' flag restarts DAD */ 3734 area = (area_t *)arp_add_mp->b_rptr; 3735 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 3736 ACE_F_UNVERIFIED; 3737 putnext(ill->ill_rq, arp_add_mp); 3738 return; 3739 3740 alloc_fail: 3741 /* 3742 * On allocation failure, just restart the timer. Note that the ipif 3743 * is down here, so no other thread could be trying to start a recovery 3744 * timer. The ill_lock protects the condemned flag and the recovery 3745 * timer ID. 3746 */ 3747 freemsg(arp_add_mp); 3748 mutex_enter(&ill->ill_lock); 3749 if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && 3750 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 3751 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, 3752 MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3753 } 3754 mutex_exit(&ill->ill_lock); 3755 } 3756 3757 /* 3758 * This is for exclusive changes due to ARP. Either tear down an interface due 3759 * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 3760 */ 3761 /* ARGSUSED */ 3762 static void 3763 ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 3764 { 3765 ill_t *ill = rq->q_ptr; 3766 arh_t *arh; 3767 ipaddr_t src; 3768 ipif_t *ipif; 3769 char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ 3770 char hbuf[MAC_STR_LEN]; 3771 char sbuf[INET_ADDRSTRLEN]; 3772 const char *failtype; 3773 boolean_t bring_up; 3774 ip_stack_t *ipst = ill->ill_ipst; 3775 3776 switch (((arcn_t *)mp->b_rptr)->arcn_code) { 3777 case AR_CN_READY: 3778 failtype = NULL; 3779 bring_up = B_TRUE; 3780 break; 3781 case AR_CN_FAILED: 3782 failtype = "in use"; 3783 bring_up = B_FALSE; 3784 break; 3785 default: 3786 failtype = "claimed"; 3787 bring_up = B_FALSE; 3788 break; 3789 } 3790 3791 arh = (arh_t *)mp->b_cont->b_rptr; 3792 bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); 3793 3794 /* Handle failures due to probes */ 3795 if (src == 0) { 3796 bcopy((char *)&arh[1] + 2 * arh->arh_hlen + IP_ADDR_LEN, &src, 3797 IP_ADDR_LEN); 3798 } 3799 3800 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, 3801 sizeof (hbuf)); 3802 (void) ip_dot_addr(src, sbuf); 3803 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3804 3805 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || 3806 ipif->ipif_lcl_addr != src) { 3807 continue; 3808 } 3809 3810 /* 3811 * If we failed on a recovery probe, then restart the timer to 3812 * try again later. 3813 */ 3814 if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && 3815 !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 3816 ill->ill_net_type == IRE_IF_RESOLVER && 3817 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 3818 ipst->ips_ip_dup_recovery > 0 && 3819 ipif->ipif_recovery_id == 0) { 3820 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 3821 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3822 continue; 3823 } 3824 3825 /* 3826 * If what we're trying to do has already been done, then do 3827 * nothing. 
		 */
		if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0))
			continue;

		ipif_get_name(ipif, ibuf, sizeof (ibuf));

		if (failtype == NULL) {
			cmn_err(CE_NOTE, "recovered address %s on %s", sbuf,
			    ibuf);
		} else {
			cmn_err(CE_WARN, "%s has duplicate address %s (%s "
			    "by %s); disabled", ibuf, sbuf, failtype, hbuf);
		}

		if (bring_up) {
			ASSERT(ill->ill_dl_up);
			/*
			 * Free up the ARP delete message so we can allocate
			 * a fresh one through the normal path.
			 */
			freemsg(ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
			if (ipif_resolver_up(ipif, Res_act_initial) !=
			    EINPROGRESS) {
				ipif->ipif_addr_ready = 1;
				(void) ipif_up_done(ipif);
			}
			continue;
		}

		mutex_enter(&ill->ill_lock);
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_flags |= IPIF_DUPLICATE;
		ill->ill_ipif_dup_count++;
		mutex_exit(&ill->ill_lock);
		/*
		 * Already exclusive on the ill; no need to handle deferred
		 * processing here.
		 */
		(void) ipif_down(ipif, NULL, NULL);
		ipif_down_tail(ipif);
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
		    ill->ill_net_type == IRE_IF_RESOLVER &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
		    ipst->ips_ip_dup_recovery > 0) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	}
	freemsg(mp);
}

/* ARGSUSED */
static void
ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	arh_t *arh;
	ipaddr_t src;
	ipif_t	*ipif;

	arh = (arh_t *)mp->b_cont->b_rptr;
	bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src)
			(void) ipif_resolver_up(ipif, Res_act_defend);
	}
	freemsg(mp);
}

/*
 * News from ARP. ARP sends notification of interesting events down
 * to its clients using M_CTL messages with the interesting ARP packet
 * attached via b_cont.
 * The interesting event from a device comes up the corresponding ARP-IP-DEV
 * queue, as opposed to ARP sending the message to all of its clients, i.e.
 * all its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, if a cache IRE is
 * found, we must walk the cache table and delete all the entries for the
 * address in the packet.
 */
static void
ip_arp_news(queue_t *q, mblk_t *mp)
{
	arcn_t *arcn;
	arh_t *arh;
	ire_t	*ire = NULL;
	char	hbuf[MAC_STR_LEN];
	char	sbuf[INET_ADDRSTRLEN];
	ipaddr_t src;
	in6_addr_t v6src;
	boolean_t isv6 = B_FALSE;
	ipif_t *ipif;
	ill_t *ill;
	ip_stack_t *ipst;

	if (CONN_Q(q)) {
		conn_t *connp = Q_TO_CONN(q);

		ipst = connp->conn_netstack->netstack_ip;
	} else {
		ill_t *ill = (ill_t *)q->q_ptr;

		ipst = ill->ill_ipst;
	}

	if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) {
		if (q->q_next) {
			putnext(q, mp);
		} else
			freemsg(mp);
		return;
	}
	arh = (arh_t *)mp->b_cont->b_rptr;
	/* Is it one we are interested in?
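	 * We dispatch on the 16-bit protocol type from the ARP header:
	 * IP_ARP_PROTO_TYPE for IPv4 and IP6_DL_SAP for IPv6 (these
	 * customarily carry the Ethernet type codes 0x0800 and 0x86dd
	 * respectively); anything else is not ours and is freed.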
*/ 3944 if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) { 3945 isv6 = B_TRUE; 3946 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, 3947 IPV6_ADDR_LEN); 3948 } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { 3949 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, 3950 IP_ADDR_LEN); 3951 } else { 3952 freemsg(mp); 3953 return; 3954 } 3955 3956 ill = q->q_ptr; 3957 3958 arcn = (arcn_t *)mp->b_rptr; 3959 switch (arcn->arcn_code) { 3960 case AR_CN_BOGON: 3961 /* 3962 * Someone is sending ARP packets with a source protocol 3963 * address that we have published and for which we believe our 3964 * entry is authoritative and (when ill_arp_extend is set) 3965 * verified to be unique on the network. 3966 * 3967 * The ARP module internally handles the cases where the sender 3968 * is just probing (for DAD) and where the hardware address of 3969 * a non-authoritative entry has changed. Thus, these are the 3970 * real conflicts, and we have to do resolution. 3971 * 3972 * We back away quickly from the address if it's from DHCP or 3973 * otherwise temporary and hasn't been used recently (or at 3974 * all). We'd like to include "deprecated" addresses here as 3975 * well (as there's no real reason to defend something we're 3976 * discarding), but IPMP "reuses" this flag to mean something 3977 * other than the standard meaning. 3978 * 3979 * If the ARP module above is not extended (meaning that it 3980 * doesn't know how to defend the address), then we just log 3981 * the problem as we always did and continue on. It's not 3982 * right, but there's little else we can do, and those old ATM 3983 * users are going away anyway. 3984 */ 3985 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, 3986 hbuf, sizeof (hbuf)); 3987 (void) ip_dot_addr(src, sbuf); 3988 if (isv6) { 3989 ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL, 3990 ipst); 3991 } else { 3992 ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst); 3993 } 3994 if (ire != NULL && IRE_IS_LOCAL(ire)) { 3995 uint32_t now; 3996 uint32_t maxage; 3997 clock_t lused; 3998 uint_t maxdefense; 3999 uint_t defs; 4000 4001 /* 4002 * First, figure out if this address hasn't been used 4003 * in a while. If it hasn't, then it's a better 4004 * candidate for abandoning. 4005 */ 4006 ipif = ire->ire_ipif; 4007 ASSERT(ipif != NULL); 4008 now = gethrestime_sec(); 4009 maxage = now - ire->ire_create_time; 4010 if (maxage > ipst->ips_ip_max_temp_idle) 4011 maxage = ipst->ips_ip_max_temp_idle; 4012 lused = drv_hztousec(ddi_get_lbolt() - 4013 ire->ire_last_used_time) / MICROSEC + 1; 4014 if (lused >= maxage && (ipif->ipif_flags & 4015 (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) 4016 maxdefense = ipst->ips_ip_max_temp_defend; 4017 else 4018 maxdefense = ipst->ips_ip_max_defend; 4019 4020 /* 4021 * Now figure out how many times we've defended 4022 * ourselves. Ignore defenses that happened long in 4023 * the past. 4024 */ 4025 mutex_enter(&ire->ire_lock); 4026 if ((defs = ire->ire_defense_count) > 0 && 4027 now - ire->ire_defense_time > 4028 ipst->ips_ip_defend_interval) { 4029 ire->ire_defense_count = defs = 0; 4030 } 4031 ire->ire_defense_count++; 4032 ire->ire_defense_time = now; 4033 mutex_exit(&ire->ire_lock); 4034 ill_refhold(ill); 4035 ire_refrele(ire); 4036 4037 /* 4038 * If we've defended ourselves too many times already, 4039 * then give up and tear down the interface(s) using 4040 * this address. Otherwise, defend by sending out a 4041 * gratuitous ARP. 
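			 * As an example, with the customary defaults of
			 * three defenses (ip_max_defend) per 300-second
			 * window (ip_defend_interval), an actively used
			 * address is defended at most three times in five
			 * minutes before we give up and take it down.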
			 */
			if (defs >= maxdefense && ill->ill_arp_extend) {
				qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
				    B_FALSE);
			} else {
				cmn_err(CE_WARN,
				    "node %s is using our IP address %s on %s",
				    hbuf, sbuf, ill->ill_name);
				/*
				 * If this is an old (ATM) ARP module, then
				 * don't try to defend the address. Remain
				 * compatible with the old behavior. Defend
				 * only with new ARP.
				 */
				if (ill->ill_arp_extend) {
					qwriter_ip(ill, q, mp, ip_arp_defend,
					    NEW_OP, B_FALSE);
				} else {
					ill_refrele(ill);
				}
			}
			return;
		}
		cmn_err(CE_WARN,
		    "proxy ARP problem? Node '%s' is using %s on %s",
		    hbuf, sbuf, ill->ill_name);
		if (ire != NULL)
			ire_refrele(ire);
		break;
	case AR_CN_ANNOUNCE:
		if (isv6) {
			/*
			 * For XRESOLV interfaces.
			 * Delete the IRE cache entry and NCE for this
			 * v6 address
			 */
			ip_ire_clookup_and_delete_v6(&v6src, ipst);
			/*
			 * If v6src is non-zero, it's a router address,
			 * as below. Do the same sort of thing to clean
			 * out off-net IRE_CACHE entries that go through
			 * the router.
			 */
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
				ire_walk_v6(ire_delete_cache_gw_v6,
				    (char *)&v6src, ALL_ZONES, ipst);
			}
		} else {
			nce_hw_map_t hwm;

			/*
			 * ARP gives us a copy of any packet where it thinks
			 * the address has changed, so that we can update our
			 * caches. We're responsible for caching known answers
			 * in the current design. We check whether the
			 * hardware address really has changed in all of our
			 * entries that have cached this mapping, and if so, we
			 * blow them away. This way we will immediately pick
			 * up the rare case of a host changing hardware
			 * address.
			 */
			if (src == 0)
				break;
			hwm.hwm_addr = src;
			hwm.hwm_hwlen = arh->arh_hlen;
			hwm.hwm_hwaddr = (uchar_t *)(arh + 1);
			NDP_HW_CHANGE_INCR(ipst->ips_ndp4);
			ndp_walk_common(ipst->ips_ndp4, NULL,
			    (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES);
			NDP_HW_CHANGE_DECR(ipst->ips_ndp4);
		}
		break;
	case AR_CN_READY:
		/* No external v6 resolver has a contract to use this */
		if (isv6)
			break;
		/* If the link is down, we'll retry this later */
		if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING))
			break;
		ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL,
		    NULL, NULL, ipst);
		if (ipif != NULL) {
			/*
			 * If this is a duplicate recovery, then we now need to
			 * go exclusive to bring this thing back up.
			 */
			if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) ==
			    IPIF_DUPLICATE) {
				ipif_refrele(ipif);
				ill_refhold(ill);
				qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
				    B_FALSE);
				return;
			}
			/*
			 * If this is the first notice that this address is
			 * ready, then let the user know now.
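			 * The routing-socket traffic generated below is
			 * how interested daemons (dhcpagent, routing
			 * daemons and the like) learn that duplicate
			 * address detection has completed and the address
			 * is usable.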
			 */
			if ((ipif->ipif_flags & IPIF_UP) &&
			    !ipif->ipif_addr_ready) {
				ipif_mask_reply(ipif);
				ip_rts_ifmsg(ipif);
				ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
				sctp_update_ipif(ipif, SCTP_IPIF_UP);
			}
			ipif->ipif_addr_ready = 1;
			ipif_refrele(ipif);
		}
		ire = ire_cache_lookup(src, ALL_ZONES, MBLK_GETLABEL(mp), ipst);
		if (ire != NULL) {
			ire->ire_defense_count = 0;
			ire_refrele(ire);
		}
		break;
	case AR_CN_FAILED:
		/* No external v6 resolver has a contract to use this */
		if (isv6)
			break;
		ill_refhold(ill);
		qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE);
		return;
	}
	freemsg(mp);
}

/*
 * Create an mblk suitable for carrying the interface index and/or source link
 * address. This mblk is tagged as an M_CTL and is sent to the ULP. This is
 * used when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user
 * application.
 */
mblk_t *
ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	mblk_t		*mp;
	ip_pktinfo_t	*pinfo;
	ipha_t		*ipha;
	struct ether_header *pether;

	mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED);
	if (mp == NULL) {
		ip1dbg(("ip_add_info: allocation failure.\n"));
		return (data_mp);
	}

	ipha = (ipha_t *)data_mp->b_rptr;
	pinfo = (ip_pktinfo_t *)mp->b_rptr;
	bzero(pinfo, sizeof (ip_pktinfo_t));
	pinfo->ip_pkt_flags = (uchar_t)flags;
	pinfo->ip_pkt_ulp_type = IN_PKTINFO;	/* Tell ULP what type of info */

	if (flags & (IPF_RECVIF | IPF_RECVADDR))
		pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex;
	if (flags & IPF_RECVADDR) {
		ipif_t	*ipif;
		ire_t	*ire;

		/*
		 * Only valid for V4
		 */
		ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) ==
		    (IPV4_VERSION << 4));

		ipif = ipif_get_next_ipif(NULL, ill);
		if (ipif != NULL) {
			/*
			 * Since a decision has already been made to deliver
			 * the packet, there is no need to test for SECATTR
			 * and ZONEONLY.
			 * When a multicast packet is transmitted
			 * a cache entry is created for the multicast address.
			 * When delivering a copy of the packet or when new
			 * packets are received we do not want to match on the
			 * cached entry so explicitly match on
			 * IRE_LOCAL and IRE_LOOPBACK
			 */
			ire = ire_ctable_lookup(ipha->ipha_dst, 0,
			    IRE_LOCAL | IRE_LOOPBACK,
			    ipif, zoneid, NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst);
			if (ire == NULL) {
				/*
				 * The packet must have come in on a
				 * different interface.
				 * Since a decision has already been made to
				 * deliver the packet, there is no need to test
				 * for SECATTR and ZONEONLY.
				 * Only match on local and broadcast ire's.
				 * See detailed comment above.
				 */
				ire = ire_ctable_lookup(ipha->ipha_dst, 0,
				    IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid,
				    NULL, MATCH_IRE_TYPE, ipst);
			}

			if (ire == NULL) {
				/*
				 * This is either a multicast packet or
				 * the address has been removed since
				 * the packet was received.
				 * Return INADDR_ANY so that normal source
				 * selection occurs for the response.
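				 *
				 * (For reference, an application asks for
				 * this ancillary data with an ordinary
				 * setsockopt, e.g.
				 *
				 *	int on = 1;
				 *	setsockopt(fd, IPPROTO_IP,
				 *	    IP_RECVIF, &on, sizeof (on));
				 *
				 * after which each recvmsg() carries the
				 * inbound interface index, and IP_RECVSLLA
				 * likewise yields the source link-layer
				 * address filled in further down.)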
4245 */ 4246 4247 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4248 } else { 4249 pinfo->ip_pkt_match_addr.s_addr = 4250 ire->ire_src_addr; 4251 ire_refrele(ire); 4252 } 4253 ipif_refrele(ipif); 4254 } else { 4255 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4256 } 4257 } 4258 4259 pether = (struct ether_header *)((char *)ipha 4260 - sizeof (struct ether_header)); 4261 /* 4262 * Make sure the interface is an ethernet type, since this option 4263 * is currently supported only on this type of interface. Also make 4264 * sure we are pointing correctly above db_base. 4265 */ 4266 4267 if ((flags & IPF_RECVSLLA) && 4268 ((uchar_t *)pether >= data_mp->b_datap->db_base) && 4269 (ill->ill_type == IFT_ETHER) && 4270 (ill->ill_net_type == IRE_IF_RESOLVER)) { 4271 4272 pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; 4273 bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, 4274 (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); 4275 } else { 4276 /* 4277 * Clear the bit. Indicate to upper layer that IP is not 4278 * sending this ancillary info. 4279 */ 4280 pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; 4281 } 4282 4283 mp->b_datap->db_type = M_CTL; 4284 mp->b_wptr += sizeof (ip_pktinfo_t); 4285 mp->b_cont = data_mp; 4286 4287 return (mp); 4288 } 4289 4290 /* 4291 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as 4292 * part of the bind request. 4293 */ 4294 4295 boolean_t 4296 ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) 4297 { 4298 ipsec_in_t *ii; 4299 4300 ASSERT(policy_mp != NULL); 4301 ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); 4302 4303 ii = (ipsec_in_t *)policy_mp->b_rptr; 4304 ASSERT(ii->ipsec_in_type == IPSEC_IN); 4305 4306 connp->conn_policy = ii->ipsec_in_policy; 4307 ii->ipsec_in_policy = NULL; 4308 4309 if (ii->ipsec_in_action != NULL) { 4310 if (connp->conn_latch == NULL) { 4311 connp->conn_latch = iplatch_create(); 4312 if (connp->conn_latch == NULL) 4313 return (B_FALSE); 4314 } 4315 ipsec_latch_inbound(connp->conn_latch, ii); 4316 } 4317 return (B_TRUE); 4318 } 4319 4320 /* 4321 * Upper level protocols (ULP) pass through bind requests to IP for inspection 4322 * and to arrange for power-fanout assist. The ULP is identified by 4323 * adding a single byte at the end of the original bind message. 4324 * A ULP other than UDP or TCP that wishes to be recognized passes 4325 * down a bind with a zero length address. 4326 * 4327 * The binding works as follows: 4328 * - A zero byte address means just bind to the protocol. 4329 * - A four byte address is treated as a request to validate 4330 * that the address is a valid local address, appropriate for 4331 * an application to bind to. This does not affect any fanout 4332 * information in IP. 4333 * - A sizeof sin_t byte address is used to bind to only the local address 4334 * and port. 4335 * - A sizeof ipa_conn_t byte address contains complete fanout information 4336 * consisting of local and remote addresses and ports. In 4337 * this case, the addresses are both validated as appropriate 4338 * for this operation, and, if so, the information is retained 4339 * for use in the inbound fanout. 4340 * 4341 * The ULP (except in the zero-length bind) can append an 4342 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the 4343 * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants 4344 * a copy of the source or destination IRE (source for local bind; 4345 * destination for complete bind). 
IPSEC_POLICY_SET indicates that the 4346 * policy information contained should be copied on to the conn. 4347 * 4348 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. 4349 */ 4350 mblk_t * 4351 ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) 4352 { 4353 ssize_t len; 4354 struct T_bind_req *tbr; 4355 sin_t *sin; 4356 ipa_conn_t *ac; 4357 uchar_t *ucp; 4358 mblk_t *mp1; 4359 boolean_t ire_requested; 4360 boolean_t ipsec_policy_set = B_FALSE; 4361 int error = 0; 4362 int protocol; 4363 ipa_conn_x_t *acx; 4364 4365 ASSERT(!connp->conn_af_isv6); 4366 connp->conn_pkt_isv6 = B_FALSE; 4367 4368 len = MBLKL(mp); 4369 if (len < (sizeof (*tbr) + 1)) { 4370 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 4371 "ip_bind: bogus msg, len %ld", len); 4372 /* XXX: Need to return something better */ 4373 goto bad_addr; 4374 } 4375 /* Back up and extract the protocol identifier. */ 4376 mp->b_wptr--; 4377 protocol = *mp->b_wptr & 0xFF; 4378 tbr = (struct T_bind_req *)mp->b_rptr; 4379 /* Reset the message type in preparation for shipping it back. */ 4380 DB_TYPE(mp) = M_PCPROTO; 4381 4382 connp->conn_ulp = (uint8_t)protocol; 4383 4384 /* 4385 * Check for a zero length address. This is from a protocol that 4386 * wants to register to receive all packets of its type. 4387 */ 4388 if (tbr->ADDR_length == 0) { 4389 /* 4390 * These protocols are now intercepted in ip_bind_v6(). 4391 * Reject protocol-level binds here for now. 4392 * 4393 * For SCTP raw socket, ICMP sends down a bind with sin_t 4394 * so that the protocol type cannot be SCTP. 4395 */ 4396 if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || 4397 protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { 4398 goto bad_addr; 4399 } 4400 4401 /* 4402 * 4403 * The udp module never sends down a zero-length address, 4404 * and allowing this on a labeled system will break MLP 4405 * functionality. 4406 */ 4407 if (is_system_labeled() && protocol == IPPROTO_UDP) 4408 goto bad_addr; 4409 4410 if (connp->conn_mac_exempt) 4411 goto bad_addr; 4412 4413 /* No hash here really. The table is big enough. */ 4414 connp->conn_srcv6 = ipv6_all_zeros; 4415 4416 ipcl_proto_insert(connp, protocol); 4417 4418 tbr->PRIM_type = T_BIND_ACK; 4419 return (mp); 4420 } 4421 4422 /* Extract the address pointer from the message. */ 4423 ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, 4424 tbr->ADDR_length); 4425 if (ucp == NULL) { 4426 ip1dbg(("ip_bind: no address\n")); 4427 goto bad_addr; 4428 } 4429 if (!OK_32PTR(ucp)) { 4430 ip1dbg(("ip_bind: unaligned address\n")); 4431 goto bad_addr; 4432 } 4433 /* 4434 * Check for trailing mps. 4435 */ 4436 4437 mp1 = mp->b_cont; 4438 ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); 4439 ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); 4440 4441 switch (tbr->ADDR_length) { 4442 default: 4443 ip1dbg(("ip_bind: bad address length %d\n", 4444 (int)tbr->ADDR_length)); 4445 goto bad_addr; 4446 4447 case IP_ADDR_LEN: 4448 /* Verification of local address only */ 4449 error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0, 4450 ire_requested, ipsec_policy_set, B_FALSE); 4451 break; 4452 4453 case sizeof (sin_t): 4454 sin = (sin_t *)ucp; 4455 error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, 4456 sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); 4457 break; 4458 4459 case sizeof (ipa_conn_t): 4460 ac = (ipa_conn_t *)ucp; 4461 /* For raw socket, the local port is not set. 
		 */
		if (ac->ac_lport == 0)
			ac->ac_lport = connp->conn_lport;
		/* Always verify destination reachability. */
		error = ip_bind_connected(connp, mp, &ac->ac_laddr,
		    ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested,
		    ipsec_policy_set, B_TRUE, B_TRUE);
		break;

	case sizeof (ipa_conn_x_t):
		acx = (ipa_conn_x_t *)ucp;
		/*
		 * Whether or not to verify destination reachability depends
		 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
		 */
		error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr,
		    acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr,
		    acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set,
		    B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0);
		break;
	}
	if (error == EINPROGRESS)
		return (NULL);
	else if (error != 0)
		goto bad_addr;
	/*
	 * Pass the IPsec headers size in ire_ipsec_overhead.
	 * We can't do this in ip_bind_insert_ire because the policy
	 * may not have been inherited at that point in time and hence
	 * conn_out_enforce_policy may not be set.
	 */
	mp1 = mp->b_cont;
	if (ire_requested && connp->conn_out_enforce_policy &&
	    mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) {
		ire_t *ire = (ire_t *)mp1->b_rptr;
		ASSERT(MBLKL(mp1) >= sizeof (ire_t));
		ire->ire_ipsec_overhead = conn_ipsec_length(connp);
	}

	/* Send it home. */
	mp->b_datap->db_type = M_PCPROTO;
	tbr->PRIM_type = T_BIND_ACK;
	return (mp);

bad_addr:
	/*
	 * If error = -1 then we generate a TBADADDR - otherwise error is
	 * a unix errno.
	 */
	if (error > 0)
		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
	else
		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
	return (mp);
}

/*
 * Here address is verified to be a valid local address.
 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast
 * address is also considered a valid local address.
 * In the case of a broadcast/multicast address, however, the
 * upper protocol is expected to reset the src address
 * to 0 if it sees an IRE_BROADCAST type returned, so that
 * no packets are emitted with a broadcast/multicast address as
 * the source (which would violate the Host Requirements RFC,
 * RFC 1122).
 * The addresses valid for bind are:
 *	(1) - INADDR_ANY (0)
 *	(2) - IP address of an UP interface
 *	(3) - IP address of a DOWN interface
 *	(4) - valid local IP broadcast addresses. In this case
 *	the conn will only receive packets destined to
 *	the specified broadcast address.
 *	(5) - a multicast address. In this case
 *	the conn will only receive packets destined to
 *	the specified multicast address. Note: the
 *	application still has to issue an
 *	IP_ADD_MEMBERSHIP socket option.
 *
 * On error, return -1 for TBADADDR otherwise pass the
 * errno with TSYSERR reply.
 *
 * In all the above cases, the bound address must be valid in the current zone.
 * When the address is loopback, multicast or broadcast, there might be many
 * matching IREs so bind has to look up based on the zone.
 *
 * Note: lport is in network byte order.
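 * For example, a transport binding to port 8080 passes lport as
 * htons(8080); nothing on this path byte-swaps it, and the fanout
 * code stores and compares ports in network byte order as well.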
 */
int
ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
    boolean_t ire_requested, boolean_t ipsec_policy_set,
    boolean_t fanout_insert)
{
	int		error = 0;
	ire_t		*src_ire;
	mblk_t		*policy_mp;
	ipif_t		*ipif;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	if (ipsec_policy_set) {
		policy_mp = mp->b_cont;
	}

	/*
	 * If it was previously connected, conn_fully_bound would have
	 * been set; clear it for this fresh bind.
	 */
	connp->conn_fully_bound = B_FALSE;

	src_ire = NULL;
	ipif = NULL;

	zoneid = IPCL_ZONEID(connp);

	if (src_addr) {
		src_ire = ire_route_lookup(src_addr, 0, 0, 0,
		    NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
		/*
		 * If an address other than 0.0.0.0 is requested,
		 * we verify that it is a valid address for bind.
		 * Note: the following code is in if-else-if form for
		 * readability, rather than as one compound condition.
		 */
		/* LINTED - statement has no consequent */
		if (IRE_IS_LOCAL(src_ire)) {
			/*
			 * (2) Bind to address of local UP interface
			 */
		} else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
			/*
			 * (4) Bind to broadcast address
			 * Note: permitted only from transports that
			 * request IRE
			 */
			if (!ire_requested)
				error = EADDRNOTAVAIL;
		} else {
			/*
			 * (3) Bind to address of local DOWN interface
			 * (ipif_lookup_addr() looks up all interfaces
			 * but we do not get here for UP interfaces
			 * - case (2) above)
			 * We put the protocol byte back into the mblk
			 * since we may come back via ip_wput_nondata()
			 * later with this mblk if ipif_lookup_addr chooses
			 * to defer processing.
			 */
			*mp->b_wptr++ = (char)connp->conn_ulp;
			if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid,
			    CONNP_TO_WQ(connp), mp, ip_wput_nondata,
			    &error, ipst)) != NULL) {
				ipif_refrele(ipif);
			} else if (error == EINPROGRESS) {
				if (src_ire != NULL)
					ire_refrele(src_ire);
				return (EINPROGRESS);
			} else if (CLASSD(src_addr)) {
				error = 0;
				if (src_ire != NULL)
					ire_refrele(src_ire);
				/*
				 * (5) bind to multicast address.
				 * Fake out the IRE returned to upper
				 * layer to be a broadcast IRE.
				 */
				src_ire = ire_ctable_lookup(
				    INADDR_BROADCAST, INADDR_ANY,
				    IRE_BROADCAST, NULL, zoneid, NULL,
				    (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY),
				    ipst);
				if (src_ire == NULL || !ire_requested)
					error = EADDRNOTAVAIL;
			} else {
				/*
				 * Not a valid address for bind
				 */
				error = EADDRNOTAVAIL;
			}
			/*
			 * Just to keep it consistent with the processing in
			 * ip_bind_v4()
			 */
			mp->b_wptr--;
		}
		if (error) {
			/* Red Alert! Attempting to be a bogon! */
			ip1dbg(("ip_bind: bad src address 0x%x\n",
			    ntohl(src_addr)));
			goto bad_addr;
		}
	}

	/*
	 * Allow setting new policies. For example, disconnects come
	 * down as ipa_t bind. As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * If not fanout_insert this was just an address verification
	 */
	if (fanout_insert) {
		/*
		 * The addresses have been verified. Time to insert in
		 * the correct fanout list.
4668 */ 4669 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4670 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6); 4671 connp->conn_lport = lport; 4672 connp->conn_fport = 0; 4673 /* 4674 * Do we need to add a check to reject Multicast packets 4675 */ 4676 error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport); 4677 } 4678 4679 if (error == 0) { 4680 if (ire_requested) { 4681 if (!ip_bind_insert_ire(mp, src_ire, NULL, ipst)) { 4682 error = -1; 4683 /* Falls through to bad_addr */ 4684 } 4685 } else if (ipsec_policy_set) { 4686 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4687 error = -1; 4688 /* Falls through to bad_addr */ 4689 } 4690 } 4691 } 4692 bad_addr: 4693 if (error != 0) { 4694 if (connp->conn_anon_port) { 4695 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 4696 connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), 4697 B_FALSE); 4698 } 4699 connp->conn_mlp_type = mlptSingle; 4700 } 4701 if (src_ire != NULL) 4702 IRE_REFRELE(src_ire); 4703 if (ipsec_policy_set) { 4704 ASSERT(policy_mp == mp->b_cont); 4705 ASSERT(policy_mp != NULL); 4706 freeb(policy_mp); 4707 /* 4708 * As of now assume that nothing else accompanies 4709 * IPSEC_POLICY_SET. 4710 */ 4711 mp->b_cont = NULL; 4712 } 4713 return (error); 4714 } 4715 4716 /* 4717 * Verify that both the source and destination addresses 4718 * are valid. If verify_dst is false, then the destination address may be 4719 * unreachable, i.e. have no route to it. Protocols like TCP want to verify 4720 * destination reachability, while tunnels do not. 4721 * Note that we allow connect to broadcast and multicast 4722 * addresses when ire_requested is set. Thus the ULP 4723 * has to check for IRE_BROADCAST and multicast. 4724 * 4725 * Returns zero if ok. 4726 * On error: returns -1 to mean TBADADDR otherwise returns an errno 4727 * (for use with TSYSERR reply). 4728 * 4729 * Note: lport and fport are in network byte order. 4730 */ 4731 int 4732 ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, 4733 uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 4734 boolean_t ire_requested, boolean_t ipsec_policy_set, 4735 boolean_t fanout_insert, boolean_t verify_dst) 4736 { 4737 ire_t *src_ire; 4738 ire_t *dst_ire; 4739 int error = 0; 4740 int protocol; 4741 mblk_t *policy_mp; 4742 ire_t *sire = NULL; 4743 ire_t *md_dst_ire = NULL; 4744 ire_t *lso_dst_ire = NULL; 4745 ill_t *ill = NULL; 4746 zoneid_t zoneid; 4747 ipaddr_t src_addr = *src_addrp; 4748 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 4749 4750 src_ire = dst_ire = NULL; 4751 protocol = *mp->b_wptr & 0xFF; 4752 4753 /* 4754 * If we never got a disconnect before, clear it now. 4755 */ 4756 connp->conn_fully_bound = B_FALSE; 4757 4758 if (ipsec_policy_set) { 4759 policy_mp = mp->b_cont; 4760 } 4761 4762 zoneid = IPCL_ZONEID(connp); 4763 4764 if (CLASSD(dst_addr)) { 4765 /* Pick up an IRE_BROADCAST */ 4766 dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, 4767 NULL, zoneid, MBLK_GETLABEL(mp), 4768 (MATCH_IRE_RECURSIVE | 4769 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | 4770 MATCH_IRE_SECATTR), ipst); 4771 } else { 4772 /* 4773 * If conn_dontroute is set or if conn_nexthop_set is set, 4774 * and onlink ipif is not found set ENETUNREACH error. 4775 */ 4776 if (connp->conn_dontroute || connp->conn_nexthop_set) { 4777 ipif_t *ipif; 4778 4779 ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? 
			    dst_addr : connp->conn_nexthop_v4, zoneid, ipst);
			if (ipif == NULL) {
				error = ENETUNREACH;
				goto bad_addr;
			}
			ipif_refrele(ipif);
		}

		if (connp->conn_nexthop_set) {
			dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
			    0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp),
			    MATCH_IRE_SECATTR, ipst);
		} else {
			dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
			    &sire, zoneid, MBLK_GETLABEL(mp),
			    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
			    MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE |
			    MATCH_IRE_SECATTR), ipst);
		}
	}
	/*
	 * dst_ire can't be a broadcast when not ire_requested.
	 * We also prevent IREs with a source address of INADDR_ANY from
	 * being used; such IREs are created temporarily for sending out
	 * packets from endpoints that have conn_unspec_src set. If
	 * verify_dst is true, the destination must be reachable; if it
	 * is false, the destination needn't be.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure. May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY ||
	    (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 */
		if (verify_dst || (dst_ire != NULL)) {
			if (ip_debug > 2) {
				pr_addr_dbg("ip_bind_connected: bad connected "
				    "dst %s\n", AF_INET, &dst_addr);
			}
			if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
			goto bad_addr;
		}
	}

	/*
	 * We now know that routing will allow us to reach the destination.
	 * Check whether Trusted Solaris policy allows communication with this
	 * host, and pretend that the destination is unreachable if not.
	 *
	 * This is never a problem for TCP, since that transport is known to
	 * compute the label properly as part of the tcp_rput_other T_BIND_ACK
	 * handling. If the remote is unreachable, it will be detected at that
	 * point, so there's no reason to check it here.
	 *
	 * Note that for sendto (and other datagram-oriented friends), this
	 * check is done as part of the data path label computation instead.
	 * The check here is just to make non-TCP connect() report the right
	 * error.
	 */
	if (dst_ire != NULL && is_system_labeled() &&
	    !IPCL_IS_TCP(connp) &&
	    tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL,
	    connp->conn_mac_exempt, ipst) != 0) {
		error = EHOSTUNREACH;
		if (ip_debug > 2) {
			pr_addr_dbg("ip_bind_connected: no label for dst %s\n",
			    AF_INET, &dst_addr);
		}
		goto bad_addr;
	}

	/*
	 * If the app does a connect(), it means that it will most likely
	 * send more than 1 packet to the destination. It makes sense
	 * to clear the temporary flag.
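	 * (IRE_MARK_TEMPORARY is tested again under irb_lock below: the
	 * mark may change between this unlocked peek and the locked
	 * update, and irb_tmp_ire_cnt must be decremented exactly once.)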
4868 */ 4869 if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && 4870 (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { 4871 irb_t *irb = dst_ire->ire_bucket; 4872 4873 rw_enter(&irb->irb_lock, RW_WRITER); 4874 /* 4875 * We need to recheck for IRE_MARK_TEMPORARY after acquiring 4876 * the lock to guarantee irb_tmp_ire_cnt. 4877 */ 4878 if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { 4879 dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; 4880 irb->irb_tmp_ire_cnt--; 4881 } 4882 rw_exit(&irb->irb_lock); 4883 } 4884 4885 /* 4886 * See if we should notify ULP about LSO/MDT; we do this whether or not 4887 * ire_requested is TRUE, in order to handle active connects; LSO/MDT 4888 * eligibility tests for passive connects are handled separately 4889 * through tcp_adapt_ire(). We do this before the source address 4890 * selection, because dst_ire may change after a call to 4891 * ipif_select_source(). This is a best-effort check, as the 4892 * packet for this connection may not actually go through 4893 * dst_ire->ire_stq, and the exact IRE can only be known after 4894 * calling ip_newroute(). This is why we further check on the 4895 * IRE during LSO/Multidata packet transmission in 4896 * tcp_lsosend()/tcp_multisend(). 4897 */ 4898 if (!ipsec_policy_set && dst_ire != NULL && 4899 !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && 4900 (ill = ire_to_ill(dst_ire), ill != NULL)) { 4901 if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { 4902 lso_dst_ire = dst_ire; 4903 IRE_REFHOLD(lso_dst_ire); 4904 } else if (ipst->ips_ip_multidata_outbound && 4905 ILL_MDT_CAPABLE(ill)) { 4906 md_dst_ire = dst_ire; 4907 IRE_REFHOLD(md_dst_ire); 4908 } 4909 } 4910 4911 if (dst_ire != NULL && 4912 dst_ire->ire_type == IRE_LOCAL && 4913 dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { 4914 /* 4915 * If the IRE belongs to a different zone, look for a matching 4916 * route in the forwarding table and use the source address from 4917 * that route. 4918 */ 4919 src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, 4920 zoneid, 0, NULL, 4921 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4922 MATCH_IRE_RJ_BHOLE, ipst); 4923 if (src_ire == NULL) { 4924 error = EHOSTUNREACH; 4925 goto bad_addr; 4926 } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4927 if (!(src_ire->ire_type & IRE_HOST)) 4928 error = ENETUNREACH; 4929 else 4930 error = EHOSTUNREACH; 4931 goto bad_addr; 4932 } 4933 if (src_addr == INADDR_ANY) 4934 src_addr = src_ire->ire_src_addr; 4935 ire_refrele(src_ire); 4936 src_ire = NULL; 4937 } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { 4938 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 4939 src_addr = sire->ire_src_addr; 4940 ire_refrele(dst_ire); 4941 dst_ire = sire; 4942 sire = NULL; 4943 } else { 4944 /* 4945 * Pick a source address so that a proper inbound 4946 * load spreading would happen. 4947 */ 4948 ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; 4949 ipif_t *src_ipif = NULL; 4950 ire_t *ipif_ire; 4951 4952 /* 4953 * Supply a local source address such that inbound 4954 * load spreading happens. 4955 * 4956 * Determine the best source address on this ill for 4957 * the destination. 4958 * 4959 * 1) For broadcast, we should return a broadcast ire 4960 * found above so that upper layers know that the 4961 * destination address is a broadcast address. 4962 * 4963 * 2) If this is part of a group, select a better 4964 * source address so that better inbound load 4965 * balancing happens. Do the same if the ipif 4966 * is DEPRECATED. 
4967 *
4968 * 3) If the outgoing interface is part of a usesrc
4969 * group, then try selecting a source address from
4970 * the usesrc ILL.
4971 */
4972 if ((dst_ire->ire_zoneid != zoneid &&
4973 dst_ire->ire_zoneid != ALL_ZONES) ||
4974 (!(dst_ire->ire_flags & RTF_SETSRC)) &&
4975 (!(dst_ire->ire_type & IRE_BROADCAST) &&
4976 ((dst_ill->ill_group != NULL) ||
4977 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
4978 (dst_ill->ill_usesrc_ifindex != 0)))) {
4979 /*
4980 * If the destination is reachable via a
4981 * given gateway, the selected source address
4982 * should be in the same subnet as the gateway.
4983 * Otherwise, the destination is not reachable.
4984 *
4985 * If there are no interfaces on the same subnet
4986 * as the destination, ipif_select_source gives
4987 * the first non-deprecated interface, which might
4988 * be on a different subnet than the gateway.
4989 * This is not desirable. Hence we pass the dst_ire
4990 * source address to ipif_select_source.
4991 * The destination is known to be reachable
4992 * on the subnet of the dst_ire source address,
4993 * so passing the dst_ire source address to
4994 * ipif_select_source guarantees that the
4995 * selected source will be on the same subnet
4996 * as the dst_ire source address.
4997 */
4998 ipaddr_t saddr =
4999 dst_ire->ire_ipif->ipif_src_addr;
5000 src_ipif = ipif_select_source(dst_ill,
5001 saddr, zoneid);
5002 if (src_ipif != NULL) {
5003 if (IS_VNI(src_ipif->ipif_ill)) {
5004 /*
5005 * For VNI there is no
5006 * interface route.
5007 */
5008 src_addr =
5009 src_ipif->ipif_src_addr;
5010 } else {
5011 ipif_ire =
5012 ipif_to_ire(src_ipif);
5013 if (ipif_ire != NULL) {
5014 IRE_REFRELE(dst_ire);
5015 dst_ire = ipif_ire;
5016 }
5017 src_addr =
5018 dst_ire->ire_src_addr;
5019 }
5020 ipif_refrele(src_ipif);
5021 } else {
5022 src_addr = dst_ire->ire_src_addr;
5023 }
5024 } else {
5025 src_addr = dst_ire->ire_src_addr;
5026 }
5027 }
5028 }
5029
5030 /*
5031 * We do ire_route_lookup() here (and not an
5032 * interface lookup) because we assert that
5033 * src_addr should only come from an
5034 * UP interface for hard binding.
5035 */
5036 ASSERT(src_ire == NULL);
5037 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL,
5038 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
5039 /* src_ire must be a local|loopback */
5040 if (!IRE_IS_LOCAL(src_ire)) {
5041 if (ip_debug > 2) {
5042 pr_addr_dbg("ip_bind_connected: bad connected "
5043 "src %s\n", AF_INET, &src_addr);
5044 }
5045 error = EADDRNOTAVAIL;
5046 goto bad_addr;
5047 }
5048
5049 /*
5050 * If the source address is a loopback address, the
5051 * destination had best be local or multicast.
5052 * The transports that can't handle multicast will reject
5053 * those addresses.
5054 */
5055 if (src_ire->ire_type == IRE_LOOPBACK &&
5056 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) {
5057 ip1dbg(("ip_bind_connected: bad connected loopback\n"));
5058 error = -1;
5059 goto bad_addr;
5060 }
5061
5062 /*
5063 * Allow setting new policies. For example, disconnects come
5064 * down as an ipa_t bind. As we would have set conn_policy_cached
5065 * to B_TRUE before, we should set it to B_FALSE, so that policy
5066 * can change after the disconnect.
5067 */
5068 connp->conn_policy_cached = B_FALSE;
5069
5070 /*
5071 * Set the conn addresses/ports immediately, so the IPsec policy calls
5072 * can handle their passed-in conn's.
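* For example, an IPv4 source address of 192.168.1.5 is stored in
* conn_srcv6 as the v4-mapped IPv6 address ::ffff:192.168.1.5.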
5073 */
5074
5075 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
5076 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6);
5077 connp->conn_lport = lport;
5078 connp->conn_fport = fport;
5079 *src_addrp = src_addr;
5080
5081 ASSERT(!(ipsec_policy_set && ire_requested));
5082 if (ire_requested) {
5083 iulp_t *ulp_info = NULL;
5084
5085 /*
5086 * Note that sire will not be NULL if this is an off-link
5087 * connection and there is no cache entry for that destination yet.
5088 *
5089 * XXX Because of an existing bug, if there are multiple
5090 * default routes, the IRE returned now may not be the actual
5091 * default route used (default routes are chosen in a
5092 * round-robin fashion). So if the metrics for different
5093 * default routes are different, we may return the wrong
5094 * metrics. This will not be a problem if the existing
5095 * bug is fixed.
5096 */
5097 if (sire != NULL) {
5098 ulp_info = &(sire->ire_uinfo);
5099 }
5100 if (!ip_bind_insert_ire(mp, dst_ire, ulp_info, ipst)) {
5101 error = -1;
5102 goto bad_addr;
5103 }
5104 } else if (ipsec_policy_set) {
5105 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
5106 error = -1;
5107 goto bad_addr;
5108 }
5109 }
5110
5111 /*
5112 * Cache IPsec policy in this conn. If we have per-socket policy,
5113 * we'll cache that. If we don't, we'll inherit global policy.
5114 *
5115 * We can't insert until the conn reflects the policy. Note that
5116 * conn_policy_cached is set by ipsec_conn_cache_policy() even for
5117 * connections where we don't have a policy. This is to prevent
5118 * global policy lookups in the inbound path.
5119 *
5120 * If we insert before we set conn_policy_cached,
5121 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true
5122 * because global policy could be non-empty. We normally call
5123 * ipsec_check_policy() for conn_policy_cached connections only if
5124 * ipc_in_enforce_policy is set. But in this case,
5125 * conn_policy_cached can get set anytime since we made the
5126 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is
5127 * called, which will make the above assumption false. Thus, we
5128 * need to insert after we set conn_policy_cached.
5129 */
5130 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0)
5131 goto bad_addr;
5132
5133 if (fanout_insert) {
5134 /*
5135 * The addresses have been verified. Time to insert in
5136 * the correct fanout list.
5137 */
5138 error = ipcl_conn_insert(connp, protocol, src_addr,
5139 dst_addr, connp->conn_ports);
5140 }
5141
5142 if (error == 0) {
5143 connp->conn_fully_bound = B_TRUE;
5144 /*
5145 * Our initial checks for LSO/MDT have passed; the IRE is not
5146 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to
5147 * be supporting LSO/MDT. Pass the IRE, IPC and ILL into
5148 * ip_xxinfo_return(), which performs further checks
5149 * against them and upon success, returns the LSO/MDT info
5150 * mblk which we will attach to the bind acknowledgment.
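* (ill_lso_capab and ill_mdt_capab below hold the capabilities the
* driver advertised during DLPI capability negotiation; the ASSERTs
* can insist they are non-NULL because ILL_LSO_CAPABLE or
* ILL_MDT_CAPABLE was checked when lso_dst_ire/md_dst_ire were set.)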
5151 */
5152 if (lso_dst_ire != NULL) {
5153 mblk_t *lsoinfo_mp;
5154
5155 ASSERT(ill->ill_lso_capab != NULL);
5156 if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp,
5157 ill->ill_name, ill->ill_lso_capab)) != NULL)
5158 linkb(mp, lsoinfo_mp);
5159 } else if (md_dst_ire != NULL) {
5160 mblk_t *mdinfo_mp;
5161
5162 ASSERT(ill->ill_mdt_capab != NULL);
5163 if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
5164 ill->ill_name, ill->ill_mdt_capab)) != NULL)
5165 linkb(mp, mdinfo_mp);
5166 }
5167 }
5168 bad_addr:
5169 if (ipsec_policy_set) {
5170 ASSERT(policy_mp == mp->b_cont);
5171 ASSERT(policy_mp != NULL);
5172 freeb(policy_mp);
5173 /*
5174 * As of now assume that nothing else accompanies
5175 * IPSEC_POLICY_SET.
5176 */
5177 mp->b_cont = NULL;
5178 }
5179 if (src_ire != NULL)
5180 IRE_REFRELE(src_ire);
5181 if (dst_ire != NULL)
5182 IRE_REFRELE(dst_ire);
5183 if (sire != NULL)
5184 IRE_REFRELE(sire);
5185 if (md_dst_ire != NULL)
5186 IRE_REFRELE(md_dst_ire);
5187 if (lso_dst_ire != NULL)
5188 IRE_REFRELE(lso_dst_ire);
5189 return (error);
5190 }
5191
5192 /*
5193 * Insert the ire in b_cont. Returns B_FALSE if it fails (due to lack
5194 * of space). Prefers dst_ire over src_ire.
5195 */
5196 static boolean_t
5197 ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
5198 {
5199 mblk_t *mp1;
5200 ire_t *ret_ire = NULL;
5201
5202 mp1 = mp->b_cont;
5203 ASSERT(mp1 != NULL);
5204
5205 if (ire != NULL) {
5206 /*
5207 * mp1 was initialized above as the appended
5208 * IRE_DB_REQ_TYPE mblk. It is the <upper protocol>'s
5209 * job to make sure there is room.
5210 */
5211 if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t))
5212 return (B_FALSE);
5213
5214 mp1->b_datap->db_type = IRE_DB_TYPE;
5215 mp1->b_wptr = mp1->b_rptr + sizeof (ire_t);
5216 bcopy(ire, mp1->b_rptr, sizeof (ire_t));
5217 ret_ire = (ire_t *)mp1->b_rptr;
5218 /*
5219 * Pass along the latest setting of ip_path_mtu_discovery, and
5220 * copy the ulp info, if any.
5221 */
5222 ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ?
5223 IPH_DF : 0;
5224 if (ulp_info != NULL) {
5225 bcopy(ulp_info, &(ret_ire->ire_uinfo),
5226 sizeof (iulp_t));
5227 }
5228 ret_ire->ire_mp = mp1;
5229 } else {
5230 /*
5231 * No IRE was found. Remove the IRE mblk.
5232 */
5233 mp->b_cont = mp1->b_cont;
5234 freeb(mp1);
5235 }
5236
5237 return (B_TRUE);
5238 }
5239
5240 /*
5241 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
5242 * the final piece where we don't. Return a pointer to the first mblk in the
5243 * result, and update the pointer to the next mblk to chew on. If anything
5244 * goes wrong (i.e., dupb fails), we waste everything in sight and return a
5245 * NULL pointer.
5246 */
5247 mblk_t *
5248 ip_carve_mp(mblk_t **mpp, ssize_t len)
5249 {
5250 mblk_t *mp0;
5251 mblk_t *mp1;
5252 mblk_t *mp2;
5253
5254 if (!len || !mpp || !(mp0 = *mpp))
5255 return (NULL);
5256 /* If we aren't going to consume the first mblk, we need a dup. */
5257 if (mp0->b_wptr - mp0->b_rptr > len) {
5258 mp1 = dupb(mp0);
5259 if (mp1) {
5260 /* Partition the data between the two mblks. */
5261 mp1->b_wptr = mp1->b_rptr + len;
5262 mp0->b_rptr = mp1->b_wptr;
5263 /*
5264 * If, after the adjustments, the unconsumed mblk is
5265 * now unaligned, try to align it. If this fails, free
5266 * all messages and let the upper layer recover.
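* (pullupmsg(mp, -1) concatenates the entire message into a single,
* freshly allocated -- and therefore aligned -- data block.)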
5267 */
5268 if (!OK_32PTR(mp0->b_rptr)) {
5269 if (!pullupmsg(mp0, -1)) {
5270 freemsg(mp0);
5271 freemsg(mp1);
5272 *mpp = NULL;
5273 return (NULL);
5274 }
5275 }
5276 }
5277 return (mp1);
5278 }
5279 /* Eat through as many mblks as we need to get len bytes. */
5280 len -= mp0->b_wptr - mp0->b_rptr;
5281 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != NULL && len; mp1 = mp2) {
5282 if (mp2->b_wptr - mp2->b_rptr > len) {
5283 /*
5284 * We won't consume the entire last mblk. Like
5285 * above, dup and partition it.
5286 */
5287 mp1->b_cont = dupb(mp2);
5288 mp1 = mp1->b_cont;
5289 if (!mp1) {
5290 /*
5291 * Trouble. Rather than go to a lot of
5292 * trouble to clean up, we free the messages.
5293 * This won't be any worse than losing it on
5294 * the wire.
5295 */
5296 freemsg(mp0);
5297 freemsg(mp2);
5298 *mpp = NULL;
5299 return (NULL);
5300 }
5301 mp1->b_wptr = mp1->b_rptr + len;
5302 mp2->b_rptr = mp1->b_wptr;
5303 /*
5304 * If, after the adjustments, the unconsumed mblk is
5305 * now unaligned, try to align it. If this fails, free
5306 * all messages and let the upper layer recover.
5307 */
5308 if (!OK_32PTR(mp2->b_rptr)) {
5309 if (!pullupmsg(mp2, -1)) {
5310 freemsg(mp0);
5311 freemsg(mp2);
5312 *mpp = NULL;
5313 return (NULL);
5314 }
5315 }
5316 *mpp = mp2;
5317 return (mp0);
5318 }
5319 /* Decrement len by the amount we just got. */
5320 len -= mp2->b_wptr - mp2->b_rptr;
5321 }
5322 /*
5323 * len should be reduced to zero now. If not, our caller has
5324 * screwed up.
5325 */
5326 if (len) {
5327 /* Shouldn't happen! */
5328 freemsg(mp0);
5329 *mpp = NULL;
5330 return (NULL);
5331 }
5332 /*
5333 * We consumed up to exactly the end of an mblk. Detach the part
5334 * we are returning from the rest of the chain.
5335 */
5336 mp1->b_cont = NULL;
5337 *mpp = mp2;
5338 return (mp0);
5339 }
5340
5341 /* The ill stream is being unplumbed. Called from ip_close. */
5342 int
5343 ip_modclose(ill_t *ill)
5344 {
5345 boolean_t success;
5346 ipsq_t *ipsq;
5347 ipif_t *ipif;
5348 queue_t *q = ill->ill_rq;
5349 ip_stack_t *ipst = ill->ill_ipst;
5350 clock_t timeout;
5351
5352 /*
5353 * Wait for the ACKs of all deferred control messages to be processed.
5354 * In particular, we wait for a potential capability reset initiated
5355 * in ip_sioctl_plink() to complete before proceeding.
5356 *
5357 * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms)
5358 * in case the driver never replies.
5359 */
5360 timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms);
5361 mutex_enter(&ill->ill_lock);
5362 while (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
5363 if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) {
5364 /* Timeout */
5365 break;
5366 }
5367 }
5368 mutex_exit(&ill->ill_lock);
5369
5370 /*
5371 * Forcibly enter the ipsq after some delay. This is to take
5372 * care of the case when some ioctl does not complete because
5373 * we sent a control message to the driver and it did not
5374 * send us a reply. We want to be able to at least unplumb
5375 * and replumb rather than force the user to reboot the system.
5376 */
5377 success = ipsq_enter(ill, B_FALSE);
5378
5379 /*
5380 * Open/close/push/pop is guaranteed to be single threaded
5381 * per stream by STREAMS. FS guarantees that all references
5382 * from top are gone before close is called. So there can't
5383 * be another close thread that has set CONDEMNED on this ill
5384 * and caused ipsq_enter to return failure.
5385 */
5386 ASSERT(success);
5387 ipsq = ill->ill_phyint->phyint_ipsq;
5388
5389 /*
5390 * Mark it condemned.
No new reference will be made to this ill. 5391 * Lookup functions will return an error. Threads that try to 5392 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures 5393 * that the refcnt will drop down to zero. 5394 */ 5395 mutex_enter(&ill->ill_lock); 5396 ill->ill_state_flags |= ILL_CONDEMNED; 5397 for (ipif = ill->ill_ipif; ipif != NULL; 5398 ipif = ipif->ipif_next) { 5399 ipif->ipif_state_flags |= IPIF_CONDEMNED; 5400 } 5401 /* 5402 * Wake up anybody waiting to enter the ipsq. ipsq_enter 5403 * returns error if ILL_CONDEMNED is set 5404 */ 5405 cv_broadcast(&ill->ill_cv); 5406 mutex_exit(&ill->ill_lock); 5407 5408 /* 5409 * Send all the deferred DLPI messages downstream which came in 5410 * during the small window right before ipsq_enter(). We do this 5411 * without waiting for the ACKs because all the ACKs for M_PROTO 5412 * messages are ignored in ip_rput() when ILL_CONDEMNED is set. 5413 */ 5414 ill_dlpi_send_deferred(ill); 5415 5416 /* 5417 * Shut down fragmentation reassembly. 5418 * ill_frag_timer won't start a timer again. 5419 * Now cancel any existing timer 5420 */ 5421 (void) untimeout(ill->ill_frag_timer_id); 5422 (void) ill_frag_timeout(ill, 0); 5423 5424 /* 5425 * If MOVE was in progress, clear the 5426 * move_in_progress fields also. 5427 */ 5428 if (ill->ill_move_in_progress) { 5429 ILL_CLEAR_MOVE(ill); 5430 } 5431 5432 /* 5433 * Call ill_delete to bring down the ipifs, ilms and ill on 5434 * this ill. Then wait for the refcnts to drop to zero. 5435 * ill_is_freeable checks whether the ill is really quiescent. 5436 * Then make sure that threads that are waiting to enter the 5437 * ipsq have seen the error returned by ipsq_enter and have 5438 * gone away. Then we call ill_delete_tail which does the 5439 * DL_UNBIND_REQ with the driver and then qprocsoff. 5440 */ 5441 ill_delete(ill); 5442 mutex_enter(&ill->ill_lock); 5443 while (!ill_is_freeable(ill)) 5444 cv_wait(&ill->ill_cv, &ill->ill_lock); 5445 while (ill->ill_waiters) 5446 cv_wait(&ill->ill_cv, &ill->ill_lock); 5447 5448 mutex_exit(&ill->ill_lock); 5449 5450 /* 5451 * ill_delete_tail drops reference on ill_ipst, but we need to keep 5452 * it held until the end of the function since the cleanup 5453 * below needs to be able to use the ip_stack_t. 5454 */ 5455 netstack_hold(ipst->ips_netstack); 5456 5457 /* qprocsoff is called in ill_delete_tail */ 5458 ill_delete_tail(ill); 5459 ASSERT(ill->ill_ipst == NULL); 5460 5461 /* 5462 * Walk through all upper (conn) streams and qenable 5463 * those that have queued data. 5464 * close synchronization needs this to 5465 * be done to ensure that all upper layers blocked 5466 * due to flow control to the closing device 5467 * get unblocked. 5468 */ 5469 ip1dbg(("ip_wsrv: walking\n")); 5470 conn_walk_drain(ipst); 5471 5472 mutex_enter(&ipst->ips_ip_mi_lock); 5473 mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); 5474 mutex_exit(&ipst->ips_ip_mi_lock); 5475 5476 /* 5477 * credp could be null if the open didn't succeed and ip_modopen 5478 * itself calls ip_close. 5479 */ 5480 if (ill->ill_credp != NULL) 5481 crfree(ill->ill_credp); 5482 5483 mutex_enter(&ill->ill_lock); 5484 ill_nic_info_dispatch(ill); 5485 mutex_exit(&ill->ill_lock); 5486 5487 /* 5488 * Now we are done with the module close pieces that 5489 * need the netstack_t. 
5490 */
5491 netstack_rele(ipst->ips_netstack);
5492
5493 mi_close_free((IDP)ill);
5494 q->q_ptr = WR(q)->q_ptr = NULL;
5495
5496 ipsq_exit(ipsq, B_TRUE, B_TRUE);
5497
5498 return (0);
5499 }
5500
5501 /*
5502 * This is called as part of close() for IP, UDP, ICMP, and RTS
5503 * in order to quiesce the conn.
5504 */
5505 void
5506 ip_quiesce_conn(conn_t *connp)
5507 {
5508 boolean_t drain_cleanup_reqd = B_FALSE;
5509 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
5510 boolean_t ilg_cleanup_reqd = B_FALSE;
5511 ip_stack_t *ipst;
5512
5513 ASSERT(!IPCL_IS_TCP(connp));
5514 ipst = connp->conn_netstack->netstack_ip;
5515
5516 /*
5517 * Mark the conn as closing; from now on this conn must not be
5518 * inserted into any list. E.g., conn_drain_insert()
5519 * won't insert this conn into the conn_drain_list.
5520 * Similarly, ill_pending_mp_add() will not add any mp to
5521 * the pending mp list after this conn has started closing.
5522 *
5523 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
5524 * cannot get set henceforth.
5525 */
5526 mutex_enter(&connp->conn_lock);
5527 ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
5528 connp->conn_state_flags |= CONN_CLOSING;
5529 if (connp->conn_idl != NULL)
5530 drain_cleanup_reqd = B_TRUE;
5531 if (connp->conn_oper_pending_ill != NULL)
5532 conn_ioctl_cleanup_reqd = B_TRUE;
5533 if (connp->conn_dhcpinit_ill != NULL) {
5534 ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
5535 atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
5536 connp->conn_dhcpinit_ill = NULL;
5537 }
5538 if (connp->conn_ilg_inuse != 0)
5539 ilg_cleanup_reqd = B_TRUE;
5540 mutex_exit(&connp->conn_lock);
5541
5542 if (conn_ioctl_cleanup_reqd)
5543 conn_ioctl_cleanup(connp);
5544
5545 if (is_system_labeled() && connp->conn_anon_port) {
5546 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
5547 connp->conn_mlp_type, connp->conn_ulp,
5548 ntohs(connp->conn_lport), B_FALSE);
5549 connp->conn_anon_port = 0;
5550 }
5551 connp->conn_mlp_type = mlptSingle;
5552
5553 /*
5554 * Remove this conn from any fanout list it is on,
5555 * and then wait for any threads currently operating
5556 * on this endpoint to finish.
5557 */
5558 ipcl_hash_remove(connp);
5559
5560 /*
5561 * Remove this conn from the drain list, and do
5562 * any other cleanup that may be required.
5563 * (Only non-tcp streams may have a non-null conn_idl.
5564 * TCP streams are never flow controlled, and
5565 * conn_idl will be null.)
5566 */
5567 if (drain_cleanup_reqd)
5568 conn_drain_tail(connp, B_TRUE);
5569
5570 if (connp == ipst->ips_ip_g_mrouter)
5571 (void) ip_mrouter_done(NULL, ipst);
5572
5573 if (ilg_cleanup_reqd)
5574 ilg_delete_all(connp);
5575
5576 conn_delete_ire(connp, NULL);
5577
5578 /*
5579 * Now the conn refcnt can increase only through CONN_INC_REF_LOCKED.
5580 * Callers from the write side can't be there now because close
5581 * is in progress. The only other caller is ipcl_walk,
5582 * which checks for the condemned flag.
5583 */
5584 mutex_enter(&connp->conn_lock);
5585 connp->conn_state_flags |= CONN_CONDEMNED;
5586 while (connp->conn_ref != 1)
5587 cv_wait(&connp->conn_cv, &connp->conn_lock);
5588 connp->conn_state_flags |= CONN_QUIESCED;
5589 mutex_exit(&connp->conn_lock);
5590 }
5591
5592 /* ARGSUSED */
5593 int
5594 ip_close(queue_t *q, int flags)
5595 {
5596 conn_t *connp;
5597
5598 TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
5599
5600 /*
5601 * Call the appropriate delete routine depending on whether this is
5602 * a module or device.
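* (A pushed STREAMS module always has a queue linked below it, so its
* write queue's q_next is non-NULL; when IP is opened as a device,
* the queue pair sits at the driver end of the stream and q_next is
* NULL.)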
5603 */ 5604 if (WR(q)->q_next != NULL) { 5605 /* This is a module close */ 5606 return (ip_modclose((ill_t *)q->q_ptr)); 5607 } 5608 5609 connp = q->q_ptr; 5610 ip_quiesce_conn(connp); 5611 5612 qprocsoff(q); 5613 5614 /* 5615 * Now we are truly single threaded on this stream, and can 5616 * delete the things hanging off the connp, and finally the connp. 5617 * We removed this connp from the fanout list, it cannot be 5618 * accessed thru the fanouts, and we already waited for the 5619 * conn_ref to drop to 0. We are already in close, so 5620 * there cannot be any other thread from the top. qprocsoff 5621 * has completed, and service has completed or won't run in 5622 * future. 5623 */ 5624 ASSERT(connp->conn_ref == 1); 5625 5626 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 5627 5628 connp->conn_ref--; 5629 ipcl_conn_destroy(connp); 5630 5631 q->q_ptr = WR(q)->q_ptr = NULL; 5632 return (0); 5633 } 5634 5635 /* 5636 * Wapper around putnext() so that ip_rts_request can merely use 5637 * conn_recv. 5638 */ 5639 /*ARGSUSED2*/ 5640 static void 5641 ip_conn_input(void *arg1, mblk_t *mp, void *arg2) 5642 { 5643 conn_t *connp = (conn_t *)arg1; 5644 5645 putnext(connp->conn_rq, mp); 5646 } 5647 5648 /* Return the IP checksum for the IP header at "iph". */ 5649 uint16_t 5650 ip_csum_hdr(ipha_t *ipha) 5651 { 5652 uint16_t *uph; 5653 uint32_t sum; 5654 int opt_len; 5655 5656 opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - 5657 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5658 uph = (uint16_t *)ipha; 5659 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 5660 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 5661 if (opt_len > 0) { 5662 do { 5663 sum += uph[10]; 5664 sum += uph[11]; 5665 uph += 2; 5666 } while (--opt_len); 5667 } 5668 sum = (sum & 0xFFFF) + (sum >> 16); 5669 sum = ~(sum + (sum >> 16)) & 0xFFFF; 5670 if (sum == 0xffff) 5671 sum = 0; 5672 return ((uint16_t)sum); 5673 } 5674 5675 /* 5676 * Called when the module is about to be unloaded 5677 */ 5678 void 5679 ip_ddi_destroy(void) 5680 { 5681 tnet_fini(); 5682 5683 icmp_ddi_destroy(); 5684 rts_ddi_destroy(); 5685 udp_ddi_destroy(); 5686 sctp_ddi_g_destroy(); 5687 tcp_ddi_g_destroy(); 5688 ipsec_policy_g_destroy(); 5689 ipcl_g_destroy(); 5690 ip_net_g_destroy(); 5691 ip_ire_g_fini(); 5692 inet_minor_destroy(ip_minor_arena_sa); 5693 #if defined(_LP64) 5694 inet_minor_destroy(ip_minor_arena_la); 5695 #endif 5696 5697 #ifdef DEBUG 5698 list_destroy(&ip_thread_list); 5699 rw_destroy(&ip_thread_rwlock); 5700 tsd_destroy(&ip_thread_data); 5701 #endif 5702 5703 netstack_unregister(NS_IP); 5704 } 5705 5706 /* 5707 * First step in cleanup. 5708 */ 5709 /* ARGSUSED */ 5710 static void 5711 ip_stack_shutdown(netstackid_t stackid, void *arg) 5712 { 5713 ip_stack_t *ipst = (ip_stack_t *)arg; 5714 5715 #ifdef NS_DEBUG 5716 printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid); 5717 #endif 5718 5719 /* Get rid of loopback interfaces and their IREs */ 5720 ip_loopback_cleanup(ipst); 5721 } 5722 5723 /* 5724 * Free the IP stack instance. 
5725 */ 5726 static void 5727 ip_stack_fini(netstackid_t stackid, void *arg) 5728 { 5729 ip_stack_t *ipst = (ip_stack_t *)arg; 5730 int ret; 5731 5732 #ifdef NS_DEBUG 5733 printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); 5734 #endif 5735 ipv4_hook_destroy(ipst); 5736 ipv6_hook_destroy(ipst); 5737 ip_net_destroy(ipst); 5738 5739 rw_destroy(&ipst->ips_srcid_lock); 5740 5741 ip_kstat_fini(stackid, ipst->ips_ip_mibkp); 5742 ipst->ips_ip_mibkp = NULL; 5743 icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp); 5744 ipst->ips_icmp_mibkp = NULL; 5745 ip_kstat2_fini(stackid, ipst->ips_ip_kstat); 5746 ipst->ips_ip_kstat = NULL; 5747 bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics)); 5748 ip6_kstat_fini(stackid, ipst->ips_ip6_kstat); 5749 ipst->ips_ip6_kstat = NULL; 5750 bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics)); 5751 5752 nd_free(&ipst->ips_ip_g_nd); 5753 kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr)); 5754 ipst->ips_param_arr = NULL; 5755 kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 5756 ipst->ips_ndp_arr = NULL; 5757 5758 ip_mrouter_stack_destroy(ipst); 5759 5760 mutex_destroy(&ipst->ips_ip_mi_lock); 5761 rw_destroy(&ipst->ips_ipsec_capab_ills_lock); 5762 rw_destroy(&ipst->ips_ill_g_usesrc_lock); 5763 rw_destroy(&ipst->ips_ip_g_nd_lock); 5764 5765 ret = untimeout(ipst->ips_igmp_timeout_id); 5766 if (ret == -1) { 5767 ASSERT(ipst->ips_igmp_timeout_id == 0); 5768 } else { 5769 ASSERT(ipst->ips_igmp_timeout_id != 0); 5770 ipst->ips_igmp_timeout_id = 0; 5771 } 5772 ret = untimeout(ipst->ips_igmp_slowtimeout_id); 5773 if (ret == -1) { 5774 ASSERT(ipst->ips_igmp_slowtimeout_id == 0); 5775 } else { 5776 ASSERT(ipst->ips_igmp_slowtimeout_id != 0); 5777 ipst->ips_igmp_slowtimeout_id = 0; 5778 } 5779 ret = untimeout(ipst->ips_mld_timeout_id); 5780 if (ret == -1) { 5781 ASSERT(ipst->ips_mld_timeout_id == 0); 5782 } else { 5783 ASSERT(ipst->ips_mld_timeout_id != 0); 5784 ipst->ips_mld_timeout_id = 0; 5785 } 5786 ret = untimeout(ipst->ips_mld_slowtimeout_id); 5787 if (ret == -1) { 5788 ASSERT(ipst->ips_mld_slowtimeout_id == 0); 5789 } else { 5790 ASSERT(ipst->ips_mld_slowtimeout_id != 0); 5791 ipst->ips_mld_slowtimeout_id = 0; 5792 } 5793 ret = untimeout(ipst->ips_ip_ire_expire_id); 5794 if (ret == -1) { 5795 ASSERT(ipst->ips_ip_ire_expire_id == 0); 5796 } else { 5797 ASSERT(ipst->ips_ip_ire_expire_id != 0); 5798 ipst->ips_ip_ire_expire_id = 0; 5799 } 5800 5801 mutex_destroy(&ipst->ips_igmp_timer_lock); 5802 mutex_destroy(&ipst->ips_mld_timer_lock); 5803 mutex_destroy(&ipst->ips_igmp_slowtimeout_lock); 5804 mutex_destroy(&ipst->ips_mld_slowtimeout_lock); 5805 mutex_destroy(&ipst->ips_ip_addr_avail_lock); 5806 rw_destroy(&ipst->ips_ill_g_lock); 5807 5808 ip_ire_fini(ipst); 5809 ip6_asp_free(ipst); 5810 conn_drain_fini(ipst); 5811 ipcl_destroy(ipst); 5812 5813 mutex_destroy(&ipst->ips_ndp4->ndp_g_lock); 5814 mutex_destroy(&ipst->ips_ndp6->ndp_g_lock); 5815 kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t)); 5816 ipst->ips_ndp4 = NULL; 5817 kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t)); 5818 ipst->ips_ndp6 = NULL; 5819 5820 if (ipst->ips_loopback_ksp != NULL) { 5821 kstat_delete_netstack(ipst->ips_loopback_ksp, stackid); 5822 ipst->ips_loopback_ksp = NULL; 5823 } 5824 5825 kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t)); 5826 ipst->ips_phyint_g_list = NULL; 5827 kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS); 5828 ipst->ips_ill_g_heads = NULL; 5829 5830 kmem_free(ipst, sizeof (*ipst)); 5831 } 5832 5833 /* 5834 * This 
function is called from the TSD destructor, and is used to debug
5835 * reference count issues in IP. See block comment in <inet/ip_if.h> for
5836 * details.
5837 */
5838 static void
5839 ip_thread_exit(void *phash)
5840 {
5841 th_hash_t *thh = phash;
5842
5843 rw_enter(&ip_thread_rwlock, RW_WRITER);
5844 list_remove(&ip_thread_list, thh);
5845 rw_exit(&ip_thread_rwlock);
5846 mod_hash_destroy_hash(thh->thh_hash);
5847 kmem_free(thh, sizeof (*thh));
5848 }
5849
5850 /*
5851 * Called when the IP kernel module is loaded into the kernel.
5852 */
5853 void
5854 ip_ddi_init(void)
5855 {
5856 ip_input_proc = ip_squeue_switch(ip_squeue_enter);
5857
5858 /*
5859 * For IP and TCP the minor numbers should start from 2 since we have 4
5860 * initial devices: ip, ip6, tcp, tcp6.
5861 */
5862 /*
5863 * If this is a 64-bit kernel, then create two separate arenas -
5864 * one for TLIs in the range INET_MIN_DEV+2 through 2^18 - 1, and the
5865 * other for socket apps in the range 2^18 through 2^32 - 1.
5866 */
5867 ip_minor_arena_la = NULL;
5868 ip_minor_arena_sa = NULL;
5869 #if defined(_LP64)
5870 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
5871 INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
5872 cmn_err(CE_PANIC,
5873 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
5874 }
5875 if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
5876 MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
5877 cmn_err(CE_PANIC,
5878 "ip_ddi_init: ip_minor_arena_la creation failed\n");
5879 }
5880 #else
5881 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
5882 INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
5883 cmn_err(CE_PANIC,
5884 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
5885 }
5886 #endif
5887 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
5888
5889 ipcl_g_init();
5890 ip_ire_g_init();
5891 ip_net_g_init();
5892
5893 #ifdef DEBUG
5894 tsd_create(&ip_thread_data, ip_thread_exit);
5895 rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
5896 list_create(&ip_thread_list, sizeof (th_hash_t),
5897 offsetof(th_hash_t, thh_link));
5898 #endif
5899
5900 /*
5901 * We want to be informed each time a stack is created or
5902 * destroyed in the kernel, so we can maintain the
5903 * set of ip_stack_t's.
5904 */
5905 netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
5906 ip_stack_fini);
5907
5908 ipsec_policy_g_init();
5909 tcp_ddi_g_init();
5910 sctp_ddi_g_init();
5911
5912 tnet_init();
5913
5914 udp_ddi_init();
5915 rts_ddi_init();
5916 icmp_ddi_init();
5917 }
5918
5919 /*
5920 * Initialize the IP stack instance.
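* Runs once for each netstack instance; the pointer returned here is
* handed back as the 'arg' argument of ip_stack_shutdown and
* ip_stack_fini when the stack is torn down.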
5921 */ 5922 static void * 5923 ip_stack_init(netstackid_t stackid, netstack_t *ns) 5924 { 5925 ip_stack_t *ipst; 5926 ipparam_t *pa; 5927 ipndp_t *na; 5928 5929 #ifdef NS_DEBUG 5930 printf("ip_stack_init(stack %d)\n", stackid); 5931 #endif 5932 5933 ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP); 5934 ipst->ips_netstack = ns; 5935 5936 ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS, 5937 KM_SLEEP); 5938 ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t), 5939 KM_SLEEP); 5940 ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5941 ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5942 mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5943 mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5944 5945 rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL); 5946 mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5947 ipst->ips_igmp_deferred_next = INFINITY; 5948 mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5949 ipst->ips_mld_deferred_next = INFINITY; 5950 mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5951 mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5952 mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); 5953 mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); 5954 rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL); 5955 rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); 5956 rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); 5957 5958 ipcl_init(ipst); 5959 ip_ire_init(ipst); 5960 ip6_asp_init(ipst); 5961 ipif_init(ipst); 5962 conn_drain_init(ipst); 5963 ip_mrouter_stack_init(ipst); 5964 5965 ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT; 5966 ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; 5967 5968 ipst->ips_ip_multirt_log_interval = 1000; 5969 5970 ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT; 5971 ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT; 5972 ipst->ips_ill_index = 1; 5973 5974 ipst->ips_saved_ip_g_forward = -1; 5975 ipst->ips_reg_vif_num = ALL_VIFS; /* Index to Register vif */ 5976 5977 pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP); 5978 ipst->ips_param_arr = pa; 5979 bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr)); 5980 5981 na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP); 5982 ipst->ips_ndp_arr = na; 5983 bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 5984 ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data = 5985 (caddr_t)&ipst->ips_ip_g_forward; 5986 ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data = 5987 (caddr_t)&ipst->ips_ipv6_forward; 5988 ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name, 5989 "ip_cgtp_filter") == 0); 5990 ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = 5991 (caddr_t)&ipst->ips_ip_cgtp_filter; 5992 ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name, 5993 "ipmp_hook_emulation") == 0); 5994 ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data = 5995 (caddr_t)&ipst->ips_ipmp_hook_emulation; 5996 5997 (void) ip_param_register(&ipst->ips_ip_g_nd, 5998 ipst->ips_param_arr, A_CNT(lcl_param_arr), 5999 ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr)); 6000 6001 ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst); 6002 ipst->ips_icmp_mibkp = icmp_kstat_init(stackid); 6003 ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics); 6004 ipst->ips_ip6_kstat = 6005 ip6_kstat_init(stackid, 
&ipst->ips_ip6_statistics); 6006 6007 ipst->ips_ipmp_enable_failback = B_TRUE; 6008 6009 ipst->ips_ip_src_id = 1; 6010 rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); 6011 6012 ip_net_init(ipst, ns); 6013 ipv4_hook_init(ipst); 6014 ipv6_hook_init(ipst); 6015 6016 return (ipst); 6017 } 6018 6019 /* 6020 * Allocate and initialize a DLPI template of the specified length. (May be 6021 * called as writer.) 6022 */ 6023 mblk_t * 6024 ip_dlpi_alloc(size_t len, t_uscalar_t prim) 6025 { 6026 mblk_t *mp; 6027 6028 mp = allocb(len, BPRI_MED); 6029 if (!mp) 6030 return (NULL); 6031 6032 /* 6033 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter 6034 * of which we don't seem to use) are sent with M_PCPROTO, and 6035 * that other DLPI are M_PROTO. 6036 */ 6037 if (prim == DL_INFO_REQ) { 6038 mp->b_datap->db_type = M_PCPROTO; 6039 } else { 6040 mp->b_datap->db_type = M_PROTO; 6041 } 6042 6043 mp->b_wptr = mp->b_rptr + len; 6044 bzero(mp->b_rptr, len); 6045 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; 6046 return (mp); 6047 } 6048 6049 /* 6050 * Debug formatting routine. Returns a character string representation of the 6051 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 6052 * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 6053 * 6054 * Once the ndd table-printing interfaces are removed, this can be changed to 6055 * standard dotted-decimal form. 6056 */ 6057 char * 6058 ip_dot_addr(ipaddr_t addr, char *buf) 6059 { 6060 uint8_t *ap = (uint8_t *)&addr; 6061 6062 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", 6063 ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF); 6064 return (buf); 6065 } 6066 6067 /* 6068 * Write the given MAC address as a printable string in the usual colon- 6069 * separated format. 6070 */ 6071 const char * 6072 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) 6073 { 6074 char *bp; 6075 6076 if (alen == 0 || buflen < 4) 6077 return ("?"); 6078 bp = buf; 6079 for (;;) { 6080 /* 6081 * If there are more MAC address bytes available, but we won't 6082 * have any room to print them, then add "..." to the string 6083 * instead. See below for the 'magic number' explanation. 6084 */ 6085 if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) { 6086 (void) strcpy(bp, "..."); 6087 break; 6088 } 6089 (void) sprintf(bp, "%02x", *addr++); 6090 bp += 2; 6091 if (--alen == 0) 6092 break; 6093 *bp++ = ':'; 6094 buflen -= 3; 6095 /* 6096 * At this point, based on the first 'if' statement above, 6097 * either alen == 1 and buflen >= 3, or alen > 1 and 6098 * buflen >= 4. The first case leaves room for the final "xx" 6099 * number and trailing NUL byte. The second leaves room for at 6100 * least "...". Thus the apparently 'magic' numbers chosen for 6101 * that statement. 6102 */ 6103 } 6104 return (buf); 6105 } 6106 6107 /* 6108 * Send an ICMP error after patching up the packet appropriately. Returns 6109 * non-zero if the appropriate MIB should be bumped; zero otherwise. 
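* A typical call, as made from ip_proto_not_sup() below when an IPv4
* packet arrives for an unsupported transport protocol:
*
*	if (ip_fanout_send_icmp(q, mp, flags, ICMP_DEST_UNREACHABLE,
*	    ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) {
*		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
*	}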
6110 */ 6111 static boolean_t 6112 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, 6113 uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, 6114 zoneid_t zoneid, ip_stack_t *ipst) 6115 { 6116 ipha_t *ipha; 6117 mblk_t *first_mp; 6118 boolean_t secure; 6119 unsigned char db_type; 6120 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6121 6122 first_mp = mp; 6123 if (mctl_present) { 6124 mp = mp->b_cont; 6125 secure = ipsec_in_is_secure(first_mp); 6126 ASSERT(mp != NULL); 6127 } else { 6128 /* 6129 * If this is an ICMP error being reported - which goes 6130 * up as M_CTLs, we need to convert them to M_DATA till 6131 * we finish checking with global policy because 6132 * ipsec_check_global_policy() assumes M_DATA as clear 6133 * and M_CTL as secure. 6134 */ 6135 db_type = DB_TYPE(mp); 6136 DB_TYPE(mp) = M_DATA; 6137 secure = B_FALSE; 6138 } 6139 /* 6140 * We are generating an icmp error for some inbound packet. 6141 * Called from all ip_fanout_(udp, tcp, proto) functions. 6142 * Before we generate an error, check with global policy 6143 * to see whether this is allowed to enter the system. As 6144 * there is no "conn", we are checking with global policy. 6145 */ 6146 ipha = (ipha_t *)mp->b_rptr; 6147 if (secure || ipss->ipsec_inbound_v4_policy_present) { 6148 first_mp = ipsec_check_global_policy(first_mp, NULL, 6149 ipha, NULL, mctl_present, ipst->ips_netstack); 6150 if (first_mp == NULL) 6151 return (B_FALSE); 6152 } 6153 6154 if (!mctl_present) 6155 DB_TYPE(mp) = db_type; 6156 6157 if (flags & IP_FF_SEND_ICMP) { 6158 if (flags & IP_FF_HDR_COMPLETE) { 6159 if (ip_hdr_complete(ipha, zoneid, ipst)) { 6160 freemsg(first_mp); 6161 return (B_TRUE); 6162 } 6163 } 6164 if (flags & IP_FF_CKSUM) { 6165 /* 6166 * Have to correct checksum since 6167 * the packet might have been 6168 * fragmented and the reassembly code in ip_rput 6169 * does not restore the IP checksum. 6170 */ 6171 ipha->ipha_hdr_checksum = 0; 6172 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 6173 } 6174 switch (icmp_type) { 6175 case ICMP_DEST_UNREACHABLE: 6176 icmp_unreachable(WR(q), first_mp, icmp_code, zoneid, 6177 ipst); 6178 break; 6179 default: 6180 freemsg(first_mp); 6181 break; 6182 } 6183 } else { 6184 freemsg(first_mp); 6185 return (B_FALSE); 6186 } 6187 6188 return (B_TRUE); 6189 } 6190 6191 /* 6192 * Used to send an ICMP error message when a packet is received for 6193 * a protocol that is not supported. The mblk passed as argument 6194 * is consumed by this function. 6195 */ 6196 void 6197 ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid, 6198 ip_stack_t *ipst) 6199 { 6200 mblk_t *mp; 6201 ipha_t *ipha; 6202 ill_t *ill; 6203 ipsec_in_t *ii; 6204 6205 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 6206 ASSERT(ii->ipsec_in_type == IPSEC_IN); 6207 6208 mp = ipsec_mp->b_cont; 6209 ipsec_mp->b_cont = NULL; 6210 ipha = (ipha_t *)mp->b_rptr; 6211 /* Get ill from index in ipsec_in_t. 
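The ill is looked up mainly so that ipIfStatsInUnknownProtos can be
bumped against the receiving interface; if the lookup fails, the
message is simply freed below.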
*/ 6212 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 6213 (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL, 6214 ipst); 6215 if (ill != NULL) { 6216 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 6217 if (ip_fanout_send_icmp(q, mp, flags, 6218 ICMP_DEST_UNREACHABLE, 6219 ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) { 6220 BUMP_MIB(ill->ill_ip_mib, 6221 ipIfStatsInUnknownProtos); 6222 } 6223 } else { 6224 if (ip_fanout_send_icmp_v6(q, mp, flags, 6225 ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, 6226 0, B_FALSE, zoneid, ipst)) { 6227 BUMP_MIB(ill->ill_ip_mib, 6228 ipIfStatsInUnknownProtos); 6229 } 6230 } 6231 ill_refrele(ill); 6232 } else { /* re-link for the freemsg() below. */ 6233 ipsec_mp->b_cont = mp; 6234 } 6235 6236 /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ 6237 freemsg(ipsec_mp); 6238 } 6239 6240 /* 6241 * See if the inbound datagram has had IPsec processing applied to it. 6242 */ 6243 boolean_t 6244 ipsec_in_is_secure(mblk_t *ipsec_mp) 6245 { 6246 ipsec_in_t *ii; 6247 6248 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 6249 ASSERT(ii->ipsec_in_type == IPSEC_IN); 6250 6251 if (ii->ipsec_in_loopback) { 6252 return (ii->ipsec_in_secure); 6253 } else { 6254 return (ii->ipsec_in_ah_sa != NULL || 6255 ii->ipsec_in_esp_sa != NULL || 6256 ii->ipsec_in_decaps); 6257 } 6258 } 6259 6260 /* 6261 * Handle protocols with which IP is less intimate. There 6262 * can be more than one stream bound to a particular 6263 * protocol. When this is the case, normally each one gets a copy 6264 * of any incoming packets. 6265 * 6266 * IPsec NOTE : 6267 * 6268 * Don't allow a secure packet going up a non-secure connection. 6269 * We don't allow this because 6270 * 6271 * 1) Reply might go out in clear which will be dropped at 6272 * the sending side. 6273 * 2) If the reply goes out in clear it will give the 6274 * adversary enough information for getting the key in 6275 * most of the cases. 6276 * 6277 * Moreover getting a secure packet when we expect clear 6278 * implies that SA's were added without checking for 6279 * policy on both ends. This should not happen once ISAKMP 6280 * is used to negotiate SAs as SAs will be added only after 6281 * verifying the policy. 6282 * 6283 * NOTE : If the packet was tunneled and not multicast we only send 6284 * to it the first match. Unlike TCP and UDP fanouts this doesn't fall 6285 * back to delivering packets to AF_INET6 raw sockets. 6286 * 6287 * IPQoS Notes: 6288 * Once we have determined the client, invoke IPPF processing. 6289 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 6290 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 6291 * ip_policy will be false. 6292 * 6293 * Zones notes: 6294 * Currently only applications in the global zone can create raw sockets for 6295 * protocols other than ICMP. So unlike the broadcast / multicast case of 6296 * ip_fanout_udp(), we only send a copy of the packet to streams in the 6297 * specified zone. For ICMP, this is handled by the callers of icmp_inbound(). 
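* Delivery sketch: we walk ipcl_proto_fanout[protocol]; every match
* except the last is handed a dupmsg()/ip_copymsg() copy, and the
* final match receives the original message, so the common
* single-listener case involves no copying at all.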
6298 */ 6299 static void 6300 ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, 6301 boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, 6302 zoneid_t zoneid) 6303 { 6304 queue_t *rq; 6305 mblk_t *mp1, *first_mp1; 6306 uint_t protocol = ipha->ipha_protocol; 6307 ipaddr_t dst; 6308 boolean_t one_only; 6309 mblk_t *first_mp = mp; 6310 boolean_t secure; 6311 uint32_t ill_index; 6312 conn_t *connp, *first_connp, *next_connp; 6313 connf_t *connfp; 6314 boolean_t shared_addr; 6315 mib2_ipIfStatsEntry_t *mibptr; 6316 ip_stack_t *ipst = recv_ill->ill_ipst; 6317 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6318 6319 mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; 6320 if (mctl_present) { 6321 mp = first_mp->b_cont; 6322 secure = ipsec_in_is_secure(first_mp); 6323 ASSERT(mp != NULL); 6324 } else { 6325 secure = B_FALSE; 6326 } 6327 dst = ipha->ipha_dst; 6328 /* 6329 * If the packet was tunneled and not multicast we only send to it 6330 * the first match. 6331 */ 6332 one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) && 6333 !CLASSD(dst)); 6334 6335 shared_addr = (zoneid == ALL_ZONES); 6336 if (shared_addr) { 6337 /* 6338 * We don't allow multilevel ports for raw IP, so no need to 6339 * check for that here. 6340 */ 6341 zoneid = tsol_packet_to_zoneid(mp); 6342 } 6343 6344 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 6345 mutex_enter(&connfp->connf_lock); 6346 connp = connfp->connf_head; 6347 for (connp = connfp->connf_head; connp != NULL; 6348 connp = connp->conn_next) { 6349 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags, 6350 zoneid) && 6351 (!is_system_labeled() || 6352 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6353 connp))) { 6354 break; 6355 } 6356 } 6357 6358 if (connp == NULL || connp->conn_upq == NULL) { 6359 /* 6360 * No one bound to these addresses. Is 6361 * there a client that wants all 6362 * unclaimed datagrams? 6363 */ 6364 mutex_exit(&connfp->connf_lock); 6365 /* 6366 * Check for IPPROTO_ENCAP... 6367 */ 6368 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { 6369 /* 6370 * If an IPsec mblk is here on a multicast 6371 * tunnel (using ip_mroute stuff), check policy here, 6372 * THEN ship off to ip_mroute_decap(). 6373 * 6374 * BTW, If I match a configured IP-in-IP 6375 * tunnel, this path will not be reached, and 6376 * ip_mroute_decap will never be called. 6377 */ 6378 first_mp = ipsec_check_global_policy(first_mp, connp, 6379 ipha, NULL, mctl_present, ipst->ips_netstack); 6380 if (first_mp != NULL) { 6381 if (mctl_present) 6382 freeb(first_mp); 6383 ip_mroute_decap(q, mp, ill); 6384 } /* Else we already freed everything! */ 6385 } else { 6386 /* 6387 * Otherwise send an ICMP protocol unreachable. 6388 */ 6389 if (ip_fanout_send_icmp(q, first_mp, flags, 6390 ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE, 6391 mctl_present, zoneid, ipst)) { 6392 BUMP_MIB(mibptr, ipIfStatsInUnknownProtos); 6393 } 6394 } 6395 return; 6396 } 6397 CONN_INC_REF(connp); 6398 first_connp = connp; 6399 6400 /* 6401 * Only send message to one tunnel driver by immediately 6402 * terminating the loop. 6403 */ 6404 connp = one_only ? NULL : connp->conn_next; 6405 6406 for (;;) { 6407 while (connp != NULL) { 6408 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, 6409 flags, zoneid) && 6410 (!is_system_labeled() || 6411 tsol_receive_local(mp, &dst, IPV4_VERSION, 6412 shared_addr, connp))) 6413 break; 6414 connp = connp->conn_next; 6415 } 6416 6417 /* 6418 * Copy the packet. 
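* dupmsg() merely references the data blocks; if it fails we retry
* with ip_copymsg(), which can take the heavier path of a full copy
* (and, as its use here suggests, is prepared for a leading IPSEC_IN
* control block).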
6419 */ 6420 if (connp == NULL || connp->conn_upq == NULL || 6421 (((first_mp1 = dupmsg(first_mp)) == NULL) && 6422 ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { 6423 /* 6424 * No more interested clients or memory 6425 * allocation failed 6426 */ 6427 connp = first_connp; 6428 break; 6429 } 6430 mp1 = mctl_present ? first_mp1->b_cont : first_mp1; 6431 CONN_INC_REF(connp); 6432 mutex_exit(&connfp->connf_lock); 6433 rq = connp->conn_rq; 6434 if (!canputnext(rq)) { 6435 if (flags & IP_FF_RAWIP) { 6436 BUMP_MIB(mibptr, rawipIfStatsInOverflows); 6437 } else { 6438 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); 6439 } 6440 6441 freemsg(first_mp1); 6442 } else { 6443 /* 6444 * Don't enforce here if we're an actual tunnel - 6445 * let "tun" do it instead. 6446 */ 6447 if (!IPCL_IS_IPTUN(connp) && 6448 (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 6449 secure)) { 6450 first_mp1 = ipsec_check_inbound_policy 6451 (first_mp1, connp, ipha, NULL, 6452 mctl_present); 6453 } 6454 if (first_mp1 != NULL) { 6455 int in_flags = 0; 6456 /* 6457 * ip_fanout_proto also gets called from 6458 * icmp_inbound_error_fanout, in which case 6459 * the msg type is M_CTL. Don't add info 6460 * in this case for the time being. In future 6461 * when there is a need for knowing the 6462 * inbound iface index for ICMP error msgs, 6463 * then this can be changed. 6464 */ 6465 if (connp->conn_recvif) 6466 in_flags = IPF_RECVIF; 6467 /* 6468 * The ULP may support IP_RECVPKTINFO for both 6469 * IP v4 and v6 so pass the appropriate argument 6470 * based on conn IP version. 6471 */ 6472 if (connp->conn_ip_recvpktinfo) { 6473 if (connp->conn_af_isv6) { 6474 /* 6475 * V6 only needs index 6476 */ 6477 in_flags |= IPF_RECVIF; 6478 } else { 6479 /* 6480 * V4 needs index + 6481 * matching address. 6482 */ 6483 in_flags |= IPF_RECVADDR; 6484 } 6485 } 6486 if ((in_flags != 0) && 6487 (mp->b_datap->db_type != M_CTL)) { 6488 /* 6489 * the actual data will be 6490 * contained in b_cont upon 6491 * successful return of the 6492 * following call else 6493 * original mblk is returned 6494 */ 6495 ASSERT(recv_ill != NULL); 6496 mp1 = ip_add_info(mp1, recv_ill, 6497 in_flags, IPCL_ZONEID(connp), ipst); 6498 } 6499 BUMP_MIB(mibptr, ipIfStatsHCInDelivers); 6500 if (mctl_present) 6501 freeb(first_mp1); 6502 (connp->conn_recv)(connp, mp1, NULL); 6503 } 6504 } 6505 mutex_enter(&connfp->connf_lock); 6506 /* Follow the next pointer before releasing the conn. */ 6507 next_connp = connp->conn_next; 6508 CONN_DEC_REF(connp); 6509 connp = next_connp; 6510 } 6511 6512 /* Last one. Send it upstream. */ 6513 mutex_exit(&connfp->connf_lock); 6514 6515 /* 6516 * If this packet is coming from icmp_inbound_error_fanout ip_policy 6517 * will be set to false. 6518 */ 6519 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6520 ill_index = ill->ill_phyint->phyint_ifindex; 6521 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6522 if (mp == NULL) { 6523 CONN_DEC_REF(connp); 6524 if (mctl_present) { 6525 freeb(first_mp); 6526 } 6527 return; 6528 } 6529 } 6530 6531 rq = connp->conn_rq; 6532 if (!canputnext(rq)) { 6533 if (flags & IP_FF_RAWIP) { 6534 BUMP_MIB(mibptr, rawipIfStatsInOverflows); 6535 } else { 6536 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); 6537 } 6538 6539 freemsg(first_mp); 6540 } else { 6541 if (IPCL_IS_IPTUN(connp)) { 6542 /* 6543 * Tunneled packet. We enforce policy in the tunnel 6544 * module itself. 6545 * 6546 * Send the WHOLE packet up (incl. IPSEC_IN) without 6547 * a policy check. 6548 * FIXME to use conn_recv for tun later. 
6549 */ 6550 putnext(rq, first_mp); 6551 CONN_DEC_REF(connp); 6552 return; 6553 } 6554 6555 if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) { 6556 first_mp = ipsec_check_inbound_policy(first_mp, connp, 6557 ipha, NULL, mctl_present); 6558 } 6559 6560 if (first_mp != NULL) { 6561 int in_flags = 0; 6562 6563 /* 6564 * ip_fanout_proto also gets called 6565 * from icmp_inbound_error_fanout, in 6566 * which case the msg type is M_CTL. 6567 * Don't add info in this case for time 6568 * being. In future when there is a 6569 * need for knowing the inbound iface 6570 * index for ICMP error msgs, then this 6571 * can be changed 6572 */ 6573 if (connp->conn_recvif) 6574 in_flags = IPF_RECVIF; 6575 if (connp->conn_ip_recvpktinfo) { 6576 if (connp->conn_af_isv6) { 6577 /* 6578 * V6 only needs index 6579 */ 6580 in_flags |= IPF_RECVIF; 6581 } else { 6582 /* 6583 * V4 needs index + 6584 * matching address. 6585 */ 6586 in_flags |= IPF_RECVADDR; 6587 } 6588 } 6589 if ((in_flags != 0) && 6590 (mp->b_datap->db_type != M_CTL)) { 6591 6592 /* 6593 * the actual data will be contained in 6594 * b_cont upon successful return 6595 * of the following call else original 6596 * mblk is returned 6597 */ 6598 ASSERT(recv_ill != NULL); 6599 mp = ip_add_info(mp, recv_ill, 6600 in_flags, IPCL_ZONEID(connp), ipst); 6601 } 6602 BUMP_MIB(mibptr, ipIfStatsHCInDelivers); 6603 (connp->conn_recv)(connp, mp, NULL); 6604 if (mctl_present) 6605 freeb(first_mp); 6606 } 6607 } 6608 CONN_DEC_REF(connp); 6609 } 6610 6611 /* 6612 * Fanout for TCP packets 6613 * The caller puts <fport, lport> in the ports parameter. 6614 * 6615 * IPQoS Notes 6616 * Before sending it to the client, invoke IPPF processing. 6617 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 6618 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 6619 * ip_policy is false. 6620 */ 6621 static void 6622 ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, 6623 uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) 6624 { 6625 mblk_t *first_mp; 6626 boolean_t secure; 6627 uint32_t ill_index; 6628 int ip_hdr_len; 6629 tcph_t *tcph; 6630 boolean_t syn_present = B_FALSE; 6631 conn_t *connp; 6632 ip_stack_t *ipst = recv_ill->ill_ipst; 6633 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6634 6635 ASSERT(recv_ill != NULL); 6636 6637 first_mp = mp; 6638 if (mctl_present) { 6639 ASSERT(first_mp->b_datap->db_type == M_CTL); 6640 mp = first_mp->b_cont; 6641 secure = ipsec_in_is_secure(first_mp); 6642 ASSERT(mp != NULL); 6643 } else { 6644 secure = B_FALSE; 6645 } 6646 6647 ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); 6648 6649 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 6650 zoneid, ipst)) == NULL) { 6651 /* 6652 * No connected connection or listener. Send a 6653 * TH_RST via tcp_xmit_listeners_reset. 6654 */ 6655 6656 /* Initiate IPPf processing, if needed. 
*/ 6657 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 6658 uint32_t ill_index; 6659 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6660 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 6661 if (first_mp == NULL) 6662 return; 6663 } 6664 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6665 ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", 6666 zoneid)); 6667 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6668 ipst->ips_netstack->netstack_tcp, NULL); 6669 return; 6670 } 6671 6672 /* 6673 * Allocate the SYN for the TCP connection here itself 6674 */ 6675 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6676 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 6677 if (IPCL_IS_TCP(connp)) { 6678 squeue_t *sqp; 6679 6680 /* 6681 * For fused tcp loopback, assign the eager's 6682 * squeue to be that of the active connect's. 6683 * Note that we don't check for IP_FF_LOOPBACK 6684 * here since this routine gets called only 6685 * for loopback (unlike the IPv6 counterpart). 6686 */ 6687 ASSERT(Q_TO_CONN(q) != NULL); 6688 if (do_tcp_fusion && 6689 !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && 6690 !secure && 6691 !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy && 6692 IPCL_IS_TCP(Q_TO_CONN(q))) { 6693 ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); 6694 sqp = Q_TO_CONN(q)->conn_sqp; 6695 } else { 6696 sqp = IP_SQUEUE_GET(lbolt); 6697 } 6698 6699 mp->b_datap->db_struioflag |= STRUIO_EAGER; 6700 DB_CKSUMSTART(mp) = (intptr_t)sqp; 6701 syn_present = B_TRUE; 6702 } 6703 } 6704 6705 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 6706 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 6707 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6708 if ((flags & TH_RST) || (flags & TH_URG)) { 6709 CONN_DEC_REF(connp); 6710 freemsg(first_mp); 6711 return; 6712 } 6713 if (flags & TH_ACK) { 6714 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6715 ipst->ips_netstack->netstack_tcp, connp); 6716 CONN_DEC_REF(connp); 6717 return; 6718 } 6719 6720 CONN_DEC_REF(connp); 6721 freemsg(first_mp); 6722 return; 6723 } 6724 6725 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 6726 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6727 NULL, mctl_present); 6728 if (first_mp == NULL) { 6729 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6730 CONN_DEC_REF(connp); 6731 return; 6732 } 6733 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 6734 ASSERT(syn_present); 6735 if (mctl_present) { 6736 ASSERT(first_mp != mp); 6737 first_mp->b_datap->db_struioflag |= 6738 STRUIO_POLICY; 6739 } else { 6740 ASSERT(first_mp == mp); 6741 mp->b_datap->db_struioflag &= 6742 ~STRUIO_EAGER; 6743 mp->b_datap->db_struioflag |= 6744 STRUIO_POLICY; 6745 } 6746 } else { 6747 /* 6748 * Discard first_mp early since we're dealing with a 6749 * fully-connected conn_t and tcp doesn't do policy in 6750 * this case. 6751 */ 6752 if (mctl_present) { 6753 freeb(first_mp); 6754 mctl_present = B_FALSE; 6755 } 6756 first_mp = mp; 6757 } 6758 } 6759 6760 /* 6761 * Initiate policy processing here if needed. If we get here from 6762 * icmp_inbound_error_fanout, ip_policy is false. 
6763 */ 6764 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6765 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6766 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6767 if (mp == NULL) { 6768 CONN_DEC_REF(connp); 6769 if (mctl_present) 6770 freeb(first_mp); 6771 return; 6772 } else if (mctl_present) { 6773 ASSERT(first_mp != mp); 6774 first_mp->b_cont = mp; 6775 } else { 6776 first_mp = mp; 6777 } 6778 } 6779 6780 6781 6782 /* Handle socket options. */ 6783 if (!syn_present && 6784 connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 6785 /* Add header */ 6786 ASSERT(recv_ill != NULL); 6787 /* 6788 * Since tcp does not support IP_RECVPKTINFO for V4, only pass 6789 * IPF_RECVIF. 6790 */ 6791 mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), 6792 ipst); 6793 if (mp == NULL) { 6794 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6795 CONN_DEC_REF(connp); 6796 if (mctl_present) 6797 freeb(first_mp); 6798 return; 6799 } else if (mctl_present) { 6800 /* 6801 * ip_add_info might return a new mp. 6802 */ 6803 ASSERT(first_mp != mp); 6804 first_mp->b_cont = mp; 6805 } else { 6806 first_mp = mp; 6807 } 6808 } 6809 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6810 if (IPCL_IS_TCP(connp)) { 6811 /* do not drain, certain use cases can blow the stack */ 6812 squeue_enter_nodrain(connp->conn_sqp, first_mp, 6813 connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); 6814 } else { 6815 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 6816 (connp->conn_recv)(connp, first_mp, NULL); 6817 CONN_DEC_REF(connp); 6818 } 6819 } 6820 6821 /* 6822 * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or 6823 * pass it along to ESP if the SPI is non-zero. Returns TRUE if the mblk 6824 * is not consumed. 6825 * 6826 * One of four things can happen, all of which affect the passed-in mblk: 6827 * 6828 * 1.) ICMP messages that go through here just get returned TRUE. 6829 * 6830 * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. 6831 * 6832 * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent 6833 * ESP packet, and is passed along to ESP for consumption. Return FALSE. 6834 * 6835 * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. 6836 */ 6837 static boolean_t 6838 zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, 6839 ipsec_stack_t *ipss) 6840 { 6841 int shift, plen, iph_len; 6842 ipha_t *ipha; 6843 udpha_t *udpha; 6844 uint32_t *spi; 6845 uint8_t *orptr; 6846 boolean_t udp_pkt, free_ire; 6847 6848 if (DB_TYPE(mp) == M_CTL) { 6849 /* 6850 * ICMP message with UDP inside. Don't bother stripping, just 6851 * send it up. 6852 * 6853 * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going 6854 * to ignore errors set by ICMP anyway ('cause they might be 6855 * forged), but that's the app's decision, not ours. 6856 */ 6857 6858 /* Bunch of reality checks for DEBUG kernels... */ 6859 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 6860 ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); 6861 6862 return (B_TRUE); 6863 } 6864 6865 ipha = (ipha_t *)mp->b_rptr; 6866 iph_len = IPH_HDR_LENGTH(ipha); 6867 plen = ntohs(ipha->ipha_length); 6868 6869 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { 6870 /* 6871 * Most likely a keepalive for the benefit of an intervening 6872 * NAT. These aren't for us, per se, so drop it. 6873 * 6874 * RFC 3947/8 doesn't say for sure what to do for 2-3 6875 * byte packets (keepalives are 1-byte), but we'll drop them 6876 * also. 
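* For instance, the standard 1-byte NAT-T keepalive (a single 0xff
* octet) arrives with plen == iph_len + sizeof (udpha_t) + 1, so its
* payload is smaller than the 4-byte SPI that the check below
* requires.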
6877 */ 6878 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6879 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); 6880 return (B_FALSE); 6881 } 6882 6883 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { 6884 /* might as well pull it all up - it might be ESP. */ 6885 if (!pullupmsg(mp, -1)) { 6886 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6887 DROPPER(ipss, ipds_esp_nomem), 6888 &ipss->ipsec_dropper); 6889 return (B_FALSE); 6890 } 6891 6892 ipha = (ipha_t *)mp->b_rptr; 6893 } 6894 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t)); 6895 if (*spi == 0) { 6896 /* UDP packet - remove 0-spi. */ 6897 shift = sizeof (uint32_t); 6898 } else { 6899 /* ESP-in-UDP packet - reduce to ESP. */ 6900 ipha->ipha_protocol = IPPROTO_ESP; 6901 shift = sizeof (udpha_t); 6902 } 6903 6904 /* Fix IP header */ 6905 ipha->ipha_length = htons(plen - shift); 6906 ipha->ipha_hdr_checksum = 0; 6907 6908 orptr = mp->b_rptr; 6909 mp->b_rptr += shift; 6910 6911 if (*spi == 0) { 6912 ASSERT((uint8_t *)ipha == orptr); 6913 udpha = (udpha_t *)(orptr + iph_len); 6914 udpha->uha_length = htons(plen - shift - iph_len); 6915 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */ 6916 udp_pkt = B_TRUE; 6917 } else { 6918 udp_pkt = B_FALSE; 6919 } 6920 ovbcopy(orptr, orptr + shift, iph_len); 6921 if (!udp_pkt) /* Punt up for ESP processing. */ { 6922 ipha = (ipha_t *)(orptr + shift); 6923 6924 free_ire = (ire == NULL); 6925 if (free_ire) { 6926 /* Re-acquire ire. */ 6927 ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, 6928 ipss->ipsec_netstack->netstack_ip); 6929 if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { 6930 if (ire != NULL) 6931 ire_refrele(ire); 6932 /* 6933 * Do a regular freemsg(), as this is an IP 6934 * error (no local route) not an IPsec one. 6935 */ 6936 freemsg(mp); 6937 } 6938 } 6939 6940 ip_proto_input(q, mp, ipha, ire, recv_ill, B_TRUE); 6941 if (free_ire) 6942 ire_refrele(ire); 6943 } 6944 6945 return (udp_pkt); 6946 } 6947 6948 /* 6949 * Deliver a udp packet to the given conn, possibly applying ipsec policy. 6950 * We are responsible for disposing of mp, such as by freemsg() or putnext() 6951 * Caller is responsible for dropping references to the conn, and freeing 6952 * first_mp. 6953 * 6954 * IPQoS Notes 6955 * Before sending it to the client, invoke IPPF processing. Policy processing 6956 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and 6957 * ip_policy is true. If we get here from icmp_inbound_error_fanout or 6958 * ip_wput_local, ip_policy is false. 
6959 */ 6960 static void 6961 ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, 6962 boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, 6963 boolean_t ip_policy) 6964 { 6965 boolean_t mctl_present = (first_mp != NULL); 6966 uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ 6967 uint32_t ill_index; 6968 ip_stack_t *ipst = recv_ill->ill_ipst; 6969 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6970 6971 ASSERT(ill != NULL); 6972 6973 if (mctl_present) 6974 first_mp->b_cont = mp; 6975 else 6976 first_mp = mp; 6977 6978 if (CONN_UDP_FLOWCTLD(connp)) { 6979 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 6980 freemsg(first_mp); 6981 return; 6982 } 6983 6984 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 6985 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6986 NULL, mctl_present); 6987 if (first_mp == NULL) { 6988 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 6989 return; /* Freed by ipsec_check_inbound_policy(). */ 6990 } 6991 } 6992 if (mctl_present) 6993 freeb(first_mp); 6994 6995 /* Let's hope the compilers utter "branch, predict-not-taken..." ;) */ 6996 if (connp->conn_udp->udp_nat_t_endpoint) { 6997 if (mctl_present) { 6998 /* mctl_present *shouldn't* happen. */ 6999 ip_drop_packet(mp, B_TRUE, NULL, NULL, 7000 DROPPER(ipss, ipds_esp_nat_t_ipsec), 7001 &ipss->ipsec_dropper); 7002 return; 7003 } 7004 7005 if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) 7006 return; 7007 } 7008 7009 /* Handle options. */ 7010 if (connp->conn_recvif) 7011 in_flags = IPF_RECVIF; 7012 /* 7013 * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag 7014 * passed to ip_add_info is based on IP version of connp. 7015 */ 7016 if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 7017 if (connp->conn_af_isv6) { 7018 /* 7019 * V6 only needs index 7020 */ 7021 in_flags |= IPF_RECVIF; 7022 } else { 7023 /* 7024 * V4 needs index + matching address. 7025 */ 7026 in_flags |= IPF_RECVADDR; 7027 } 7028 } 7029 7030 if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) 7031 in_flags |= IPF_RECVSLLA; 7032 7033 /* 7034 * Initiate IPPF processing here, if needed. Note first_mp won't be 7035 * freed if the packet is dropped. The caller will do so. 7036 */ 7037 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 7038 ill_index = recv_ill->ill_phyint->phyint_ifindex; 7039 ip_process(IPP_LOCAL_IN, &mp, ill_index); 7040 if (mp == NULL) { 7041 return; 7042 } 7043 } 7044 if ((in_flags != 0) && 7045 (mp->b_datap->db_type != M_CTL)) { 7046 /* 7047 * The actual data will be contained in b_cont 7048 * upon successful return of the following call 7049 * else original mblk is returned 7050 */ 7051 ASSERT(recv_ill != NULL); 7052 mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), 7053 ipst); 7054 } 7055 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 7056 /* Send it upstream */ 7057 (connp->conn_recv)(connp, mp, NULL); 7058 } 7059 7060 /* 7061 * Fanout for UDP packets. 7062 * The caller puts <fport, lport> in the ports parameter. 7063 * 7064 * If SO_REUSEADDR is set all multicast and broadcast packets 7065 * will be delivered to all streams bound to the same port. 7066 * 7067 * Zones notes: 7068 * Multicast and broadcast packets will be distributed to streams in all zones. 7069 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an 7070 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 7071 * packets. 
To maintain this behavior with multiple zones, the conns are grouped 7072 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in 7073 * each zone. If unset, all the following conns in the same zone are skipped. 7074 */ 7075 static void 7076 ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 7077 uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, 7078 boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) 7079 { 7080 uint32_t dstport, srcport; 7081 ipaddr_t dst; 7082 mblk_t *first_mp; 7083 boolean_t secure; 7084 in6_addr_t v6src; 7085 conn_t *connp; 7086 connf_t *connfp; 7087 conn_t *first_connp; 7088 conn_t *next_connp; 7089 mblk_t *mp1, *first_mp1; 7090 ipaddr_t src; 7091 zoneid_t last_zoneid; 7092 boolean_t reuseaddr; 7093 boolean_t shared_addr; 7094 boolean_t unlabeled; 7095 ip_stack_t *ipst; 7096 7097 ASSERT(recv_ill != NULL); 7098 ipst = recv_ill->ill_ipst; 7099 7100 first_mp = mp; 7101 if (mctl_present) { 7102 mp = first_mp->b_cont; 7103 first_mp->b_cont = NULL; 7104 secure = ipsec_in_is_secure(first_mp); 7105 ASSERT(mp != NULL); 7106 } else { 7107 first_mp = NULL; 7108 secure = B_FALSE; 7109 } 7110 7111 /* Extract ports in net byte order */ 7112 dstport = htons(ntohl(ports) & 0xFFFF); 7113 srcport = htons(ntohl(ports) >> 16); 7114 dst = ipha->ipha_dst; 7115 src = ipha->ipha_src; 7116 7117 unlabeled = B_FALSE; 7118 if (is_system_labeled()) 7119 /* Cred cannot be null on IPv4 */ 7120 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 7121 TSLF_UNLABELED) != 0; 7122 shared_addr = (zoneid == ALL_ZONES); 7123 if (shared_addr) { 7124 /* 7125 * No need to handle exclusive-stack zones since ALL_ZONES 7126 * only applies to the shared stack. 7127 */ 7128 zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); 7129 /* 7130 * If no shared MLP is found, tsol_mlp_findzone returns 7131 * ALL_ZONES. In that case, we assume it's SLP, and 7132 * search for the zone based on the packet label. 7133 * 7134 * If there is such a zone, we prefer to find a 7135 * connection in it. Otherwise, we look for a 7136 * MAC-exempt connection in any zone whose label 7137 * dominates the default label on the packet. 7138 */ 7139 if (zoneid == ALL_ZONES) 7140 zoneid = tsol_packet_to_zoneid(mp); 7141 else 7142 unlabeled = B_FALSE; 7143 } 7144 7145 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7146 mutex_enter(&connfp->connf_lock); 7147 connp = connfp->connf_head; 7148 if (!broadcast && !CLASSD(dst)) { 7149 /* 7150 * Not broadcast or multicast. Send to the one (first) 7151 * client we find. No need to check conn_wantpacket() 7152 * since IP_BOUND_IF/conn_incoming_ill does not apply to 7153 * IPv4 unicast packets. 7154 */ 7155 while ((connp != NULL) && 7156 (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) || 7157 (!IPCL_ZONE_MATCH(connp, zoneid) && 7158 !(unlabeled && connp->conn_mac_exempt)))) { 7159 /* 7160 * We keep searching since the conn did not match, 7161 * or its zone did not match and it is not either 7162 * an allzones conn or a mac exempt conn (if the 7163 * sender is unlabeled.) 
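 *
 * (As an aside, the <fport, lport> packing unpacked near the top of
 * this function keeps both ports in one uint32_t, source port in the
 * upper 16 bits. A sketch of the round trip, with sport/dport in
 * network byte order:
 *
 *	uint32_t ports = htonl(((uint32_t)ntohs(sport) << 16) |
 *	    ntohs(dport));
 *	uint16_t dstport = htons(ntohl(ports) & 0xFFFF);
 *	uint16_t srcport = htons(ntohl(ports) >> 16);
 *
 * The htons/ntohl dance simply converts each half back to network
 * byte order for the fanout hash and match macros.)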
7164 */ 7165 connp = connp->conn_next; 7166 } 7167 7168 if (connp == NULL || connp->conn_upq == NULL) 7169 goto notfound; 7170 7171 if (is_system_labeled() && 7172 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7173 connp)) 7174 goto notfound; 7175 7176 CONN_INC_REF(connp); 7177 mutex_exit(&connfp->connf_lock); 7178 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7179 flags, recv_ill, ip_policy); 7180 IP_STAT(ipst, ip_udp_fannorm); 7181 CONN_DEC_REF(connp); 7182 return; 7183 } 7184 7185 /* 7186 * Broadcast and multicast case 7187 * 7188 * Need to check conn_wantpacket(). 7189 * If SO_REUSEADDR has been set on the first we send the 7190 * packet to all clients that have joined the group and 7191 * match the port. 7192 */ 7193 7194 while (connp != NULL) { 7195 if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && 7196 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7197 (!is_system_labeled() || 7198 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7199 connp))) 7200 break; 7201 connp = connp->conn_next; 7202 } 7203 7204 if (connp == NULL || connp->conn_upq == NULL) 7205 goto notfound; 7206 7207 first_connp = connp; 7208 /* 7209 * When SO_REUSEADDR is not set, send the packet only to the first 7210 * matching connection in its zone by keeping track of the zoneid. 7211 */ 7212 reuseaddr = first_connp->conn_reuseaddr; 7213 last_zoneid = first_connp->conn_zoneid; 7214 7215 CONN_INC_REF(connp); 7216 connp = connp->conn_next; 7217 for (;;) { 7218 while (connp != NULL) { 7219 if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && 7220 (reuseaddr || connp->conn_zoneid != last_zoneid) && 7221 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7222 (!is_system_labeled() || 7223 tsol_receive_local(mp, &dst, IPV4_VERSION, 7224 shared_addr, connp))) 7225 break; 7226 connp = connp->conn_next; 7227 } 7228 /* 7229 * Just copy the data part alone. The mctl part is 7230 * needed just for verifying policy and it is never 7231 * sent up. 7232 */ 7233 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7234 ((mp1 = copymsg(mp)) == NULL))) { 7235 /* 7236 * No more interested clients or memory 7237 * allocation failed 7238 */ 7239 connp = first_connp; 7240 break; 7241 } 7242 if (connp->conn_zoneid != last_zoneid) { 7243 /* 7244 * Update the zoneid so that the packet isn't sent to 7245 * any more conns in the same zone unless SO_REUSEADDR 7246 * is set. 7247 */ 7248 reuseaddr = connp->conn_reuseaddr; 7249 last_zoneid = connp->conn_zoneid; 7250 } 7251 if (first_mp != NULL) { 7252 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7253 ipsec_info_type == IPSEC_IN); 7254 first_mp1 = ipsec_in_tag(first_mp, NULL, 7255 ipst->ips_netstack); 7256 if (first_mp1 == NULL) { 7257 freemsg(mp1); 7258 connp = first_connp; 7259 break; 7260 } 7261 } else { 7262 first_mp1 = NULL; 7263 } 7264 CONN_INC_REF(connp); 7265 mutex_exit(&connfp->connf_lock); 7266 /* 7267 * IPQoS notes: We don't send the packet for policy 7268 * processing here, will do it for the last one (below). 7269 * i.e. we do it per-packet now, but if we do policy 7270 * processing per-conn, then we would need to do it 7271 * here too. 7272 */ 7273 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7274 ipha, flags, recv_ill, B_FALSE); 7275 mutex_enter(&connfp->connf_lock); 7276 /* Follow the next pointer before releasing the conn. */ 7277 next_connp = connp->conn_next; 7278 IP_STAT(ipst, ip_udp_fanmb); 7279 CONN_DEC_REF(connp); 7280 connp = next_connp; 7281 } 7282 7283 /* Last one. Send it upstream. 
*/ 7284 mutex_exit(&connfp->connf_lock); 7285 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7286 recv_ill, ip_policy); 7287 IP_STAT(ipst, ip_udp_fanmb); 7288 CONN_DEC_REF(connp); 7289 return; 7290 7291 notfound: 7292 7293 mutex_exit(&connfp->connf_lock); 7294 IP_STAT(ipst, ip_udp_fanothers); 7295 /* 7296 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses 7297 * have already been matched above, since they live in the IPv4 7298 * fanout tables. This implies we only need to 7299 * check for IPv6 in6addr_any endpoints here. 7300 * Thus we compare using ipv6_all_zeros instead of the destination 7301 * address, except for the multicast group membership lookup which 7302 * uses the IPv4 destination. 7303 */ 7304 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 7305 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7306 mutex_enter(&connfp->connf_lock); 7307 connp = connfp->connf_head; 7308 if (!broadcast && !CLASSD(dst)) { 7309 while (connp != NULL) { 7310 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7311 srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) && 7312 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7313 !connp->conn_ipv6_v6only) 7314 break; 7315 connp = connp->conn_next; 7316 } 7317 7318 if (connp != NULL && is_system_labeled() && 7319 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7320 connp)) 7321 connp = NULL; 7322 7323 if (connp == NULL || connp->conn_upq == NULL) { 7324 /* 7325 * No one bound to this port. Is 7326 * there a client that wants all 7327 * unclaimed datagrams? 7328 */ 7329 mutex_exit(&connfp->connf_lock); 7330 7331 if (mctl_present) 7332 first_mp->b_cont = mp; 7333 else 7334 first_mp = mp; 7335 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP]. 7336 connf_head != NULL) { 7337 ip_fanout_proto(q, first_mp, ill, ipha, 7338 flags | IP_FF_RAWIP, mctl_present, 7339 ip_policy, recv_ill, zoneid); 7340 } else { 7341 if (ip_fanout_send_icmp(q, first_mp, flags, 7342 ICMP_DEST_UNREACHABLE, 7343 ICMP_PORT_UNREACHABLE, 7344 mctl_present, zoneid, ipst)) { 7345 BUMP_MIB(ill->ill_ip_mib, 7346 udpIfStatsNoPorts); 7347 } 7348 } 7349 return; 7350 } 7351 7352 CONN_INC_REF(connp); 7353 mutex_exit(&connfp->connf_lock); 7354 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7355 flags, recv_ill, ip_policy); 7356 CONN_DEC_REF(connp); 7357 return; 7358 } 7359 /* 7360 * IPv4 multicast packet being delivered to an AF_INET6 7361 * in6addr_any endpoint. 7362 * Need to check conn_wantpacket(). Note that we use conn_wantpacket() 7363 * and not conn_wantpacket_v6() since any multicast membership is 7364 * for an IPv4-mapped multicast address. 7365 * The packet is sent to all clients in all zones that have joined the 7366 * group and match the port. 7367 */ 7368 while (connp != NULL) { 7369 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7370 srcport, v6src) && 7371 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7372 (!is_system_labeled() || 7373 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7374 connp))) 7375 break; 7376 connp = connp->conn_next; 7377 } 7378 7379 if (connp == NULL || connp->conn_upq == NULL) { 7380 /* 7381 * No one bound to this port. Is 7382 * there a client that wants all 7383 * unclaimed datagrams? 
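 *
 * (The v4-mapped form used for the IPCL_UDP_MATCH_V6 checks above
 * embeds the IPv4 address in the low-order four bytes of an IPv6
 * address with a ::ffff: prefix. A user-level sketch of what
 * IN6_IPADDR_TO_V4MAPPED produces, where "v4" is the IPv4 address in
 * network byte order:
 *
 *	struct in6_addr v6;
 *	(void) memset(&v6, 0, sizeof (v6));
 *	v6.s6_addr[10] = 0xff;
 *	v6.s6_addr[11] = 0xff;
 *	(void) memcpy(&v6.s6_addr[12], &v4, sizeof (ipaddr_t));
 *	\* v6 now holds ::ffff:a.b.c.d *\
 *
 * so an AF_INET6 endpoint bound to :: can match IPv4 traffic.)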
7384 */ 7385 mutex_exit(&connfp->connf_lock); 7386 7387 if (mctl_present) 7388 first_mp->b_cont = mp; 7389 else 7390 first_mp = mp; 7391 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head != 7392 NULL) { 7393 ip_fanout_proto(q, first_mp, ill, ipha, 7394 flags | IP_FF_RAWIP, mctl_present, ip_policy, 7395 recv_ill, zoneid); 7396 } else { 7397 /* 7398 * We used to attempt to send an icmp error here, but 7399 * since this is known to be a multicast packet 7400 * and we don't send icmp errors in response to 7401 * multicast, just drop the packet and give up sooner. 7402 */ 7403 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); 7404 freemsg(first_mp); 7405 } 7406 return; 7407 } 7408 7409 first_connp = connp; 7410 7411 CONN_INC_REF(connp); 7412 connp = connp->conn_next; 7413 for (;;) { 7414 while (connp != NULL) { 7415 if (IPCL_UDP_MATCH_V6(connp, dstport, 7416 ipv6_all_zeros, srcport, v6src) && 7417 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7418 (!is_system_labeled() || 7419 tsol_receive_local(mp, &dst, IPV4_VERSION, 7420 shared_addr, connp))) 7421 break; 7422 connp = connp->conn_next; 7423 } 7424 /* 7425 * Just copy the data part alone. The mctl part is 7426 * needed just for verifying policy and it is never 7427 * sent up. 7428 */ 7429 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7430 ((mp1 = copymsg(mp)) == NULL))) { 7431 /* 7432 * No more intested clients or memory 7433 * allocation failed 7434 */ 7435 connp = first_connp; 7436 break; 7437 } 7438 if (first_mp != NULL) { 7439 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7440 ipsec_info_type == IPSEC_IN); 7441 first_mp1 = ipsec_in_tag(first_mp, NULL, 7442 ipst->ips_netstack); 7443 if (first_mp1 == NULL) { 7444 freemsg(mp1); 7445 connp = first_connp; 7446 break; 7447 } 7448 } else { 7449 first_mp1 = NULL; 7450 } 7451 CONN_INC_REF(connp); 7452 mutex_exit(&connfp->connf_lock); 7453 /* 7454 * IPQoS notes: We don't send the packet for policy 7455 * processing here, will do it for the last one (below). 7456 * i.e. we do it per-packet now, but if we do policy 7457 * processing per-conn, then we would need to do it 7458 * here too. 7459 */ 7460 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7461 ipha, flags, recv_ill, B_FALSE); 7462 mutex_enter(&connfp->connf_lock); 7463 /* Follow the next pointer before releasing the conn. */ 7464 next_connp = connp->conn_next; 7465 CONN_DEC_REF(connp); 7466 connp = next_connp; 7467 } 7468 7469 /* Last one. Send it upstream. */ 7470 mutex_exit(&connfp->connf_lock); 7471 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7472 recv_ill, ip_policy); 7473 CONN_DEC_REF(connp); 7474 } 7475 7476 /* 7477 * Complete the ip_wput header so that it 7478 * is possible to generate ICMP 7479 * errors. 
7480 */ 7481 int 7482 ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) 7483 { 7484 ire_t *ire; 7485 7486 if (ipha->ipha_src == INADDR_ANY) { 7487 ire = ire_lookup_local(zoneid, ipst); 7488 if (ire == NULL) { 7489 ip1dbg(("ip_hdr_complete: no source IRE\n")); 7490 return (1); 7491 } 7492 ipha->ipha_src = ire->ire_addr; 7493 ire_refrele(ire); 7494 } 7495 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 7496 ipha->ipha_hdr_checksum = 0; 7497 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 7498 return (0); 7499 } 7500 7501 /* 7502 * Nobody should be sending 7503 * packets up this stream 7504 */ 7505 static void 7506 ip_lrput(queue_t *q, mblk_t *mp) 7507 { 7508 mblk_t *mp1; 7509 7510 switch (mp->b_datap->db_type) { 7511 case M_FLUSH: 7512 /* Turn around */ 7513 if (*mp->b_rptr & FLUSHW) { 7514 *mp->b_rptr &= ~FLUSHR; 7515 qreply(q, mp); 7516 return; 7517 } 7518 break; 7519 } 7520 /* Could receive messages that passed through ar_rput */ 7521 for (mp1 = mp; mp1; mp1 = mp1->b_cont) 7522 mp1->b_prev = mp1->b_next = NULL; 7523 freemsg(mp); 7524 } 7525 7526 /* Nobody should be sending packets down this stream */ 7527 /* ARGSUSED */ 7528 void 7529 ip_lwput(queue_t *q, mblk_t *mp) 7530 { 7531 freemsg(mp); 7532 } 7533 7534 /* 7535 * Move the first hop in any source route to ipha_dst and remove that part of 7536 * the source route. Called by other protocols. Errors in option formatting 7537 * are ignored - will be handled by ip_wput_options Return the final 7538 * destination (either ipha_dst or the last entry in a source route.) 7539 */ 7540 ipaddr_t 7541 ip_massage_options(ipha_t *ipha, netstack_t *ns) 7542 { 7543 ipoptp_t opts; 7544 uchar_t *opt; 7545 uint8_t optval; 7546 uint8_t optlen; 7547 ipaddr_t dst; 7548 int i; 7549 ire_t *ire; 7550 ip_stack_t *ipst = ns->netstack_ip; 7551 7552 ip2dbg(("ip_massage_options\n")); 7553 dst = ipha->ipha_dst; 7554 for (optval = ipoptp_first(&opts, ipha); 7555 optval != IPOPT_EOL; 7556 optval = ipoptp_next(&opts)) { 7557 opt = opts.ipoptp_cur; 7558 switch (optval) { 7559 uint8_t off; 7560 case IPOPT_SSRR: 7561 case IPOPT_LSRR: 7562 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 7563 ip1dbg(("ip_massage_options: bad src route\n")); 7564 break; 7565 } 7566 optlen = opts.ipoptp_len; 7567 off = opt[IPOPT_OFFSET]; 7568 off--; 7569 redo_srr: 7570 if (optlen < IP_ADDR_LEN || 7571 off > optlen - IP_ADDR_LEN) { 7572 /* End of source route */ 7573 ip1dbg(("ip_massage_options: end of SR\n")); 7574 break; 7575 } 7576 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 7577 ip1dbg(("ip_massage_options: next hop 0x%x\n", 7578 ntohl(dst))); 7579 /* 7580 * Check if our address is present more than 7581 * once as consecutive hops in source route. 7582 * XXX verify per-interface ip_forwarding 7583 * for source route? 7584 */ 7585 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 7586 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7587 if (ire != NULL) { 7588 ire_refrele(ire); 7589 off += IP_ADDR_LEN; 7590 goto redo_srr; 7591 } 7592 if (dst == htonl(INADDR_LOOPBACK)) { 7593 ip1dbg(("ip_massage_options: loopback addr in " 7594 "source route!\n")); 7595 break; 7596 } 7597 /* 7598 * Update ipha_dst to be the first hop and remove the 7599 * first hop from the source route (by overwriting 7600 * part of the option with NOP options). 
7601 */ 7602 ipha->ipha_dst = dst; 7603 /* Put the last entry in dst */ 7604 off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) + 7605 3; 7606 bcopy(&opt[off], &dst, IP_ADDR_LEN); 7607 7608 ip1dbg(("ip_massage_options: last hop 0x%x\n", 7609 ntohl(dst))); 7610 /* Move down and overwrite */ 7611 opt[IP_ADDR_LEN] = opt[0]; 7612 opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN; 7613 opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET]; 7614 for (i = 0; i < IP_ADDR_LEN; i++) 7615 opt[i] = IPOPT_NOP; 7616 break; 7617 } 7618 } 7619 return (dst); 7620 } 7621 7622 /* 7623 * Return the network mask 7624 * associated with the specified address. 7625 */ 7626 ipaddr_t 7627 ip_net_mask(ipaddr_t addr) 7628 { 7629 uchar_t *up = (uchar_t *)&addr; 7630 ipaddr_t mask = 0; 7631 uchar_t *maskp = (uchar_t *)&mask; 7632 7633 #if defined(__i386) || defined(__amd64) 7634 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER 7635 #endif 7636 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER 7637 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0; 7638 #endif 7639 if (CLASSD(addr)) { 7640 maskp[0] = 0xF0; 7641 return (mask); 7642 } 7643 7644 /* We assume Class E default netmask to be 32 */ 7645 if (CLASSE(addr)) 7646 return (0xffffffffU); 7647 7648 if (addr == 0) 7649 return (0); 7650 maskp[0] = 0xFF; 7651 if ((up[0] & 0x80) == 0) 7652 return (mask); 7653 7654 maskp[1] = 0xFF; 7655 if ((up[0] & 0xC0) == 0x80) 7656 return (mask); 7657 7658 maskp[2] = 0xFF; 7659 if ((up[0] & 0xE0) == 0xC0) 7660 return (mask); 7661 7662 /* Otherwise return no mask */ 7663 return ((ipaddr_t)0); 7664 } 7665 7666 /* 7667 * Select an ill for the packet by considering load spreading across 7668 * a different ill in the group if dst_ill is part of some group. 7669 */ 7670 ill_t * 7671 ip_newroute_get_dst_ill(ill_t *dst_ill) 7672 { 7673 ill_t *ill; 7674 7675 /* 7676 * We schedule irrespective of whether the source address is 7677 * INADDR_ANY or not. illgrp_scheduler returns a held ill. 7678 */ 7679 ill = illgrp_scheduler(dst_ill); 7680 if (ill == NULL) 7681 return (NULL); 7682 7683 /* 7684 * For groups with names ip_sioctl_groupname ensures that all 7685 * ills are of same type. For groups without names, ifgrp_insert 7686 * ensures this. 7687 */ 7688 ASSERT(dst_ill->ill_type == ill->ill_type); 7689 7690 return (ill); 7691 } 7692 7693 /* 7694 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. 
7695 */ 7696 ill_t * 7697 ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, 7698 ip_stack_t *ipst) 7699 { 7700 ill_t *ret_ill; 7701 7702 ASSERT(ifindex != 0); 7703 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 7704 ipst); 7705 if (ret_ill == NULL || 7706 (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { 7707 if (isv6) { 7708 if (ill != NULL) { 7709 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7710 } else { 7711 BUMP_MIB(&ipst->ips_ip6_mib, 7712 ipIfStatsOutDiscards); 7713 } 7714 ip1dbg(("ip_grab_attach_ill (IPv6): " 7715 "bad ifindex %d.\n", ifindex)); 7716 } else { 7717 if (ill != NULL) { 7718 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7719 } else { 7720 BUMP_MIB(&ipst->ips_ip_mib, 7721 ipIfStatsOutDiscards); 7722 } 7723 ip1dbg(("ip_grab_attach_ill (IPv4): " 7724 "bad ifindex %d.\n", ifindex)); 7725 } 7726 if (ret_ill != NULL) 7727 ill_refrele(ret_ill); 7728 freemsg(first_mp); 7729 return (NULL); 7730 } 7731 7732 return (ret_ill); 7733 } 7734 7735 /* 7736 * IPv4 - 7737 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 7738 * out a packet to a destination address for which we do not have specific 7739 * (or sufficient) routing information. 7740 * 7741 * NOTE : These are the scopes of some of the variables that point at IRE, 7742 * which needs to be followed while making any future modifications 7743 * to avoid memory leaks. 7744 * 7745 * - ire and sire are the entries looked up initially by 7746 * ire_ftable_lookup. 7747 * - ipif_ire is used to hold the interface ire associated with 7748 * the new cache ire. But it's scope is limited, so we always REFRELE 7749 * it before branching out to error paths. 7750 * - save_ire is initialized before ire_create, so that ire returned 7751 * by ire_create will not over-write the ire. We REFRELE save_ire 7752 * before breaking out of the switch. 7753 * 7754 * Thus on failures, we have to REFRELE only ire and sire, if they 7755 * are not NULL. 
7756 */ 7757 void 7758 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, 7759 zoneid_t zoneid, ip_stack_t *ipst) 7760 { 7761 areq_t *areq; 7762 ipaddr_t gw = 0; 7763 ire_t *ire = NULL; 7764 mblk_t *res_mp; 7765 ipaddr_t *addrp; 7766 ipaddr_t nexthop_addr; 7767 ipif_t *src_ipif = NULL; 7768 ill_t *dst_ill = NULL; 7769 ipha_t *ipha; 7770 ire_t *sire = NULL; 7771 mblk_t *first_mp; 7772 ire_t *save_ire; 7773 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 7774 ushort_t ire_marks = 0; 7775 boolean_t mctl_present; 7776 ipsec_out_t *io; 7777 mblk_t *saved_mp; 7778 ire_t *first_sire = NULL; 7779 mblk_t *copy_mp = NULL; 7780 mblk_t *xmit_mp = NULL; 7781 ipaddr_t save_dst; 7782 uint32_t multirt_flags = 7783 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 7784 boolean_t multirt_is_resolvable; 7785 boolean_t multirt_resolve_next; 7786 boolean_t unspec_src; 7787 boolean_t do_attach_ill = B_FALSE; 7788 boolean_t ip_nexthop = B_FALSE; 7789 tsol_ire_gw_secattr_t *attrp = NULL; 7790 tsol_gcgrp_t *gcgrp = NULL; 7791 tsol_gcgrp_addr_t ga; 7792 7793 if (ip_debug > 2) { 7794 /* ip1dbg */ 7795 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7796 } 7797 7798 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7799 if (mctl_present) { 7800 io = (ipsec_out_t *)first_mp->b_rptr; 7801 ASSERT(io->ipsec_out_type == IPSEC_OUT); 7802 ASSERT(zoneid == io->ipsec_out_zoneid); 7803 ASSERT(zoneid != ALL_ZONES); 7804 } 7805 7806 ipha = (ipha_t *)mp->b_rptr; 7807 7808 /* All multicast lookups come through ip_newroute_ipif() */ 7809 if (CLASSD(dst)) { 7810 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7811 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7812 freemsg(first_mp); 7813 return; 7814 } 7815 7816 if (mctl_present && io->ipsec_out_attach_if) { 7817 /* ip_grab_attach_ill returns a held ill */ 7818 attach_ill = ip_grab_attach_ill(NULL, first_mp, 7819 io->ipsec_out_ill_index, B_FALSE, ipst); 7820 7821 /* Failure case frees things for us. */ 7822 if (attach_ill == NULL) 7823 return; 7824 7825 /* 7826 * Check if we need an ire that will not be 7827 * looked up by anybody else i.e. HIDDEN. 7828 */ 7829 if (ill_is_probeonly(attach_ill)) 7830 ire_marks = IRE_MARK_HIDDEN; 7831 } 7832 if (mctl_present && io->ipsec_out_ip_nexthop) { 7833 ip_nexthop = B_TRUE; 7834 nexthop_addr = io->ipsec_out_nexthop_addr; 7835 } 7836 /* 7837 * If this IRE is created for forwarding or it is not for 7838 * traffic for congestion controlled protocols, mark it as temporary. 7839 */ 7840 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7841 ire_marks |= IRE_MARK_TEMPORARY; 7842 7843 /* 7844 * Get what we can from ire_ftable_lookup which will follow an IRE 7845 * chain until it gets the most specific information available. 7846 * For example, we know that there is no IRE_CACHE for this dest, 7847 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7848 * ire_ftable_lookup will look up the gateway, etc. 7849 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7850 * to the destination, of equal netmask length in the forward table, 7851 * will be recursively explored. If no information is available 7852 * for the final gateway of that route, we force the returned ire 7853 * to be equal to sire using MATCH_IRE_PARENT. 7854 * At least, in this case we have a starting point (in the buckets) 7855 * to look for other routes to the destination in the forward table. 
* This is actually used only for multirouting, where a list 7857 * of routes has to be processed in sequence. 7858 * 7859 * In the process of coming up with the most specific information, 7860 * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry 7861 * for the gateway (i.e., one for which the ire_nce->nce_state is 7862 * not yet ND_REACHABLE, and is in the middle of arp resolution). 7863 * Two caveats when handling incomplete ire's in ip_newroute: 7864 * - we should be careful when accessing its ire_nce (specifically 7865 * the nce_res_mp) as it might change underneath our feet, and, 7866 * - not all legacy code path callers are prepared to handle 7867 * incomplete ire's, so we should not create/add incomplete 7868 * ire_cache entries here. (See discussion about temporary solution 7869 * further below). 7870 * 7871 * In order to minimize packet dropping, and to preserve existing 7872 * behavior, we treat this case as if there were no IRE_CACHE for the 7873 * gateway, and instead use the IF_RESOLVER ire to send out 7874 * another request to ARP (this is achieved by passing the 7875 * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the 7876 * arp response comes back in ip_wput_nondata, we will create 7877 * a per-dst ire_cache that has an ND_COMPLETE ire. 7878 * 7879 * Note that this is a temporary solution; the correct solution is 7880 * to create an incomplete per-dst ire_cache entry, and send the 7881 * packet out when the gw's nce is resolved. In order to achieve this, 7882 * all packet processing must have been completed prior to calling 7883 * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need 7884 * to be modified to accommodate this solution. 7885 */ 7886 if (ip_nexthop) { 7887 /* 7888 * The first time we come here, we look for an IRE_INTERFACE 7889 * entry for the specified nexthop, set the dst to be the 7890 * nexthop address and create an IRE_CACHE entry for the 7891 * nexthop. The next time around, we are able to find an 7892 * IRE_CACHE entry for the nexthop, set the gateway to be the 7893 * nexthop address and create an IRE_CACHE entry for the 7894 * destination address via the specified nexthop. 7895 */ 7896 ire = ire_cache_lookup(nexthop_addr, zoneid, 7897 MBLK_GETLABEL(mp), ipst); 7898 if (ire != NULL) { 7899 gw = nexthop_addr; 7900 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7901 } else { 7902 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7903 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7904 MBLK_GETLABEL(mp), 7905 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 7906 ipst); 7907 if (ire != NULL) { 7908 dst = nexthop_addr; 7909 } 7910 } 7911 } else if (attach_ill == NULL) { 7912 ire = ire_ftable_lookup(dst, 0, 0, 0, 7913 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7914 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7915 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7916 MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, 7917 ipst); 7918 } else { 7919 /* 7920 * attach_ill is set only for communicating with 7921 * on-link hosts. So, don't look for DEFAULT.
7922 */ 7923 ipif_t *attach_ipif; 7924 7925 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7926 if (attach_ipif == NULL) { 7927 ill_refrele(attach_ill); 7928 goto icmp_err_ret; 7929 } 7930 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7931 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7932 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7933 MATCH_IRE_SECATTR, ipst); 7934 ipif_refrele(attach_ipif); 7935 } 7936 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7937 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7938 7939 /* 7940 * This loop is run only once in most cases. 7941 * We loop to resolve further routes only when the destination 7942 * can be reached through multiple RTF_MULTIRT-flagged ires. 7943 */ 7944 do { 7945 /* Clear the previous iteration's values */ 7946 if (src_ipif != NULL) { 7947 ipif_refrele(src_ipif); 7948 src_ipif = NULL; 7949 } 7950 if (dst_ill != NULL) { 7951 ill_refrele(dst_ill); 7952 dst_ill = NULL; 7953 } 7954 7955 multirt_resolve_next = B_FALSE; 7956 /* 7957 * We check if packets have to be multirouted. 7958 * In this case, given the current <ire, sire> couple, 7959 * we look for the next suitable <ire, sire>. 7960 * This check is done in ire_multirt_lookup(), 7961 * which applies various criteria to find the next route 7962 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7963 * unchanged if it detects it has not been tried yet. 7964 */ 7965 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7966 ip3dbg(("ip_newroute: starting next_resolution " 7967 "with first_mp %p, tag %d\n", 7968 (void *)first_mp, 7969 MULTIRT_DEBUG_TAGGED(first_mp))); 7970 7971 ASSERT(sire != NULL); 7972 multirt_is_resolvable = 7973 ire_multirt_lookup(&ire, &sire, multirt_flags, 7974 MBLK_GETLABEL(mp), ipst); 7975 7976 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 7977 "ire %p, sire %p\n", 7978 multirt_is_resolvable, 7979 (void *)ire, (void *)sire)); 7980 7981 if (!multirt_is_resolvable) { 7982 /* 7983 * No more multirt route to resolve; give up 7984 * (all routes resolved or no more 7985 * resolvable routes). 7986 */ 7987 if (ire != NULL) { 7988 ire_refrele(ire); 7989 ire = NULL; 7990 } 7991 } else { 7992 ASSERT(sire != NULL); 7993 ASSERT(ire != NULL); 7994 /* 7995 * We simply use first_sire as a flag that 7996 * indicates if a resolvable multirt route 7997 * has already been found. 7998 * If it is not the case, we may have to send 7999 * an ICMP error to report that the 8000 * destination is unreachable. 8001 * We do not IRE_REFHOLD first_sire. 8002 */ 8003 if (first_sire == NULL) { 8004 first_sire = sire; 8005 } 8006 } 8007 } 8008 if (ire == NULL) { 8009 if (ip_debug > 3) { 8010 /* ip2dbg */ 8011 pr_addr_dbg("ip_newroute: " 8012 "can't resolve %s\n", AF_INET, &dst); 8013 } 8014 ip3dbg(("ip_newroute: " 8015 "ire %p, sire %p, first_sire %p\n", 8016 (void *)ire, (void *)sire, (void *)first_sire)); 8017 8018 if (sire != NULL) { 8019 ire_refrele(sire); 8020 sire = NULL; 8021 } 8022 8023 if (first_sire != NULL) { 8024 /* 8025 * At least one multirt route has been found 8026 * in the same call to ip_newroute(); 8027 * there is no need to report an ICMP error. 8028 * first_sire was not IRE_REFHOLDed. 
8029 */ 8030 MULTIRT_DEBUG_UNTAG(first_mp); 8031 freemsg(first_mp); 8032 return; 8033 } 8034 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 8035 RTA_DST, ipst); 8036 if (attach_ill != NULL) 8037 ill_refrele(attach_ill); 8038 goto icmp_err_ret; 8039 } 8040 8041 /* 8042 * Verify that the returned IRE does not have either 8043 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 8044 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 8045 */ 8046 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 8047 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 8048 if (attach_ill != NULL) 8049 ill_refrele(attach_ill); 8050 goto icmp_err_ret; 8051 } 8052 /* 8053 * Increment the ire_ob_pkt_count field for ire if it is an 8054 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 8055 * increment the same for the parent IRE, sire, if it is some 8056 * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) 8057 */ 8058 if ((ire->ire_type & IRE_INTERFACE) != 0) { 8059 UPDATE_OB_PKT_COUNT(ire); 8060 ire->ire_last_used_time = lbolt; 8061 } 8062 8063 if (sire != NULL) { 8064 gw = sire->ire_gateway_addr; 8065 ASSERT((sire->ire_type & (IRE_CACHETABLE | 8066 IRE_INTERFACE)) == 0); 8067 UPDATE_OB_PKT_COUNT(sire); 8068 sire->ire_last_used_time = lbolt; 8069 } 8070 /* 8071 * We have a route to reach the destination. 8072 * 8073 * 1) If the interface is part of ill group, try to get a new 8074 * ill taking load spreading into account. 8075 * 8076 * 2) After selecting the ill, get a source address that 8077 * might create good inbound load spreading. 8078 * ipif_select_source does this for us. 8079 * 8080 * If the application specified the ill (ifindex), we still 8081 * load spread. Only if the packets needs to go out 8082 * specifically on a given ill e.g. binding to 8083 * IPIF_NOFAILOVER address, then we don't try to use a 8084 * different ill for load spreading. 8085 */ 8086 if (attach_ill == NULL) { 8087 /* 8088 * Don't perform outbound load spreading in the 8089 * case of an RTF_MULTIRT route, as we actually 8090 * typically want to replicate outgoing packets 8091 * through particular interfaces. 8092 */ 8093 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8094 dst_ill = ire->ire_ipif->ipif_ill; 8095 /* for uniformity */ 8096 ill_refhold(dst_ill); 8097 } else { 8098 /* 8099 * If we are here trying to create an IRE_CACHE 8100 * for an offlink destination and have the 8101 * IRE_CACHE for the next hop and the latter is 8102 * using virtual IP source address selection i.e 8103 * it's ire->ire_ipif is pointing to a virtual 8104 * network interface (vni) then 8105 * ip_newroute_get_dst_ll() will return the vni 8106 * interface as the dst_ill. Since the vni is 8107 * virtual i.e not associated with any physical 8108 * interface, it cannot be the dst_ill, hence 8109 * in such a case call ip_newroute_get_dst_ll() 8110 * with the stq_ill instead of the ire_ipif ILL. 8111 * The function returns a refheld ill. 
8112 */ 8113 if ((ire->ire_type == IRE_CACHE) && 8114 IS_VNI(ire->ire_ipif->ipif_ill)) 8115 dst_ill = ip_newroute_get_dst_ill( 8116 ire->ire_stq->q_ptr); 8117 else 8118 dst_ill = ip_newroute_get_dst_ill( 8119 ire->ire_ipif->ipif_ill); 8120 } 8121 if (dst_ill == NULL) { 8122 if (ip_debug > 2) { 8123 pr_addr_dbg("ip_newroute: " 8124 "no dst ill for dst" 8125 " %s\n", AF_INET, &dst); 8126 } 8127 goto icmp_err_ret; 8128 } 8129 } else { 8130 dst_ill = ire->ire_ipif->ipif_ill; 8131 /* for uniformity */ 8132 ill_refhold(dst_ill); 8133 /* 8134 * We should have found a route matching ill as we 8135 * called ire_ftable_lookup with MATCH_IRE_ILL. 8136 * Rather than asserting, when there is a mismatch, 8137 * we just drop the packet. 8138 */ 8139 if (dst_ill != attach_ill) { 8140 ip0dbg(("ip_newroute: Packet dropped as " 8141 "IPIF_NOFAILOVER ill is %s, " 8142 "ire->ire_ipif->ipif_ill is %s\n", 8143 attach_ill->ill_name, 8144 dst_ill->ill_name)); 8145 ill_refrele(attach_ill); 8146 goto icmp_err_ret; 8147 } 8148 } 8149 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 8150 if (attach_ill != NULL) { 8151 ill_refrele(attach_ill); 8152 attach_ill = NULL; 8153 do_attach_ill = B_TRUE; 8154 } 8155 ASSERT(dst_ill != NULL); 8156 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 8157 8158 /* 8159 * Pick the best source address from dst_ill. 8160 * 8161 * 1) If it is part of a multipathing group, we would 8162 * like to spread the inbound packets across different 8163 * interfaces. ipif_select_source picks a random source 8164 * across the different ills in the group. 8165 * 8166 * 2) If it is not part of a multipathing group, we try 8167 * to pick the source address from the destination 8168 * route. Clustering assumes that when we have multiple 8169 * prefixes hosted on an interface, the prefix of the 8170 * source address matches the prefix of the destination 8171 * route. We do this only if the address is not 8172 * DEPRECATED. 8173 * 8174 * 3) If the conn is in a different zone than the ire, we 8175 * need to pick a source address from the right zone. 8176 * 8177 * NOTE : If we hit case (1) above, the prefix of the source 8178 * address picked may not match the prefix of the 8179 * destination routes prefix as ipif_select_source 8180 * does not look at "dst" while picking a source 8181 * address. 8182 * If we want the same behavior as (2), we will need 8183 * to change the behavior of ipif_select_source. 8184 */ 8185 ASSERT(src_ipif == NULL); 8186 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 8187 /* 8188 * The RTF_SETSRC flag is set in the parent ire (sire). 8189 * Check that the ipif matching the requested source 8190 * address still exists. 8191 */ 8192 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 8193 zoneid, NULL, NULL, NULL, NULL, ipst); 8194 } 8195 8196 unspec_src = (connp != NULL && connp->conn_unspec_src); 8197 8198 if (src_ipif == NULL && 8199 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 8200 ire_marks |= IRE_MARK_USESRC_CHECK; 8201 if ((dst_ill->ill_group != NULL) || 8202 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 8203 (connp != NULL && ire->ire_zoneid != zoneid && 8204 ire->ire_zoneid != ALL_ZONES) || 8205 (dst_ill->ill_usesrc_ifindex != 0)) { 8206 /* 8207 * If the destination is reachable via a 8208 * given gateway, the selected source address 8209 * should be in the same subnet as the gateway. 8210 * Otherwise, the destination is not reachable. 
8211 * 8212 * If there are no interfaces on the same subnet 8213 * as the destination, ipif_select_source gives 8214 * first non-deprecated interface which might be 8215 * on a different subnet than the gateway. 8216 * This is not desirable. Hence pass the dst_ire 8217 * source address to ipif_select_source. 8218 * It is sure that the destination is reachable 8219 * with the dst_ire source address subnet. 8220 * So passing dst_ire source address to 8221 * ipif_select_source will make sure that the 8222 * selected source will be on the same subnet 8223 * as dst_ire source address. 8224 */ 8225 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 8226 src_ipif = ipif_select_source(dst_ill, saddr, 8227 zoneid); 8228 if (src_ipif == NULL) { 8229 if (ip_debug > 2) { 8230 pr_addr_dbg("ip_newroute: " 8231 "no src for dst %s ", 8232 AF_INET, &dst); 8233 printf("through interface %s\n", 8234 dst_ill->ill_name); 8235 } 8236 goto icmp_err_ret; 8237 } 8238 } else { 8239 src_ipif = ire->ire_ipif; 8240 ASSERT(src_ipif != NULL); 8241 /* hold src_ipif for uniformity */ 8242 ipif_refhold(src_ipif); 8243 } 8244 } 8245 8246 /* 8247 * Assign a source address while we have the conn. 8248 * We can't have ip_wput_ire pick a source address when the 8249 * packet returns from arp since we need to look at 8250 * conn_unspec_src and conn_zoneid, and we lose the conn when 8251 * going through arp. 8252 * 8253 * NOTE : ip_newroute_v6 does not have this piece of code as 8254 * it uses ip6i to store this information. 8255 */ 8256 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 8257 ipha->ipha_src = src_ipif->ipif_src_addr; 8258 8259 if (ip_debug > 3) { 8260 /* ip2dbg */ 8261 pr_addr_dbg("ip_newroute: first hop %s\n", 8262 AF_INET, &gw); 8263 } 8264 ip2dbg(("\tire type %s (%d)\n", 8265 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 8266 8267 /* 8268 * The TTL of multirouted packets is bounded by the 8269 * ip_multirt_ttl ndd variable. 8270 */ 8271 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8272 /* Force TTL of multirouted packets */ 8273 if ((ipst->ips_ip_multirt_ttl > 0) && 8274 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 8275 ip2dbg(("ip_newroute: forcing multirt TTL " 8276 "to %d (was %d), dst 0x%08x\n", 8277 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 8278 ntohl(sire->ire_addr))); 8279 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 8280 } 8281 } 8282 /* 8283 * At this point in ip_newroute(), ire is either the 8284 * IRE_CACHE of the next-hop gateway for an off-subnet 8285 * destination or an IRE_INTERFACE type that should be used 8286 * to resolve an on-subnet destination or an on-subnet 8287 * next-hop gateway. 8288 * 8289 * In the IRE_CACHE case, we have the following : 8290 * 8291 * 1) src_ipif - used for getting a source address. 8292 * 8293 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8294 * means packets using this IRE_CACHE will go out on 8295 * dst_ill. 8296 * 8297 * 3) The IRE sire will point to the prefix that is the 8298 * longest matching route for the destination. These 8299 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. 8300 * 8301 * The newly created IRE_CACHE entry for the off-subnet 8302 * destination is tied to both the prefix route and the 8303 * interface route used to resolve the next-hop gateway 8304 * via the ire_phandle and ire_ihandle fields, 8305 * respectively. 8306 * 8307 * In the IRE_INTERFACE case, we have the following : 8308 * 8309 * 1) src_ipif - used for getting a source address. 8310 * 8311 * 2) dst_ill - from which we derive ire_stq/ire_rfq. 
This 8312 * means packets using the IRE_CACHE that we will build 8313 * here will go out on dst_ill. 8314 * 8315 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 8316 * to be created will only be tied to the IRE_INTERFACE 8317 * that was derived from the ire_ihandle field. 8318 * 8319 * If sire is non-NULL, it means the destination is 8320 * off-link and we will first create the IRE_CACHE for the 8321 * gateway. Next time through ip_newroute, we will create 8322 * the IRE_CACHE for the final destination as described 8323 * above. 8324 * 8325 * In both cases, after the current resolution has been 8326 * completed (or possibly initialised, in the IRE_INTERFACE 8327 * case), the loop may be re-entered to attempt the resolution 8328 * of another RTF_MULTIRT route. 8329 * 8330 * When an IRE_CACHE entry for the off-subnet destination is 8331 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 8332 * for further processing in emission loops. 8333 */ 8334 save_ire = ire; 8335 switch (ire->ire_type) { 8336 case IRE_CACHE: { 8337 ire_t *ipif_ire; 8338 8339 ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); 8340 if (gw == 0) 8341 gw = ire->ire_gateway_addr; 8342 /* 8343 * We need 3 ire's to create a new cache ire for an 8344 * off-link destination from the cache ire of the 8345 * gateway. 8346 * 8347 * 1. The prefix ire 'sire' (Note that this does 8348 * not apply to the conn_nexthop_set case) 8349 * 2. The cache ire of the gateway 'ire' 8350 * 3. The interface ire 'ipif_ire' 8351 * 8352 * We have (1) and (2). We lookup (3) below. 8353 * 8354 * If there is no interface route to the gateway, 8355 * it is a race condition, where we found the cache 8356 * but the interface route has been deleted. 8357 */ 8358 if (ip_nexthop) { 8359 ipif_ire = ire_ihandle_lookup_onlink(ire); 8360 } else { 8361 ipif_ire = 8362 ire_ihandle_lookup_offlink(ire, sire); 8363 } 8364 if (ipif_ire == NULL) { 8365 ip1dbg(("ip_newroute: " 8366 "ire_ihandle_lookup_offlink failed\n")); 8367 goto icmp_err_ret; 8368 } 8369 8370 /* 8371 * Check cached gateway IRE for any security 8372 * attributes; if found, associate the gateway 8373 * credentials group to the destination IRE. 8374 */ 8375 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 8376 mutex_enter(&attrp->igsa_lock); 8377 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 8378 GCGRP_REFHOLD(gcgrp); 8379 mutex_exit(&attrp->igsa_lock); 8380 } 8381 8382 /* 8383 * XXX For the source of the resolver mp, 8384 * we are using the same DL_UNITDATA_REQ 8385 * (from save_ire->ire_nce->nce_res_mp) 8386 * though the save_ire is not pointing at the same ill. 8387 * This is incorrect. We need to send it up to the 8388 * resolver to get the right res_mp. For ethernets 8389 * this may be okay (ill_type == DL_ETHER). 8390 */ 8391 8392 ire = ire_create( 8393 (uchar_t *)&dst, /* dest address */ 8394 (uchar_t *)&ip_g_all_ones, /* mask */ 8395 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8396 (uchar_t *)&gw, /* gateway address */ 8397 &save_ire->ire_max_frag, 8398 save_ire->ire_nce, /* src nce */ 8399 dst_ill->ill_rq, /* recv-from queue */ 8400 dst_ill->ill_wq, /* send-to queue */ 8401 IRE_CACHE, /* IRE type */ 8402 src_ipif, 8403 (sire != NULL) ? 8404 sire->ire_mask : 0, /* Parent mask */ 8405 (sire != NULL) ? 8406 sire->ire_phandle : 0, /* Parent handle */ 8407 ipif_ire->ire_ihandle, /* Interface handle */ 8408 (sire != NULL) ? (sire->ire_flags & 8409 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 8410 (sire != NULL) ? 
8411 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 8412 NULL, 8413 gcgrp, 8414 ipst); 8415 8416 if (ire == NULL) { 8417 if (gcgrp != NULL) { 8418 GCGRP_REFRELE(gcgrp); 8419 gcgrp = NULL; 8420 } 8421 ire_refrele(ipif_ire); 8422 ire_refrele(save_ire); 8423 break; 8424 } 8425 8426 /* reference now held by IRE */ 8427 gcgrp = NULL; 8428 8429 ire->ire_marks |= ire_marks; 8430 8431 /* 8432 * Prevent sire and ipif_ire from getting deleted. 8433 * The newly created ire is tied to both of them via 8434 * the phandle and ihandle respectively. 8435 */ 8436 if (sire != NULL) { 8437 IRB_REFHOLD(sire->ire_bucket); 8438 /* Has it been removed already ? */ 8439 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 8440 IRB_REFRELE(sire->ire_bucket); 8441 ire_refrele(ipif_ire); 8442 ire_refrele(save_ire); 8443 break; 8444 } 8445 } 8446 8447 IRB_REFHOLD(ipif_ire->ire_bucket); 8448 /* Has it been removed already ? */ 8449 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 8450 IRB_REFRELE(ipif_ire->ire_bucket); 8451 if (sire != NULL) 8452 IRB_REFRELE(sire->ire_bucket); 8453 ire_refrele(ipif_ire); 8454 ire_refrele(save_ire); 8455 break; 8456 } 8457 8458 xmit_mp = first_mp; 8459 /* 8460 * In the case of multirouting, a copy 8461 * of the packet is done before its sending. 8462 * The copy is used to attempt another 8463 * route resolution, in a next loop. 8464 */ 8465 if (ire->ire_flags & RTF_MULTIRT) { 8466 copy_mp = copymsg(first_mp); 8467 if (copy_mp != NULL) { 8468 xmit_mp = copy_mp; 8469 MULTIRT_DEBUG_TAG(first_mp); 8470 } 8471 } 8472 ire_add_then_send(q, ire, xmit_mp); 8473 ire_refrele(save_ire); 8474 8475 /* Assert that sire is not deleted yet. */ 8476 if (sire != NULL) { 8477 ASSERT(sire->ire_ptpn != NULL); 8478 IRB_REFRELE(sire->ire_bucket); 8479 } 8480 8481 /* Assert that ipif_ire is not deleted yet. */ 8482 ASSERT(ipif_ire->ire_ptpn != NULL); 8483 IRB_REFRELE(ipif_ire->ire_bucket); 8484 ire_refrele(ipif_ire); 8485 8486 /* 8487 * If copy_mp is not NULL, multirouting was 8488 * requested. We loop to initiate a next 8489 * route resolution attempt, starting from sire. 8490 */ 8491 if (copy_mp != NULL) { 8492 /* 8493 * Search for the next unresolved 8494 * multirt route. 8495 */ 8496 copy_mp = NULL; 8497 ipif_ire = NULL; 8498 ire = NULL; 8499 multirt_resolve_next = B_TRUE; 8500 continue; 8501 } 8502 if (sire != NULL) 8503 ire_refrele(sire); 8504 ipif_refrele(src_ipif); 8505 ill_refrele(dst_ill); 8506 return; 8507 } 8508 case IRE_IF_NORESOLVER: { 8509 8510 if (dst_ill->ill_phys_addr_length != IP_ADDR_LEN && 8511 dst_ill->ill_resolver_mp == NULL) { 8512 ip1dbg(("ip_newroute: dst_ill %p " 8513 "for IRE_IF_NORESOLVER ire %p has " 8514 "no ill_resolver_mp\n", 8515 (void *)dst_ill, (void *)ire)); 8516 break; 8517 } 8518 8519 /* 8520 * TSol note: We are creating the ire cache for the 8521 * destination 'dst'. If 'dst' is offlink, going 8522 * through the first hop 'gw', the security attributes 8523 * of 'dst' must be set to point to the gateway 8524 * credentials of gateway 'gw'. If 'dst' is onlink, it 8525 * is possible that 'dst' is a potential gateway that is 8526 * referenced by some route that has some security 8527 * attributes. Thus in the former case, we need to do a 8528 * gcgrp_lookup of 'gw' while in the latter case we 8529 * need to do gcgrp_lookup of 'dst' itself. 8530 */ 8531 ga.ga_af = AF_INET; 8532 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 8533 &ga.ga_addr); 8534 gcgrp = gcgrp_lookup(&ga, B_FALSE); 8535 8536 ire = ire_create( 8537 (uchar_t *)&dst, /* dest address */ 8538 (uchar_t *)&ip_g_all_ones, /* mask */ 8539 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8540 (uchar_t *)&gw, /* gateway address */ 8541 &save_ire->ire_max_frag, 8542 NULL, /* no src nce */ 8543 dst_ill->ill_rq, /* recv-from queue */ 8544 dst_ill->ill_wq, /* send-to queue */ 8545 IRE_CACHE, 8546 src_ipif, 8547 save_ire->ire_mask, /* Parent mask */ 8548 (sire != NULL) ? /* Parent handle */ 8549 sire->ire_phandle : 0, 8550 save_ire->ire_ihandle, /* Interface handle */ 8551 (sire != NULL) ? sire->ire_flags & 8552 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 8553 &(save_ire->ire_uinfo), 8554 NULL, 8555 gcgrp, 8556 ipst); 8557 8558 if (ire == NULL) { 8559 if (gcgrp != NULL) { 8560 GCGRP_REFRELE(gcgrp); 8561 gcgrp = NULL; 8562 } 8563 ire_refrele(save_ire); 8564 break; 8565 } 8566 8567 /* reference now held by IRE */ 8568 gcgrp = NULL; 8569 8570 ire->ire_marks |= ire_marks; 8571 8572 /* Prevent save_ire from getting deleted */ 8573 IRB_REFHOLD(save_ire->ire_bucket); 8574 /* Has it been removed already ? */ 8575 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8576 IRB_REFRELE(save_ire->ire_bucket); 8577 ire_refrele(save_ire); 8578 break; 8579 } 8580 8581 /* 8582 * In the case of multirouting, a copy 8583 * of the packet is made before it is sent. 8584 * The copy is used in the next 8585 * loop to attempt another resolution. 8586 */ 8587 xmit_mp = first_mp; 8588 if ((sire != NULL) && 8589 (sire->ire_flags & RTF_MULTIRT)) { 8590 copy_mp = copymsg(first_mp); 8591 if (copy_mp != NULL) { 8592 xmit_mp = copy_mp; 8593 MULTIRT_DEBUG_TAG(first_mp); 8594 } 8595 } 8596 ire_add_then_send(q, ire, xmit_mp); 8597 8598 /* Assert that it is not deleted yet. */ 8599 ASSERT(save_ire->ire_ptpn != NULL); 8600 IRB_REFRELE(save_ire->ire_bucket); 8601 ire_refrele(save_ire); 8602 8603 if (copy_mp != NULL) { 8604 /* 8605 * If we found a (no)resolver, we ignore any 8606 * trailing top priority IRE_CACHE in further 8607 * loops. This ensures that we do not omit any 8608 * (no)resolver. 8611 * IRE_CACHE entries, if any, will be processed 8612 * by another thread entering ip_newroute(), 8613 * (upon resolver response, for instance). 8614 * This aims to force parallel multirt 8615 * resolutions as soon as a packet must be sent. 8616 * In the best case, after the tx of only one 8617 * packet, all reachable routes are resolved. 8618 * Otherwise, the resolution of all RTF_MULTIRT 8619 * routes would require several emissions. 8620 */ 8621 multirt_flags &= ~MULTIRT_CACHEGW; 8622 8623 /* 8624 * Search for the next unresolved multirt 8625 * route. 8626 */ 8627 copy_mp = NULL; 8628 save_ire = NULL; 8629 ire = NULL; 8630 multirt_resolve_next = B_TRUE; 8631 continue; 8632 } 8633 8634 /* 8635 * Don't need sire anymore 8636 */ 8637 if (sire != NULL) 8638 ire_refrele(sire); 8639 8640 ipif_refrele(src_ipif); 8641 ill_refrele(dst_ill); 8642 return; 8643 } 8644 case IRE_IF_RESOLVER: 8645 /* 8646 * We can't build an IRE_CACHE yet, but at least we 8647 * found a resolver that can help. 8648 */ 8649 res_mp = dst_ill->ill_resolver_mp; 8650 if (!OK_RESOLVER_MP(res_mp)) 8651 break; 8652 8653 /* 8654 * To be at this point in the code with a non-zero gw 8655 * means that dst is reachable through a gateway that 8656 * we have never resolved. By changing dst to the gw 8657 * addr we resolve the gateway first.
8658 * When ire_add_then_send() tries to put the IP dg 8659 * to dst, it will reenter ip_newroute() at which 8660 * time we will find the IRE_CACHE for the gw and 8661 * create another IRE_CACHE in case IRE_CACHE above. 8662 */ 8663 if (gw != INADDR_ANY) { 8664 /* 8665 * The source ipif that was determined above was 8666 * relative to the destination address, not the 8667 * gateway's. If src_ipif was not taken out of 8668 * the IRE_IF_RESOLVER entry, we'll need to call 8669 * ipif_select_source() again. 8670 */ 8671 if (src_ipif != ire->ire_ipif) { 8672 ipif_refrele(src_ipif); 8673 src_ipif = ipif_select_source(dst_ill, 8674 gw, zoneid); 8675 if (src_ipif == NULL) { 8676 if (ip_debug > 2) { 8677 pr_addr_dbg( 8678 "ip_newroute: no " 8679 "src for gw %s ", 8680 AF_INET, &gw); 8681 printf("through " 8682 "interface %s\n", 8683 dst_ill->ill_name); 8684 } 8685 goto icmp_err_ret; 8686 } 8687 } 8688 save_dst = dst; 8689 dst = gw; 8690 gw = INADDR_ANY; 8691 } 8692 8693 /* 8694 * We obtain a partial IRE_CACHE which we will pass 8695 * along with the resolver query. When the response 8696 * comes back it will be there ready for us to add. 8697 * The ire_max_frag is atomically set under the 8698 * irebucket lock in ire_add_v[46]. 8699 */ 8700 8701 ire = ire_create_mp( 8702 (uchar_t *)&dst, /* dest address */ 8703 (uchar_t *)&ip_g_all_ones, /* mask */ 8704 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8705 (uchar_t *)&gw, /* gateway address */ 8706 NULL, /* ire_max_frag */ 8707 NULL, /* no src nce */ 8708 dst_ill->ill_rq, /* recv-from queue */ 8709 dst_ill->ill_wq, /* send-to queue */ 8710 IRE_CACHE, 8711 src_ipif, /* Interface ipif */ 8712 save_ire->ire_mask, /* Parent mask */ 8713 0, 8714 save_ire->ire_ihandle, /* Interface handle */ 8715 0, /* flags if any */ 8716 &(save_ire->ire_uinfo), 8717 NULL, 8718 NULL, 8719 ipst); 8720 8721 if (ire == NULL) { 8722 ire_refrele(save_ire); 8723 break; 8724 } 8725 8726 if ((sire != NULL) && 8727 (sire->ire_flags & RTF_MULTIRT)) { 8728 copy_mp = copymsg(first_mp); 8729 if (copy_mp != NULL) 8730 MULTIRT_DEBUG_TAG(copy_mp); 8731 } 8732 8733 ire->ire_marks |= ire_marks; 8734 8735 /* 8736 * Construct message chain for the resolver 8737 * of the form: 8738 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8739 * Packet could contain a IPSEC_OUT mp. 8740 * 8741 * NOTE : ire will be added later when the response 8742 * comes back from ARP. If the response does not 8743 * come back, ARP frees the packet. For this reason, 8744 * we can't REFHOLD the bucket of save_ire to prevent 8745 * deletions. We may not be able to REFRELE the bucket 8746 * if the response never comes back. Thus, before 8747 * adding the ire, ire_add_v4 will make sure that the 8748 * interface route does not get deleted. This is the 8749 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8750 * where we can always prevent deletions because of 8751 * the synchronous nature of adding IRES i.e 8752 * ire_add_then_send is called after creating the IRE. 
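 *
 * The resulting message chain handed to the resolver below is linked
 * with ordinary mblk primitives and looks like:
 *
 *	areq mblk --b_cont--> ire mblk --b_cont--> packet (first_mp)
 *
 * i.e. ire->ire_mp->b_cont is set to first_mp, a copy of the ARP
 * template (copyb(res_mp)) is made, and linkb() appends the ire mblk
 * chain to it before the whole thing is putnext()ed to ARP.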
8753 */ 8754 ASSERT(ire->ire_mp != NULL); 8755 ire->ire_mp->b_cont = first_mp; 8756 /* Have saved_mp handy, for cleanup if canput fails */ 8757 saved_mp = mp; 8758 mp = copyb(res_mp); 8759 if (mp == NULL) { 8760 /* Prepare for cleanup */ 8761 mp = saved_mp; /* pkt */ 8762 ire_delete(ire); /* ire_mp */ 8763 ire = NULL; 8764 ire_refrele(save_ire); 8765 if (copy_mp != NULL) { 8766 MULTIRT_DEBUG_UNTAG(copy_mp); 8767 freemsg(copy_mp); 8768 copy_mp = NULL; 8769 } 8770 break; 8771 } 8772 linkb(mp, ire->ire_mp); 8773 8774 /* 8775 * Fill in the source and dest addrs for the resolver. 8776 * NOTE: this depends on memory layouts imposed by 8777 * ill_init(). 8778 */ 8779 areq = (areq_t *)mp->b_rptr; 8780 addrp = (ipaddr_t *)((char *)areq + 8781 areq->areq_sender_addr_offset); 8782 if (do_attach_ill) { 8783 /* 8784 * This is bind to no failover case. 8785 * arp packet also must go out on attach_ill. 8786 */ 8787 ASSERT(ipha->ipha_src != NULL); 8788 *addrp = ipha->ipha_src; 8789 } else { 8790 *addrp = save_ire->ire_src_addr; 8791 } 8792 8793 ire_refrele(save_ire); 8794 addrp = (ipaddr_t *)((char *)areq + 8795 areq->areq_target_addr_offset); 8796 *addrp = dst; 8797 /* Up to the resolver. */ 8798 if (canputnext(dst_ill->ill_rq) && 8799 !(dst_ill->ill_arp_closing)) { 8800 putnext(dst_ill->ill_rq, mp); 8801 ire = NULL; 8802 if (copy_mp != NULL) { 8803 /* 8804 * If we found a resolver, we ignore 8805 * any trailing top priority IRE_CACHE 8806 * in the further loops. This ensures 8807 * that we do not omit any resolver. 8808 * IRE_CACHE entries, if any, will be 8809 * processed next time we enter 8810 * ip_newroute(). 8811 */ 8812 multirt_flags &= ~MULTIRT_CACHEGW; 8813 /* 8814 * Search for the next unresolved 8815 * multirt route. 8816 */ 8817 first_mp = copy_mp; 8818 copy_mp = NULL; 8819 /* Prepare the next resolution loop. */ 8820 mp = first_mp; 8821 EXTRACT_PKT_MP(mp, first_mp, 8822 mctl_present); 8823 if (mctl_present) 8824 io = (ipsec_out_t *) 8825 first_mp->b_rptr; 8826 ipha = (ipha_t *)mp->b_rptr; 8827 8828 ASSERT(sire != NULL); 8829 8830 dst = save_dst; 8831 multirt_resolve_next = B_TRUE; 8832 continue; 8833 } 8834 8835 if (sire != NULL) 8836 ire_refrele(sire); 8837 8838 /* 8839 * The response will come back in ip_wput 8840 * with db_type IRE_DB_TYPE. 8841 */ 8842 ipif_refrele(src_ipif); 8843 ill_refrele(dst_ill); 8844 return; 8845 } else { 8846 /* Prepare for cleanup */ 8847 DTRACE_PROBE1(ip__newroute__drop, mblk_t *, 8848 mp); 8849 mp->b_cont = NULL; 8850 freeb(mp); /* areq */ 8851 /* 8852 * this is an ire that is not added to the 8853 * cache. ire_freemblk will handle the release 8854 * of any resources associated with the ire. 8855 */ 8856 ire_delete(ire); /* ire_mp */ 8857 mp = saved_mp; /* pkt */ 8858 ire = NULL; 8859 if (copy_mp != NULL) { 8860 MULTIRT_DEBUG_UNTAG(copy_mp); 8861 freemsg(copy_mp); 8862 copy_mp = NULL; 8863 } 8864 break; 8865 } 8866 default: 8867 break; 8868 } 8869 } while (multirt_resolve_next); 8870 8871 ip1dbg(("ip_newroute: dropped\n")); 8872 /* Did this packet originate externally? 
*/ 8873 if (mp->b_prev) { 8874 mp->b_next = NULL; 8875 mp->b_prev = NULL; 8876 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 8877 } else { 8878 if (dst_ill != NULL) { 8879 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); 8880 } else { 8881 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 8882 } 8883 } 8884 ASSERT(copy_mp == NULL); 8885 MULTIRT_DEBUG_UNTAG(first_mp); 8886 freemsg(first_mp); 8887 if (ire != NULL) 8888 ire_refrele(ire); 8889 if (sire != NULL) 8890 ire_refrele(sire); 8891 if (src_ipif != NULL) 8892 ipif_refrele(src_ipif); 8893 if (dst_ill != NULL) 8894 ill_refrele(dst_ill); 8895 return; 8896 8897 icmp_err_ret: 8898 ip1dbg(("ip_newroute: no route\n")); 8899 if (src_ipif != NULL) 8900 ipif_refrele(src_ipif); 8901 if (dst_ill != NULL) 8902 ill_refrele(dst_ill); 8903 if (sire != NULL) 8904 ire_refrele(sire); 8905 /* Did this packet originate externally? */ 8906 if (mp->b_prev) { 8907 mp->b_next = NULL; 8908 mp->b_prev = NULL; 8909 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes); 8910 q = WR(q); 8911 } else { 8912 /* 8913 * There is no outgoing ill, so just increment the 8914 * system MIB. 8915 */ 8916 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 8917 /* 8918 * Since ip_wput() isn't close to finished, we fill 8919 * in enough of the header for credible error reporting. 8920 */ 8921 if (ip_hdr_complete(ipha, zoneid, ipst)) { 8922 /* Failed */ 8923 MULTIRT_DEBUG_UNTAG(first_mp); 8924 freemsg(first_mp); 8925 if (ire != NULL) 8926 ire_refrele(ire); 8927 return; 8928 } 8929 } 8930 8931 /* 8932 * At this point we will have ire only if RTF_BLACKHOLE 8933 * or RTF_REJECT flags are set on the IRE. It will not 8934 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 8935 */ 8936 if (ire != NULL) { 8937 if (ire->ire_flags & RTF_BLACKHOLE) { 8938 ire_refrele(ire); 8939 MULTIRT_DEBUG_UNTAG(first_mp); 8940 freemsg(first_mp); 8941 return; 8942 } 8943 ire_refrele(ire); 8944 } 8945 if (ip_source_routed(ipha, ipst)) { 8946 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED, 8947 zoneid, ipst); 8948 return; 8949 } 8950 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 8951 } 8952 8953 ip_opt_info_t zero_info; 8954 8955 /* 8956 * IPv4 - 8957 * ip_newroute_ipif is called by ip_wput_multicast and 8958 * ip_rput_forward_multicast whenever we need to send 8959 * out a packet to a destination address for which we do not have specific 8960 * routing information. It is used when the packet will be sent out 8961 * on a specific interface. It is also called by ip_wput() when the IP_BOUND_IF 8962 * socket option is set, or when an ICMP error message needs to go out on a 8963 * particular interface for a unicast packet. 8964 * 8965 * In most cases, the destination address is resolved by the ipif 8966 * intrinsic resolver. However, there are some cases where the call to 8967 * ip_newroute_ipif must take into account the potential presence of 8968 * RTF_SETSRC and/or RTF_MULTIRT flags in an IRE_OFFSUBNET ire 8969 * that uses the interface. This is specified through flags, 8970 * which can be a combination of: 8971 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8972 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8973 * and flags. Additionally, the packet source address has to be set to 8974 * the specified address. The caller is thus expected to set this flag 8975 * if the packet has no specific source address yet.
* - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT 8977 * flag, the resulting ire will inherit the flag. All unresolved routes 8978 * to the destination must be explored in the same call to 8979 * ip_newroute_ipif(). 8980 */ 8981 static void 8982 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, 8983 conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop) 8984 { 8985 areq_t *areq; 8986 ire_t *ire = NULL; 8987 mblk_t *res_mp; 8988 ipaddr_t *addrp; 8989 mblk_t *first_mp; 8990 ire_t *save_ire = NULL; 8991 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ 8992 ipif_t *src_ipif = NULL; 8993 ushort_t ire_marks = 0; 8994 ill_t *dst_ill = NULL; 8995 boolean_t mctl_present; 8996 ipsec_out_t *io; 8997 ipha_t *ipha; 8998 int ihandle = 0; 8999 mblk_t *saved_mp; 9000 ire_t *fire = NULL; 9001 mblk_t *copy_mp = NULL; 9002 boolean_t multirt_resolve_next; 9003 boolean_t unspec_src; 9004 ipaddr_t ipha_dst; 9005 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 9006 9007 /* 9008 * CGTP goes in a loop which looks up a new ipif each time, so do an 9009 * ipif_refhold here for uniformity. 9010 */ 9011 ipif_refhold(ipif); 9012 9013 /* 9014 * This loop is run only once in most cases. 9015 * We loop to resolve further routes only when the destination 9016 * can be reached through multiple RTF_MULTIRT-flagged ires. 9017 */ 9018 do { 9019 if (dst_ill != NULL) { 9020 ill_refrele(dst_ill); 9021 dst_ill = NULL; 9022 } 9023 if (src_ipif != NULL) { 9024 ipif_refrele(src_ipif); 9025 src_ipif = NULL; 9026 } 9027 multirt_resolve_next = B_FALSE; 9028 9029 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), 9030 ipif->ipif_ill->ill_name)); 9031 9032 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 9033 if (mctl_present) 9034 io = (ipsec_out_t *)first_mp->b_rptr; 9035 9036 ipha = (ipha_t *)mp->b_rptr; 9037 9038 /* 9039 * Save the packet destination address; we may need it after 9040 * the packet has been consumed. 9041 */ 9042 ipha_dst = ipha->ipha_dst; 9043 9044 /* 9045 * If the interface is a pt-pt interface we look for an 9046 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the 9047 * local_address and the pt-pt destination address. Otherwise 9048 * we just match the local address. 9049 * NOTE: dst could be different from ipha->ipha_dst in case 9050 * of sending igmp multicast packets over a point-to-point 9051 * connection. 9052 * Thus we must check that ipha_dst is a multicast address; 9053 * otherwise it would take the xmit_if path for multicast 9054 * packets, resulting in kernel stack overflow through 9055 * repeated calls to ip_newroute_ipif from ire_send(). 9056 */ 9057 if (CLASSD(ipha_dst) && 9058 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { 9059 goto err_ret; 9060 } 9061 9062 /* 9063 * We check if an IRE_OFFSUBNET for the addr that goes through 9064 * ipif exists. We need it to determine if the RTF_SETSRC and/or 9065 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may 9066 * propagate its flags to the new ire. 9067 */ 9068 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { 9069 fire = ipif_lookup_multi_ire(ipif, ipha_dst); 9070 ip2dbg(("ip_newroute_ipif: " 9071 "ipif_lookup_multi_ire(" 9072 "ipif %p, dst %08x) = fire %p\n", 9073 (void *)ipif, ntohl(dst), (void *)fire)); 9074 } 9075 9076 if (mctl_present && io->ipsec_out_attach_if) { 9077 attach_ill = ip_grab_attach_ill(NULL, first_mp, 9078 io->ipsec_out_ill_index, B_FALSE, ipst); 9079 9080 /* Failure case frees things for us.
*/ 9081 if (attach_ill == NULL) { 9082 ipif_refrele(ipif); 9083 if (fire != NULL) 9084 ire_refrele(fire); 9085 return; 9086 } 9087 9088 /* 9089 * Check if we need an ire that will not be 9090 * looked up by anybody else i.e. HIDDEN. 9091 */ 9092 if (ill_is_probeonly(attach_ill)) { 9093 ire_marks = IRE_MARK_HIDDEN; 9094 } 9095 /* 9096 * ip_wput passes the right ipif for IPIF_NOFAILOVER 9097 * case. 9098 */ 9099 dst_ill = ipif->ipif_ill; 9100 /* attach_ill has been refheld by ip_grab_attach_ill */ 9101 ASSERT(dst_ill == attach_ill); 9102 } else { 9103 /* 9104 * If the interface belongs to an interface group, 9105 * make sure the next possible interface in the group 9106 * is used. This encourages load spreading among 9107 * peers in an interface group. 9108 * Note: load spreading is disabled for RTF_MULTIRT 9109 * routes. 9110 */ 9111 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9112 (fire->ire_flags & RTF_MULTIRT)) { 9113 /* 9114 * Don't perform outbound load spreading 9115 * in the case of an RTF_MULTIRT issued route, 9116 * we actually typically want to replicate 9117 * outgoing packets through particular 9118 * interfaces. 9119 */ 9120 dst_ill = ipif->ipif_ill; 9121 ill_refhold(dst_ill); 9122 } else { 9123 dst_ill = ip_newroute_get_dst_ill( 9124 ipif->ipif_ill); 9125 } 9126 if (dst_ill == NULL) { 9127 if (ip_debug > 2) { 9128 pr_addr_dbg("ip_newroute_ipif: " 9129 "no dst ill for dst %s\n", 9130 AF_INET, &dst); 9131 } 9132 goto err_ret; 9133 } 9134 } 9135 9136 /* 9137 * Pick a source address preferring non-deprecated ones. 9138 * Unlike ip_newroute, we don't do any source address 9139 * selection here since for multicast it really does not help 9140 * in inbound load spreading as in the unicast case. 9141 */ 9142 if ((flags & RTF_SETSRC) && (fire != NULL) && 9143 (fire->ire_flags & RTF_SETSRC)) { 9144 /* 9145 * As requested by flags, an IRE_OFFSUBNET was looked up 9146 * on that interface. This ire has RTF_SETSRC flag, so 9147 * the source address of the packet must be changed. 9148 * Check that the ipif matching the requested source 9149 * address still exists. 9150 */ 9151 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, 9152 zoneid, NULL, NULL, NULL, NULL, ipst); 9153 } 9154 9155 unspec_src = (connp != NULL && connp->conn_unspec_src); 9156 9157 if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || 9158 (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || 9159 (connp != NULL && ipif->ipif_zoneid != zoneid && 9160 ipif->ipif_zoneid != ALL_ZONES)) && 9161 (src_ipif == NULL) && 9162 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 9163 src_ipif = ipif_select_source(dst_ill, dst, zoneid); 9164 if (src_ipif == NULL) { 9165 if (ip_debug > 2) { 9166 /* ip1dbg */ 9167 pr_addr_dbg("ip_newroute_ipif: " 9168 "no src for dst %s", 9169 AF_INET, &dst); 9170 } 9171 ip1dbg((" through interface %s\n", 9172 dst_ill->ill_name)); 9173 goto err_ret; 9174 } 9175 ipif_refrele(ipif); 9176 ipif = src_ipif; 9177 ipif_refhold(ipif); 9178 } 9179 if (src_ipif == NULL) { 9180 src_ipif = ipif; 9181 ipif_refhold(src_ipif); 9182 } 9183 9184 /* 9185 * Assign a source address while we have the conn. 9186 * We can't have ip_wput_ire pick a source address when the 9187 * packet returns from arp since conn_unspec_src might be set 9188 * and we lose the conn when going through arp. 
9189 */ 9190 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 9191 ipha->ipha_src = src_ipif->ipif_src_addr; 9192 9193 /* 9194 * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible 9195 * that the outgoing interface does not have an interface ire. 9196 */ 9197 if (CLASSD(ipha_dst) && (connp == NULL || 9198 connp->conn_outgoing_ill == NULL) && 9199 infop->ip_opt_ill_index == 0) { 9200 /* ipif_to_ire returns an held ire */ 9201 ire = ipif_to_ire(ipif); 9202 if (ire == NULL) 9203 goto err_ret; 9204 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 9205 goto err_ret; 9206 /* 9207 * ihandle is needed when the ire is added to 9208 * cache table. 9209 */ 9210 save_ire = ire; 9211 ihandle = save_ire->ire_ihandle; 9212 9213 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " 9214 "flags %04x\n", 9215 (void *)ire, (void *)ipif, flags)); 9216 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9217 (fire->ire_flags & RTF_MULTIRT)) { 9218 /* 9219 * As requested by flags, an IRE_OFFSUBNET was 9220 * looked up on that interface. This ire has 9221 * RTF_MULTIRT flag, so the resolution loop will 9222 * be re-entered to resolve additional routes on 9223 * other interfaces. For that purpose, a copy of 9224 * the packet is performed at this point. 9225 */ 9226 fire->ire_last_used_time = lbolt; 9227 copy_mp = copymsg(first_mp); 9228 if (copy_mp) { 9229 MULTIRT_DEBUG_TAG(copy_mp); 9230 } 9231 } 9232 if ((flags & RTF_SETSRC) && (fire != NULL) && 9233 (fire->ire_flags & RTF_SETSRC)) { 9234 /* 9235 * As requested by flags, an IRE_OFFSUBET was 9236 * looked up on that interface. This ire has 9237 * RTF_SETSRC flag, so the source address of the 9238 * packet must be changed. 9239 */ 9240 ipha->ipha_src = fire->ire_src_addr; 9241 } 9242 } else { 9243 ASSERT((connp == NULL) || 9244 (connp->conn_outgoing_ill != NULL) || 9245 (connp->conn_dontroute) || 9246 infop->ip_opt_ill_index != 0); 9247 /* 9248 * The only ways we can come here are: 9249 * 1) IP_BOUND_IF socket option is set 9250 * 2) SO_DONTROUTE socket option is set 9251 * 3) IP_PKTINFO option is passed in as ancillary data. 9252 * In all cases, the new ire will not be added 9253 * into cache table. 9254 */ 9255 ire_marks |= IRE_MARK_NOADD; 9256 } 9257 9258 switch (ipif->ipif_net_type) { 9259 case IRE_IF_NORESOLVER: { 9260 /* We have what we need to build an IRE_CACHE. */ 9261 9262 if ((dst_ill->ill_phys_addr_length != IP_ADDR_LEN) && 9263 (dst_ill->ill_resolver_mp == NULL)) { 9264 ip1dbg(("ip_newroute_ipif: dst_ill %p " 9265 "for IRE_IF_NORESOLVER ire %p has " 9266 "no ill_resolver_mp\n", 9267 (void *)dst_ill, (void *)ire)); 9268 break; 9269 } 9270 9271 /* 9272 * The new ire inherits the IRE_OFFSUBNET flags 9273 * and source address, if this was requested. 9274 */ 9275 ire = ire_create( 9276 (uchar_t *)&dst, /* dest address */ 9277 (uchar_t *)&ip_g_all_ones, /* mask */ 9278 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9279 NULL, /* gateway address */ 9280 &ipif->ipif_mtu, 9281 NULL, /* no src nce */ 9282 dst_ill->ill_rq, /* recv-from queue */ 9283 dst_ill->ill_wq, /* send-to queue */ 9284 IRE_CACHE, 9285 src_ipif, 9286 (save_ire != NULL ? save_ire->ire_mask : 0), 9287 (fire != NULL) ? /* Parent handle */ 9288 fire->ire_phandle : 0, 9289 ihandle, /* Interface handle */ 9290 (fire != NULL) ? 9291 (fire->ire_flags & 9292 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9293 (save_ire == NULL ? 
&ire_uinfo_null : 9294 &save_ire->ire_uinfo), 9295 NULL, 9296 NULL, 9297 ipst); 9298 9299 if (ire == NULL) { 9300 if (save_ire != NULL) 9301 ire_refrele(save_ire); 9302 break; 9303 } 9304 9305 ire->ire_marks |= ire_marks; 9306 9307 /* 9308 * If IRE_MARK_NOADD is set then we need to convert 9309 * the max_fragp to a usable value now. This is 9310 * normally done in ire_add_v[46]. We also need to 9311 * associate the ire with an nce (normally would be 9312 * done in ip_wput_nondata()). 9313 * 9314 * Note that the IRE_MARK_NOADD ires created here 9315 * have a null ire_mp pointer. The null 9316 * value of ire_bucket indicates that they were 9317 * never added. 9318 */ 9319 if (ire->ire_marks & IRE_MARK_NOADD) { 9320 uint_t max_frag; 9321 9322 max_frag = *ire->ire_max_fragp; 9323 ire->ire_max_fragp = NULL; 9324 ire->ire_max_frag = max_frag; 9325 9326 if ((ire->ire_nce = ndp_lookup_v4( 9327 ire_to_ill(ire), 9328 (ire->ire_gateway_addr != INADDR_ANY ? 9329 &ire->ire_gateway_addr : &ire->ire_addr), 9330 B_FALSE)) == NULL) { 9331 if (save_ire != NULL) 9332 ire_refrele(save_ire); 9333 break; 9334 } 9335 ASSERT(ire->ire_nce->nce_state == 9336 ND_REACHABLE); 9337 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 9338 } 9339 9340 /* Prevent save_ire from getting deleted */ 9341 if (save_ire != NULL) { 9342 IRB_REFHOLD(save_ire->ire_bucket); 9343 /* Has it been removed already? */ 9344 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 9345 IRB_REFRELE(save_ire->ire_bucket); 9346 ire_refrele(save_ire); 9347 break; 9348 } 9349 } 9350 9351 ire_add_then_send(q, ire, first_mp); 9352 9353 /* Assert that save_ire is not deleted yet. */ 9354 if (save_ire != NULL) { 9355 ASSERT(save_ire->ire_ptpn != NULL); 9356 IRB_REFRELE(save_ire->ire_bucket); 9357 ire_refrele(save_ire); 9358 save_ire = NULL; 9359 } 9360 if (fire != NULL) { 9361 ire_refrele(fire); 9362 fire = NULL; 9363 } 9364 9365 /* 9366 * The resolution loop is re-entered if this 9367 * was requested through flags and if we 9368 * actually are in a multirouting case. 9369 */ 9370 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9371 boolean_t need_resolve = 9372 ire_multirt_need_resolve(ipha_dst, 9373 MBLK_GETLABEL(copy_mp), ipst); 9374 if (!need_resolve) { 9375 MULTIRT_DEBUG_UNTAG(copy_mp); 9376 freemsg(copy_mp); 9377 copy_mp = NULL; 9378 } else { 9379 /* 9380 * ipif_lookup_group() calls 9381 * ire_lookup_multi() that uses 9382 * ire_ftable_lookup() to find 9383 * an IRE_INTERFACE for the group. 9384 * In the multirt case, 9385 * ire_lookup_multi() then invokes 9386 * ire_multirt_lookup() to find 9387 * the next resolvable ire. 9388 * As a result, we obtain a new 9389 * interface, derived from the 9390 * next ire. 9391 */ 9392 ipif_refrele(ipif); 9393 ipif = ipif_lookup_group(ipha_dst, 9394 zoneid, ipst); 9395 ip2dbg(("ip_newroute_ipif: " 9396 "multirt dst %08x, ipif %p\n", 9397 htonl(dst), (void *)ipif)); 9398 if (ipif != NULL) { 9399 mp = copy_mp; 9400 copy_mp = NULL; 9401 multirt_resolve_next = B_TRUE; 9402 continue; 9403 } else { 9404 freemsg(copy_mp); 9405 } 9406 } 9407 } 9408 if (ipif != NULL) 9409 ipif_refrele(ipif); 9410 ill_refrele(dst_ill); 9411 ipif_refrele(src_ipif); 9412 return; 9413 } 9414 case IRE_IF_RESOLVER: 9415 /* 9416 * We can't build an IRE_CACHE yet, but at least 9417 * we found a resolver that can help. 9418 */ 9419 res_mp = dst_ill->ill_resolver_mp; 9420 if (!OK_RESOLVER_MP(res_mp)) 9421 break; 9422 9423 /* 9424 * We obtain a partial IRE_CACHE which we will pass 9425 * along with the resolver query.
When the response 9426 * comes back it will be there ready for us to add. 9427 * The new ire inherits the IRE_OFFSUBNET flags 9428 * and source address, if this was requested. 9429 * The ire_max_frag is atomically set under the 9430 * irebucket lock in ire_add_v[46]. Only in the 9431 * case of IRE_MARK_NOADD, we set it here itself. 9432 */ 9433 ire = ire_create_mp( 9434 (uchar_t *)&dst, /* dest address */ 9435 (uchar_t *)&ip_g_all_ones, /* mask */ 9436 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9437 NULL, /* gateway address */ 9438 (ire_marks & IRE_MARK_NOADD) ? 9439 ipif->ipif_mtu : 0, /* max_frag */ 9440 NULL, /* no src nce */ 9441 dst_ill->ill_rq, /* recv-from queue */ 9442 dst_ill->ill_wq, /* send-to queue */ 9443 IRE_CACHE, 9444 src_ipif, 9445 (save_ire != NULL ? save_ire->ire_mask : 0), 9446 (fire != NULL) ? /* Parent handle */ 9447 fire->ire_phandle : 0, 9448 ihandle, /* Interface handle */ 9449 (fire != NULL) ? /* flags if any */ 9450 (fire->ire_flags & 9451 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9452 (save_ire == NULL ? &ire_uinfo_null : 9453 &save_ire->ire_uinfo), 9454 NULL, 9455 NULL, 9456 ipst); 9457 9458 if (save_ire != NULL) { 9459 ire_refrele(save_ire); 9460 save_ire = NULL; 9461 } 9462 if (ire == NULL) 9463 break; 9464 9465 ire->ire_marks |= ire_marks; 9466 /* 9467 * Construct message chain for the resolver of the 9468 * form: 9469 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 9470 * 9471 * NOTE : ire will be added later when the response 9472 * comes back from ARP. If the response does not 9473 * come back, ARP frees the packet. For this reason, 9474 * we can't REFHOLD the bucket of save_ire to prevent 9475 * deletions. We may not be able to REFRELE the 9476 * bucket if the response never comes back. 9477 * Thus, before adding the ire, ire_add_v4 will make 9478 * sure that the interface route does not get deleted. 9479 * This is the only case unlike ip_newroute_v6, 9480 * ip_newroute_ipif_v6 where we can always prevent 9481 * deletions because ire_add_then_send is called after 9482 * creating the IRE. 9483 * If IRE_MARK_NOADD is set, then ire_add_then_send 9484 * does not add this IRE into the IRE CACHE. 9485 */ 9486 ASSERT(ire->ire_mp != NULL); 9487 ire->ire_mp->b_cont = first_mp; 9488 /* Have saved_mp handy, for cleanup if canput fails */ 9489 saved_mp = mp; 9490 mp = copyb(res_mp); 9491 if (mp == NULL) { 9492 /* Prepare for cleanup */ 9493 mp = saved_mp; /* pkt */ 9494 ire_delete(ire); /* ire_mp */ 9495 ire = NULL; 9496 if (copy_mp != NULL) { 9497 MULTIRT_DEBUG_UNTAG(copy_mp); 9498 freemsg(copy_mp); 9499 copy_mp = NULL; 9500 } 9501 break; 9502 } 9503 linkb(mp, ire->ire_mp); 9504 9505 /* 9506 * Fill in the source and dest addrs for the resolver. 9507 * NOTE: this depends on memory layouts imposed by 9508 * ill_init(). 9509 */ 9510 areq = (areq_t *)mp->b_rptr; 9511 addrp = (ipaddr_t *)((char *)areq + 9512 areq->areq_sender_addr_offset); 9513 *addrp = ire->ire_src_addr; 9514 addrp = (ipaddr_t *)((char *)areq + 9515 areq->areq_target_addr_offset); 9516 *addrp = dst; 9517 /* Up to the resolver. */ 9518 if (canputnext(dst_ill->ill_rq) && 9519 !(dst_ill->ill_arp_closing)) { 9520 putnext(dst_ill->ill_rq, mp); 9521 /* 9522 * The response will come back in ip_wput 9523 * with db_type IRE_DB_TYPE. 
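 * (Illustrative consumer-side sketch, inferred from the note
 * above: the reply arrives as an mblk whose db_type is
 * IRE_DB_TYPE, so ip_wput can recognize it roughly as
 *
 *	if (DB_TYPE(mp) == IRE_DB_TYPE)
 *		... add the now-complete IRE, resume the send
 *
 * before the queued packet finally goes out.)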
9524 */ 9525 } else { 9526 mp->b_cont = NULL; 9527 freeb(mp); /* areq */ 9528 ire_delete(ire); /* ire_mp */ 9529 saved_mp->b_next = NULL; 9530 saved_mp->b_prev = NULL; 9531 freemsg(first_mp); /* pkt */ 9532 ip2dbg(("ip_newroute_ipif: dropped\n")); 9533 } 9534 9535 if (fire != NULL) { 9536 ire_refrele(fire); 9537 fire = NULL; 9538 } 9539 9540 9541 /* 9542 * The resolution loop is re-entered if this was 9543 * requested through flags and we actually are 9544 * in a multirouting case. 9545 */ 9546 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9547 boolean_t need_resolve = 9548 ire_multirt_need_resolve(ipha_dst, 9549 MBLK_GETLABEL(copy_mp), ipst); 9550 if (!need_resolve) { 9551 MULTIRT_DEBUG_UNTAG(copy_mp); 9552 freemsg(copy_mp); 9553 copy_mp = NULL; 9554 } else { 9555 /* 9556 * ipif_lookup_group() calls 9557 * ire_lookup_multi() that uses 9558 * ire_ftable_lookup() to find 9559 * an IRE_INTERFACE for the group. 9560 * In the multirt case, 9561 * ire_lookup_multi() then invokes 9562 * ire_multirt_lookup() to find 9563 * the next resolvable ire. 9564 * As a result, we obtain an new 9565 * interface, derived from the 9566 * next ire. 9567 */ 9568 ipif_refrele(ipif); 9569 ipif = ipif_lookup_group(ipha_dst, 9570 zoneid, ipst); 9571 if (ipif != NULL) { 9572 mp = copy_mp; 9573 copy_mp = NULL; 9574 multirt_resolve_next = B_TRUE; 9575 continue; 9576 } else { 9577 freemsg(copy_mp); 9578 } 9579 } 9580 } 9581 if (ipif != NULL) 9582 ipif_refrele(ipif); 9583 ill_refrele(dst_ill); 9584 ipif_refrele(src_ipif); 9585 return; 9586 default: 9587 break; 9588 } 9589 } while (multirt_resolve_next); 9590 9591 err_ret: 9592 ip2dbg(("ip_newroute_ipif: dropped\n")); 9593 if (fire != NULL) 9594 ire_refrele(fire); 9595 ipif_refrele(ipif); 9596 /* Did this packet originate externally? */ 9597 if (dst_ill != NULL) 9598 ill_refrele(dst_ill); 9599 if (src_ipif != NULL) 9600 ipif_refrele(src_ipif); 9601 if (mp->b_prev || mp->b_next) { 9602 mp->b_next = NULL; 9603 mp->b_prev = NULL; 9604 } else { 9605 /* 9606 * Since ip_wput() isn't close to finished, we fill 9607 * in enough of the header for credible error reporting. 9608 */ 9609 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 9610 /* Failed */ 9611 freemsg(first_mp); 9612 if (ire != NULL) 9613 ire_refrele(ire); 9614 return; 9615 } 9616 } 9617 /* 9618 * At this point we will have ire only if RTF_BLACKHOLE 9619 * or RTF_REJECT flags are set on the IRE. It will not 9620 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 9621 */ 9622 if (ire != NULL) { 9623 if (ire->ire_flags & RTF_BLACKHOLE) { 9624 ire_refrele(ire); 9625 freemsg(first_mp); 9626 return; 9627 } 9628 ire_refrele(ire); 9629 } 9630 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 9631 } 9632 9633 /* Name/Value Table Lookup Routine */ 9634 char * 9635 ip_nv_lookup(nv_t *nv, int value) 9636 { 9637 if (!nv) 9638 return (NULL); 9639 for (; nv->nv_name; nv++) { 9640 if (nv->nv_value == value) 9641 return (nv->nv_name); 9642 } 9643 return ("unknown"); 9644 } 9645 9646 /* 9647 * This is a module open, i.e. this is a control stream for access 9648 * to a DLPI device. We allocate an ill_t as the instance data in 9649 * this case. 9650 */ 9651 int 9652 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9653 { 9654 ill_t *ill; 9655 int err; 9656 zoneid_t zoneid; 9657 netstack_t *ns; 9658 ip_stack_t *ipst; 9659 9660 /* 9661 * Prevent unprivileged processes from pushing IP so that 9662 * they can't send raw IP. 
9663 */ 9664 if (secpolicy_net_rawaccess(credp) != 0) 9665 return (EPERM); 9666 9667 ns = netstack_find_by_cred(credp); 9668 ASSERT(ns != NULL); 9669 ipst = ns->netstack_ip; 9670 ASSERT(ipst != NULL); 9671 9672 /* 9673 * For exclusive stacks we set the zoneid to zero 9674 * to make IP operate as if in the global zone. 9675 */ 9676 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9677 zoneid = GLOBAL_ZONEID; 9678 else 9679 zoneid = crgetzoneid(credp); 9680 9681 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t)); 9682 q->q_ptr = WR(q)->q_ptr = ill; 9683 ill->ill_ipst = ipst; 9684 ill->ill_zoneid = zoneid; 9685 9686 /* 9687 * ill_init initializes the ill fields and then sends down 9688 * down a DL_INFO_REQ after calling qprocson. 9689 */ 9690 err = ill_init(q, ill); 9691 if (err != 0) { 9692 mi_free(ill); 9693 netstack_rele(ipst->ips_netstack); 9694 q->q_ptr = NULL; 9695 WR(q)->q_ptr = NULL; 9696 return (err); 9697 } 9698 9699 /* ill_init initializes the ipsq marking this thread as writer */ 9700 ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE); 9701 /* Wait for the DL_INFO_ACK */ 9702 mutex_enter(&ill->ill_lock); 9703 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { 9704 /* 9705 * Return value of 0 indicates a pending signal. 9706 */ 9707 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); 9708 if (err == 0) { 9709 mutex_exit(&ill->ill_lock); 9710 (void) ip_close(q, 0); 9711 return (EINTR); 9712 } 9713 } 9714 mutex_exit(&ill->ill_lock); 9715 9716 /* 9717 * ip_rput_other could have set an error in ill_error on 9718 * receipt of M_ERROR. 9719 */ 9720 9721 err = ill->ill_error; 9722 if (err != 0) { 9723 (void) ip_close(q, 0); 9724 return (err); 9725 } 9726 9727 ill->ill_credp = credp; 9728 crhold(credp); 9729 9730 mutex_enter(&ipst->ips_ip_mi_lock); 9731 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, 9732 credp); 9733 mutex_exit(&ipst->ips_ip_mi_lock); 9734 if (err) { 9735 (void) ip_close(q, 0); 9736 return (err); 9737 } 9738 return (0); 9739 } 9740 9741 /* For /dev/ip aka AF_INET open */ 9742 int 9743 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9744 { 9745 return (ip_open(q, devp, flag, sflag, credp, B_FALSE)); 9746 } 9747 9748 /* For /dev/ip6 aka AF_INET6 open */ 9749 int 9750 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9751 { 9752 return (ip_open(q, devp, flag, sflag, credp, B_TRUE)); 9753 } 9754 9755 /* IP open routine. */ 9756 int 9757 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 9758 boolean_t isv6) 9759 { 9760 conn_t *connp; 9761 major_t maj; 9762 zoneid_t zoneid; 9763 netstack_t *ns; 9764 ip_stack_t *ipst; 9765 9766 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 9767 9768 /* Allow reopen. */ 9769 if (q->q_ptr != NULL) 9770 return (0); 9771 9772 if (sflag & MODOPEN) { 9773 /* This is a module open */ 9774 return (ip_modopen(q, devp, flag, sflag, credp)); 9775 } 9776 9777 ns = netstack_find_by_cred(credp); 9778 ASSERT(ns != NULL); 9779 ipst = ns->netstack_ip; 9780 ASSERT(ipst != NULL); 9781 9782 /* 9783 * For exclusive stacks we set the zoneid to zero 9784 * to make IP operate as if in the global zone. 9785 */ 9786 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9787 zoneid = GLOBAL_ZONEID; 9788 else 9789 zoneid = crgetzoneid(credp); 9790 9791 /* 9792 * We are opening as a device. This is an IP client stream, and we 9793 * allocate an conn_t as the instance data. 
9794 */ 9795 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack); 9796 9797 /* 9798 * ipcl_conn_create did a netstack_hold. Undo the hold that was 9799 * done by netstack_find_by_cred() 9800 */ 9801 netstack_rele(ipst->ips_netstack); 9802 9803 connp->conn_zoneid = zoneid; 9804 9805 connp->conn_upq = q; 9806 q->q_ptr = WR(q)->q_ptr = connp; 9807 9808 if (flag & SO_SOCKSTR) 9809 connp->conn_flags |= IPCL_SOCKET; 9810 9811 /* Minor tells us which /dev entry was opened */ 9812 if (isv6) { 9813 connp->conn_flags |= IPCL_ISV6; 9814 connp->conn_af_isv6 = B_TRUE; 9815 ip_setpktversion(connp, isv6, B_FALSE, ipst); 9816 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9817 } else { 9818 connp->conn_af_isv6 = B_FALSE; 9819 connp->conn_pkt_isv6 = B_FALSE; 9820 } 9821 9822 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 9823 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 9824 connp->conn_minor_arena = ip_minor_arena_la; 9825 } else { 9826 /* 9827 * Either minor numbers in the large arena were exhausted 9828 * or a non socket application is doing the open. 9829 * Try to allocate from the small arena. 9830 */ 9831 if ((connp->conn_dev = 9832 inet_minor_alloc(ip_minor_arena_sa)) == 0) { 9833 /* CONN_DEC_REF takes care of netstack_rele() */ 9834 q->q_ptr = WR(q)->q_ptr = NULL; 9835 CONN_DEC_REF(connp); 9836 return (EBUSY); 9837 } 9838 connp->conn_minor_arena = ip_minor_arena_sa; 9839 } 9840 9841 maj = getemajor(*devp); 9842 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9843 9844 /* 9845 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9846 */ 9847 connp->conn_cred = credp; 9848 9849 /* 9850 * Handle IP_RTS_REQUEST and other ioctls which use conn_recv 9851 */ 9852 connp->conn_recv = ip_conn_input; 9853 9854 crhold(connp->conn_cred); 9855 9856 /* 9857 * If the caller has the process-wide flag set, then default to MAC 9858 * exempt mode. This allows read-down to unlabeled hosts. 9859 */ 9860 if (getpflags(NET_MAC_AWARE, credp) != 0) 9861 connp->conn_mac_exempt = B_TRUE; 9862 9863 connp->conn_rq = q; 9864 connp->conn_wq = WR(q); 9865 9866 /* Non-zero default values */ 9867 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9868 9869 /* 9870 * Make the conn globally visible to walkers 9871 */ 9872 ASSERT(connp->conn_ref == 1); 9873 mutex_enter(&connp->conn_lock); 9874 connp->conn_state_flags &= ~CONN_INCIPIENT; 9875 mutex_exit(&connp->conn_lock); 9876 9877 qprocson(q); 9878 9879 return (0); 9880 } 9881 9882 /* 9883 * Change the output format (IPv4 vs. IPv6) for a conn_t. 9884 * Note that there is no race since either ip_output function works - it 9885 * is just an optimization to enter the best ip_output routine directly. 9886 */ 9887 void 9888 ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, 9889 ip_stack_t *ipst) 9890 { 9891 if (isv6) { 9892 if (bump_mib) { 9893 BUMP_MIB(&ipst->ips_ip6_mib, 9894 ipIfStatsOutSwitchIPVersion); 9895 } 9896 connp->conn_send = ip_output_v6; 9897 connp->conn_pkt_isv6 = B_TRUE; 9898 } else { 9899 if (bump_mib) { 9900 BUMP_MIB(&ipst->ips_ip_mib, 9901 ipIfStatsOutSwitchIPVersion); 9902 } 9903 connp->conn_send = ip_output; 9904 connp->conn_pkt_isv6 = B_FALSE; 9905 } 9906 9907 } 9908 9909 /* 9910 * See if IPsec needs loading because of the options in mp. 
9911 */ 9912 static boolean_t 9913 ipsec_opt_present(mblk_t *mp) 9914 { 9915 uint8_t *optcp, *next_optcp, *opt_endcp; 9916 struct opthdr *opt; 9917 struct T_opthdr *topt; 9918 int opthdr_len; 9919 t_uscalar_t optname, optlevel; 9920 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9921 ipsec_req_t *ipsr; 9922 9923 /* 9924 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9925 * return TRUE. 9926 */ 9927 9928 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9929 opt_endcp = optcp + tor->OPT_length; 9930 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9931 opthdr_len = sizeof (struct T_opthdr); 9932 } else { /* O_OPTMGMT_REQ */ 9933 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9934 opthdr_len = sizeof (struct opthdr); 9935 } 9936 for (; optcp < opt_endcp; optcp = next_optcp) { 9937 if (optcp + opthdr_len > opt_endcp) 9938 return (B_FALSE); /* Not enough option header. */ 9939 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9940 topt = (struct T_opthdr *)optcp; 9941 optlevel = topt->level; 9942 optname = topt->name; 9943 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); 9944 } else { 9945 opt = (struct opthdr *)optcp; 9946 optlevel = opt->level; 9947 optname = opt->name; 9948 next_optcp = optcp + opthdr_len + 9949 _TPI_ALIGN_OPT(opt->len); 9950 } 9951 if ((next_optcp < optcp) || /* wraparound pointer space */ 9952 ((next_optcp >= opt_endcp) && /* last option bad len */ 9953 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) 9954 return (B_FALSE); /* bad option buffer */ 9955 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || 9956 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { 9957 /* 9958 * Check to see if it's an all-bypass or all-zeroes 9959 * IPsec request. Don't bother loading IPsec if 9960 * the socket doesn't want to use it. (A good example 9961 * is a bypass request.) 9962 * 9963 * Basically, if any of the non-NEVER bits are set, 9964 * load IPsec. 9965 */ 9966 ipsr = (ipsec_req_t *)(optcp + opthdr_len); 9967 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || 9968 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || 9969 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) 9970 != 0) 9971 return (B_TRUE); 9972 } 9973 } 9974 return (B_FALSE); 9975 } 9976 9977 /* 9978 * If conn is is waiting for ipsec to finish loading, kick it. 9979 */ 9980 /* ARGSUSED */ 9981 static void 9982 conn_restart_ipsec_waiter(conn_t *connp, void *arg) 9983 { 9984 t_scalar_t optreq_prim; 9985 mblk_t *mp; 9986 cred_t *cr; 9987 int err = 0; 9988 9989 /* 9990 * This function is called, after ipsec loading is complete. 9991 * Since IP checks exclusively and atomically (i.e it prevents 9992 * ipsec load from completing until ip_optcom_req completes) 9993 * whether ipsec load is complete, there cannot be a race with IP 9994 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 
9995 */ 9996 mutex_enter(&connp->conn_lock); 9997 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { 9998 ASSERT(connp->conn_ipsec_opt_mp != NULL); 9999 mp = connp->conn_ipsec_opt_mp; 10000 connp->conn_ipsec_opt_mp = NULL; 10001 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; 10002 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp))); 10003 mutex_exit(&connp->conn_lock); 10004 10005 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 10006 10007 optreq_prim = ((union T_primitives *)mp->b_rptr)->type; 10008 if (optreq_prim == T_OPTMGMT_REQ) { 10009 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10010 &ip_opt_obj, B_FALSE); 10011 } else { 10012 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); 10013 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10014 &ip_opt_obj, B_FALSE); 10015 } 10016 if (err != EINPROGRESS) 10017 CONN_OPER_PENDING_DONE(connp); 10018 return; 10019 } 10020 mutex_exit(&connp->conn_lock); 10021 } 10022 10023 /* 10024 * Called from the ipsec_loader thread, outside any perimeter, to tell 10025 * ip qenable any of the queues waiting for the ipsec loader to 10026 * complete. 10027 */ 10028 void 10029 ip_ipsec_load_complete(ipsec_stack_t *ipss) 10030 { 10031 netstack_t *ns = ipss->ipsec_netstack; 10032 10033 ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); 10034 } 10035 10036 /* 10037 * Can't be used. Need to call svr4* -> optset directly. the leaf routine 10038 * determines the grp on which it has to become exclusive, queues the mp 10039 * and sq draining restarts the optmgmt 10040 */ 10041 static boolean_t 10042 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 10043 { 10044 conn_t *connp = Q_TO_CONN(q); 10045 ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; 10046 10047 /* 10048 * Take IPsec requests and treat them special. 10049 */ 10050 if (ipsec_opt_present(mp)) { 10051 /* First check if IPsec is loaded. */ 10052 mutex_enter(&ipss->ipsec_loader_lock); 10053 if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { 10054 mutex_exit(&ipss->ipsec_loader_lock); 10055 return (B_FALSE); 10056 } 10057 mutex_enter(&connp->conn_lock); 10058 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 10059 10060 ASSERT(connp->conn_ipsec_opt_mp == NULL); 10061 connp->conn_ipsec_opt_mp = mp; 10062 mutex_exit(&connp->conn_lock); 10063 mutex_exit(&ipss->ipsec_loader_lock); 10064 10065 ipsec_loader_loadnow(ipss); 10066 return (B_TRUE); 10067 } 10068 return (B_FALSE); 10069 } 10070 10071 /* 10072 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 10073 * all of them are copied to the conn_t. If the req is "zero", the policy is 10074 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 10075 * fields. 10076 * We keep only the latest setting of the policy and thus policy setting 10077 * is not incremental/cumulative. 10078 * 10079 * Requests to set policies with multiple alternative actions will 10080 * go through a different API. 
10081 */ 10082 int 10083 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 10084 { 10085 uint_t ah_req = 0; 10086 uint_t esp_req = 0; 10087 uint_t se_req = 0; 10088 ipsec_selkey_t sel; 10089 ipsec_act_t *actp = NULL; 10090 uint_t nact; 10091 ipsec_policy_t *pin4 = NULL, *pout4 = NULL; 10092 ipsec_policy_t *pin6 = NULL, *pout6 = NULL; 10093 ipsec_policy_root_t *pr; 10094 ipsec_policy_head_t *ph; 10095 int fam; 10096 boolean_t is_pol_reset; 10097 int error = 0; 10098 netstack_t *ns = connp->conn_netstack; 10099 ip_stack_t *ipst = ns->netstack_ip; 10100 ipsec_stack_t *ipss = ns->netstack_ipsec; 10101 10102 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 10103 10104 /* 10105 * The IP_SEC_OPT option does not allow variable length parameters, 10106 * hence a request cannot be NULL. 10107 */ 10108 if (req == NULL) 10109 return (EINVAL); 10110 10111 ah_req = req->ipsr_ah_req; 10112 esp_req = req->ipsr_esp_req; 10113 se_req = req->ipsr_self_encap_req; 10114 10115 /* Don't allow setting self-encap without one or more of AH/ESP. */ 10116 if (se_req != 0 && esp_req == 0 && ah_req == 0) 10117 return (EINVAL); 10118 10119 /* 10120 * Are we dealing with a request to reset the policy (i.e. 10121 * zero requests). 10122 */ 10123 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 10124 (esp_req & REQ_MASK) == 0 && 10125 (se_req & REQ_MASK) == 0); 10126 10127 if (!is_pol_reset) { 10128 /* 10129 * If we couldn't load IPsec, fail with "protocol 10130 * not supported". 10131 * IPsec may not have been loaded for a request with zero 10132 * policies, so we don't fail in this case. 10133 */ 10134 mutex_enter(&ipss->ipsec_loader_lock); 10135 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 10136 mutex_exit(&ipss->ipsec_loader_lock); 10137 return (EPROTONOSUPPORT); 10138 } 10139 mutex_exit(&ipss->ipsec_loader_lock); 10140 10141 /* 10142 * Test for valid requests. Invalid algorithms 10143 * need to be tested by IPsec code because new 10144 * algorithms can be added dynamically. 10145 */ 10146 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10147 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10148 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 10149 return (EINVAL); 10150 } 10151 10152 /* 10153 * Only privileged users can issue these 10154 * requests. 10155 */ 10156 if (((ah_req & IPSEC_PREF_NEVER) || 10157 (esp_req & IPSEC_PREF_NEVER) || 10158 (se_req & IPSEC_PREF_NEVER)) && 10159 secpolicy_ip_config(cr, B_FALSE) != 0) { 10160 return (EPERM); 10161 } 10162 10163 /* 10164 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 10165 * are mutually exclusive. 10166 */ 10167 if (((ah_req & REQ_MASK) == REQ_MASK) || 10168 ((esp_req & REQ_MASK) == REQ_MASK) || 10169 ((se_req & REQ_MASK) == REQ_MASK)) { 10170 /* Both of them are set */ 10171 return (EINVAL); 10172 } 10173 } 10174 10175 mutex_enter(&connp->conn_lock); 10176 10177 /* 10178 * If we have already cached policies in ip_bind_connected*(), don't 10179 * let them change now. We cache policies for connections 10180 * whose src,dst [addr, port] is known. 10181 */ 10182 if (connp->conn_policy_cached) { 10183 mutex_exit(&connp->conn_lock); 10184 return (EINVAL); 10185 } 10186 10187 /* 10188 * We have a zero policies, reset the connection policy if already 10189 * set. This will cause the connection to inherit the 10190 * global policy, if any. 
10191 */ 10192 if (is_pol_reset) { 10193 if (connp->conn_policy != NULL) { 10194 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); 10195 connp->conn_policy = NULL; 10196 } 10197 connp->conn_flags &= ~IPCL_CHECK_POLICY; 10198 connp->conn_in_enforce_policy = B_FALSE; 10199 connp->conn_out_enforce_policy = B_FALSE; 10200 mutex_exit(&connp->conn_lock); 10201 return (0); 10202 } 10203 10204 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy, 10205 ipst->ips_netstack); 10206 if (ph == NULL) 10207 goto enomem; 10208 10209 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack); 10210 if (actp == NULL) 10211 goto enomem; 10212 10213 /* 10214 * Always allocate IPv4 policy entries, since they can also 10215 * apply to ipv6 sockets being used in ipv4-compat mode. 10216 */ 10217 bzero(&sel, sizeof (sel)); 10218 sel.ipsl_valid = IPSL_IPV4; 10219 10220 pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10221 ipst->ips_netstack); 10222 if (pin4 == NULL) 10223 goto enomem; 10224 10225 pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10226 ipst->ips_netstack); 10227 if (pout4 == NULL) 10228 goto enomem; 10229 10230 if (connp->conn_af_isv6) { 10231 /* 10232 * We're looking at a v6 socket, also allocate the 10233 * v6-specific entries... 10234 */ 10235 sel.ipsl_valid = IPSL_IPV6; 10236 pin6 = ipsec_policy_create(&sel, actp, nact, 10237 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10238 if (pin6 == NULL) 10239 goto enomem; 10240 10241 pout6 = ipsec_policy_create(&sel, actp, nact, 10242 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10243 if (pout6 == NULL) 10244 goto enomem; 10245 10246 /* 10247 * .. and file them away in the right place. 10248 */ 10249 fam = IPSEC_AF_V6; 10250 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10251 HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]); 10252 ipsec_insert_always(&ph->iph_rulebyid, pin6); 10253 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10254 HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]); 10255 ipsec_insert_always(&ph->iph_rulebyid, pout6); 10256 } 10257 10258 ipsec_actvec_free(actp, nact); 10259 10260 /* 10261 * File the v4 policies. 10262 */ 10263 fam = IPSEC_AF_V4; 10264 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10265 HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]); 10266 ipsec_insert_always(&ph->iph_rulebyid, pin4); 10267 10268 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10269 HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]); 10270 ipsec_insert_always(&ph->iph_rulebyid, pout4); 10271 10272 /* 10273 * If the requests need security, set enforce_policy. 10274 * If the requests are IPSEC_PREF_NEVER, one should 10275 * still set conn_out_enforce_policy so that an ipsec_out 10276 * gets attached in ip_wput. This is needed so that 10277 * for connections that we don't cache policy in ip_bind, 10278 * if global policy matches in ip_wput_attach_policy, we 10279 * don't wrongly inherit global policy. Similarly, we need 10280 * to set conn_in_enforce_policy also so that we don't verify 10281 * policy wrongly. 10282 */ 10283 if ((ah_req & REQ_MASK) != 0 || 10284 (esp_req & REQ_MASK) != 0 || 10285 (se_req & REQ_MASK) != 0) { 10286 connp->conn_in_enforce_policy = B_TRUE; 10287 connp->conn_out_enforce_policy = B_TRUE; 10288 connp->conn_flags |= IPCL_CHECK_POLICY; 10289 } 10290 10291 mutex_exit(&connp->conn_lock); 10292 return (error); 10293 #undef REQ_MASK 10294 10295 /* 10296 * Common memory-allocation-failure exit path. 
10297 */ 10298 enomem: 10299 mutex_exit(&connp->conn_lock); 10300 if (actp != NULL) 10301 ipsec_actvec_free(actp, nact); 10302 if (pin4 != NULL) 10303 IPPOL_REFRELE(pin4, ipst->ips_netstack); 10304 if (pout4 != NULL) 10305 IPPOL_REFRELE(pout4, ipst->ips_netstack); 10306 if (pin6 != NULL) 10307 IPPOL_REFRELE(pin6, ipst->ips_netstack); 10308 if (pout6 != NULL) 10309 IPPOL_REFRELE(pout6, ipst->ips_netstack); 10310 return (ENOMEM); 10311 } 10312 10313 /* 10314 * Only for options that pass in an IP addr. Currently only V4 options 10315 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 10316 * So this function assumes level is IPPROTO_IP 10317 */ 10318 int 10319 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 10320 mblk_t *first_mp) 10321 { 10322 ipif_t *ipif = NULL; 10323 int error; 10324 ill_t *ill; 10325 int zoneid; 10326 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10327 10328 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 10329 10330 if (addr != INADDR_ANY || checkonly) { 10331 ASSERT(connp != NULL); 10332 zoneid = IPCL_ZONEID(connp); 10333 if (option == IP_NEXTHOP) { 10334 ipif = ipif_lookup_onlink_addr(addr, 10335 connp->conn_zoneid, ipst); 10336 } else { 10337 ipif = ipif_lookup_addr(addr, NULL, zoneid, 10338 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 10339 &error, ipst); 10340 } 10341 if (ipif == NULL) { 10342 if (error == EINPROGRESS) 10343 return (error); 10344 else if ((option == IP_MULTICAST_IF) || 10345 (option == IP_NEXTHOP)) 10346 return (EHOSTUNREACH); 10347 else 10348 return (EINVAL); 10349 } else if (checkonly) { 10350 if (option == IP_MULTICAST_IF) { 10351 ill = ipif->ipif_ill; 10352 /* not supported by the virtual network iface */ 10353 if (IS_VNI(ill)) { 10354 ipif_refrele(ipif); 10355 return (EINVAL); 10356 } 10357 } 10358 ipif_refrele(ipif); 10359 return (0); 10360 } 10361 ill = ipif->ipif_ill; 10362 mutex_enter(&connp->conn_lock); 10363 mutex_enter(&ill->ill_lock); 10364 if ((ill->ill_state_flags & ILL_CONDEMNED) || 10365 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 10366 mutex_exit(&ill->ill_lock); 10367 mutex_exit(&connp->conn_lock); 10368 ipif_refrele(ipif); 10369 return (option == IP_MULTICAST_IF ? 10370 EHOSTUNREACH : EINVAL); 10371 } 10372 } else { 10373 mutex_enter(&connp->conn_lock); 10374 } 10375 10376 /* None of the options below are supported on the VNI */ 10377 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 10378 mutex_exit(&ill->ill_lock); 10379 mutex_exit(&connp->conn_lock); 10380 ipif_refrele(ipif); 10381 return (EINVAL); 10382 } 10383 10384 switch (option) { 10385 case IP_DONTFAILOVER_IF: 10386 /* 10387 * This option is used by in.mpathd to ensure 10388 * that IPMP probe packets only go out on the 10389 * test interfaces. in.mpathd sets this option 10390 * on the non-failover interfaces. 10391 * For backward compatibility, this option 10392 * implicitly sets IP_MULTICAST_IF, as used 10393 * be done in bind(), so that ip_wput gets 10394 * this ipif to send mcast packets. 
10395 */ 10396 if (ipif != NULL) { 10397 ASSERT(addr != INADDR_ANY); 10398 connp->conn_nofailover_ill = ipif->ipif_ill; 10399 connp->conn_multicast_ipif = ipif; 10400 } else { 10401 ASSERT(addr == INADDR_ANY); 10402 connp->conn_nofailover_ill = NULL; 10403 connp->conn_multicast_ipif = NULL; 10404 } 10405 break; 10406 10407 case IP_MULTICAST_IF: 10408 connp->conn_multicast_ipif = ipif; 10409 break; 10410 case IP_NEXTHOP: 10411 connp->conn_nexthop_v4 = addr; 10412 connp->conn_nexthop_set = B_TRUE; 10413 break; 10414 } 10415 10416 if (ipif != NULL) { 10417 mutex_exit(&ill->ill_lock); 10418 mutex_exit(&connp->conn_lock); 10419 ipif_refrele(ipif); 10420 return (0); 10421 } 10422 mutex_exit(&connp->conn_lock); 10423 /* We succeded in cleared the option */ 10424 return (0); 10425 } 10426 10427 /* 10428 * For options that pass in an ifindex specifying the ill. V6 options always 10429 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 10430 */ 10431 int 10432 ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, 10433 int level, int option, mblk_t *first_mp) 10434 { 10435 ill_t *ill = NULL; 10436 int error = 0; 10437 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10438 10439 ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); 10440 if (ifindex != 0) { 10441 ASSERT(connp != NULL); 10442 ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), 10443 first_mp, ip_restart_optmgmt, &error, ipst); 10444 if (ill != NULL) { 10445 if (checkonly) { 10446 /* not supported by the virtual network iface */ 10447 if (IS_VNI(ill)) { 10448 ill_refrele(ill); 10449 return (EINVAL); 10450 } 10451 ill_refrele(ill); 10452 return (0); 10453 } 10454 if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, 10455 0, NULL)) { 10456 ill_refrele(ill); 10457 ill = NULL; 10458 mutex_enter(&connp->conn_lock); 10459 goto setit; 10460 } 10461 mutex_enter(&connp->conn_lock); 10462 mutex_enter(&ill->ill_lock); 10463 if (ill->ill_state_flags & ILL_CONDEMNED) { 10464 mutex_exit(&ill->ill_lock); 10465 mutex_exit(&connp->conn_lock); 10466 ill_refrele(ill); 10467 ill = NULL; 10468 mutex_enter(&connp->conn_lock); 10469 } 10470 goto setit; 10471 } else if (error == EINPROGRESS) { 10472 return (error); 10473 } else { 10474 error = 0; 10475 } 10476 } 10477 mutex_enter(&connp->conn_lock); 10478 setit: 10479 ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); 10480 10481 /* 10482 * The options below assume that the ILL (if any) transmits and/or 10483 * receives traffic. Neither of which is true for the virtual network 10484 * interface, so fail setting these on a VNI. 10485 */ 10486 if (IS_VNI(ill)) { 10487 ASSERT(ill != NULL); 10488 mutex_exit(&ill->ill_lock); 10489 mutex_exit(&connp->conn_lock); 10490 ill_refrele(ill); 10491 return (EINVAL); 10492 } 10493 10494 if (level == IPPROTO_IP) { 10495 switch (option) { 10496 case IP_BOUND_IF: 10497 connp->conn_incoming_ill = ill; 10498 connp->conn_outgoing_ill = ill; 10499 connp->conn_orig_bound_ifindex = (ill == NULL) ? 10500 0 : ifindex; 10501 break; 10502 10503 case IP_MULTICAST_IF: 10504 /* 10505 * This option is an internal special. The socket 10506 * level IP_MULTICAST_IF specifies an 'ipaddr' and 10507 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF 10508 * specifies an ifindex and we try first on V6 ill's. 10509 * If we don't find one, we they try using on v4 ill's 10510 * intenally and we come here. 
10511 */ 10512 if (!checkonly && ill != NULL) { 10513 ipif_t *ipif; 10514 ipif = ill->ill_ipif; 10515 10516 if (ipif->ipif_state_flags & IPIF_CONDEMNED) { 10517 mutex_exit(&ill->ill_lock); 10518 mutex_exit(&connp->conn_lock); 10519 ill_refrele(ill); 10520 ill = NULL; 10521 mutex_enter(&connp->conn_lock); 10522 } else { 10523 connp->conn_multicast_ipif = ipif; 10524 } 10525 } 10526 break; 10527 10528 case IP_DHCPINIT_IF: 10529 if (connp->conn_dhcpinit_ill != NULL) { 10530 /* 10531 * We've locked the conn so conn_cleanup_ill() 10532 * cannot clear conn_dhcpinit_ill -- so it's 10533 * safe to access the ill. 10534 */ 10535 ill_t *oill = connp->conn_dhcpinit_ill; 10536 10537 ASSERT(oill->ill_dhcpinit != 0); 10538 atomic_dec_32(&oill->ill_dhcpinit); 10539 connp->conn_dhcpinit_ill = NULL; 10540 } 10541 10542 if (ill != NULL) { 10543 connp->conn_dhcpinit_ill = ill; 10544 atomic_inc_32(&ill->ill_dhcpinit); 10545 } 10546 break; 10547 } 10548 } else { 10549 switch (option) { 10550 case IPV6_BOUND_IF: 10551 connp->conn_incoming_ill = ill; 10552 connp->conn_outgoing_ill = ill; 10553 connp->conn_orig_bound_ifindex = (ill == NULL) ? 10554 0 : ifindex; 10555 break; 10556 10557 case IPV6_BOUND_PIF: 10558 /* 10559 * Limit all transmit to this ill. 10560 * Unlike IPV6_BOUND_IF, using this option 10561 * prevents load spreading and failover from 10562 * happening when the interface is part of the 10563 * group. That's why we don't need to remember 10564 * the ifindex in orig_bound_ifindex as in 10565 * IPV6_BOUND_IF. 10566 */ 10567 connp->conn_outgoing_pill = ill; 10568 break; 10569 10570 case IPV6_DONTFAILOVER_IF: 10571 /* 10572 * This option is used by in.mpathd to ensure 10573 * that IPMP probe packets only go out on the 10574 * test interfaces. in.mpathd sets this option 10575 * on the non-failover interfaces. 10576 */ 10577 connp->conn_nofailover_ill = ill; 10578 /* 10579 * For backward compatibility, this option 10580 * implicitly sets ip_multicast_ill as used in 10581 * IPV6_MULTICAST_IF so that ip_wput gets 10582 * this ill to send mcast packets. 10583 */ 10584 connp->conn_multicast_ill = ill; 10585 connp->conn_orig_multicast_ifindex = (ill == NULL) ? 10586 0 : ifindex; 10587 break; 10588 10589 case IPV6_MULTICAST_IF: 10590 /* 10591 * Set conn_multicast_ill to be the IPv6 ill. 10592 * Set conn_multicast_ipif to be an IPv4 ipif 10593 * for ifindex to make IPv4 mapped addresses 10594 * on PF_INET6 sockets honor IPV6_MULTICAST_IF. 10595 * Even if no IPv6 ill exists for the ifindex 10596 * we need to check for an IPv4 ifindex in order 10597 * for this to work with mapped addresses. In that 10598 * case only set conn_multicast_ipif. 10599 */ 10600 if (!checkonly) { 10601 if (ifindex == 0) { 10602 connp->conn_multicast_ill = NULL; 10603 connp->conn_orig_multicast_ifindex = 0; 10604 connp->conn_multicast_ipif = NULL; 10605 } else if (ill != NULL) { 10606 connp->conn_multicast_ill = ill; 10607 connp->conn_orig_multicast_ifindex = 10608 ifindex; 10609 } 10610 } 10611 break; 10612 } 10613 } 10614 10615 if (ill != NULL) { 10616 mutex_exit(&ill->ill_lock); 10617 mutex_exit(&connp->conn_lock); 10618 ill_refrele(ill); 10619 return (0); 10620 } 10621 mutex_exit(&connp->conn_lock); 10622 /* 10623 * We succeeded in clearing the option (ifindex == 0) or failed to 10624 * locate the ill and could not set the option (ifindex != 0) 10625 */ 10626 return (ifindex == 0 ? 0 : EINVAL); 10627 } 10628 10629 /* This routine sets socket options. 
*/ 10630 /* ARGSUSED */ 10631 int 10632 ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, 10633 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 10634 void *dummy, cred_t *cr, mblk_t *first_mp) 10635 { 10636 int *i1 = (int *)invalp; 10637 conn_t *connp = Q_TO_CONN(q); 10638 int error = 0; 10639 boolean_t checkonly; 10640 ire_t *ire; 10641 boolean_t found; 10642 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10643 10644 switch (optset_context) { 10645 10646 case SETFN_OPTCOM_CHECKONLY: 10647 checkonly = B_TRUE; 10648 /* 10649 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 10650 * inlen != 0 implies value supplied and 10651 * we have to "pretend" to set it. 10652 * inlen == 0 implies that there is no 10653 * value part in T_CHECK request and just validation 10654 * done elsewhere should be enough, we just return here. 10655 */ 10656 if (inlen == 0) { 10657 *outlenp = 0; 10658 return (0); 10659 } 10660 break; 10661 case SETFN_OPTCOM_NEGOTIATE: 10662 case SETFN_UD_NEGOTIATE: 10663 case SETFN_CONN_NEGOTIATE: 10664 checkonly = B_FALSE; 10665 break; 10666 default: 10667 /* 10668 * We should never get here 10669 */ 10670 *outlenp = 0; 10671 return (EINVAL); 10672 } 10673 10674 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 10675 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 10676 10677 /* 10678 * For fixed length options, no sanity check 10679 * of passed in length is done. It is assumed *_optcom_req() 10680 * routines do the right thing. 10681 */ 10682 10683 switch (level) { 10684 case SOL_SOCKET: 10685 /* 10686 * conn_lock protects the bitfields, and is used to 10687 * set the fields atomically. 10688 */ 10689 switch (name) { 10690 case SO_BROADCAST: 10691 if (!checkonly) { 10692 /* TODO: use value someplace? */ 10693 mutex_enter(&connp->conn_lock); 10694 connp->conn_broadcast = *i1 ? 1 : 0; 10695 mutex_exit(&connp->conn_lock); 10696 } 10697 break; /* goto sizeof (int) option return */ 10698 case SO_USELOOPBACK: 10699 if (!checkonly) { 10700 /* TODO: use value someplace? */ 10701 mutex_enter(&connp->conn_lock); 10702 connp->conn_loopback = *i1 ? 1 : 0; 10703 mutex_exit(&connp->conn_lock); 10704 } 10705 break; /* goto sizeof (int) option return */ 10706 case SO_DONTROUTE: 10707 if (!checkonly) { 10708 mutex_enter(&connp->conn_lock); 10709 connp->conn_dontroute = *i1 ? 1 : 0; 10710 mutex_exit(&connp->conn_lock); 10711 } 10712 break; /* goto sizeof (int) option return */ 10713 case SO_REUSEADDR: 10714 if (!checkonly) { 10715 mutex_enter(&connp->conn_lock); 10716 connp->conn_reuseaddr = *i1 ? 1 : 0; 10717 mutex_exit(&connp->conn_lock); 10718 } 10719 break; /* goto sizeof (int) option return */ 10720 case SO_PROTOTYPE: 10721 if (!checkonly) { 10722 mutex_enter(&connp->conn_lock); 10723 connp->conn_proto = *i1; 10724 mutex_exit(&connp->conn_lock); 10725 } 10726 break; /* goto sizeof (int) option return */ 10727 case SO_ALLZONES: 10728 if (!checkonly) { 10729 mutex_enter(&connp->conn_lock); 10730 if (IPCL_IS_BOUND(connp)) { 10731 mutex_exit(&connp->conn_lock); 10732 return (EINVAL); 10733 } 10734 connp->conn_allzones = *i1 != 0 ? 1 : 0; 10735 mutex_exit(&connp->conn_lock); 10736 } 10737 break; /* goto sizeof (int) option return */ 10738 case SO_ANON_MLP: 10739 if (!checkonly) { 10740 mutex_enter(&connp->conn_lock); 10741 connp->conn_anon_mlp = *i1 != 0 ? 
1 : 0; 10742 mutex_exit(&connp->conn_lock); 10743 } 10744 break; /* goto sizeof (int) option return */ 10745 case SO_MAC_EXEMPT: 10746 if (secpolicy_net_mac_aware(cr) != 0 || 10747 IPCL_IS_BOUND(connp)) 10748 return (EACCES); 10749 if (!checkonly) { 10750 mutex_enter(&connp->conn_lock); 10751 connp->conn_mac_exempt = *i1 != 0 ? 1 : 0; 10752 mutex_exit(&connp->conn_lock); 10753 } 10754 break; /* goto sizeof (int) option return */ 10755 default: 10756 /* 10757 * "soft" error (negative) 10758 * option not handled at this level 10759 * Note: Do not modify *outlenp 10760 */ 10761 return (-EINVAL); 10762 } 10763 break; 10764 case IPPROTO_IP: 10765 switch (name) { 10766 case IP_NEXTHOP: 10767 if (secpolicy_ip_config(cr, B_FALSE) != 0) 10768 return (EPERM); 10769 /* FALLTHRU */ 10770 case IP_MULTICAST_IF: 10771 case IP_DONTFAILOVER_IF: { 10772 ipaddr_t addr = *i1; 10773 10774 error = ip_opt_set_ipif(connp, addr, checkonly, name, 10775 first_mp); 10776 if (error != 0) 10777 return (error); 10778 break; /* goto sizeof (int) option return */ 10779 } 10780 10781 case IP_MULTICAST_TTL: 10782 /* Recorded in transport above IP */ 10783 *outvalp = *invalp; 10784 *outlenp = sizeof (uchar_t); 10785 return (0); 10786 case IP_MULTICAST_LOOP: 10787 if (!checkonly) { 10788 mutex_enter(&connp->conn_lock); 10789 connp->conn_multicast_loop = *invalp ? 1 : 0; 10790 mutex_exit(&connp->conn_lock); 10791 } 10792 *outvalp = *invalp; 10793 *outlenp = sizeof (uchar_t); 10794 return (0); 10795 case IP_ADD_MEMBERSHIP: 10796 case MCAST_JOIN_GROUP: 10797 case IP_DROP_MEMBERSHIP: 10798 case MCAST_LEAVE_GROUP: { 10799 struct ip_mreq *mreqp; 10800 struct group_req *greqp; 10801 ire_t *ire; 10802 boolean_t done = B_FALSE; 10803 ipaddr_t group, ifaddr; 10804 struct sockaddr_in *sin; 10805 uint32_t *ifindexp; 10806 boolean_t mcast_opt = B_TRUE; 10807 mcast_record_t fmode; 10808 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10809 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10810 10811 switch (name) { 10812 case IP_ADD_MEMBERSHIP: 10813 mcast_opt = B_FALSE; 10814 /* FALLTHRU */ 10815 case MCAST_JOIN_GROUP: 10816 fmode = MODE_IS_EXCLUDE; 10817 optfn = ip_opt_add_group; 10818 break; 10819 10820 case IP_DROP_MEMBERSHIP: 10821 mcast_opt = B_FALSE; 10822 /* FALLTHRU */ 10823 case MCAST_LEAVE_GROUP: 10824 fmode = MODE_IS_INCLUDE; 10825 optfn = ip_opt_delete_group; 10826 break; 10827 } 10828 10829 if (mcast_opt) { 10830 greqp = (struct group_req *)i1; 10831 sin = (struct sockaddr_in *)&greqp->gr_group; 10832 if (sin->sin_family != AF_INET) { 10833 *outlenp = 0; 10834 return (ENOPROTOOPT); 10835 } 10836 group = (ipaddr_t)sin->sin_addr.s_addr; 10837 ifaddr = INADDR_ANY; 10838 ifindexp = &greqp->gr_interface; 10839 } else { 10840 mreqp = (struct ip_mreq *)i1; 10841 group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; 10842 ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; 10843 ifindexp = NULL; 10844 } 10845 10846 /* 10847 * In the multirouting case, we need to replicate 10848 * the request on all interfaces that will take part 10849 * in replication. We do so because multirouting is 10850 * reflective, thus we will probably receive multi- 10851 * casts on those interfaces. 10852 * The ip_multirt_apply_membership() succeeds if the 10853 * operation succeeds on at least one interface. 
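* (The RTF_MULTIRT case is detected below via an IRE_HOST forwarding-table lookup on the group address and a check of the ire_flags.)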
10854 */ 10855 ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, 10856 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10857 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10858 if (ire != NULL) { 10859 if (ire->ire_flags & RTF_MULTIRT) { 10860 error = ip_multirt_apply_membership( 10861 optfn, ire, connp, checkonly, group, 10862 fmode, INADDR_ANY, first_mp); 10863 done = B_TRUE; 10864 } 10865 ire_refrele(ire); 10866 } 10867 if (!done) { 10868 error = optfn(connp, checkonly, group, ifaddr, 10869 ifindexp, fmode, INADDR_ANY, first_mp); 10870 } 10871 if (error) { 10872 /* 10873 * EINPROGRESS is a soft error, needs retry 10874 * so don't make *outlenp zero. 10875 */ 10876 if (error != EINPROGRESS) 10877 *outlenp = 0; 10878 return (error); 10879 } 10880 /* OK return - copy input buffer into output buffer */ 10881 if (invalp != outvalp) { 10882 /* don't trust bcopy for identical src/dst */ 10883 bcopy(invalp, outvalp, inlen); 10884 } 10885 *outlenp = inlen; 10886 return (0); 10887 } 10888 case IP_BLOCK_SOURCE: 10889 case IP_UNBLOCK_SOURCE: 10890 case IP_ADD_SOURCE_MEMBERSHIP: 10891 case IP_DROP_SOURCE_MEMBERSHIP: 10892 case MCAST_BLOCK_SOURCE: 10893 case MCAST_UNBLOCK_SOURCE: 10894 case MCAST_JOIN_SOURCE_GROUP: 10895 case MCAST_LEAVE_SOURCE_GROUP: { 10896 struct ip_mreq_source *imreqp; 10897 struct group_source_req *gsreqp; 10898 in_addr_t grp, src, ifaddr = INADDR_ANY; 10899 uint32_t ifindex = 0; 10900 mcast_record_t fmode; 10901 struct sockaddr_in *sin; 10902 ire_t *ire; 10903 boolean_t mcast_opt = B_TRUE, done = B_FALSE; 10904 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10905 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10906 10907 switch (name) { 10908 case IP_BLOCK_SOURCE: 10909 mcast_opt = B_FALSE; 10910 /* FALLTHRU */ 10911 case MCAST_BLOCK_SOURCE: 10912 fmode = MODE_IS_EXCLUDE; 10913 optfn = ip_opt_add_group; 10914 break; 10915 10916 case IP_UNBLOCK_SOURCE: 10917 mcast_opt = B_FALSE; 10918 /* FALLTHRU */ 10919 case MCAST_UNBLOCK_SOURCE: 10920 fmode = MODE_IS_EXCLUDE; 10921 optfn = ip_opt_delete_group; 10922 break; 10923 10924 case IP_ADD_SOURCE_MEMBERSHIP: 10925 mcast_opt = B_FALSE; 10926 /* FALLTHRU */ 10927 case MCAST_JOIN_SOURCE_GROUP: 10928 fmode = MODE_IS_INCLUDE; 10929 optfn = ip_opt_add_group; 10930 break; 10931 10932 case IP_DROP_SOURCE_MEMBERSHIP: 10933 mcast_opt = B_FALSE; 10934 /* FALLTHRU */ 10935 case MCAST_LEAVE_SOURCE_GROUP: 10936 fmode = MODE_IS_INCLUDE; 10937 optfn = ip_opt_delete_group; 10938 break; 10939 } 10940 10941 if (mcast_opt) { 10942 gsreqp = (struct group_source_req *)i1; 10943 if (gsreqp->gsr_group.ss_family != AF_INET) { 10944 *outlenp = 0; 10945 return (ENOPROTOOPT); 10946 } 10947 sin = (struct sockaddr_in *)&gsreqp->gsr_group; 10948 grp = (ipaddr_t)sin->sin_addr.s_addr; 10949 sin = (struct sockaddr_in *)&gsreqp->gsr_source; 10950 src = (ipaddr_t)sin->sin_addr.s_addr; 10951 ifindex = gsreqp->gsr_interface; 10952 } else { 10953 imreqp = (struct ip_mreq_source *)i1; 10954 grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; 10955 src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; 10956 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; 10957 } 10958 10959 /* 10960 * In the multirouting case, we need to replicate 10961 * the request as noted in the mcast cases above. 
10962 */ 10963 ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, 10964 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10965 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10966 if (ire != NULL) { 10967 if (ire->ire_flags & RTF_MULTIRT) { 10968 error = ip_multirt_apply_membership( 10969 optfn, ire, connp, checkonly, grp, 10970 fmode, src, first_mp); 10971 done = B_TRUE; 10972 } 10973 ire_refrele(ire); 10974 } 10975 if (!done) { 10976 error = optfn(connp, checkonly, grp, ifaddr, 10977 &ifindex, fmode, src, first_mp); 10978 } 10979 if (error != 0) { 10980 /* 10981 * EINPROGRESS is a soft error, needs retry 10982 * so don't make *outlenp zero. 10983 */ 10984 if (error != EINPROGRESS) 10985 *outlenp = 0; 10986 return (error); 10987 } 10988 /* OK return - copy input buffer into output buffer */ 10989 if (invalp != outvalp) { 10990 bcopy(invalp, outvalp, inlen); 10991 } 10992 *outlenp = inlen; 10993 return (0); 10994 } 10995 case IP_SEC_OPT: 10996 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 10997 if (error != 0) { 10998 *outlenp = 0; 10999 return (error); 11000 } 11001 break; 11002 case IP_HDRINCL: 11003 case IP_OPTIONS: 11004 case T_IP_OPTIONS: 11005 case IP_TOS: 11006 case T_IP_TOS: 11007 case IP_TTL: 11008 case IP_RECVDSTADDR: 11009 case IP_RECVOPTS: 11010 /* OK return - copy input buffer into output buffer */ 11011 if (invalp != outvalp) { 11012 /* don't trust bcopy for identical src/dst */ 11013 bcopy(invalp, outvalp, inlen); 11014 } 11015 *outlenp = inlen; 11016 return (0); 11017 case IP_RECVIF: 11018 /* Retrieve the inbound interface index */ 11019 if (!checkonly) { 11020 mutex_enter(&connp->conn_lock); 11021 connp->conn_recvif = *i1 ? 1 : 0; 11022 mutex_exit(&connp->conn_lock); 11023 } 11024 break; /* goto sizeof (int) option return */ 11025 case IP_RECVPKTINFO: 11026 if (!checkonly) { 11027 mutex_enter(&connp->conn_lock); 11028 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 11029 mutex_exit(&connp->conn_lock); 11030 } 11031 break; /* goto sizeof (int) option return */ 11032 case IP_RECVSLLA: 11033 /* Retrieve the source link layer address */ 11034 if (!checkonly) { 11035 mutex_enter(&connp->conn_lock); 11036 connp->conn_recvslla = *i1 ? 1 : 0; 11037 mutex_exit(&connp->conn_lock); 11038 } 11039 break; /* goto sizeof (int) option return */ 11040 case MRT_INIT: 11041 case MRT_DONE: 11042 case MRT_ADD_VIF: 11043 case MRT_DEL_VIF: 11044 case MRT_ADD_MFC: 11045 case MRT_DEL_MFC: 11046 case MRT_ASSERT: 11047 if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { 11048 *outlenp = 0; 11049 return (error); 11050 } 11051 error = ip_mrouter_set((int)name, q, checkonly, 11052 (uchar_t *)invalp, inlen, first_mp); 11053 if (error) { 11054 *outlenp = 0; 11055 return (error); 11056 } 11057 /* OK return - copy input buffer into output buffer */ 11058 if (invalp != outvalp) { 11059 /* don't trust bcopy for identical src/dst */ 11060 bcopy(invalp, outvalp, inlen); 11061 } 11062 *outlenp = inlen; 11063 return (0); 11064 case IP_BOUND_IF: 11065 case IP_DHCPINIT_IF: 11066 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11067 level, name, first_mp); 11068 if (error != 0) 11069 return (error); 11070 break; /* goto sizeof (int) option return */ 11071 11072 case IP_UNSPEC_SRC: 11073 /* Allow sending with a zero source address */ 11074 if (!checkonly) { 11075 mutex_enter(&connp->conn_lock); 11076 connp->conn_unspec_src = *i1 ? 
1 : 0; 11077 mutex_exit(&connp->conn_lock); 11078 } 11079 break; /* goto sizeof (int) option return */ 11080 default: 11081 /* 11082 * "soft" error (negative) 11083 * option not handled at this level 11084 * Note: Do not modify *outlenp 11085 */ 11086 return (-EINVAL); 11087 } 11088 break; 11089 case IPPROTO_IPV6: 11090 switch (name) { 11091 case IPV6_BOUND_IF: 11092 case IPV6_BOUND_PIF: 11093 case IPV6_DONTFAILOVER_IF: 11094 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11095 level, name, first_mp); 11096 if (error != 0) 11097 return (error); 11098 break; /* goto sizeof (int) option return */ 11099 11100 case IPV6_MULTICAST_IF: 11101 /* 11102 * The only possible errors are EINPROGRESS and 11103 * EINVAL. EINPROGRESS will be restarted and is not 11104 * a hard error. We call this option on both V4 and V6 11105 * If both return EINVAL, then this call returns 11106 * EINVAL. If at least one of them succeeds we 11107 * return success. 11108 */ 11109 found = B_FALSE; 11110 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11111 level, name, first_mp); 11112 if (error == EINPROGRESS) 11113 return (error); 11114 if (error == 0) 11115 found = B_TRUE; 11116 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11117 IPPROTO_IP, IP_MULTICAST_IF, first_mp); 11118 if (error == 0) 11119 found = B_TRUE; 11120 if (!found) 11121 return (error); 11122 break; /* goto sizeof (int) option return */ 11123 11124 case IPV6_MULTICAST_HOPS: 11125 /* Recorded in transport above IP */ 11126 break; /* goto sizeof (int) option return */ 11127 case IPV6_MULTICAST_LOOP: 11128 if (!checkonly) { 11129 mutex_enter(&connp->conn_lock); 11130 connp->conn_multicast_loop = *i1; 11131 mutex_exit(&connp->conn_lock); 11132 } 11133 break; /* goto sizeof (int) option return */ 11134 case IPV6_JOIN_GROUP: 11135 case MCAST_JOIN_GROUP: 11136 case IPV6_LEAVE_GROUP: 11137 case MCAST_LEAVE_GROUP: { 11138 struct ipv6_mreq *ip_mreqp; 11139 struct group_req *greqp; 11140 ire_t *ire; 11141 boolean_t done = B_FALSE; 11142 in6_addr_t groupv6; 11143 uint32_t ifindex; 11144 boolean_t mcast_opt = B_TRUE; 11145 mcast_record_t fmode; 11146 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11147 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11148 11149 switch (name) { 11150 case IPV6_JOIN_GROUP: 11151 mcast_opt = B_FALSE; 11152 /* FALLTHRU */ 11153 case MCAST_JOIN_GROUP: 11154 fmode = MODE_IS_EXCLUDE; 11155 optfn = ip_opt_add_group_v6; 11156 break; 11157 11158 case IPV6_LEAVE_GROUP: 11159 mcast_opt = B_FALSE; 11160 /* FALLTHRU */ 11161 case MCAST_LEAVE_GROUP: 11162 fmode = MODE_IS_INCLUDE; 11163 optfn = ip_opt_delete_group_v6; 11164 break; 11165 } 11166 11167 if (mcast_opt) { 11168 struct sockaddr_in *sin; 11169 struct sockaddr_in6 *sin6; 11170 greqp = (struct group_req *)i1; 11171 if (greqp->gr_group.ss_family == AF_INET) { 11172 sin = (struct sockaddr_in *) 11173 &(greqp->gr_group); 11174 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 11175 &groupv6); 11176 } else { 11177 sin6 = (struct sockaddr_in6 *) 11178 &(greqp->gr_group); 11179 groupv6 = sin6->sin6_addr; 11180 } 11181 ifindex = greqp->gr_interface; 11182 } else { 11183 ip_mreqp = (struct ipv6_mreq *)i1; 11184 groupv6 = ip_mreqp->ipv6mr_multiaddr; 11185 ifindex = ip_mreqp->ipv6mr_interface; 11186 } 11187 /* 11188 * In the multirouting case, we need to replicate 11189 * the request on all interfaces that will take part 11190 * in replication. We do so because multirouting is 11191 * reflective, thus we will probably receive multi- 11192 * casts on those interfaces. 
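* (This mirrors the IPv4 multirt handling above, operating on in6_addr_t groups with ifindex-based interface selection.)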
11193 * The ip_multirt_apply_membership_v6() succeeds if 11194 * the operation succeeds on at least one interface. 11195 */ 11196 ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, 11197 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11198 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11199 if (ire != NULL) { 11200 if (ire->ire_flags & RTF_MULTIRT) { 11201 error = ip_multirt_apply_membership_v6( 11202 optfn, ire, connp, checkonly, 11203 &groupv6, fmode, &ipv6_all_zeros, 11204 first_mp); 11205 done = B_TRUE; 11206 } 11207 ire_refrele(ire); 11208 } 11209 if (!done) { 11210 error = optfn(connp, checkonly, &groupv6, 11211 ifindex, fmode, &ipv6_all_zeros, first_mp); 11212 } 11213 if (error) { 11214 /* 11215 * EINPROGRESS is a soft error, needs retry 11216 * so don't make *outlenp zero. 11217 */ 11218 if (error != EINPROGRESS) 11219 *outlenp = 0; 11220 return (error); 11221 } 11222 /* OK return - copy input buffer into output buffer */ 11223 if (invalp != outvalp) { 11224 /* don't trust bcopy for identical src/dst */ 11225 bcopy(invalp, outvalp, inlen); 11226 } 11227 *outlenp = inlen; 11228 return (0); 11229 } 11230 case MCAST_BLOCK_SOURCE: 11231 case MCAST_UNBLOCK_SOURCE: 11232 case MCAST_JOIN_SOURCE_GROUP: 11233 case MCAST_LEAVE_SOURCE_GROUP: { 11234 struct group_source_req *gsreqp; 11235 in6_addr_t v6grp, v6src; 11236 uint32_t ifindex; 11237 mcast_record_t fmode; 11238 ire_t *ire; 11239 boolean_t done = B_FALSE; 11240 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11241 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11242 11243 switch (name) { 11244 case MCAST_BLOCK_SOURCE: 11245 fmode = MODE_IS_EXCLUDE; 11246 optfn = ip_opt_add_group_v6; 11247 break; 11248 case MCAST_UNBLOCK_SOURCE: 11249 fmode = MODE_IS_EXCLUDE; 11250 optfn = ip_opt_delete_group_v6; 11251 break; 11252 case MCAST_JOIN_SOURCE_GROUP: 11253 fmode = MODE_IS_INCLUDE; 11254 optfn = ip_opt_add_group_v6; 11255 break; 11256 case MCAST_LEAVE_SOURCE_GROUP: 11257 fmode = MODE_IS_INCLUDE; 11258 optfn = ip_opt_delete_group_v6; 11259 break; 11260 } 11261 11262 gsreqp = (struct group_source_req *)i1; 11263 ifindex = gsreqp->gsr_interface; 11264 if (gsreqp->gsr_group.ss_family == AF_INET) { 11265 struct sockaddr_in *s; 11266 s = (struct sockaddr_in *)&gsreqp->gsr_group; 11267 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); 11268 s = (struct sockaddr_in *)&gsreqp->gsr_source; 11269 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); 11270 } else { 11271 struct sockaddr_in6 *s6; 11272 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; 11273 v6grp = s6->sin6_addr; 11274 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; 11275 v6src = s6->sin6_addr; 11276 } 11277 11278 /* 11279 * In the multirouting case, we need to replicate 11280 * the request as noted in the mcast cases above. 11281 */ 11282 ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, 11283 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11284 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11285 if (ire != NULL) { 11286 if (ire->ire_flags & RTF_MULTIRT) { 11287 error = ip_multirt_apply_membership_v6( 11288 optfn, ire, connp, checkonly, 11289 &v6grp, fmode, &v6src, first_mp); 11290 done = B_TRUE; 11291 } 11292 ire_refrele(ire); 11293 } 11294 if (!done) { 11295 error = optfn(connp, checkonly, &v6grp, 11296 ifindex, fmode, &v6src, first_mp); 11297 } 11298 if (error != 0) { 11299 /* 11300 * EINPROGRESS is a soft error, needs retry 11301 * so don't make *outlenp zero. 
11302 */ 11303 if (error != EINPROGRESS) 11304 *outlenp = 0; 11305 return (error); 11306 } 11307 /* OK return - copy input buffer into output buffer */ 11308 if (invalp != outvalp) { 11309 bcopy(invalp, outvalp, inlen); 11310 } 11311 *outlenp = inlen; 11312 return (0); 11313 } 11314 case IPV6_UNICAST_HOPS: 11315 /* Recorded in transport above IP */ 11316 break; /* goto sizeof (int) option return */ 11317 case IPV6_UNSPEC_SRC: 11318 /* Allow sending with a zero source address */ 11319 if (!checkonly) { 11320 mutex_enter(&connp->conn_lock); 11321 connp->conn_unspec_src = *i1 ? 1 : 0; 11322 mutex_exit(&connp->conn_lock); 11323 } 11324 break; /* goto sizeof (int) option return */ 11325 case IPV6_RECVPKTINFO: 11326 if (!checkonly) { 11327 mutex_enter(&connp->conn_lock); 11328 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 11329 mutex_exit(&connp->conn_lock); 11330 } 11331 break; /* goto sizeof (int) option return */ 11332 case IPV6_RECVTCLASS: 11333 if (!checkonly) { 11334 if (*i1 < 0 || *i1 > 1) { 11335 return (EINVAL); 11336 } 11337 mutex_enter(&connp->conn_lock); 11338 connp->conn_ipv6_recvtclass = *i1; 11339 mutex_exit(&connp->conn_lock); 11340 } 11341 break; 11342 case IPV6_RECVPATHMTU: 11343 if (!checkonly) { 11344 if (*i1 < 0 || *i1 > 1) { 11345 return (EINVAL); 11346 } 11347 mutex_enter(&connp->conn_lock); 11348 connp->conn_ipv6_recvpathmtu = *i1; 11349 mutex_exit(&connp->conn_lock); 11350 } 11351 break; 11352 case IPV6_RECVHOPLIMIT: 11353 if (!checkonly) { 11354 mutex_enter(&connp->conn_lock); 11355 connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; 11356 mutex_exit(&connp->conn_lock); 11357 } 11358 break; /* goto sizeof (int) option return */ 11359 case IPV6_RECVHOPOPTS: 11360 if (!checkonly) { 11361 mutex_enter(&connp->conn_lock); 11362 connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; 11363 mutex_exit(&connp->conn_lock); 11364 } 11365 break; /* goto sizeof (int) option return */ 11366 case IPV6_RECVDSTOPTS: 11367 if (!checkonly) { 11368 mutex_enter(&connp->conn_lock); 11369 connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; 11370 mutex_exit(&connp->conn_lock); 11371 } 11372 break; /* goto sizeof (int) option return */ 11373 case IPV6_RECVRTHDR: 11374 if (!checkonly) { 11375 mutex_enter(&connp->conn_lock); 11376 connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; 11377 mutex_exit(&connp->conn_lock); 11378 } 11379 break; /* goto sizeof (int) option return */ 11380 case IPV6_RECVRTHDRDSTOPTS: 11381 if (!checkonly) { 11382 mutex_enter(&connp->conn_lock); 11383 connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; 11384 mutex_exit(&connp->conn_lock); 11385 } 11386 break; /* goto sizeof (int) option return */ 11387 case IPV6_PKTINFO: 11388 if (inlen == 0) 11389 return (-EINVAL); /* clearing option */ 11390 error = ip6_set_pktinfo(cr, connp, 11391 (struct in6_pktinfo *)invalp, first_mp); 11392 if (error != 0) 11393 *outlenp = 0; 11394 else 11395 *outlenp = inlen; 11396 return (error); 11397 case IPV6_NEXTHOP: { 11398 struct sockaddr_in6 *sin6; 11399 11400 /* Verify that the nexthop is reachable */ 11401 if (inlen == 0) 11402 return (-EINVAL); /* clearing option */ 11403 11404 sin6 = (struct sockaddr_in6 *)invalp; 11405 ire = ire_route_lookup_v6(&sin6->sin6_addr, 11406 0, 0, 0, NULL, NULL, connp->conn_zoneid, 11407 NULL, MATCH_IRE_DEFAULT, ipst); 11408 11409 if (ire == NULL) { 11410 *outlenp = 0; 11411 return (EHOSTUNREACH); 11412 } 11413 ire_refrele(ire); 11414 return (-EINVAL); 11415 } 11416 case IPV6_SEC_OPT: 11417 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11418 if (error != 0) { 11419 *outlenp = 0; 11420 return (error); 11421 } 11422 break; 11423 case IPV6_SRC_PREFERENCES: { 11424 /* 11425 * This is implemented strictly in the ip module 11426 * (here and in tcp_opt_*() to accommodate tcp 11427 * sockets). Modules above ip pass this option 11428 * down here since ip is the only one that needs to 11429 * be aware of source address preferences. 11430 * 11431 * This socket option only affects connected 11432 * sockets that haven't already bound to a specific 11433 * IPv6 address. In other words, sockets that 11434 * don't call bind() with an address other than the 11435 * unspecified address and that call connect(). 11436 * ip_bind_connected_v6() passes these preferences 11437 * to the ipif_select_source_v6() function. 11438 */ 11439 if (inlen != sizeof (uint32_t)) 11440 return (EINVAL); 11441 error = ip6_set_src_preferences(connp, 11442 *(uint32_t *)invalp); 11443 if (error != 0) { 11444 *outlenp = 0; 11445 return (error); 11446 } else { 11447 *outlenp = sizeof (uint32_t); 11448 } 11449 break; 11450 } 11451 case IPV6_V6ONLY: 11452 if (*i1 < 0 || *i1 > 1) { 11453 return (EINVAL); 11454 } 11455 mutex_enter(&connp->conn_lock); 11456 connp->conn_ipv6_v6only = *i1; 11457 mutex_exit(&connp->conn_lock); 11458 break; 11459 default: 11460 return (-EINVAL); 11461 } 11462 break; 11463 default: 11464 /* 11465 * "soft" error (negative) 11466 * option not handled at this level 11467 * Note: Do not modify *outlenp 11468 */ 11469 return (-EINVAL); 11470 } 11471 /* 11472 * Common case of return from an option that is sizeof (int) 11473 */ 11474 *(int *)outvalp = *i1; 11475 *outlenp = sizeof (int); 11476 return (0); 11477 } 11478 11479 /* 11480 * This routine gets default values of certain options whose default 11481 * values are maintained by protocol-specific code 11482 */ 11483 /* ARGSUSED */ 11484 int 11485 ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 11486 { 11487 int *i1 = (int *)ptr; 11488 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11489 11490 switch (level) { 11491 case IPPROTO_IP: 11492 switch (name) { 11493 case IP_MULTICAST_TTL: 11494 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 11495 return (sizeof (uchar_t)); 11496 case IP_MULTICAST_LOOP: 11497 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 11498 return (sizeof (uchar_t)); 11499 default: 11500 return (-1); 11501 } 11502 case IPPROTO_IPV6: 11503 switch (name) { 11504 case IPV6_UNICAST_HOPS: 11505 *i1 = ipst->ips_ipv6_def_hops; 11506 return (sizeof (int)); 11507 case IPV6_MULTICAST_HOPS: 11508 *i1 = IP_DEFAULT_MULTICAST_TTL; 11509
return (sizeof (int)); 11510 case IPV6_MULTICAST_LOOP: 11511 *i1 = IP_DEFAULT_MULTICAST_LOOP; 11512 return (sizeof (int)); 11513 case IPV6_V6ONLY: 11514 *i1 = 1; 11515 return (sizeof (int)); 11516 default: 11517 return (-1); 11518 } 11519 default: 11520 return (-1); 11521 } 11522 /* NOTREACHED */ 11523 } 11524 11525 /* 11526 * Given a destination address and a pointer to where to put the information 11527 * this routine fills in the mtuinfo. 11528 */ 11529 int 11530 ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, 11531 struct ip6_mtuinfo *mtuinfo, netstack_t *ns) 11532 { 11533 ire_t *ire; 11534 ip_stack_t *ipst = ns->netstack_ip; 11535 11536 if (IN6_IS_ADDR_UNSPECIFIED(in6)) 11537 return (-1); 11538 11539 bzero(mtuinfo, sizeof (*mtuinfo)); 11540 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 11541 mtuinfo->ip6m_addr.sin6_port = port; 11542 mtuinfo->ip6m_addr.sin6_addr = *in6; 11543 11544 ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); 11545 if (ire != NULL) { 11546 mtuinfo->ip6m_mtu = ire->ire_max_frag; 11547 ire_refrele(ire); 11548 } else { 11549 mtuinfo->ip6m_mtu = IPV6_MIN_MTU; 11550 } 11551 return (sizeof (struct ip6_mtuinfo)); 11552 } 11553 11554 /* 11555 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error 11556 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and 11557 * isn't. This doesn't matter as the error checking is done properly for the 11558 * other MRT options coming in through ip_opt_set. 11559 */ 11560 int 11561 ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 11562 { 11563 conn_t *connp = Q_TO_CONN(q); 11564 ipsec_req_t *req = (ipsec_req_t *)ptr; 11565 11566 switch (level) { 11567 case IPPROTO_IP: 11568 switch (name) { 11569 case MRT_VERSION: 11570 case MRT_ASSERT: 11571 (void) ip_mrouter_get(name, q, ptr); 11572 return (sizeof (int)); 11573 case IP_SEC_OPT: 11574 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); 11575 case IP_NEXTHOP: 11576 if (connp->conn_nexthop_set) { 11577 *(ipaddr_t *)ptr = connp->conn_nexthop_v4; 11578 return (sizeof (ipaddr_t)); 11579 } else 11580 return (0); 11581 case IP_RECVPKTINFO: 11582 *(int *)ptr = connp->conn_ip_recvpktinfo ? 1: 0; 11583 return (sizeof (int)); 11584 default: 11585 break; 11586 } 11587 break; 11588 case IPPROTO_IPV6: 11589 switch (name) { 11590 case IPV6_SEC_OPT: 11591 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); 11592 case IPV6_SRC_PREFERENCES: { 11593 return (ip6_get_src_preferences(connp, 11594 (uint32_t *)ptr)); 11595 } 11596 case IPV6_V6ONLY: 11597 *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; 11598 return (sizeof (int)); 11599 case IPV6_PATHMTU: 11600 return (ip_fill_mtuinfo(&connp->conn_remv6, 0, 11601 (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); 11602 default: 11603 break; 11604 } 11605 break; 11606 default: 11607 break; 11608 } 11609 return (-1); 11610 } 11611 11612 /* Named Dispatch routine to get a current value out of our parameter table. */ 11613 /* ARGSUSED */ 11614 static int 11615 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11616 { 11617 ipparam_t *ippa = (ipparam_t *)cp; 11618 11619 (void) mi_mpprintf(mp, "%d", ippa->ip_param_value); 11620 return (0); 11621 } 11622 11623 /* ARGSUSED */ 11624 static int 11625 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11626 { 11627 11628 (void) mi_mpprintf(mp, "%d", *(int *)cp); 11629 return (0); 11630 } 11631 11632 /* 11633 * Set ip{,6}_forwarding values. 
This means walking through all of the 11634 * ill's and toggling their forwarding values. 11635 */ 11636 /* ARGSUSED */ 11637 static int 11638 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11639 { 11640 long new_value; 11641 int *forwarding_value = (int *)cp; 11642 ill_t *ill; 11643 boolean_t isv6; 11644 ill_walk_context_t ctx; 11645 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11646 11647 isv6 = (forwarding_value == &ipst->ips_ipv6_forward); 11648 11649 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11650 new_value < 0 || new_value > 1) { 11651 return (EINVAL); 11652 } 11653 11654 *forwarding_value = new_value; 11655 11656 /* 11657 * Regardless of the current value of ip_forwarding, set all per-ill 11658 * values of ip_forwarding to the value being set. 11659 * 11660 * Bring all the ill's up to date with the new global value. 11661 */ 11662 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11663 11664 if (isv6) 11665 ill = ILL_START_WALK_V6(&ctx, ipst); 11666 else 11667 ill = ILL_START_WALK_V4(&ctx, ipst); 11668 11669 for (; ill != NULL; ill = ill_next(&ctx, ill)) 11670 (void) ill_forward_set(ill, new_value != 0); 11671 11672 rw_exit(&ipst->ips_ill_g_lock); 11673 return (0); 11674 } 11675 11676 /* 11677 * Walk through the specified param array, registering each element with the 11678 * Named Dispatch handler. This is called only during init, so it is OK 11679 * not to acquire any locks. 11680 */ 11681 static boolean_t 11682 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt, 11683 ipndp_t *ipnd, size_t ipnd_cnt) 11684 { 11685 for (; ippa_cnt-- > 0; ippa++) { 11686 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 11687 if (!nd_load(ndp, ippa->ip_param_name, 11688 ip_param_get, ip_param_set, (caddr_t)ippa)) { 11689 nd_free(ndp); 11690 return (B_FALSE); 11691 } 11692 } 11693 } 11694 11695 for (; ipnd_cnt-- > 0; ipnd++) { 11696 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 11697 if (!nd_load(ndp, ipnd->ip_ndp_name, 11698 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 11699 ipnd->ip_ndp_data)) { 11700 nd_free(ndp); 11701 return (B_FALSE); 11702 } 11703 } 11704 } 11705 11706 return (B_TRUE); 11707 } 11708 11709 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 11710 /* ARGSUSED */ 11711 static int 11712 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11713 { 11714 long new_value; 11715 ipparam_t *ippa = (ipparam_t *)cp; 11716 11717 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11718 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 11719 return (EINVAL); 11720 } 11721 ippa->ip_param_value = new_value; 11722 return (0); 11723 } 11724 11725 /* 11726 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases. 11727 * When an ipf is passed here for the first time, if 11728 * we already have in-order fragments on the queue, we convert from the fast- 11729 * path reassembly scheme to the hard-case scheme. From then on, additional 11730 * fragments are reassembled here. We keep track of the start and end offsets 11731 * of each piece, and the number of holes in the chain. When the hole count 11732 * goes to zero, we are done! 11733 * 11734 * The ipf_count will be updated to account for any mblk(s) added (pointed to 11735 * by mp) or subtracted (freeb()ed dups); upon return the caller must update 11736 * ipfb_count and ill_frag_count by the difference of ipf_count before and 11737 * after the call to ip_reassemble().
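* * A sketch of that caller-side bookkeeping (modeled on the hard-case path in ip_rput_fragment() below; names as used there): * * count = ipf->ipf_count; * ret = ip_reassemble(mp, ipf, start, more, ill, msg_len); * count = ipf->ipf_count - count; * ipfb->ipfb_count += count; * atomic_add_32(&ill->ill_frag_count, count);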
11738 */ 11739 int 11740 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill, 11741 size_t msg_len) 11742 { 11743 uint_t end; 11744 mblk_t *next_mp; 11745 mblk_t *mp1; 11746 uint_t offset; 11747 boolean_t incr_dups = B_TRUE; 11748 boolean_t offset_zero_seen = B_FALSE; 11749 boolean_t pkt_boundary_checked = B_FALSE; 11750 11751 /* If start == 0 then ipf_nf_hdr_len has to be set. */ 11752 ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0); 11753 11754 /* Add in byte count */ 11755 ipf->ipf_count += msg_len; 11756 if (ipf->ipf_end) { 11757 /* 11758 * We were part way through in-order reassembly, but now there 11759 * is a hole. We walk through messages already queued, and 11760 * mark them for hard case reassembly. We know that up till 11761 * now they were in order starting from offset zero. 11762 */ 11763 offset = 0; 11764 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11765 IP_REASS_SET_START(mp1, offset); 11766 if (offset == 0) { 11767 ASSERT(ipf->ipf_nf_hdr_len != 0); 11768 offset = -ipf->ipf_nf_hdr_len; 11769 } 11770 offset += mp1->b_wptr - mp1->b_rptr; 11771 IP_REASS_SET_END(mp1, offset); 11772 } 11773 /* One hole at the end. */ 11774 ipf->ipf_hole_cnt = 1; 11775 /* Brand it as a hard case, forever. */ 11776 ipf->ipf_end = 0; 11777 } 11778 /* Walk through all the new pieces. */ 11779 do { 11780 end = start + (mp->b_wptr - mp->b_rptr); 11781 /* 11782 * If start is 0, decrease 'end' only for the first mblk of 11783 * the fragment. Otherwise 'end' can get a wrong value in the 11784 * second pass of the loop if the first mblk is exactly the 11785 * size of ipf_nf_hdr_len. 11786 */ 11787 if (start == 0 && !offset_zero_seen) { 11788 /* First segment */ 11789 ASSERT(ipf->ipf_nf_hdr_len != 0); 11790 end -= ipf->ipf_nf_hdr_len; 11791 offset_zero_seen = B_TRUE; 11792 } 11793 next_mp = mp->b_cont; 11794 /* 11795 * We are checking to see if there is any interesting data 11796 * to process. If there isn't and the mblk isn't the 11797 * one which carries the unfragmentable header then we 11798 * drop it. It's possible to have just the unfragmentable 11799 * header come through without any data. That needs to be 11800 * saved. 11801 * 11802 * If the assert at the top of this function holds then the 11803 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code 11804 * is infrequently traveled enough that the test is left in 11805 * to protect against future code changes which break that 11806 * invariant. 11807 */ 11808 if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) { 11809 /* Empty. Blast it. */ 11810 IP_REASS_SET_START(mp, 0); 11811 IP_REASS_SET_END(mp, 0); 11812 /* 11813 * If the ipf points to the mblk we are about to free, 11814 * update ipf to point to the next mblk (or NULL 11815 * if none). 11816 */ 11817 if (ipf->ipf_mp->b_cont == mp) 11818 ipf->ipf_mp->b_cont = next_mp; 11819 freeb(mp); 11820 continue; 11821 } 11822 mp->b_cont = NULL; 11823 IP_REASS_SET_START(mp, start); 11824 IP_REASS_SET_END(mp, end); 11825 if (!ipf->ipf_tail_mp) { 11826 ipf->ipf_tail_mp = mp; 11827 ipf->ipf_mp->b_cont = mp; 11828 if (start == 0 || !more) { 11829 ipf->ipf_hole_cnt = 1; 11830 /* 11831 * If the first fragment comes in more than one 11832 * mblk, this loop will be executed for each 11833 * mblk. We need to adjust the hole count so exiting 11834 * this routine will leave the hole count at 1.
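* (next_mp != NULL here means another mblk of this same fragment remains to be processed by this loop.)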
11835 */ 11836 if (next_mp) 11837 ipf->ipf_hole_cnt++; 11838 } else 11839 ipf->ipf_hole_cnt = 2; 11840 continue; 11841 } else if (ipf->ipf_last_frag_seen && !more && 11842 !pkt_boundary_checked) { 11843 /* 11844 * We check datagram boundary only if this fragment 11845 * claims to be the last fragment and we have seen a 11846 * last fragment in the past too. We do this only 11847 * once for a given fragment. 11848 * 11849 * start cannot be 0 here as fragments with start=0 11850 * and MF=0 get handled as a complete packet. These 11851 * fragments should not reach here. 11852 */ 11853 11854 if (start + msgdsize(mp) != 11855 IP_REASS_END(ipf->ipf_tail_mp)) { 11856 /* 11857 * We have two fragments both of which claim 11858 * to be the last fragment but give conflicting 11859 * information about the whole datagram size. 11860 * Something fishy is going on. Drop the 11861 * fragment and free up the reassembly list. 11862 */ 11863 return (IP_REASS_FAILED); 11864 } 11865 11866 /* 11867 * We shouldn't come to this code block again for this 11868 * particular fragment. 11869 */ 11870 pkt_boundary_checked = B_TRUE; 11871 } 11872 11873 /* New stuff at or beyond tail? */ 11874 offset = IP_REASS_END(ipf->ipf_tail_mp); 11875 if (start >= offset) { 11876 if (ipf->ipf_last_frag_seen) { 11877 /* current fragment is beyond last fragment */ 11878 return (IP_REASS_FAILED); 11879 } 11880 /* Link it on end. */ 11881 ipf->ipf_tail_mp->b_cont = mp; 11882 ipf->ipf_tail_mp = mp; 11883 if (more) { 11884 if (start != offset) 11885 ipf->ipf_hole_cnt++; 11886 } else if (start == offset && next_mp == NULL) 11887 ipf->ipf_hole_cnt--; 11888 continue; 11889 } 11890 mp1 = ipf->ipf_mp->b_cont; 11891 offset = IP_REASS_START(mp1); 11892 /* New stuff at the front? */ 11893 if (start < offset) { 11894 if (start == 0) { 11895 if (end >= offset) { 11896 /* Nailed the hole at the beginning. */ 11897 ipf->ipf_hole_cnt--; 11898 } 11899 } else if (end < offset) { 11900 /* 11901 * A hole, stuff, and a hole where there used 11902 * to be just a hole. 11903 */ 11904 ipf->ipf_hole_cnt++; 11905 } 11906 mp->b_cont = mp1; 11907 /* Check for overlap. */ 11908 while (end > offset) { 11909 if (end < IP_REASS_END(mp1)) { 11910 mp->b_wptr -= end - offset; 11911 IP_REASS_SET_END(mp, offset); 11912 BUMP_MIB(ill->ill_ip_mib, 11913 ipIfStatsReasmPartDups); 11914 break; 11915 } 11916 /* Did we cover another hole? */ 11917 if ((mp1->b_cont && 11918 IP_REASS_END(mp1) != 11919 IP_REASS_START(mp1->b_cont) && 11920 end >= IP_REASS_START(mp1->b_cont)) || 11921 (!ipf->ipf_last_frag_seen && !more)) { 11922 ipf->ipf_hole_cnt--; 11923 } 11924 /* Clip out mp1. */ 11925 if ((mp->b_cont = mp1->b_cont) == NULL) { 11926 /* 11927 * After clipping out mp1, this guy 11928 * is now hanging off the end. 11929 */ 11930 ipf->ipf_tail_mp = mp; 11931 } 11932 IP_REASS_SET_START(mp1, 0); 11933 IP_REASS_SET_END(mp1, 0); 11934 /* Subtract byte count */ 11935 ipf->ipf_count -= mp1->b_datap->db_lim - 11936 mp1->b_datap->db_base; 11937 freeb(mp1); 11938 BUMP_MIB(ill->ill_ip_mib, 11939 ipIfStatsReasmPartDups); 11940 mp1 = mp->b_cont; 11941 if (!mp1) 11942 break; 11943 offset = IP_REASS_START(mp1); 11944 } 11945 ipf->ipf_mp->b_cont = mp; 11946 continue; 11947 } 11948 /* 11949 * The new piece starts somewhere between the start of the head 11950 * and the end of the tail. 11951 */ 11952 for (; mp1; mp1 = mp1->b_cont) { 11953 offset = IP_REASS_END(mp1); 11954 if (start < offset) { 11955 if (end <= offset) { 11956 /* Nothing new.
*/ 11957 IP_REASS_SET_START(mp, 0); 11958 IP_REASS_SET_END(mp, 0); 11959 /* Subtract byte count */ 11960 ipf->ipf_count -= mp->b_datap->db_lim - 11961 mp->b_datap->db_base; 11962 if (incr_dups) { 11963 ipf->ipf_num_dups++; 11964 incr_dups = B_FALSE; 11965 } 11966 freeb(mp); 11967 BUMP_MIB(ill->ill_ip_mib, 11968 ipIfStatsReasmDuplicates); 11969 break; 11970 } 11971 /* 11972 * Trim redundant stuff off beginning of new 11973 * piece. 11974 */ 11975 IP_REASS_SET_START(mp, offset); 11976 mp->b_rptr += offset - start; 11977 BUMP_MIB(ill->ill_ip_mib, 11978 ipIfStatsReasmPartDups); 11979 start = offset; 11980 if (!mp1->b_cont) { 11981 /* 11982 * After trimming, this guy is now 11983 * hanging off the end. 11984 */ 11985 mp1->b_cont = mp; 11986 ipf->ipf_tail_mp = mp; 11987 if (!more) { 11988 ipf->ipf_hole_cnt--; 11989 } 11990 break; 11991 } 11992 } 11993 if (start >= IP_REASS_START(mp1->b_cont)) 11994 continue; 11995 /* Fill a hole */ 11996 if (start > offset) 11997 ipf->ipf_hole_cnt++; 11998 mp->b_cont = mp1->b_cont; 11999 mp1->b_cont = mp; 12000 mp1 = mp->b_cont; 12001 offset = IP_REASS_START(mp1); 12002 if (end >= offset) { 12003 ipf->ipf_hole_cnt--; 12004 /* Check for overlap. */ 12005 while (end > offset) { 12006 if (end < IP_REASS_END(mp1)) { 12007 mp->b_wptr -= end - offset; 12008 IP_REASS_SET_END(mp, offset); 12009 /* 12010 * TODO we might bump 12011 * this up twice if there is 12012 * overlap at both ends. 12013 */ 12014 BUMP_MIB(ill->ill_ip_mib, 12015 ipIfStatsReasmPartDups); 12016 break; 12017 } 12018 /* Did we cover another hole? */ 12019 if ((mp1->b_cont && 12020 IP_REASS_END(mp1) 12021 != IP_REASS_START(mp1->b_cont) && 12022 end >= 12023 IP_REASS_START(mp1->b_cont)) || 12024 (!ipf->ipf_last_frag_seen && 12025 !more)) { 12026 ipf->ipf_hole_cnt--; 12027 } 12028 /* Clip out mp1. */ 12029 if ((mp->b_cont = mp1->b_cont) == 12030 NULL) { 12031 /* 12032 * After clipping out mp1, 12033 * this guy is now hanging 12034 * off the end. 12035 */ 12036 ipf->ipf_tail_mp = mp; 12037 } 12038 IP_REASS_SET_START(mp1, 0); 12039 IP_REASS_SET_END(mp1, 0); 12040 /* Subtract byte count */ 12041 ipf->ipf_count -= 12042 mp1->b_datap->db_lim - 12043 mp1->b_datap->db_base; 12044 freeb(mp1); 12045 BUMP_MIB(ill->ill_ip_mib, 12046 ipIfStatsReasmPartDups); 12047 mp1 = mp->b_cont; 12048 if (!mp1) 12049 break; 12050 offset = IP_REASS_START(mp1); 12051 } 12052 } 12053 break; 12054 } 12055 } while (start = end, mp = next_mp); 12056 12057 /* Fragment just processed could be the last one. Remember this fact */ 12058 if (!more) 12059 ipf->ipf_last_frag_seen = B_TRUE; 12060 12061 /* Still got holes? */ 12062 if (ipf->ipf_hole_cnt) 12063 return (IP_REASS_PARTIAL); 12064 /* Clean up overloaded fields to avoid upstream disasters. */ 12065 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 12066 IP_REASS_SET_START(mp1, 0); 12067 IP_REASS_SET_END(mp1, 0); 12068 } 12069 return (IP_REASS_COMPLETE); 12070 } 12071 12072 /* 12073 * ipsec processing for the fast path, used for input UDP Packets 12074 * Returns true if ready for passup to UDP. 12075 * Return false if packet is not passable to UDP (e.g. it failed IPsec policy, 12076 * was an ESP-in-UDP packet, etc.). 
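* (An ESP-in-UDP packet here is one arriving on a NAT-traversal endpoint (udp_nat_t_endpoint) whose payload carries a non-zero SPI; such packets are diverted to ESP via ip_proto_input() rather than passed to UDP, as handled by the zero_spi_check() call below.)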
12077 */ 12078 static boolean_t 12079 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 12080 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) 12081 { 12082 uint32_t ill_index; 12083 uint_t in_flags = 0; /* IPF_RECVSLLA and/or IPF_RECVIF */ 12084 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 12085 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 12086 udp_t *udp = connp->conn_udp; 12087 12088 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12089 /* The ill_index of the incoming ILL */ 12090 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 12091 12092 /* pass packet up to the transport */ 12093 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 12094 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 12095 NULL, mctl_present); 12096 if (*first_mpp == NULL) { 12097 return (B_FALSE); 12098 } 12099 } 12100 12101 /* Initiate IPPF processing for fastpath UDP */ 12102 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 12103 ip_process(IPP_LOCAL_IN, mpp, ill_index); 12104 if (*mpp == NULL) { 12105 ip2dbg(("ip_input_ipsec_process: UDP pkt " 12106 "deferred/dropped during IPPF processing\n")); 12107 return (B_FALSE); 12108 } 12109 } 12110 /* 12111 * Remove 0-spi if it's 0, or move everything behind 12112 * the UDP header over it and forward to ESP via 12113 * ip_proto_input(). 12114 */ 12115 if (udp->udp_nat_t_endpoint) { 12116 if (mctl_present) { 12117 /* mctl_present *shouldn't* happen. */ 12118 ip_drop_packet(*first_mpp, B_TRUE, NULL, 12119 NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), 12120 &ipss->ipsec_dropper); 12121 *first_mpp = NULL; 12122 return (B_FALSE); 12123 } 12124 12125 /* "ill" is "recv_ill" in actuality. */ 12126 if (!zero_spi_check(q, *mpp, ire, ill, ipss)) 12127 return (B_FALSE); 12128 12129 /* Else continue like a normal UDP packet. */ 12130 } 12131 12132 /* 12133 * We make the checks as below since we are in the fast path 12134 * and want to minimize the number of checks if the IP_RECVIF and/or 12135 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set. 12136 */ 12137 if (connp->conn_recvif || connp->conn_recvslla || 12138 connp->conn_ip_recvpktinfo) { 12139 if (connp->conn_recvif) { 12140 in_flags = IPF_RECVIF; 12141 } 12142 /* 12143 * UDP supports the IP_RECVPKTINFO option for both v4 and v6, 12144 * so the flag passed to ip_add_info is based on the IP version 12145 * of connp. 12146 */ 12147 if (connp->conn_ip_recvpktinfo) { 12148 if (connp->conn_af_isv6) { 12149 /* 12150 * V6 only needs index 12151 */ 12152 in_flags |= IPF_RECVIF; 12153 } else { 12154 /* 12155 * V4 needs index + matching address. 12156 */ 12157 in_flags |= IPF_RECVADDR; 12158 } 12159 } 12160 if (connp->conn_recvslla) { 12161 in_flags |= IPF_RECVSLLA; 12162 } 12163 /* 12164 * Since in_flags is being set, ill will be 12165 * dereferenced in ip_add_info, so it must not 12166 * be NULL. 12167 */ 12168 /* 12169 * The actual data will be contained in b_cont 12170 * upon successful return of the following call. 12171 * If the call fails then the original mblk is 12172 * returned. 12173 */ 12174 *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), 12175 ipst); 12176 } 12177 12178 return (B_TRUE); 12179 } 12180 12181 /* 12182 * Fragmentation reassembly. Each ILL has a hash table for 12183 * queuing packets undergoing reassembly for all IPIFs 12184 * associated with the ILL. The hash is based on the packet 12185 * IP ident field. The ILL frag hash table was allocated 12186 * as a timer block at the time the ILL was created.
Whenever 12187 * there is anything on the reassembly queue, the timer will 12188 * be running. Returns B_TRUE if successful, else B_FALSE; 12189 * frees mp on failure. 12190 */ 12191 static boolean_t 12192 ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, 12193 uint32_t *cksum_val, uint16_t *cksum_flags) 12194 { 12195 uint32_t frag_offset_flags; 12196 ill_t *ill = (ill_t *)q->q_ptr; 12197 mblk_t *mp = *mpp; 12198 mblk_t *t_mp; 12199 ipaddr_t dst; 12200 uint8_t proto = ipha->ipha_protocol; 12201 uint32_t sum_val; 12202 uint16_t sum_flags; 12203 ipf_t *ipf; 12204 ipf_t **ipfp; 12205 ipfb_t *ipfb; 12206 uint16_t ident; 12207 uint32_t offset; 12208 ipaddr_t src; 12209 uint_t hdr_length; 12210 uint32_t end; 12211 mblk_t *mp1; 12212 mblk_t *tail_mp; 12213 size_t count; 12214 size_t msg_len; 12215 uint8_t ecn_info = 0; 12216 uint32_t packet_size; 12217 boolean_t pruned = B_FALSE; 12218 ip_stack_t *ipst = ill->ill_ipst; 12219 12220 if (cksum_val != NULL) 12221 *cksum_val = 0; 12222 if (cksum_flags != NULL) 12223 *cksum_flags = 0; 12224 12225 /* 12226 * Drop the fragment as early as possible if 12227 * we don't have the resources to reassemble it. 12228 */ 12229 if (ipst->ips_ip_reass_queue_bytes == 0) { 12230 freemsg(mp); 12231 return (B_FALSE); 12232 } 12233 12234 /* Check for fragmentation offset; return if there's none */ 12235 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & 12236 (IPH_MF | IPH_OFFSET)) == 0) 12237 return (B_TRUE); 12238 12239 /* 12240 * We utilize hardware-computed checksum info only for UDP since 12241 * IP fragmentation is a normal occurrence for the protocol. In 12242 * addition, checksum offload support for IP fragments carrying 12243 * UDP payload is commonly implemented across network adapters. 12244 */ 12245 ASSERT(ill != NULL); 12246 if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && 12247 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { 12248 mblk_t *mp1 = mp->b_cont; 12249 int32_t len; 12250 12251 /* Record checksum information from the packet */ 12252 sum_val = (uint32_t)DB_CKSUM16(mp); 12253 sum_flags = DB_CKSUMFLAGS(mp); 12254 12255 /* IP payload offset from beginning of mblk */ 12256 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; 12257 12258 if ((sum_flags & HCK_PARTIALCKSUM) && 12259 (mp1 == NULL || mp1->b_cont == NULL) && 12260 offset >= DB_CKSUMSTART(mp) && 12261 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { 12262 uint32_t adj; 12263 /* 12264 * Partial checksum has been calculated by hardware 12265 * and attached to the packet; in addition, any 12266 * prepended extraneous data is even byte aligned. 12267 * If any such data exists, we adjust the checksum; 12268 * this would also handle any postpended data.
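* * For example (illustrative values only): with sum_val 0x1234 and adj 0x0034 the adjusted sum is 0x1200; with adj 0x2234 > sum_val the one's-complement borrow below gives ~(0x2234 - 0x1234) & 0xFFFF = 0xEFFF.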
12269 */ 12270 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), 12271 mp, mp1, len, adj); 12272 12273 /* One's complement subtract extraneous checksum */ 12274 if (adj >= sum_val) 12275 sum_val = ~(adj - sum_val) & 0xFFFF; 12276 else 12277 sum_val -= adj; 12278 } 12279 } else { 12280 sum_val = 0; 12281 sum_flags = 0; 12282 } 12283 12284 /* Clear hardware checksumming flag */ 12285 DB_CKSUMFLAGS(mp) = 0; 12286 12287 ident = ipha->ipha_ident; 12288 offset = (frag_offset_flags << 3) & 0xFFFF; 12289 src = ipha->ipha_src; 12290 dst = ipha->ipha_dst; 12291 hdr_length = IPH_HDR_LENGTH(ipha); 12292 end = ntohs(ipha->ipha_length) - hdr_length; 12293 12294 /* If end == 0 then we have a packet with no data, so just free it */ 12295 if (end == 0) { 12296 freemsg(mp); 12297 return (B_FALSE); 12298 } 12299 12300 /* Record the ECN field info. */ 12301 ecn_info = (ipha->ipha_type_of_service & 0x3); 12302 if (offset != 0) { 12303 /* 12304 * If this isn't the first piece, strip the header, and 12305 * add the offset to the end value. 12306 */ 12307 mp->b_rptr += hdr_length; 12308 end += offset; 12309 } 12310 12311 msg_len = MBLKSIZE(mp); 12312 tail_mp = mp; 12313 while (tail_mp->b_cont != NULL) { 12314 tail_mp = tail_mp->b_cont; 12315 msg_len += MBLKSIZE(tail_mp); 12316 } 12317 12318 /* If the reassembly list for this ILL will get too big, prune it */ 12319 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= 12320 ipst->ips_ip_reass_queue_bytes) { 12321 ill_frag_prune(ill, 12322 (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : 12323 (ipst->ips_ip_reass_queue_bytes - msg_len)); 12324 pruned = B_TRUE; 12325 } 12326 12327 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; 12328 mutex_enter(&ipfb->ipfb_lock); 12329 12330 ipfp = &ipfb->ipfb_ipf; 12331 /* Try to find an existing fragment queue for this packet. */ 12332 for (;;) { 12333 ipf = ipfp[0]; 12334 if (ipf != NULL) { 12335 /* 12336 * It has to match on ident and src/dst address. 12337 */ 12338 if (ipf->ipf_ident == ident && 12339 ipf->ipf_src == src && 12340 ipf->ipf_dst == dst && 12341 ipf->ipf_protocol == proto) { 12342 /* 12343 * If we have received too many 12344 * duplicate fragments for this packet 12345 * free it. 12346 */ 12347 if (ipf->ipf_num_dups > ip_max_frag_dups) { 12348 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12349 freemsg(mp); 12350 mutex_exit(&ipfb->ipfb_lock); 12351 return (B_FALSE); 12352 } 12353 /* Found it. */ 12354 break; 12355 } 12356 ipfp = &ipf->ipf_hash_next; 12357 continue; 12358 } 12359 12360 /* 12361 * If we pruned the list, do we want to store this new 12362 * fragment?. We apply an optimization here based on the 12363 * fact that most fragments will be received in order. 12364 * So if the offset of this incoming fragment is zero, 12365 * it is the first fragment of a new packet. We will 12366 * keep it. Otherwise drop the fragment, as we have 12367 * probably pruned the packet already (since the 12368 * packet cannot be found). 12369 */ 12370 if (pruned && offset != 0) { 12371 mutex_exit(&ipfb->ipfb_lock); 12372 freemsg(mp); 12373 return (B_FALSE); 12374 } 12375 12376 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { 12377 /* 12378 * Too many fragmented packets in this hash 12379 * bucket. Free the oldest. 12380 */ 12381 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); 12382 } 12383 12384 /* New guy. Allocate a frag message. 
*/ 12385 mp1 = allocb(sizeof (*ipf), BPRI_MED); 12386 if (mp1 == NULL) { 12387 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12388 freemsg(mp); 12389 reass_done: 12390 mutex_exit(&ipfb->ipfb_lock); 12391 return (B_FALSE); 12392 } 12393 12394 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); 12395 mp1->b_cont = mp; 12396 12397 /* Initialize the fragment header. */ 12398 ipf = (ipf_t *)mp1->b_rptr; 12399 ipf->ipf_mp = mp1; 12400 ipf->ipf_ptphn = ipfp; 12401 ipfp[0] = ipf; 12402 ipf->ipf_hash_next = NULL; 12403 ipf->ipf_ident = ident; 12404 ipf->ipf_protocol = proto; 12405 ipf->ipf_src = src; 12406 ipf->ipf_dst = dst; 12407 ipf->ipf_nf_hdr_len = 0; 12408 /* Record reassembly start time. */ 12409 ipf->ipf_timestamp = gethrestime_sec(); 12410 /* Record ipf generation and account for frag header */ 12411 ipf->ipf_gen = ill->ill_ipf_gen++; 12412 ipf->ipf_count = MBLKSIZE(mp1); 12413 ipf->ipf_last_frag_seen = B_FALSE; 12414 ipf->ipf_ecn = ecn_info; 12415 ipf->ipf_num_dups = 0; 12416 ipfb->ipfb_frag_pkts++; 12417 ipf->ipf_checksum = 0; 12418 ipf->ipf_checksum_flags = 0; 12419 12420 /* Store checksum value in fragment header */ 12421 if (sum_flags != 0) { 12422 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12423 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12424 ipf->ipf_checksum = sum_val; 12425 ipf->ipf_checksum_flags = sum_flags; 12426 } 12427 12428 /* 12429 * We handle reassembly two ways. In the easy case, 12430 * where all the fragments show up in order, we do 12431 * minimal bookkeeping, and just clip new pieces on 12432 * the end. If we ever see a hole, then we go off 12433 * to ip_reassemble which has to mark the pieces and 12434 * keep track of the number of holes, etc. Obviously, 12435 * the point of having both mechanisms is so we can 12436 * handle the easy case as efficiently as possible. 12437 */ 12438 if (offset == 0) { 12439 /* Easy case, in-order reassembly so far. */ 12440 ipf->ipf_count += msg_len; 12441 ipf->ipf_tail_mp = tail_mp; 12442 /* 12443 * Keep track of next expected offset in 12444 * ipf_end. 12445 */ 12446 ipf->ipf_end = end; 12447 ipf->ipf_nf_hdr_len = hdr_length; 12448 } else { 12449 /* Hard case, hole at the beginning. */ 12450 ipf->ipf_tail_mp = NULL; 12451 /* 12452 * ipf_end == 0 means that we have given up 12453 * on easy reassembly. 12454 */ 12455 ipf->ipf_end = 0; 12456 12457 /* Forget checksum offload from now on */ 12458 ipf->ipf_checksum_flags = 0; 12459 12460 /* 12461 * ipf_hole_cnt is set by ip_reassemble. 12462 * ipf_count is updated by ip_reassemble. 12463 * No need to check for return value here 12464 * as we don't expect reassembly to complete 12465 * or fail for the first fragment itself. 12466 */ 12467 (void) ip_reassemble(mp, ipf, 12468 (frag_offset_flags & IPH_OFFSET) << 3, 12469 (frag_offset_flags & IPH_MF), ill, msg_len); 12470 } 12471 /* Update per ipfb and ill byte counts */ 12472 ipfb->ipfb_count += ipf->ipf_count; 12473 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12474 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count); 12475 /* If the frag timer wasn't already going, start it. */ 12476 mutex_enter(&ill->ill_lock); 12477 ill_frag_timer_start(ill); 12478 mutex_exit(&ill->ill_lock); 12479 goto reass_done; 12480 } 12481 12482 /* 12483 * If the packet's flag has changed (it could be coming up 12484 * from an interface different than the previous, therefore 12485 * possibly different checksum capability), then forget about 12486 * any stored checksum states. 
Otherwise add the value to 12487 * the existing one stored in the fragment header. 12488 */ 12489 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { 12490 sum_val += ipf->ipf_checksum; 12491 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12492 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12493 ipf->ipf_checksum = sum_val; 12494 } else if (ipf->ipf_checksum_flags != 0) { 12495 /* Forget checksum offload from now on */ 12496 ipf->ipf_checksum_flags = 0; 12497 } 12498 12499 /* 12500 * We have a new piece of a datagram which is already being 12501 * reassembled. Update the ECN info if all IP fragments 12502 * are ECN capable. If there is one which is not, clear 12503 * all the info. If there is at least one which has CE 12504 * code point, IP needs to report that up to transport. 12505 */ 12506 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { 12507 if (ecn_info == IPH_ECN_CE) 12508 ipf->ipf_ecn = IPH_ECN_CE; 12509 } else { 12510 ipf->ipf_ecn = IPH_ECN_NECT; 12511 } 12512 if (offset && ipf->ipf_end == offset) { 12513 /* The new fragment fits at the end */ 12514 ipf->ipf_tail_mp->b_cont = mp; 12515 /* Update the byte count */ 12516 ipf->ipf_count += msg_len; 12517 /* Update per ipfb and ill byte counts */ 12518 ipfb->ipfb_count += msg_len; 12519 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12520 atomic_add_32(&ill->ill_frag_count, msg_len); 12521 if (frag_offset_flags & IPH_MF) { 12522 /* More to come. */ 12523 ipf->ipf_end = end; 12524 ipf->ipf_tail_mp = tail_mp; 12525 goto reass_done; 12526 } 12527 } else { 12528 /* Go do the hard cases. */ 12529 int ret; 12530 12531 if (offset == 0) 12532 ipf->ipf_nf_hdr_len = hdr_length; 12533 12534 /* Save current byte count */ 12535 count = ipf->ipf_count; 12536 ret = ip_reassemble(mp, ipf, 12537 (frag_offset_flags & IPH_OFFSET) << 3, 12538 (frag_offset_flags & IPH_MF), ill, msg_len); 12539 /* Count of bytes added and subtracted (freeb()ed) */ 12540 count = ipf->ipf_count - count; 12541 if (count) { 12542 /* Update per ipfb and ill byte counts */ 12543 ipfb->ipfb_count += count; 12544 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12545 atomic_add_32(&ill->ill_frag_count, count); 12546 } 12547 if (ret == IP_REASS_PARTIAL) { 12548 goto reass_done; 12549 } else if (ret == IP_REASS_FAILED) { 12550 /* Reassembly failed. Free up all resources */ 12551 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12552 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { 12553 IP_REASS_SET_START(t_mp, 0); 12554 IP_REASS_SET_END(t_mp, 0); 12555 } 12556 freemsg(mp); 12557 goto reass_done; 12558 } 12559 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ 12560 } 12561 /* 12562 * We have completed reassembly. Unhook the frag header from 12563 * the reassembly list. 12564 * 12565 * Before we free the frag header, record the ECN info 12566 * to report back to the transport. 12567 */ 12568 ecn_info = ipf->ipf_ecn; 12569 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); 12570 ipfp = ipf->ipf_ptphn; 12571 12572 /* We need to supply these to caller */ 12573 if ((sum_flags = ipf->ipf_checksum_flags) != 0) 12574 sum_val = ipf->ipf_checksum; 12575 else 12576 sum_val = 0; 12577 12578 mp1 = ipf->ipf_mp; 12579 count = ipf->ipf_count; 12580 ipf = ipf->ipf_hash_next; 12581 if (ipf != NULL) 12582 ipf->ipf_ptphn = ipfp; 12583 ipfp[0] = ipf; 12584 atomic_add_32(&ill->ill_frag_count, -count); 12585 ASSERT(ipfb->ipfb_count >= count); 12586 ipfb->ipfb_count -= count; 12587 ipfb->ipfb_frag_pkts--; 12588 mutex_exit(&ipfb->ipfb_lock); 12589 /* Ditch the frag header. 
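* (i.e. free the allocb()ed mblk that carries the ipf_t bookkeeping; the reassembled data chain hangs off its b_cont).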
*/ 12590 mp = mp1->b_cont; 12591 12592 freeb(mp1); 12593 12594 /* Restore original IP length in header. */ 12595 packet_size = (uint32_t)msgdsize(mp); 12596 if (packet_size > IP_MAXPACKET) { 12597 freemsg(mp); 12598 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 12599 return (B_FALSE); 12600 } 12601 12602 if (DB_REF(mp) > 1) { 12603 mblk_t *mp2 = copymsg(mp); 12604 12605 freemsg(mp); 12606 if (mp2 == NULL) { 12607 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12608 return (B_FALSE); 12609 } 12610 mp = mp2; 12611 } 12612 ipha = (ipha_t *)mp->b_rptr; 12613 12614 ipha->ipha_length = htons((uint16_t)packet_size); 12615 /* We're now complete, zip the frag state */ 12616 ipha->ipha_fragment_offset_and_flags = 0; 12617 /* Record the ECN info. */ 12618 ipha->ipha_type_of_service &= 0xFC; 12619 ipha->ipha_type_of_service |= ecn_info; 12620 *mpp = mp; 12621 12622 /* Reassembly is successful; return checksum information if needed */ 12623 if (cksum_val != NULL) 12624 *cksum_val = sum_val; 12625 if (cksum_flags != NULL) 12626 *cksum_flags = sum_flags; 12627 12628 return (B_TRUE); 12629 } 12630 12631 /* 12632 * Verify the IP header checksum and process any local IP options. 12633 * Return B_TRUE if all is well, else return B_FALSE and release 12634 * the mp. The caller is responsible for decrementing the ire ref cnt. 12635 */ 12636 static boolean_t 12637 ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12638 ip_stack_t *ipst) 12639 { 12640 mblk_t *first_mp; 12641 boolean_t mctl_present; 12642 uint16_t sum; 12643 12644 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12645 /* 12646 * Don't do the checksum if it has gone through AH/ESP 12647 * processing. 12648 */ 12649 if (!mctl_present) { 12650 sum = ip_csum_hdr(ipha); 12651 if (sum != 0) { 12652 if (ill != NULL) { 12653 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 12654 } else { 12655 BUMP_MIB(&ipst->ips_ip_mib, 12656 ipIfStatsInCksumErrs); 12657 } 12658 freemsg(first_mp); 12659 return (B_FALSE); 12660 } 12661 } 12662 12663 if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) { 12664 if (mctl_present) 12665 freeb(first_mp); 12666 return (B_FALSE); 12667 } 12668 12669 return (B_TRUE); 12670 } 12671 12672 /* 12673 * All UDP packets are delivered to the local host via this routine. 12674 */ 12675 void 12676 ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12677 ill_t *recv_ill) 12678 { 12679 uint32_t sum; 12680 uint32_t u1; 12681 boolean_t mctl_present; 12682 conn_t *connp; 12683 mblk_t *first_mp; 12684 uint16_t *up; 12685 ill_t *ill = (ill_t *)q->q_ptr; 12686 uint16_t reass_hck_flags = 0; 12687 ip_stack_t *ipst; 12688 12689 ASSERT(recv_ill != NULL); 12690 ipst = recv_ill->ill_ipst; 12691 12692 #define rptr ((uchar_t *)ipha) 12693 12694 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12695 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 12696 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12697 ASSERT(ill != NULL); 12698 12699 /* 12700 * FAST PATH for udp packets 12701 */ 12702 12703 /* u1 is # words of IP options */ 12704 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 12705 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12706 12707 /* IP options present */ 12708 if (u1 != 0) 12709 goto ipoptions; 12710 12711 /* Check the IP header checksum.
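* If the NIC has already verified it (IS_IP_HDR_HWCKSUM) we only clear the offload flag; otherwise we sum the ten 16-bit words of the 20-byte simple header and fold the carries -- a valid header folds to 0 or 0xFFFF, the two values the test below accepts.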
*/ 12712 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12713 /* Clear the IP header h/w cksum flag */ 12714 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12715 } else if (!mctl_present) { 12716 /* 12717 * Don't verify header checksum if this packet is coming 12718 * back from AH/ESP as we already did it. 12719 */ 12720 #define uph ((uint16_t *)ipha) 12721 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + 12722 uph[6] + uph[7] + uph[8] + uph[9]; 12723 #undef uph 12724 /* finish doing IP checksum */ 12725 sum = (sum & 0xFFFF) + (sum >> 16); 12726 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12727 if (sum != 0 && sum != 0xFFFF) { 12728 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 12729 freemsg(first_mp); 12730 return; 12731 } 12732 } 12733 12734 /* 12735 * Count for SNMP of inbound packets for ire. 12736 * if mctl is present this might be a secure packet and 12737 * has already been counted for in ip_proto_input(). 12738 */ 12739 if (!mctl_present) { 12740 UPDATE_IB_PKT_COUNT(ire); 12741 ire->ire_last_used_time = lbolt; 12742 } 12743 12744 /* packet part of fragmented IP packet? */ 12745 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12746 if (u1 & (IPH_MF | IPH_OFFSET)) { 12747 goto fragmented; 12748 } 12749 12750 /* u1 = IP header length (20 bytes) */ 12751 u1 = IP_SIMPLE_HDR_LENGTH; 12752 12753 /* packet does not contain complete IP & UDP headers */ 12754 if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) 12755 goto udppullup; 12756 12757 /* up points to UDP header */ 12758 up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); 12759 #define iphs ((uint16_t *)ipha) 12760 12761 /* if udp hdr cksum != 0, then need to checksum udp packet */ 12762 if (up[3] != 0) { 12763 mblk_t *mp1 = mp->b_cont; 12764 boolean_t cksum_err; 12765 uint16_t hck_flags = 0; 12766 12767 /* Pseudo-header checksum */ 12768 u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12769 iphs[9] + up[2]; 12770 12771 /* 12772 * Revert to software checksum calculation if the interface 12773 * isn't capable of checksum offload or if IPsec is present. 12774 */ 12775 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 12776 hck_flags = DB_CKSUMFLAGS(mp); 12777 12778 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12779 IP_STAT(ipst, ip_in_sw_cksum); 12780 12781 IP_CKSUM_RECV(hck_flags, u1, 12782 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 12783 (int32_t)((uchar_t *)up - rptr), 12784 mp, mp1, cksum_err); 12785 12786 if (cksum_err) { 12787 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 12788 if (hck_flags & HCK_FULLCKSUM) 12789 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 12790 else if (hck_flags & HCK_PARTIALCKSUM) 12791 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 12792 else 12793 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 12794 12795 freemsg(first_mp); 12796 return; 12797 } 12798 } 12799 12800 /* Non-fragmented broadcast or multicast packet? */ 12801 if (ire->ire_type == IRE_BROADCAST) 12802 goto udpslowpath; 12803 12804 if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, 12805 ire->ire_zoneid, ipst)) != NULL) { 12806 ASSERT(connp->conn_upq != NULL); 12807 IP_STAT(ipst, ip_udp_fast_path); 12808 12809 if (CONN_UDP_FLOWCTLD(connp)) { 12810 freemsg(mp); 12811 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 12812 } else { 12813 if (!mctl_present) { 12814 BUMP_MIB(ill->ill_ip_mib, 12815 ipIfStatsHCInDelivers); 12816 } 12817 /* 12818 * mp and first_mp can change. 
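                 * ip_udp_check() may pull the message up or strip an
                 * attached IPsec control block, which is why both
                 * pointers are passed by reference; if it returns
                 * B_FALSE the packet has already been disposed of.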
             */
            if (ip_udp_check(q, connp, recv_ill,
                ipha, &mp, &first_mp, mctl_present, ire)) {
                /* Send it upstream */
                (connp->conn_recv)(connp, mp, NULL);
            }
        }
        /*
         * freeb() cannot deal with a null mblk being passed
         * in, and first_mp can be set to null in the call
         * ipsec_input_fast_proc()->ipsec_check_inbound_policy.
         */
        if (mctl_present && first_mp != NULL) {
            freeb(first_mp);
        }
        CONN_DEC_REF(connp);
        return;
    }

    /*
     * If we got here we know the packet is not fragmented and
     * has no options. The classifier could not find a conn_t and
     * most likely it's an ICMP packet, so send it through the slow path.
     */

    goto udpslowpath;

ipoptions:
    if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) {
        goto slow_done;
    }

    UPDATE_IB_PKT_COUNT(ire);
    ire->ire_last_used_time = lbolt;
    u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
    if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
        /*
         * "sum" and "reass_hck_flags" are non-zero if the
         * reassembled packet has valid hardware-computed
         * checksum information associated with it.
         */
        if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
            goto slow_done;
        /*
         * Make sure that first_mp points back to mp as
         * the mp we came in with could have changed in
         * ip_rput_fragment().
         */
        ASSERT(!mctl_present);
        ipha = (ipha_t *)mp->b_rptr;
        first_mp = mp;
    }

    /* Now we have a complete datagram, destined for this machine. */
    u1 = IPH_HDR_LENGTH(ipha);
    /* Pull up the UDP header, if necessary. */
    if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
udppullup:
        if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
            BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
            freemsg(first_mp);
            goto slow_done;
        }
        ipha = (ipha_t *)mp->b_rptr;
    }

    /*
     * Validate the checksum for the reassembled packet; for the
     * pullup case we calculate the payload checksum in software.
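     * When reass_hck_flags is zero, the IP_CKSUM_RECV_REASS() macro
     * below in effect falls back to summing the entire UDP payload in
     * software; otherwise it folds the hardware partial sums recorded
     * during reassembly into the pseudo-header sum.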
12889 */ 12890 up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); 12891 if (up[3] != 0) { 12892 boolean_t cksum_err; 12893 12894 if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12895 IP_STAT(ipst, ip_in_sw_cksum); 12896 12897 IP_CKSUM_RECV_REASS(reass_hck_flags, 12898 (int32_t)((uchar_t *)up - (uchar_t *)ipha), 12899 IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12900 iphs[9] + up[2], sum, cksum_err); 12901 12902 if (cksum_err) { 12903 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 12904 12905 if (reass_hck_flags & HCK_FULLCKSUM) 12906 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 12907 else if (reass_hck_flags & HCK_PARTIALCKSUM) 12908 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 12909 else 12910 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 12911 12912 freemsg(first_mp); 12913 goto slow_done; 12914 } 12915 } 12916 udpslowpath: 12917 12918 /* Clear hardware checksum flag to be safe */ 12919 DB_CKSUMFLAGS(mp) = 0; 12920 12921 ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, 12922 (ire->ire_type == IRE_BROADCAST), 12923 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO, 12924 mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); 12925 12926 slow_done: 12927 IP_STAT(ipst, ip_udp_slow_path); 12928 return; 12929 12930 #undef iphs 12931 #undef rptr 12932 } 12933 12934 /* ARGSUSED */ 12935 static mblk_t * 12936 ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 12937 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, 12938 ill_rx_ring_t *ill_ring) 12939 { 12940 conn_t *connp; 12941 uint32_t sum; 12942 uint32_t u1; 12943 uint16_t *up; 12944 int offset; 12945 ssize_t len; 12946 mblk_t *mp1; 12947 boolean_t syn_present = B_FALSE; 12948 tcph_t *tcph; 12949 uint_t ip_hdr_len; 12950 ill_t *ill = (ill_t *)q->q_ptr; 12951 zoneid_t zoneid = ire->ire_zoneid; 12952 boolean_t cksum_err; 12953 uint16_t hck_flags = 0; 12954 ip_stack_t *ipst = recv_ill->ill_ipst; 12955 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 12956 12957 #define rptr ((uchar_t *)ipha) 12958 12959 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12960 ASSERT(ill != NULL); 12961 12962 /* 12963 * FAST PATH for tcp packets 12964 */ 12965 12966 /* u1 is # words of IP options */ 12967 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12968 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12969 12970 /* IP options present */ 12971 if (u1) { 12972 goto ipoptions; 12973 } else if (!mctl_present) { 12974 /* Check the IP header checksum. */ 12975 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12976 /* Clear the IP header h/w cksum flag */ 12977 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12978 } else if (!mctl_present) { 12979 /* 12980 * Don't verify header checksum if this packet 12981 * is coming back from AH/ESP as we already did it. 12982 */ 12983 #define uph ((uint16_t *)ipha) 12984 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 12985 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 12986 #undef uph 12987 /* finish doing IP checksum */ 12988 sum = (sum & 0xFFFF) + (sum >> 16); 12989 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12990 if (sum != 0 && sum != 0xFFFF) { 12991 BUMP_MIB(ill->ill_ip_mib, 12992 ipIfStatsInCksumErrs); 12993 goto error; 12994 } 12995 } 12996 } 12997 12998 if (!mctl_present) { 12999 UPDATE_IB_PKT_COUNT(ire); 13000 ire->ire_last_used_time = lbolt; 13001 } 13002 13003 /* packet part of fragmented IP packet? 
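     * A datagram is a fragment if the More Fragments flag is set or
     * the fragment offset is non-zero; in effect (illustrative only):
     *
     *	is_frag = (ntohs(ipha->ipha_fragment_offset_and_flags) &
     *	    (IPH_MF | IPH_OFFSET)) != 0;
     *
     * Only the first fragment has a zero offset, and only the last
     * has IPH_MF clear, so an unfragmented packet has neither set.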
 */
    u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
    if (u1 & (IPH_MF | IPH_OFFSET)) {
        goto fragmented;
    }

    /* u1 = IP header length (20 bytes) */
    u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH;

    /* does packet contain IP+TCP headers? */
    len = mp->b_wptr - rptr;
    if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) {
        IP_STAT(ipst, ip_tcppullup);
        goto tcppullup;
    }

    /* TCP options present? */
    offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4;

    /*
     * If options need to be pulled up, then goto tcpoptions.
     * Otherwise we are still in the fast path.
     */
    if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) {
        IP_STAT(ipst, ip_tcpoptions);
        goto tcpoptions;
    }

    /* multiple mblks of tcp data? */
    if ((mp1 = mp->b_cont) != NULL) {
        /* more than two? */
        if (mp1->b_cont != NULL) {
            IP_STAT(ipst, ip_multipkttcp);
            goto multipkttcp;
        }
        len += mp1->b_wptr - mp1->b_rptr;
    }

    up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET);

    /* part of pseudo checksum */

    /* TCP datagram length */
    u1 = len - IP_SIMPLE_HDR_LENGTH;

#define iphs    ((uint16_t *)ipha)

#ifdef  _BIG_ENDIAN
    u1 += IPPROTO_TCP;
#else
    u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
#endif
    u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];

    /*
     * Revert to software checksum calculation if the interface
     * isn't capable of checksum offload or if IPsec is present.
     */
    if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
        hck_flags = DB_CKSUMFLAGS(mp);

    if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
        IP_STAT(ipst, ip_in_sw_cksum);

    IP_CKSUM_RECV(hck_flags, u1,
        (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
        (int32_t)((uchar_t *)up - rptr),
        mp, mp1, cksum_err);

    if (cksum_err) {
        BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);

        if (hck_flags & HCK_FULLCKSUM)
            IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
        else if (hck_flags & HCK_PARTIALCKSUM)
            IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
        else
            IP_STAT(ipst, ip_tcp_in_sw_cksum_err);

        goto error;
    }

try_again:

    if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len,
        zoneid, ipst)) == NULL) {
        /* Send the TH_RST */
        goto no_conn;
    }

    /*
     * TCP FAST PATH for AF_INET socket.
     *
     * TCP fast path to avoid extra work. An AF_INET socket type
     * does not have the facility to receive extra information via
     * ip_process or ip_add_info. Also, when the connection was
     * established, we checked whether this connection is impacted
     * by any global IPsec policy or per-connection policy (a
     * policy that comes into effect later will not apply to this
     * connection). Since all this can be determined at
     * connection establishment time, a quick check of flags
     * can avoid extra work.
13105 */ 13106 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 13107 !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13108 ASSERT(first_mp == mp); 13109 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13110 SET_SQUEUE(mp, tcp_rput_data, connp); 13111 return (mp); 13112 } 13113 13114 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 13115 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 13116 if (IPCL_IS_TCP(connp)) { 13117 mp->b_datap->db_struioflag |= STRUIO_EAGER; 13118 DB_CKSUMSTART(mp) = 13119 (intptr_t)ip_squeue_get(ill_ring); 13120 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 13121 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13122 BUMP_MIB(ill->ill_ip_mib, 13123 ipIfStatsHCInDelivers); 13124 SET_SQUEUE(mp, connp->conn_recv, connp); 13125 return (mp); 13126 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 13127 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13128 BUMP_MIB(ill->ill_ip_mib, 13129 ipIfStatsHCInDelivers); 13130 ip_squeue_enter_unbound++; 13131 SET_SQUEUE(mp, tcp_conn_request_unbound, 13132 connp); 13133 return (mp); 13134 } 13135 syn_present = B_TRUE; 13136 } 13137 13138 } 13139 13140 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 13141 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 13142 13143 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13144 /* No need to send this packet to TCP */ 13145 if ((flags & TH_RST) || (flags & TH_URG)) { 13146 CONN_DEC_REF(connp); 13147 freemsg(first_mp); 13148 return (NULL); 13149 } 13150 if (flags & TH_ACK) { 13151 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 13152 ipst->ips_netstack->netstack_tcp, connp); 13153 CONN_DEC_REF(connp); 13154 return (NULL); 13155 } 13156 13157 CONN_DEC_REF(connp); 13158 freemsg(first_mp); 13159 return (NULL); 13160 } 13161 13162 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 13163 first_mp = ipsec_check_inbound_policy(first_mp, connp, 13164 ipha, NULL, mctl_present); 13165 if (first_mp == NULL) { 13166 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13167 CONN_DEC_REF(connp); 13168 return (NULL); 13169 } 13170 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 13171 ASSERT(syn_present); 13172 if (mctl_present) { 13173 ASSERT(first_mp != mp); 13174 first_mp->b_datap->db_struioflag |= 13175 STRUIO_POLICY; 13176 } else { 13177 ASSERT(first_mp == mp); 13178 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 13179 mp->b_datap->db_struioflag |= STRUIO_POLICY; 13180 } 13181 } else { 13182 /* 13183 * Discard first_mp early since we're dealing with a 13184 * fully-connected conn_t and tcp doesn't do policy in 13185 * this case. 13186 */ 13187 if (mctl_present) { 13188 freeb(first_mp); 13189 mctl_present = B_FALSE; 13190 } 13191 first_mp = mp; 13192 } 13193 } 13194 13195 /* Initiate IPPF processing for fastpath */ 13196 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13197 uint32_t ill_index; 13198 13199 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13200 ip_process(IPP_LOCAL_IN, &mp, ill_index); 13201 if (mp == NULL) { 13202 ip2dbg(("ip_input_ipsec_process: TCP pkt " 13203 "deferred/dropped during IPPF processing\n")); 13204 CONN_DEC_REF(connp); 13205 if (mctl_present) 13206 freeb(first_mp); 13207 return (NULL); 13208 } else if (mctl_present) { 13209 /* 13210 * ip_process might return a new mp. 
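             * When an IPsec M_CTL precedes the data, first_mp is that
             * control block and the (possibly replaced) data mblk must
             * be re-linked as its b_cont; otherwise the two pointers
             * are one and the same.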
13211 */ 13212 ASSERT(first_mp != mp); 13213 first_mp->b_cont = mp; 13214 } else { 13215 first_mp = mp; 13216 } 13217 13218 } 13219 13220 if (!syn_present && connp->conn_ip_recvpktinfo) { 13221 /* 13222 * TCP does not support IP_RECVPKTINFO for v4 so lets 13223 * make sure IPF_RECVIF is passed to ip_add_info. 13224 */ 13225 mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, 13226 IPCL_ZONEID(connp), ipst); 13227 if (mp == NULL) { 13228 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13229 CONN_DEC_REF(connp); 13230 if (mctl_present) 13231 freeb(first_mp); 13232 return (NULL); 13233 } else if (mctl_present) { 13234 /* 13235 * ip_add_info might return a new mp. 13236 */ 13237 ASSERT(first_mp != mp); 13238 first_mp->b_cont = mp; 13239 } else { 13240 first_mp = mp; 13241 } 13242 } 13243 13244 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13245 if (IPCL_IS_TCP(connp)) { 13246 SET_SQUEUE(first_mp, connp->conn_recv, connp); 13247 return (first_mp); 13248 } else { 13249 /* SOCK_RAW, IPPROTO_TCP case */ 13250 (connp->conn_recv)(connp, first_mp, NULL); 13251 CONN_DEC_REF(connp); 13252 return (NULL); 13253 } 13254 13255 no_conn: 13256 /* Initiate IPPf processing, if needed. */ 13257 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13258 uint32_t ill_index; 13259 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13260 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 13261 if (first_mp == NULL) { 13262 return (NULL); 13263 } 13264 } 13265 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13266 13267 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, 13268 ipst->ips_netstack->netstack_tcp, NULL); 13269 return (NULL); 13270 ipoptions: 13271 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { 13272 goto slow_done; 13273 } 13274 13275 UPDATE_IB_PKT_COUNT(ire); 13276 ire->ire_last_used_time = lbolt; 13277 13278 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13279 if (u1 & (IPH_MF | IPH_OFFSET)) { 13280 fragmented: 13281 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 13282 if (mctl_present) 13283 freeb(first_mp); 13284 goto slow_done; 13285 } 13286 /* 13287 * Make sure that first_mp points back to mp as 13288 * the mp we came in with could have changed in 13289 * ip_rput_fragment(). 13290 */ 13291 ASSERT(!mctl_present); 13292 ipha = (ipha_t *)mp->b_rptr; 13293 first_mp = mp; 13294 } 13295 13296 /* Now we have a complete datagram, destined for this machine. */ 13297 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 13298 13299 len = mp->b_wptr - mp->b_rptr; 13300 /* Pull up a minimal TCP header, if necessary. */ 13301 if (len < (u1 + 20)) { 13302 tcppullup: 13303 if (!pullupmsg(mp, u1 + 20)) { 13304 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13305 goto error; 13306 } 13307 ipha = (ipha_t *)mp->b_rptr; 13308 len = mp->b_wptr - mp->b_rptr; 13309 } 13310 13311 /* 13312 * Extract the offset field from the TCP header. As usual, we 13313 * try to help the compiler more than the reader. 13314 */ 13315 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 13316 if (offset != 5) { 13317 tcpoptions: 13318 if (offset < 5) { 13319 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13320 goto error; 13321 } 13322 /* 13323 * There must be TCP options. 13324 * Make sure we can grab them. 
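         * The data offset counts 32-bit words, so offset << 2 is the
         * TCP header length in bytes: e.g. an offset of 8 means a
         * 32-byte TCP header, and with a 20-byte IP header we must be
         * able to read u1 + 32 contiguous bytes before looking at the
         * options.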
         */
        offset <<= 2;
        offset += u1;
        if (len < offset) {
            if (!pullupmsg(mp, offset)) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                goto error;
            }
            ipha = (ipha_t *)mp->b_rptr;
            len = mp->b_wptr - rptr;
        }
    }

    /* Get the total packet length in len, including headers. */
    if (mp->b_cont) {
multipkttcp:
        len = msgdsize(mp);
    }

    /*
     * Check the TCP checksum by pulling together the pseudo-
     * header checksum, and passing it to ip_csum to be added in
     * with the TCP datagram.
     *
     * Since we are not using the hardware checksum even if it is
     * available, we must clear the flag. We may come here via
     * tcppullup or tcpoptions. If either of these fails along the
     * way the mblk is freed. If this logic ever changes and the
     * mblk is reused, to say send ICMPs back, then this flag may
     * need to be cleared in other places as well.
     */
    DB_CKSUMFLAGS(mp) = 0;

    up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);

    u1 = (uint32_t)(len - u1);  /* TCP datagram length. */
#ifdef  _BIG_ENDIAN
    u1 += IPPROTO_TCP;
#else
    u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
#endif
    u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
    /*
     * Not an M_DATA mblk, or it's a dup, so do the checksum now.
     */
    IP_STAT(ipst, ip_in_sw_cksum);
    if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
        BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
        goto error;
    }

    IP_STAT(ipst, ip_tcp_slow_path);
    goto try_again;
#undef  iphs
#undef  rptr

error:
    freemsg(first_mp);
slow_done:
    return (NULL);
}

/* ARGSUSED */
static void
ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
    ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst)
{
    conn_t *connp;
    uint32_t sum;
    uint32_t u1;
    ssize_t len;
    sctp_hdr_t *sctph;
    zoneid_t zoneid = ire->ire_zoneid;
    uint32_t pktsum;
    uint32_t calcsum;
    uint32_t ports;
    in6_addr_t map_src, map_dst;
    ill_t *ill = (ill_t *)q->q_ptr;
    ip_stack_t *ipst;
    sctp_stack_t *sctps;
    boolean_t sctp_csum_err = B_FALSE;

    ASSERT(recv_ill != NULL);
    ipst = recv_ill->ill_ipst;
    sctps = ipst->ips_netstack->netstack_sctp;

#define rptr    ((uchar_t *)ipha)

    ASSERT(ipha->ipha_protocol == IPPROTO_SCTP);
    ASSERT(ill != NULL);

    /* u1 is # words of IP options */
    u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
        + IP_SIMPLE_HDR_LENGTH_IN_WORDS);

    /* IP options present */
    if (u1 > 0) {
        goto ipoptions;
    } else {
        /* Check the IP header checksum. */
        if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) &&
            !mctl_present) {
#define uph ((uint16_t *)ipha)
            sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
                uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
#undef  uph
            /* finish doing IP checksum */
            sum = (sum & 0xFFFF) + (sum >> 16);
            sum = ~(sum + (sum >> 16)) & 0xFFFF;
            /*
             * Don't verify header checksum if this packet
             * is coming back from AH/ESP as we already did it.
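             * Note that this only covers the IP header; the SCTP
             * checksum covers the whole SCTP packet and is verified
             * separately via sctp_cksum() once the common header has
             * been pulled up.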
13437 */ 13438 if (sum != 0 && sum != 0xFFFF) { 13439 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 13440 goto error; 13441 } 13442 } 13443 /* 13444 * Since there is no SCTP h/w cksum support yet, just 13445 * clear the flag. 13446 */ 13447 DB_CKSUMFLAGS(mp) = 0; 13448 } 13449 13450 /* 13451 * Don't verify header checksum if this packet is coming 13452 * back from AH/ESP as we already did it. 13453 */ 13454 if (!mctl_present) { 13455 UPDATE_IB_PKT_COUNT(ire); 13456 ire->ire_last_used_time = lbolt; 13457 } 13458 13459 /* packet part of fragmented IP packet? */ 13460 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13461 if (u1 & (IPH_MF | IPH_OFFSET)) 13462 goto fragmented; 13463 13464 /* u1 = IP header length (20 bytes) */ 13465 u1 = IP_SIMPLE_HDR_LENGTH; 13466 13467 find_sctp_client: 13468 /* Pullup if we don't have the sctp common header. */ 13469 len = MBLKL(mp); 13470 if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { 13471 if (mp->b_cont == NULL || 13472 !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { 13473 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13474 goto error; 13475 } 13476 ipha = (ipha_t *)mp->b_rptr; 13477 len = MBLKL(mp); 13478 } 13479 13480 sctph = (sctp_hdr_t *)(rptr + u1); 13481 #ifdef DEBUG 13482 if (!skip_sctp_cksum) { 13483 #endif 13484 pktsum = sctph->sh_chksum; 13485 sctph->sh_chksum = 0; 13486 calcsum = sctp_cksum(mp, u1); 13487 sctph->sh_chksum = pktsum; 13488 if (calcsum != pktsum) 13489 sctp_csum_err = B_TRUE; 13490 #ifdef DEBUG /* skip_sctp_cksum */ 13491 } 13492 #endif 13493 /* get the ports */ 13494 ports = *(uint32_t *)&sctph->sh_sport; 13495 13496 IRE_REFRELE(ire); 13497 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 13498 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 13499 if (sctp_csum_err) { 13500 /* 13501 * No potential sctp checksum errors go to the Sun 13502 * sctp stack however they might be Adler-32 summed 13503 * packets a userland stack bound to a raw IP socket 13504 * could reasonably use. Note though that Adler-32 is 13505 * a long deprecated algorithm and customer sctp 13506 * networks should eventually migrate to CRC-32 at 13507 * which time this facility should be removed. 13508 */ 13509 flags |= IP_FF_SCTP_CSUM_ERR; 13510 goto no_conn; 13511 } 13512 if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, 13513 sctps)) == NULL) { 13514 /* Check for raw socket or OOTB handling */ 13515 goto no_conn; 13516 } 13517 13518 /* Found a client; up it goes */ 13519 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13520 sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); 13521 return; 13522 13523 no_conn: 13524 ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, 13525 ports, mctl_present, flags, B_TRUE, zoneid); 13526 return; 13527 13528 ipoptions: 13529 DB_CKSUMFLAGS(mp) = 0; 13530 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) 13531 goto slow_done; 13532 13533 UPDATE_IB_PKT_COUNT(ire); 13534 ire->ire_last_used_time = lbolt; 13535 13536 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13537 if (u1 & (IPH_MF | IPH_OFFSET)) { 13538 fragmented: 13539 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) 13540 goto slow_done; 13541 /* 13542 * Make sure that first_mp points back to mp as 13543 * the mp we came in with could have changed in 13544 * ip_rput_fragment(). 13545 */ 13546 ASSERT(!mctl_present); 13547 ipha = (ipha_t *)mp->b_rptr; 13548 first_mp = mp; 13549 } 13550 13551 /* Now we have a complete datagram, destined for this machine. 
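 * Re-derive the header length here: the reassembled datagram may carry
 * IP options, so the fixed IP_SIMPLE_HDR_LENGTH used on the fast path
 * no longer applies.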
*/ 13552 u1 = IPH_HDR_LENGTH(ipha); 13553 goto find_sctp_client; 13554 #undef iphs 13555 #undef rptr 13556 13557 error: 13558 freemsg(first_mp); 13559 slow_done: 13560 IRE_REFRELE(ire); 13561 } 13562 13563 #define VER_BITS 0xF0 13564 #define VERSION_6 0x60 13565 13566 static boolean_t 13567 ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, 13568 ipaddr_t *dstp, ip_stack_t *ipst) 13569 { 13570 uint_t opt_len; 13571 ipha_t *ipha; 13572 ssize_t len; 13573 uint_t pkt_len; 13574 13575 ASSERT(ill != NULL); 13576 IP_STAT(ipst, ip_ipoptions); 13577 ipha = *iphapp; 13578 13579 #define rptr ((uchar_t *)ipha) 13580 /* Assume no IPv6 packets arrive over the IPv4 queue */ 13581 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 13582 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); 13583 freemsg(mp); 13584 return (B_FALSE); 13585 } 13586 13587 /* multiple mblk or too short */ 13588 pkt_len = ntohs(ipha->ipha_length); 13589 13590 /* Get the number of words of IP options in the IP header. */ 13591 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 13592 if (opt_len) { 13593 /* IP Options present! Validate and process. */ 13594 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 13595 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13596 goto done; 13597 } 13598 /* 13599 * Recompute complete header length and make sure we 13600 * have access to all of it. 13601 */ 13602 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 13603 if (len > (mp->b_wptr - rptr)) { 13604 if (len > pkt_len) { 13605 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13606 goto done; 13607 } 13608 if (!pullupmsg(mp, len)) { 13609 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13610 goto done; 13611 } 13612 ipha = (ipha_t *)mp->b_rptr; 13613 } 13614 /* 13615 * Go off to ip_rput_options which returns the next hop 13616 * destination address, which may have been affected 13617 * by source routing. 13618 */ 13619 IP_STAT(ipst, ip_opt); 13620 if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { 13621 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13622 return (B_FALSE); 13623 } 13624 } 13625 *iphapp = ipha; 13626 return (B_TRUE); 13627 done: 13628 /* clear b_prev - used by ip_mroute_decap */ 13629 mp->b_prev = NULL; 13630 freemsg(mp); 13631 return (B_FALSE); 13632 #undef rptr 13633 } 13634 13635 /* 13636 * Deal with the fact that there is no ire for the destination. 13637 */ 13638 static ire_t * 13639 ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) 13640 { 13641 ipha_t *ipha; 13642 ill_t *ill; 13643 ire_t *ire; 13644 ip_stack_t *ipst; 13645 enum ire_forward_action ret_action; 13646 13647 ipha = (ipha_t *)mp->b_rptr; 13648 ill = (ill_t *)q->q_ptr; 13649 13650 ASSERT(ill != NULL); 13651 ipst = ill->ill_ipst; 13652 13653 /* 13654 * No IRE for this destination, so it can't be for us. 13655 * Unless we are forwarding, drop the packet. 13656 * We have to let source routed packets through 13657 * since we don't yet know if they are 'ping -l' 13658 * packets i.e. if they will go out over the 13659 * same interface as they came in on. 13660 */ 13661 if (ll_multicast) { 13662 freemsg(mp); 13663 return (NULL); 13664 } 13665 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 13666 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13667 freemsg(mp); 13668 return (NULL); 13669 } 13670 13671 /* 13672 * Mark this packet as having originated externally. 
13673 * 13674 * For non-forwarding code path, ire_send later double 13675 * checks this interface to see if it is still exists 13676 * post-ARP resolution. 13677 * 13678 * Also, IPQOS uses this to differentiate between 13679 * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP 13680 * QOS packet processing in ip_wput_attach_llhdr(). 13681 * The QoS module can mark the b_band for a fastpath message 13682 * or the dl_priority field in a unitdata_req header for 13683 * CoS marking. This info can only be found in 13684 * ip_wput_attach_llhdr(). 13685 */ 13686 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 13687 /* 13688 * Clear the indication that this may have a hardware checksum 13689 * as we are not using it 13690 */ 13691 DB_CKSUMFLAGS(mp) = 0; 13692 13693 ire = ire_forward(dst, &ret_action, NULL, NULL, 13694 MBLK_GETLABEL(mp), ipst); 13695 13696 if (ire == NULL && ret_action == Forward_check_multirt) { 13697 /* Let ip_newroute handle CGTP */ 13698 ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); 13699 return (NULL); 13700 } 13701 13702 if (ire != NULL) 13703 return (ire); 13704 13705 mp->b_prev = mp->b_next = 0; 13706 13707 if (ret_action == Forward_blackhole) { 13708 freemsg(mp); 13709 return (NULL); 13710 } 13711 /* send icmp unreachable */ 13712 q = WR(q); 13713 /* Sent by forwarding path, and router is global zone */ 13714 if (ip_source_routed(ipha, ipst)) { 13715 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, 13716 GLOBAL_ZONEID, ipst); 13717 } else { 13718 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, 13719 ipst); 13720 } 13721 13722 return (NULL); 13723 13724 } 13725 13726 /* 13727 * check ip header length and align it. 13728 */ 13729 static boolean_t 13730 ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) 13731 { 13732 ssize_t len; 13733 ill_t *ill; 13734 ipha_t *ipha; 13735 13736 len = MBLKL(mp); 13737 13738 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13739 ill = (ill_t *)q->q_ptr; 13740 13741 if (!OK_32PTR(mp->b_rptr)) 13742 IP_STAT(ipst, ip_notaligned1); 13743 else 13744 IP_STAT(ipst, ip_notaligned2); 13745 /* Guard against bogus device drivers */ 13746 if (len < 0) { 13747 /* clear b_prev - used by ip_mroute_decap */ 13748 mp->b_prev = NULL; 13749 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13750 freemsg(mp); 13751 return (B_FALSE); 13752 } 13753 13754 if (ip_rput_pullups++ == 0) { 13755 ipha = (ipha_t *)mp->b_rptr; 13756 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 13757 "ip_check_and_align_header: %s forced us to " 13758 " pullup pkt, hdr len %ld, hdr addr %p", 13759 ill->ill_name, len, ipha); 13760 } 13761 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 13762 /* clear b_prev - used by ip_mroute_decap */ 13763 mp->b_prev = NULL; 13764 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13765 freemsg(mp); 13766 return (B_FALSE); 13767 } 13768 } 13769 return (B_TRUE); 13770 } 13771 13772 ire_t * 13773 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) 13774 { 13775 ire_t *new_ire; 13776 ill_t *ire_ill; 13777 uint_t ifindex; 13778 ip_stack_t *ipst = ill->ill_ipst; 13779 boolean_t strict_check = B_FALSE; 13780 13781 /* 13782 * This packet came in on an interface other than the one associated 13783 * with the first ire we found for the destination address. We do 13784 * another ire lookup here, using the ingress ill, to see if the 13785 * interface is in an interface group. 13786 * As long as the ills belong to the same group, we don't consider 13787 * them to be arriving on the wrong interface. 
Thus, if the switch 13788 * is doing inbound load spreading, we won't drop packets when the 13789 * ip*_strict_dst_multihoming switch is on. Note, the same holds true 13790 * for 'usesrc groups' where the destination address may belong to 13791 * another interface to allow multipathing to happen. 13792 * We also need to check for IPIF_UNNUMBERED point2point interfaces 13793 * where the local address may not be unique. In this case we were 13794 * at the mercy of the initial ire cache lookup and the IRE_LOCAL it 13795 * actually returned. The new lookup, which is more specific, should 13796 * only find the IRE_LOCAL associated with the ingress ill if one 13797 * exists. 13798 */ 13799 13800 if (ire->ire_ipversion == IPV4_VERSION) { 13801 if (ipst->ips_ip_strict_dst_multihoming) 13802 strict_check = B_TRUE; 13803 new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, 13804 ill->ill_ipif, ALL_ZONES, NULL, 13805 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13806 } else { 13807 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); 13808 if (ipst->ips_ipv6_strict_dst_multihoming) 13809 strict_check = B_TRUE; 13810 new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, 13811 IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, 13812 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13813 } 13814 /* 13815 * If the same ire that was returned in ip_input() is found then this 13816 * is an indication that interface groups are in use. The packet 13817 * arrived on a different ill in the group than the one associated with 13818 * the destination address. If a different ire was found then the same 13819 * IP address must be hosted on multiple ills. This is possible with 13820 * unnumbered point2point interfaces. We switch to use this new ire in 13821 * order to have accurate interface statistics. 13822 */ 13823 if (new_ire != NULL) { 13824 if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { 13825 ire_refrele(ire); 13826 ire = new_ire; 13827 } else { 13828 ire_refrele(new_ire); 13829 } 13830 return (ire); 13831 } else if ((ire->ire_rfq == NULL) && 13832 (ire->ire_ipversion == IPV4_VERSION)) { 13833 /* 13834 * The best match could have been the original ire which 13835 * was created against an IRE_LOCAL on lo0. In the IPv4 case 13836 * the strict multihoming checks are irrelevant as we consider 13837 * local addresses hosted on lo0 to be interface agnostic. We 13838 * only expect a null ire_rfq on IREs which are associated with 13839 * lo0 hence we can return now. 13840 */ 13841 return (ire); 13842 } 13843 13844 /* 13845 * Chase pointers once and store locally. 13846 */ 13847 ire_ill = (ire->ire_rfq == NULL) ? NULL : 13848 (ill_t *)(ire->ire_rfq->q_ptr); 13849 ifindex = ill->ill_usesrc_ifindex; 13850 13851 /* 13852 * Check if it's a legal address on the 'usesrc' interface. 13853 */ 13854 if ((ifindex != 0) && (ire_ill != NULL) && 13855 (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { 13856 return (ire); 13857 } 13858 13859 /* 13860 * If the ip*_strict_dst_multihoming switch is on then we can 13861 * only accept this packet if the interface is marked as routing. 
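     * Concretely, both the receiving ill and the ill owning the
     * destination address must have ILLF_ROUTER set; an illustrative
     * restatement of the test below:
     *
     *	accept = (ill->ill_flags & ILLF_ROUTER) != 0 &&
     *	    (ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) != 0;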
     */
    if (!(strict_check))
        return (ire);

    if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags &
        ILLF_ROUTER) != 0) {
        return (ire);
    }

    ire_refrele(ire);
    return (NULL);
}

ire_t *
ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
{
    ipha_t *ipha;
    ire_t *src_ire;
    ill_t *stq_ill;
    uint_t hlen;
    uint_t pkt_len;
    uint32_t sum;
    queue_t *dev_q;
    ip_stack_t *ipst = ill->ill_ipst;
    mblk_t *fpmp;
    enum ire_forward_action ret_action;

    ipha = (ipha_t *)mp->b_rptr;

    if (ire != NULL &&
        ire->ire_zoneid != GLOBAL_ZONEID &&
        ire->ire_zoneid != ALL_ZONES) {
        /*
         * Should only use IREs that are visible to the global
         * zone for forwarding.
         */
        ire_refrele(ire);
        ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst);
    }

    /*
     * Martian Address Filtering [RFC 1812, Section 5.3.7]
     * The loopback addresses for both src and dst have already
     * been checked in ip_input.
     */

    if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) {
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
        goto drop;
    }
    src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
        ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

    if (src_ire != NULL) {
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
        ire_refrele(src_ire);
        goto drop;
    }

    /* No ire cache for the nexthop, so first create one */
    if (ire == NULL) {

        ire = ire_forward(dst, &ret_action, NULL, NULL,
            NULL, ipst);
        /*
         * We only come to ip_fast_forward if ip_cgtp_filter
         * is not set. So ire_forward() should not return with
         * Forward_check_multirt as the next action.
         */
        ASSERT(ret_action != Forward_check_multirt);
        if (ire == NULL) {
            /* An attempt was made to forward the packet */
            BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
            BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
            mp->b_prev = mp->b_next = 0;
            /* send icmp unreachable */
            /* Sent by forwarding path, and router is global zone */
            if (ret_action == Forward_ret_icmp_err) {
                if (ip_source_routed(ipha, ipst)) {
                    icmp_unreachable(ill->ill_wq, mp,
                        ICMP_SOURCE_ROUTE_FAILED,
                        GLOBAL_ZONEID, ipst);
                } else {
                    icmp_unreachable(ill->ill_wq, mp,
                        ICMP_HOST_UNREACHABLE,
                        GLOBAL_ZONEID, ipst);
                }
            } else {
                freemsg(mp);
            }
            return (NULL);
        }
    }

    /*
     * Forwarding fastpath exception case:
     * If any of the following cases is true, we take
     * the slowpath:
     * o forwarding is not enabled
     * o incoming and outgoing interface are the same, or in the same
     *   IPMP group
     * o corresponding ire is in incomplete state
     * o packet needs fragmentation
     * o ARP cache is not resolved
     *
     * The codeflow from here on is thus:
     *	ip_rput_process_forward->ip_rput_forward->ip_xmit_v4
     */
    pkt_len = ntohs(ipha->ipha_length);
    stq_ill = (ill_t *)ire->ire_stq->q_ptr;
    if (!(stq_ill->ill_flags & ILLF_ROUTER) ||
        !(ill->ill_flags & ILLF_ROUTER) ||
        (ill == stq_ill) ||
        (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) ||
        (ire->ire_nce == NULL) ||
        (pkt_len > ire->ire_max_frag) ||
        ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) ||
        ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) ||
        ipha->ipha_ttl <= 1) {
        ip_rput_process_forward(ill->ill_rq, mp, ire,
            ipha, ill, B_FALSE);
        return (ire);
    }
    BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

    DTRACE_PROBE4(ip4__forwarding__start,
        ill_t *, ill, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp);

    FW_HOOKS(ipst->ips_ip4_forwarding_event,
        ipst->ips_ipv4firewall_forwarding,
        ill, stq_ill, ipha, mp, mp, 0, ipst);

    DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);

    if (mp == NULL)
        goto drop;

    mp->b_datap->db_struioun.cksum.flags = 0;
    /* Adjust the checksum to reflect the ttl decrement. */
    sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
    ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
    ipha->ipha_ttl--;

    /*
     * Write the link layer header. We can do this safely here,
     * because we have already tested to make sure that the IP
     * policy is not set, and that we have a fast path destination
     * header.
     */
    mp->b_rptr -= hlen;
    bcopy(fpmp->b_rptr, mp->b_rptr, hlen);

    UPDATE_IB_PKT_COUNT(ire);
    ire->ire_last_used_time = lbolt;
    BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
    BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
    UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len);

    dev_q = ire->ire_stq->q_next;
    if ((dev_q->q_next != NULL || dev_q->q_first != NULL) &&
        !canputnext(ire->ire_stq)) {
        goto indiscard;
    }
    if (ILL_DLS_CAPABLE(stq_ill)) {
        /*
         * Send the packet directly to DLD, where it
         * may be queued depending on the availability
         * of transmit resources at the media layer.
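         * This bypasses the STREAMS putnext() path used in the else
         * branch, trading generality for lower per-packet cost on
         * drivers that export the direct-transmit (DLS) capability.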
         */
        IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst);
    } else {
        DTRACE_PROBE4(ip4__physical__out__start,
            ill_t *, NULL, ill_t *, stq_ill,
            ipha_t *, ipha, mblk_t *, mp);
        FW_HOOKS(ipst->ips_ip4_physical_out_event,
            ipst->ips_ipv4firewall_physical_out,
            NULL, stq_ill, ipha, mp, mp, 0, ipst);
        DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
        if (mp == NULL)
            goto drop;

        DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
            ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha,
            ip6_t *, NULL, int, 0);

        putnext(ire->ire_stq, mp);
    }
    return (ire);

indiscard:
    BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
drop:
    if (mp != NULL)
        freemsg(mp);
    return (ire);

}

/*
 * This function is called in the forwarding slowpath, when
 * either the ire lacks the link-layer address, or the packet needs
 * further processing (e.g. fragmentation) before transmission.
 */

static void
ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
    ill_t *ill, boolean_t ll_multicast)
{
    ill_group_t *ill_group;
    ill_group_t *ire_group;
    queue_t *dev_q;
    ire_t *src_ire;
    ip_stack_t *ipst = ill->ill_ipst;

    ASSERT(ire->ire_stq != NULL);

    mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */
    mp->b_next = NULL; /* ip_rput_noire sets dst here */

    if (ll_multicast != 0) {
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
        goto drop_pkt;
    }

    /*
     * Check if ipha_src is a broadcast address. Note that this
     * check is redundant when we get here from ip_fast_forward(),
     * which has already done this check. However, since we can
     * also get here from ip_rput_process_broadcast() or, for
     * the slow path through ip_fast_forward(), we perform
     * the check again for code-reusability.
     */
    src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
        ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
    if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) {
        if (src_ire != NULL)
            ire_refrele(src_ire);
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
        ip2dbg(("ip_rput_process_forward: Received packet with"
            " bad src/dst address on %s\n", ill->ill_name));
        goto drop_pkt;
    }

    ill_group = ill->ill_group;
    ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group;
    /*
     * Check if we want to forward this one at this time.
     * We allow source routed packets on a host provided that
     * they go out the same interface or same interface group
     * as they came in on.
     *
     * XXX To be quicker, we may wish to not chase pointers to
     * get the ILLF_ROUTER flag and instead store the
     * forwarding policy in the ire. An unfortunate
     * side-effect of that would be requiring an ire flush
     * whenever the ILLF_ROUTER flag changes.
     */
    if (((ill->ill_flags &
        ((ill_t *)ire->ire_stq->q_ptr)->ill_flags &
        ILLF_ROUTER) == 0) &&
        !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q ||
        (ill_group != NULL && ill_group == ire_group)))) {
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
        if (ip_source_routed(ipha, ipst)) {
            q = WR(q);
            /*
             * Clear the indication that this may have
             * hardware checksum as we are not using it.
14130 */ 14131 DB_CKSUMFLAGS(mp) = 0; 14132 /* Sent by forwarding path, and router is global zone */ 14133 icmp_unreachable(q, mp, 14134 ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst); 14135 return; 14136 } 14137 goto drop_pkt; 14138 } 14139 14140 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 14141 14142 /* Packet is being forwarded. Turning off hwcksum flag. */ 14143 DB_CKSUMFLAGS(mp) = 0; 14144 if (ipst->ips_ip_g_send_redirects) { 14145 /* 14146 * Check whether the incoming interface and outgoing 14147 * interface is part of the same group. If so, 14148 * send redirects. 14149 * 14150 * Check the source address to see if it originated 14151 * on the same logical subnet it is going back out on. 14152 * If so, we should be able to send it a redirect. 14153 * Avoid sending a redirect if the destination 14154 * is directly connected (i.e., ipha_dst is the same 14155 * as ire_gateway_addr or the ire_addr of the 14156 * nexthop IRE_CACHE ), or if the packet was source 14157 * routed out this interface. 14158 */ 14159 ipaddr_t src, nhop; 14160 mblk_t *mp1; 14161 ire_t *nhop_ire = NULL; 14162 14163 /* 14164 * Check whether ire_rfq and q are from the same ill 14165 * or if they are not same, they at least belong 14166 * to the same group. If so, send redirects. 14167 */ 14168 if ((ire->ire_rfq == q || 14169 (ill_group != NULL && ill_group == ire_group)) && 14170 !ip_source_routed(ipha, ipst)) { 14171 14172 nhop = (ire->ire_gateway_addr != 0 ? 14173 ire->ire_gateway_addr : ire->ire_addr); 14174 14175 if (ipha->ipha_dst == nhop) { 14176 /* 14177 * We avoid sending a redirect if the 14178 * destination is directly connected 14179 * because it is possible that multiple 14180 * IP subnets may have been configured on 14181 * the link, and the source may not 14182 * be on the same subnet as ip destination, 14183 * even though they are on the same 14184 * physical link. 14185 */ 14186 goto sendit; 14187 } 14188 14189 src = ipha->ipha_src; 14190 14191 /* 14192 * We look up the interface ire for the nexthop, 14193 * to see if ipha_src is in the same subnet 14194 * as the nexthop. 14195 * 14196 * Note that, if, in the future, IRE_CACHE entries 14197 * are obsoleted, this lookup will not be needed, 14198 * as the ire passed to this function will be the 14199 * same as the nhop_ire computed below. 14200 */ 14201 nhop_ire = ire_ftable_lookup(nhop, 0, 0, 14202 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 14203 0, NULL, MATCH_IRE_TYPE, ipst); 14204 14205 if (nhop_ire != NULL) { 14206 if ((src & nhop_ire->ire_mask) == 14207 (nhop & nhop_ire->ire_mask)) { 14208 /* 14209 * The source is directly connected. 
 * Just copy the ip header (which is
 * in the first mblk).
 */
                    mp1 = copyb(mp);
                    if (mp1 != NULL) {
                        icmp_send_redirect(WR(q), mp1,
                            nhop, ipst);
                    }
                }
                ire_refrele(nhop_ire);
            }
        }
    }
sendit:
    dev_q = ire->ire_stq->q_next;
    if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) {
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
        freemsg(mp);
        return;
    }

    ip_rput_forward(ire, ipha, mp, ill);
    return;

drop_pkt:
    ip2dbg(("ip_rput_process_forward: drop pkt\n"));
    freemsg(mp);
}

ire_t *
ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
    ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast)
{
    queue_t *q;
    uint16_t hcksumflags;
    ip_stack_t *ipst = ill->ill_ipst;

    q = *qp;

    BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);

    /*
     * Clear the indication that this may have hardware
     * checksum as we are not using it for forwarding.
     */
    hcksumflags = DB_CKSUMFLAGS(mp);
    DB_CKSUMFLAGS(mp) = 0;

    /*
     * Directed broadcast forwarding: if the packet came in over a
     * different interface than it is being routed out over, we can
     * forward it.
     */
    if (ipha->ipha_protocol == IPPROTO_TCP) {
        ire_refrele(ire);
        freemsg(mp);
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
        return (NULL);
    }
    /*
     * For multicast we have set dst to be INADDR_BROADCAST
     * for delivering to all STREAMS. IRE_MARK_NORECV is really
     * only for broadcast packets.
     */
    if (!CLASSD(ipha->ipha_dst)) {
        ire_t *new_ire;
        ipif_t *ipif;
        /*
         * For ill groups, as the switch duplicates broadcasts
         * across all the ports, we need to filter out and
         * send up only one copy. There is one copy for every
         * broadcast address on each ill. Thus, we look for a
         * specific IRE on this ill and look at IRE_MARK_NORECV
         * later to see whether this ill is eligible to receive
         * them or not. ill_nominate_bcast_rcv() nominates only
         * one set of IREs for receiving.
         */

        ipif = ipif_get_next_ipif(NULL, ill);
        if (ipif == NULL) {
            ire_refrele(ire);
            freemsg(mp);
            BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
            return (NULL);
        }
        new_ire = ire_ctable_lookup(dst, 0, 0,
            ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst);
        ipif_refrele(ipif);

        if (new_ire != NULL) {
            if (new_ire->ire_marks & IRE_MARK_NORECV) {
                ire_refrele(ire);
                ire_refrele(new_ire);
                freemsg(mp);
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                return (NULL);
            }
            /*
             * In the special case of multirouted broadcast
             * packets, we unconditionally need to "gateway"
             * them to the appropriate interface here.
             * In the normal case, this cannot happen, because
             * there is no broadcast IRE tagged with the
             * RTF_MULTIRT flag.
             */
            if (new_ire->ire_flags & RTF_MULTIRT) {
                ire_refrele(new_ire);
                if (ire->ire_rfq != NULL) {
                    q = ire->ire_rfq;
                    *qp = q;
                }
            } else {
                ire_refrele(ire);
                ire = new_ire;
            }
        } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) {
            if (!ipst->ips_ip_g_forward_directed_bcast) {
                /*
                 * Free the message if
                 * ip_g_forward_directed_bcast is turned
                 * off for non-local broadcast.
                 */
                ire_refrele(ire);
                freemsg(mp);
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                return (NULL);
            }
        } else {
            /*
             * This CGTP packet successfully passed the
             * CGTP filter, but the related CGTP
             * broadcast IRE has not been found,
             * meaning that the redundant ipif is
             * probably down. However, if we discarded
             * this packet, its duplicate would be
             * filtered out by the CGTP filter so none
             * of them would get through. So we keep
             * going with this one.
             */
            ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM);
            if (ire->ire_rfq != NULL) {
                q = ire->ire_rfq;
                *qp = q;
            }
        }
    }
    if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) {
        /*
         * Verify that there is not more than one
         * IRE_BROADCAST with this broadcast address which
         * has ire_stq set.
         * TODO: simplify, loop over all IRE's
         */
        ire_t *ire1;
        int num_stq = 0;
        mblk_t *mp1;

        /* Find the first one with ire_stq set */
        rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
        for (ire1 = ire; ire1 &&
            !ire1->ire_stq && ire1->ire_addr == ire->ire_addr;
            ire1 = ire1->ire_next)
            ;
        if (ire1) {
            ire_refrele(ire);
            ire = ire1;
            IRE_REFHOLD(ire);
        }

        /* Check if there are additional ones with stq set */
        for (ire1 = ire; ire1; ire1 = ire1->ire_next) {
            if (ire->ire_addr != ire1->ire_addr)
                break;
            if (ire1->ire_stq) {
                num_stq++;
                break;
            }
        }
        rw_exit(&ire->ire_bucket->irb_lock);
        if (num_stq == 1 && ire->ire_stq != NULL) {
            ip1dbg(("ip_rput_process_broadcast: directed "
                "broadcast to 0x%x\n",
                ntohl(ire->ire_addr)));
            mp1 = copymsg(mp);
            if (mp1) {
                switch (ipha->ipha_protocol) {
                case IPPROTO_UDP:
                    ip_udp_input(q, mp1, ipha, ire, ill);
                    break;
                default:
                    ip_proto_input(q, mp1, ipha, ire, ill,
                        B_FALSE);
                    break;
                }
            }
            /*
             * Adjust ttl to 2 (1+1) - the forward engine
             * will decrement it by one.
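             * With ip_broadcast_ttl at its default of 1, the header
             * below is stamped with a ttl of 2, so the forwarded copy
             * still leaves this router with a ttl of 1 and is not
             * forwarded any further.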
             */
            if (ip_csum_hdr(ipha)) {
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
                ip2dbg(("ip_rput_broadcast:drop pkt\n"));
                freemsg(mp);
                ire_refrele(ire);
                return (NULL);
            }
            ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
            ipha->ipha_hdr_checksum = 0;
            ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
            ip_rput_process_forward(q, mp, ire, ipha,
                ill, ll_multicast);
            ire_refrele(ire);
            return (NULL);
        }
        ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n",
            ntohl(ire->ire_addr)));
    }

    /* Restore any hardware checksum flags */
    DB_CKSUMFLAGS(mp) = hcksumflags;
    return (ire);
}

/* ARGSUSED */
static boolean_t
ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
    int *ll_multicast, ipaddr_t *dstp)
{
    ip_stack_t *ipst = ill->ill_ipst;

    BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
    UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets,
        ntohs(ipha->ipha_length));

    /*
     * Forward packets only if we have joined the allmulti
     * group on this interface.
     */
    if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) {
        int retval;

        /*
         * Clear the indication that this may have hardware
         * checksum as we are not using it.
         */
        DB_CKSUMFLAGS(mp) = 0;
        retval = ip_mforward(ill, ipha, mp);
        /* ip_mforward updates mib variables if needed */
        /* clear b_prev - used by ip_mroute_decap */
        mp->b_prev = NULL;

        switch (retval) {
        case 0:
            /*
             * pkt is okay and arrived on phyint.
             *
             * If we are running as a multicast router
             * we need to see all IGMP and/or PIM packets.
             */
            if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
                (ipha->ipha_protocol == IPPROTO_PIM)) {
                goto done;
            }
            break;
        case -1:
            /* pkt is malformed, toss it */
            goto drop_pkt;
        case 1:
            /* pkt is okay and arrived on a tunnel */
            /*
             * If we are running a multicast router
             * we need to see all IGMP packets.
             */
            if (ipha->ipha_protocol == IPPROTO_IGMP) {
                *dstp = INADDR_BROADCAST;
                *ll_multicast = 1;
                return (B_FALSE);
            }

            goto drop_pkt;
        }
    }

    ILM_WALKER_HOLD(ill);
    if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
        /*
         * This might just be caused by the fact that
         * multiple IP Multicast addresses map to the same
         * link layer multicast - no need to increment counter!
         */
        ILM_WALKER_RELE(ill);
        freemsg(mp);
        return (B_TRUE);
    }
    ILM_WALKER_RELE(ill);
done:
    ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
    /*
     * This assumes that we deliver to all streams for multicast
     * and broadcast packets.
     */
    *dstp = INADDR_BROADCAST;
    *ll_multicast = 1;
    return (B_FALSE);
drop_pkt:
    ip2dbg(("ip_rput: drop pkt\n"));
    freemsg(mp);
    return (B_TRUE);
}

/*
 * This function is used to both return an indication of whether or not
 * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND)
 * and in doing so, determine whether or not it is broadcast vs multicast.
 * For it to be a broadcast packet, we must have the appropriate mblk_t
 * hanging off the ill_t.
If this is either not present or doesn't match 14526 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed 14527 * to be multicast. Thus NICs that have no broadcast address (or no 14528 * capability for one, such as point to point links) cannot return as 14529 * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as 14530 * the return values simplifies the current use of the return value of this 14531 * function, which is to pass through the multicast/broadcast characteristic 14532 * to consumers of the netinfo/pfhooks API. While this is not cast in stone, 14533 * changing the return value to some other symbol demands the appropriate 14534 * "translation" when hpe_flags is set prior to calling hook_run() for 14535 * packet events. 14536 */ 14537 int 14538 ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) 14539 { 14540 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; 14541 mblk_t *bmp; 14542 14543 if (ind->dl_group_address) { 14544 if (ind->dl_dest_addr_offset > sizeof (*ind) && 14545 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < 14546 MBLKL(mb) && 14547 (bmp = ill->ill_bcast_mp) != NULL) { 14548 dl_unitdata_req_t *dlur; 14549 uint8_t *bphys_addr; 14550 14551 dlur = (dl_unitdata_req_t *)bmp->b_rptr; 14552 if (ill->ill_sap_length < 0) 14553 bphys_addr = (uchar_t *)dlur + 14554 dlur->dl_dest_addr_offset; 14555 else 14556 bphys_addr = (uchar_t *)dlur + 14557 dlur->dl_dest_addr_offset + 14558 ill->ill_sap_length; 14559 14560 if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, 14561 bphys_addr, ind->dl_dest_addr_length) == 0) { 14562 return (HPE_BROADCAST); 14563 } 14564 return (HPE_MULTICAST); 14565 } 14566 return (HPE_MULTICAST); 14567 } 14568 return (0); 14569 } 14570 14571 static boolean_t 14572 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 14573 int *ll_multicast, mblk_t **mpp) 14574 { 14575 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 14576 boolean_t must_copy = B_FALSE; 14577 struct iocblk *iocp; 14578 ipha_t *ipha; 14579 ip_stack_t *ipst = ill->ill_ipst; 14580 14581 #define rptr ((uchar_t *)ipha) 14582 14583 first_mp = *first_mpp; 14584 mp = *mpp; 14585 14586 ASSERT(first_mp == mp); 14587 14588 /* 14589 * if db_ref > 1 then copymsg and free original. Packet may be 14590 * changed and do not want other entity who has a reference to this 14591 * message to trip over the changes. This is a blind change because 14592 * trying to catch all places that might change packet is too 14593 * difficult (since it may be a module above this one) 14594 * 14595 * This corresponds to the non-fast path case. We walk down the full 14596 * chain in this case, and check the db_ref count of all the dblks, 14597 * and do a copymsg if required. It is possible that the db_ref counts 14598 * of the data blocks in the mblk chain can be different. 14599 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 14600 * count of 1, followed by a M_DATA block with a ref count of 2, if 14601 * 'snoop' is running. 
14602 */ 14603 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 14604 if (mp1->b_datap->db_ref > 1) { 14605 must_copy = B_TRUE; 14606 break; 14607 } 14608 } 14609 14610 if (must_copy) { 14611 mp1 = copymsg(mp); 14612 if (mp1 == NULL) { 14613 for (mp1 = mp; mp1 != NULL; 14614 mp1 = mp1->b_cont) { 14615 mp1->b_next = NULL; 14616 mp1->b_prev = NULL; 14617 } 14618 freemsg(mp); 14619 if (ill != NULL) { 14620 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14621 } else { 14622 BUMP_MIB(&ipst->ips_ip_mib, 14623 ipIfStatsInDiscards); 14624 } 14625 return (B_TRUE); 14626 } 14627 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 14628 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 14629 /* Copy b_prev - used by ip_mroute_decap */ 14630 to_mp->b_prev = from_mp->b_prev; 14631 from_mp->b_prev = NULL; 14632 } 14633 *first_mpp = first_mp = mp1; 14634 freemsg(mp); 14635 mp = mp1; 14636 *mpp = mp1; 14637 } 14638 14639 ipha = (ipha_t *)mp->b_rptr; 14640 14641 /* 14642 * previous code has a case for M_DATA. 14643 * We want to check how that happens. 14644 */ 14645 ASSERT(first_mp->b_datap->db_type != M_DATA); 14646 switch (first_mp->b_datap->db_type) { 14647 case M_PROTO: 14648 case M_PCPROTO: 14649 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 14650 DL_UNITDATA_IND) { 14651 /* Go handle anything other than data elsewhere. */ 14652 ip_rput_dlpi(q, mp); 14653 return (B_TRUE); 14654 } 14655 14656 *ll_multicast = ip_get_dlpi_mbcast(ill, mp); 14657 /* Ditch the DLPI header. */ 14658 mp1 = mp->b_cont; 14659 ASSERT(first_mp == mp); 14660 *first_mpp = mp1; 14661 freeb(mp); 14662 *mpp = mp1; 14663 return (B_FALSE); 14664 case M_IOCACK: 14665 ip1dbg(("got iocack ")); 14666 iocp = (struct iocblk *)mp->b_rptr; 14667 switch (iocp->ioc_cmd) { 14668 case DL_IOC_HDR_INFO: 14669 ill = (ill_t *)q->q_ptr; 14670 ill_fastpath_ack(ill, mp); 14671 return (B_TRUE); 14672 case SIOCSTUNPARAM: 14673 case OSIOCSTUNPARAM: 14674 /* Go through qwriter_ip */ 14675 break; 14676 case SIOCGTUNPARAM: 14677 case OSIOCGTUNPARAM: 14678 ip_rput_other(NULL, q, mp, NULL); 14679 return (B_TRUE); 14680 default: 14681 putnext(q, mp); 14682 return (B_TRUE); 14683 } 14684 /* FALLTHRU */ 14685 case M_ERROR: 14686 case M_HANGUP: 14687 /* 14688 * Since this is on the ill stream we unconditionally 14689 * bump up the refcount 14690 */ 14691 ill_refhold(ill); 14692 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14693 return (B_TRUE); 14694 case M_CTL: 14695 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 14696 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 14697 IPHADA_M_CTL)) { 14698 /* 14699 * It's an IPsec accelerated packet. 14700 * Make sure that the ill from which we received the 14701 * packet has enabled IPsec hardware acceleration. 14702 */ 14703 if (!(ill->ill_capabilities & 14704 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 14705 /* IPsec kstats: bean counter */ 14706 freemsg(mp); 14707 return (B_TRUE); 14708 } 14709 14710 /* 14711 * Make mp point to the mblk following the M_CTL, 14712 * then process according to type of mp. 14713 * After this processing, first_mp will point to 14714 * the data-attributes and mp to the pkt following 14715 * the M_CTL. 14716 */ 14717 mp = first_mp->b_cont; 14718 if (mp == NULL) { 14719 freemsg(first_mp); 14720 return (B_TRUE); 14721 } 14722 /* 14723 * A Hardware Accelerated packet can only be M_DATA 14724 * ESP or AH packet. 
14725 */ 14726 if (mp->b_datap->db_type != M_DATA) { 14727 /* non-M_DATA IPsec accelerated packet */ 14728 IPSECHW_DEBUG(IPSECHW_PKT, 14729 ("non-M_DATA IPsec accelerated pkt\n")); 14730 freemsg(first_mp); 14731 return (B_TRUE); 14732 } 14733 ipha = (ipha_t *)mp->b_rptr; 14734 if (ipha->ipha_protocol != IPPROTO_AH && 14735 ipha->ipha_protocol != IPPROTO_ESP) { 14736 IPSECHW_DEBUG(IPSECHW_PKT, 14737 ("non-AH/ESP IPsec accelerated pkt\n")); 14738 freemsg(first_mp); 14739 return (B_TRUE); 14740 } 14741 *mpp = mp; 14742 return (B_FALSE); 14743 } 14744 putnext(q, mp); 14745 return (B_TRUE); 14746 case M_IOCNAK: 14747 ip1dbg(("got iocnak ")); 14748 iocp = (struct iocblk *)mp->b_rptr; 14749 switch (iocp->ioc_cmd) { 14750 case SIOCSTUNPARAM: 14751 case OSIOCSTUNPARAM: 14752 /* 14753 * Since this is on the ill stream we unconditionally 14754 * bump up the refcount 14755 */ 14756 ill_refhold(ill); 14757 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14758 return (B_TRUE); 14759 case DL_IOC_HDR_INFO: 14760 case SIOCGTUNPARAM: 14761 case OSIOCGTUNPARAM: 14762 ip_rput_other(NULL, q, mp, NULL); 14763 return (B_TRUE); 14764 default: 14765 break; 14766 } 14767 /* FALLTHRU */ 14768 default: 14769 putnext(q, mp); 14770 return (B_TRUE); 14771 } 14772 } 14773 14774 /* Read side put procedure. Packets coming from the wire arrive here. */ 14775 void 14776 ip_rput(queue_t *q, mblk_t *mp) 14777 { 14778 ill_t *ill; 14779 union DL_primitives *dl; 14780 14781 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); 14782 14783 ill = (ill_t *)q->q_ptr; 14784 14785 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { 14786 /* 14787 * If things are opening or closing, only accept high-priority 14788 * DLPI messages. (On open ill->ill_ipif has not yet been 14789 * created; on close, things hanging off the ill may have been 14790 * freed already.) 14791 */ 14792 dl = (union DL_primitives *)mp->b_rptr; 14793 if (DB_TYPE(mp) != M_PCPROTO || 14794 dl->dl_primitive == DL_UNITDATA_IND) { 14795 /* 14796 * SIOC[GS]TUNPARAM ioctls can come here. 14797 */ 14798 inet_freemsg(mp); 14799 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14800 "ip_rput_end: q %p (%S)", q, "uninit"); 14801 return; 14802 } 14803 } 14804 14805 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14806 "ip_rput_end: q %p (%S)", q, "end"); 14807 14808 ip_input(ill, NULL, mp, NULL); 14809 } 14810 14811 static mblk_t * 14812 ip_fix_dbref(ill_t *ill, mblk_t *mp) 14813 { 14814 mblk_t *mp1; 14815 boolean_t adjusted = B_FALSE; 14816 ip_stack_t *ipst = ill->ill_ipst; 14817 14818 IP_STAT(ipst, ip_db_ref); 14819 /* 14820 * The IP_RECVSLLA option depends on having the 14821 * link layer header. First check that: 14822 * a> the underlying device is of type ether, 14823 * since this option is currently supported only 14824 * over ethernet. 14825 * b> there is enough room to copy over the link 14826 * layer header. 14827 * 14828 * Once the checks are done, adjust rptr so that 14829 * the link layer header will be copied via 14830 * copymsg. Note that IFT_ETHER may be returned 14831 * by some non-ethernet drivers, but in this case 14832 * the second check will fail.
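 * (Concretely: sizeof (struct ether_header) is 14 bytes, so rptr is backed up by 14 before the copymsg() below, making the copy include the MAC header; the copy's b_rptr is then advanced again, leaving the link layer header intact ahead of b_rptr in the new message.)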
14833 */ 14834 if (ill->ill_type == IFT_ETHER && 14835 (mp->b_rptr - mp->b_datap->db_base) >= 14836 sizeof (struct ether_header)) { 14837 mp->b_rptr -= sizeof (struct ether_header); 14838 adjusted = B_TRUE; 14839 } 14840 mp1 = copymsg(mp); 14841 14842 if (mp1 == NULL) { 14843 mp->b_next = NULL; 14844 /* clear b_prev - used by ip_mroute_decap */ 14845 mp->b_prev = NULL; 14846 freemsg(mp); 14847 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14848 return (NULL); 14849 } 14850 14851 if (adjusted) { 14852 /* 14853 * Copy is done. Restore the pointer in 14854 * the _new_ mblk 14855 */ 14856 mp1->b_rptr += sizeof (struct ether_header); 14857 } 14858 14859 /* Copy b_prev - used by ip_mroute_decap */ 14860 mp1->b_prev = mp->b_prev; 14861 mp->b_prev = NULL; 14862 14863 /* preserve the hardware checksum flags and data, if present */ 14864 if (DB_CKSUMFLAGS(mp) != 0) { 14865 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 14866 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 14867 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 14868 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 14869 DB_CKSUM16(mp1) = DB_CKSUM16(mp); 14870 } 14871 14872 freemsg(mp); 14873 return (mp1); 14874 } 14875 14876 /* 14877 * Direct read side procedure capable of dealing with chains. GLDv3 based 14878 * drivers call this function directly with mblk chains while STREAMS 14879 * read side procedure ip_rput() calls this for single packet with ip_ring 14880 * set to NULL to process one packet at a time. 14881 * 14882 * The ill will always be valid if this function is called directly from 14883 * the driver. 14884 * 14885 * If ip_input() is called from GLDv3: 14886 * 14887 * - This must be a non-VLAN IP stream. 14888 * - 'mp' is either an untagged or a special priority-tagged packet. 14889 * - Any VLAN tag that was in the MAC header has been stripped. 14890 * 14891 * If the IP header in packet is not 32-bit aligned, every message in the 14892 * chain will be aligned before further operations. This is required on SPARC 14893 * platform. 14894 */ 14895 /* ARGSUSED */ 14896 void 14897 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 14898 struct mac_header_info_s *mhip) 14899 { 14900 ipaddr_t dst = NULL; 14901 ipaddr_t prev_dst; 14902 ire_t *ire = NULL; 14903 ipha_t *ipha; 14904 uint_t pkt_len; 14905 ssize_t len; 14906 uint_t opt_len; 14907 int ll_multicast; 14908 int cgtp_flt_pkt; 14909 queue_t *q = ill->ill_rq; 14910 squeue_t *curr_sqp = NULL; 14911 mblk_t *head = NULL; 14912 mblk_t *tail = NULL; 14913 mblk_t *first_mp; 14914 mblk_t *mp; 14915 mblk_t *dmp; 14916 int cnt = 0; 14917 ip_stack_t *ipst = ill->ill_ipst; 14918 14919 ASSERT(mp_chain != NULL); 14920 ASSERT(ill != NULL); 14921 14922 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); 14923 14924 #define rptr ((uchar_t *)ipha) 14925 14926 while (mp_chain != NULL) { 14927 first_mp = mp = mp_chain; 14928 mp_chain = mp_chain->b_next; 14929 mp->b_next = NULL; 14930 ll_multicast = 0; 14931 14932 /* 14933 * We do ire caching from one iteration to 14934 * another. In the event the packet chain contains 14935 * all packets from the same dst, this caching saves 14936 * an ire_cache_lookup for each of the succeeding 14937 * packets in a packet chain. 14938 */ 14939 prev_dst = dst; 14940 14941 /* 14942 * if db_ref > 1 then copymsg and free original. Packet 14943 * may be changed and we do not want the other entity 14944 * who has a reference to this message to trip over the 14945 * changes. 
This is a blind change because trying to 14946 * catch all places that might change the packet is too 14947 * difficult. 14948 * 14949 * This corresponds to the fast path case, where we have 14950 * a chain of M_DATA mblks. We check the db_ref count 14951 * of only the 1st data block in the mblk chain. There 14952 * doesn't seem to be a reason why a device driver would 14953 * send up data with varying db_ref counts in the mblk 14954 * chain. In any case the Fast path is a private 14955 * interface, and our drivers don't do such a thing. 14956 * Given the above assumption, there is no need to walk 14957 * down the entire mblk chain (which could pose a 14958 * potential performance problem). 14959 */ 14960 14961 if (DB_REF(mp) > 1) { 14962 if ((mp = ip_fix_dbref(ill, mp)) == NULL) 14963 continue; 14964 } 14965 14966 /* 14967 * Check and align the IP header. 14968 */ 14969 first_mp = mp; 14970 if (DB_TYPE(mp) == M_DATA) { 14971 dmp = mp; 14972 } else if (DB_TYPE(mp) == M_PROTO && 14973 *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { 14974 dmp = mp->b_cont; 14975 } else { 14976 dmp = NULL; 14977 } 14978 if (dmp != NULL) { 14979 /* 14980 * IP header ptr not aligned? 14981 * OR IP header not complete in first mblk 14982 */ 14983 if (!OK_32PTR(dmp->b_rptr) || 14984 MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { 14985 if (!ip_check_and_align_header(q, dmp, ipst)) 14986 continue; 14987 } 14988 } 14989 14990 /* 14991 * ip_input fast path 14992 */ 14993 14994 /* mblk type is not M_DATA */ 14995 if (DB_TYPE(mp) != M_DATA) { 14996 if (ip_rput_process_notdata(q, &first_mp, ill, 14997 &ll_multicast, &mp)) 14998 continue; 14999 15000 /* 15001 * The only way we can get here is if we had a 15002 * packet that was either a DL_UNITDATA_IND or 15003 * an M_CTL for an IPsec accelerated packet. 15004 * 15005 * In either case, the first_mp will point to 15006 * the leading M_PROTO or M_CTL. 15007 */ 15008 ASSERT(first_mp != NULL); 15009 } else if (mhip != NULL) { 15010 /* 15011 * ll_multicast is set here so that it is ready 15012 * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast 15013 * manipulates ll_multicast in the same fashion when 15014 * called from ip_rput_process_notdata. 15015 */ 15016 switch (mhip->mhi_dsttype) { 15017 case MAC_ADDRTYPE_MULTICAST : 15018 ll_multicast = HPE_MULTICAST; 15019 break; 15020 case MAC_ADDRTYPE_BROADCAST : 15021 ll_multicast = HPE_BROADCAST; 15022 break; 15023 default : 15024 break; 15025 } 15026 } 15027 15028 /* Make sure it's an M_DATA and that it's aligned */ 15029 ASSERT(DB_TYPE(mp) == M_DATA); 15030 ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); 15031 15032 ipha = (ipha_t *)mp->b_rptr; 15033 len = mp->b_wptr - rptr; 15034 pkt_len = ntohs(ipha->ipha_length); 15035 15036 /* 15037 * We must count all incoming packets, even if they end 15038 * up being dropped later on. 15039 */ 15040 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 15041 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); 15042 15043 /* multiple mblk or too short */ 15044 len -= pkt_len; 15045 if (len != 0) { 15046 /* 15047 * Make sure we have data length consistent 15048 * with the IP header.
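 * len is the number of bytes in the first mblk beyond what ipha_length claims: for a single-mblk packet any link layer padding is trimmed below by pulling b_wptr back, while for a chain the excess is removed with adjmsg(); a negative len or an undersized header means the packet is malformed and is dropped.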
15049 */ 15050 if (mp->b_cont == NULL) { 15051 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15052 BUMP_MIB(ill->ill_ip_mib, 15053 ipIfStatsInHdrErrors); 15054 ip2dbg(("ip_input: drop pkt\n")); 15055 freemsg(mp); 15056 continue; 15057 } 15058 mp->b_wptr = rptr + pkt_len; 15059 } else if ((len += msgdsize(mp->b_cont)) != 0) { 15060 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15061 BUMP_MIB(ill->ill_ip_mib, 15062 ipIfStatsInHdrErrors); 15063 ip2dbg(("ip_input: drop pkt\n")); 15064 freemsg(mp); 15065 continue; 15066 } 15067 (void) adjmsg(mp, -len); 15068 IP_STAT(ipst, ip_multimblk3); 15069 } 15070 } 15071 15072 /* Obtain the dst of the current packet */ 15073 dst = ipha->ipha_dst; 15074 15075 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, 15076 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, 15077 ipha, ip6_t *, NULL, int, 0); 15078 15079 /* 15080 * The following test for loopback is faster than 15081 * IP_LOOPBACK_ADDR(), because it avoids any bitwise 15082 * operations. 15083 * Note that these addresses are always in network byte order 15084 */ 15085 if (((*(uchar_t *)&ipha->ipha_dst) == 127) || 15086 ((*(uchar_t *)&ipha->ipha_src) == 127)) { 15087 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 15088 freemsg(mp); 15089 continue; 15090 } 15091 15092 /* 15093 * The event for packets being received from a 'physical' 15094 * interface is placed after validation of the source and/or 15095 * destination address as being local so that packets can be 15096 * redirected to loopback addresses using ipnat. 15097 */ 15098 DTRACE_PROBE4(ip4__physical__in__start, 15099 ill_t *, ill, ill_t *, NULL, 15100 ipha_t *, ipha, mblk_t *, first_mp); 15101 15102 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15103 ipst->ips_ipv4firewall_physical_in, 15104 ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); 15105 15106 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); 15107 15108 if (first_mp == NULL) { 15109 continue; 15110 } 15111 dst = ipha->ipha_dst; 15112 15113 /* 15114 * Attach any necessary label information to 15115 * this packet 15116 */ 15117 if (is_system_labeled() && 15118 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 15119 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 15120 freemsg(mp); 15121 continue; 15122 } 15123 15124 /* 15125 * Reuse the cached ire only if the ipha_dst of the previous 15126 * packet is the same as the current packet AND it is not 15127 * INADDR_ANY. 15128 */ 15129 if (!(dst == prev_dst && dst != INADDR_ANY) && 15130 (ire != NULL)) { 15131 ire_refrele(ire); 15132 ire = NULL; 15133 } 15134 opt_len = ipha->ipha_version_and_hdr_length - 15135 IP_SIMPLE_HDR_VERSION; 15136 15137 /* 15138 * Check to see if we can take the fastpath. 
15139 * That is possible if the following conditions are met: 15140 * o Tsol disabled 15141 * o CGTP disabled 15142 * o ipp_action_count is 0 15143 * o no options in the packet 15144 * o not an RSVP packet 15145 * o not a multicast packet 15146 * o ill not in IP_DHCPINIT_IF mode 15147 */ 15148 if (!is_system_labeled() && 15149 !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 && 15150 opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && 15151 !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { 15152 if (ire == NULL) 15153 ire = ire_cache_lookup(dst, ALL_ZONES, NULL, 15154 ipst); 15155 15156 /* incoming packet is for forwarding */ 15157 if (ire == NULL || (ire->ire_type & IRE_CACHE)) { 15158 ire = ip_fast_forward(ire, dst, ill, mp); 15159 continue; 15160 } 15161 /* incoming packet is for local consumption */ 15162 if (ire->ire_type & IRE_LOCAL) 15163 goto local; 15164 } 15165 15166 /* 15167 * Disable ire caching for anything more complex 15168 * than the simple fast path case we checked for above. 15169 */ 15170 if (ire != NULL) { 15171 ire_refrele(ire); 15172 ire = NULL; 15173 } 15174 15175 /* 15176 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP 15177 * server to unicast DHCP packets to a DHCP client using the 15178 * IP address it is offering to the client. This can be 15179 * disabled through the "broadcast bit", but not all DHCP 15180 * servers honor that bit. Therefore, to interoperate with as 15181 * many DHCP servers as possible, the DHCP client allows the 15182 * server to unicast, but we treat those packets as broadcast 15183 * here. Note that we don't rewrite the packet itself since 15184 * (a) that would mess up the checksums and (b) the DHCP 15185 * client conn is bound to INADDR_ANY so ip_fanout_udp() will 15186 * hand it the packet regardless. 15187 */ 15188 if (ill->ill_dhcpinit != 0 && 15189 IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP && 15190 pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) { 15191 udpha_t *udpha; 15192 15193 /* 15194 * Reload ipha since pullupmsg() can change b_rptr. 15195 */ 15196 ipha = (ipha_t *)mp->b_rptr; 15197 udpha = (udpha_t *)&ipha[1]; 15198 15199 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { 15200 DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, 15201 mblk_t *, mp); 15202 dst = INADDR_BROADCAST; 15203 } 15204 } 15205 15206 /* Full-blown slow path */ 15207 if (opt_len != 0) { 15208 if (len != 0) 15209 IP_STAT(ipst, ip_multimblk4); 15210 else 15211 IP_STAT(ipst, ip_ipoptions); 15212 if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha, 15213 &dst, ipst)) 15214 continue; 15215 } 15216 15217 /* 15218 * Invoke the CGTP (multirouting) filtering module to process 15219 * the incoming packet. Packets identified as duplicates 15220 * must be discarded. Filtering is active only if 15221 * the ip_cgtp_filter ndd variable is non-zero. 15222 */ 15223 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 15224 if (ipst->ips_ip_cgtp_filter && 15225 ipst->ips_ip_cgtp_filter_ops != NULL) { 15226 netstackid_t stackid; 15227 15228 stackid = ipst->ips_netstack->netstack_stackid; 15229 cgtp_flt_pkt = 15230 ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 15231 ill->ill_phyint->phyint_ifindex, mp); 15232 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 15233 freemsg(first_mp); 15234 continue; 15235 } 15236 } 15237 15238 /* 15239 * If rsvpd is running, let the RSVP daemon handle its processing 15240 * and forwarding of RSVP multicast/unicast packets.
15241 * If rsvpd is not running but mrouted is running, RSVP 15242 * multicast packets are forwarded as multicast traffic 15243 * and RSVP unicast packets are forwarded by the unicast router. 15244 * If neither rsvpd nor mrouted is running, RSVP multicast 15245 * packets are not forwarded, but the unicast packets are 15246 * forwarded like unicast traffic. 15247 */ 15248 if (ipha->ipha_protocol == IPPROTO_RSVP && 15249 ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != 15250 NULL) { 15251 /* RSVP packet and rsvpd running. Treat as ours */ 15252 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 15253 /* 15254 * This assumes that we deliver to all streams for 15255 * multicast and broadcast packets. 15256 * We have to force ll_multicast to 1 to handle the 15257 * M_DATA messages passed in from ip_mroute_decap. 15258 */ 15259 dst = INADDR_BROADCAST; 15260 ll_multicast = 1; 15261 } else if (CLASSD(dst)) { 15262 /* packet is multicast */ 15263 mp->b_next = NULL; 15264 if (ip_rput_process_multicast(q, mp, ill, ipha, 15265 &ll_multicast, &dst)) 15266 continue; 15267 } 15268 15269 if (ire == NULL) { 15270 ire = ire_cache_lookup(dst, ALL_ZONES, 15271 MBLK_GETLABEL(mp), ipst); 15272 } 15273 15274 if (ire != NULL && ire->ire_stq != NULL && 15275 ire->ire_zoneid != GLOBAL_ZONEID && 15276 ire->ire_zoneid != ALL_ZONES) { 15277 /* 15278 * Should only use IREs that are visible from the 15279 * global zone for forwarding. 15280 */ 15281 ire_refrele(ire); 15282 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, 15283 MBLK_GETLABEL(mp), ipst); 15284 } 15285 15286 if (ire == NULL) { 15287 /* 15288 * No IRE for this destination, so it can't be for us. 15289 * Unless we are forwarding, drop the packet. 15290 * We have to let source routed packets through 15291 * since we don't yet know if they are 'ping -l' 15292 * packets, i.e. if they will go out over the 15293 * same interface as they came in on. 15294 */ 15295 ire = ip_rput_noire(q, mp, ll_multicast, dst); 15296 if (ire == NULL) 15297 continue; 15298 } 15299 15300 /* 15301 * A broadcast IRE may indicate either a broadcast or 15302 * a multicast packet. 15303 */ 15304 if (ire->ire_type == IRE_BROADCAST) { 15305 /* 15306 * Skip broadcast checks if packet is UDP multicast; 15307 * we'd rather not enter ip_rput_process_broadcast() 15308 * unless the packet is broadcast for real, since 15309 * that routine is a no-op for multicast. 15310 */ 15311 if (ipha->ipha_protocol != IPPROTO_UDP || 15312 !CLASSD(ipha->ipha_dst)) { 15313 ire = ip_rput_process_broadcast(&q, mp, 15314 ire, ipha, ill, dst, cgtp_flt_pkt, 15315 ll_multicast); 15316 if (ire == NULL) 15317 continue; 15318 } 15319 } else if (ire->ire_stq != NULL) { 15320 /* forwarding? */ 15321 ip_rput_process_forward(q, mp, ire, ipha, ill, 15322 ll_multicast); 15323 /* ip_rput_process_forward consumed the packet */ 15324 continue; 15325 } 15326 15327 local: 15328 /* 15329 * If the queue in the ire is different from the ingress queue 15330 * then we need to check to see if we can accept the packet. 15331 * Note that for multicast packets and broadcast packets sent 15332 * to a broadcast address which is shared between multiple 15333 * interfaces we should not do this since we just got a random 15334 * broadcast ire.
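 * (ip_check_multihome() below decides whether a unicast packet that arrived on an ill other than the one its IRE points at may still be accepted, as can legitimately happen on multihomed hosts.)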
15335 */ 15336 if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { 15337 if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, 15338 ill)) == NULL) { 15339 /* Drop packet */ 15340 BUMP_MIB(ill->ill_ip_mib, 15341 ipIfStatsForwProhibits); 15342 freemsg(mp); 15343 continue; 15344 } 15345 if (ire->ire_rfq != NULL) 15346 q = ire->ire_rfq; 15347 } 15348 15349 switch (ipha->ipha_protocol) { 15350 case IPPROTO_TCP: 15351 ASSERT(first_mp == mp); 15352 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, 15353 mp, 0, q, ip_ring)) != NULL) { 15354 if (curr_sqp == NULL) { 15355 curr_sqp = GET_SQUEUE(mp); 15356 ASSERT(cnt == 0); 15357 cnt++; 15358 head = tail = mp; 15359 } else if (curr_sqp == GET_SQUEUE(mp)) { 15360 ASSERT(tail != NULL); 15361 cnt++; 15362 tail->b_next = mp; 15363 tail = mp; 15364 } else { 15365 /* 15366 * A different squeue. Send the 15367 * chain for the previous squeue on 15368 * its way. This shouldn't happen 15369 * often unless interrupt binding 15370 * changes. 15371 */ 15372 IP_STAT(ipst, ip_input_multi_squeue); 15373 squeue_enter_chain(curr_sqp, head, 15374 tail, cnt, SQTAG_IP_INPUT); 15375 curr_sqp = GET_SQUEUE(mp); 15376 head = mp; 15377 tail = mp; 15378 cnt = 1; 15379 } 15380 } 15381 continue; 15382 case IPPROTO_UDP: 15383 ASSERT(first_mp == mp); 15384 ip_udp_input(q, mp, ipha, ire, ill); 15385 continue; 15386 case IPPROTO_SCTP: 15387 ASSERT(first_mp == mp); 15388 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, 15389 q, dst); 15390 /* ire has been released by ip_sctp_input */ 15391 ire = NULL; 15392 continue; 15393 default: 15394 ip_proto_input(q, first_mp, ipha, ire, ill, B_FALSE); 15395 continue; 15396 } 15397 } 15398 15399 if (ire != NULL) 15400 ire_refrele(ire); 15401 15402 if (head != NULL) 15403 squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); 15404 15405 /* 15406 * This code is there just to make netperf/ttcp look good. 15407 * 15408 * It's possible that after being in polling mode (and having cleared 15409 * the backlog), squeues have turned the interrupt frequency higher 15410 * to improve latency at the expense of more CPU utilization (fewer 15411 * packets per interrupt, i.e. more interrupts). Workloads 15412 * like ttcp/netperf do manage to tickle polling once in a while, 15413 * but for the remaining time stay in the higher interrupt mode since 15414 * their packet arrival rate is pretty uniform and this shows up 15415 * as higher CPU utilization. Since people care about CPU utilization 15416 * while running netperf/ttcp, turn the interrupt frequency back to 15417 * normal/default if polling has not been used in ip_poll_normal_ticks.
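 * In other words, once lbolt shows that the ring has not been polled for ip_poll_normal_ticks, rr_blank() below restores the ring's default interrupt blanking parameters.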
15418 */ 15419 if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { 15420 if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { 15421 ip_ring->rr_poll_state &= ~ILL_POLLING; 15422 ip_ring->rr_blank(ip_ring->rr_handle, 15423 ip_ring->rr_normal_blank_time, 15424 ip_ring->rr_normal_pkt_cnt); 15425 } 15426 } 15427 15428 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 15429 "ip_input_end: q %p (%S)", q, "end"); 15430 #undef rptr 15431 } 15432 15433 static void 15434 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, 15435 t_uscalar_t err) 15436 { 15437 if (dl_err == DL_SYSERR) { 15438 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15439 "%s: %s failed: DL_SYSERR (errno %u)\n", 15440 ill->ill_name, dl_primstr(prim), err); 15441 return; 15442 } 15443 15444 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15445 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim), 15446 dl_errstr(dl_err)); 15447 } 15448 15449 /* 15450 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other 15451 * than DL_UNITDATA_IND messages. If we need to process this message 15452 * exclusively, we call qwriter_ip, in which case we also need to call 15453 * ill_refhold before that, since qwriter_ip does an ill_refrele. 15454 */ 15455 void 15456 ip_rput_dlpi(queue_t *q, mblk_t *mp) 15457 { 15458 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15459 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15460 ill_t *ill = (ill_t *)q->q_ptr; 15461 boolean_t pending; 15462 15463 ip1dbg(("ip_rput_dlpi")); 15464 if (dloa->dl_primitive == DL_ERROR_ACK) { 15465 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK %s (0x%x): " 15466 "%s (0x%x), unix %u\n", ill->ill_name, 15467 dl_primstr(dlea->dl_error_primitive), 15468 dlea->dl_error_primitive, 15469 dl_errstr(dlea->dl_errno), 15470 dlea->dl_errno, 15471 dlea->dl_unix_errno)); 15472 } 15473 15474 /* 15475 * If we received an ACK but didn't send a request for it, then it 15476 * can't be part of any pending operation; discard up-front. 
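 * The exception is DL_NOTIFY_IND, which is unsolicited by definition; it is treated as always pending and is dispatched as a NEW_OP at the bottom of this function.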
15477 */ 15478 switch (dloa->dl_primitive) { 15479 case DL_NOTIFY_IND: 15480 pending = B_TRUE; 15481 break; 15482 case DL_ERROR_ACK: 15483 pending = ill_dlpi_pending(ill, dlea->dl_error_primitive); 15484 break; 15485 case DL_OK_ACK: 15486 pending = ill_dlpi_pending(ill, dloa->dl_correct_primitive); 15487 break; 15488 case DL_INFO_ACK: 15489 pending = ill_dlpi_pending(ill, DL_INFO_REQ); 15490 break; 15491 case DL_BIND_ACK: 15492 pending = ill_dlpi_pending(ill, DL_BIND_REQ); 15493 break; 15494 case DL_PHYS_ADDR_ACK: 15495 pending = ill_dlpi_pending(ill, DL_PHYS_ADDR_REQ); 15496 break; 15497 case DL_NOTIFY_ACK: 15498 pending = ill_dlpi_pending(ill, DL_NOTIFY_REQ); 15499 break; 15500 case DL_CONTROL_ACK: 15501 pending = ill_dlpi_pending(ill, DL_CONTROL_REQ); 15502 break; 15503 case DL_CAPABILITY_ACK: 15504 pending = ill_dlpi_pending(ill, DL_CAPABILITY_REQ); 15505 break; 15506 default: 15507 /* Not a DLPI message we support or were expecting */ 15508 freemsg(mp); 15509 return; 15510 } 15511 15512 if (!pending) { 15513 freemsg(mp); 15514 return; 15515 } 15516 15517 switch (dloa->dl_primitive) { 15518 case DL_ERROR_ACK: 15519 if (dlea->dl_error_primitive == DL_UNBIND_REQ) { 15520 mutex_enter(&ill->ill_lock); 15521 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 15522 cv_signal(&ill->ill_cv); 15523 mutex_exit(&ill->ill_lock); 15524 } 15525 break; 15526 15527 case DL_OK_ACK: 15528 ip1dbg(("ip_rput: DL_OK_ACK for %s\n", 15529 dl_primstr((int)dloa->dl_correct_primitive))); 15530 switch (dloa->dl_correct_primitive) { 15531 case DL_UNBIND_REQ: 15532 mutex_enter(&ill->ill_lock); 15533 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 15534 cv_signal(&ill->ill_cv); 15535 mutex_exit(&ill->ill_lock); 15536 break; 15537 15538 case DL_ENABMULTI_REQ: 15539 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15540 ill->ill_dlpi_multicast_state = IDS_OK; 15541 break; 15542 } 15543 break; 15544 default: 15545 break; 15546 } 15547 15548 /* 15549 * The message is one we're waiting for (or DL_NOTIFY_IND), but we 15550 * need to become writer to continue to process it. Because an 15551 * exclusive operation doesn't complete until replies to all queued 15552 * DLPI messages have been received, we know we're in the middle of an 15553 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND). 15554 * 15555 * As required by qwriter_ip(), we refhold the ill; it will refrele. 15556 * Since this is on the ill stream we unconditionally bump up the 15557 * refcount without doing ILL_CAN_LOOKUP(). 15558 */ 15559 ill_refhold(ill); 15560 if (dloa->dl_primitive == DL_NOTIFY_IND) 15561 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE); 15562 else 15563 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE); 15564 } 15565 15566 /* 15567 * Handling of DLPI messages that require exclusive access to the ipsq. 15568 * 15569 * Need to do ill_pending_mp_release on ioctl completion, which could 15570 * happen here. 
(along with mi_copy_done) 15571 */ 15572 /* ARGSUSED */ 15573 static void 15574 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15575 { 15576 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15577 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15578 int err = 0; 15579 ill_t *ill; 15580 ipif_t *ipif = NULL; 15581 mblk_t *mp1 = NULL; 15582 conn_t *connp = NULL; 15583 t_uscalar_t paddrreq; 15584 mblk_t *mp_hw; 15585 boolean_t success; 15586 boolean_t ioctl_aborted = B_FALSE; 15587 boolean_t log = B_TRUE; 15588 ip_stack_t *ipst; 15589 15590 ip1dbg(("ip_rput_dlpi_writer ..")); 15591 ill = (ill_t *)q->q_ptr; 15592 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 15593 15594 ASSERT(IAM_WRITER_ILL(ill)); 15595 15596 ipst = ill->ill_ipst; 15597 15598 /* 15599 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. 15600 * both are null or non-null. However we can assert that only 15601 * after grabbing the ipsq_lock. So we don't make any assertion 15602 * here and in other places in the code. 15603 */ 15604 ipif = ipsq->ipsq_pending_ipif; 15605 /* 15606 * The current ioctl could have been aborted by the user and a new 15607 * ioctl to bring up another ill could have started. We could still 15608 * get a response from the driver later. 15609 */ 15610 if (ipif != NULL && ipif->ipif_ill != ill) 15611 ioctl_aborted = B_TRUE; 15612 15613 switch (dloa->dl_primitive) { 15614 case DL_ERROR_ACK: 15615 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n", 15616 dl_primstr(dlea->dl_error_primitive))); 15617 15618 switch (dlea->dl_error_primitive) { 15619 case DL_PROMISCON_REQ: 15620 case DL_PROMISCOFF_REQ: 15621 case DL_DISABMULTI_REQ: 15622 case DL_UNBIND_REQ: 15623 case DL_ATTACH_REQ: 15624 case DL_INFO_REQ: 15625 ill_dlpi_done(ill, dlea->dl_error_primitive); 15626 break; 15627 case DL_NOTIFY_REQ: 15628 ill_dlpi_done(ill, DL_NOTIFY_REQ); 15629 log = B_FALSE; 15630 break; 15631 case DL_PHYS_ADDR_REQ: 15632 /* 15633 * For IPv6 only, there are two additional 15634 * phys_addr_req's sent to the driver to get the 15635 * IPv6 token and lla. This allows IP to acquire 15636 * the hardware address format for a given interface 15637 * without having built in knowledge of the hardware 15638 * address. ill_phys_addr_pend keeps track of the last 15639 * DL_PAR sent so we know which response we are 15640 * dealing with. ill_dlpi_done will update 15641 * ill_phys_addr_pend when it sends the next req. 15642 * We don't complete the IOCTL until all three DL_PARs 15643 * have been attempted, so set *_len to 0 and break. 15644 */ 15645 paddrreq = ill->ill_phys_addr_pend; 15646 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 15647 if (paddrreq == DL_IPV6_TOKEN) { 15648 ill->ill_token_length = 0; 15649 log = B_FALSE; 15650 break; 15651 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) { 15652 ill->ill_nd_lla_len = 0; 15653 log = B_FALSE; 15654 break; 15655 } 15656 /* 15657 * Something went wrong with the DL_PHYS_ADDR_REQ. 15658 * We presumably have an IOCTL hanging out waiting 15659 * for completion. Find it and complete the IOCTL 15660 * with the error noted. 15661 * However, ill_dl_phys was called on an ill queue 15662 * (from SIOCSLIFNAME), thus conn_pending_ill is not 15663 * set. But the ioctl is known to be pending on ill_wq. 15664 */ 15665 if (!ill->ill_ifname_pending) 15666 break; 15667 ill->ill_ifname_pending = 0; 15668 if (!ioctl_aborted) 15669 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15670 if (mp1 != NULL) { 15671 /* 15672 * This operation (SIOCSLIFNAME) must have 15673 * happened on the ill. 
Assert there is no conn 15674 */ 15675 ASSERT(connp == NULL); 15676 q = ill->ill_wq; 15677 } 15678 break; 15679 case DL_BIND_REQ: 15680 ill_dlpi_done(ill, DL_BIND_REQ); 15681 if (ill->ill_ifname_pending) 15682 break; 15683 /* 15684 * Something went wrong with the bind. We presumably 15685 * have an IOCTL hanging out waiting for completion. 15686 * Find it, take down the interface that was coming 15687 * up, and complete the IOCTL with the error noted. 15688 */ 15689 if (!ioctl_aborted) 15690 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15691 if (mp1 != NULL) { 15692 /* 15693 * This operation (SIOCSLIFFLAGS) must have 15694 * happened from a conn. 15695 */ 15696 ASSERT(connp != NULL); 15697 q = CONNP_TO_WQ(connp); 15698 if (ill->ill_move_in_progress) { 15699 ILL_CLEAR_MOVE(ill); 15700 } 15701 (void) ipif_down(ipif, NULL, NULL); 15702 /* error is set below the switch */ 15703 } 15704 break; 15705 case DL_ENABMULTI_REQ: 15706 ill_dlpi_done(ill, DL_ENABMULTI_REQ); 15707 15708 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15709 ill->ill_dlpi_multicast_state = IDS_FAILED; 15710 if (ill->ill_dlpi_multicast_state == IDS_FAILED) { 15711 ipif_t *ipif; 15712 15713 printf("ip: joining multicasts failed (%d)" 15714 " on %s - will use link layer " 15715 "broadcasts for multicast\n", 15716 dlea->dl_errno, ill->ill_name); 15717 15718 /* 15719 * Set up the multicast mapping alone. 15720 * We are writer, so it is ok to access 15721 * ill->ill_ipif without any lock. 15722 */ 15723 ipif = ill->ill_ipif; 15724 mutex_enter(&ill->ill_phyint->phyint_lock); 15725 ill->ill_phyint->phyint_flags |= 15726 PHYI_MULTI_BCAST; 15727 mutex_exit(&ill->ill_phyint->phyint_lock); 15728 15729 if (!ill->ill_isv6) { 15730 (void) ipif_arp_setup_multicast(ipif, 15731 NULL); 15732 } else { 15733 (void) ipif_ndp_setup_multicast(ipif, 15734 NULL); 15735 } 15736 } 15737 freemsg(mp); /* Don't want to pass this up */ 15738 return; 15739 15740 case DL_CAPABILITY_REQ: 15741 case DL_CONTROL_REQ: 15742 ill_dlpi_done(ill, dlea->dl_error_primitive); 15743 ill->ill_dlpi_capab_state = IDS_FAILED; 15744 freemsg(mp); 15745 return; 15746 } 15747 /* 15748 * Note the error for IOCTL completion (mp1 is set when 15749 * ready to complete the ioctl). If ill_ifname_pending_err is 15750 * set, an error occurred during plumbing (ill_ifname_pending), 15751 * so we want to report that error. 15752 * 15753 * NOTE: there are two additional DL_PHYS_ADDR_REQ's 15754 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are 15755 * expected to get errack'd if the driver doesn't support 15756 * these flags (e.g. ethernet). log will be set to B_FALSE 15757 * if these error conditions are encountered. 15758 */ 15759 if (mp1 != NULL) { 15760 if (ill->ill_ifname_pending_err != 0) { 15761 err = ill->ill_ifname_pending_err; 15762 ill->ill_ifname_pending_err = 0; 15763 } else { 15764 err = dlea->dl_unix_errno ? 15765 dlea->dl_unix_errno : ENXIO; 15766 } 15767 /* 15768 * If we're plumbing an interface and an error hasn't already 15769 * been saved, set ill_ifname_pending_err to the error passed 15770 * up. Ignore the error if log is B_FALSE (see comment above). 15771 */ 15772 } else if (log && ill->ill_ifname_pending && 15773 ill->ill_ifname_pending_err == 0) { 15774 ill->ill_ifname_pending_err = dlea->dl_unix_errno ? 15775 dlea->dl_unix_errno : ENXIO; 15776 } 15777 15778 if (log) 15779 ip_dlpi_error(ill, dlea->dl_error_primitive, 15780 dlea->dl_errno, dlea->dl_unix_errno); 15781 break; 15782 case DL_CAPABILITY_ACK: 15783 /* Call a routine to handle this one.
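 * (ill_capability_ack() parses the capability sub-options returned by the driver, e.g. hardware checksum offload, and records what was offered.)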
*/ 15784 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 15785 ill_capability_ack(ill, mp); 15786 15787 /* 15788 * If the ack is due to renegotiation, we will need to send 15789 * a new CAPABILITY_REQ to start the renegotiation. 15790 */ 15791 if (ill->ill_capab_reneg) { 15792 ill->ill_capab_reneg = B_FALSE; 15793 ill_capability_probe(ill); 15794 } 15795 break; 15796 case DL_CONTROL_ACK: 15797 /* We treat all of these as "fire and forget" */ 15798 ill_dlpi_done(ill, DL_CONTROL_REQ); 15799 break; 15800 case DL_INFO_ACK: 15801 /* Call a routine to handle this one. */ 15802 ill_dlpi_done(ill, DL_INFO_REQ); 15803 ip_ll_subnet_defaults(ill, mp); 15804 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 15805 return; 15806 case DL_BIND_ACK: 15807 /* 15808 * We should have an IOCTL waiting on this unless 15809 * sent by ill_dl_phys, in which case just return 15810 */ 15811 ill_dlpi_done(ill, DL_BIND_REQ); 15812 if (ill->ill_ifname_pending) 15813 break; 15814 15815 if (!ioctl_aborted) 15816 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15817 if (mp1 == NULL) 15818 break; 15819 /* 15820 * Because mp1 was added by ill_dl_up(), and it always 15821 * passes a valid connp, connp must be valid here. 15822 */ 15823 ASSERT(connp != NULL); 15824 q = CONNP_TO_WQ(connp); 15825 15826 /* 15827 * We are exclusive. So nothing can change even after 15828 * we get the pending mp. If need be we can put it back 15829 * and restart, as in calling ipif_arp_up() below. 15830 */ 15831 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 15832 15833 mutex_enter(&ill->ill_lock); 15834 ill->ill_dl_up = 1; 15835 (void) ill_hook_event_create(ill, 0, NE_UP, NULL, 0); 15836 mutex_exit(&ill->ill_lock); 15837 15838 /* 15839 * Now bring up the resolver; when that is complete, we'll 15840 * create IREs. Note that we intentionally mirror what 15841 * ipif_up() would have done, because we got here by way of 15842 * ill_dl_up(), which stopped ipif_up()'s processing. 15843 */ 15844 if (ill->ill_isv6) { 15845 /* 15846 * v6 interfaces. 15847 * Unlike ARP which has to do another bind 15848 * and attach, once we get here we are 15849 * done with NDP. Except in the case of 15850 * ILLF_XRESOLV, in which case we send an 15851 * AR_INTERFACE_UP to the external resolver. 15852 * If all goes well, the ioctl will complete 15853 * in ip_rput(). If there's an error, we 15854 * complete it here. 15855 */ 15856 if ((err = ipif_ndp_up(ipif)) == 0) { 15857 if (ill->ill_flags & ILLF_XRESOLV) { 15858 mutex_enter(&connp->conn_lock); 15859 mutex_enter(&ill->ill_lock); 15860 success = ipsq_pending_mp_add( 15861 connp, ipif, q, mp1, 0); 15862 mutex_exit(&ill->ill_lock); 15863 mutex_exit(&connp->conn_lock); 15864 if (success) { 15865 err = ipif_resolver_up(ipif, 15866 Res_act_initial); 15867 if (err == EINPROGRESS) { 15868 freemsg(mp); 15869 return; 15870 } 15871 ASSERT(err != 0); 15872 mp1 = ipsq_pending_mp_get(ipsq, 15873 &connp); 15874 ASSERT(mp1 != NULL); 15875 } else { 15876 /* conn has started closing */ 15877 err = EINTR; 15878 } 15879 } else { /* Non XRESOLV interface */ 15880 (void) ipif_resolver_up(ipif, 15881 Res_act_initial); 15882 err = ipif_up_done_v6(ipif); 15883 } 15884 } 15885 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 15886 /* 15887 * ARP and other v4 external resolvers. 15888 * Leave the pending mblk intact so that 15889 * the ioctl completes in ip_rput(). 
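 * (ipsq_pending_mp_add() re-queues mp1 under conn_lock and ill_lock; if it fails, the conn has started closing and the ioctl is completed with EINTR below.)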
15890 */ 15891 mutex_enter(&connp->conn_lock); 15892 mutex_enter(&ill->ill_lock); 15893 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 15894 mutex_exit(&ill->ill_lock); 15895 mutex_exit(&connp->conn_lock); 15896 if (success) { 15897 err = ipif_resolver_up(ipif, Res_act_initial); 15898 if (err == EINPROGRESS) { 15899 freemsg(mp); 15900 return; 15901 } 15902 ASSERT(err != 0); 15903 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15904 } else { 15905 /* The conn has started closing */ 15906 err = EINTR; 15907 } 15908 } else { 15909 /* 15910 * This one is complete. Reply to pending ioctl. 15911 */ 15912 (void) ipif_resolver_up(ipif, Res_act_initial); 15913 err = ipif_up_done(ipif); 15914 } 15915 15916 if ((err == 0) && (ill->ill_up_ipifs)) { 15917 err = ill_up_ipifs(ill, q, mp1); 15918 if (err == EINPROGRESS) { 15919 freemsg(mp); 15920 return; 15921 } 15922 } 15923 15924 if (ill->ill_up_ipifs) { 15925 ill_group_cleanup(ill); 15926 } 15927 15928 break; 15929 case DL_NOTIFY_IND: { 15930 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 15931 ire_t *ire; 15932 boolean_t need_ire_walk_v4 = B_FALSE; 15933 boolean_t need_ire_walk_v6 = B_FALSE; 15934 15935 switch (notify->dl_notification) { 15936 case DL_NOTE_PHYS_ADDR: 15937 err = ill_set_phys_addr(ill, mp); 15938 break; 15939 15940 case DL_NOTE_FASTPATH_FLUSH: 15941 ill_fastpath_flush(ill); 15942 break; 15943 15944 case DL_NOTE_SDU_SIZE: 15945 /* 15946 * Change the MTU size of the interface, of all 15947 * attached ipif's, and of all relevant ire's. The 15948 * new value's a uint32_t at notify->dl_data. 15949 * Mtu change Vs. new ire creation - protocol below. 15950 * 15951 * a Mark the ipif as IPIF_CHANGING. 15952 * b Set the new mtu in the ipif. 15953 * c Change the ire_max_frag on all affected ires 15954 * d Unmark the IPIF_CHANGING 15955 * 15956 * To see how the protocol works, assume an interface 15957 * route is also being added simultaneously by 15958 * ip_rt_add and let 'ipif' be the ipif referenced by 15959 * the ire. If the ire is created before step a, 15960 * it will be cleaned up by step c. If the ire is 15961 * created after step d, it will see the new value of 15962 * ipif_mtu. Any attempt to create the ire between 15963 * steps a to d will fail because of the IPIF_CHANGING 15964 * flag. Note that ire_create() is passed a pointer to 15965 * the ipif_mtu, and not the value. During ire_add 15966 * under the bucket lock, the ire_max_frag of the 15967 * new ire being created is set from the ipif/ire from 15968 * which it is being derived. 15969 */ 15970 mutex_enter(&ill->ill_lock); 15971 ill->ill_max_frag = (uint_t)notify->dl_data; 15972 15973 /* 15974 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu 15975 * leave it alone 15976 */ 15977 if (ill->ill_mtu_userspecified) { 15978 mutex_exit(&ill->ill_lock); 15979 break; 15980 } 15981 ill->ill_max_mtu = ill->ill_max_frag; 15982 if (ill->ill_isv6) { 15983 if (ill->ill_max_mtu < IPV6_MIN_MTU) 15984 ill->ill_max_mtu = IPV6_MIN_MTU; 15985 } else { 15986 if (ill->ill_max_mtu < IP_MIN_MTU) 15987 ill->ill_max_mtu = IP_MIN_MTU; 15988 } 15989 for (ipif = ill->ill_ipif; ipif != NULL; 15990 ipif = ipif->ipif_next) { 15991 /* 15992 * Don't override the mtu if the user 15993 * has explicitly set it. 
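 * (IPIF_FIXEDMTU indicates that the MTU was set explicitly by the administrator, presumably via SIOCSLIFMTU.)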
15994 */ 15995 if (ipif->ipif_flags & IPIF_FIXEDMTU) 15996 continue; 15997 ipif->ipif_mtu = (uint_t)notify->dl_data; 15998 if (ipif->ipif_isv6) 15999 ire = ipif_to_ire_v6(ipif); 16000 else 16001 ire = ipif_to_ire(ipif); 16002 if (ire != NULL) { 16003 ire->ire_max_frag = ipif->ipif_mtu; 16004 ire_refrele(ire); 16005 } 16006 if (ipif->ipif_flags & IPIF_UP) { 16007 if (ill->ill_isv6) 16008 need_ire_walk_v6 = B_TRUE; 16009 else 16010 need_ire_walk_v4 = B_TRUE; 16011 } 16012 } 16013 mutex_exit(&ill->ill_lock); 16014 if (need_ire_walk_v4) 16015 ire_walk_v4(ill_mtu_change, (char *)ill, 16016 ALL_ZONES, ipst); 16017 if (need_ire_walk_v6) 16018 ire_walk_v6(ill_mtu_change, (char *)ill, 16019 ALL_ZONES, ipst); 16020 break; 16021 case DL_NOTE_LINK_UP: 16022 case DL_NOTE_LINK_DOWN: { 16023 /* 16024 * We are writer. ill / phyint / ipsq assocs stable. 16025 * The RUNNING flag reflects the state of the link. 16026 */ 16027 phyint_t *phyint = ill->ill_phyint; 16028 uint64_t new_phyint_flags; 16029 boolean_t changed = B_FALSE; 16030 boolean_t went_up; 16031 16032 went_up = notify->dl_notification == DL_NOTE_LINK_UP; 16033 mutex_enter(&phyint->phyint_lock); 16034 new_phyint_flags = went_up ? 16035 phyint->phyint_flags | PHYI_RUNNING : 16036 phyint->phyint_flags & ~PHYI_RUNNING; 16037 if (new_phyint_flags != phyint->phyint_flags) { 16038 phyint->phyint_flags = new_phyint_flags; 16039 changed = B_TRUE; 16040 } 16041 mutex_exit(&phyint->phyint_lock); 16042 /* 16043 * ill_restart_dad handles the DAD restart and routing 16044 * socket notification logic. 16045 */ 16046 if (changed) { 16047 ill_restart_dad(phyint->phyint_illv4, went_up); 16048 ill_restart_dad(phyint->phyint_illv6, went_up); 16049 } 16050 break; 16051 } 16052 case DL_NOTE_PROMISC_ON_PHYS: 16053 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16054 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 16055 mutex_enter(&ill->ill_lock); 16056 ill->ill_promisc_on_phys = B_TRUE; 16057 mutex_exit(&ill->ill_lock); 16058 break; 16059 case DL_NOTE_PROMISC_OFF_PHYS: 16060 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16061 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 16062 mutex_enter(&ill->ill_lock); 16063 ill->ill_promisc_on_phys = B_FALSE; 16064 mutex_exit(&ill->ill_lock); 16065 break; 16066 case DL_NOTE_CAPAB_RENEG: 16067 /* 16068 * Something changed on the driver side. 16069 * It wants us to renegotiate the capabilities 16070 * on this ill. One possible cause is the aggregation 16071 * interface under us where a port got added or 16072 * went away. 16073 * 16074 * If the capability negotiation is already done 16075 * or is in progress, reset the capabilities and 16076 * mark the ill's ill_capab_reneg to be B_TRUE, 16077 * so that when the ack comes back, we can start 16078 * the renegotiation process. 16079 * 16080 * Note that if ill_capab_reneg is already B_TRUE 16081 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case), 16082 * the capability resetting request has been sent 16083 * and the renegotiation has not been started yet; 16084 * nothing needs to be done in this case. 
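 * In outline: DL_NOTE_CAPAB_RENEG resets the capabilities and sets ill_capab_reneg; the subsequent DL_CAPABILITY_ACK (see that case earlier in this function) then calls ill_capability_probe() to restart the negotiation.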
16085 */ 16086 if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) { 16087 ill_capability_reset(ill); 16088 ill->ill_capab_reneg = B_TRUE; 16089 } 16090 break; 16091 default: 16092 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 16093 "type 0x%x for DL_NOTIFY_IND\n", 16094 notify->dl_notification)); 16095 break; 16096 } 16097 16098 /* 16099 * As this is an asynchronous operation, we 16100 * should not call ill_dlpi_done 16101 */ 16102 break; 16103 } 16104 case DL_NOTIFY_ACK: { 16105 dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr; 16106 16107 if (noteack->dl_notifications & DL_NOTE_LINK_UP) 16108 ill->ill_note_link = 1; 16109 ill_dlpi_done(ill, DL_NOTIFY_REQ); 16110 break; 16111 } 16112 case DL_PHYS_ADDR_ACK: { 16113 /* 16114 * As part of plumbing the interface via SIOCSLIFNAME, 16115 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs, 16116 * whose answers we receive here. As each answer is received, 16117 * we call ill_dlpi_done() to dispatch the next request as 16118 * we're processing the current one. Once all answers have 16119 * been received, we use ipsq_pending_mp_get() to dequeue the 16120 * outstanding IOCTL and reply to it. (Because ill_dl_phys() 16121 * is invoked from an ill queue, conn_oper_pending_ill is not 16122 * available, but we know the ioctl is pending on ill_wq.) 16123 */ 16124 uint_t paddrlen, paddroff; 16125 16126 paddrreq = ill->ill_phys_addr_pend; 16127 paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; 16128 paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset; 16129 16130 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 16131 if (paddrreq == DL_IPV6_TOKEN) { 16132 /* 16133 * bcopy to low-order bits of ill_token 16134 * 16135 * XXX Temporary hack - currently, all known tokens 16136 * are 64 bits, so I'll cheat for the moment. 16137 */ 16138 bcopy(mp->b_rptr + paddroff, 16139 &ill->ill_token.s6_addr32[2], paddrlen); 16140 ill->ill_token_length = paddrlen; 16141 break; 16142 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) { 16143 ASSERT(ill->ill_nd_lla_mp == NULL); 16144 ill_set_ndmp(ill, mp, paddroff, paddrlen); 16145 mp = NULL; 16146 break; 16147 } 16148 16149 ASSERT(paddrreq == DL_CURR_PHYS_ADDR); 16150 ASSERT(ill->ill_phys_addr_mp == NULL); 16151 if (!ill->ill_ifname_pending) 16152 break; 16153 ill->ill_ifname_pending = 0; 16154 if (!ioctl_aborted) 16155 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16156 if (mp1 != NULL) { 16157 ASSERT(connp == NULL); 16158 q = ill->ill_wq; 16159 } 16160 /* 16161 * If any error acks received during the plumbing sequence, 16162 * ill_ifname_pending_err will be set. Break out and send up 16163 * the error to the pending ioctl. 16164 */ 16165 if (ill->ill_ifname_pending_err != 0) { 16166 err = ill->ill_ifname_pending_err; 16167 ill->ill_ifname_pending_err = 0; 16168 break; 16169 } 16170 16171 ill->ill_phys_addr_mp = mp; 16172 ill->ill_phys_addr = mp->b_rptr + paddroff; 16173 mp = NULL; 16174 16175 /* 16176 * If paddrlen is zero, the DLPI provider doesn't support 16177 * physical addresses. The other two tests were historical 16178 * workarounds for bugs in our former PPP implementation, but 16179 * now other things have grown dependencies on them -- e.g., 16180 * the tun module specifies a dl_addr_length of zero in its 16181 * DL_BIND_ACK, but then specifies an incorrect value in its 16182 * DL_PHYS_ADDR_ACK. These bogus checks need to be removed, 16183 * but only after careful testing ensures that all dependent 16184 * broken DLPI providers have been fixed. 
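 * Hence the three-way test below: a zero paddrlen, a zero ill_phys_addr_length, and an ill_phys_addr_length of IP_ADDR_LEN are all tolerated (ill_phys_addr is simply left NULL), while any other length mismatch fails with EINVAL.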
16185 */ 16186 if (paddrlen == 0 || ill->ill_phys_addr_length == 0 || 16187 ill->ill_phys_addr_length == IP_ADDR_LEN) { 16188 ill->ill_phys_addr = NULL; 16189 } else if (paddrlen != ill->ill_phys_addr_length) { 16190 ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d", 16191 paddrlen, ill->ill_phys_addr_length)); 16192 err = EINVAL; 16193 break; 16194 } 16195 16196 if (ill->ill_nd_lla_mp == NULL) { 16197 if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) { 16198 err = ENOMEM; 16199 break; 16200 } 16201 ill_set_ndmp(ill, mp_hw, paddroff, paddrlen); 16202 } 16203 16204 /* 16205 * Set the interface token. If the zeroth interface address 16206 * is unspecified, then set it to the link local address. 16207 */ 16208 if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 16209 (void) ill_setdefaulttoken(ill); 16210 16211 ASSERT(ill->ill_ipif->ipif_id == 0); 16212 if (ipif != NULL && 16213 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 16214 (void) ipif_setlinklocal(ipif); 16215 } 16216 break; 16217 } 16218 case DL_OK_ACK: 16219 ip2dbg(("DL_OK_ACK %s (0x%x)\n", 16220 dl_primstr((int)dloa->dl_correct_primitive), 16221 dloa->dl_correct_primitive)); 16222 switch (dloa->dl_correct_primitive) { 16223 case DL_PROMISCON_REQ: 16224 case DL_PROMISCOFF_REQ: 16225 case DL_ENABMULTI_REQ: 16226 case DL_DISABMULTI_REQ: 16227 case DL_UNBIND_REQ: 16228 case DL_ATTACH_REQ: 16229 ill_dlpi_done(ill, dloa->dl_correct_primitive); 16230 break; 16231 } 16232 break; 16233 default: 16234 break; 16235 } 16236 16237 freemsg(mp); 16238 if (mp1 != NULL) { 16239 /* 16240 * The operation must complete without EINPROGRESS 16241 * since ipsq_pending_mp_get() has removed the mblk 16242 * from ipsq_pending_mp. Otherwise, the operation 16243 * will be stuck forever in the ipsq. 16244 */ 16245 ASSERT(err != EINPROGRESS); 16246 16247 switch (ipsq->ipsq_current_ioctl) { 16248 case 0: 16249 ipsq_current_finish(ipsq); 16250 break; 16251 16252 case SIOCLIFADDIF: 16253 case SIOCSLIFNAME: 16254 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); 16255 break; 16256 16257 default: 16258 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 16259 break; 16260 } 16261 } 16262 } 16263 16264 /* 16265 * ip_rput_other is called by ip_rput to handle messages modifying the global 16266 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared) 16267 */ 16268 /* ARGSUSED */ 16269 void 16270 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 16271 { 16272 ill_t *ill; 16273 struct iocblk *iocp; 16274 mblk_t *mp1; 16275 conn_t *connp = NULL; 16276 16277 ip1dbg(("ip_rput_other ")); 16278 ill = (ill_t *)q->q_ptr; 16279 /* 16280 * This routine is not a writer in the case of SIOCGTUNPARAM 16281 * in which case ipsq is NULL. 16282 */ 16283 if (ipsq != NULL) { 16284 ASSERT(IAM_WRITER_IPSQ(ipsq)); 16285 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 16286 } 16287 16288 switch (mp->b_datap->db_type) { 16289 case M_ERROR: 16290 case M_HANGUP: 16291 /* 16292 * The device has a problem. We force the ILL down. It can 16293 * be brought up again manually using SIOCSIFFLAGS (via 16294 * ifconfig or equivalent). 
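 * The first data byte of the M_ERROR, if present, supplies the errno recorded in ill_error; if that still leaves ill_error zero, ENXIO is assumed.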
16295 */ 16296 ASSERT(ipsq != NULL); 16297 if (mp->b_rptr < mp->b_wptr) 16298 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 16299 if (ill->ill_error == 0) 16300 ill->ill_error = ENXIO; 16301 if (!ill_down_start(q, mp)) 16302 return; 16303 ipif_all_down_tail(ipsq, q, mp, NULL); 16304 break; 16305 case M_IOCACK: 16306 iocp = (struct iocblk *)mp->b_rptr; 16307 ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO); 16308 switch (iocp->ioc_cmd) { 16309 case SIOCSTUNPARAM: 16310 case OSIOCSTUNPARAM: 16311 ASSERT(ipsq != NULL); 16312 /* 16313 * Finish socket ioctl passed through to tun. 16314 * We should have an IOCTL waiting on this. 16315 */ 16316 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16317 if (ill->ill_isv6) { 16318 struct iftun_req *ta; 16319 16320 /* 16321 * If a source or destination is 16322 * being set, try to set the link 16323 * local address for the tunnel. 16324 */ 16325 ta = (struct iftun_req *)mp->b_cont-> 16326 b_cont->b_rptr; 16327 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 16328 ipif_set_tun_llink(ill, ta); 16329 } 16330 16331 } 16332 if (mp1 != NULL) { 16333 /* 16334 * Now copy back the b_next/b_prev used by 16335 * mi code for the mi_copy* functions. 16336 * See ip_sioctl_tunparam() for the reason. 16337 * Also protect against missing b_cont. 16338 */ 16339 if (mp->b_cont != NULL) { 16340 mp->b_cont->b_next = 16341 mp1->b_cont->b_next; 16342 mp->b_cont->b_prev = 16343 mp1->b_cont->b_prev; 16344 } 16345 inet_freemsg(mp1); 16346 ASSERT(connp != NULL); 16347 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16348 iocp->ioc_error, NO_COPYOUT, ipsq); 16349 } else { 16350 ASSERT(connp == NULL); 16351 putnext(q, mp); 16352 } 16353 break; 16354 case SIOCGTUNPARAM: 16355 case OSIOCGTUNPARAM: 16356 /* 16357 * This is really M_IOCDATA from the tunnel driver. 16358 * Convert back and complete the ioctl. 16359 * We should have an IOCTL waiting on this. 16360 */ 16361 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 16362 if (mp1) { 16363 /* 16364 * Now copy back the b_next/b_prev used by 16365 * mi code for the mi_copy* functions. 16366 * See ip_sioctl_tunparam() for the reason. 16367 * Also protect against missing b_cont. 16368 */ 16369 if (mp->b_cont != NULL) { 16370 mp->b_cont->b_next = 16371 mp1->b_cont->b_next; 16372 mp->b_cont->b_prev = 16373 mp1->b_cont->b_prev; 16374 } 16375 inet_freemsg(mp1); 16376 if (iocp->ioc_error == 0) 16377 mp->b_datap->db_type = M_IOCDATA; 16378 ASSERT(connp != NULL); 16379 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16380 iocp->ioc_error, COPYOUT, NULL); 16381 } else { 16382 ASSERT(connp == NULL); 16383 putnext(q, mp); 16384 } 16385 break; 16386 default: 16387 break; 16388 } 16389 break; 16390 case M_IOCNAK: 16391 iocp = (struct iocblk *)mp->b_rptr; 16392 16393 switch (iocp->ioc_cmd) { 16394 int mode; 16395 16396 case DL_IOC_HDR_INFO: 16397 /* 16398 * If this was the first attempt, turn off the 16399 * fastpath probing. 16400 */ 16401 mutex_enter(&ill->ill_lock); 16402 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { 16403 ill->ill_dlpi_fastpath_state = IDS_FAILED; 16404 mutex_exit(&ill->ill_lock); 16405 ill_fastpath_nack(ill); 16406 ip1dbg(("ip_rput: DLPI fastpath off on " 16407 "interface %s\n", 16408 ill->ill_name)); 16409 } else { 16410 mutex_exit(&ill->ill_lock); 16411 } 16412 freemsg(mp); 16413 break; 16414 case SIOCSTUNPARAM: 16415 case OSIOCSTUNPARAM: 16416 ASSERT(ipsq != NULL); 16417 /* 16418 * Finish socket ioctl passed through to tun. 16419 * We should have an IOCTL waiting on this.
16420 */ 16421 /* FALLTHRU */ 16422 case SIOCGTUNPARAM: 16423 case OSIOCGTUNPARAM: 16424 /* 16425 * This is really M_IOCDATA from the tunnel driver. 16426 * Convert back and complete the ioctl. 16427 * We should have an IOCTL waiting on this. 16428 */ 16429 if (iocp->ioc_cmd == SIOCGTUNPARAM || 16430 iocp->ioc_cmd == OSIOCGTUNPARAM) { 16431 mp1 = ill_pending_mp_get(ill, &connp, 16432 iocp->ioc_id); 16433 mode = COPYOUT; 16434 ipsq = NULL; 16435 } else { 16436 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16437 mode = NO_COPYOUT; 16438 } 16439 if (mp1 != NULL) { 16440 /* 16441 * Now copy back the b_next/b_prev used by 16442 * mi code for the mi_copy* functions. 16443 * See ip_sioctl_tunparam() for the reason. 16444 * Also protect against missing b_cont. 16445 */ 16446 if (mp->b_cont != NULL) { 16447 mp->b_cont->b_next = 16448 mp1->b_cont->b_next; 16449 mp->b_cont->b_prev = 16450 mp1->b_cont->b_prev; 16451 } 16452 inet_freemsg(mp1); 16453 if (iocp->ioc_error == 0) 16454 iocp->ioc_error = EINVAL; 16455 ASSERT(connp != NULL); 16456 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16457 iocp->ioc_error, mode, ipsq); 16458 } else { 16459 ASSERT(connp == NULL); 16460 putnext(q, mp); 16461 } 16462 break; 16463 default: 16464 break; 16465 } 16466 default: 16467 break; 16468 } 16469 } 16470 16471 /* 16472 * NOTE: This function does not ire_refrele the ire argument passed in. 16473 * 16474 * IPQoS notes 16475 * IP policy is invoked twice for a forwarded packet, once on the read side 16476 * and again on the write side if both IPP_FWD_IN and IPP_FWD_OUT are 16477 * enabled. An additional parameter, in_ill, has been added for this purpose. 16478 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 16479 * because ip_mroute drops this information. 16480 * 16481 */ 16482 void 16483 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 16484 { 16485 uint32_t old_pkt_len; 16486 uint32_t pkt_len; 16487 queue_t *q; 16488 uint32_t sum; 16489 #define rptr ((uchar_t *)ipha) 16490 uint32_t max_frag; 16491 uint32_t ill_index; 16492 ill_t *out_ill; 16493 mib2_ipIfStatsEntry_t *mibptr; 16494 ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; 16495 16496 /* Get the ill_index of the incoming ILL */ 16497 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 16498 mibptr = (in_ill != NULL) ? in_ill->ill_ip_mib : &ipst->ips_ip_mib; 16499 16500 /* Initiate Read side IPPF processing */ 16501 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 16502 ip_process(IPP_FWD_IN, &mp, ill_index); 16503 if (mp == NULL) { 16504 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 16505 "during IPPF processing\n")); 16506 return; 16507 } 16508 } 16509 16510 /* Adjust the checksum to reflect the ttl decrement. */ 16511 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 16512 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 16513 16514 if (ipha->ipha_ttl-- <= 1) { 16515 if (ip_csum_hdr(ipha)) { 16516 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16517 goto drop_pkt; 16518 } 16519 /* 16520 * Note: ire_stq will be NULL for multicast 16521 * datagrams using the long path through arp (the IRE 16522 * is not an IRE_CACHE). This should not cause 16523 * problems since we don't generate ICMP errors for 16524 * multicast packets.
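
/*
 * Illustrative sketch (not part of the original source) of the incremental
 * checksum update performed above for the TTL decrement.  The TTL shares a
 * 16-bit header word with the protocol field, so decrementing it by one
 * lowers that word by 0x0100; adding 0x0100 back into the one's-complement
 * header checksum (with end-around carry) keeps the header valid without a
 * full recompute.  The literal 0x0100 assumes big-endian 16-bit loads; the
 * kernel hides that byte-order detail behind IP_HDR_CSUM_TTL_ADJUST.
 */
#include <stdint.h>

static void
ttl_decrement_and_fix_cksum(uint8_t *ttl, uint16_t *cksum)
{
	uint32_t sum = *cksum + 0x0100;	/* assumed big-endian word layout */

	*cksum = (uint16_t)(sum + (sum >> 16));	/* fold end-around carry */
	(*ttl)--;			/* forwarders decrement TTL by one */
}
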
16525 */ 16526 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16527 q = ire->ire_stq; 16528 if (q != NULL) { 16529 /* Sent by forwarding path, and router is global zone */ 16530 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, 16531 GLOBAL_ZONEID, ipst); 16532 } else 16533 freemsg(mp); 16534 return; 16535 } 16536 16537 /* 16538 * Don't forward if the interface is down 16539 */ 16540 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 16541 BUMP_MIB(mibptr, ipIfStatsInDiscards); 16542 ip2dbg(("ip_rput_forward:interface is down\n")); 16543 goto drop_pkt; 16544 } 16545 16546 /* Get the ill_index of the outgoing ILL */ 16547 out_ill = ire_to_ill(ire); 16548 ill_index = out_ill->ill_phyint->phyint_ifindex; 16549 16550 DTRACE_PROBE4(ip4__forwarding__start, 16551 ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16552 16553 FW_HOOKS(ipst->ips_ip4_forwarding_event, 16554 ipst->ips_ipv4firewall_forwarding, 16555 in_ill, out_ill, ipha, mp, mp, 0, ipst); 16556 16557 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 16558 16559 if (mp == NULL) 16560 return; 16561 old_pkt_len = pkt_len = ntohs(ipha->ipha_length); 16562 16563 if (is_system_labeled()) { 16564 mblk_t *mp1; 16565 16566 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 16567 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16568 goto drop_pkt; 16569 } 16570 /* Size may have changed */ 16571 mp = mp1; 16572 ipha = (ipha_t *)mp->b_rptr; 16573 pkt_len = ntohs(ipha->ipha_length); 16574 } 16575 16576 /* Check if there are options to update */ 16577 if (!IS_SIMPLE_IPH(ipha)) { 16578 if (ip_csum_hdr(ipha)) { 16579 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16580 goto drop_pkt; 16581 } 16582 if (ip_rput_forward_options(mp, ipha, ire, ipst)) { 16583 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16584 return; 16585 } 16586 16587 ipha->ipha_hdr_checksum = 0; 16588 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 16589 } 16590 max_frag = ire->ire_max_frag; 16591 if (pkt_len > max_frag) { 16592 /* 16593 * It needs fragging on its way out. We haven't 16594 * verified the header checksum yet. Since we 16595 * are going to put a surely good checksum in the 16596 * outgoing header, we have to make sure that it 16597 * was good coming in. 16598 */ 16599 if (ip_csum_hdr(ipha)) { 16600 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16601 goto drop_pkt; 16602 } 16603 /* Initiate Write side IPPF processing */ 16604 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 16605 ip_process(IPP_FWD_OUT, &mp, ill_index); 16606 if (mp == NULL) { 16607 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 16608 " during IPPF processing\n")); 16609 return; 16610 } 16611 } 16612 /* 16613 * Handle labeled packet resizing. 16614 * 16615 * If we have added a label, inform ip_wput_frag() of its 16616 * effect on the MTU for ICMP messages. 
16617 */ 16618 if (pkt_len > old_pkt_len) { 16619 uint32_t secopt_size; 16620 16621 secopt_size = pkt_len - old_pkt_len; 16622 if (secopt_size < max_frag) 16623 max_frag -= secopt_size; 16624 } 16625 16626 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst); 16627 ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); 16628 return; 16629 } 16630 16631 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 16632 ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16633 FW_HOOKS(ipst->ips_ip4_physical_out_event, 16634 ipst->ips_ipv4firewall_physical_out, 16635 NULL, out_ill, ipha, mp, mp, 0, ipst); 16636 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 16637 if (mp == NULL) 16638 return; 16639 16640 mp->b_prev = (mblk_t *)IPP_FWD_OUT; 16641 ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); 16642 (void) ip_xmit_v4(mp, ire, NULL, B_FALSE); 16643 /* ip_xmit_v4 always consumes the packet */ 16644 return; 16645 16646 drop_pkt:; 16647 ip1dbg(("ip_rput_forward: drop pkt\n")); 16648 freemsg(mp); 16649 #undef rptr 16650 } 16651 16652 void 16653 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 16654 { 16655 ire_t *ire; 16656 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16657 16658 ASSERT(!ipif->ipif_isv6); 16659 /* 16660 * Find an IRE which matches the destination and the outgoing 16661 * queue in the cache table. All we need is an IRE_CACHE which 16662 * is pointing at ipif->ipif_ill. If it is part of some ill group, 16663 * then it is enough to have some IRE_CACHE in the group. 16664 */ 16665 if (ipif->ipif_flags & IPIF_POINTOPOINT) 16666 dst = ipif->ipif_pp_dst_addr; 16667 16668 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), 16669 MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); 16670 if (ire == NULL) { 16671 /* 16672 * Mark this packet to make it be delivered to 16673 * ip_rput_forward after the new ire has been 16674 * created. 
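
/*
 * Illustrative sketch (not part of the original source) of the labeled-MTU
 * arithmetic just above: if tsol_ip_forward() grew the packet by inserting
 * a security option, the MTU that ip_wput_frag() advertises in an ICMP
 * "fragmentation needed" message is reduced by that growth, since an
 * unlabeled sender must leave room for the label to be re-added.
 */
#include <stdint.h>

static uint32_t
effective_max_frag(uint32_t max_frag, uint32_t old_len, uint32_t new_len)
{
	if (new_len > old_len) {		/* a security option was added */
		uint32_t secopt_size = new_len - old_len;

		if (secopt_size < max_frag)
			max_frag -= secopt_size;
	}
	return (max_frag);
}
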
16675 */ 16676 mp->b_prev = NULL; 16677 mp->b_next = mp; 16678 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 16679 NULL, 0, GLOBAL_ZONEID, &zero_info); 16680 } else { 16681 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 16682 IRE_REFRELE(ire); 16683 } 16684 } 16685 16686 /* Update any source route, record route or timestamp options */ 16687 static int 16688 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) 16689 { 16690 ipoptp_t opts; 16691 uchar_t *opt; 16692 uint8_t optval; 16693 uint8_t optlen; 16694 ipaddr_t dst; 16695 uint32_t ts; 16696 ire_t *dst_ire = NULL; 16697 ire_t *tmp_ire = NULL; 16698 timestruc_t now; 16699 16700 ip2dbg(("ip_rput_forward_options\n")); 16701 dst = ipha->ipha_dst; 16702 for (optval = ipoptp_first(&opts, ipha); 16703 optval != IPOPT_EOL; 16704 optval = ipoptp_next(&opts)) { 16705 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 16706 opt = opts.ipoptp_cur; 16707 optlen = opts.ipoptp_len; 16708 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 16709 optval, opts.ipoptp_len)); 16710 switch (optval) { 16711 uint32_t off; 16712 case IPOPT_SSRR: 16713 case IPOPT_LSRR: 16714 /* Check if administratively disabled */ 16715 if (!ipst->ips_ip_forward_src_routed) { 16716 if (ire->ire_stq != NULL) { 16717 /* 16718 * Sent by forwarding path, and router 16719 * is global zone 16720 */ 16721 icmp_unreachable(ire->ire_stq, mp, 16722 ICMP_SOURCE_ROUTE_FAILED, 16723 GLOBAL_ZONEID, ipst); 16724 } else { 16725 ip0dbg(("ip_rput_forward_options: " 16726 "unable to send unreach\n")); 16727 freemsg(mp); 16728 } 16729 return (-1); 16730 } 16731 16732 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16733 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16734 if (dst_ire == NULL) { 16735 /* 16736 * Must be partial since ip_rput_options 16737 * checked for strict. 16738 */ 16739 break; 16740 } 16741 off = opt[IPOPT_OFFSET]; 16742 off--; 16743 redo_srr: 16744 if (optlen < IP_ADDR_LEN || 16745 off > optlen - IP_ADDR_LEN) { 16746 /* End of source route */ 16747 ip1dbg(( 16748 "ip_rput_forward_options: end of SR\n")); 16749 ire_refrele(dst_ire); 16750 break; 16751 } 16752 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16753 bcopy(&ire->ire_src_addr, (char *)opt + off, 16754 IP_ADDR_LEN); 16755 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 16756 ntohl(dst))); 16757 16758 /* 16759 * Check if our address is present more than 16760 * once as consecutive hops in source route.
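
/*
 * Illustrative sketch (not part of the original source) of the source-route
 * rewrite above: the next hop is copied out of the option to become the new
 * destination, our own address is written in its place, and the option
 * pointer advances.  Per RFC 791, opt[2] is the 1-based pointer and the
 * address list starts right after it; SR_OFFSET/SR_ADDRLEN here stand in
 * for the kernel's IPOPT_OFFSET/IP_ADDR_LEN.
 */
#include <stdint.h>
#include <string.h>

#define SR_OFFSET	2	/* option pointer byte, per RFC 791 */
#define SR_ADDRLEN	4	/* IPv4 address length */

/* Returns 1 and sets *next_hop if a hop was consumed, 0 at end of route. */
static int
srr_consume_hop(uint8_t *opt, uint8_t optlen, uint32_t our_addr,
    uint32_t *next_hop)
{
	uint8_t off = opt[SR_OFFSET] - 1;	/* pointer is 1-based */

	if (optlen < SR_ADDRLEN || off > optlen - SR_ADDRLEN)
		return (0);			/* route fully consumed */
	memcpy(next_hop, opt + off, SR_ADDRLEN);	/* new destination */
	memcpy(opt + off, &our_addr, SR_ADDRLEN);	/* record ourselves */
	opt[SR_OFFSET] += SR_ADDRLEN;			/* advance pointer */
	return (1);
}
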
16761 */ 16762 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16763 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16764 if (tmp_ire != NULL) { 16765 ire_refrele(tmp_ire); 16766 off += IP_ADDR_LEN; 16767 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16768 goto redo_srr; 16769 } 16770 ipha->ipha_dst = dst; 16771 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16772 ire_refrele(dst_ire); 16773 break; 16774 case IPOPT_RR: 16775 off = opt[IPOPT_OFFSET]; 16776 off--; 16777 if (optlen < IP_ADDR_LEN || 16778 off > optlen - IP_ADDR_LEN) { 16779 /* No more room - ignore */ 16780 ip1dbg(( 16781 "ip_rput_forward_options: end of RR\n")); 16782 break; 16783 } 16784 bcopy(&ire->ire_src_addr, (char *)opt + off, 16785 IP_ADDR_LEN); 16786 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16787 break; 16788 case IPOPT_TS: 16789 /* Insert timestamp if there is room */ 16790 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16791 case IPOPT_TS_TSONLY: 16792 off = IPOPT_TS_TIMELEN; 16793 break; 16794 case IPOPT_TS_PRESPEC: 16795 case IPOPT_TS_PRESPEC_RFC791: 16796 /* Verify that the address matched */ 16797 off = opt[IPOPT_OFFSET] - 1; 16798 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16799 dst_ire = ire_ctable_lookup(dst, 0, 16800 IRE_LOCAL, NULL, ALL_ZONES, NULL, 16801 MATCH_IRE_TYPE, ipst); 16802 if (dst_ire == NULL) { 16803 /* Not for us */ 16804 break; 16805 } 16806 ire_refrele(dst_ire); 16807 /* FALLTHRU */ 16808 case IPOPT_TS_TSANDADDR: 16809 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16810 break; 16811 default: 16812 /* 16813 * ip_*put_options should have already 16814 * dropped this packet. 16815 */ 16816 cmn_err(CE_PANIC, "ip_rput_forward_options: " 16817 "unknown IT - bug in ip_rput_options?\n"); 16818 return (0); /* Keep "lint" happy */ 16819 } 16820 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 16821 /* Increase overflow counter */ 16822 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 16823 opt[IPOPT_POS_OV_FLG] = 16824 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 16825 (off << 4)); 16826 break; 16827 } 16828 off = opt[IPOPT_OFFSET] - 1; 16829 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16830 case IPOPT_TS_PRESPEC: 16831 case IPOPT_TS_PRESPEC_RFC791: 16832 case IPOPT_TS_TSANDADDR: 16833 bcopy(&ire->ire_src_addr, 16834 (char *)opt + off, IP_ADDR_LEN); 16835 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16836 /* FALLTHRU */ 16837 case IPOPT_TS_TSONLY: 16838 off = opt[IPOPT_OFFSET] - 1; 16839 /* Compute # of milliseconds since midnight */ 16840 gethrestime(&now); 16841 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 16842 now.tv_nsec / (NANOSEC / MILLISEC); 16843 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 16844 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 16845 break; 16846 } 16847 break; 16848 } 16849 } 16850 return (0); 16851 } 16852 16853 /* 16854 * This is called after processing at least one of AH/ESP headers. 16855 * 16856 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 16857 * the actual, physical interface on which the packet was received, 16858 * but, when ip_strict_dst_multihoming is set to 1, could be the 16859 * interface which had the ipha_dst configured when the packet went 16860 * through ip_rput. The ill_index corresponding to the recv_ill 16861 * is saved in ipsec_in_rill_index 16862 * 16863 * NOTE2: The "ire" argument is only used in IPv4 cases. This function 16864 * cannot assume "ire" points to valid data for any IPv6 cases. 
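
/*
 * Illustrative sketch (not part of the original source) of the two pieces
 * of timestamp-option arithmetic used above: the RFC 791 timestamp value is
 * milliseconds since midnight UT, and when the option has no room left the
 * 4-bit overflow counter in the high nibble of the pointer/overflow/flags
 * byte (IPOPT_POS_OV_FLG in the code above) is bumped instead of storing
 * another entry.  This user-level sketch uses clock_gettime() where the
 * kernel uses gethrestime().
 */
#include <stdint.h>
#include <time.h>

static uint32_t
ts_msec_since_midnight(void)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	return ((uint32_t)(now.tv_sec % (24 * 60 * 60)) * 1000 +
	    (uint32_t)(now.tv_nsec / 1000000));
}

static void
ts_bump_overflow(uint8_t *pos_ov_flg)
{
	uint8_t ovfl = (*pos_ov_flg >> 4) + 1;	/* counter lives in high nibble */

	*pos_ov_flg = (uint8_t)((*pos_ov_flg & 0x0F) | (ovfl << 4));
}
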
16865 */ 16866 void 16867 ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) 16868 { 16869 mblk_t *mp; 16870 ipaddr_t dst; 16871 in6_addr_t *v6dstp; 16872 ipha_t *ipha; 16873 ip6_t *ip6h; 16874 ipsec_in_t *ii; 16875 boolean_t ill_need_rele = B_FALSE; 16876 boolean_t rill_need_rele = B_FALSE; 16877 boolean_t ire_need_rele = B_FALSE; 16878 netstack_t *ns; 16879 ip_stack_t *ipst; 16880 16881 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 16882 ASSERT(ii->ipsec_in_ill_index != 0); 16883 ns = ii->ipsec_in_ns; 16884 ASSERT(ii->ipsec_in_ns != NULL); 16885 ipst = ns->netstack_ip; 16886 16887 mp = ipsec_mp->b_cont; 16888 ASSERT(mp != NULL); 16889 16890 16891 if (ill == NULL) { 16892 ASSERT(recv_ill == NULL); 16893 /* 16894 * We need to get the original queue on which ip_rput_local 16895 * or ip_rput_data_v6 was called. 16896 */ 16897 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 16898 !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst); 16899 ill_need_rele = B_TRUE; 16900 16901 if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) { 16902 recv_ill = ill_lookup_on_ifindex( 16903 ii->ipsec_in_rill_index, !ii->ipsec_in_v4, 16904 NULL, NULL, NULL, NULL, ipst); 16905 rill_need_rele = B_TRUE; 16906 } else { 16907 recv_ill = ill; 16908 } 16909 16910 if ((ill == NULL) || (recv_ill == NULL)) { 16911 ip0dbg(("ip_fanout_proto_again: interface " 16912 "disappeared\n")); 16913 if (ill != NULL) 16914 ill_refrele(ill); 16915 if (recv_ill != NULL) 16916 ill_refrele(recv_ill); 16917 freemsg(ipsec_mp); 16918 return; 16919 } 16920 } 16921 16922 ASSERT(ill != NULL && recv_ill != NULL); 16923 16924 if (mp->b_datap->db_type == M_CTL) { 16925 /* 16926 * AH/ESP is returning the ICMP message after 16927 * removing their headers. Fanout again till 16928 * it gets to the right protocol. 16929 */ 16930 if (ii->ipsec_in_v4) { 16931 icmph_t *icmph; 16932 int iph_hdr_length; 16933 int hdr_length; 16934 16935 ipha = (ipha_t *)mp->b_rptr; 16936 iph_hdr_length = IPH_HDR_LENGTH(ipha); 16937 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 16938 ipha = (ipha_t *)&icmph[1]; 16939 hdr_length = IPH_HDR_LENGTH(ipha); 16940 /* 16941 * icmp_inbound_error_fanout may need to do pullupmsg. 16942 * Reset the type to M_DATA. 16943 */ 16944 mp->b_datap->db_type = M_DATA; 16945 icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp, 16946 icmph, ipha, iph_hdr_length, hdr_length, B_TRUE, 16947 B_FALSE, ill, ii->ipsec_in_zoneid); 16948 } else { 16949 icmp6_t *icmp6; 16950 int hdr_length; 16951 16952 ip6h = (ip6_t *)mp->b_rptr; 16953 /* Don't call hdr_length_v6() unless you have to. */ 16954 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) 16955 hdr_length = ip_hdr_length_v6(mp, ip6h); 16956 else 16957 hdr_length = IPV6_HDR_LEN; 16958 16959 icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); 16960 /* 16961 * icmp_inbound_error_fanout_v6 may need to do 16962 * pullupmsg. Reset the type to M_DATA. 16963 */ 16964 mp->b_datap->db_type = M_DATA; 16965 icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, 16966 ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid); 16967 } 16968 if (ill_need_rele) 16969 ill_refrele(ill); 16970 if (rill_need_rele) 16971 ill_refrele(recv_ill); 16972 return; 16973 } 16974 16975 if (ii->ipsec_in_v4) { 16976 ipha = (ipha_t *)mp->b_rptr; 16977 dst = ipha->ipha_dst; 16978 if (CLASSD(dst)) { 16979 /* 16980 * Multicast has to be delivered to all streams. 
16981 */ 16982 dst = INADDR_BROADCAST; 16983 } 16984 16985 if (ire == NULL) { 16986 ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid, 16987 MBLK_GETLABEL(mp), ipst); 16988 if (ire == NULL) { 16989 if (ill_need_rele) 16990 ill_refrele(ill); 16991 if (rill_need_rele) 16992 ill_refrele(recv_ill); 16993 ip1dbg(("ip_fanout_proto_again: " 16994 "IRE not found")); 16995 freemsg(ipsec_mp); 16996 return; 16997 } 16998 ire_need_rele = B_TRUE; 16999 } 17000 17001 switch (ipha->ipha_protocol) { 17002 case IPPROTO_UDP: 17003 ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, 17004 recv_ill); 17005 if (ire_need_rele) 17006 ire_refrele(ire); 17007 break; 17008 case IPPROTO_TCP: 17009 if (!ire_need_rele) 17010 IRE_REFHOLD(ire); 17011 mp = ip_tcp_input(mp, ipha, ill, B_TRUE, 17012 ire, ipsec_mp, 0, ill->ill_rq, NULL); 17013 IRE_REFRELE(ire); 17014 if (mp != NULL) 17015 squeue_enter_chain(GET_SQUEUE(mp), mp, 17016 mp, 1, SQTAG_IP_PROTO_AGAIN); 17017 break; 17018 case IPPROTO_SCTP: 17019 if (!ire_need_rele) 17020 IRE_REFHOLD(ire); 17021 ip_sctp_input(mp, ipha, ill, B_TRUE, ire, 17022 ipsec_mp, 0, ill->ill_rq, dst); 17023 break; 17024 default: 17025 ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, 17026 recv_ill, B_FALSE); 17027 if (ire_need_rele) 17028 ire_refrele(ire); 17029 break; 17030 } 17031 } else { 17032 uint32_t rput_flags = 0; 17033 17034 ip6h = (ip6_t *)mp->b_rptr; 17035 v6dstp = &ip6h->ip6_dst; 17036 /* 17037 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast 17038 * address. 17039 * 17040 * Currently, we don't store that state in the IPSEC_IN 17041 * message, and we may need to. 17042 */ 17043 rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? 17044 IP6_IN_LLMCAST : 0); 17045 ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, 17046 NULL, NULL); 17047 } 17048 if (ill_need_rele) 17049 ill_refrele(ill); 17050 if (rill_need_rele) 17051 ill_refrele(recv_ill); 17052 } 17053 17054 /* 17055 * Call ill_frag_timeout to do garbage collection. ill_frag_timeout 17056 * returns 'true' if there are still fragments left on the queue, in 17057 * which case we restart the timer. 17058 */ 17059 void 17060 ill_frag_timer(void *arg) 17061 { 17062 ill_t *ill = (ill_t *)arg; 17063 boolean_t frag_pending; 17064 ip_stack_t *ipst = ill->ill_ipst; 17065 17066 mutex_enter(&ill->ill_lock); 17067 ASSERT(!ill->ill_fragtimer_executing); 17068 if (ill->ill_state_flags & ILL_CONDEMNED) { 17069 ill->ill_frag_timer_id = 0; 17070 mutex_exit(&ill->ill_lock); 17071 return; 17072 } 17073 ill->ill_fragtimer_executing = 1; 17074 mutex_exit(&ill->ill_lock); 17075 17076 frag_pending = ill_frag_timeout(ill, ipst->ips_ip_g_frag_timeout); 17077 17078 /* 17079 * Restart the timer if we have fragments pending or if someone 17080 * wanted us to be scheduled again. 17081 */ 17082 mutex_enter(&ill->ill_lock); 17083 ill->ill_fragtimer_executing = 0; 17084 ill->ill_frag_timer_id = 0; 17085 if (frag_pending || ill->ill_fragtimer_needrestart) 17086 ill_frag_timer_start(ill); 17087 mutex_exit(&ill->ill_lock); 17088 } 17089 17090 void 17091 ill_frag_timer_start(ill_t *ill) 17092 { 17093 ip_stack_t *ipst = ill->ill_ipst; 17094 17095 ASSERT(MUTEX_HELD(&ill->ill_lock)); 17096 17097 /* If the ill is closing or opening, don't proceed */ 17098 if (ill->ill_state_flags & ILL_CONDEMNED) 17099 return; 17100 17101 if (ill->ill_fragtimer_executing) { 17102 /* 17103 * ill_frag_timer is currently executing. Just record the 17104 * fact that we want the timer to be restarted.
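
/*
 * Illustrative sketch (not part of the original source) of the handshake
 * between ill_frag_timer() and ill_frag_timer_start() above: a running
 * timer id, an "executing" flag, and a "needrestart" flag together keep
 * exactly one timeout outstanding without racing the handler.  All state
 * is assumed protected by the caller's lock, as ill_lock is above.
 */
typedef struct frag_timer_state {
	int	timer_id;	/* nonzero while a timeout is posted */
	int	executing;	/* handler running with the lock dropped */
	int	needrestart;	/* handler should re-arm before returning */
} frag_timer_state_t;

static void
frag_timer_start_locked(frag_timer_state_t *fts)
{
	if (fts->executing) {
		/* Handler will observe this and post a fresh timeout. */
		fts->needrestart = 1;
		return;
	}
	if (fts->timer_id == 0)
		fts->timer_id = 1;	/* stands in for a timeout(9F) call */
}
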
17105 * ill_frag_timer will post a timeout before it returns, 17106 * ensuring it will be called again. 17107 */ 17108 ill->ill_fragtimer_needrestart = 1; 17109 return; 17110 } 17111 17112 if (ill->ill_frag_timer_id == 0) { 17113 /* 17114 * The timer is neither running nor is the timeout handler 17115 * executing. Post a timeout so that ill_frag_timer will be 17116 * called. 17117 */ 17118 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 17119 MSEC_TO_TICK(ipst->ips_ip_g_frag_timo_ms >> 1)); 17120 ill->ill_fragtimer_needrestart = 0; 17121 } 17122 } 17123 17124 /* 17125 * This routine is needed for loopback when forwarding multicasts. 17126 * 17127 * IPQoS Notes: 17128 * IPPF processing is done in fanout routines. 17129 * Policy processing is done only if IPP_LOCAL_IN is enabled. Further, 17130 * processing for IPsec packets is done when they come back in the clear. 17131 * NOTE: The callers of this function need to do the ire_refrele for the 17132 * ire that is being passed in. 17133 */ 17134 void 17135 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17136 ill_t *recv_ill, boolean_t esp_in_udp_packet) 17137 { 17138 ill_t *ill = (ill_t *)q->q_ptr; 17139 uint32_t sum; 17140 uint32_t u1; 17141 uint32_t u2; 17142 int hdr_length; 17143 boolean_t mctl_present; 17144 mblk_t *first_mp = mp; 17145 mblk_t *hada_mp = NULL; 17146 ipha_t *inner_ipha; 17147 ip_stack_t *ipst; 17148 17149 ASSERT(recv_ill != NULL); 17150 ipst = recv_ill->ill_ipst; 17151 17152 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 17153 "ip_rput_locl_start: q %p", q); 17154 17155 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17156 ASSERT(ill != NULL); 17157 17158 17159 #define rptr ((uchar_t *)ipha) 17160 #define iphs ((uint16_t *)ipha) 17161 17162 /* 17163 * No UDP or TCP packet should come here anymore. 17164 */ 17165 ASSERT(ipha->ipha_protocol != IPPROTO_TCP && 17166 ipha->ipha_protocol != IPPROTO_UDP); 17167 17168 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 17169 if (mctl_present && 17170 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 17171 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 17172 17173 /* 17174 * It's an IPsec accelerated packet. 17175 * Keep a pointer to the data attributes around until 17176 * we allocate the ipsec_info_t. 17177 */ 17178 IPSECHW_DEBUG(IPSECHW_PKT, 17179 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 17180 hada_mp = first_mp; 17181 hada_mp->b_cont = NULL; 17182 /* 17183 * Since it is accelerated, it comes directly from 17184 * the ill and the data attributes are followed by 17185 * the packet data. 17186 */ 17187 ASSERT(mp->b_datap->db_type != M_CTL); 17188 first_mp = mp; 17189 mctl_present = B_FALSE; 17190 } 17191 17192 /* 17193 * If M_CTL is present, then ipsec_in_is_secure 17194 * should return B_TRUE. There is a case where loopback 17195 * packets have an M_CTL in the front with all the 17196 * IPsec options set to IPSEC_PREF_NEVER - which means 17197 * ipsec_in_is_secure will return B_FALSE. As loopback 17198 * packets never come here, it is safe to ASSERT the 17199 * following. 17200 */ 17201 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 17202 17203 /* 17204 * Also, we should never have an mctl_present if this is an 17205 * ESP-in-UDP packet.
17206 */ 17207 ASSERT(!mctl_present || !esp_in_udp_packet); 17208 17209 17210 /* u1 is # words of IP options */ 17211 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 17212 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 17213 17214 /* 17215 * Don't verify header checksum if we just removed UDP header or 17216 * packet is coming back from AH/ESP. 17217 */ 17218 if (!esp_in_udp_packet && !mctl_present) { 17219 if (u1) { 17220 if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { 17221 if (hada_mp != NULL) 17222 freemsg(hada_mp); 17223 return; 17224 } 17225 } else { 17226 /* Check the IP header checksum. */ 17227 #define uph ((uint16_t *)ipha) 17228 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 17229 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 17230 #undef uph 17231 /* finish doing IP checksum */ 17232 sum = (sum & 0xFFFF) + (sum >> 16); 17233 sum = ~(sum + (sum >> 16)) & 0xFFFF; 17234 if (sum && sum != 0xFFFF) { 17235 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 17236 goto drop_pkt; 17237 } 17238 } 17239 } 17240 17241 /* 17242 * Count for SNMP of inbound packets for ire. As ip_proto_input 17243 * might be called more than once for secure packets, count only 17244 * the first time. 17245 */ 17246 if (!mctl_present) { 17247 UPDATE_IB_PKT_COUNT(ire); 17248 ire->ire_last_used_time = lbolt; 17249 } 17250 17251 /* Check for fragmentation offset. */ 17252 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 17253 u1 = u2 & (IPH_MF | IPH_OFFSET); 17254 if (u1) { 17255 /* 17256 * We re-assemble fragments before we do the AH/ESP 17257 * processing. Thus, M_CTL should not be present 17258 * while we are re-assembling. 17259 */ 17260 ASSERT(!mctl_present); 17261 ASSERT(first_mp == mp); 17262 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 17263 return; 17264 } 17265 /* 17266 * Make sure that first_mp points back to mp as 17267 * the mp we came in with could have changed in 17268 * ip_rput_fragment(). 17269 */ 17270 ipha = (ipha_t *)mp->b_rptr; 17271 first_mp = mp; 17272 } 17273 17274 /* 17275 * Clear hardware checksumming flag as it is currently only 17276 * used by TCP and UDP. 17277 */ 17278 DB_CKSUMFLAGS(mp) = 0; 17279 17280 /* Now we have a complete datagram, destined for this machine. */ 17281 u1 = IPH_HDR_LENGTH(ipha); 17282 switch (ipha->ipha_protocol) { 17283 case IPPROTO_ICMP: { 17284 ire_t *ire_zone; 17285 ilm_t *ilm; 17286 mblk_t *mp1; 17287 zoneid_t last_zoneid; 17288 17289 if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { 17290 ASSERT(ire->ire_type == IRE_BROADCAST); 17291 /* 17292 * In the multicast case, applications may have joined 17293 * the group from different zones, so we need to deliver 17294 * the packet to each of them. Loop through the 17295 * multicast memberships structures (ilm) on the receive 17296 * ill and send a copy of the packet up each matching 17297 * one. However, we don't do this for multicasts sent on 17298 * the loopback interface (PHYI_LOOPBACK flag set) as 17299 * they must stay in the sender's zone. 17300 * 17301 * ilm_add_v6() ensures that ilms in the same zone are 17302 * contiguous in the ill_ilm list. We use this property 17303 * to avoid sending duplicates needed when two 17304 * applications in the same zone join the same group on 17305 * different logical interfaces: we ignore the ilm if 17306 * its zoneid is the same as the last matching one. 17307 * In addition, the sending of the packet for 17308 * ire_zoneid is delayed until all of the other ilms 17309 * have been exhausted. 
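
/*
 * Illustrative sketch (not part of the original source) of the inline
 * header-checksum test above: sum the ten 16-bit words of an option-less
 * IPv4 header (checksum field included), fold the carries twice, and
 * complement.  A header is accepted when the result is 0; the code above
 * also tolerates 0xFFFF, the "negative zero" of one's-complement arithmetic.
 */
#include <stdint.h>

static int
ipv4_simple_hdr_ok(const uint16_t *uph)	/* 10 words, suitably aligned */
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < 10; i++)
		sum += uph[i];
	sum = (sum & 0xFFFF) + (sum >> 16);	/* first fold */
	sum = ~(sum + (sum >> 16)) & 0xFFFF;	/* second fold + complement */
	return (sum == 0 || sum == 0xFFFF);
}
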
17310 */ 17311 last_zoneid = -1; 17312 ILM_WALKER_HOLD(recv_ill); 17313 for (ilm = recv_ill->ill_ilm; ilm != NULL; 17314 ilm = ilm->ilm_next) { 17315 if ((ilm->ilm_flags & ILM_DELETED) || 17316 ipha->ipha_dst != ilm->ilm_addr || 17317 ilm->ilm_zoneid == last_zoneid || 17318 ilm->ilm_zoneid == ire->ire_zoneid || 17319 ilm->ilm_zoneid == ALL_ZONES || 17320 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 17321 continue; 17322 mp1 = ip_copymsg(first_mp); 17323 if (mp1 == NULL) 17324 continue; 17325 icmp_inbound(q, mp1, B_TRUE, ill, 17326 0, sum, mctl_present, B_TRUE, 17327 recv_ill, ilm->ilm_zoneid); 17328 last_zoneid = ilm->ilm_zoneid; 17329 } 17330 ILM_WALKER_RELE(recv_ill); 17331 } else if (ire->ire_type == IRE_BROADCAST) { 17332 /* 17333 * In the broadcast case, there may be many zones 17334 * which need a copy of the packet delivered to them. 17335 * There is one IRE_BROADCAST per broadcast address 17336 * and per zone; we walk those using a helper function. 17337 * In addition, the sending of the packet for ire is 17338 * delayed until all of the other ires have been 17339 * processed. 17340 */ 17341 IRB_REFHOLD(ire->ire_bucket); 17342 ire_zone = NULL; 17343 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 17344 ire)) != NULL) { 17345 mp1 = ip_copymsg(first_mp); 17346 if (mp1 == NULL) 17347 continue; 17348 17349 UPDATE_IB_PKT_COUNT(ire_zone); 17350 ire_zone->ire_last_used_time = lbolt; 17351 icmp_inbound(q, mp1, B_TRUE, ill, 17352 0, sum, mctl_present, B_TRUE, 17353 recv_ill, ire_zone->ire_zoneid); 17354 } 17355 IRB_REFRELE(ire->ire_bucket); 17356 } 17357 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 17358 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 17359 ire->ire_zoneid); 17360 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17361 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 17362 return; 17363 } 17364 case IPPROTO_IGMP: 17365 /* 17366 * If we are not willing to accept IGMP packets in clear, 17367 * then check with global policy. 17368 */ 17369 if (ipst->ips_igmp_accept_clear_messages == 0) { 17370 first_mp = ipsec_check_global_policy(first_mp, NULL, 17371 ipha, NULL, mctl_present, ipst->ips_netstack); 17372 if (first_mp == NULL) 17373 return; 17374 } 17375 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17376 freemsg(first_mp); 17377 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 17378 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17379 return; 17380 } 17381 if ((mp = igmp_input(q, mp, ill)) == NULL) { 17382 /* Bad packet - discarded by igmp_input */ 17383 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17384 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 17385 if (mctl_present) 17386 freeb(first_mp); 17387 return; 17388 } 17389 /* 17390 * igmp_input() may have returned the pulled up message. 17391 * So first_mp and ipha need to be reinitialized. 17392 */ 17393 ipha = (ipha_t *)mp->b_rptr; 17394 if (mctl_present) 17395 first_mp->b_cont = mp; 17396 else 17397 first_mp = mp; 17398 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17399 connf_head != NULL) { 17400 /* No user-level listener for IGMP packets */ 17401 goto drop_pkt; 17402 } 17403 /* deliver to local raw users */ 17404 break; 17405 case IPPROTO_PIM: 17406 /* 17407 * If we are not willing to accept PIM packets in clear, 17408 * then check with global policy. 
17409 */ 17410 if (ipst->ips_pim_accept_clear_messages == 0) { 17411 first_mp = ipsec_check_global_policy(first_mp, NULL, 17412 ipha, NULL, mctl_present, ipst->ips_netstack); 17413 if (first_mp == NULL) 17414 return; 17415 } 17416 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17417 freemsg(first_mp); 17418 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 17419 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17420 return; 17421 } 17422 if (pim_input(q, mp, ill) != 0) { 17423 /* Bad packet - discarded by pim_input */ 17424 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17425 "ip_rput_locl_end: q %p (%S)", q, "pim"); 17426 if (mctl_present) 17427 freeb(first_mp); 17428 return; 17429 } 17430 17431 /* 17432 * pim_input() may have pulled up the message so ipha needs to 17433 * be reinitialized. 17434 */ 17435 ipha = (ipha_t *)mp->b_rptr; 17436 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17437 connf_head != NULL) { 17438 /* No user-level listener for PIM packets */ 17439 goto drop_pkt; 17440 } 17441 /* deliver to local raw users */ 17442 break; 17443 case IPPROTO_ENCAP: 17444 /* 17445 * Handle self-encapsulated packets (IP-in-IP where 17446 * the inner addresses == the outer addresses). 17447 */ 17448 hdr_length = IPH_HDR_LENGTH(ipha); 17449 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 17450 mp->b_wptr) { 17451 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 17452 sizeof (ipha_t) - mp->b_rptr)) { 17453 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17454 freemsg(first_mp); 17455 return; 17456 } 17457 ipha = (ipha_t *)mp->b_rptr; 17458 } 17459 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 17460 /* 17461 * Check the sanity of the inner IP header. 17462 */ 17463 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 17464 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17465 freemsg(first_mp); 17466 return; 17467 } 17468 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 17469 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17470 freemsg(first_mp); 17471 return; 17472 } 17473 if (inner_ipha->ipha_src == ipha->ipha_src && 17474 inner_ipha->ipha_dst == ipha->ipha_dst) { 17475 ipsec_in_t *ii; 17476 17477 /* 17478 * Self-encapsulated tunnel packet. Remove 17479 * the outer IP header and fanout again. 17480 * We also need to make sure that the inner 17481 * header is pulled up through any options. 17482 */ 17483 mp->b_rptr = (uchar_t *)inner_ipha; 17484 ipha = inner_ipha; 17485 hdr_length = IPH_HDR_LENGTH(ipha); 17486 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 17487 if (!pullupmsg(mp, (uchar_t *)ipha + 17488 hdr_length - mp->b_rptr)) { 17489 freemsg(first_mp); 17490 return; 17491 } 17492 ipha = (ipha_t *)mp->b_rptr; 17493 } 17494 if (hdr_length > sizeof (ipha_t)) { 17495 /* We got options on the inner packet. */ 17496 ipaddr_t dst = ipha->ipha_dst; 17497 17498 if (ip_rput_options(q, mp, ipha, &dst, ipst) == 17499 -1) { 17500 /* Bad options! */ 17501 return; 17502 } 17503 if (dst != ipha->ipha_dst) { 17504 /* 17505 * Someone put a source-route in 17506 * the inside header of a self- 17507 * encapsulated packet. Drop it 17508 * with extreme prejudice and let 17509 * the sender know. 17510 */ 17511 icmp_unreachable(q, first_mp, 17512 ICMP_SOURCE_ROUTE_FAILED, 17513 recv_ill->ill_zoneid, ipst); 17514 return; 17515 } 17516 } 17517 if (!mctl_present) { 17518 ASSERT(first_mp == mp); 17519 /* 17520 * This means that somebody is sending 17521 * Self-encapsulated packets without AH/ESP.
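
/*
 * Illustrative sketch (not part of the original source) of the two guards
 * the IPPROTO_ENCAP arm above applies before stripping an outer header: the
 * packet only counts as self-encapsulated when both inner addresses match
 * the outer ones, and a one-shot "decaps" flag (ipsec_in_decaps above) caps
 * the recursion so a stack of identical headers cannot loop the fanout.
 */
#include <stdint.h>

struct mini_iph {
	uint32_t src, dst;	/* just the fields this check needs */
};

/* Returns 1 if the outer header may be stripped, 0 otherwise. */
static int
self_encap_ok(const struct mini_iph *outer, const struct mini_iph *inner,
    int *decaps_seen)
{
	if (outer->src != inner->src || outer->dst != inner->dst)
		return (0);		/* a genuine tunnel, not self-encap */
	if (*decaps_seen)
		return (0);		/* already stripped once: drop */
	*decaps_seen = 1;
	return (1);
}
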
17522 * If AH/ESP was present, we would have already 17523 * allocated the first_mp. 17524 * 17525 * Send this packet to find a tunnel endpoint. 17526 * if I can't find one, an ICMP 17527 * PROTOCOL_UNREACHABLE will get sent. 17528 */ 17529 goto fanout; 17530 } 17531 /* 17532 * We generally store the ill_index if we need to 17533 * do IPsec processing as we lose the ill queue when 17534 * we come back. But in this case, we never should 17535 * have to store the ill_index here as it should have 17536 * been stored previously when we processed the 17537 * AH/ESP header in this routine or for non-ipsec 17538 * cases, we still have the queue. But for some bad 17539 * packets from the wire, we can get to IPsec after 17540 * this and we better store the index for that case. 17541 */ 17542 ill = (ill_t *)q->q_ptr; 17543 ii = (ipsec_in_t *)first_mp->b_rptr; 17544 ii->ipsec_in_ill_index = 17545 ill->ill_phyint->phyint_ifindex; 17546 ii->ipsec_in_rill_index = 17547 recv_ill->ill_phyint->phyint_ifindex; 17548 if (ii->ipsec_in_decaps) { 17549 /* 17550 * This packet is self-encapsulated multiple 17551 * times. We don't want to recurse infinitely. 17552 * To keep it simple, drop the packet. 17553 */ 17554 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17555 freemsg(first_mp); 17556 return; 17557 } 17558 ii->ipsec_in_decaps = B_TRUE; 17559 ip_fanout_proto_again(first_mp, recv_ill, recv_ill, 17560 ire); 17561 return; 17562 } 17563 break; 17564 case IPPROTO_AH: 17565 case IPPROTO_ESP: { 17566 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 17567 17568 /* 17569 * Fast path for AH/ESP. If this is the first time 17570 * we are sending a datagram to AH/ESP, allocate 17571 * a IPSEC_IN message and prepend it. Otherwise, 17572 * just fanout. 17573 */ 17574 17575 int ipsec_rc; 17576 ipsec_in_t *ii; 17577 netstack_t *ns = ipst->ips_netstack; 17578 17579 IP_STAT(ipst, ipsec_proto_ahesp); 17580 if (!mctl_present) { 17581 ASSERT(first_mp == mp); 17582 first_mp = ipsec_in_alloc(B_TRUE, ns); 17583 if (first_mp == NULL) { 17584 ip1dbg(("ip_proto_input: IPSEC_IN " 17585 "allocation failure.\n")); 17586 freemsg(hada_mp); /* okay ifnull */ 17587 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17588 freemsg(mp); 17589 return; 17590 } 17591 /* 17592 * Store the ill_index so that when we come back 17593 * from IPsec we ride on the same queue. 17594 */ 17595 ill = (ill_t *)q->q_ptr; 17596 ii = (ipsec_in_t *)first_mp->b_rptr; 17597 ii->ipsec_in_ill_index = 17598 ill->ill_phyint->phyint_ifindex; 17599 ii->ipsec_in_rill_index = 17600 recv_ill->ill_phyint->phyint_ifindex; 17601 first_mp->b_cont = mp; 17602 /* 17603 * Cache hardware acceleration info. 
17604 */ 17605 if (hada_mp != NULL) { 17606 IPSECHW_DEBUG(IPSECHW_PKT, 17607 ("ip_rput_local: caching data attr.\n")); 17608 ii->ipsec_in_accelerated = B_TRUE; 17609 ii->ipsec_in_da = hada_mp; 17610 hada_mp = NULL; 17611 } 17612 } else { 17613 ii = (ipsec_in_t *)first_mp->b_rptr; 17614 } 17615 17616 if (!ipsec_loaded(ipss)) { 17617 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 17618 ire->ire_zoneid, ipst); 17619 return; 17620 } 17621 17622 ns = ipst->ips_netstack; 17623 /* select inbound SA and have IPsec process the pkt */ 17624 if (ipha->ipha_protocol == IPPROTO_ESP) { 17625 esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); 17626 boolean_t esp_in_udp_sa; 17627 if (esph == NULL) 17628 return; 17629 ASSERT(ii->ipsec_in_esp_sa != NULL); 17630 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 17631 esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & 17632 IPSA_F_NATT) != 0); 17633 /* 17634 * The following is a fancy, but quick, way of saying: 17635 * ESP-in-UDP SA and Raw ESP packet --> drop 17636 * OR 17637 * ESP SA and ESP-in-UDP packet --> drop 17638 */ 17639 if (esp_in_udp_sa != esp_in_udp_packet) { 17640 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17641 ip_drop_packet(first_mp, B_TRUE, ill, NULL, 17642 DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), 17643 &ns->netstack_ipsec->ipsec_dropper); 17644 return; 17645 } 17646 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 17647 first_mp, esph); 17648 } else { 17649 ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); 17650 if (ah == NULL) 17651 return; 17652 ASSERT(ii->ipsec_in_ah_sa != NULL); 17653 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 17654 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 17655 first_mp, ah); 17656 } 17657 17658 switch (ipsec_rc) { 17659 case IPSEC_STATUS_SUCCESS: 17660 break; 17661 case IPSEC_STATUS_FAILED: 17662 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17663 /* FALLTHRU */ 17664 case IPSEC_STATUS_PENDING: 17665 return; 17666 } 17667 /* we're done with IPsec processing, send it up */ 17668 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 17669 return; 17670 } 17671 default: 17672 break; 17673 } 17674 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 17675 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 17676 ire->ire_zoneid)); 17677 goto drop_pkt; 17678 } 17679 /* 17680 * Handle protocols with which IP is less intimate. There 17681 * can be more than one stream bound to a particular 17682 * protocol. When this is the case, each one gets a copy 17683 * of any incoming packets. 17684 */ 17685 fanout: 17686 ip_fanout_proto(q, first_mp, ill, ipha, 17687 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, 17688 B_TRUE, recv_ill, ire->ire_zoneid); 17689 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17690 "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); 17691 return; 17692 17693 drop_pkt: 17694 freemsg(first_mp); 17695 if (hada_mp != NULL) 17696 freeb(hada_mp); 17697 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17698 "ip_rput_locl_end: q %p (%S)", q, "droppkt"); 17699 #undef rptr 17700 #undef iphs 17701 17702 } 17703 17704 /* 17705 * Update any source route, record route or timestamp options. 17706 * Check that we are at end of strict source route. 17707 * The options have already been checked for sanity in ip_rput_options(). 
17708 */ 17709 static boolean_t 17710 ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17711 ip_stack_t *ipst) 17712 { 17713 ipoptp_t opts; 17714 uchar_t *opt; 17715 uint8_t optval; 17716 uint8_t optlen; 17717 ipaddr_t dst; 17718 uint32_t ts; 17719 ire_t *dst_ire; 17720 timestruc_t now; 17721 zoneid_t zoneid; 17722 ill_t *ill; 17723 17724 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17725 17726 ip2dbg(("ip_rput_local_options\n")); 17727 17728 for (optval = ipoptp_first(&opts, ipha); 17729 optval != IPOPT_EOL; 17730 optval = ipoptp_next(&opts)) { 17731 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 17732 opt = opts.ipoptp_cur; 17733 optlen = opts.ipoptp_len; 17734 ip2dbg(("ip_rput_local_options: opt %d, len %d\n", 17735 optval, optlen)); 17736 switch (optval) { 17737 uint32_t off; 17738 case IPOPT_SSRR: 17739 case IPOPT_LSRR: 17740 off = opt[IPOPT_OFFSET]; 17741 off--; 17742 if (optlen < IP_ADDR_LEN || 17743 off > optlen - IP_ADDR_LEN) { 17744 /* End of source route */ 17745 ip1dbg(("ip_rput_local_options: end of SR\n")); 17746 break; 17747 } 17748 /* 17749 * This will only happen if two consecutive entries 17750 * in the source route contain our address or if 17751 * it is a packet with a loose source route which 17752 * reaches us before consuming the whole source route. 17753 */ 17754 ip1dbg(("ip_rput_local_options: not end of SR\n")); 17755 if (optval == IPOPT_SSRR) { 17756 goto bad_src_route; 17757 } 17758 /* 17759 * Hack: instead of dropping the packet, truncate the 17760 * source route to what has been used by filling the 17761 * rest with IPOPT_NOP. 17762 */ 17763 opt[IPOPT_OLEN] = (uint8_t)off; 17764 while (off < optlen) { 17765 opt[off++] = IPOPT_NOP; 17766 } 17767 break; 17768 case IPOPT_RR: 17769 off = opt[IPOPT_OFFSET]; 17770 off--; 17771 if (optlen < IP_ADDR_LEN || 17772 off > optlen - IP_ADDR_LEN) { 17773 /* No more room - ignore */ 17774 ip1dbg(( 17775 "ip_rput_local_options: end of RR\n")); 17776 break; 17777 } 17778 bcopy(&ire->ire_src_addr, (char *)opt + off, 17779 IP_ADDR_LEN); 17780 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17781 break; 17782 case IPOPT_TS: 17783 /* Insert timestamp if there is room */ 17784 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17785 case IPOPT_TS_TSONLY: 17786 off = IPOPT_TS_TIMELEN; 17787 break; 17788 case IPOPT_TS_PRESPEC: 17789 case IPOPT_TS_PRESPEC_RFC791: 17790 /* Verify that the address matched */ 17791 off = opt[IPOPT_OFFSET] - 1; 17792 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17793 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 17794 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 17795 ipst); 17796 if (dst_ire == NULL) { 17797 /* Not for us */ 17798 break; 17799 } 17800 ire_refrele(dst_ire); 17801 /* FALLTHRU */ 17802 case IPOPT_TS_TSANDADDR: 17803 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 17804 break; 17805 default: 17806 /* 17807 * ip_*put_options should have already 17808 * dropped this packet.
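
/*
 * Illustrative sketch (not part of the original source) of the truncation
 * hack above: when a loose source route still has unconsumed hops at final
 * delivery, the declared option length is cut back to the consumed portion
 * and the leftover bytes are overwritten with single-byte NOP options
 * (IPOPT_NOP, 0x01), which any later option parser skips harmlessly.
 * OPT_LEN stands in for the kernel's IPOPT_OLEN (the length byte index).
 */
#include <stdint.h>

#define OPT_LEN		1	/* option length byte index */
#define OPT_NOP		0x01	/* IPOPT_NOP */

static void
lsrr_truncate(uint8_t *opt, uint8_t optlen, uint8_t consumed_off)
{
	opt[OPT_LEN] = consumed_off;		/* shrink declared length */
	while (consumed_off < optlen)
		opt[consumed_off++] = OPT_NOP;	/* pad the remainder */
}
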
17809 */ 17810 cmn_err(CE_PANIC, "ip_rput_local_options: " 17811 "unknown IT - bug in ip_rput_options?\n"); 17812 return (B_TRUE); /* Keep "lint" happy */ 17813 } 17814 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 17815 /* Increase overflow counter */ 17816 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 17817 opt[IPOPT_POS_OV_FLG] = 17818 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 17819 (off << 4)); 17820 break; 17821 } 17822 off = opt[IPOPT_OFFSET] - 1; 17823 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17824 case IPOPT_TS_PRESPEC: 17825 case IPOPT_TS_PRESPEC_RFC791: 17826 case IPOPT_TS_TSANDADDR: 17827 bcopy(&ire->ire_src_addr, (char *)opt + off, 17828 IP_ADDR_LEN); 17829 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17830 /* FALLTHRU */ 17831 case IPOPT_TS_TSONLY: 17832 off = opt[IPOPT_OFFSET] - 1; 17833 /* Compute # of milliseconds since midnight */ 17834 gethrestime(&now); 17835 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 17836 now.tv_nsec / (NANOSEC / MILLISEC); 17837 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 17838 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 17839 break; 17840 } 17841 break; 17842 } 17843 } 17844 return (B_TRUE); 17845 17846 bad_src_route: 17847 q = WR(q); 17848 if (q->q_next != NULL) 17849 ill = q->q_ptr; 17850 else 17851 ill = NULL; 17852 17853 /* make sure we clear any indication of a hardware checksum */ 17854 DB_CKSUMFLAGS(mp) = 0; 17855 zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); 17856 if (zoneid == ALL_ZONES) 17857 freemsg(mp); 17858 else 17859 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 17860 return (B_FALSE); 17861 17862 } 17863 17864 /* 17865 * Process IP options in an inbound packet. If an option affects the 17866 * effective destination address, return the next hop address via dstp. 17867 * Returns -1 if something fails in which case an ICMP error has been sent 17868 * and mp freed. 17869 */ 17870 static int 17871 ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, 17872 ip_stack_t *ipst) 17873 { 17874 ipoptp_t opts; 17875 uchar_t *opt; 17876 uint8_t optval; 17877 uint8_t optlen; 17878 ipaddr_t dst; 17879 intptr_t code = 0; 17880 ire_t *ire = NULL; 17881 zoneid_t zoneid; 17882 ill_t *ill; 17883 17884 ip2dbg(("ip_rput_options\n")); 17885 dst = ipha->ipha_dst; 17886 for (optval = ipoptp_first(&opts, ipha); 17887 optval != IPOPT_EOL; 17888 optval = ipoptp_next(&opts)) { 17889 opt = opts.ipoptp_cur; 17890 optlen = opts.ipoptp_len; 17891 ip2dbg(("ip_rput_options: opt %d, len %d\n", 17892 optval, optlen)); 17893 /* 17894 * Note: we need to verify the checksum before we 17895 * modify anything thus this routine only extracts the next 17896 * hop dst from any source route. 
17897 */ 17898 switch (optval) { 17899 uint32_t off; 17900 case IPOPT_SSRR: 17901 case IPOPT_LSRR: 17902 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 17903 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 17904 if (ire == NULL) { 17905 if (optval == IPOPT_SSRR) { 17906 ip1dbg(("ip_rput_options: not next" 17907 " strict source route 0x%x\n", 17908 ntohl(dst))); 17909 code = (char *)&ipha->ipha_dst - 17910 (char *)ipha; 17911 goto param_prob; /* RouterReq's */ 17912 } 17913 ip2dbg(("ip_rput_options: " 17914 "not next source route 0x%x\n", 17915 ntohl(dst))); 17916 break; 17917 } 17918 ire_refrele(ire); 17919 17920 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 17921 ip1dbg(( 17922 "ip_rput_options: bad option offset\n")); 17923 code = (char *)&opt[IPOPT_OLEN] - 17924 (char *)ipha; 17925 goto param_prob; 17926 } 17927 off = opt[IPOPT_OFFSET]; 17928 off--; 17929 redo_srr: 17930 if (optlen < IP_ADDR_LEN || 17931 off > optlen - IP_ADDR_LEN) { 17932 /* End of source route */ 17933 ip1dbg(("ip_rput_options: end of SR\n")); 17934 break; 17935 } 17936 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17937 ip1dbg(("ip_rput_options: next hop 0x%x\n", 17938 ntohl(dst))); 17939 17940 /* 17941 * Check if our address is present more than 17942 * once as consecutive hops in source route. 17943 * XXX verify per-interface ip_forwarding 17944 * for source route? 17945 */ 17946 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 17947 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 17948 17949 if (ire != NULL) { 17950 ire_refrele(ire); 17951 off += IP_ADDR_LEN; 17952 goto redo_srr; 17953 } 17954 17955 if (dst == htonl(INADDR_LOOPBACK)) { 17956 ip1dbg(("ip_rput_options: loopback addr in " 17957 "source route!\n")); 17958 goto bad_src_route; 17959 } 17960 /* 17961 * For strict: verify that dst is directly 17962 * reachable. 17963 */ 17964 if (optval == IPOPT_SSRR) { 17965 ire = ire_ftable_lookup(dst, 0, 0, 17966 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 17967 MBLK_GETLABEL(mp), 17968 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 17969 if (ire == NULL) { 17970 ip1dbg(("ip_rput_options: SSRR not " 17971 "directly reachable: 0x%x\n", 17972 ntohl(dst))); 17973 goto bad_src_route; 17974 } 17975 ire_refrele(ire); 17976 } 17977 /* 17978 * Defer update of the offset and the record route 17979 * until the packet is forwarded. 17980 */ 17981 break; 17982 case IPOPT_RR: 17983 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 17984 ip1dbg(( 17985 "ip_rput_options: bad option offset\n")); 17986 code = (char *)&opt[IPOPT_OLEN] - 17987 (char *)ipha; 17988 goto param_prob; 17989 } 17990 break; 17991 case IPOPT_TS: 17992 /* 17993 * Verify that length >= 5 and that there is either 17994 * room for another timestamp or that the overflow 17995 * counter is not maxed out. 
17996 */ 17997 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 17998 if (optlen < IPOPT_MINLEN_IT) { 17999 goto param_prob; 18000 } 18001 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18002 ip1dbg(( 18003 "ip_rput_options: bad option offset\n")); 18004 code = (char *)&opt[IPOPT_OFFSET] - 18005 (char *)ipha; 18006 goto param_prob; 18007 } 18008 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 18009 case IPOPT_TS_TSONLY: 18010 off = IPOPT_TS_TIMELEN; 18011 break; 18012 case IPOPT_TS_TSANDADDR: 18013 case IPOPT_TS_PRESPEC: 18014 case IPOPT_TS_PRESPEC_RFC791: 18015 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 18016 break; 18017 default: 18018 code = (char *)&opt[IPOPT_POS_OV_FLG] - 18019 (char *)ipha; 18020 goto param_prob; 18021 } 18022 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 18023 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 18024 /* 18025 * No room and the overflow counter is 15 18026 * already. 18027 */ 18028 goto param_prob; 18029 } 18030 break; 18031 } 18032 } 18033 18034 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { 18035 *dstp = dst; 18036 return (0); 18037 } 18038 18039 ip1dbg(("ip_rput_options: error processing IP options.")); 18040 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 18041 18042 param_prob: 18043 q = WR(q); 18044 if (q->q_next != NULL) 18045 ill = q->q_ptr; 18046 else 18047 ill = NULL; 18048 18049 /* make sure we clear any indication of a hardware checksum */ 18050 DB_CKSUMFLAGS(mp) = 0; 18051 /* Don't know whether this is for non-global or global/forwarding */ 18052 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18053 if (zoneid == ALL_ZONES) 18054 freemsg(mp); 18055 else 18056 icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst); 18057 return (-1); 18058 18059 bad_src_route: 18060 q = WR(q); 18061 if (q->q_next != NULL) 18062 ill = q->q_ptr; 18063 else 18064 ill = NULL; 18065 18066 /* make sure we clear any indication of a hardware checksum */ 18067 DB_CKSUMFLAGS(mp) = 0; 18068 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18069 if (zoneid == ALL_ZONES) 18070 freemsg(mp); 18071 else 18072 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 18073 return (-1); 18074 } 18075 18076 /* 18077 * IP & ICMP info in >=14 msg's ... 18078 * - ip fixed part (mib2_ip_t) 18079 * - icmp fixed part (mib2_icmp_t) 18080 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 18081 * - ipRouteEntryTable (ip 21) all IPv4 IREs 18082 * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] 18083 * - ipRouteAttributeTable (ip 102) labeled routes 18084 * - ip multicast membership (ip_member_t) 18085 * - ip multicast source filtering (ip_grpsrc_t) 18086 * - igmp fixed part (struct igmpstat) 18087 * - multicast routing stats (struct mrtstat) 18088 * - multicast routing vifs (array of struct vifctl) 18089 * - multicast routing routes (array of struct mfcctl) 18090 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 18091 * One per ill plus one generic 18092 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 18093 * One per ill plus one generic 18094 * - ipv6RouteEntry all IPv6 IREs 18095 * - ipv6RouteAttributeTable (ip6 102) labeled routes 18096 * - ipv6NetToMediaEntry all Neighbor Cache entries 18097 * - ipv6AddrEntry all IPv6 ipifs 18098 * - ipv6 multicast membership (ipv6_member_t) 18099 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 18100 * 18101 * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. 18102 * 18103 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 18104 * already filled in by the caller. 
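
/*
 * Illustrative sketch (not part of the original source) of the contract the
 * ip_snmp_get_mib2_* helpers below all follow: copy the caller's control
 * message first, fill in and qreply() the original as this level's reply,
 * and hand the copy back so the next helper in the chain has a template.
 * A NULL return means the copy failed and the dispatch chain must stop.
 * msg_t, msg_copy() and msg_fill_and_reply() are hypothetical stand-ins
 * for mblk_t, copymsg() and the fill-plus-qreply() step.
 */
typedef struct msg msg_t;			/* stands in for mblk_t */
extern msg_t *msg_copy(const msg_t *);		/* stands in for copymsg() */
extern void msg_fill_and_reply(msg_t *, int);	/* fill + qreply() */

static msg_t *
snmp_level_get(msg_t *mpctl, int level)
{
	msg_t *next = msg_copy(mpctl);	/* template for the next level */

	msg_fill_and_reply(mpctl, level);	/* consumes mpctl */
	return (next);			/* NULL stops the dispatch chain */
}
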
18105 * Return value of 0 indicates that no messages were sent and caller 18106 * should free mpctl. 18107 */ 18108 int 18109 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) 18110 { 18111 ip_stack_t *ipst; 18112 sctp_stack_t *sctps; 18113 18114 if (q->q_next != NULL) { 18115 ipst = ILLQ_TO_IPST(q); 18116 } else { 18117 ipst = CONNQ_TO_IPST(q); 18118 } 18119 ASSERT(ipst != NULL); 18120 sctps = ipst->ips_netstack->netstack_sctp; 18121 18122 if (mpctl == NULL || mpctl->b_cont == NULL) { 18123 return (0); 18124 } 18125 18126 /* 18127 * For the purposes of the (broken) packet shell use 18128 * of the level we make sure MIB2_TCP/MIB2_UDP can be used 18129 * to make TCP and UDP appear first in the list of mib items. 18130 * TBD: We could expand this and use it in netstat so that 18131 * the kernel doesn't have to produce large tables (connections, 18132 * routes, etc) when netstat only wants the statistics or a particular 18133 * table. 18134 */ 18135 if (!(level == MIB2_TCP || level == MIB2_UDP)) { 18136 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) { 18137 return (1); 18138 } 18139 } 18140 18141 if (level != MIB2_TCP) { 18142 if ((mpctl = udp_snmp_get(q, mpctl)) == NULL) { 18143 return (1); 18144 } 18145 } 18146 18147 if (level != MIB2_UDP) { 18148 if ((mpctl = tcp_snmp_get(q, mpctl)) == NULL) { 18149 return (1); 18150 } 18151 } 18152 18153 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl, 18154 ipst)) == NULL) { 18155 return (1); 18156 } 18157 18158 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst)) == NULL) { 18159 return (1); 18160 } 18161 18162 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) { 18163 return (1); 18164 } 18165 18166 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) { 18167 return (1); 18168 } 18169 18170 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) { 18171 return (1); 18172 } 18173 18174 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) { 18175 return (1); 18176 } 18177 18178 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst)) == NULL) { 18179 return (1); 18180 } 18181 18182 if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst)) == NULL) { 18183 return (1); 18184 } 18185 18186 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) { 18187 return (1); 18188 } 18189 18190 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) { 18191 return (1); 18192 } 18193 18194 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) { 18195 return (1); 18196 } 18197 18198 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) { 18199 return (1); 18200 } 18201 18202 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) { 18203 return (1); 18204 } 18205 18206 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) { 18207 return (1); 18208 } 18209 18210 if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { 18211 return (1); 18212 } 18213 18214 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); 18215 if (mpctl == NULL) { 18216 return (1); 18217 } 18218 18219 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { 18220 return (1); 18221 } 18222 freemsg(mpctl); 18223 return (1); 18224 } 18225 18226 18227 /* Get global (legacy) IPv4 statistics */ 18228 static mblk_t * 18229 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, 18230 ip_stack_t *ipst) 18231 { 18232 mib2_ip_t old_ip_mib; 18233 struct opthdr *optp; 18234 mblk_t *mp2ctl; 18235 18236 /* 18237 * make a copy of the original message 
18238 */ 18239 mp2ctl = copymsg(mpctl); 18240 18241 /* fixed length IP structure... */ 18242 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18243 optp->level = MIB2_IP; 18244 optp->name = 0; 18245 SET_MIB(old_ip_mib.ipForwarding, 18246 (WE_ARE_FORWARDING(ipst) ? 1 : 2)); 18247 SET_MIB(old_ip_mib.ipDefaultTTL, 18248 (uint32_t)ipst->ips_ip_def_ttl); 18249 SET_MIB(old_ip_mib.ipReasmTimeout, 18250 ipst->ips_ip_g_frag_timeout); 18251 SET_MIB(old_ip_mib.ipAddrEntrySize, 18252 sizeof (mib2_ipAddrEntry_t)); 18253 SET_MIB(old_ip_mib.ipRouteEntrySize, 18254 sizeof (mib2_ipRouteEntry_t)); 18255 SET_MIB(old_ip_mib.ipNetToMediaEntrySize, 18256 sizeof (mib2_ipNetToMediaEntry_t)); 18257 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t)); 18258 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t)); 18259 SET_MIB(old_ip_mib.ipRouteAttributeSize, 18260 sizeof (mib2_ipAttributeEntry_t)); 18261 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); 18262 18263 /* 18264 * Grab the statistics from the new IP MIB 18265 */ 18266 SET_MIB(old_ip_mib.ipInReceives, 18267 (uint32_t)ipmib->ipIfStatsHCInReceives); 18268 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors); 18269 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors); 18270 SET_MIB(old_ip_mib.ipForwDatagrams, 18271 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams); 18272 SET_MIB(old_ip_mib.ipInUnknownProtos, 18273 ipmib->ipIfStatsInUnknownProtos); 18274 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards); 18275 SET_MIB(old_ip_mib.ipInDelivers, 18276 (uint32_t)ipmib->ipIfStatsHCInDelivers); 18277 SET_MIB(old_ip_mib.ipOutRequests, 18278 (uint32_t)ipmib->ipIfStatsHCOutRequests); 18279 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards); 18280 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes); 18281 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds); 18282 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs); 18283 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails); 18284 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs); 18285 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails); 18286 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates); 18287 18288 /* ipRoutingDiscards is not being used */ 18289 SET_MIB(old_ip_mib.ipRoutingDiscards, 0); 18290 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs); 18291 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts); 18292 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs); 18293 SET_MIB(old_ip_mib.ipReasmDuplicates, 18294 ipmib->ipIfStatsReasmDuplicates); 18295 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups); 18296 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits); 18297 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs); 18298 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows); 18299 SET_MIB(old_ip_mib.rawipInOverflows, 18300 ipmib->rawipIfStatsInOverflows); 18301 18302 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded); 18303 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed); 18304 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion); 18305 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion); 18306 SET_MIB(old_ip_mib.ipOutSwitchIPv6, 18307 ipmib->ipIfStatsOutSwitchIPVersion); 18308 18309 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib, 18310 (int)sizeof (old_ip_mib))) { 18311 ip1dbg(("ip_snmp_get_mib2_ip: 
failed to allocate %u bytes\n", 18312 (uint_t)sizeof (old_ip_mib))); 18313 } 18314 18315 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18316 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n", 18317 (int)optp->level, (int)optp->name, (int)optp->len)); 18318 qreply(q, mpctl); 18319 return (mp2ctl); 18320 } 18321 18322 /* Per interface IPv4 statistics */ 18323 static mblk_t * 18324 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18325 { 18326 struct opthdr *optp; 18327 mblk_t *mp2ctl; 18328 ill_t *ill; 18329 ill_walk_context_t ctx; 18330 mblk_t *mp_tail = NULL; 18331 mib2_ipIfStatsEntry_t global_ip_mib; 18332 18333 /* 18334 * Make a copy of the original message 18335 */ 18336 mp2ctl = copymsg(mpctl); 18337 18338 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18339 optp->level = MIB2_IP; 18340 optp->name = MIB2_IP_TRAFFIC_STATS; 18341 /* Include "unknown interface" ip_mib */ 18342 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 18343 ipst->ips_ip_mib.ipIfStatsIfIndex = 18344 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 18345 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding, 18346 (ipst->ips_ip_g_forward ? 1 : 2)); 18347 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL, 18348 (uint32_t)ipst->ips_ip_def_ttl); 18349 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize, 18350 sizeof (mib2_ipIfStatsEntry_t)); 18351 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize, 18352 sizeof (mib2_ipAddrEntry_t)); 18353 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize, 18354 sizeof (mib2_ipRouteEntry_t)); 18355 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize, 18356 sizeof (mib2_ipNetToMediaEntry_t)); 18357 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize, 18358 sizeof (ip_member_t)); 18359 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize, 18360 sizeof (ip_grpsrc_t)); 18361 18362 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18363 (char *)&ipst->ips_ip_mib, (int)sizeof (ipst->ips_ip_mib))) { 18364 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18365 "failed to allocate %u bytes\n", 18366 (uint_t)sizeof (ipst->ips_ip_mib))); 18367 } 18368 18369 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib)); 18370 18371 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18372 ill = ILL_START_WALK_V4(&ctx, ipst); 18373 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18374 ill->ill_ip_mib->ipIfStatsIfIndex = 18375 ill->ill_phyint->phyint_ifindex; 18376 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 18377 (ipst->ips_ip_g_forward ? 
1 : 2)); 18378 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL, 18379 (uint32_t)ipst->ips_ip_def_ttl); 18380 18381 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib); 18382 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18383 (char *)ill->ill_ip_mib, 18384 (int)sizeof (*ill->ill_ip_mib))) { 18385 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18386 "failed to allocate %u bytes\n", 18387 (uint_t)sizeof (*ill->ill_ip_mib))); 18388 } 18389 } 18390 rw_exit(&ipst->ips_ill_g_lock); 18391 18392 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18393 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18394 "level %d, name %d, len %d\n", 18395 (int)optp->level, (int)optp->name, (int)optp->len)); 18396 qreply(q, mpctl); 18397 18398 if (mp2ctl == NULL) 18399 return (NULL); 18400 18401 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst)); 18402 } 18403 18404 /* Global IPv4 ICMP statistics */ 18405 static mblk_t * 18406 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18407 { 18408 struct opthdr *optp; 18409 mblk_t *mp2ctl; 18410 18411 /* 18412 * Make a copy of the original message 18413 */ 18414 mp2ctl = copymsg(mpctl); 18415 18416 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18417 optp->level = MIB2_ICMP; 18418 optp->name = 0; 18419 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib, 18420 (int)sizeof (ipst->ips_icmp_mib))) { 18421 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n", 18422 (uint_t)sizeof (ipst->ips_icmp_mib))); 18423 } 18424 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18425 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n", 18426 (int)optp->level, (int)optp->name, (int)optp->len)); 18427 qreply(q, mpctl); 18428 return (mp2ctl); 18429 } 18430 18431 /* Global IPv4 IGMP statistics */ 18432 static mblk_t * 18433 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18434 { 18435 struct opthdr *optp; 18436 mblk_t *mp2ctl; 18437 18438 /* 18439 * make a copy of the original message 18440 */ 18441 mp2ctl = copymsg(mpctl); 18442 18443 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18444 optp->level = EXPER_IGMP; 18445 optp->name = 0; 18446 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat, 18447 (int)sizeof (ipst->ips_igmpstat))) { 18448 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n", 18449 (uint_t)sizeof (ipst->ips_igmpstat))); 18450 } 18451 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18452 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n", 18453 (int)optp->level, (int)optp->name, (int)optp->len)); 18454 qreply(q, mpctl); 18455 return (mp2ctl); 18456 } 18457 18458 /* Global IPv4 Multicast Routing statistics */ 18459 static mblk_t * 18460 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18461 { 18462 struct opthdr *optp; 18463 mblk_t *mp2ctl; 18464 18465 /* 18466 * make a copy of the original message 18467 */ 18468 mp2ctl = copymsg(mpctl); 18469 18470 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18471 optp->level = EXPER_DVMRP; 18472 optp->name = 0; 18473 if (!ip_mroute_stats(mpctl->b_cont, ipst)) { 18474 ip0dbg(("ip_mroute_stats: failed\n")); 18475 } 18476 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18477 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n", 18478 (int)optp->level, (int)optp->name, (int)optp->len)); 18479 qreply(q, mpctl); 18480 return (mp2ctl); 18481 } 18482 18483 /* IPv4 address information */ 18484 static mblk_t * 18485 
ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18486 { 18487 struct opthdr *optp; 18488 mblk_t *mp2ctl; 18489 mblk_t *mp_tail = NULL; 18490 ill_t *ill; 18491 ipif_t *ipif; 18492 uint_t bitval; 18493 mib2_ipAddrEntry_t mae; 18494 zoneid_t zoneid; 18495 ill_walk_context_t ctx; 18496 18497 /* 18498 * make a copy of the original message 18499 */ 18500 mp2ctl = copymsg(mpctl); 18501 18502 /* ipAddrEntryTable */ 18503 18504 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18505 optp->level = MIB2_IP; 18506 optp->name = MIB2_IP_ADDR; 18507 zoneid = Q_TO_CONN(q)->conn_zoneid; 18508 18509 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18510 ill = ILL_START_WALK_V4(&ctx, ipst); 18511 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18512 for (ipif = ill->ill_ipif; ipif != NULL; 18513 ipif = ipif->ipif_next) { 18514 if (ipif->ipif_zoneid != zoneid && 18515 ipif->ipif_zoneid != ALL_ZONES) 18516 continue; 18517 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18518 mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18519 mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18520 18521 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, 18522 OCTET_LENGTH); 18523 mae.ipAdEntIfIndex.o_length = 18524 mi_strlen(mae.ipAdEntIfIndex.o_bytes); 18525 mae.ipAdEntAddr = ipif->ipif_lcl_addr; 18526 mae.ipAdEntNetMask = ipif->ipif_net_mask; 18527 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; 18528 mae.ipAdEntInfo.ae_subnet_len = 18529 ip_mask_to_plen(ipif->ipif_net_mask); 18530 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; 18531 for (bitval = 1; 18532 bitval && 18533 !(bitval & ipif->ipif_brd_addr); 18534 bitval <<= 1) 18535 noop; 18536 mae.ipAdEntBcastAddr = bitval; 18537 mae.ipAdEntReasmMaxSize = IP_MAXPACKET; 18538 mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; 18539 mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; 18540 mae.ipAdEntInfo.ae_broadcast_addr = 18541 ipif->ipif_brd_addr; 18542 mae.ipAdEntInfo.ae_pp_dst_addr = 18543 ipif->ipif_pp_dst_addr; 18544 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | 18545 ill->ill_flags | ill->ill_phyint->phyint_flags; 18546 mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; 18547 18548 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18549 (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { 18550 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to " 18551 "allocate %u bytes\n", 18552 (uint_t)sizeof (mib2_ipAddrEntry_t))); 18553 } 18554 } 18555 } 18556 rw_exit(&ipst->ips_ill_g_lock); 18557 18558 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18559 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n", 18560 (int)optp->level, (int)optp->name, (int)optp->len)); 18561 qreply(q, mpctl); 18562 return (mp2ctl); 18563 } 18564 18565 /* IPv6 address information */ 18566 static mblk_t * 18567 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18568 { 18569 struct opthdr *optp; 18570 mblk_t *mp2ctl; 18571 mblk_t *mp_tail = NULL; 18572 ill_t *ill; 18573 ipif_t *ipif; 18574 mib2_ipv6AddrEntry_t mae6; 18575 zoneid_t zoneid; 18576 ill_walk_context_t ctx; 18577 18578 /* 18579 * make a copy of the original message 18580 */ 18581 mp2ctl = copymsg(mpctl); 18582 18583 /* ipv6AddrEntryTable */ 18584 18585 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18586 optp->level = MIB2_IP6; 18587 optp->name = MIB2_IP6_ADDR; 18588 zoneid = Q_TO_CONN(q)->conn_zoneid; 18589 18590 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18591 ill = ILL_START_WALK_V6(&ctx, ipst); 18592 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 
18593 for (ipif = ill->ill_ipif; ipif != NULL; 18594 ipif = ipif->ipif_next) { 18595 if (ipif->ipif_zoneid != zoneid && 18596 ipif->ipif_zoneid != ALL_ZONES) 18597 continue; 18598 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18599 mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18600 mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18601 18602 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, 18603 OCTET_LENGTH); 18604 mae6.ipv6AddrIfIndex.o_length = 18605 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes); 18606 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr; 18607 mae6.ipv6AddrPfxLength = 18608 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 18609 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; 18610 mae6.ipv6AddrInfo.ae_subnet_len = 18611 mae6.ipv6AddrPfxLength; 18612 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; 18613 18614 /* Type: stateless(1), stateful(2), unknown(3) */ 18615 if (ipif->ipif_flags & IPIF_ADDRCONF) 18616 mae6.ipv6AddrType = 1; 18617 else 18618 mae6.ipv6AddrType = 2; 18619 /* Anycast: true(1), false(2) */ 18620 if (ipif->ipif_flags & IPIF_ANYCAST) 18621 mae6.ipv6AddrAnycastFlag = 1; 18622 else 18623 mae6.ipv6AddrAnycastFlag = 2; 18624 18625 /* 18626 * Address status: preferred(1), deprecated(2), 18627 * invalid(3), inaccessible(4), unknown(5) 18628 */ 18629 if (ipif->ipif_flags & IPIF_NOLOCAL) 18630 mae6.ipv6AddrStatus = 3; 18631 else if (ipif->ipif_flags & IPIF_DEPRECATED) 18632 mae6.ipv6AddrStatus = 2; 18633 else 18634 mae6.ipv6AddrStatus = 1; 18635 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; 18636 mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; 18637 mae6.ipv6AddrInfo.ae_pp_dst_addr = 18638 ipif->ipif_v6pp_dst_addr; 18639 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags | 18640 ill->ill_flags | ill->ill_phyint->phyint_flags; 18641 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET; 18642 mae6.ipv6AddrIdentifier = ill->ill_token; 18643 mae6.ipv6AddrIdentifierLen = ill->ill_token_length; 18644 mae6.ipv6AddrReachableTime = ill->ill_reachable_time; 18645 mae6.ipv6AddrRetransmitTime = 18646 ill->ill_reachable_retrans_time; 18647 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18648 (char *)&mae6, 18649 (int)sizeof (mib2_ipv6AddrEntry_t))) { 18650 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to " 18651 "allocate %u bytes\n", 18652 (uint_t)sizeof (mib2_ipv6AddrEntry_t))); 18653 } 18654 } 18655 } 18656 rw_exit(&ipst->ips_ill_g_lock); 18657 18658 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18659 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n", 18660 (int)optp->level, (int)optp->name, (int)optp->len)); 18661 qreply(q, mpctl); 18662 return (mp2ctl); 18663 } 18664 18665 /* IPv4 multicast group membership. 
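 * Walks every IPv4 ill; for each ipif visible in the caller's zone it
 * emits one ip_member_t per ilm bound to that ipif.  ILM_WALKER_HOLD
 * keeps the ill's ilm list stable for the duration of the traversal.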
*/ 18666 static mblk_t * 18667 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18668 { 18669 struct opthdr *optp; 18670 mblk_t *mp2ctl; 18671 ill_t *ill; 18672 ipif_t *ipif; 18673 ilm_t *ilm; 18674 ip_member_t ipm; 18675 mblk_t *mp_tail = NULL; 18676 ill_walk_context_t ctx; 18677 zoneid_t zoneid; 18678 18679 /* 18680 * make a copy of the original message 18681 */ 18682 mp2ctl = copymsg(mpctl); 18683 zoneid = Q_TO_CONN(q)->conn_zoneid; 18684 18685 /* ipGroupMember table */ 18686 optp = (struct opthdr *)&mpctl->b_rptr[ 18687 sizeof (struct T_optmgmt_ack)]; 18688 optp->level = MIB2_IP; 18689 optp->name = EXPER_IP_GROUP_MEMBERSHIP; 18690 18691 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18692 ill = ILL_START_WALK_V4(&ctx, ipst); 18693 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18694 ILM_WALKER_HOLD(ill); 18695 for (ipif = ill->ill_ipif; ipif != NULL; 18696 ipif = ipif->ipif_next) { 18697 if (ipif->ipif_zoneid != zoneid && 18698 ipif->ipif_zoneid != ALL_ZONES) 18699 continue; /* not this zone */ 18700 ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, 18701 OCTET_LENGTH); 18702 ipm.ipGroupMemberIfIndex.o_length = 18703 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); 18704 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18705 ASSERT(ilm->ilm_ipif != NULL); 18706 ASSERT(ilm->ilm_ill == NULL); 18707 if (ilm->ilm_ipif != ipif) 18708 continue; 18709 ipm.ipGroupMemberAddress = ilm->ilm_addr; 18710 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; 18711 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; 18712 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18713 (char *)&ipm, (int)sizeof (ipm))) { 18714 ip1dbg(("ip_snmp_get_mib2_ip_group: " 18715 "failed to allocate %u bytes\n", 18716 (uint_t)sizeof (ipm))); 18717 } 18718 } 18719 } 18720 ILM_WALKER_RELE(ill); 18721 } 18722 rw_exit(&ipst->ips_ill_g_lock); 18723 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18724 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18725 (int)optp->level, (int)optp->name, (int)optp->len)); 18726 qreply(q, mpctl); 18727 return (mp2ctl); 18728 } 18729 18730 /* IPv6 multicast group membership. 
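 * Same shape as the IPv4 walker above, except that IPv6 memberships
 * hang off the ill itself (ilm_ill set, ilm_ipif NULL), so the zone
 * filter is applied per ilm rather than per ipif.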
*/ 18731 static mblk_t * 18732 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18733 { 18734 struct opthdr *optp; 18735 mblk_t *mp2ctl; 18736 ill_t *ill; 18737 ilm_t *ilm; 18738 ipv6_member_t ipm6; 18739 mblk_t *mp_tail = NULL; 18740 ill_walk_context_t ctx; 18741 zoneid_t zoneid; 18742 18743 /* 18744 * make a copy of the original message 18745 */ 18746 mp2ctl = copymsg(mpctl); 18747 zoneid = Q_TO_CONN(q)->conn_zoneid; 18748 18749 /* ip6GroupMember table */ 18750 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18751 optp->level = MIB2_IP6; 18752 optp->name = EXPER_IP6_GROUP_MEMBERSHIP; 18753 18754 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18755 ill = ILL_START_WALK_V6(&ctx, ipst); 18756 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18757 ILM_WALKER_HOLD(ill); 18758 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; 18759 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18760 ASSERT(ilm->ilm_ipif == NULL); 18761 ASSERT(ilm->ilm_ill != NULL); 18762 if (ilm->ilm_zoneid != zoneid) 18763 continue; /* not this zone */ 18764 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; 18765 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; 18766 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode; 18767 if (!snmp_append_data2(mpctl->b_cont, 18768 &mp_tail, 18769 (char *)&ipm6, (int)sizeof (ipm6))) { 18770 ip1dbg(("ip_snmp_get_mib2_ip6_group: " 18771 "failed to allocate %u bytes\n", 18772 (uint_t)sizeof (ipm6))); 18773 } 18774 } 18775 ILM_WALKER_RELE(ill); 18776 } 18777 rw_exit(&ipst->ips_ill_g_lock); 18778 18779 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18780 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18781 (int)optp->level, (int)optp->name, (int)optp->len)); 18782 qreply(q, mpctl); 18783 return (mp2ctl); 18784 } 18785 18786 /* IP multicast filtered sources */ 18787 static mblk_t * 18788 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18789 { 18790 struct opthdr *optp; 18791 mblk_t *mp2ctl; 18792 ill_t *ill; 18793 ipif_t *ipif; 18794 ilm_t *ilm; 18795 ip_grpsrc_t ips; 18796 mblk_t *mp_tail = NULL; 18797 ill_walk_context_t ctx; 18798 zoneid_t zoneid; 18799 int i; 18800 slist_t *sl; 18801 18802 /* 18803 * make a copy of the original message 18804 */ 18805 mp2ctl = copymsg(mpctl); 18806 zoneid = Q_TO_CONN(q)->conn_zoneid; 18807 18808 /* ipGroupSource table */ 18809 optp = (struct opthdr *)&mpctl->b_rptr[ 18810 sizeof (struct T_optmgmt_ack)]; 18811 optp->level = MIB2_IP; 18812 optp->name = EXPER_IP_GROUP_SOURCES; 18813 18814 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18815 ill = ILL_START_WALK_V4(&ctx, ipst); 18816 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18817 ILM_WALKER_HOLD(ill); 18818 for (ipif = ill->ill_ipif; ipif != NULL; 18819 ipif = ipif->ipif_next) { 18820 if (ipif->ipif_zoneid != zoneid) 18821 continue; /* not this zone */ 18822 ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, 18823 OCTET_LENGTH); 18824 ips.ipGroupSourceIfIndex.o_length = 18825 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); 18826 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18827 ASSERT(ilm->ilm_ipif != NULL); 18828 ASSERT(ilm->ilm_ill == NULL); 18829 sl = ilm->ilm_filter; 18830 if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) 18831 continue; 18832 ips.ipGroupSourceGroup = ilm->ilm_addr; 18833 for (i = 0; i < sl->sl_numsrc; i++) { 18834 if (!IN6_IS_ADDR_V4MAPPED( 18835 &sl->sl_addr[i])) 18836 continue; 18837 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], 18838 ips.ipGroupSourceAddress); 18839 if 
(snmp_append_data2(mpctl->b_cont, 18840 &mp_tail, (char *)&ips, 18841 (int)sizeof (ips)) == 0) { 18842 ip1dbg(("ip_snmp_get_mib2_" 18843 "ip_group_src: failed to " 18844 "allocate %u bytes\n", 18845 (uint_t)sizeof (ips))); 18846 } 18847 } 18848 } 18849 } 18850 ILM_WALKER_RELE(ill); 18851 } 18852 rw_exit(&ipst->ips_ill_g_lock); 18853 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18854 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18855 (int)optp->level, (int)optp->name, (int)optp->len)); 18856 qreply(q, mpctl); 18857 return (mp2ctl); 18858 } 18859 18860 /* IPv6 multicast filtered sources. */ 18861 static mblk_t * 18862 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18863 { 18864 struct opthdr *optp; 18865 mblk_t *mp2ctl; 18866 ill_t *ill; 18867 ilm_t *ilm; 18868 ipv6_grpsrc_t ips6; 18869 mblk_t *mp_tail = NULL; 18870 ill_walk_context_t ctx; 18871 zoneid_t zoneid; 18872 int i; 18873 slist_t *sl; 18874 18875 /* 18876 * make a copy of the original message 18877 */ 18878 mp2ctl = copymsg(mpctl); 18879 zoneid = Q_TO_CONN(q)->conn_zoneid; 18880 18881 /* ip6GroupMember table */ 18882 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18883 optp->level = MIB2_IP6; 18884 optp->name = EXPER_IP6_GROUP_SOURCES; 18885 18886 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18887 ill = ILL_START_WALK_V6(&ctx, ipst); 18888 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18889 ILM_WALKER_HOLD(ill); 18890 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; 18891 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18892 ASSERT(ilm->ilm_ipif == NULL); 18893 ASSERT(ilm->ilm_ill != NULL); 18894 sl = ilm->ilm_filter; 18895 if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) 18896 continue; 18897 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; 18898 for (i = 0; i < sl->sl_numsrc; i++) { 18899 ips6.ipv6GroupSourceAddress = sl->sl_addr[i]; 18900 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18901 (char *)&ips6, (int)sizeof (ips6))) { 18902 ip1dbg(("ip_snmp_get_mib2_ip6_" 18903 "group_src: failed to allocate " 18904 "%u bytes\n", 18905 (uint_t)sizeof (ips6))); 18906 } 18907 } 18908 } 18909 ILM_WALKER_RELE(ill); 18910 } 18911 rw_exit(&ipst->ips_ill_g_lock); 18912 18913 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18914 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18915 (int)optp->level, (int)optp->name, (int)optp->len)); 18916 qreply(q, mpctl); 18917 return (mp2ctl); 18918 } 18919 18920 /* Multicast routing virtual interface table. */ 18921 static mblk_t * 18922 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18923 { 18924 struct opthdr *optp; 18925 mblk_t *mp2ctl; 18926 18927 /* 18928 * make a copy of the original message 18929 */ 18930 mp2ctl = copymsg(mpctl); 18931 18932 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18933 optp->level = EXPER_DVMRP; 18934 optp->name = EXPER_DVMRP_VIF; 18935 if (!ip_mroute_vif(mpctl->b_cont, ipst)) { 18936 ip0dbg(("ip_mroute_vif: failed\n")); 18937 } 18938 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18939 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n", 18940 (int)optp->level, (int)optp->name, (int)optp->len)); 18941 qreply(q, mpctl); 18942 return (mp2ctl); 18943 } 18944 18945 /* Multicast routing table. 
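 * As with the virtual interface table above, the table body is produced
 * by the multicast routing code (ip_mroute_mrt()) under the experimental
 * EXPER_DVMRP level; netstat -M is a typical consumer.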
*/ 18946 static mblk_t * 18947 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18948 { 18949 struct opthdr *optp; 18950 mblk_t *mp2ctl; 18951 18952 /* 18953 * make a copy of the original message 18954 */ 18955 mp2ctl = copymsg(mpctl); 18956 18957 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18958 optp->level = EXPER_DVMRP; 18959 optp->name = EXPER_DVMRP_MRT; 18960 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) { 18961 ip0dbg(("ip_mroute_mrt: failed\n")); 18962 } 18963 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18964 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n", 18965 (int)optp->level, (int)optp->name, (int)optp->len)); 18966 qreply(q, mpctl); 18967 return (mp2ctl); 18968 } 18969 18970 /* 18971 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable 18972 * in one IRE walk. 18973 */ 18974 static mblk_t * 18975 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18976 { 18977 struct opthdr *optp; 18978 mblk_t *mp2ctl; /* Returned */ 18979 mblk_t *mp3ctl; /* nettomedia */ 18980 mblk_t *mp4ctl; /* routeattrs */ 18981 iproutedata_t ird; 18982 zoneid_t zoneid; 18983 18984 /* 18985 * make copies of the original message 18986 * - mp2ctl is returned unchanged to the caller for his use 18987 * - mpctl is sent upstream as ipRouteEntryTable 18988 * - mp3ctl is sent upstream as ipNetToMediaEntryTable 18989 * - mp4ctl is sent upstream as ipRouteAttributeTable 18990 */ 18991 mp2ctl = copymsg(mpctl); 18992 mp3ctl = copymsg(mpctl); 18993 mp4ctl = copymsg(mpctl); 18994 if (mp3ctl == NULL || mp4ctl == NULL) { 18995 freemsg(mp4ctl); 18996 freemsg(mp3ctl); 18997 freemsg(mp2ctl); 18998 freemsg(mpctl); 18999 return (NULL); 19000 } 19001 19002 bzero(&ird, sizeof (ird)); 19003 19004 ird.ird_route.lp_head = mpctl->b_cont; 19005 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19006 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19007 19008 zoneid = Q_TO_CONN(q)->conn_zoneid; 19009 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); 19010 19011 /* ipRouteEntryTable in mpctl */ 19012 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19013 optp->level = MIB2_IP; 19014 optp->name = MIB2_IP_ROUTE; 19015 optp->len = msgdsize(ird.ird_route.lp_head); 19016 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19017 (int)optp->level, (int)optp->name, (int)optp->len)); 19018 qreply(q, mpctl); 19019 19020 /* ipNetToMediaEntryTable in mp3ctl */ 19021 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19022 optp->level = MIB2_IP; 19023 optp->name = MIB2_IP_MEDIA; 19024 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19025 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19026 (int)optp->level, (int)optp->name, (int)optp->len)); 19027 qreply(q, mp3ctl); 19028 19029 /* ipRouteAttributeTable in mp4ctl */ 19030 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19031 optp->level = MIB2_IP; 19032 optp->name = EXPER_IP_RTATTR; 19033 optp->len = msgdsize(ird.ird_attrs.lp_head); 19034 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19035 (int)optp->level, (int)optp->name, (int)optp->len)); 19036 if (optp->len == 0) 19037 freemsg(mp4ctl); 19038 else 19039 qreply(q, mp4ctl); 19040 19041 return (mp2ctl); 19042 } 19043 19044 /* 19045 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and 19046 * ipv6NetToMediaEntryTable in an NDP walk. 
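 * The message plumbing mirrors the IPv4 routine above: the original
 * mpctl and two extra copies each carry one table upstream, a third
 * copy is returned to the caller, and the attribute table is only
 * sent when the walk actually produced entries.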
19047 */ 19048 static mblk_t * 19049 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19050 { 19051 struct opthdr *optp; 19052 mblk_t *mp2ctl; /* Returned */ 19053 mblk_t *mp3ctl; /* nettomedia */ 19054 mblk_t *mp4ctl; /* routeattrs */ 19055 iproutedata_t ird; 19056 zoneid_t zoneid; 19057 19058 /* 19059 * make copies of the original message 19060 * - mp2ctl is returned unchanged to the caller for his use 19061 * - mpctl is sent upstream as ipv6RouteEntryTable 19062 * - mp3ctl is sent upstream as ipv6NetToMediaEntryTable 19063 * - mp4ctl is sent upstream as ipv6RouteAttributeTable 19064 */ 19065 mp2ctl = copymsg(mpctl); 19066 mp3ctl = copymsg(mpctl); 19067 mp4ctl = copymsg(mpctl); 19068 if (mp3ctl == NULL || mp4ctl == NULL) { 19069 freemsg(mp4ctl); 19070 freemsg(mp3ctl); 19071 freemsg(mp2ctl); 19072 freemsg(mpctl); 19073 return (NULL); 19074 } 19075 19076 bzero(&ird, sizeof (ird)); 19077 19078 ird.ird_route.lp_head = mpctl->b_cont; 19079 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19080 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19081 19082 zoneid = Q_TO_CONN(q)->conn_zoneid; 19083 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); 19084 19085 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19086 optp->level = MIB2_IP6; 19087 optp->name = MIB2_IP6_ROUTE; 19088 optp->len = msgdsize(ird.ird_route.lp_head); 19089 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19090 (int)optp->level, (int)optp->name, (int)optp->len)); 19091 qreply(q, mpctl); 19092 19093 /* ipv6NetToMediaEntryTable in mp3ctl */ 19094 ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); 19095 19096 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19097 optp->level = MIB2_IP6; 19098 optp->name = MIB2_IP6_MEDIA; 19099 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19100 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19101 (int)optp->level, (int)optp->name, (int)optp->len)); 19102 qreply(q, mp3ctl); 19103 19104 /* ipv6RouteAttributeTable in mp4ctl */ 19105 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19106 optp->level = MIB2_IP6; 19107 optp->name = EXPER_IP_RTATTR; 19108 optp->len = msgdsize(ird.ird_attrs.lp_head); 19109 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19110 (int)optp->level, (int)optp->name, (int)optp->len)); 19111 if (optp->len == 0) 19112 freemsg(mp4ctl); 19113 else 19114 qreply(q, mp4ctl); 19115 19116 return (mp2ctl); 19117 } 19118 19119 /* 19120 * IPv6 mib: One per ill 19121 */ 19122 static mblk_t * 19123 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19124 { 19125 struct opthdr *optp; 19126 mblk_t *mp2ctl; 19127 ill_t *ill; 19128 ill_walk_context_t ctx; 19129 mblk_t *mp_tail = NULL; 19130 19131 /* 19132 * Make a copy of the original message 19133 */ 19134 mp2ctl = copymsg(mpctl); 19135 19136 /* fixed length IPv6 structure ... */ 19137 19138 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19139 optp->level = MIB2_IP6; 19140 optp->name = 0; 19141 /* Include "unknown interface" ip6_mib */ 19142 ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 19143 ipst->ips_ip6_mib.ipIfStatsIfIndex = 19144 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 19145 SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding, 19146 ipst->ips_ipv6_forward ? 
1 : 2); 19147 SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit, 19148 ipst->ips_ipv6_def_hops); 19149 SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize, 19150 sizeof (mib2_ipIfStatsEntry_t)); 19151 SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize, 19152 sizeof (mib2_ipv6AddrEntry_t)); 19153 SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize, 19154 sizeof (mib2_ipv6RouteEntry_t)); 19155 SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize, 19156 sizeof (mib2_ipv6NetToMediaEntry_t)); 19157 SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize, 19158 sizeof (ipv6_member_t)); 19159 SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize, 19160 sizeof (ipv6_grpsrc_t)); 19161 19162 /* 19163 * Synchronize 64- and 32-bit counters 19164 */ 19165 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives, 19166 ipIfStatsHCInReceives); 19167 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers, 19168 ipIfStatsHCInDelivers); 19169 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests, 19170 ipIfStatsHCOutRequests); 19171 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams, 19172 ipIfStatsHCOutForwDatagrams); 19173 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts, 19174 ipIfStatsHCOutMcastPkts); 19175 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts, 19176 ipIfStatsHCInMcastPkts); 19177 19178 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19179 (char *)&ipst->ips_ip6_mib, (int)sizeof (ipst->ips_ip6_mib))) { 19180 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n", 19181 (uint_t)sizeof (ipst->ips_ip6_mib))); 19182 } 19183 19184 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19185 ill = ILL_START_WALK_V6(&ctx, ipst); 19186 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19187 ill->ill_ip_mib->ipIfStatsIfIndex = 19188 ill->ill_phyint->phyint_ifindex; 19189 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 19190 ipst->ips_ipv6_forward ? 1 : 2); 19191 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit, 19192 ill->ill_max_hops); 19193 19194 /* 19195 * Synchronize 64- and 32-bit counters 19196 */ 19197 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives, 19198 ipIfStatsHCInReceives); 19199 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers, 19200 ipIfStatsHCInDelivers); 19201 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests, 19202 ipIfStatsHCOutRequests); 19203 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams, 19204 ipIfStatsHCOutForwDatagrams); 19205 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts, 19206 ipIfStatsHCOutMcastPkts); 19207 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts, 19208 ipIfStatsHCInMcastPkts); 19209 19210 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19211 (char *)ill->ill_ip_mib, 19212 (int)sizeof (*ill->ill_ip_mib))) { 19213 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate " 19214 "%u bytes\n", (uint_t)sizeof (*ill->ill_ip_mib))); 19215 } 19216 } 19217 rw_exit(&ipst->ips_ill_g_lock); 19218 19219 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19220 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n", 19221 (int)optp->level, (int)optp->name, (int)optp->len)); 19222 qreply(q, mpctl); 19223 return (mp2ctl); 19224 } 19225 19226 /* 19227 * ICMPv6 mib: One per ill 19228 */ 19229 static mblk_t * 19230 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19231 { 19232 struct opthdr *optp; 19233 mblk_t *mp2ctl; 19234 ill_t *ill; 19235 ill_walk_context_t ctx; 19236 mblk_t *mp_tail = NULL; 19237 /* 19238 * Make a copy of the original message 19239 */ 19240 mp2ctl = copymsg(mpctl); 19241 19242 /* fixed length ICMPv6 structure ... 
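 * one "unknown interface" template entry followed by one fixed-size
 * mib2_ipv6IfIcmpEntry_t per ill, so a consumer can stride through the
 * data part in constant-size steps.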
*/ 19243 19244 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19245 optp->level = MIB2_ICMP6; 19246 optp->name = 0; 19247 /* Include "unknown interface" icmp6_mib */ 19248 ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex = 19249 MIB2_UNKNOWN_INTERFACE; /* netstat flag */ 19250 ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize = 19251 sizeof (mib2_ipv6IfIcmpEntry_t); 19252 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19253 (char *)&ipst->ips_icmp6_mib, 19254 (int)sizeof (ipst->ips_icmp6_mib))) { 19255 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n", 19256 (uint_t)sizeof (ipst->ips_icmp6_mib))); 19257 } 19258 19259 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19260 ill = ILL_START_WALK_V6(&ctx, ipst); 19261 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19262 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 19263 ill->ill_phyint->phyint_ifindex; 19264 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19265 (char *)ill->ill_icmp6_mib, 19266 (int)sizeof (*ill->ill_icmp6_mib))) { 19267 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate " 19268 "%u bytes\n", 19269 (uint_t)sizeof (*ill->ill_icmp6_mib))); 19270 } 19271 } 19272 rw_exit(&ipst->ips_ill_g_lock); 19273 19274 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19275 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n", 19276 (int)optp->level, (int)optp->name, (int)optp->len)); 19277 qreply(q, mpctl); 19278 return (mp2ctl); 19279 } 19280 19281 /* 19282 * ire_walk routine to create both ipRouteEntryTable and 19283 * ipRouteAttributeTable in one IRE walk 19284 */ 19285 static void 19286 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) 19287 { 19288 ill_t *ill; 19289 ipif_t *ipif; 19290 mib2_ipRouteEntry_t *re; 19291 mib2_ipAttributeEntry_t *iae, *iaeptr; 19292 ipaddr_t gw_addr; 19293 tsol_ire_gw_secattr_t *attrp; 19294 tsol_gc_t *gc = NULL; 19295 tsol_gcgrp_t *gcgrp = NULL; 19296 uint_t sacnt = 0; 19297 int i; 19298 19299 ASSERT(ire->ire_ipversion == IPV4_VERSION); 19300 19301 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19302 return; 19303 19304 if ((attrp = ire->ire_gw_secattr) != NULL) { 19305 mutex_enter(&attrp->igsa_lock); 19306 if ((gc = attrp->igsa_gc) != NULL) { 19307 gcgrp = gc->gc_grp; 19308 ASSERT(gcgrp != NULL); 19309 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19310 sacnt = 1; 19311 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19312 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19313 gc = gcgrp->gcgrp_head; 19314 sacnt = gcgrp->gcgrp_count; 19315 } 19316 mutex_exit(&attrp->igsa_lock); 19317 19318 /* do nothing if there's no gc to report */ 19319 if (gc == NULL) { 19320 ASSERT(sacnt == 0); 19321 if (gcgrp != NULL) { 19322 /* we might as well drop the lock now */ 19323 rw_exit(&gcgrp->gcgrp_rwlock); 19324 gcgrp = NULL; 19325 } 19326 attrp = NULL; 19327 } 19328 19329 ASSERT(gc == NULL || (gcgrp != NULL && 19330 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19331 } 19332 ASSERT(sacnt == 0 || gc != NULL); 19333 19334 if (sacnt != 0 && 19335 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19336 kmem_free(re, sizeof (*re)); 19337 rw_exit(&gcgrp->gcgrp_rwlock); 19338 return; 19339 } 19340 19341 /* 19342 * Return all IRE types for route table... let caller pick and choose 19343 */ 19344 re->ipRouteDest = ire->ire_addr; 19345 ipif = ire->ire_ipif; 19346 re->ipRouteIfIndex.o_length = 0; 19347 if (ire->ire_type == IRE_CACHE) { 19348 ill = (ill_t *)ire->ire_stq->q_ptr; 19349 re->ipRouteIfIndex.o_length = 19350 ill->ill_name_length == 0 ? 
0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes,
		    re->ipRouteIfIndex.o_length);
	} else if (ipif != NULL) {
		ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
		re->ipRouteIfIndex.o_length =
		    mi_strlen(re->ipRouteIfIndex.o_bytes);
	}
	re->ipRouteMetric1 = -1;
	re->ipRouteMetric2 = -1;
	re->ipRouteMetric3 = -1;
	re->ipRouteMetric4 = -1;

	gw_addr = ire->ire_gateway_addr;

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST))
		re->ipRouteNextHop = ire->ire_src_addr;
	else
		re->ipRouteNextHop = gw_addr;
	/* indirect(4), direct(3), or invalid(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipRouteType = 2;
	else
		re->ipRouteType = (gw_addr != 0) ? 4 : 3;
	re->ipRouteProto = -1;
	re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipRouteMask = ire->ire_mask;
	re->ipRouteMetric5 = -1;
	re->ipRouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	re->ipRouteInfo.re_ref = ire->ire_refcnt;
	re->ipRouteInfo.re_src_addr = ire->ire_src_addr;
	re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipRouteInfo.re_flags = ire->ire_flags;

	if (ire->ire_flags & RTF_DYNAMIC) {
		re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT;
	} else {
		re->ipRouteInfo.re_ire_type = ire->ire_type;
	}

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
		iaeptr->iae_doi = gc->gc_db->gcdb_doi;
		iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
	}

	if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
	    (char *)iae, sacnt * sizeof (*iae))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (unsigned)(sacnt * sizeof (*iae))));
	}

	/* bump route index for next pass */
	ird->ird_idx++;

	kmem_free(re, sizeof (*re));
	if (sacnt != 0)
		kmem_free(iae, sacnt * sizeof (*iae));

	if (gcgrp != NULL)
		rw_exit(&gcgrp->gcgrp_rwlock);
}

/*
 * ire_walk routine to create ipv6RouteEntryTable and
 * ipv6RouteAttributeTable in one IRE walk.
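 * The gateway security attribute handling below (igsa_gc / igsa_gcgrp)
 * determines sacnt, the number of mib2_ipAttributeEntry_t records that
 * accompany each route entry; sacnt is zero on systems without tsol
 * gateway attributes, in which case only the route entry is appended.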
19425 */ 19426 static void 19427 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) 19428 { 19429 ill_t *ill; 19430 ipif_t *ipif; 19431 mib2_ipv6RouteEntry_t *re; 19432 mib2_ipAttributeEntry_t *iae, *iaeptr; 19433 in6_addr_t gw_addr_v6; 19434 tsol_ire_gw_secattr_t *attrp; 19435 tsol_gc_t *gc = NULL; 19436 tsol_gcgrp_t *gcgrp = NULL; 19437 uint_t sacnt = 0; 19438 int i; 19439 19440 ASSERT(ire->ire_ipversion == IPV6_VERSION); 19441 19442 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19443 return; 19444 19445 if ((attrp = ire->ire_gw_secattr) != NULL) { 19446 mutex_enter(&attrp->igsa_lock); 19447 if ((gc = attrp->igsa_gc) != NULL) { 19448 gcgrp = gc->gc_grp; 19449 ASSERT(gcgrp != NULL); 19450 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19451 sacnt = 1; 19452 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19453 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19454 gc = gcgrp->gcgrp_head; 19455 sacnt = gcgrp->gcgrp_count; 19456 } 19457 mutex_exit(&attrp->igsa_lock); 19458 19459 /* do nothing if there's no gc to report */ 19460 if (gc == NULL) { 19461 ASSERT(sacnt == 0); 19462 if (gcgrp != NULL) { 19463 /* we might as well drop the lock now */ 19464 rw_exit(&gcgrp->gcgrp_rwlock); 19465 gcgrp = NULL; 19466 } 19467 attrp = NULL; 19468 } 19469 19470 ASSERT(gc == NULL || (gcgrp != NULL && 19471 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19472 } 19473 ASSERT(sacnt == 0 || gc != NULL); 19474 19475 if (sacnt != 0 && 19476 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19477 kmem_free(re, sizeof (*re)); 19478 rw_exit(&gcgrp->gcgrp_rwlock); 19479 return; 19480 } 19481 19482 /* 19483 * Return all IRE types for route table... let caller pick and choose 19484 */ 19485 re->ipv6RouteDest = ire->ire_addr_v6; 19486 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); 19487 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ 19488 re->ipv6RouteIfIndex.o_length = 0; 19489 ipif = ire->ire_ipif; 19490 if (ire->ire_type == IRE_CACHE) { 19491 ill = (ill_t *)ire->ire_stq->q_ptr; 19492 re->ipv6RouteIfIndex.o_length = 19493 ill->ill_name_length == 0 ? 
0 : 19494 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19495 bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, 19496 re->ipv6RouteIfIndex.o_length); 19497 } else if (ipif != NULL) { 19498 ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); 19499 re->ipv6RouteIfIndex.o_length = 19500 mi_strlen(re->ipv6RouteIfIndex.o_bytes); 19501 } 19502 19503 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19504 19505 mutex_enter(&ire->ire_lock); 19506 gw_addr_v6 = ire->ire_gateway_addr_v6; 19507 mutex_exit(&ire->ire_lock); 19508 19509 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) 19510 re->ipv6RouteNextHop = ire->ire_src_addr_v6; 19511 else 19512 re->ipv6RouteNextHop = gw_addr_v6; 19513 19514 /* remote(4), local(3), or discard(2) */ 19515 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19516 re->ipv6RouteType = 2; 19517 else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) 19518 re->ipv6RouteType = 3; 19519 else 19520 re->ipv6RouteType = 4; 19521 19522 re->ipv6RouteProtocol = -1; 19523 re->ipv6RoutePolicy = 0; 19524 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time; 19525 re->ipv6RouteNextHopRDI = 0; 19526 re->ipv6RouteWeight = 0; 19527 re->ipv6RouteMetric = 0; 19528 re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; 19529 re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; 19530 re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19531 re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; 19532 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19533 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19534 re->ipv6RouteInfo.re_ref = ire->ire_refcnt; 19535 re->ipv6RouteInfo.re_flags = ire->ire_flags; 19536 19537 if (ire->ire_flags & RTF_DYNAMIC) { 19538 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19539 } else { 19540 re->ipv6RouteInfo.re_ire_type = ire->ire_type; 19541 } 19542 19543 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19544 (char *)re, (int)sizeof (*re))) { 19545 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19546 (uint_t)sizeof (*re))); 19547 } 19548 19549 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19550 iaeptr->iae_routeidx = ird->ird_idx; 19551 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19552 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19553 } 19554 19555 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19556 (char *)iae, sacnt * sizeof (*iae))) { 19557 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19558 (unsigned)(sacnt * sizeof (*iae)))); 19559 } 19560 19561 /* bump route index for next pass */ 19562 ird->ird_idx++; 19563 19564 kmem_free(re, sizeof (*re)); 19565 if (sacnt != 0) 19566 kmem_free(iae, sacnt * sizeof (*iae)); 19567 19568 if (gcgrp != NULL) 19569 rw_exit(&gcgrp->gcgrp_rwlock); 19570 } 19571 19572 /* 19573 * ndp_walk routine to create ipv6NetToMediaEntryTable 19574 */ 19575 static int 19576 ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) 19577 { 19578 ill_t *ill; 19579 mib2_ipv6NetToMediaEntry_t ntme; 19580 dl_unitdata_req_t *dl; 19581 19582 ill = nce->nce_ill; 19583 if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ 19584 return (0); 19585 19586 /* 19587 * Neighbor cache entry attached to IRE with on-link 19588 * destination. 
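 * For an ill using an external resolver (ILLF_XRESOLV) the physical
 * address length is taken from the resolver's DLPI unitdata header;
 * otherwise the ill's own physical address length is used.  An nce
 * without nce_res_mp reports an all-zero physical address.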
 */
	ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
	ntme.ipv6NetToMediaNetAddress = nce->nce_addr;
	if ((ill->ill_flags & ILLF_XRESOLV) &&
	    (nce->nce_res_mp != NULL)) {
		dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr);
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    dl->dl_dest_addr_length;
	} else {
		ntme.ipv6NetToMediaPhysAddress.o_length =
		    ill->ill_phys_addr_length;
	}
	if (nce->nce_res_mp != NULL) {
		bcopy((char *)nce->nce_res_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill),
		    ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ntme.ipv6NetToMediaPhysAddress.o_length);
	} else {
		bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes,
		    ill->ill_phys_addr_length);
	}
	/*
	 * Note: this returns the raw ND_* state; the MIB instead defines
	 * reachable(1), stale(2), delay(3), probe(4), invalid(5),
	 * unknown(6).
	 */
	ntme.ipv6NetToMediaState = nce->nce_state;
	ntme.ipv6NetToMediaLastUpdated = 0;

	/* other(1), dynamic(2), static(3), local(4) */
	if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 4;
	} else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) {
		ntme.ipv6NetToMediaType = 1;
	} else {
		ntme.ipv6NetToMediaType = 2;
	}

	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
		ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ntme)));
	}
	return (0);
}

/*
 * Return 0 if the set request is invalid, 1 otherwise (including
 * non-TCP requests).
 */
/* ARGSUSED */
int
ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
{
	switch (level) {
	case MIB2_IP:
	case MIB2_ICMP:
		switch (name) {
		default:
			break;
		}
		return (1);
	default:
		return (1);
	}
}

/*
 * When both a 64-bit and a 32-bit counter exist for a particular statistic
 * (e.g., InReceives), only the 64-bit counter is added.
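 * The 32-bit shadows are refreshed separately with SYNC32_MIB before an
 * entry is exported (see ip_snmp_get_mib2_ip6() above).  UPDATE_MIB
 * itself is a plain accumulate; roughly (editor's sketch of the macro,
 * which lives in the IP MIB support headers, shown only for orientation):
 *
 *	#define	UPDATE_MIB(x, y, z)	((x)->y += (z))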
 */
void
ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
{
	UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
	UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
	UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
	UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
	UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
	UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
	UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
	UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
	UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
	UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
	UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
	UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
	UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
	UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
	UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
	UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
	UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
	UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
	UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
	UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
	UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
	UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
	    o2->ipIfStatsInWrongIPVersion);
	UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
	    o2->ipIfStatsOutWrongIPVersion);
	UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
	    o2->ipIfStatsOutSwitchIPVersion);
	UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
	UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
	UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
	    o2->ipIfStatsHCInForwDatagrams);
	UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
	UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
	UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
	    o2->ipIfStatsHCOutForwDatagrams);
	UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
	UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
	UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
	UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
	UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
	UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
	UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
	    o2->ipIfStatsHCOutMcastOctets);
	UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
	UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
	UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
	UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
	UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
	UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
	UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
}

void
ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
{
	UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
	UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
	UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs,
o2->ipv6IfIcmpInDestUnreachs); 19720 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs); 19721 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds); 19722 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems); 19723 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs); 19724 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos); 19725 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies); 19726 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits, 19727 o2->ipv6IfIcmpInRouterSolicits); 19728 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements, 19729 o2->ipv6IfIcmpInRouterAdvertisements); 19730 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits, 19731 o2->ipv6IfIcmpInNeighborSolicits); 19732 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements, 19733 o2->ipv6IfIcmpInNeighborAdvertisements); 19734 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects); 19735 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries, 19736 o2->ipv6IfIcmpInGroupMembQueries); 19737 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses, 19738 o2->ipv6IfIcmpInGroupMembResponses); 19739 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions, 19740 o2->ipv6IfIcmpInGroupMembReductions); 19741 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs); 19742 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors); 19743 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs, 19744 o2->ipv6IfIcmpOutDestUnreachs); 19745 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs, 19746 o2->ipv6IfIcmpOutAdminProhibs); 19747 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds); 19748 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems, 19749 o2->ipv6IfIcmpOutParmProblems); 19750 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs); 19751 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos); 19752 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies); 19753 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits, 19754 o2->ipv6IfIcmpOutRouterSolicits); 19755 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements, 19756 o2->ipv6IfIcmpOutRouterAdvertisements); 19757 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits, 19758 o2->ipv6IfIcmpOutNeighborSolicits); 19759 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements, 19760 o2->ipv6IfIcmpOutNeighborAdvertisements); 19761 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects); 19762 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries, 19763 o2->ipv6IfIcmpOutGroupMembQueries); 19764 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses, 19765 o2->ipv6IfIcmpOutGroupMembResponses); 19766 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions, 19767 o2->ipv6IfIcmpOutGroupMembReductions); 19768 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows); 19769 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit); 19770 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements, 19771 o2->ipv6IfIcmpInBadNeighborAdvertisements); 19772 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations, 19773 o2->ipv6IfIcmpInBadNeighborSolicitations); 19774 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects); 19775 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal, 19776 o2->ipv6IfIcmpInGroupMembTotal); 19777 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries, 19778 o2->ipv6IfIcmpInGroupMembBadQueries); 19779 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports, 19780 o2->ipv6IfIcmpInGroupMembBadReports); 19781 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports, 19782 o2->ipv6IfIcmpInGroupMembOurReports); 19783 } 19784 19785 /* 19786 * Called before the options are updated to 
check if this packet will 19787 * be source routed from here. 19788 * This routine assumes that the options are well formed i.e. that they 19789 * have already been checked. 19790 */ 19791 static boolean_t 19792 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) 19793 { 19794 ipoptp_t opts; 19795 uchar_t *opt; 19796 uint8_t optval; 19797 uint8_t optlen; 19798 ipaddr_t dst; 19799 ire_t *ire; 19800 19801 if (IS_SIMPLE_IPH(ipha)) { 19802 ip2dbg(("not source routed\n")); 19803 return (B_FALSE); 19804 } 19805 dst = ipha->ipha_dst; 19806 for (optval = ipoptp_first(&opts, ipha); 19807 optval != IPOPT_EOL; 19808 optval = ipoptp_next(&opts)) { 19809 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 19810 opt = opts.ipoptp_cur; 19811 optlen = opts.ipoptp_len; 19812 ip2dbg(("ip_source_routed: opt %d, len %d\n", 19813 optval, optlen)); 19814 switch (optval) { 19815 uint32_t off; 19816 case IPOPT_SSRR: 19817 case IPOPT_LSRR: 19818 /* 19819 * If dst is one of our addresses and there are some 19820 * entries left in the source route return (true). 19821 */ 19822 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 19823 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19824 if (ire == NULL) { 19825 ip2dbg(("ip_source_routed: not next" 19826 " source route 0x%x\n", 19827 ntohl(dst))); 19828 return (B_FALSE); 19829 } 19830 ire_refrele(ire); 19831 off = opt[IPOPT_OFFSET]; 19832 off--; 19833 if (optlen < IP_ADDR_LEN || 19834 off > optlen - IP_ADDR_LEN) { 19835 /* End of source route */ 19836 ip1dbg(("ip_source_routed: end of SR\n")); 19837 return (B_FALSE); 19838 } 19839 return (B_TRUE); 19840 } 19841 } 19842 ip2dbg(("not source routed\n")); 19843 return (B_FALSE); 19844 } 19845 19846 /* 19847 * Check if the packet contains any source route. 19848 */ 19849 static boolean_t 19850 ip_source_route_included(ipha_t *ipha) 19851 { 19852 ipoptp_t opts; 19853 uint8_t optval; 19854 19855 if (IS_SIMPLE_IPH(ipha)) 19856 return (B_FALSE); 19857 for (optval = ipoptp_first(&opts, ipha); 19858 optval != IPOPT_EOL; 19859 optval = ipoptp_next(&opts)) { 19860 switch (optval) { 19861 case IPOPT_SSRR: 19862 case IPOPT_LSRR: 19863 return (B_TRUE); 19864 } 19865 } 19866 return (B_FALSE); 19867 } 19868 19869 /* 19870 * Called when the IRE expiration timer fires. 19871 */ 19872 void 19873 ip_trash_timer_expire(void *args) 19874 { 19875 int flush_flag = 0; 19876 ire_expire_arg_t iea; 19877 ip_stack_t *ipst = (ip_stack_t *)args; 19878 19879 iea.iea_ipst = ipst; /* No netstack_hold */ 19880 19881 /* 19882 * ip_ire_expire_id is protected by ip_trash_timer_lock. 19883 * This lock makes sure that a new invocation of this function 19884 * that occurs due to an almost immediate timer firing will not 19885 * progress beyond this point until the current invocation is done 19886 */ 19887 mutex_enter(&ipst->ips_ip_trash_timer_lock); 19888 ipst->ips_ip_ire_expire_id = 0; 19889 mutex_exit(&ipst->ips_ip_trash_timer_lock); 19890 19891 /* Periodic timer */ 19892 if (ipst->ips_ip_ire_arp_time_elapsed >= 19893 ipst->ips_ip_ire_arp_interval) { 19894 /* 19895 * Remove all IRE_CACHE entries since they might 19896 * contain arp information. 
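 * Each flush category keeps its own elapsed-time counter: the counters
 * are advanced by ip_timer_interval at the bottom of this routine and
 * reset to zero here when their interval expires, so the ARP, redirect
 * and path-MTU sweeps run on independent periods off a single timeout.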
19897 */ 19898 flush_flag |= FLUSH_ARP_TIME; 19899 ipst->ips_ip_ire_arp_time_elapsed = 0; 19900 IP_STAT(ipst, ip_ire_arp_timer_expired); 19901 } 19902 if (ipst->ips_ip_ire_rd_time_elapsed >= 19903 ipst->ips_ip_ire_redir_interval) { 19904 /* Remove all redirects */ 19905 flush_flag |= FLUSH_REDIRECT_TIME; 19906 ipst->ips_ip_ire_rd_time_elapsed = 0; 19907 IP_STAT(ipst, ip_ire_redirect_timer_expired); 19908 } 19909 if (ipst->ips_ip_ire_pmtu_time_elapsed >= 19910 ipst->ips_ip_ire_pathmtu_interval) { 19911 /* Increase path mtu */ 19912 flush_flag |= FLUSH_MTU_TIME; 19913 ipst->ips_ip_ire_pmtu_time_elapsed = 0; 19914 IP_STAT(ipst, ip_ire_pmtu_timer_expired); 19915 } 19916 19917 /* 19918 * Optimize for the case when there are no redirects in the 19919 * ftable, that is, no need to walk the ftable in that case. 19920 */ 19921 if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { 19922 iea.iea_flush_flag = flush_flag; 19923 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, 19924 (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, 19925 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 19926 NULL, ALL_ZONES, ipst); 19927 } 19928 if ((flush_flag & FLUSH_REDIRECT_TIME) && 19929 ipst->ips_ip_redirect_cnt > 0) { 19930 iea.iea_flush_flag = flush_flag; 19931 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, 19932 ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 19933 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); 19934 } 19935 if (flush_flag & FLUSH_MTU_TIME) { 19936 /* 19937 * Walk all IPv6 IRE's and update them 19938 * Note that ARP and redirect timers are not 19939 * needed since NUD handles stale entries. 19940 */ 19941 flush_flag = FLUSH_MTU_TIME; 19942 iea.iea_flush_flag = flush_flag; 19943 ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, 19944 ALL_ZONES, ipst); 19945 } 19946 19947 ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; 19948 ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; 19949 ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; 19950 19951 /* 19952 * Hold the lock to serialize timeout calls and prevent 19953 * stale values in ip_ire_expire_id. Otherwise it is possible 19954 * for the timer to fire and a new invocation of this function 19955 * to start before the return value of timeout has been stored 19956 * in ip_ire_expire_id by the current invocation. 19957 */ 19958 mutex_enter(&ipst->ips_ip_trash_timer_lock); 19959 ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, 19960 (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 19961 mutex_exit(&ipst->ips_ip_trash_timer_lock); 19962 } 19963 19964 /* 19965 * Called by the memory allocator subsystem directly, when the system 19966 * is running low on memory. 19967 */ 19968 /* ARGSUSED */ 19969 void 19970 ip_trash_ire_reclaim(void *args) 19971 { 19972 netstack_handle_t nh; 19973 netstack_t *ns; 19974 19975 netstack_next_init(&nh); 19976 while ((ns = netstack_next(&nh)) != NULL) { 19977 ip_trash_ire_reclaim_stack(ns->netstack_ip); 19978 netstack_rele(ns); 19979 } 19980 netstack_next_fini(&nh); 19981 } 19982 19983 static void 19984 ip_trash_ire_reclaim_stack(ip_stack_t *ipst) 19985 { 19986 ire_cache_count_t icc; 19987 ire_cache_reclaim_t icr; 19988 ncc_cache_count_t ncc; 19989 nce_cache_reclaim_t ncr; 19990 uint_t delete_cnt; 19991 /* 19992 * Memory reclaim call back. 19993 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. 
19994 * Then, with a target of freeing 1/Nth of IRE_CACHE 19995 * entries, determine what fraction to free for 19996 * each category of IRE_CACHE entries giving absolute priority 19997 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu 19998 * entry will be freed unless all offlink entries are freed). 19999 */ 20000 icc.icc_total = 0; 20001 icc.icc_unused = 0; 20002 icc.icc_offlink = 0; 20003 icc.icc_pmtu = 0; 20004 icc.icc_onlink = 0; 20005 ire_walk(ire_cache_count, (char *)&icc, ipst); 20006 20007 /* 20008 * Free NCEs for IPv6 like the onlink ires. 20009 */ 20010 ncc.ncc_total = 0; 20011 ncc.ncc_host = 0; 20012 ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); 20013 20014 ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + 20015 icc.icc_pmtu + icc.icc_onlink); 20016 delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; 20017 IP_STAT(ipst, ip_trash_ire_reclaim_calls); 20018 if (delete_cnt == 0) 20019 return; 20020 IP_STAT(ipst, ip_trash_ire_reclaim_success); 20021 /* Always delete all unused offlink entries */ 20022 icr.icr_ipst = ipst; 20023 icr.icr_unused = 1; 20024 if (delete_cnt <= icc.icc_unused) { 20025 /* 20026 * Only need to free unused entries. In other words, 20027 * there are enough unused entries to free to meet our 20028 * target number of freed ire cache entries. 20029 */ 20030 icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; 20031 ncr.ncr_host = 0; 20032 } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { 20033 /* 20034 * Only need to free unused entries, plus a fraction of offlink 20035 * entries. It follows from the first if statement that 20036 * icc_offlink is non-zero, and that delete_cnt != icc_unused. 20037 */ 20038 delete_cnt -= icc.icc_unused; 20039 /* Round up # deleted by truncating fraction */ 20040 icr.icr_offlink = icc.icc_offlink / delete_cnt; 20041 icr.icr_pmtu = icr.icr_onlink = 0; 20042 ncr.ncr_host = 0; 20043 } else if (delete_cnt <= 20044 icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { 20045 /* 20046 * Free all unused and offlink entries, plus a fraction of 20047 * pmtu entries. It follows from the previous if statement 20048 * that icc_pmtu is non-zero, and that 20049 * delete_cnt != icc_unused + icc_offlink. 20050 */ 20051 icr.icr_offlink = 1; 20052 delete_cnt -= icc.icc_unused + icc.icc_offlink; 20053 /* Round up # deleted by truncating fraction */ 20054 icr.icr_pmtu = icc.icc_pmtu / delete_cnt; 20055 icr.icr_onlink = 0; 20056 ncr.ncr_host = 0; 20057 } else { 20058 /* 20059 * Free all unused, offlink, and pmtu entries, plus a fraction 20060 * of onlink entries. If we're here, then we know that 20061 * icc_onlink is non-zero, and that 20062 * delete_cnt != icc_unused + icc_offlink + icc_pmtu. 
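 *
 * A worked example of the whole scheme (numbers invented for
 * illustration): with ips_ip_ire_reclaim_fraction = 8 and
 * icc = { total 800, unused 40, offlink 360, pmtu 200,
 * onlink 200 }, delete_cnt starts at 800/8 = 100. Since
 * 100 > 40 but 100 <= 40 + 360, every unused entry goes, the
 * residual target is 100 - 40 = 60, and icr_offlink =
 * 360/60 = 6, i.e. roughly one in six offlink entries is
 * reclaimed while pmtu and onlink entries are left alone.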
20063 */ 20064 icr.icr_offlink = icr.icr_pmtu = 1; 20065 delete_cnt -= icc.icc_unused + icc.icc_offlink + 20066 icc.icc_pmtu; 20067 /* Round up # deleted by truncating fraction */ 20068 icr.icr_onlink = icc.icc_onlink / delete_cnt; 20069 /* Using the same delete fraction as for onlink IREs */ 20070 ncr.ncr_host = ncc.ncc_host / delete_cnt; 20071 } 20072 #ifdef DEBUG 20073 ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " 20074 "fractions %d/%d/%d/%d\n", 20075 icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, 20076 icc.icc_unused, icc.icc_offlink, 20077 icc.icc_pmtu, icc.icc_onlink, 20078 icr.icr_unused, icr.icr_offlink, 20079 icr.icr_pmtu, icr.icr_onlink)); 20080 #endif 20081 ire_walk(ire_cache_reclaim, (char *)&icr, ipst); 20082 if (ncr.ncr_host != 0) 20083 ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, 20084 (uchar_t *)&ncr, ipst); 20085 #ifdef DEBUG 20086 icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; 20087 icc.icc_pmtu = 0; icc.icc_onlink = 0; 20088 ire_walk(ire_cache_count, (char *)&icc, ipst); 20089 ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", 20090 icc.icc_total, icc.icc_unused, icc.icc_offlink, 20091 icc.icc_pmtu, icc.icc_onlink)); 20092 #endif 20093 } 20094 20095 /* 20096 * ip_unbind is called when a copy of an unbind request is received from the 20097 * upper level protocol. We remove this conn from any fanout hash list it is 20098 * on, and zero out the bind information. No reply is expected up above. 20099 */ 20100 mblk_t * 20101 ip_unbind(queue_t *q, mblk_t *mp) 20102 { 20103 conn_t *connp = Q_TO_CONN(q); 20104 20105 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 20106 20107 if (is_system_labeled() && connp->conn_anon_port) { 20108 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 20109 connp->conn_mlp_type, connp->conn_ulp, 20110 ntohs(connp->conn_lport), B_FALSE); 20111 connp->conn_anon_port = 0; 20112 } 20113 connp->conn_mlp_type = mlptSingle; 20114 20115 ipcl_hash_remove(connp); 20116 20117 ASSERT(mp->b_cont == NULL); 20118 /* 20119 * Convert mp into a T_OK_ACK 20120 */ 20121 mp = mi_tpi_ok_ack_alloc(mp); 20122 20123 /* 20124 * should not happen in practice... T_OK_ACK is smaller than the 20125 * original message. 20126 */ 20127 if (mp == NULL) 20128 return (NULL); 20129 20130 return (mp); 20131 } 20132 20133 /* 20134 * Write side put procedure. Outbound data, IOCTLs, responses from 20135 * resolvers, etc, come down through here. 20136 * 20137 * arg2 is always a queue_t *. 20138 * When that queue is an ill_t (i.e. q_next != NULL), then arg must be 20139 * the zoneid. 20140 * When that queue is not an ill_t, then arg must be a conn_t pointer. 20141 */ 20142 void 20143 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 20144 { 20145 ip_output_options(arg, mp, arg2, caller, &zero_info); 20146 } 20147 20148 void 20149 ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, 20150 ip_opt_info_t *infop) 20151 { 20152 conn_t *connp = NULL; 20153 queue_t *q = (queue_t *)arg2; 20154 ipha_t *ipha; 20155 #define rptr ((uchar_t *)ipha) 20156 ire_t *ire = NULL; 20157 ire_t *sctp_ire = NULL; 20158 uint32_t v_hlen_tos_len; 20159 ipaddr_t dst; 20160 mblk_t *first_mp = NULL; 20161 boolean_t mctl_present; 20162 ipsec_out_t *io; 20163 int match_flags; 20164 ill_t *attach_ill = NULL; 20165 /* Bind to IPIF_NOFAILOVER ill etc. */ 20166 ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. 
*/ 20167 ipif_t *dst_ipif; 20168 boolean_t multirt_need_resolve = B_FALSE; 20169 mblk_t *copy_mp = NULL; 20170 int err; 20171 zoneid_t zoneid; 20172 boolean_t need_decref = B_FALSE; 20173 boolean_t ignore_dontroute = B_FALSE; 20174 boolean_t ignore_nexthop = B_FALSE; 20175 boolean_t ip_nexthop = B_FALSE; 20176 ipaddr_t nexthop_addr; 20177 ip_stack_t *ipst; 20178 20179 #ifdef _BIG_ENDIAN 20180 #define V_HLEN (v_hlen_tos_len >> 24) 20181 #else 20182 #define V_HLEN (v_hlen_tos_len & 0xFF) 20183 #endif 20184 20185 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 20186 "ip_wput_start: q %p", q); 20187 20188 /* 20189 * ip_wput fast path 20190 */ 20191 20192 /* is packet from ARP ? */ 20193 if (q->q_next != NULL) { 20194 zoneid = (zoneid_t)(uintptr_t)arg; 20195 goto qnext; 20196 } 20197 20198 connp = (conn_t *)arg; 20199 ASSERT(connp != NULL); 20200 zoneid = connp->conn_zoneid; 20201 ipst = connp->conn_netstack->netstack_ip; 20202 20203 /* is queue flow controlled? */ 20204 if ((q->q_first != NULL || connp->conn_draining) && 20205 (caller == IP_WPUT)) { 20206 ASSERT(!need_decref); 20207 (void) putq(q, mp); 20208 return; 20209 } 20210 20211 /* Multidata transmit? */ 20212 if (DB_TYPE(mp) == M_MULTIDATA) { 20213 /* 20214 * We should never get here, since all Multidata messages 20215 * originating from tcp should have been directed over to 20216 * tcp_multisend() in the first place. 20217 */ 20218 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20219 freemsg(mp); 20220 return; 20221 } else if (DB_TYPE(mp) != M_DATA) 20222 goto notdata; 20223 20224 if (mp->b_flag & MSGHASREF) { 20225 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20226 mp->b_flag &= ~MSGHASREF; 20227 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 20228 need_decref = B_TRUE; 20229 } 20230 ipha = (ipha_t *)mp->b_rptr; 20231 20232 /* is IP header non-aligned or mblk smaller than basic IP header */ 20233 #ifndef SAFETY_BEFORE_SPEED 20234 if (!OK_32PTR(rptr) || 20235 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 20236 goto hdrtoosmall; 20237 #endif 20238 20239 ASSERT(OK_32PTR(ipha)); 20240 20241 /* 20242 * This function assumes that mp points to an IPv4 packet. If it's the 20243 * wrong version, we'll catch it again in ip_output_v6. 20244 * 20245 * Note that this is *only* locally-generated output here, and never 20246 * forwarded data, and that we need to deal only with transports that 20247 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 20248 * label.) 20249 */ 20250 if (is_system_labeled() && 20251 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 20252 !connp->conn_ulp_labeled) { 20253 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20254 connp->conn_mac_exempt, ipst); 20255 ipha = (ipha_t *)mp->b_rptr; 20256 if (err != 0) { 20257 first_mp = mp; 20258 if (err == EINVAL) 20259 goto icmp_parameter_problem; 20260 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 20261 goto discard_pkt; 20262 } 20263 } 20264 20265 ASSERT(infop != NULL); 20266 20267 if (infop->ip_opt_flags & IP_VERIFY_SRC) { 20268 /* 20269 * IP_PKTINFO ancillary option is present. 20270 * IPCL_ZONEID is used to honor IP_ALLZONES option which 20271 * allows using address of any zone as the source address. 
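 *
 * For illustration only (user-level sketch, not part of this
 * file; requested_src is a hypothetical struct in_addr and
 * in_pktinfo is as declared in <netinet/in.h>), a sender
 * typically reaches this path with:
 *
 *	struct in_pktinfo pi;
 *	bzero(&pi, sizeof (pi));
 *	pi.ipi_spec_dst = requested_src;
 *	... attach pi as an IP_PKTINFO cmsg and sendmsg() ...
 *
 * in which case ip_opt_flags carries IP_VERIFY_SRC and the
 * lookup below verifies that the requested source is one of
 * our usable local addresses.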
20272 */ 20273 ire = ire_ctable_lookup(ipha->ipha_src, 0, 20274 (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), 20275 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 20276 if (ire == NULL) 20277 goto drop_pkt; 20278 ire_refrele(ire); 20279 ire = NULL; 20280 } 20281 20282 /* 20283 * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index 20284 * passed in IP_PKTINFO. 20285 */ 20286 if (infop->ip_opt_ill_index != 0 && 20287 connp->conn_outgoing_ill == NULL && 20288 connp->conn_nofailover_ill == NULL) { 20289 20290 xmit_ill = ill_lookup_on_ifindex( 20291 infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, 20292 ipst); 20293 20294 if (xmit_ill == NULL || IS_VNI(xmit_ill)) 20295 goto drop_pkt; 20296 /* 20297 * check that there is an ipif belonging 20298 * to our zone. IPCL_ZONEID is not used because 20299 * IP_ALLZONES option is valid only when the ill is 20300 * accessible from all zones i.e has a valid ipif in 20301 * all zones. 20302 */ 20303 if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { 20304 goto drop_pkt; 20305 } 20306 } 20307 20308 /* 20309 * If there is a policy, try to attach an ipsec_out in 20310 * the front. At the end, first_mp either points to a 20311 * M_DATA message or IPSEC_OUT message linked to a 20312 * M_DATA message. We have to do it now as we might 20313 * lose the "conn" if we go through ip_newroute. 20314 */ 20315 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 20316 if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, 20317 ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { 20318 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20319 if (need_decref) 20320 CONN_DEC_REF(connp); 20321 return; 20322 } else { 20323 ASSERT(mp->b_datap->db_type == M_CTL); 20324 first_mp = mp; 20325 mp = mp->b_cont; 20326 mctl_present = B_TRUE; 20327 } 20328 } else { 20329 first_mp = mp; 20330 mctl_present = B_FALSE; 20331 } 20332 20333 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20334 20335 /* is wrong version or IP options present */ 20336 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 20337 goto version_hdrlen_check; 20338 dst = ipha->ipha_dst; 20339 20340 if (connp->conn_nofailover_ill != NULL) { 20341 attach_ill = conn_get_held_ill(connp, 20342 &connp->conn_nofailover_ill, &err); 20343 if (err == ILL_LOOKUP_FAILED) { 20344 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20345 if (need_decref) 20346 CONN_DEC_REF(connp); 20347 freemsg(first_mp); 20348 return; 20349 } 20350 } 20351 20352 /* If IP_BOUND_IF has been set, use that ill. */ 20353 if (connp->conn_outgoing_ill != NULL) { 20354 xmit_ill = conn_get_held_ill(connp, 20355 &connp->conn_outgoing_ill, &err); 20356 if (err == ILL_LOOKUP_FAILED) 20357 goto drop_pkt; 20358 20359 goto send_from_ill; 20360 } 20361 20362 /* is packet multicast? */ 20363 if (CLASSD(dst)) 20364 goto multicast; 20365 20366 /* 20367 * If xmit_ill is set above due to index passed in ip_pkt_info. It 20368 * takes precedence over conn_dontroute and conn_nexthop_set 20369 */ 20370 if (xmit_ill != NULL) 20371 goto send_from_ill; 20372 20373 if (connp->conn_dontroute || connp->conn_nexthop_set) { 20374 /* 20375 * If the destination is a broadcast, local, or loopback 20376 * address, SO_DONTROUTE and IP_NEXTHOP go through the 20377 * standard path. 
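 *
 * (Illustrative: conn_dontroute is normally the result of the
 * application doing
 *
 *	int on = 1;
 *	(void) setsockopt(fd, SOL_SOCKET, SO_DONTROUTE,
 *	    &on, sizeof (on));
 *
 * while conn_nexthop_set/conn_nexthop_v4 come from the
 * IP_NEXTHOP socket option carrying the gateway's IPv4
 * address.)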
20378 */ 20379 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20380 if ((ire == NULL) || (ire->ire_type & 20381 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { 20382 if (ire != NULL) { 20383 ire_refrele(ire); 20384 /* No more access to ire */ 20385 ire = NULL; 20386 } 20387 /* 20388 * bypass routing checks and go directly to interface. 20389 */ 20390 if (connp->conn_dontroute) 20391 goto dontroute; 20392 20393 ASSERT(connp->conn_nexthop_set); 20394 ip_nexthop = B_TRUE; 20395 nexthop_addr = connp->conn_nexthop_v4; 20396 goto send_from_ill; 20397 } 20398 20399 /* Must be a broadcast, a loopback or a local ire */ 20400 ire_refrele(ire); 20401 /* No more access to ire */ 20402 ire = NULL; 20403 } 20404 20405 if (attach_ill != NULL) 20406 goto send_from_ill; 20407 20408 /* 20409 * We cache IRE_CACHEs to avoid lookups. We don't do 20410 * this for the tcp global queue and listen end point 20411 * as it does not really have a real destination to 20412 * talk to. This is also true for SCTP. 20413 */ 20414 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 20415 !connp->conn_fully_bound) { 20416 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20417 if (ire == NULL) 20418 goto noirefound; 20419 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20420 "ip_wput_end: q %p (%S)", q, "end"); 20421 20422 /* 20423 * Check if the ire has the RTF_MULTIRT flag, inherited 20424 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20425 */ 20426 if (ire->ire_flags & RTF_MULTIRT) { 20427 20428 /* 20429 * Force the TTL of multirouted packets if required. 20430 * The TTL of such packets is bounded by the 20431 * ip_multirt_ttl ndd variable. 20432 */ 20433 if ((ipst->ips_ip_multirt_ttl > 0) && 20434 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20435 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20436 "(was %d), dst 0x%08x\n", 20437 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20438 ntohl(ire->ire_addr))); 20439 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20440 } 20441 /* 20442 * We look at this point if there are pending 20443 * unresolved routes. ire_multirt_resolvable() 20444 * checks in O(n) that all IRE_OFFSUBNET ire 20445 * entries for the packet's destination and 20446 * flagged RTF_MULTIRT are currently resolved. 20447 * If some remain unresolved, we make a copy 20448 * of the current message. It will be used 20449 * to initiate additional route resolutions. 20450 */ 20451 multirt_need_resolve = 20452 ire_multirt_need_resolve(ire->ire_addr, 20453 MBLK_GETLABEL(first_mp), ipst); 20454 ip2dbg(("ip_wput[TCP]: ire %p, " 20455 "multirt_need_resolve %d, first_mp %p\n", 20456 (void *)ire, multirt_need_resolve, 20457 (void *)first_mp)); 20458 if (multirt_need_resolve) { 20459 copy_mp = copymsg(first_mp); 20460 if (copy_mp != NULL) { 20461 MULTIRT_DEBUG_TAG(copy_mp); 20462 } 20463 } 20464 } 20465 20466 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20467 20468 /* 20469 * Try to resolve another multiroute if 20470 * ire_multirt_need_resolve() deemed it necessary. 20471 */ 20472 if (copy_mp != NULL) 20473 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20474 if (need_decref) 20475 CONN_DEC_REF(connp); 20476 return; 20477 } 20478 20479 /* 20480 * Access to conn_ire_cache. (protected by conn_lock) 20481 * 20482 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 20483 * the ire bucket lock here to check for CONDEMNED as it is okay to 20484 * send a packet or two with the IRE_CACHE that is going away. 
20485 * Access to the ire requires an ire refhold on the ire prior to 20486 * its use since an interface unplumb thread may delete the cached 20487 * ire and release the refhold at any time. 20488 * 20489 * Caching an ire in the conn_ire_cache 20490 * 20491 * o Caching an ire pointer in the conn requires a strict check for 20492 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 20493 * ires before cleaning up the conns. So the caching of an ire pointer 20494 * in the conn is done after making sure under the bucket lock that the 20495 * ire has not yet been marked CONDEMNED. Otherwise we will end up 20496 * caching an ire after the unplumb thread has cleaned up the conn. 20497 * If the conn does not send a packet subsequently the unplumb thread 20498 * will be hanging waiting for the ire count to drop to zero. 20499 * 20500 * o We also need to atomically test for a null conn_ire_cache and 20501 * set the conn_ire_cache under the the protection of the conn_lock 20502 * to avoid races among concurrent threads trying to simultaneously 20503 * cache an ire in the conn_ire_cache. 20504 */ 20505 mutex_enter(&connp->conn_lock); 20506 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 20507 20508 if (ire != NULL && ire->ire_addr == dst && 20509 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20510 20511 IRE_REFHOLD(ire); 20512 mutex_exit(&connp->conn_lock); 20513 20514 } else { 20515 boolean_t cached = B_FALSE; 20516 connp->conn_ire_cache = NULL; 20517 mutex_exit(&connp->conn_lock); 20518 /* Release the old ire */ 20519 if (ire != NULL && sctp_ire == NULL) 20520 IRE_REFRELE_NOTR(ire); 20521 20522 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20523 if (ire == NULL) 20524 goto noirefound; 20525 IRE_REFHOLD_NOTR(ire); 20526 20527 mutex_enter(&connp->conn_lock); 20528 if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) { 20529 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20530 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20531 if (connp->conn_ulp == IPPROTO_TCP) 20532 TCP_CHECK_IREINFO(connp->conn_tcp, ire); 20533 connp->conn_ire_cache = ire; 20534 cached = B_TRUE; 20535 } 20536 rw_exit(&ire->ire_bucket->irb_lock); 20537 } 20538 mutex_exit(&connp->conn_lock); 20539 20540 /* 20541 * We can continue to use the ire but since it was 20542 * not cached, we should drop the extra reference. 20543 */ 20544 if (!cached) 20545 IRE_REFRELE_NOTR(ire); 20546 } 20547 20548 20549 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20550 "ip_wput_end: q %p (%S)", q, "end"); 20551 20552 /* 20553 * Check if the ire has the RTF_MULTIRT flag, inherited 20554 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20555 */ 20556 if (ire->ire_flags & RTF_MULTIRT) { 20557 20558 /* 20559 * Force the TTL of multirouted packets if required. 20560 * The TTL of such packets is bounded by the 20561 * ip_multirt_ttl ndd variable. 20562 */ 20563 if ((ipst->ips_ip_multirt_ttl > 0) && 20564 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20565 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20566 "(was %d), dst 0x%08x\n", 20567 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20568 ntohl(ire->ire_addr))); 20569 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20570 } 20571 20572 /* 20573 * At this point, we check to see if there are any pending 20574 * unresolved routes. ire_multirt_resolvable() 20575 * checks in O(n) that all IRE_OFFSUBNET ire 20576 * entries for the packet's destination and 20577 * flagged RTF_MULTIRT are currently resolved. 
* If some remain unresolved, we make a copy 20579 * of the current message. It will be used 20580 * to initiate additional route resolutions. 20581 */ 20582 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 20583 MBLK_GETLABEL(first_mp), ipst); 20584 ip2dbg(("ip_wput[not TCP]: ire %p, " 20585 "multirt_need_resolve %d, first_mp %p\n", 20586 (void *)ire, multirt_need_resolve, (void *)first_mp)); 20587 if (multirt_need_resolve) { 20588 copy_mp = copymsg(first_mp); 20589 if (copy_mp != NULL) { 20590 MULTIRT_DEBUG_TAG(copy_mp); 20591 } 20592 } 20593 } 20594 20595 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20596 20597 /* 20598 * Try to resolve another multiroute if 20599 * ire_multirt_need_resolve() deemed it necessary. 20600 */ 20601 if (copy_mp != NULL) 20602 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20603 if (need_decref) 20604 CONN_DEC_REF(connp); 20605 return; 20606 20607 qnext: 20608 /* 20609 * Upper Level Protocols pass down complete IP datagrams 20610 * as M_DATA messages. Everything else is a sideshow. 20611 * 20612 * 1) We could be re-entering ip_wput because of ip_newroute, 20613 * in which case we could have an IPSEC_OUT message. We 20614 * need to pass through ip_wput like other datagrams and 20615 * hence cannot branch to ip_wput_nondata. 20616 * 20617 * 2) ARP, AH, ESP, and other clients that are on the module 20618 * instance of the IP stream give us something to deal with. 20619 * We handle AH and ESP here and the rest in ip_wput_nondata. 20620 * 20621 * 3) ICMP replies could also come here. 20622 */ 20623 ipst = ILLQ_TO_IPST(q); 20624 20625 if (DB_TYPE(mp) != M_DATA) { 20626 notdata: 20627 if (DB_TYPE(mp) == M_CTL) { 20628 /* 20629 * M_CTL messages are used by ARP, AH and ESP to 20630 * communicate with IP. We deal with IPSEC_IN and 20631 * IPSEC_OUT here. ip_wput_nondata handles other 20632 * cases. 20633 */ 20634 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 20635 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 20636 first_mp = mp->b_cont; 20637 first_mp->b_flag &= ~MSGHASREF; 20638 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20639 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 20640 CONN_DEC_REF(connp); 20641 connp = NULL; 20642 } 20643 if (ii->ipsec_info_type == IPSEC_IN) { 20644 /* 20645 * Either this message goes back to 20646 * IPsec for further processing or to 20647 * ULP after policy checks. 20648 */ 20649 ip_fanout_proto_again(mp, NULL, NULL, NULL); 20650 return; 20651 } else if (ii->ipsec_info_type == IPSEC_OUT) { 20652 io = (ipsec_out_t *)ii; 20653 if (io->ipsec_out_proc_begin) { 20654 /* 20655 * IPsec processing has already started. 20656 * Complete it. 20657 * IPQoS notes: We don't care what is 20658 * in ipsec_out_ill_index since this 20659 * won't be processed for IPQoS policies 20660 * in ipsec_out_process. 20661 */ 20662 ipsec_out_process(q, mp, NULL, 20663 io->ipsec_out_ill_index); 20664 return; 20665 } else { 20666 connp = (q->q_next != NULL) ? 20667 NULL : Q_TO_CONN(q); 20668 first_mp = mp; 20669 mp = mp->b_cont; 20670 mctl_present = B_TRUE; 20671 } 20672 zoneid = io->ipsec_out_zoneid; 20673 ASSERT(zoneid != ALL_ZONES); 20674 } else if (ii->ipsec_info_type == IPSEC_CTL) { 20675 /* 20676 * It's an IPsec control message requesting 20677 * an SADB update to be sent to the IPsec 20678 * hardware acceleration capable ills.
20679 */ 20680 ipsec_ctl_t *ipsec_ctl = 20681 (ipsec_ctl_t *)mp->b_rptr; 20682 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 20683 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 20684 mblk_t *cmp = mp->b_cont; 20685 20686 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 20687 ASSERT(cmp != NULL); 20688 20689 freeb(mp); 20690 ill_ipsec_capab_send_all(satype, cmp, sa, 20691 ipst->ips_netstack); 20692 return; 20693 } else { 20694 /* 20695 * This must be ARP or special TSOL signaling. 20696 */ 20697 ip_wput_nondata(NULL, q, mp, NULL); 20698 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20699 "ip_wput_end: q %p (%S)", q, "nondata"); 20700 return; 20701 } 20702 } else { 20703 /* 20704 * This must be non-(ARP/AH/ESP) messages. 20705 */ 20706 ASSERT(!need_decref); 20707 ip_wput_nondata(NULL, q, mp, NULL); 20708 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20709 "ip_wput_end: q %p (%S)", q, "nondata"); 20710 return; 20711 } 20712 } else { 20713 first_mp = mp; 20714 mctl_present = B_FALSE; 20715 } 20716 20717 ASSERT(first_mp != NULL); 20718 /* 20719 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 20720 * to make sure that this packet goes out on the same interface it 20721 * came in. We handle that here. 20722 */ 20723 if (mctl_present) { 20724 uint_t ifindex; 20725 20726 io = (ipsec_out_t *)first_mp->b_rptr; 20727 if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { 20728 /* 20729 * We may have lost the conn context if we are 20730 * coming here from ip_newroute(). Copy the 20731 * nexthop information. 20732 */ 20733 if (io->ipsec_out_ip_nexthop) { 20734 ip_nexthop = B_TRUE; 20735 nexthop_addr = io->ipsec_out_nexthop_addr; 20736 20737 ipha = (ipha_t *)mp->b_rptr; 20738 dst = ipha->ipha_dst; 20739 goto send_from_ill; 20740 } else { 20741 ASSERT(io->ipsec_out_ill_index != 0); 20742 ifindex = io->ipsec_out_ill_index; 20743 attach_ill = ill_lookup_on_ifindex(ifindex, 20744 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20745 if (attach_ill == NULL) { 20746 ASSERT(xmit_ill == NULL); 20747 ip1dbg(("ip_output: bad ifindex for " 20748 "(BIND TO IPIF_NOFAILOVER) %d\n", 20749 ifindex)); 20750 freemsg(first_mp); 20751 BUMP_MIB(&ipst->ips_ip_mib, 20752 ipIfStatsOutDiscards); 20753 ASSERT(!need_decref); 20754 return; 20755 } 20756 } 20757 } 20758 } 20759 20760 ASSERT(xmit_ill == NULL); 20761 20762 /* We have a complete IP datagram heading outbound. */ 20763 ipha = (ipha_t *)mp->b_rptr; 20764 20765 #ifndef SPEED_BEFORE_SAFETY 20766 /* 20767 * Make sure we have a full-word aligned message and that at least 20768 * a simple IP header is accessible in the first message. If not, 20769 * try a pullup. For labeled systems we need to always take this 20770 * path as M_CTLs are "notdata" but have trailing data to process. 20771 */ 20772 if (!OK_32PTR(rptr) || 20773 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { 20774 hdrtoosmall: 20775 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 20776 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20777 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 20778 if (first_mp == NULL) 20779 first_mp = mp; 20780 goto discard_pkt; 20781 } 20782 20783 /* This function assumes that mp points to an IPv4 packet. 
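 * Non-IPv4 datagrams are caught in version_hdrlen_check below:
 * IPv6 is siphoned off to ip_output_v6() (after bumping
 * ipIfStatsOutWrongIPVersion), and any other version is
 * discarded (traced as "badvers").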
*/ 20784 if (is_system_labeled() && q->q_next == NULL && 20785 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 20786 !connp->conn_ulp_labeled) { 20787 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20788 connp->conn_mac_exempt, ipst); 20789 ipha = (ipha_t *)mp->b_rptr; 20790 if (first_mp != NULL) 20791 first_mp->b_cont = mp; 20792 if (err != 0) { 20793 if (first_mp == NULL) 20794 first_mp = mp; 20795 if (err == EINVAL) 20796 goto icmp_parameter_problem; 20797 ip2dbg(("ip_wput: label check failed (%d)\n", 20798 err)); 20799 goto discard_pkt; 20800 } 20801 } 20802 20803 ipha = (ipha_t *)mp->b_rptr; 20804 if (first_mp == NULL) { 20805 ASSERT(attach_ill == NULL && xmit_ill == NULL); 20806 /* 20807 * If we got here because of "goto hdrtoosmall" 20808 * We need to attach a IPSEC_OUT. 20809 */ 20810 if (connp->conn_out_enforce_policy) { 20811 if (((mp = ipsec_attach_ipsec_out(&mp, connp, 20812 NULL, ipha->ipha_protocol, 20813 ipst->ips_netstack)) == NULL)) { 20814 BUMP_MIB(&ipst->ips_ip_mib, 20815 ipIfStatsOutDiscards); 20816 if (need_decref) 20817 CONN_DEC_REF(connp); 20818 return; 20819 } else { 20820 ASSERT(mp->b_datap->db_type == M_CTL); 20821 first_mp = mp; 20822 mp = mp->b_cont; 20823 mctl_present = B_TRUE; 20824 } 20825 } else { 20826 first_mp = mp; 20827 mctl_present = B_FALSE; 20828 } 20829 } 20830 } 20831 #endif 20832 20833 /* Most of the code below is written for speed, not readability */ 20834 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20835 20836 /* 20837 * If ip_newroute() fails, we're going to need a full 20838 * header for the icmp wraparound. 20839 */ 20840 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 20841 uint_t v_hlen; 20842 version_hdrlen_check: 20843 ASSERT(first_mp != NULL); 20844 v_hlen = V_HLEN; 20845 /* 20846 * siphon off IPv6 packets coming down from transport 20847 * layer modules here. 20848 * Note: high-order bit carries NUD reachability confirmation 20849 */ 20850 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 20851 /* 20852 * FIXME: assume that callers of ip_output* call 20853 * the right version? 20854 */ 20855 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); 20856 ASSERT(xmit_ill == NULL); 20857 if (attach_ill != NULL) 20858 ill_refrele(attach_ill); 20859 if (need_decref) 20860 mp->b_flag |= MSGHASREF; 20861 (void) ip_output_v6(arg, first_mp, arg2, caller); 20862 return; 20863 } 20864 20865 if ((v_hlen >> 4) != IP_VERSION) { 20866 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20867 "ip_wput_end: q %p (%S)", q, "badvers"); 20868 goto discard_pkt; 20869 } 20870 /* 20871 * Is the header length at least 20 bytes? 20872 * 20873 * Are there enough bytes accessible in the header? If 20874 * not, try a pullup. 
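 *
 * Worked example: ipha_version_and_hdr_length == 0x46 encodes
 * version 4 with a header-length field of 6, i.e. a
 * (0x46 & 0xF) << 2 = 24 byte header carrying 4 bytes of
 * options; if fewer than 24 bytes are contiguous in this
 * mblk, the pullupmsg() below makes them so.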
20875 */ 20876 v_hlen &= 0xF; 20877 v_hlen <<= 2; 20878 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 20879 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20880 "ip_wput_end: q %p (%S)", q, "badlen"); 20881 goto discard_pkt; 20882 } 20883 if (v_hlen > (mp->b_wptr - rptr)) { 20884 if (!pullupmsg(mp, v_hlen)) { 20885 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20886 "ip_wput_end: q %p (%S)", q, "badpullup2"); 20887 goto discard_pkt; 20888 } 20889 ipha = (ipha_t *)mp->b_rptr; 20890 } 20891 /* 20892 * Move first entry from any source route into ipha_dst and 20893 * verify the options 20894 */ 20895 if (ip_wput_options(q, first_mp, ipha, mctl_present, 20896 zoneid, ipst)) { 20897 ASSERT(xmit_ill == NULL); 20898 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20899 if (attach_ill != NULL) 20900 ill_refrele(attach_ill); 20901 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20902 "ip_wput_end: q %p (%S)", q, "badopts"); 20903 if (need_decref) 20904 CONN_DEC_REF(connp); 20905 return; 20906 } 20907 } 20908 dst = ipha->ipha_dst; 20909 20910 /* 20911 * Try to get an IRE_CACHE for the destination address. If we can't, 20912 * we have to run the packet through ip_newroute which will take 20913 * the appropriate action to arrange for an IRE_CACHE, such as querying 20914 * a resolver, or assigning a default gateway, etc. 20915 */ 20916 if (CLASSD(dst)) { 20917 ipif_t *ipif; 20918 uint32_t setsrc = 0; 20919 20920 multicast: 20921 ASSERT(first_mp != NULL); 20922 ip2dbg(("ip_wput: CLASSD\n")); 20923 if (connp == NULL) { 20924 /* 20925 * Use the first good ipif on the ill. 20926 * XXX Should this ever happen? (Appears 20927 * to show up with just ppp and no ethernet due 20928 * to in.rdisc.) 20929 * However, ire_send should be able to 20930 * call ip_wput_ire directly. 20931 * 20932 * XXX Also, this can happen for ICMP and other packets 20933 * with multicast source addresses. Perhaps we should 20934 * fix things so that we drop the packet in question, 20935 * but for now, just run with it. 20936 */ 20937 ill_t *ill = (ill_t *)q->q_ptr; 20938 20939 /* 20940 * Don't honor attach_if for this case. If ill 20941 * is part of the group, ipif could belong to 20942 * any ill and we cannot maintain attach_ill 20943 * and ipif_ill same anymore and the assert 20944 * below would fail. 20945 */ 20946 if (mctl_present && io->ipsec_out_attach_if) { 20947 io->ipsec_out_ill_index = 0; 20948 io->ipsec_out_attach_if = B_FALSE; 20949 ASSERT(attach_ill != NULL); 20950 ill_refrele(attach_ill); 20951 attach_ill = NULL; 20952 } 20953 20954 ASSERT(attach_ill == NULL); 20955 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 20956 if (ipif == NULL) { 20957 if (need_decref) 20958 CONN_DEC_REF(connp); 20959 freemsg(first_mp); 20960 return; 20961 } 20962 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 20963 ntohl(dst), ill->ill_name)); 20964 } else { 20965 /* 20966 * The order of precedence is IP_BOUND_IF, IP_PKTINFO 20967 * and IP_MULTICAST_IF. The block comment above this 20968 * function explains the locking mechanism used here. 
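 *
 * (Illustrative, user-level sketch: conn_multicast_ipif is
 * usually established beforehand by
 *
 *	struct in_addr ifaddr;
 *	ifaddr.s_addr = outgoing_if_addr;
 *	(void) setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF,
 *	    &ifaddr, sizeof (ifaddr));
 *
 * where outgoing_if_addr is a hypothetical interface address;
 * IP_BOUND_IF and IP_PKTINFO select an ill by index and, as
 * noted, win when both are set.)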
20969 */ 20970 if (xmit_ill == NULL) { 20971 xmit_ill = conn_get_held_ill(connp, 20972 &connp->conn_outgoing_ill, &err); 20973 if (err == ILL_LOOKUP_FAILED) { 20974 ip1dbg(("ip_wput: No ill for " 20975 "IP_BOUND_IF\n")); 20976 BUMP_MIB(&ipst->ips_ip_mib, 20977 ipIfStatsOutNoRoutes); 20978 goto drop_pkt; 20979 } 20980 } 20981 20982 if (xmit_ill == NULL) { 20983 ipif = conn_get_held_ipif(connp, 20984 &connp->conn_multicast_ipif, &err); 20985 if (err == IPIF_LOOKUP_FAILED) { 20986 ip1dbg(("ip_wput: No ipif for " 20987 "multicast\n")); 20988 BUMP_MIB(&ipst->ips_ip_mib, 20989 ipIfStatsOutNoRoutes); 20990 goto drop_pkt; 20991 } 20992 } 20993 if (xmit_ill != NULL) { 20994 ipif = ipif_get_next_ipif(NULL, xmit_ill); 20995 if (ipif == NULL) { 20996 ip1dbg(("ip_wput: No ipif for " 20997 "xmit_ill\n")); 20998 BUMP_MIB(&ipst->ips_ip_mib, 20999 ipIfStatsOutNoRoutes); 21000 goto drop_pkt; 21001 } 21002 } else if (ipif == NULL || ipif->ipif_isv6) { 21003 /* 21004 * We must do this ipif determination here 21005 * else we could pass through ip_newroute 21006 * and come back here without the conn context. 21007 * 21008 * Note: we do late binding i.e. we bind to 21009 * the interface when the first packet is sent. 21010 * For performance reasons we do not rebind on 21011 * each packet but keep the binding until the 21012 * next IP_MULTICAST_IF option. 21013 * 21014 * conn_multicast_{ipif,ill} are shared between 21015 * IPv4 and IPv6 and AF_INET6 sockets can 21016 * send both IPv4 and IPv6 packets. Hence 21017 * we have to check that "isv6" matches above. 21018 */ 21019 if (ipif != NULL) 21020 ipif_refrele(ipif); 21021 ipif = ipif_lookup_group(dst, zoneid, ipst); 21022 if (ipif == NULL) { 21023 ip1dbg(("ip_wput: No ipif for " 21024 "multicast\n")); 21025 BUMP_MIB(&ipst->ips_ip_mib, 21026 ipIfStatsOutNoRoutes); 21027 goto drop_pkt; 21028 } 21029 err = conn_set_held_ipif(connp, 21030 &connp->conn_multicast_ipif, ipif); 21031 if (err == IPIF_LOOKUP_FAILED) { 21032 ipif_refrele(ipif); 21033 ip1dbg(("ip_wput: No ipif for " 21034 "multicast\n")); 21035 BUMP_MIB(&ipst->ips_ip_mib, 21036 ipIfStatsOutNoRoutes); 21037 goto drop_pkt; 21038 } 21039 } 21040 } 21041 ASSERT(!ipif->ipif_isv6); 21042 /* 21043 * As we may lose the conn by the time we reach ip_wput_ire, 21044 * we copy conn_multicast_loop and conn_dontroute on to an 21045 * ipsec_out. In case if this datagram goes out secure, 21046 * we need the ill_index also. Copy that also into the 21047 * ipsec_out. 
*/ 21049 if (mctl_present) { 21050 io = (ipsec_out_t *)first_mp->b_rptr; 21051 ASSERT(first_mp->b_datap->db_type == M_CTL); 21052 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21053 } else { 21054 ASSERT(mp == first_mp); 21055 if ((first_mp = allocb(sizeof (ipsec_info_t), 21056 BPRI_HI)) == NULL) { 21057 ipif_refrele(ipif); 21058 first_mp = mp; 21059 goto discard_pkt; 21060 } 21061 first_mp->b_datap->db_type = M_CTL; 21062 first_mp->b_wptr += sizeof (ipsec_info_t); 21063 /* ipsec_out_secure is B_FALSE now */ 21064 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 21065 io = (ipsec_out_t *)first_mp->b_rptr; 21066 io->ipsec_out_type = IPSEC_OUT; 21067 io->ipsec_out_len = sizeof (ipsec_out_t); 21068 io->ipsec_out_use_global_policy = B_TRUE; 21069 io->ipsec_out_ns = ipst->ips_netstack; 21070 first_mp->b_cont = mp; 21071 mctl_present = B_TRUE; 21072 } 21073 if (attach_ill != NULL) { 21074 ASSERT(attach_ill == ipif->ipif_ill); 21075 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21076 21077 /* 21078 * Check if we need an ire that will not be 21079 * looked up by anybody else i.e. HIDDEN. 21080 */ 21081 if (ill_is_probeonly(attach_ill)) { 21082 match_flags |= MATCH_IRE_MARK_HIDDEN; 21083 } 21084 io->ipsec_out_ill_index = 21085 attach_ill->ill_phyint->phyint_ifindex; 21086 io->ipsec_out_attach_if = B_TRUE; 21087 } else { 21088 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21089 io->ipsec_out_ill_index = 21090 ipif->ipif_ill->ill_phyint->phyint_ifindex; 21091 } 21092 if (connp != NULL) { 21093 io->ipsec_out_multicast_loop = 21094 connp->conn_multicast_loop; 21095 io->ipsec_out_dontroute = connp->conn_dontroute; 21096 io->ipsec_out_zoneid = connp->conn_zoneid; 21097 } 21098 /* 21099 * If the application uses IP_MULTICAST_IF with 21100 * different logical addresses of the same ILL, we 21101 * need to make sure that the source address of 21102 * the packet matches the logical IP address used 21103 * in the option. We do it by initializing ipha_src 21104 * here. This also keeps IPsec happy because, 21105 * when we return from IPsec processing, we don't 21106 * have to worry about getting the right address on 21107 * the packet. Thus it is sufficient to look for 21108 * IRE_CACHE using MATCH_IRE_ILL rather than 21109 * MATCH_IRE_IPIF. 21110 * 21111 * NOTE : We need to do this for the non-secure case also, as 21112 * this might go out secure if there is a global policy 21113 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 21114 * address, the source should be initialized already and 21115 * hence we won't be initializing here. 21116 * 21117 * As we do not have the ire yet, it is possible that 21118 * we set the source address here and then later discover 21119 * that the ire implies the source address to be assigned 21120 * through the RTF_SETSRC flag. 21121 * In that case, the setsrc variable will remind us 21122 * that overwriting the source address with the one 21123 * from the RTF_SETSRC-flagged ire is allowed. 21124 */ 21125 if (ipha->ipha_src == INADDR_ANY && 21126 (connp == NULL || !connp->conn_unspec_src)) { 21127 ipha->ipha_src = ipif->ipif_src_addr; 21128 setsrc = RTF_SETSRC; 21129 } 21130 /* 21131 * Find an IRE which matches the destination and the outgoing 21132 * queue (i.e. the outgoing interface.) 21133 * For loopback use a unicast IP address for 21134 * the ire lookup. 21135 */ 21136 if (IS_LOOPBACK(ipif->ipif_ill)) 21137 dst = ipif->ipif_lcl_addr; 21138 21139 /* 21140 * If xmit_ill is set, we branch out to ip_newroute_ipif.
21141 * We don't need to lookup ire in ctable as the packet 21142 * needs to be sent to the destination through the specified 21143 * ill irrespective of ires in the cache table. 21144 */ 21145 ire = NULL; 21146 if (xmit_ill == NULL) { 21147 ire = ire_ctable_lookup(dst, 0, 0, ipif, 21148 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21149 } 21150 21151 /* 21152 * refrele attach_ill as its not needed anymore. 21153 */ 21154 if (attach_ill != NULL) { 21155 ill_refrele(attach_ill); 21156 attach_ill = NULL; 21157 } 21158 21159 if (ire == NULL) { 21160 /* 21161 * Multicast loopback and multicast forwarding is 21162 * done in ip_wput_ire. 21163 * 21164 * Mark this packet to make it be delivered to 21165 * ip_wput_ire after the new ire has been 21166 * created. 21167 * 21168 * The call to ip_newroute_ipif takes into account 21169 * the setsrc reminder. In any case, we take care 21170 * of the RTF_MULTIRT flag. 21171 */ 21172 mp->b_prev = mp->b_next = NULL; 21173 if (xmit_ill == NULL || 21174 xmit_ill->ill_ipif_up_count > 0) { 21175 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 21176 setsrc | RTF_MULTIRT, zoneid, infop); 21177 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21178 "ip_wput_end: q %p (%S)", q, "noire"); 21179 } else { 21180 freemsg(first_mp); 21181 } 21182 ipif_refrele(ipif); 21183 if (xmit_ill != NULL) 21184 ill_refrele(xmit_ill); 21185 if (need_decref) 21186 CONN_DEC_REF(connp); 21187 return; 21188 } 21189 21190 ipif_refrele(ipif); 21191 ipif = NULL; 21192 ASSERT(xmit_ill == NULL); 21193 21194 /* 21195 * Honor the RTF_SETSRC flag for multicast packets, 21196 * if allowed by the setsrc reminder. 21197 */ 21198 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 21199 ipha->ipha_src = ire->ire_src_addr; 21200 } 21201 21202 /* 21203 * Unconditionally force the TTL to 1 for 21204 * multirouted multicast packets: 21205 * multirouted multicast should not cross 21206 * multicast routers. 21207 */ 21208 if (ire->ire_flags & RTF_MULTIRT) { 21209 if (ipha->ipha_ttl > 1) { 21210 ip2dbg(("ip_wput: forcing multicast " 21211 "multirt TTL to 1 (was %d), dst 0x%08x\n", 21212 ipha->ipha_ttl, ntohl(ire->ire_addr))); 21213 ipha->ipha_ttl = 1; 21214 } 21215 } 21216 } else { 21217 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 21218 if ((ire != NULL) && (ire->ire_type & 21219 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 21220 ignore_dontroute = B_TRUE; 21221 ignore_nexthop = B_TRUE; 21222 } 21223 if (ire != NULL) { 21224 ire_refrele(ire); 21225 ire = NULL; 21226 } 21227 /* 21228 * Guard against coming in from arp in which case conn is NULL. 21229 * Also guard against non M_DATA with dontroute set but 21230 * destined to local, loopback or broadcast addresses. 21231 */ 21232 if (connp != NULL && connp->conn_dontroute && 21233 !ignore_dontroute) { 21234 dontroute: 21235 /* 21236 * Set TTL to 1 if SO_DONTROUTE is set to prevent 21237 * routing protocols from seeing false direct 21238 * connectivity. 
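 * With a TTL of 1, the first off-link router must decrement the
 * TTL to zero and discard the datagram (returning an ICMP time
 * exceeded) rather than forward it, so a misdirected
 * SO_DONTROUTE packet cannot travel beyond the local link and
 * suggest direct reachability that does not exist.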
21239 */ 21240 ipha->ipha_ttl = 1; 21241 21242 /* If suitable ipif not found, drop packet */ 21243 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); 21244 if (dst_ipif == NULL) { 21245 noroute: 21246 ip1dbg(("ip_wput: no route for dst using" 21247 " SO_DONTROUTE\n")); 21248 BUMP_MIB(&ipst->ips_ip_mib, 21249 ipIfStatsOutNoRoutes); 21250 mp->b_prev = mp->b_next = NULL; 21251 if (first_mp == NULL) 21252 first_mp = mp; 21253 goto drop_pkt; 21254 } else { 21255 /* 21256 * If suitable ipif has been found, set 21257 * xmit_ill to the corresponding 21258 * ipif_ill because we'll be using the 21259 * send_from_ill logic below. 21260 */ 21261 ASSERT(xmit_ill == NULL); 21262 xmit_ill = dst_ipif->ipif_ill; 21263 mutex_enter(&xmit_ill->ill_lock); 21264 if (!ILL_CAN_LOOKUP(xmit_ill)) { 21265 mutex_exit(&xmit_ill->ill_lock); 21266 xmit_ill = NULL; 21267 ipif_refrele(dst_ipif); 21268 goto noroute; 21269 } 21270 ill_refhold_locked(xmit_ill); 21271 mutex_exit(&xmit_ill->ill_lock); 21272 ipif_refrele(dst_ipif); 21273 } 21274 } 21275 /* 21276 * If we are bound to IPIF_NOFAILOVER address, look for 21277 * an IRE_CACHE matching the ill. 21278 */ 21279 send_from_ill: 21280 if (attach_ill != NULL) { 21281 ipif_t *attach_ipif; 21282 21283 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21284 21285 /* 21286 * Check if we need an ire that will not be 21287 * looked up by anybody else i.e. HIDDEN. 21288 */ 21289 if (ill_is_probeonly(attach_ill)) { 21290 match_flags |= MATCH_IRE_MARK_HIDDEN; 21291 } 21292 21293 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 21294 if (attach_ipif == NULL) { 21295 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 21296 goto discard_pkt; 21297 } 21298 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 21299 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21300 ipif_refrele(attach_ipif); 21301 } else if (xmit_ill != NULL) { 21302 ipif_t *ipif; 21303 21304 /* 21305 * Mark this packet as originated locally 21306 */ 21307 mp->b_prev = mp->b_next = NULL; 21308 21309 /* 21310 * Could be SO_DONTROUTE case also. 21311 * Verify that at least one ipif is up on the ill. 21312 */ 21313 if (xmit_ill->ill_ipif_up_count == 0) { 21314 ip1dbg(("ip_output: xmit_ill %s is down\n", 21315 xmit_ill->ill_name)); 21316 goto drop_pkt; 21317 } 21318 21319 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21320 if (ipif == NULL) { 21321 ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", 21322 xmit_ill->ill_name)); 21323 goto drop_pkt; 21324 } 21325 21326 /* 21327 * Look for a ire that is part of the group, 21328 * if found use it else call ip_newroute_ipif. 21329 * IPCL_ZONEID is not used for matching because 21330 * IP_ALLZONES option is valid only when the 21331 * ill is accessible from all zones i.e has a 21332 * valid ipif in all zones. 21333 */ 21334 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21335 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 21336 MBLK_GETLABEL(mp), match_flags, ipst); 21337 /* 21338 * If an ire exists use it or else create 21339 * an ire but don't add it to the cache. 21340 * Adding an ire may cause issues with 21341 * asymmetric routing. 21342 * In case of multiroute always act as if 21343 * ire does not exist. 
21344 */ 21345 if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { 21346 if (ire != NULL) 21347 ire_refrele(ire); 21348 ip_newroute_ipif(q, first_mp, ipif, 21349 dst, connp, 0, zoneid, infop); 21350 ipif_refrele(ipif); 21351 ip1dbg(("ip_output: xmit_ill via %s\n", 21352 xmit_ill->ill_name)); 21353 ill_refrele(xmit_ill); 21354 if (need_decref) 21355 CONN_DEC_REF(connp); 21356 return; 21357 } 21358 ipif_refrele(ipif); 21359 } else if (ip_nexthop || (connp != NULL && 21360 (connp->conn_nexthop_set)) && !ignore_nexthop) { 21361 if (!ip_nexthop) { 21362 ip_nexthop = B_TRUE; 21363 nexthop_addr = connp->conn_nexthop_v4; 21364 } 21365 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 21366 MATCH_IRE_GW; 21367 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 21368 NULL, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21369 } else { 21370 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), 21371 ipst); 21372 } 21373 if (!ire) { 21374 /* 21375 * Make sure we don't load spread if this 21376 * is IPIF_NOFAILOVER case. 21377 */ 21378 if ((attach_ill != NULL) || 21379 (ip_nexthop && !ignore_nexthop)) { 21380 if (mctl_present) { 21381 io = (ipsec_out_t *)first_mp->b_rptr; 21382 ASSERT(first_mp->b_datap->db_type == 21383 M_CTL); 21384 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21385 } else { 21386 ASSERT(mp == first_mp); 21387 first_mp = allocb( 21388 sizeof (ipsec_info_t), BPRI_HI); 21389 if (first_mp == NULL) { 21390 first_mp = mp; 21391 goto discard_pkt; 21392 } 21393 first_mp->b_datap->db_type = M_CTL; 21394 first_mp->b_wptr += 21395 sizeof (ipsec_info_t); 21396 /* ipsec_out_secure is B_FALSE now */ 21397 bzero(first_mp->b_rptr, 21398 sizeof (ipsec_info_t)); 21399 io = (ipsec_out_t *)first_mp->b_rptr; 21400 io->ipsec_out_type = IPSEC_OUT; 21401 io->ipsec_out_len = 21402 sizeof (ipsec_out_t); 21403 io->ipsec_out_use_global_policy = 21404 B_TRUE; 21405 io->ipsec_out_ns = ipst->ips_netstack; 21406 first_mp->b_cont = mp; 21407 mctl_present = B_TRUE; 21408 } 21409 if (attach_ill != NULL) { 21410 io->ipsec_out_ill_index = attach_ill-> 21411 ill_phyint->phyint_ifindex; 21412 io->ipsec_out_attach_if = B_TRUE; 21413 } else { 21414 io->ipsec_out_ip_nexthop = ip_nexthop; 21415 io->ipsec_out_nexthop_addr = 21416 nexthop_addr; 21417 } 21418 } 21419 noirefound: 21420 /* 21421 * Mark this packet as having originated on 21422 * this machine. This will be noted in 21423 * ire_add_then_send, which needs to know 21424 * whether to run it back through ip_wput or 21425 * ip_rput following successful resolution. 21426 */ 21427 mp->b_prev = NULL; 21428 mp->b_next = NULL; 21429 ip_newroute(q, first_mp, dst, connp, zoneid, ipst); 21430 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21431 "ip_wput_end: q %p (%S)", q, "newroute"); 21432 if (attach_ill != NULL) 21433 ill_refrele(attach_ill); 21434 if (xmit_ill != NULL) 21435 ill_refrele(xmit_ill); 21436 if (need_decref) 21437 CONN_DEC_REF(connp); 21438 return; 21439 } 21440 } 21441 21442 /* We now know where we are going with it. */ 21443 21444 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21445 "ip_wput_end: q %p (%S)", q, "end"); 21446 21447 /* 21448 * Check if the ire has the RTF_MULTIRT flag, inherited 21449 * from an IRE_OFFSUBNET ire entry in ip_newroute. 21450 */ 21451 if (ire->ire_flags & RTF_MULTIRT) { 21452 /* 21453 * Force the TTL of multirouted packets if required. 21454 * The TTL of such packets is bounded by the 21455 * ip_multirt_ttl ndd variable. 
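 * (Illustrative: the bound can be tuned at run time with,
 * e.g.,
 *
 *	ndd -set /dev/ip ip_multirt_ttl 32
 *
 * and a value of 0, as the check below shows, leaves the
 * packet's TTL alone.)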
21456 */ 21457 if ((ipst->ips_ip_multirt_ttl > 0) && 21458 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 21459 ip2dbg(("ip_wput: forcing multirt TTL to %d " 21460 "(was %d), dst 0x%08x\n", 21461 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 21462 ntohl(ire->ire_addr))); 21463 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 21464 } 21465 /* 21466 * At this point, we check to see if there are any pending 21467 * unresolved routes. ire_multirt_resolvable() 21468 * checks in O(n) that all IRE_OFFSUBNET ire 21469 * entries for the packet's destination and 21470 * flagged RTF_MULTIRT are currently resolved. 21471 * If some remain unresolved, we make a copy 21472 * of the current message. It will be used 21473 * to initiate additional route resolutions. 21474 */ 21475 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 21476 MBLK_GETLABEL(first_mp), ipst); 21477 ip2dbg(("ip_wput[noirefound]: ire %p, " 21478 "multirt_need_resolve %d, first_mp %p\n", 21479 (void *)ire, multirt_need_resolve, (void *)first_mp)); 21480 if (multirt_need_resolve) { 21481 copy_mp = copymsg(first_mp); 21482 if (copy_mp != NULL) { 21483 MULTIRT_DEBUG_TAG(copy_mp); 21484 } 21485 } 21486 } 21487 21488 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 21489 /* 21490 * Try to resolve another multiroute if 21491 * ire_multirt_resolvable() deemed it necessary. 21492 * At this point, we need to distinguish 21493 * multicasts from other packets. For multicasts, 21494 * we call ip_newroute_ipif() and request that both 21495 * multirouting and setsrc flags are checked. 21496 */ 21497 if (copy_mp != NULL) { 21498 if (CLASSD(dst)) { 21499 ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); 21500 if (ipif) { 21501 ASSERT(infop->ip_opt_ill_index == 0); 21502 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 21503 RTF_SETSRC | RTF_MULTIRT, zoneid, infop); 21504 ipif_refrele(ipif); 21505 } else { 21506 MULTIRT_DEBUG_UNTAG(copy_mp); 21507 freemsg(copy_mp); 21508 copy_mp = NULL; 21509 } 21510 } else { 21511 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 21512 } 21513 } 21514 if (attach_ill != NULL) 21515 ill_refrele(attach_ill); 21516 if (xmit_ill != NULL) 21517 ill_refrele(xmit_ill); 21518 if (need_decref) 21519 CONN_DEC_REF(connp); 21520 return; 21521 21522 icmp_parameter_problem: 21523 /* could not have originated externally */ 21524 ASSERT(mp->b_prev == NULL); 21525 if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { 21526 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 21527 /* it's the IP header length that's in trouble */ 21528 icmp_param_problem(q, first_mp, 0, zoneid, ipst); 21529 first_mp = NULL; 21530 } 21531 21532 discard_pkt: 21533 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 21534 drop_pkt: 21535 ip1dbg(("ip_wput: dropped packet\n")); 21536 if (ire != NULL) 21537 ire_refrele(ire); 21538 if (need_decref) 21539 CONN_DEC_REF(connp); 21540 freemsg(first_mp); 21541 if (attach_ill != NULL) 21542 ill_refrele(attach_ill); 21543 if (xmit_ill != NULL) 21544 ill_refrele(xmit_ill); 21545 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21546 "ip_wput_end: q %p (%S)", q, "droppkt"); 21547 } 21548 21549 /* 21550 * If this is a conn_t queue, then we pass in the conn. This includes the 21551 * zoneid. 21552 * Otherwise, this is a message coming back from ARP or for an ill_t queue, 21553 * in which case we use the global zoneid since those are all part of 21554 * the global zone. 
*/ 21556 void 21557 ip_wput(queue_t *q, mblk_t *mp) 21558 { 21559 if (CONN_Q(q)) 21560 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 21561 else 21562 ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); 21563 } 21564 21565 /* 21566 * 21567 * The following rules must be observed when accessing any ipif or ill 21568 * that has been cached in the conn. Typically conn_nofailover_ill, 21569 * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. 21570 * 21571 * Access: The ipif or ill pointed to from the conn can be accessed under 21572 * the protection of the conn_lock or after it has been refheld under the 21573 * protection of the conn_lock. In addition the IPIF_CAN_LOOKUP or 21574 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 21575 * The reason for this is that a concurrent unplumb could actually be 21576 * cleaning up these cached pointers by walking the conns and might have 21577 * finished cleaning up the conn in question. The macros check that an 21578 * unplumb has not yet started on the ipif or ill. 21579 * 21580 * Caching: An ipif or ill pointer may be cached in the conn only after 21581 * making sure that an unplumb has not started. So the caching is done 21582 * while holding both the conn_lock and the ill_lock and after using the 21583 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 21584 * flag before starting the cleanup of conns. 21585 * 21586 * The list of ipifs hanging off the ill is protected by ill_g_lock and 21587 * ill_lock. On the other hand, to access ipif->ipif_ill we need one of 21588 * ill_g_lock, a reference to the ipif, or a reference to an ire that 21589 * references the ipif. An ipif does not change its ill except for 21590 * failover/failback. Since failover/failback happens only after bringing 21591 * down the ipif, making sure its refcnt has gone to zero, and taking both 21592 * the ill_g_lock and the ill_lock, the above holds.
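 *
 * A typical caller therefore looks like this sketch, mirroring
 * the uses in ip_output_options() above:
 *
 *	ill = conn_get_held_ill(connp, &connp->conn_outgoing_ill,
 *	    &err);
 *	if (err == ILL_LOOKUP_FAILED)
 *		goto drop_pkt;
 *	if (ill != NULL) {
 *		... use the ill to transmit ...
 *		ill_refrele(ill);
 *	}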
21593 */ 21594 ipif_t * 21595 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 21596 { 21597 ipif_t *ipif; 21598 ill_t *ill; 21599 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 21600 21601 *err = 0; 21602 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21603 mutex_enter(&connp->conn_lock); 21604 ipif = *ipifp; 21605 if (ipif != NULL) { 21606 ill = ipif->ipif_ill; 21607 mutex_enter(&ill->ill_lock); 21608 if (IPIF_CAN_LOOKUP(ipif)) { 21609 ipif_refhold_locked(ipif); 21610 mutex_exit(&ill->ill_lock); 21611 mutex_exit(&connp->conn_lock); 21612 rw_exit(&ipst->ips_ill_g_lock); 21613 return (ipif); 21614 } else { 21615 *err = IPIF_LOOKUP_FAILED; 21616 } 21617 mutex_exit(&ill->ill_lock); 21618 } 21619 mutex_exit(&connp->conn_lock); 21620 rw_exit(&ipst->ips_ill_g_lock); 21621 return (NULL); 21622 } 21623 21624 ill_t * 21625 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 21626 { 21627 ill_t *ill; 21628 21629 *err = 0; 21630 mutex_enter(&connp->conn_lock); 21631 ill = *illp; 21632 if (ill != NULL) { 21633 mutex_enter(&ill->ill_lock); 21634 if (ILL_CAN_LOOKUP(ill)) { 21635 ill_refhold_locked(ill); 21636 mutex_exit(&ill->ill_lock); 21637 mutex_exit(&connp->conn_lock); 21638 return (ill); 21639 } else { 21640 *err = ILL_LOOKUP_FAILED; 21641 } 21642 mutex_exit(&ill->ill_lock); 21643 } 21644 mutex_exit(&connp->conn_lock); 21645 return (NULL); 21646 } 21647 21648 static int 21649 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 21650 { 21651 ill_t *ill; 21652 21653 ill = ipif->ipif_ill; 21654 mutex_enter(&connp->conn_lock); 21655 mutex_enter(&ill->ill_lock); 21656 if (IPIF_CAN_LOOKUP(ipif)) { 21657 *ipifp = ipif; 21658 mutex_exit(&ill->ill_lock); 21659 mutex_exit(&connp->conn_lock); 21660 return (0); 21661 } 21662 mutex_exit(&ill->ill_lock); 21663 mutex_exit(&connp->conn_lock); 21664 return (IPIF_LOOKUP_FAILED); 21665 } 21666 21667 /* 21668 * This is called if the outbound datagram needs fragmentation. 21669 * 21670 * NOTE : This function does not ire_refrele the ire argument passed in. 21671 */ 21672 static void 21673 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, 21674 ip_stack_t *ipst) 21675 { 21676 ipha_t *ipha; 21677 mblk_t *mp; 21678 uint32_t v_hlen_tos_len; 21679 uint32_t max_frag; 21680 uint32_t frag_flag; 21681 boolean_t dont_use; 21682 21683 if (ipsec_mp->b_datap->db_type == M_CTL) { 21684 mp = ipsec_mp->b_cont; 21685 } else { 21686 mp = ipsec_mp; 21687 } 21688 21689 ipha = (ipha_t *)mp->b_rptr; 21690 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 21691 21692 #ifdef _BIG_ENDIAN 21693 #define V_HLEN (v_hlen_tos_len >> 24) 21694 #define LENGTH (v_hlen_tos_len & 0xFFFF) 21695 #else 21696 #define V_HLEN (v_hlen_tos_len & 0xFF) 21697 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 21698 #endif 21699 21700 #ifndef SPEED_BEFORE_SAFETY 21701 /* 21702 * Check that ipha_length is consistent with 21703 * the mblk length 21704 */ 21705 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 21706 ip0dbg(("Packet length mismatch: %d, %ld\n", 21707 LENGTH, msgdsize(mp))); 21708 freemsg(ipsec_mp); 21709 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21710 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 21711 "packet length mismatch"); 21712 return; 21713 } 21714 #endif 21715 /* 21716 * Don't use frag_flag if pre-built packet or source 21717 * routed or if multicast (since multicast packets do not solicit 21718 * ICMP "packet too big" messages). 
Get the values of 21719 * max_frag and frag_flag atomically by acquiring the 21720 * ire_lock. 21721 */ 21722 mutex_enter(&ire->ire_lock); 21723 max_frag = ire->ire_max_frag; 21724 frag_flag = ire->ire_frag_flag; 21725 mutex_exit(&ire->ire_lock); 21726 21727 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 21728 (V_HLEN != IP_SIMPLE_HDR_VERSION && 21729 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 21730 21731 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 21732 (dont_use ? 0 : frag_flag), zoneid, ipst); 21733 } 21734 21735 /* 21736 * Used for deciding the MSS size for the upper layer. Thus 21737 * we need to check the outbound policy values in the conn. 21738 */ 21739 int 21740 conn_ipsec_length(conn_t *connp) 21741 { 21742 ipsec_latch_t *ipl; 21743 21744 ipl = connp->conn_latch; 21745 if (ipl == NULL) 21746 return (0); 21747 21748 if (ipl->ipl_out_policy == NULL) 21749 return (0); 21750 21751 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 21752 } 21753 21754 /* 21755 * Returns an estimate of the IPsec headers size. This is used if 21756 * we don't want to call into IPsec to get the exact size. 21757 */ 21758 int 21759 ipsec_out_extra_length(mblk_t *ipsec_mp) 21760 { 21761 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 21762 ipsec_action_t *a; 21763 21764 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21765 if (!io->ipsec_out_secure) 21766 return (0); 21767 21768 a = io->ipsec_out_act; 21769 21770 if (a == NULL) { 21771 ASSERT(io->ipsec_out_policy != NULL); 21772 a = io->ipsec_out_policy->ipsp_act; 21773 } 21774 ASSERT(a != NULL); 21775 21776 return (a->ipa_ovhd); 21777 } 21778 21779 /* 21780 * Returns an estimate of the IPsec headers size. This is used if 21781 * we don't want to call into IPsec to get the exact size. 21782 */ 21783 int 21784 ipsec_in_extra_length(mblk_t *ipsec_mp) 21785 { 21786 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21787 ipsec_action_t *a; 21788 21789 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21790 21791 a = ii->ipsec_in_action; 21792 return (a == NULL ? 0 : a->ipa_ovhd); 21793 } 21794 21795 /* 21796 * If there are any source route options, return the true final 21797 * destination. Otherwise, return the destination. 21798 */ 21799 ipaddr_t 21800 ip_get_dst(ipha_t *ipha) 21801 { 21802 ipoptp_t opts; 21803 uchar_t *opt; 21804 uint8_t optval; 21805 uint8_t optlen; 21806 ipaddr_t dst; 21807 uint32_t off; 21808 21809 dst = ipha->ipha_dst; 21810 21811 if (IS_SIMPLE_IPH(ipha)) 21812 return (dst); 21813 21814 for (optval = ipoptp_first(&opts, ipha); 21815 optval != IPOPT_EOL; 21816 optval = ipoptp_next(&opts)) { 21817 opt = opts.ipoptp_cur; 21818 optlen = opts.ipoptp_len; 21819 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 21820 switch (optval) { 21821 case IPOPT_SSRR: 21822 case IPOPT_LSRR: 21823 off = opt[IPOPT_OFFSET]; 21824 /* 21825 * If one of the conditions is true, it means 21826 * end of options and dst already has the right 21827 * value. 
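 *
 * Worked example: an LSRR option with optlen 11 holds two
 * four-byte addresses after its three-byte type/len/pointer
 * header; the true final destination is the last of them, at
 * offset optlen - IP_ADDR_LEN = 7, which is what the bcopy()
 * below extracts.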
21828 */ 21829 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 21830 off = optlen - IP_ADDR_LEN; 21831 bcopy(&opt[off], &dst, IP_ADDR_LEN); 21832 } 21833 return (dst); 21834 default: 21835 break; 21836 } 21837 } 21838 21839 return (dst); 21840 } 21841 21842 mblk_t * 21843 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 21844 conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) 21845 { 21846 ipsec_out_t *io; 21847 mblk_t *first_mp; 21848 boolean_t policy_present; 21849 ip_stack_t *ipst; 21850 ipsec_stack_t *ipss; 21851 21852 ASSERT(ire != NULL); 21853 ipst = ire->ire_ipst; 21854 ipss = ipst->ips_netstack->netstack_ipsec; 21855 21856 first_mp = mp; 21857 if (mp->b_datap->db_type == M_CTL) { 21858 io = (ipsec_out_t *)first_mp->b_rptr; 21859 /* 21860 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 21861 * 21862 * 1) There is per-socket policy (including cached global 21863 * policy) or a policy on the IP-in-IP tunnel. 21864 * 2) There is no per-socket policy, but it is 21865 * a multicast packet that needs to go out 21866 * on a specific interface. This is the case 21867 * where (ip_wput and ip_wput_multicast) attaches 21868 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 21869 * 21870 * In case (2) we check with global policy to 21871 * see if there is a match and set the ill_index 21872 * appropriately so that we can lookup the ire 21873 * properly in ip_wput_ipsec_out. 21874 */ 21875 21876 /* 21877 * ipsec_out_use_global_policy is set to B_FALSE 21878 * in ipsec_in_to_out(). Refer to that function for 21879 * details. 21880 */ 21881 if ((io->ipsec_out_latch == NULL) && 21882 (io->ipsec_out_use_global_policy)) { 21883 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 21884 ire, connp, unspec_src, zoneid)); 21885 } 21886 if (!io->ipsec_out_secure) { 21887 /* 21888 * If this is not a secure packet, drop 21889 * the IPSEC_OUT mp and treat it as a clear 21890 * packet. This happens when we are sending 21891 * a ICMP reply back to a clear packet. See 21892 * ipsec_in_to_out() for details. 21893 */ 21894 mp = first_mp->b_cont; 21895 freeb(first_mp); 21896 } 21897 return (mp); 21898 } 21899 /* 21900 * See whether we need to attach a global policy here. We 21901 * don't depend on the conn (as it could be null) for deciding 21902 * what policy this datagram should go through because it 21903 * should have happened in ip_wput if there was some 21904 * policy. This normally happens for connections which are not 21905 * fully bound preventing us from caching policies in 21906 * ip_bind. Packets coming from the TCP listener/global queue 21907 * - which are non-hard_bound - could also be affected by 21908 * applying policy here. 21909 * 21910 * If this packet is coming from tcp global queue or listener, 21911 * we will be applying policy here. This may not be *right* 21912 * if these packets are coming from the detached connection as 21913 * it could have gone in clear before. This happens only if a 21914 * TCP connection started when there is no policy and somebody 21915 * added policy before it became detached. Thus packets of the 21916 * detached connection could go out secure and the other end 21917 * would drop it because it will be expecting in clear. The 21918 * converse is not true i.e if somebody starts a TCP 21919 * connection and deletes the policy, all the packets will 21920 * still go out with the policy that existed before deleting 21921 * because ip_unbind sends up policy information which is used 21922 * by TCP on subsequent ip_wputs. 
The right solution is to fix 21923 * TCP to attach a dummy IPSEC_OUT and set 21924 * ipsec_out_use_global_policy to B_FALSE. As this might 21925 * affect performance for normal cases, we are not doing it. 21926 * Thus, set policy before starting any TCP connections. 21927 * 21928 * NOTE - We might apply policy even for a hard bound connection 21929 * - for which we cached policy in ip_bind - if somebody added 21930 * global policy after we inherited the policy in ip_bind. 21931 * This means that the packets that were going out in clear 21932 * previously would start going secure and hence get dropped 21933 * on the other side. To fix this, TCP attaches a dummy 21934 * ipsec_out and make sure that we don't apply global policy. 21935 */ 21936 if (ipha != NULL) 21937 policy_present = ipss->ipsec_outbound_v4_policy_present; 21938 else 21939 policy_present = ipss->ipsec_outbound_v6_policy_present; 21940 if (!policy_present) 21941 return (mp); 21942 21943 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, 21944 zoneid)); 21945 } 21946 21947 ire_t * 21948 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 21949 { 21950 ipaddr_t addr; 21951 ire_t *save_ire; 21952 irb_t *irb; 21953 ill_group_t *illgrp; 21954 int err; 21955 21956 save_ire = ire; 21957 addr = ire->ire_addr; 21958 21959 ASSERT(ire->ire_type == IRE_BROADCAST); 21960 21961 illgrp = connp->conn_outgoing_ill->ill_group; 21962 if (illgrp == NULL) { 21963 *conn_outgoing_ill = conn_get_held_ill(connp, 21964 &connp->conn_outgoing_ill, &err); 21965 if (err == ILL_LOOKUP_FAILED) { 21966 ire_refrele(save_ire); 21967 return (NULL); 21968 } 21969 return (save_ire); 21970 } 21971 /* 21972 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 21973 * If it is part of the group, we need to send on the ire 21974 * that has been cleared of IRE_MARK_NORECV and that belongs 21975 * to this group. This is okay as IP_BOUND_IF really means 21976 * any ill in the group. We depend on the fact that the 21977 * first ire in the group is always cleared of IRE_MARK_NORECV 21978 * if such an ire exists. This is possible only if you have 21979 * at least one ill in the group that has not failed. 21980 * 21981 * First get to the ire that matches the address and group. 21982 * 21983 * We don't look for an ire with a matching zoneid because a given zone 21984 * won't always have broadcast ires on all ills in the group. 21985 */ 21986 irb = ire->ire_bucket; 21987 rw_enter(&irb->irb_lock, RW_READER); 21988 if (ire->ire_marks & IRE_MARK_NORECV) { 21989 /* 21990 * If the current zone only has an ire broadcast for this 21991 * address marked NORECV, the ire we want is ahead in the 21992 * bucket, so we look it up deliberately ignoring the zoneid. 21993 */ 21994 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 21995 if (ire->ire_addr != addr) 21996 continue; 21997 /* skip over deleted ires */ 21998 if (ire->ire_marks & IRE_MARK_CONDEMNED) 21999 continue; 22000 } 22001 } 22002 while (ire != NULL) { 22003 /* 22004 * If a new interface is coming up, we could end up 22005 * seeing the loopback ire and the non-loopback ire 22006 * may not have been added yet. 
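 *
 * (The discriminator used below is ire_stq; as the nullstq path in
 * ip_wput_ire() notes, the invariant is
 *
 *	ire->ire_stq == NULL    loopback copy, delivered by ip_wput_local()
 *	ire->ire_stq != NULL    wire copy, sent down the ill's send queue
 *
 * so skipping entries without a send queue finds the outbound ire.)
 *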
So check for ire_stq 22007 */ 22008 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 22009 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 22010 break; 22011 } 22012 ire = ire->ire_next; 22013 } 22014 if (ire != NULL && ire->ire_addr == addr && 22015 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 22016 IRE_REFHOLD(ire); 22017 rw_exit(&irb->irb_lock); 22018 ire_refrele(save_ire); 22019 *conn_outgoing_ill = ire_to_ill(ire); 22020 /* 22021 * Refhold the ill to make the conn_outgoing_ill 22022 * independent of the ire. ip_wput_ire goes in a loop 22023 * and may refrele the ire. Since we have an ire at this 22024 * point we don't need to use ILL_CAN_LOOKUP on the ill. 22025 */ 22026 ill_refhold(*conn_outgoing_ill); 22027 return (ire); 22028 } 22029 rw_exit(&irb->irb_lock); 22030 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 22031 /* 22032 * If we can't find a suitable ire, return the original ire. 22033 */ 22034 return (save_ire); 22035 } 22036 22037 /* 22038 * This function does the ire_refrele of the ire passed in as the 22039 * argument. As this function looks up more ires i.e broadcast ires, 22040 * it needs to REFRELE them. Currently, for simplicity we don't 22041 * differentiate the one passed in and looked up here. We always 22042 * REFRELE. 22043 * IPQoS Notes: 22044 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 22045 * IPsec packets are done in ipsec_out_process. 22046 * 22047 */ 22048 void 22049 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, 22050 zoneid_t zoneid) 22051 { 22052 ipha_t *ipha; 22053 #define rptr ((uchar_t *)ipha) 22054 queue_t *stq; 22055 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 22056 uint32_t v_hlen_tos_len; 22057 uint32_t ttl_protocol; 22058 ipaddr_t src; 22059 ipaddr_t dst; 22060 uint32_t cksum; 22061 ipaddr_t orig_src; 22062 ire_t *ire1; 22063 mblk_t *next_mp; 22064 uint_t hlen; 22065 uint16_t *up; 22066 uint32_t max_frag = ire->ire_max_frag; 22067 ill_t *ill = ire_to_ill(ire); 22068 int clusterwide; 22069 uint16_t ip_hdr_included; /* IP header included by ULP? */ 22070 int ipsec_len; 22071 mblk_t *first_mp; 22072 ipsec_out_t *io; 22073 boolean_t conn_dontroute; /* conn value for multicast */ 22074 boolean_t conn_multicast_loop; /* conn value for multicast */ 22075 boolean_t multicast_forward; /* Should we forward ? */ 22076 boolean_t unspec_src; 22077 ill_t *conn_outgoing_ill = NULL; 22078 ill_t *ire_ill; 22079 ill_t *ire1_ill; 22080 ill_t *out_ill; 22081 uint32_t ill_index = 0; 22082 boolean_t multirt_send = B_FALSE; 22083 int err; 22084 ipxmit_state_t pktxmit_state; 22085 ip_stack_t *ipst = ire->ire_ipst; 22086 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 22087 22088 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 22089 "ip_wput_ire_start: q %p", q); 22090 22091 multicast_forward = B_FALSE; 22092 unspec_src = (connp != NULL && connp->conn_unspec_src); 22093 22094 if (ire->ire_flags & RTF_MULTIRT) { 22095 /* 22096 * Multirouting case. The bucket where ire is stored 22097 * probably holds other RTF_MULTIRT flagged ire 22098 * to the destination. In this call to ip_wput_ire, 22099 * we attempt to send the packet through all 22100 * those ires. Thus, we first ensure that ire is the 22101 * first RTF_MULTIRT ire in the bucket, 22102 * before walking the ire list. 22103 */ 22104 ire_t *first_ire; 22105 irb_t *irb = ire->ire_bucket; 22106 ASSERT(irb != NULL); 22107 22108 /* Make sure we do not omit any multiroute ire. 
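 *
 * (Sketch of the rewind done below: the bucket may hold several
 * RTF_MULTIRT ires for this destination, and emission must start at
 * the first live one, i.e. the first entry for which
 *
 *	(flags & RTF_MULTIRT) && addr == ire->ire_addr &&
 *	!(marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
 *
 * holds, so that the send loop later visits every multiroute copy.)
 *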
*/ 22109 IRB_REFHOLD(irb); 22110 for (first_ire = irb->irb_ire; 22111 first_ire != NULL; 22112 first_ire = first_ire->ire_next) { 22113 if ((first_ire->ire_flags & RTF_MULTIRT) && 22114 (first_ire->ire_addr == ire->ire_addr) && 22115 !(first_ire->ire_marks & 22116 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 22117 break; 22118 } 22119 } 22120 22121 if ((first_ire != NULL) && (first_ire != ire)) { 22122 IRE_REFHOLD(first_ire); 22123 ire_refrele(ire); 22124 ire = first_ire; 22125 ill = ire_to_ill(ire); 22126 } 22127 IRB_REFRELE(irb); 22128 } 22129 22130 /* 22131 * conn_outgoing_ill variable is used only in the broadcast loop. 22132 * for performance we don't grab the mutexs in the fastpath 22133 */ 22134 if ((connp != NULL) && 22135 (ire->ire_type == IRE_BROADCAST) && 22136 ((connp->conn_nofailover_ill != NULL) || 22137 (connp->conn_outgoing_ill != NULL))) { 22138 /* 22139 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 22140 * option. So, see if this endpoint is bound to a 22141 * IPIF_NOFAILOVER address. If so, honor it. This implies 22142 * that if the interface is failed, we will still send 22143 * the packet on the same ill which is what we want. 22144 */ 22145 conn_outgoing_ill = conn_get_held_ill(connp, 22146 &connp->conn_nofailover_ill, &err); 22147 if (err == ILL_LOOKUP_FAILED) { 22148 ire_refrele(ire); 22149 freemsg(mp); 22150 return; 22151 } 22152 if (conn_outgoing_ill == NULL) { 22153 /* 22154 * Choose a good ill in the group to send the 22155 * packets on. 22156 */ 22157 ire = conn_set_outgoing_ill(connp, ire, 22158 &conn_outgoing_ill); 22159 if (ire == NULL) { 22160 freemsg(mp); 22161 return; 22162 } 22163 } 22164 } 22165 22166 if (mp->b_datap->db_type != M_CTL) { 22167 ipha = (ipha_t *)mp->b_rptr; 22168 } else { 22169 io = (ipsec_out_t *)mp->b_rptr; 22170 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22171 ASSERT(zoneid == io->ipsec_out_zoneid); 22172 ASSERT(zoneid != ALL_ZONES); 22173 ipha = (ipha_t *)mp->b_cont->b_rptr; 22174 dst = ipha->ipha_dst; 22175 /* 22176 * For the multicast case, ipsec_out carries conn_dontroute and 22177 * conn_multicast_loop as conn may not be available here. We 22178 * need this for multicast loopback and forwarding which is done 22179 * later in the code. 22180 */ 22181 if (CLASSD(dst)) { 22182 conn_dontroute = io->ipsec_out_dontroute; 22183 conn_multicast_loop = io->ipsec_out_multicast_loop; 22184 /* 22185 * If conn_dontroute is not set or conn_multicast_loop 22186 * is set, we need to do forwarding/loopback. For 22187 * datagrams from ip_wput_multicast, conn_dontroute is 22188 * set to B_TRUE and conn_multicast_loop is set to 22189 * B_FALSE so that we neither do forwarding nor 22190 * loopback. 22191 */ 22192 if (!conn_dontroute || conn_multicast_loop) 22193 multicast_forward = B_TRUE; 22194 } 22195 } 22196 22197 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 22198 ire->ire_zoneid != ALL_ZONES) { 22199 /* 22200 * When a zone sends a packet to another zone, we try to deliver 22201 * the packet under the same conditions as if the destination 22202 * was a real node on the network. To do so, we look for a 22203 * matching route in the forwarding table. 22204 * RTF_REJECT and RTF_BLACKHOLE are handled just like 22205 * ip_newroute() does. 22206 * Note that IRE_LOCAL are special, since they are used 22207 * when the zoneid doesn't match in some cases. This means that 22208 * we need to handle ipha_src differently since ire_src_addr 22209 * belongs to the receiving zone instead of the sending zone. 
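 *
 * (Hypothetical example of the rule: when zone A sends to an address
 * configured in zone B on the same system, delivery proceeds only if
 * A's own forwarding table would have routed the packet, i.e. the
 *
 *	src_ire = ire_ftable_lookup(ipha->ipha_dst, ...);
 *
 * below finds a route that is neither RTF_REJECT nor RTF_BLACKHOLE;
 * the source address is then filled in from that route, exactly as
 * if B were an external node.)
 *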
22210 * When ip_restrict_interzone_loopback is set, then 22211 * ire_cache_lookup() ensures that IRE_LOCAL are only used 22212 * for loopback between zones when the logical "Ethernet" would 22213 * have looped them back. 22214 */ 22215 ire_t *src_ire; 22216 22217 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 22218 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 22219 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); 22220 if (src_ire != NULL && 22221 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && 22222 (!ipst->ips_ip_restrict_interzone_loopback || 22223 ire_local_same_ill_group(ire, src_ire))) { 22224 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 22225 ipha->ipha_src = src_ire->ire_src_addr; 22226 ire_refrele(src_ire); 22227 } else { 22228 ire_refrele(ire); 22229 if (conn_outgoing_ill != NULL) 22230 ill_refrele(conn_outgoing_ill); 22231 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 22232 if (src_ire != NULL) { 22233 if (src_ire->ire_flags & RTF_BLACKHOLE) { 22234 ire_refrele(src_ire); 22235 freemsg(mp); 22236 return; 22237 } 22238 ire_refrele(src_ire); 22239 } 22240 if (ip_hdr_complete(ipha, zoneid, ipst)) { 22241 /* Failed */ 22242 freemsg(mp); 22243 return; 22244 } 22245 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, 22246 ipst); 22247 return; 22248 } 22249 } 22250 22251 if (mp->b_datap->db_type == M_CTL || 22252 ipss->ipsec_outbound_v4_policy_present) { 22253 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 22254 unspec_src, zoneid); 22255 if (mp == NULL) { 22256 ire_refrele(ire); 22257 if (conn_outgoing_ill != NULL) 22258 ill_refrele(conn_outgoing_ill); 22259 return; 22260 } 22261 /* 22262 * Trusted Extensions supports all-zones interfaces, so 22263 * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to 22264 * the global zone. 22265 */ 22266 if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { 22267 io = (ipsec_out_t *)mp->b_rptr; 22268 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22269 zoneid = io->ipsec_out_zoneid; 22270 } 22271 } 22272 22273 first_mp = mp; 22274 ipsec_len = 0; 22275 22276 if (first_mp->b_datap->db_type == M_CTL) { 22277 io = (ipsec_out_t *)first_mp->b_rptr; 22278 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22279 mp = first_mp->b_cont; 22280 ipsec_len = ipsec_out_extra_length(first_mp); 22281 ASSERT(ipsec_len >= 0); 22282 /* We already picked up the zoneid from the M_CTL above */ 22283 ASSERT(zoneid == io->ipsec_out_zoneid); 22284 ASSERT(zoneid != ALL_ZONES); 22285 22286 /* 22287 * Drop M_CTL here if IPsec processing is not needed. 22288 * (Non-IPsec use of M_CTL extracted any information it 22289 * needed above). 22290 */ 22291 if (ipsec_len == 0) { 22292 freeb(first_mp); 22293 first_mp = mp; 22294 } 22295 } 22296 22297 /* 22298 * Fast path for ip_wput_ire 22299 */ 22300 22301 ipha = (ipha_t *)mp->b_rptr; 22302 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22303 dst = ipha->ipha_dst; 22304 22305 /* 22306 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 22307 * if the socket is a SOCK_RAW type. The transport checksum should 22308 * be provided in the pre-built packet, so we don't need to compute it. 22309 * Also, other application set flags, like DF, should not be altered. 22310 * Other transport MUST pass down zero. 
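 *
 * (Summary of the sentinel convention checked below:
 *
 *	ipha_ident == 0                normal transport; IP owns the
 *	                               header fields
 *	ipha_ident == IP_HDR_INCLUDED  raw socket pre-built the header;
 *	                               leave DF and checksums alone
 *
 * the incoming value is saved in ip_hdr_included and written back on
 * the requeue path at the blocked: label, since a real ident is only
 * assigned once transmission is actually attempted.)
 *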
22311 */ 22312 ip_hdr_included = ipha->ipha_ident; 22313 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 22314 22315 if (CLASSD(dst)) { 22316 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 22317 ntohl(dst), 22318 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 22319 ntohl(ire->ire_addr))); 22320 } 22321 22322 /* Macros to extract header fields from data already in registers */ 22323 #ifdef _BIG_ENDIAN 22324 #define V_HLEN (v_hlen_tos_len >> 24) 22325 #define LENGTH (v_hlen_tos_len & 0xFFFF) 22326 #define PROTO (ttl_protocol & 0xFF) 22327 #else 22328 #define V_HLEN (v_hlen_tos_len & 0xFF) 22329 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 22330 #define PROTO (ttl_protocol >> 8) 22331 #endif 22332 22333 22334 orig_src = src = ipha->ipha_src; 22335 /* (The loop back to "another" is explained down below.) */ 22336 another:; 22337 /* 22338 * Assign an ident value for this packet. We assign idents on 22339 * a per destination basis out of the IRE. There could be 22340 * other threads targeting the same destination, so we have to 22341 * arrange for a atomic increment. Note that we use a 32-bit 22342 * atomic add because it has better performance than its 22343 * 16-bit sibling. 22344 * 22345 * If running in cluster mode and if the source address 22346 * belongs to a replicated service then vector through 22347 * cl_inet_ipident vector to allocate ip identifier 22348 * NOTE: This is a contract private interface with the 22349 * clustering group. 22350 */ 22351 clusterwide = 0; 22352 if (cl_inet_ipident) { 22353 ASSERT(cl_inet_isclusterwide); 22354 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 22355 AF_INET, (uint8_t *)(uintptr_t)src)) { 22356 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 22357 AF_INET, (uint8_t *)(uintptr_t)src, 22358 (uint8_t *)(uintptr_t)dst); 22359 clusterwide = 1; 22360 } 22361 } 22362 if (!clusterwide) { 22363 ipha->ipha_ident = 22364 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 22365 } 22366 22367 #ifndef _BIG_ENDIAN 22368 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 22369 #endif 22370 22371 /* 22372 * Set source address unless sent on an ill or conn_unspec_src is set. 22373 * This is needed to obey conn_unspec_src when packets go through 22374 * ip_newroute + arp. 22375 * Assumes ip_newroute{,_multi} sets the source address as well. 22376 */ 22377 if (src == INADDR_ANY && !unspec_src) { 22378 /* 22379 * Assign the appropriate source address from the IRE if none 22380 * was specified. 22381 */ 22382 ASSERT(ire->ire_ipversion == IPV4_VERSION); 22383 22384 /* 22385 * With IP multipathing, broadcast packets are sent on the ire 22386 * that has been cleared of IRE_MARK_NORECV and that belongs to 22387 * the group. However, this ire might not be in the same zone so 22388 * we can't always use its source address. We look for a 22389 * broadcast ire in the same group and in the right zone. 
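 *
 * (Looking back at the ident assignment above: the #ifndef _BIG_ENDIAN
 * byte swap is an open-coded htons() on the 16-bit field, e.g.
 *
 *	uint16_t id = 0x1234;
 *	id = (id << 8) | (id >> 8);    now 0x3412, network byte order
 *
 * needed because ire_ident is kept as a host-order counter while
 * ipha_ident must go out big-endian.)
 *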
22390 */ 22391 if (ire->ire_type == IRE_BROADCAST && 22392 ire->ire_zoneid != zoneid) { 22393 ire_t *src_ire = ire_ctable_lookup(dst, 0, 22394 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 22395 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 22396 if (src_ire != NULL) { 22397 src = src_ire->ire_src_addr; 22398 ire_refrele(src_ire); 22399 } else { 22400 ire_refrele(ire); 22401 if (conn_outgoing_ill != NULL) 22402 ill_refrele(conn_outgoing_ill); 22403 freemsg(first_mp); 22404 if (ill != NULL) { 22405 BUMP_MIB(ill->ill_ip_mib, 22406 ipIfStatsOutDiscards); 22407 } else { 22408 BUMP_MIB(&ipst->ips_ip_mib, 22409 ipIfStatsOutDiscards); 22410 } 22411 return; 22412 } 22413 } else { 22414 src = ire->ire_src_addr; 22415 } 22416 22417 if (connp == NULL) { 22418 ip1dbg(("ip_wput_ire: no connp and no src " 22419 "address for dst 0x%x, using src 0x%x\n", 22420 ntohl(dst), 22421 ntohl(src))); 22422 } 22423 ipha->ipha_src = src; 22424 } 22425 stq = ire->ire_stq; 22426 22427 /* 22428 * We only allow ire chains for broadcasts since there will 22429 * be multiple IRE_CACHE entries for the same multicast 22430 * address (one per ipif). 22431 */ 22432 next_mp = NULL; 22433 22434 /* broadcast packet */ 22435 if (ire->ire_type == IRE_BROADCAST) 22436 goto broadcast; 22437 22438 /* loopback ? */ 22439 if (stq == NULL) 22440 goto nullstq; 22441 22442 /* The ill_index for outbound ILL */ 22443 ill_index = Q_TO_INDEX(stq); 22444 22445 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 22446 ttl_protocol = ((uint16_t *)ipha)[4]; 22447 22448 /* pseudo checksum (do it in parts for IP header checksum) */ 22449 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 22450 22451 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 22452 queue_t *dev_q = stq->q_next; 22453 22454 /* flow controlled */ 22455 if ((dev_q->q_next || dev_q->q_first) && 22456 !canput(dev_q)) 22457 goto blocked; 22458 if ((PROTO == IPPROTO_UDP) && 22459 (ip_hdr_included != IP_HDR_INCLUDED)) { 22460 hlen = (V_HLEN & 0xF) << 2; 22461 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22462 if (*up != 0) { 22463 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 22464 hlen, LENGTH, max_frag, ipsec_len, cksum); 22465 /* Software checksum? */ 22466 if (DB_CKSUMFLAGS(mp) == 0) { 22467 IP_STAT(ipst, ip_out_sw_cksum); 22468 IP_STAT_UPDATE(ipst, 22469 ip_udp_out_sw_cksum_bytes, 22470 LENGTH - hlen); 22471 } 22472 } 22473 } 22474 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 22475 hlen = (V_HLEN & 0xF) << 2; 22476 if (PROTO == IPPROTO_TCP) { 22477 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22478 /* 22479 * The packet header is processed once and for all, even 22480 * in the multirouting case. We disable hardware 22481 * checksum if the packet is multirouted, as it will be 22482 * replicated via several interfaces, and not all of 22483 * them may have this capability. 22484 */ 22485 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 22486 LENGTH, max_frag, ipsec_len, cksum); 22487 /* Software checksum? */ 22488 if (DB_CKSUMFLAGS(mp) == 0) { 22489 IP_STAT(ipst, ip_out_sw_cksum); 22490 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22491 LENGTH - hlen); 22492 } 22493 } else { 22494 sctp_hdr_t *sctph; 22495 22496 ASSERT(PROTO == IPPROTO_SCTP); 22497 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22498 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22499 /* 22500 * Zero out the checksum field to ensure proper 22501 * checksum calculation. 
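 *
 * (This is the standard pattern for an in-place checksum, sketched as
 *
 *	sctph->sh_chksum = 0;                     field must not count
 *	sctph->sh_chksum = sctp_cksum(mp, hlen);  sum the rest
 *
 * a receiver verifies by zeroing the field again, recomputing over
 * the same bytes, and comparing against the stored value.)
 *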
22502 */ 22503 sctph->sh_chksum = 0; 22504 #ifdef DEBUG 22505 if (!skip_sctp_cksum) 22506 #endif 22507 sctph->sh_chksum = sctp_cksum(mp, hlen); 22508 } 22509 } 22510 22511 /* 22512 * If this is a multicast packet and originated from ip_wput 22513 * we need to do loopback and forwarding checks. If it comes 22514 * from ip_wput_multicast, we SHOULD not do this. 22515 */ 22516 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 22517 22518 /* checksum */ 22519 cksum += ttl_protocol; 22520 22521 /* fragment the packet */ 22522 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 22523 goto fragmentit; 22524 /* 22525 * Don't use frag_flag if packet is pre-built or source 22526 * routed or if multicast (since multicast packets do 22527 * not solicit ICMP "packet too big" messages). 22528 */ 22529 if ((ip_hdr_included != IP_HDR_INCLUDED) && 22530 (V_HLEN == IP_SIMPLE_HDR_VERSION || 22531 !ip_source_route_included(ipha)) && 22532 !CLASSD(ipha->ipha_dst)) 22533 ipha->ipha_fragment_offset_and_flags |= 22534 htons(ire->ire_frag_flag); 22535 22536 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 22537 /* calculate IP header checksum */ 22538 cksum += ipha->ipha_ident; 22539 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 22540 cksum += ipha->ipha_fragment_offset_and_flags; 22541 22542 /* IP options present */ 22543 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 22544 if (hlen) 22545 goto checksumoptions; 22546 22547 /* calculate hdr checksum */ 22548 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 22549 cksum = ~(cksum + (cksum >> 16)); 22550 ipha->ipha_hdr_checksum = (uint16_t)cksum; 22551 } 22552 if (ipsec_len != 0) { 22553 /* 22554 * We will do the rest of the processing after 22555 * we come back from IPsec in ip_wput_ipsec_out(). 22556 */ 22557 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 22558 22559 io = (ipsec_out_t *)first_mp->b_rptr; 22560 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 22561 ill_phyint->phyint_ifindex; 22562 22563 ipsec_out_process(q, first_mp, ire, ill_index); 22564 ire_refrele(ire); 22565 if (conn_outgoing_ill != NULL) 22566 ill_refrele(conn_outgoing_ill); 22567 return; 22568 } 22569 22570 /* 22571 * In most cases, the emission loop below is entered only 22572 * once. Only in the case where the ire holds the 22573 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 22574 * flagged ires in the bucket, and send the packet 22575 * through all crossed RTF_MULTIRT routes. 22576 */ 22577 if (ire->ire_flags & RTF_MULTIRT) { 22578 multirt_send = B_TRUE; 22579 } 22580 do { 22581 if (multirt_send) { 22582 irb_t *irb; 22583 /* 22584 * We are in a multiple send case, need to get 22585 * the next ire and make a duplicate of the packet. 22586 * ire1 holds here the next ire to process in the 22587 * bucket. If multirouting is expected, 22588 * any non-RTF_MULTIRT ire that has the 22589 * right destination address is ignored. 
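 *
 * (The duplication below follows the usual STREAMS pattern: the
 * header block is physically copied, because each route rewrites it,
 * while the payload is only duplicated by reference:
 *
 *	next_mp = copyb(mp);                      private header copy
 *	if (mp->b_cont != NULL)
 *		next_mp->b_cont = dupmsg(mp->b_cont);   shared payload
 *
 * with both allocations checked before the extra ire is kept.)
 *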
22590 */ 22591 irb = ire->ire_bucket; 22592 ASSERT(irb != NULL); 22593 22594 IRB_REFHOLD(irb); 22595 for (ire1 = ire->ire_next; 22596 ire1 != NULL; 22597 ire1 = ire1->ire_next) { 22598 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22599 continue; 22600 if (ire1->ire_addr != ire->ire_addr) 22601 continue; 22602 if (ire1->ire_marks & 22603 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22604 continue; 22605 22606 /* Got one */ 22607 IRE_REFHOLD(ire1); 22608 break; 22609 } 22610 IRB_REFRELE(irb); 22611 22612 if (ire1 != NULL) { 22613 next_mp = copyb(mp); 22614 if ((next_mp == NULL) || 22615 ((mp->b_cont != NULL) && 22616 ((next_mp->b_cont = 22617 dupmsg(mp->b_cont)) == NULL))) { 22618 freemsg(next_mp); 22619 next_mp = NULL; 22620 ire_refrele(ire1); 22621 ire1 = NULL; 22622 } 22623 } 22624 22625 /* Last multiroute ire; don't loop anymore. */ 22626 if (ire1 == NULL) { 22627 multirt_send = B_FALSE; 22628 } 22629 } 22630 22631 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 22632 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, 22633 mblk_t *, mp); 22634 FW_HOOKS(ipst->ips_ip4_physical_out_event, 22635 ipst->ips_ipv4firewall_physical_out, 22636 NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); 22637 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 22638 if (mp == NULL) 22639 goto release_ire_and_ill; 22640 22641 mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); 22642 DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); 22643 pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE); 22644 if ((pktxmit_state == SEND_FAILED) || 22645 (pktxmit_state == LLHDR_RESLV_FAILED)) { 22646 ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" 22647 "- packet dropped\n")); 22648 release_ire_and_ill: 22649 ire_refrele(ire); 22650 if (next_mp != NULL) { 22651 freemsg(next_mp); 22652 ire_refrele(ire1); 22653 } 22654 if (conn_outgoing_ill != NULL) 22655 ill_refrele(conn_outgoing_ill); 22656 return; 22657 } 22658 22659 if (CLASSD(dst)) { 22660 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); 22661 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, 22662 LENGTH); 22663 } 22664 22665 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22666 "ip_wput_ire_end: q %p (%S)", 22667 q, "last copy out"); 22668 IRE_REFRELE(ire); 22669 22670 if (multirt_send) { 22671 ASSERT(ire1); 22672 /* 22673 * Proceed with the next RTF_MULTIRT ire, 22674 * Also set up the send-to queue accordingly. 22675 */ 22676 ire = ire1; 22677 ire1 = NULL; 22678 stq = ire->ire_stq; 22679 mp = next_mp; 22680 next_mp = NULL; 22681 ipha = (ipha_t *)mp->b_rptr; 22682 ill_index = Q_TO_INDEX(stq); 22683 ill = (ill_t *)stq->q_ptr; 22684 } 22685 } while (multirt_send); 22686 if (conn_outgoing_ill != NULL) 22687 ill_refrele(conn_outgoing_ill); 22688 return; 22689 22690 /* 22691 * ire->ire_type == IRE_BROADCAST (minimize diffs) 22692 */ 22693 broadcast: 22694 { 22695 /* 22696 * To avoid broadcast storms, we usually set the TTL to 1 for 22697 * broadcasts. However, if SO_DONTROUTE isn't set, this value 22698 * can be overridden stack-wide through the ip_broadcast_ttl 22699 * ndd tunable, or on a per-connection basis through the 22700 * IP_BROADCAST_TTL socket option. 22701 * 22702 * In the event that we are replying to incoming ICMP packets, 22703 * connp could be NULL. 
22704 */ 22705 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 22706 if (connp != NULL) { 22707 if (connp->conn_dontroute) 22708 ipha->ipha_ttl = 1; 22709 else if (connp->conn_broadcast_ttl != 0) 22710 ipha->ipha_ttl = connp->conn_broadcast_ttl; 22711 } 22712 22713 /* 22714 * Note that we are not doing an IRB_REFHOLD here. 22715 * Actually we don't care if the list changes, i.e. 22716 * if somebody deletes an IRE from the list while 22717 * we drop the lock, the next time we come around 22718 * ire_next will be NULL and hence we won't send 22719 * out multiple copies, which is fine. 22720 */ 22721 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 22722 ire1 = ire->ire_next; 22723 if (conn_outgoing_ill != NULL) { 22724 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 22725 ASSERT(ire1 == ire->ire_next); 22726 if (ire1 != NULL && ire1->ire_addr == dst) { 22727 ire_refrele(ire); 22728 ire = ire1; 22729 IRE_REFHOLD(ire); 22730 ire1 = ire->ire_next; 22731 continue; 22732 } 22733 rw_exit(&ire->ire_bucket->irb_lock); 22734 /* Did not find a matching ill */ 22735 ip1dbg(("ip_wput_ire: broadcast with no " 22736 "matching IP_BOUND_IF ill %s dst %x\n", 22737 conn_outgoing_ill->ill_name, dst)); 22738 freemsg(first_mp); 22739 if (ire != NULL) 22740 ire_refrele(ire); 22741 ill_refrele(conn_outgoing_ill); 22742 return; 22743 } 22744 } else if (ire1 != NULL && ire1->ire_addr == dst) { 22745 /* 22746 * If the next IRE has the same address and is not one 22747 * of the two copies that we need to send, try to see 22748 * whether this copy should be sent at all. This 22749 * assumes that we insert loopbacks first and then 22750 * non-loopbacks. This is achieved by always inserting 22751 * the loopback before the non-loopback. 22752 * This is used to send a single copy of a broadcast 22753 * packet out all physical interfaces that have a 22754 * matching IRE_BROADCAST while also looping 22755 * back one copy (to ip_wput_local) for each 22756 * matching physical interface. However, we avoid 22757 * sending packets out different logical interfaces 22758 * that match by having ipif_up/ipif_down suppress 22759 * duplicate IRE_BROADCASTs. 22760 * 22761 * This feature is currently used to get broadcasts 22762 * sent to multiple interfaces, when the broadcast 22763 * address being used applies to multiple interfaces. 22764 * For example, a whole net broadcast will be 22765 * replicated on every connected subnet of 22766 * the target net. 22767 * 22768 * Each zone has its own set of IRE_BROADCASTs, so that 22769 * we're able to distribute inbound packets to multiple 22770 * zones that share a broadcast address. We avoid looping 22771 * back outbound packets in different zones but on the 22772 * same ill, as the application would see duplicates. 22773 * 22774 * If the interfaces are part of the same group, 22775 * we would want to send only one copy out for 22776 * the whole group. 22777 * 22778 * This logic assumes that ire_add_v4() groups the 22779 * IRE_BROADCAST entries so that those with the same 22780 * ire_addr and ill_group are kept together. 22781 */ 22782 ire_ill = ire->ire_ipif->ipif_ill; 22783 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 22784 if (ire_ill->ill_group != NULL && 22785 (ire->ire_marks & IRE_MARK_NORECV)) { 22786 /* 22787 * If the current zone only has an ire 22788 * broadcast for this address marked 22789 * NORECV, the ire we want is ahead in 22790 * the bucket, so we look it up 22791 * deliberately ignoring the zoneid.
22792 */ 22793 for (ire1 = ire->ire_bucket->irb_ire; 22794 ire1 != NULL; 22795 ire1 = ire1->ire_next) { 22796 ire1_ill = 22797 ire1->ire_ipif->ipif_ill; 22798 if (ire1->ire_addr != dst) 22799 continue; 22800 /* skip over the current ire */ 22801 if (ire1 == ire) 22802 continue; 22803 /* skip over deleted ires */ 22804 if (ire1->ire_marks & 22805 IRE_MARK_CONDEMNED) 22806 continue; 22807 /* 22808 * non-loopback ire in our 22809 * group: use it for the next 22810 * pass in the loop 22811 */ 22812 if (ire1->ire_stq != NULL && 22813 ire1_ill->ill_group == 22814 ire_ill->ill_group) 22815 break; 22816 } 22817 } 22818 } else { 22819 while (ire1 != NULL && ire1->ire_addr == dst) { 22820 ire1_ill = ire1->ire_ipif->ipif_ill; 22821 /* 22822 * We can have two broadcast ires on the 22823 * same ill in different zones; here 22824 * we'll send a copy of the packet on 22825 * each ill and the fanout code will 22826 * call conn_wantpacket() to check that 22827 * the zone has the broadcast address 22828 * configured on the ill. If the two 22829 * ires are in the same group we only 22830 * send one copy up. 22831 */ 22832 if (ire1_ill != ire_ill && 22833 (ire1_ill->ill_group == NULL || 22834 ire_ill->ill_group == NULL || 22835 ire1_ill->ill_group != 22836 ire_ill->ill_group)) { 22837 break; 22838 } 22839 ire1 = ire1->ire_next; 22840 } 22841 } 22842 } 22843 ASSERT(multirt_send == B_FALSE); 22844 if (ire1 != NULL && ire1->ire_addr == dst) { 22845 if ((ire->ire_flags & RTF_MULTIRT) && 22846 (ire1->ire_flags & RTF_MULTIRT)) { 22847 /* 22848 * We are in the multirouting case. 22849 * The message must be sent at least 22850 * on both ires. These ires have been 22851 * inserted AFTER the standard ones 22852 * in ip_rt_add(). There are thus no 22853 * other ire entries for the destination 22854 * address in the rest of the bucket 22855 * that do not have the RTF_MULTIRT 22856 * flag. We don't process a copy 22857 * of the message here. This will be 22858 * done in the final sending loop. 22859 */ 22860 multirt_send = B_TRUE; 22861 } else { 22862 next_mp = ip_copymsg(first_mp); 22863 if (next_mp != NULL) 22864 IRE_REFHOLD(ire1); 22865 } 22866 } 22867 rw_exit(&ire->ire_bucket->irb_lock); 22868 } 22869 22870 if (stq) { 22871 /* 22872 * A non-NULL send-to queue means this packet is going 22873 * out of this machine. 22874 */ 22875 out_ill = (ill_t *)stq->q_ptr; 22876 22877 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); 22878 ttl_protocol = ((uint16_t *)ipha)[4]; 22879 /* 22880 * We accumulate the pseudo header checksum in cksum. 22881 * This is pretty hairy code, so watch close. One 22882 * thing to keep in mind is that UDP and TCP have 22883 * stored their respective datagram lengths in their 22884 * checksum fields. This lines things up real nice. 22885 */ 22886 cksum = (dst >> 16) + (dst & 0xFFFF) + 22887 (src >> 16) + (src & 0xFFFF); 22888 /* 22889 * We assume the udp checksum field contains the 22890 * length, so to compute the pseudo header checksum, 22891 * all we need is the protocol number and src/dst. 22892 */ 22893 /* Provide the checksums for UDP and TCP. 
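 *
 * (Sketch of the arithmetic: the pseudo-header is src, dst, protocol
 * and ULP length, summed as 16-bit words.  Since the ULP left its own
 * length in the checksum field, summing the datagram picks that term
 * up for free, and the seed built here is only
 *
 *	cksum = (dst >> 16) + (dst & 0xFFFF) +
 *	    (src >> 16) + (src & 0xFFFF);
 *
 * with the protocol term (IP_TCP_CSUM_COMP for TCP below) added when
 * IP_CSUM() runs.  The 32-bit sum is folded to 16 bits ones-complement
 * style; e.g. for 0x0002FFFE:
 *
 *	cksum = (cksum & 0xFFFF) + (cksum >> 16);   0xFFFE + 2 = 0x10000
 *	cksum = ~(cksum + (cksum >> 16));           low 16 bits: 0xFFFE
 *
 * the same fold this function uses for the IP header checksum.)
 *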
*/ 22894 if ((PROTO == IPPROTO_TCP) && 22895 (ip_hdr_included != IP_HDR_INCLUDED)) { 22896 /* hlen gets the number of uchar_ts in the IP header */ 22897 hlen = (V_HLEN & 0xF) << 2; 22898 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22899 IP_STAT(ipst, ip_out_sw_cksum); 22900 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22901 LENGTH - hlen); 22902 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 22903 } else if (PROTO == IPPROTO_SCTP && 22904 (ip_hdr_included != IP_HDR_INCLUDED)) { 22905 sctp_hdr_t *sctph; 22906 22907 hlen = (V_HLEN & 0xF) << 2; 22908 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22909 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22910 sctph->sh_chksum = 0; 22911 #ifdef DEBUG 22912 if (!skip_sctp_cksum) 22913 #endif 22914 sctph->sh_chksum = sctp_cksum(mp, hlen); 22915 } else { 22916 queue_t *dev_q = stq->q_next; 22917 22918 if ((dev_q->q_next || dev_q->q_first) && 22919 !canput(dev_q)) { 22920 blocked: 22921 ipha->ipha_ident = ip_hdr_included; 22922 /* 22923 * If we don't have a conn to apply 22924 * backpressure, free the message. 22925 * In the ire_send path, we don't know 22926 * the position to requeue the packet. Rather 22927 * than reorder packets, we just drop this 22928 * packet. 22929 */ 22930 if (ipst->ips_ip_output_queue && 22931 connp != NULL && 22932 caller != IRE_SEND) { 22933 if (caller == IP_WSRV) { 22934 connp->conn_did_putbq = 1; 22935 (void) putbq(connp->conn_wq, 22936 first_mp); 22937 conn_drain_insert(connp); 22938 /* 22939 * This is the service thread, 22940 * and the queue is already 22941 * noenabled. The check for 22942 * canput and the putbq is not 22943 * atomic. So we need to check 22944 * again. 22945 */ 22946 if (canput(stq->q_next)) 22947 connp->conn_did_putbq 22948 = 0; 22949 IP_STAT(ipst, ip_conn_flputbq); 22950 } else { 22951 /* 22952 * We are not the service proc. 22953 * ip_wsrv will be scheduled or 22954 * is already running. 22955 */ 22956 (void) putq(connp->conn_wq, 22957 first_mp); 22958 } 22959 } else { 22960 out_ill = (ill_t *)stq->q_ptr; 22961 BUMP_MIB(out_ill->ill_ip_mib, 22962 ipIfStatsOutDiscards); 22963 freemsg(first_mp); 22964 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22965 "ip_wput_ire_end: q %p (%S)", 22966 q, "discard"); 22967 } 22968 ire_refrele(ire); 22969 if (next_mp) { 22970 ire_refrele(ire1); 22971 freemsg(next_mp); 22972 } 22973 if (conn_outgoing_ill != NULL) 22974 ill_refrele(conn_outgoing_ill); 22975 return; 22976 } 22977 if ((PROTO == IPPROTO_UDP) && 22978 (ip_hdr_included != IP_HDR_INCLUDED)) { 22979 /* 22980 * hlen gets the number of uchar_ts in the 22981 * IP header 22982 */ 22983 hlen = (V_HLEN & 0xF) << 2; 22984 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22985 max_frag = ire->ire_max_frag; 22986 if (*up != 0) { 22987 IP_CKSUM_XMIT(out_ill, ire, mp, ipha, 22988 up, PROTO, hlen, LENGTH, max_frag, 22989 ipsec_len, cksum); 22990 /* Software checksum? */ 22991 if (DB_CKSUMFLAGS(mp) == 0) { 22992 IP_STAT(ipst, ip_out_sw_cksum); 22993 IP_STAT_UPDATE(ipst, 22994 ip_udp_out_sw_cksum_bytes, 22995 LENGTH - hlen); 22996 } 22997 } 22998 } 22999 } 23000 /* 23001 * Need to do this even when fragmenting. The local 23002 * loopback can be done without computing checksums 23003 * but forwarding out other interface must be done 23004 * after the IP checksum (and ULP checksums) have been 23005 * computed. 23006 * 23007 * NOTE : multicast_forward is set only if this packet 23008 * originated from ip_wput. For packets originating from 23009 * ip_wput_multicast, it is not set. 
23010 */ 23011 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 23012 multi_loopback: 23013 ip2dbg(("ip_wput: multicast, loop %d\n", 23014 conn_multicast_loop)); 23015 23016 /* Forget header checksum offload */ 23017 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 23018 23019 /* 23020 * Local loopback of multicasts? Check the 23021 * ill. 23022 * 23023 * Note that the loopback function will not come 23024 * in through ip_rput - it will only do the 23025 * client fanout; thus we need to do an mforward 23026 * as well. This is different from the BSD 23027 * logic. 23028 */ 23029 if (ill != NULL) { 23030 ilm_t *ilm; 23031 23032 ILM_WALKER_HOLD(ill); 23033 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 23034 ALL_ZONES); 23035 ILM_WALKER_RELE(ill); 23036 if (ilm != NULL) { 23037 /* 23038 * Pass along the virtual output q. 23039 * ip_wput_local() will distribute the 23040 * packet to all the matching zones, 23041 * except the sending zone when 23042 * IP_MULTICAST_LOOP is false. 23043 */ 23044 ip_multicast_loopback(q, ill, first_mp, 23045 conn_multicast_loop ? 0 : 23046 IP_FF_NO_MCAST_LOOP, zoneid); 23047 } 23048 } 23049 if (ipha->ipha_ttl == 0) { 23050 /* 23051 * 0 => only to this host i.e. we are 23052 * done. We are also done if this was the 23053 * loopback interface, since it is sufficient 23054 * to loop back one copy of a multicast packet. 23055 */ 23056 freemsg(first_mp); 23057 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23058 "ip_wput_ire_end: q %p (%S)", 23059 q, "loopback"); 23060 ire_refrele(ire); 23061 if (conn_outgoing_ill != NULL) 23062 ill_refrele(conn_outgoing_ill); 23063 return; 23064 } 23065 /* 23066 * ILLF_MULTICAST is checked in ip_newroute, 23067 * i.e. we don't need to check it here since 23068 * all IRE_CACHEs come from ip_newroute. 23069 * For multicast traffic, SO_DONTROUTE is interpreted 23070 * to mean only send the packet out the interface 23071 * (optionally specified with IP_MULTICAST_IF) 23072 * and do not forward it out additional interfaces. 23073 * RSVP and the rsvp daemon are an example of a 23074 * protocol and user level process that 23075 * handle their own routing. Hence, they use the 23076 * SO_DONTROUTE option to accomplish this. 23077 */ 23078 23079 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 23080 ill != NULL) { 23081 /* Unconditionally redo the checksum */ 23082 ipha->ipha_hdr_checksum = 0; 23083 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 23084 23085 /* 23086 * If this needs to go out secure, we need 23087 * to wait till we finish the IPsec 23088 * processing. 23089 */ 23090 if (ipsec_len == 0 && 23091 ip_mforward(ill, ipha, mp)) { 23092 freemsg(first_mp); 23093 ip1dbg(("ip_wput: mforward failed\n")); 23094 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23095 "ip_wput_ire_end: q %p (%S)", 23096 q, "mforward failed"); 23097 ire_refrele(ire); 23098 if (conn_outgoing_ill != NULL) 23099 ill_refrele(conn_outgoing_ill); 23100 return; 23101 } 23102 } 23103 } 23104 max_frag = ire->ire_max_frag; 23105 cksum += ttl_protocol; 23106 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 23107 /* No fragmentation required for this one. */ 23108 /* 23109 * Don't use frag_flag if packet is pre-built or source 23110 * routed or if multicast (since multicast packets do 23111 * not solicit ICMP "packet too big" messages).
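 *
 * (Here frag_flag is ire->ire_frag_flag, which is expected to be
 * IPH_DF when path-MTU discovery is in effect, so for ordinary
 * unicast traffic the statement below amounts to
 *
 *	ipha->ipha_fragment_offset_and_flags |= htons(IPH_DF);
 *
 * pre-built, source-routed and multicast packets are exempted since
 * no "fragmentation needed" ICMP can be counted on to come back for
 * them, and application-set DF bits must not be disturbed.)
 *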
23112 */ 23113 if ((ip_hdr_included != IP_HDR_INCLUDED) && 23114 (V_HLEN == IP_SIMPLE_HDR_VERSION || 23115 !ip_source_route_included(ipha)) && 23116 !CLASSD(ipha->ipha_dst)) 23117 ipha->ipha_fragment_offset_and_flags |= 23118 htons(ire->ire_frag_flag); 23119 23120 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 23121 /* Complete the IP header checksum. */ 23122 cksum += ipha->ipha_ident; 23123 cksum += (v_hlen_tos_len >> 16)+ 23124 (v_hlen_tos_len & 0xFFFF); 23125 cksum += ipha->ipha_fragment_offset_and_flags; 23126 hlen = (V_HLEN & 0xF) - 23127 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 23128 if (hlen) { 23129 checksumoptions: 23130 /* 23131 * Account for the IP Options in the IP 23132 * header checksum. 23133 */ 23134 up = (uint16_t *)(rptr+ 23135 IP_SIMPLE_HDR_LENGTH); 23136 do { 23137 cksum += up[0]; 23138 cksum += up[1]; 23139 up += 2; 23140 } while (--hlen); 23141 } 23142 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 23143 cksum = ~(cksum + (cksum >> 16)); 23144 ipha->ipha_hdr_checksum = (uint16_t)cksum; 23145 } 23146 if (ipsec_len != 0) { 23147 ipsec_out_process(q, first_mp, ire, ill_index); 23148 if (!next_mp) { 23149 ire_refrele(ire); 23150 if (conn_outgoing_ill != NULL) 23151 ill_refrele(conn_outgoing_ill); 23152 return; 23153 } 23154 goto next; 23155 } 23156 23157 /* 23158 * multirt_send has already been handled 23159 * for broadcast, but not yet for multicast 23160 * or IP options. 23161 */ 23162 if (next_mp == NULL) { 23163 if (ire->ire_flags & RTF_MULTIRT) { 23164 multirt_send = B_TRUE; 23165 } 23166 } 23167 23168 /* 23169 * In most cases, the emission loop below is 23170 * entered only once. Only in the case where 23171 * the ire holds the RTF_MULTIRT flag, do we loop 23172 * to process all RTF_MULTIRT ires in the bucket, 23173 * and send the packet through all crossed 23174 * RTF_MULTIRT routes. 23175 */ 23176 do { 23177 if (multirt_send) { 23178 irb_t *irb; 23179 23180 irb = ire->ire_bucket; 23181 ASSERT(irb != NULL); 23182 /* 23183 * We are in a multiple send case, 23184 * need to get the next IRE and make 23185 * a duplicate of the packet. 23186 */ 23187 IRB_REFHOLD(irb); 23188 for (ire1 = ire->ire_next; 23189 ire1 != NULL; 23190 ire1 = ire1->ire_next) { 23191 if (!(ire1->ire_flags & 23192 RTF_MULTIRT)) { 23193 continue; 23194 } 23195 if (ire1->ire_addr != 23196 ire->ire_addr) { 23197 continue; 23198 } 23199 if (ire1->ire_marks & 23200 (IRE_MARK_CONDEMNED| 23201 IRE_MARK_HIDDEN)) { 23202 continue; 23203 } 23204 23205 /* Got one */ 23206 IRE_REFHOLD(ire1); 23207 break; 23208 } 23209 IRB_REFRELE(irb); 23210 23211 if (ire1 != NULL) { 23212 next_mp = copyb(mp); 23213 if ((next_mp == NULL) || 23214 ((mp->b_cont != NULL) && 23215 ((next_mp->b_cont = 23216 dupmsg(mp->b_cont)) 23217 == NULL))) { 23218 freemsg(next_mp); 23219 next_mp = NULL; 23220 ire_refrele(ire1); 23221 ire1 = NULL; 23222 } 23223 } 23224 23225 /* 23226 * Last multiroute ire; don't loop 23227 * anymore. The emission is over 23228 * and next_mp is NULL. 
*/ 23230 if (ire1 == NULL) { 23231 multirt_send = B_FALSE; 23232 } 23233 } 23234 23235 out_ill = ire_to_ill(ire); 23236 DTRACE_PROBE4(ip4__physical__out__start, 23237 ill_t *, NULL, 23238 ill_t *, out_ill, 23239 ipha_t *, ipha, mblk_t *, mp); 23240 FW_HOOKS(ipst->ips_ip4_physical_out_event, 23241 ipst->ips_ipv4firewall_physical_out, 23242 NULL, out_ill, ipha, mp, mp, 0, ipst); 23243 DTRACE_PROBE1(ip4__physical__out__end, 23244 mblk_t *, mp); 23245 if (mp == NULL) 23246 goto release_ire_and_ill_2; 23247 23248 ASSERT(ipsec_len == 0); 23249 mp->b_prev = 23250 SET_BPREV_FLAG(IPP_LOCAL_OUT); 23251 DTRACE_PROBE2(ip__xmit__2, 23252 mblk_t *, mp, ire_t *, ire); 23253 pktxmit_state = ip_xmit_v4(mp, ire, 23254 NULL, B_TRUE); 23255 if ((pktxmit_state == SEND_FAILED) || 23256 (pktxmit_state == LLHDR_RESLV_FAILED)) { 23257 release_ire_and_ill_2: 23258 if (next_mp) { 23259 freemsg(next_mp); 23260 ire_refrele(ire1); 23261 } 23262 ire_refrele(ire); 23263 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23264 "ip_wput_ire_end: q %p (%S)", 23265 q, "discard MDATA"); 23266 if (conn_outgoing_ill != NULL) 23267 ill_refrele(conn_outgoing_ill); 23268 return; 23269 } 23270 23271 if (CLASSD(dst)) { 23272 BUMP_MIB(out_ill->ill_ip_mib, 23273 ipIfStatsHCOutMcastPkts); 23274 UPDATE_MIB(out_ill->ill_ip_mib, 23275 ipIfStatsHCOutMcastOctets, 23276 LENGTH); 23277 } else if (ire->ire_type == IRE_BROADCAST) { 23278 BUMP_MIB(out_ill->ill_ip_mib, 23279 ipIfStatsHCOutBcastPkts); 23280 } 23281 23282 if (multirt_send) { 23283 /* 23284 * We are in a multiple send case, 23285 * need to re-enter the sending loop 23286 * using the next ire. 23287 */ 23288 ire_refrele(ire); 23289 ire = ire1; 23290 stq = ire->ire_stq; 23291 mp = next_mp; 23292 next_mp = NULL; 23293 ipha = (ipha_t *)mp->b_rptr; 23294 ill_index = Q_TO_INDEX(stq); 23295 } 23296 } while (multirt_send); 23297 23298 if (!next_mp) { 23299 /* 23300 * Last copy going out (the ultra-common 23301 * case). Note that we intentionally replicate 23302 * the putnext rather than calling it before 23303 * the next_mp check in hopes of a little 23304 * tail-call action out of the compiler. 23305 */ 23306 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23307 "ip_wput_ire_end: q %p (%S)", 23308 q, "last copy out(1)"); 23309 ire_refrele(ire); 23310 if (conn_outgoing_ill != NULL) 23311 ill_refrele(conn_outgoing_ill); 23312 return; 23313 } 23314 /* More copies going out below. */ 23315 } else { 23316 int offset; 23317 fragmentit: 23318 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 23319 /* 23320 * If this would generate an icmp_frag_needed message, 23321 * we need to handle it before we do the IPsec 23322 * processing. Otherwise, we need to strip the IPsec 23323 * headers before we send up the message to the ULPs, 23324 * which becomes messy and difficult. 23325 */ 23326 if (ipsec_len != 0) { 23327 if ((max_frag < (unsigned int)(LENGTH + 23328 ipsec_len)) && (offset & IPH_DF)) { 23329 out_ill = (ill_t *)stq->q_ptr; 23330 BUMP_MIB(out_ill->ill_ip_mib, 23331 ipIfStatsOutFragFails); 23332 BUMP_MIB(out_ill->ill_ip_mib, 23333 ipIfStatsOutFragReqds); 23334 ipha->ipha_hdr_checksum = 0; 23335 ipha->ipha_hdr_checksum = 23336 (uint16_t)ip_csum_hdr(ipha); 23337 icmp_frag_needed(ire->ire_stq, first_mp, 23338 max_frag, zoneid, ipst); 23339 if (!next_mp) { 23340 ire_refrele(ire); 23341 if (conn_outgoing_ill != NULL) { 23342 ill_refrele( 23343 conn_outgoing_ill); 23344 } 23345 return; 23346 } 23347 } else { 23348 /* 23349 * This won't cause an icmp_frag_needed 23350 * message to be generated.
Send it on 23351 * the wire. Note that this could still 23352 * cause fragmentation and all we 23353 * do is the generation of the message 23354 * to the ULP if needed before IPsec. 23355 */ 23356 if (!next_mp) { 23357 ipsec_out_process(q, first_mp, 23358 ire, ill_index); 23359 TRACE_2(TR_FAC_IP, 23360 TR_IP_WPUT_IRE_END, 23361 "ip_wput_ire_end: q %p " 23362 "(%S)", q, 23363 "last ipsec_out_process"); 23364 ire_refrele(ire); 23365 if (conn_outgoing_ill != NULL) { 23366 ill_refrele( 23367 conn_outgoing_ill); 23368 } 23369 return; 23370 } 23371 ipsec_out_process(q, first_mp, 23372 ire, ill_index); 23373 } 23374 } else { 23375 /* 23376 * Initiate IPPF processing. For 23377 * fragmentable packets we finish 23378 * all QOS packet processing before 23379 * calling: 23380 * ip_wput_ire_fragmentit->ip_wput_frag 23381 */ 23382 23383 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23384 ip_process(IPP_LOCAL_OUT, &mp, 23385 ill_index); 23386 if (mp == NULL) { 23387 out_ill = (ill_t *)stq->q_ptr; 23388 BUMP_MIB(out_ill->ill_ip_mib, 23389 ipIfStatsOutDiscards); 23390 if (next_mp != NULL) { 23391 freemsg(next_mp); 23392 ire_refrele(ire1); 23393 } 23394 ire_refrele(ire); 23395 TRACE_2(TR_FAC_IP, 23396 TR_IP_WPUT_IRE_END, 23397 "ip_wput_ire: q %p (%S)", 23398 q, "discard MDATA"); 23399 if (conn_outgoing_ill != NULL) { 23400 ill_refrele( 23401 conn_outgoing_ill); 23402 } 23403 return; 23404 } 23405 } 23406 if (!next_mp) { 23407 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23408 "ip_wput_ire_end: q %p (%S)", 23409 q, "last fragmentation"); 23410 ip_wput_ire_fragmentit(mp, ire, 23411 zoneid, ipst); 23412 ire_refrele(ire); 23413 if (conn_outgoing_ill != NULL) 23414 ill_refrele(conn_outgoing_ill); 23415 return; 23416 } 23417 ip_wput_ire_fragmentit(mp, ire, zoneid, ipst); 23418 } 23419 } 23420 } else { 23421 nullstq: 23422 /* A NULL stq means the destination address is local. */ 23423 UPDATE_OB_PKT_COUNT(ire); 23424 ire->ire_last_used_time = lbolt; 23425 ASSERT(ire->ire_ipif != NULL); 23426 if (!next_mp) { 23427 /* 23428 * Is there an "in" and "out" for traffic local 23429 * to a host (loopback)? The code in Solaris doesn't 23430 * explicitly draw a line in its code for in vs out, 23431 * so we've had to draw a line in the sand: ip_wput_ire 23432 * is considered to be the "output" side and 23433 * ip_wput_local to be the "input" side. 23434 */ 23435 out_ill = ire_to_ill(ire); 23436 23437 /* 23438 * DTrace this as ip:::send. A blocked packet will 23439 * fire the send probe, but not the receive probe. 23440 */ 23441 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23442 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23443 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23444 23445 DTRACE_PROBE4(ip4__loopback__out__start, 23446 ill_t *, NULL, ill_t *, out_ill, 23447 ipha_t *, ipha, mblk_t *, first_mp); 23448 23449 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23450 ipst->ips_ipv4firewall_loopback_out, 23451 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23452 23453 DTRACE_PROBE1(ip4__loopback__out_end, 23454 mblk_t *, first_mp); 23455 23456 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23457 "ip_wput_ire_end: q %p (%S)", 23458 q, "local address"); 23459 23460 if (first_mp != NULL) 23461 ip_wput_local(q, out_ill, ipha, 23462 first_mp, ire, 0, ire->ire_zoneid); 23463 ire_refrele(ire); 23464 if (conn_outgoing_ill != NULL) 23465 ill_refrele(conn_outgoing_ill); 23466 return; 23467 } 23468 23469 out_ill = ire_to_ill(ire); 23470 23471 /* 23472 * DTrace this as ip:::send. 
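 *
 * (These are the stable ip provider probes; for instance, this path
 * can be watched from the command line with something like
 *
 *	dtrace -n 'ip:::send { trace(args[2]->ip_saddr); }'
 *
 * where args[2] is the probe's ipinfo_t argument.)
 *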
A blocked packet will fire the 23473 * send probe, but not the receive probe. 23474 */ 23475 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23476 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23477 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23478 23479 DTRACE_PROBE4(ip4__loopback__out__start, 23480 ill_t *, NULL, ill_t *, out_ill, 23481 ipha_t *, ipha, mblk_t *, first_mp); 23482 23483 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23484 ipst->ips_ipv4firewall_loopback_out, 23485 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23486 23487 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); 23488 23489 if (first_mp != NULL) 23490 ip_wput_local(q, out_ill, ipha, 23491 first_mp, ire, 0, ire->ire_zoneid); 23492 } 23493 next: 23494 /* 23495 * More copies going out to additional interfaces. 23496 * ire1 has already been held. We don't need the 23497 * "ire" anymore. 23498 */ 23499 ire_refrele(ire); 23500 ire = ire1; 23501 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 23502 mp = next_mp; 23503 ASSERT(ire->ire_ipversion == IPV4_VERSION); 23504 ill = ire_to_ill(ire); 23505 first_mp = mp; 23506 if (ipsec_len != 0) { 23507 ASSERT(first_mp->b_datap->db_type == M_CTL); 23508 mp = mp->b_cont; 23509 } 23510 dst = ire->ire_addr; 23511 ipha = (ipha_t *)mp->b_rptr; 23512 /* 23513 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 23514 * Restore the ipha_ident "no checksum" flag. 23515 */ 23516 src = orig_src; 23517 ipha->ipha_ident = ip_hdr_included; 23518 goto another; 23519 23520 #undef rptr 23521 #undef Q_TO_INDEX 23522 } 23523 23524 /* 23525 * Routine to allocate a message that is used to notify the ULP about MDT. 23526 * The caller may provide a pointer to the link-layer MDT capabilities, 23527 * or NULL if MDT is to be disabled on the stream. 23528 */ 23529 mblk_t * 23530 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 23531 { 23532 mblk_t *mp; 23533 ip_mdt_info_t *mdti; 23534 ill_mdt_capab_t *idst; 23535 23536 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 23537 DB_TYPE(mp) = M_CTL; 23538 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 23539 mdti = (ip_mdt_info_t *)mp->b_rptr; 23540 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 23541 idst = &(mdti->mdt_capab); 23542 23543 /* 23544 * If the caller provides us with the capability, copy 23545 * it over into our notification message; otherwise 23546 * we zero out the capability portion. 23547 */ 23548 if (isrc != NULL) 23549 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23550 else 23551 bzero((caddr_t)idst, sizeof (*idst)); 23552 } 23553 return (mp); 23554 } 23555 23556 /* 23557 * Routine which determines whether MDT can be enabled on the destination 23558 * IRE and IPC combination, and if so, allocates and returns the MDT 23559 * notification mblk that may be used by ULP. We also check if we need to 23560 * turn MDT back to 'on' when certain restrictions that prohibited 23561 * MDT usage in the past have been lifted. This gets called during IP 23562 * and ULP binding. 23563 */ 23564 mblk_t * 23565 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23566 ill_mdt_capab_t *mdt_cap) 23567 { 23568 mblk_t *mp; 23569 boolean_t rc = B_FALSE; 23570 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23571 23572 ASSERT(dst_ire != NULL); 23573 ASSERT(connp != NULL); 23574 ASSERT(mdt_cap != NULL); 23575 23576 /* 23577 * Currently, we only support simple TCP/{IPv4,IPv6} with 23578 * Multidata, which is handled in tcp_multisend().
This 23579 * is the reason why we do all these checks here, to ensure 23580 * that we don't enable Multidata for cases that we 23581 * can't handle at the moment. 23582 */ 23583 do { 23584 /* Only do TCP at the moment */ 23585 if (connp->conn_ulp != IPPROTO_TCP) 23586 break; 23587 23588 /* 23589 * IPsec outbound policy present? Note that we get here 23590 * after calling ipsec_conn_cache_policy() where the global 23591 * policy checking is performed. conn_latch will be 23592 * non-NULL as long as there's a policy defined, 23593 * but conn_out_enforce_policy may be NULL when the 23594 * connection is non-secure; hence we check 23595 * further whether the latch refers to an outbound policy. 23596 */ 23597 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 23598 break; 23599 23600 /* CGTP (multiroute) is enabled? */ 23601 if (dst_ire->ire_flags & RTF_MULTIRT) 23602 break; 23603 23604 /* Outbound IPQoS enabled? */ 23605 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23606 /* 23607 * In this case, we disable MDT for this and all 23608 * future connections going over the interface. 23609 */ 23610 mdt_cap->ill_mdt_on = 0; 23611 break; 23612 } 23613 23614 /* socket option(s) present? */ 23615 if (!CONN_IS_LSO_MD_FASTPATH(connp)) 23616 break; 23617 23618 rc = B_TRUE; 23619 /* CONSTCOND */ 23620 } while (0); 23621 23622 /* Remember the result */ 23623 connp->conn_mdt_ok = rc; 23624 23625 if (!rc) 23626 return (NULL); 23627 else if (!mdt_cap->ill_mdt_on) { 23628 /* 23629 * If MDT was previously turned off and we can now 23630 * do MDT (due to IPQoS policy removal, etc.), 23631 * then enable it for this interface. 23632 */ 23633 mdt_cap->ill_mdt_on = 1; 23634 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 23635 "interface %s\n", ill_name)); 23636 } 23637 23638 /* Allocate the MDT info mblk */ 23639 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 23640 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 23641 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23642 return (NULL); 23643 } 23644 return (mp); 23645 } 23646 23647 /* 23648 * Routine to allocate a message that is used to notify the ULP about LSO. 23649 * The caller may provide a pointer to the link-layer LSO capabilities, 23650 * or NULL if LSO is to be disabled on the stream. 23651 */ 23652 mblk_t * 23653 ip_lsoinfo_alloc(ill_lso_capab_t *isrc) 23654 { 23655 mblk_t *mp; 23656 ip_lso_info_t *lsoi; 23657 ill_lso_capab_t *idst; 23658 23659 if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { 23660 DB_TYPE(mp) = M_CTL; 23661 mp->b_wptr = mp->b_rptr + sizeof (*lsoi); 23662 lsoi = (ip_lso_info_t *)mp->b_rptr; 23663 lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; 23664 idst = &(lsoi->lso_capab); 23665 23666 /* 23667 * If the caller provides us with the capability, copy 23668 * it over into our notification message; otherwise 23669 * we zero out the capability portion. 23670 */ 23671 if (isrc != NULL) 23672 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23673 else 23674 bzero((caddr_t)idst, sizeof (*idst)); 23675 } 23676 return (mp); 23677 } 23678 23679 /* 23680 * Routine which determines whether LSO can be enabled on the destination 23681 * IRE and IPC combination, and if so, allocates and returns the LSO 23682 * notification mblk that may be used by ULP. We also check if we need to 23683 * turn LSO back to 'on' when certain restrictions that prohibited 23684 * LSO usage in the past have been lifted. This gets called during IP 23685 * and ULP binding.
23686 */ 23687 mblk_t * 23688 ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23689 ill_lso_capab_t *lso_cap) 23690 { 23691 mblk_t *mp; 23692 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23693 23694 ASSERT(dst_ire != NULL); 23695 ASSERT(connp != NULL); 23696 ASSERT(lso_cap != NULL); 23697 23698 connp->conn_lso_ok = B_TRUE; 23699 23700 if ((connp->conn_ulp != IPPROTO_TCP) || 23701 CONN_IPSEC_OUT_ENCAPSULATED(connp) || 23702 (dst_ire->ire_flags & RTF_MULTIRT) || 23703 !CONN_IS_LSO_MD_FASTPATH(connp) || 23704 (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { 23705 connp->conn_lso_ok = B_FALSE; 23706 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23707 /* 23708 * Disable LSO for this and all future connections going 23709 * over the interface. 23710 */ 23711 lso_cap->ill_lso_on = 0; 23712 } 23713 } 23714 23715 if (!connp->conn_lso_ok) 23716 return (NULL); 23717 else if (!lso_cap->ill_lso_on) { 23718 /* 23719 * If LSO was previously turned off and we can now do LSO 23720 * (due to IPQoS policy removal, etc.), then enable it 23721 * for this interface. 23722 */ 23723 lso_cap->ill_lso_on = 1; 23724 ip1dbg(("ip_lsoinfo_return: reenabling LSO for interface %s\n", 23725 ill_name)); 23726 } 23727 23728 /* Allocate the LSO info mblk */ 23729 if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) 23730 ip0dbg(("ip_lsoinfo_return: can't enable LSO for " 23731 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23732 23733 return (mp); 23734 } 23735 23736 /* 23737 * Create destination address attribute, and fill it with the physical 23738 * destination address and SAP taken from the template DL_UNITDATA_REQ 23739 * message block. 23740 */ 23741 boolean_t 23742 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 23743 { 23744 dl_unitdata_req_t *dlurp; 23745 pattr_t *pa; 23746 pattrinfo_t pa_info; 23747 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 23748 uint_t das_len, das_off; 23749 23750 ASSERT(dlmp != NULL); 23751 23752 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 23753 das_len = dlurp->dl_dest_addr_length; 23754 das_off = dlurp->dl_dest_addr_offset; 23755 23756 pa_info.type = PATTR_DSTADDRSAP; 23757 pa_info.len = sizeof (**das) + das_len - 1; 23758 23759 /* create and associate the attribute */ 23760 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23761 if (pa != NULL) { 23762 ASSERT(*das != NULL); 23763 (*das)->addr_is_group = 0; 23764 (*das)->addr_len = (uint8_t)das_len; 23765 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 23766 } 23767 23768 return (pa != NULL); 23769 } 23770 23771 /* 23772 * Create hardware checksum attribute and fill it with the values passed.
23773 */ 23774 boolean_t 23775 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 23776 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 23777 { 23778 pattr_t *pa; 23779 pattrinfo_t pa_info; 23780 23781 ASSERT(mmd != NULL); 23782 23783 pa_info.type = PATTR_HCKSUM; 23784 pa_info.len = sizeof (pattr_hcksum_t); 23785 23786 /* create and associate the attribute */ 23787 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23788 if (pa != NULL) { 23789 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 23790 23791 hck->hcksum_start_offset = start_offset; 23792 hck->hcksum_stuff_offset = stuff_offset; 23793 hck->hcksum_end_offset = end_offset; 23794 hck->hcksum_flags = flags; 23795 } 23796 return (pa != NULL); 23797 } 23798 23799 /* 23800 * Create zerocopy attribute and fill it with the specified flags 23801 */ 23802 boolean_t 23803 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 23804 { 23805 pattr_t *pa; 23806 pattrinfo_t pa_info; 23807 23808 ASSERT(mmd != NULL); 23809 pa_info.type = PATTR_ZCOPY; 23810 pa_info.len = sizeof (pattr_zcopy_t); 23811 23812 /* create and associate the attribute */ 23813 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23814 if (pa != NULL) { 23815 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 23816 23817 zcopy->zcopy_flags = flags; 23818 } 23819 return (pa != NULL); 23820 } 23821 23822 /* 23823 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 23824 * block chain. We could rewrite to handle arbitrary message block chains but 23825 * that would make the code complicated and slow. Right now there three 23826 * restrictions: 23827 * 23828 * 1. The first message block must contain the complete IP header and 23829 * at least 1 byte of payload data. 23830 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 23831 * so that we can use a single Multidata message. 23832 * 3. No frag must be distributed over two or more message blocks so 23833 * that we don't need more than two packet descriptors per frag. 23834 * 23835 * The above restrictions allow us to support userland applications (which 23836 * will send down a single message block) and NFS over UDP (which will 23837 * send down a chain of at most three message blocks). 23838 * 23839 * We also don't use MDT for payloads with less than or equal to 23840 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 23841 */ 23842 boolean_t 23843 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 23844 { 23845 int blocks; 23846 ssize_t total, missing, size; 23847 23848 ASSERT(mp != NULL); 23849 ASSERT(hdr_len > 0); 23850 23851 size = MBLKL(mp) - hdr_len; 23852 if (size <= 0) 23853 return (B_FALSE); 23854 23855 /* The first mblk contains the header and some payload. */ 23856 blocks = 1; 23857 total = size; 23858 size %= len; 23859 missing = (size == 0) ? 0 : (len - size); 23860 mp = mp->b_cont; 23861 23862 while (mp != NULL) { 23863 /* 23864 * Give up if we encounter a zero length message block. 23865 * In practice, this should rarely happen and therefore 23866 * not worth the trouble of freeing and re-linking the 23867 * mblk from the chain to handle such case. 23868 */ 23869 if ((size = MBLKL(mp)) == 0) 23870 return (B_FALSE); 23871 23872 /* Too many payload buffers for a single Multidata message? */ 23873 if (++blocks > MULTIDATA_MAX_PBUFS) 23874 return (B_FALSE); 23875 23876 total += size; 23877 /* Is a frag distributed over two or more message blocks? 
*/ 23878 if (missing > size) 23879 return (B_FALSE); 23880 size -= missing; 23881 23882 size %= len; 23883 missing = (size == 0) ? 0 : (len - size); 23884 23885 mp = mp->b_cont; 23886 } 23887 23888 return (total > ip_wput_frag_mdt_min); 23889 } 23890 23891 /* 23892 * Outbound IPv4 fragmentation routine using MDT. 23893 */ 23894 static void 23895 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 23896 uint32_t frag_flag, int offset) 23897 { 23898 ipha_t *ipha_orig; 23899 int i1, ip_data_end; 23900 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 23901 mblk_t *hdr_mp, *md_mp = NULL; 23902 unsigned char *hdr_ptr, *pld_ptr; 23903 multidata_t *mmd; 23904 ip_pdescinfo_t pdi; 23905 ill_t *ill; 23906 ip_stack_t *ipst = ire->ire_ipst; 23907 23908 ASSERT(DB_TYPE(mp) == M_DATA); 23909 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 23910 23911 ill = ire_to_ill(ire); 23912 ASSERT(ill != NULL); 23913 23914 ipha_orig = (ipha_t *)mp->b_rptr; 23915 mp->b_rptr += sizeof (ipha_t); 23916 23917 /* Calculate how many packets we will send out */ 23918 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 23919 pkts = (i1 + len - 1) / len; 23920 ASSERT(pkts > 1); 23921 23922 /* Allocate a message block which will hold all the IP Headers. */ 23923 wroff = ipst->ips_ip_wroff_extra; 23924 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 23925 23926 i1 = pkts * hdr_chunk_len; 23927 /* 23928 * Create the header buffer, Multidata and destination address 23929 * and SAP attribute that should be associated with it. 23930 */ 23931 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 23932 ((hdr_mp->b_wptr += i1), 23933 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 23934 !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { 23935 freemsg(mp); 23936 if (md_mp == NULL) { 23937 freemsg(hdr_mp); 23938 } else { 23939 free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded); 23940 freemsg(md_mp); 23941 } 23942 IP_STAT(ipst, ip_frag_mdt_allocfail); 23943 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); 23944 return; 23945 } 23946 IP_STAT(ipst, ip_frag_mdt_allocd); 23947 23948 /* 23949 * Add a payload buffer to the Multidata; this operation must not 23950 * fail, or otherwise our logic in this routine is broken. There 23951 * is no memory allocation done by the routine, so any returned 23952 * failure simply tells us that we've done something wrong. 23953 * 23954 * A failure tells us that either we're adding the same payload 23955 * buffer more than once, or we're trying to add more buffers than 23956 * allowed. None of the above cases should happen, and we panic 23957 * because either there's horrible heap corruption, and/or 23958 * programming mistake. 23959 */ 23960 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 23961 goto pbuf_panic; 23962 23963 hdr_ptr = hdr_mp->b_rptr; 23964 pld_ptr = mp->b_rptr; 23965 23966 /* Establish the ending byte offset, based on the starting offset. */ 23967 offset <<= 3; 23968 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 23969 IP_SIMPLE_HDR_LENGTH; 23970 23971 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 23972 23973 while (pld_ptr < mp->b_wptr) { 23974 ipha_t *ipha; 23975 uint16_t offset_and_flags; 23976 uint16_t ip_len; 23977 int error; 23978 23979 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 23980 ipha = (ipha_t *)(hdr_ptr + wroff); 23981 ASSERT(OK_32PTR(ipha)); 23982 *ipha = *ipha_orig; 23983 23984 if (ip_data_end - offset > len) { 23985 offset_and_flags = IPH_MF; 23986 } else { 23987 /* 23988 * Last frag. Set len to the length of this last piece. 
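/*
 * Editor's worked example for ip_can_frag_mdt() above (illustrative
 * only). With len = 1480 and payload block lengths of 1500, 100 and
 * 2000 bytes: the first block leaves 1500 % 1480 = 20 bytes of a
 * straddling frag, so missing = 1460; the second block has only
 * size = 100 < missing, so the routine returns B_FALSE -- that frag
 * would span three blocks and need more than two descriptors. With
 * block lengths 1500 and 1460 instead, missing (1460) is satisfied
 * exactly by the second block and the chain passes the check.
 */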
23989 */ 23990 len = ip_data_end - offset; 23991 /* A frag of a frag might have IPH_MF non-zero */ 23992 offset_and_flags = 23993 ntohs(ipha->ipha_fragment_offset_and_flags) & 23994 IPH_MF; 23995 } 23996 offset_and_flags |= (uint16_t)(offset >> 3); 23997 offset_and_flags |= (uint16_t)frag_flag; 23998 /* Store the offset and flags in the IP header. */ 23999 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24000 24001 /* Store the length in the IP header. */ 24002 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 24003 ipha->ipha_length = htons(ip_len); 24004 24005 /* 24006 * Set the IP header checksum. Note that mp is just 24007 * the header, so this is easy to pass to ip_csum. 24008 */ 24009 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24010 24011 DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *, 24012 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 24013 NULL, int, 0); 24014 24015 /* 24016 * Record offset and size of header and data of the next packet 24017 * in the multidata message. 24018 */ 24019 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 24020 PDESC_PLD_INIT(&pdi); 24021 i1 = MIN(mp->b_wptr - pld_ptr, len); 24022 ASSERT(i1 > 0); 24023 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 24024 if (i1 == len) { 24025 pld_ptr += len; 24026 } else { 24027 i1 = len - i1; 24028 mp = mp->b_cont; 24029 ASSERT(mp != NULL); 24030 ASSERT(MBLKL(mp) >= i1); 24031 /* 24032 * Attach the next payload message block to the 24033 * multidata message. 24034 */ 24035 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24036 goto pbuf_panic; 24037 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 24038 pld_ptr = mp->b_rptr + i1; 24039 } 24040 24041 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 24042 KM_NOSLEEP)) == NULL) { 24043 /* 24044 * Any failure other than ENOMEM indicates that we 24045 * have passed in invalid pdesc info or parameters 24046 * to mmd_addpdesc, which must not happen. 24047 * 24048 * EINVAL is a result of failure on boundary checks 24049 * against the pdesc info contents. It should not 24050 * happen, and we panic because either there's 24051 * horrible heap corruption, and/or programming 24052 * mistake. 24053 */ 24054 if (error != ENOMEM) { 24055 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 24056 "pdesc logic error detected for " 24057 "mmd %p pinfo %p (%d)\n", 24058 (void *)mmd, (void *)&pdi, error); 24059 /* NOTREACHED */ 24060 } 24061 IP_STAT(ipst, ip_frag_mdt_addpdescfail); 24062 /* Free unattached payload message blocks as well */ 24063 md_mp->b_cont = mp->b_cont; 24064 goto free_mmd; 24065 } 24066 24067 /* Advance fragment offset. */ 24068 offset += len; 24069 24070 /* Advance to location for next header in the buffer. */ 24071 hdr_ptr += hdr_chunk_len; 24072 24073 /* Did we reach the next payload message block? */ 24074 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 24075 mp = mp->b_cont; 24076 /* 24077 * Attach the next message block with payload 24078 * data to the multidata message. 
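/*
 * Editor's sketch (hypothetical helper, not in the original file) of
 * the ipha_fragment_offset_and_flags encoding used above: the fragment
 * offset travels in 8-byte units in the low 13 bits, with IPH_MF and
 * IPH_DF in the high bits, stored in network byte order.
 */
static uint16_t
ip_frag_pack(uint_t byte_offset, boolean_t more_frags, uint16_t frag_flag)
{
	uint16_t v;

	ASSERT((byte_offset & 0x7) == 0);	/* offsets are 8-byte units */
	v = (uint16_t)(byte_offset >> 3);
	if (more_frags)
		v |= IPH_MF;
	v |= frag_flag;		/* caller's DF policy bit, if any */
	return (htons(v));
}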
24079 */ 24080 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24081 goto pbuf_panic; 24082 pld_ptr = mp->b_rptr; 24083 } 24084 } 24085 24086 ASSERT(hdr_mp->b_wptr == hdr_ptr); 24087 ASSERT(mp->b_wptr == pld_ptr); 24088 24089 /* Update IP statistics */ 24090 IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts); 24091 24092 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); 24093 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); 24094 24095 len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH; 24096 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); 24097 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len); 24098 24099 if (pkt_type == OB_PKT) { 24100 ire->ire_ob_pkt_count += pkts; 24101 if (ire->ire_ipif != NULL) 24102 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 24103 } else { 24104 /* The type is IB_PKT in the forwarding path. */ 24105 ire->ire_ib_pkt_count += pkts; 24106 ASSERT(!IRE_IS_LOCAL(ire)); 24107 if (ire->ire_type & IRE_BROADCAST) { 24108 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 24109 } else { 24110 UPDATE_MIB(ill->ill_ip_mib, 24111 ipIfStatsHCOutForwDatagrams, pkts); 24112 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 24113 } 24114 } 24115 ire->ire_last_used_time = lbolt; 24116 /* Send it down */ 24117 putnext(ire->ire_stq, md_mp); 24118 return; 24119 24120 pbuf_panic: 24121 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 24122 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 24123 pbuf_idx); 24124 /* NOTREACHED */ 24125 } 24126 24127 /* 24128 * Outbound IP fragmentation routine. 24129 * 24130 * NOTE : This routine does not ire_refrele the ire that is passed in 24131 * as the argument. 24132 */ 24133 static void 24134 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 24135 uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst) 24136 { 24137 int i1; 24138 mblk_t *ll_hdr_mp; 24139 int ll_hdr_len; 24140 int hdr_len; 24141 mblk_t *hdr_mp; 24142 ipha_t *ipha; 24143 int ip_data_end; 24144 int len; 24145 mblk_t *mp = mp_orig, *mp1; 24146 int offset; 24147 queue_t *q; 24148 uint32_t v_hlen_tos_len; 24149 mblk_t *first_mp; 24150 boolean_t mctl_present; 24151 ill_t *ill; 24152 ill_t *out_ill; 24153 mblk_t *xmit_mp; 24154 mblk_t *carve_mp; 24155 ire_t *ire1 = NULL; 24156 ire_t *save_ire = NULL; 24157 mblk_t *next_mp = NULL; 24158 boolean_t last_frag = B_FALSE; 24159 boolean_t multirt_send = B_FALSE; 24160 ire_t *first_ire = NULL; 24161 irb_t *irb = NULL; 24162 mib2_ipIfStatsEntry_t *mibptr = NULL; 24163 24164 ill = ire_to_ill(ire); 24165 mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; 24166 24167 BUMP_MIB(mibptr, ipIfStatsOutFragReqds); 24168 24169 if (max_frag == 0) { 24170 ip1dbg(("ip_wput_frag: ire frag size is 0" 24171 " - dropping packet\n")); 24172 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24173 freemsg(mp); 24174 return; 24175 } 24176 24177 /* 24178 * IPsec does not allow hw accelerated packets to be fragmented 24179 * This check is made in ip_wput_ipsec_out prior to coming here 24180 * via ip_wput_ire_fragmentit. 24181 * 24182 * If at this point we have an ire whose ARP request has not 24183 * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger 24184 * sending of ARP query and change ire's state to ND_INCOMPLETE. 24185 * This packet and all fragmentable packets for this ire will 24186 * continue to get dropped while ire_nce->nce_state remains in 24187 * ND_INCOMPLETE. 
Post-ARP resolution, after ire's nce_state changes to 24188 * ND_REACHABLE, all subsequent large packets for this ire will 24189 * get fragmented and sent out by this function. 24190 */ 24191 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 24192 /* If nce_state is ND_INITIAL, trigger ARP query */ 24193 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 24194 ip1dbg(("ip_wput_frag: mac address for ire is unresolved" 24195 " - dropping packet\n")); 24196 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24197 freemsg(mp); 24198 return; 24199 } 24200 24201 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 24202 "ip_wput_frag_start:"); 24203 24204 if (mp->b_datap->db_type == M_CTL) { 24205 first_mp = mp; 24206 mp_orig = mp = mp->b_cont; 24207 mctl_present = B_TRUE; 24208 } else { 24209 first_mp = mp; 24210 mctl_present = B_FALSE; 24211 } 24212 24213 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 24214 ipha = (ipha_t *)mp->b_rptr; 24215 24216 /* 24217 * If the Don't Fragment flag is on, generate an ICMP destination 24218 * unreachable, fragmentation needed. 24219 */ 24220 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 24221 if (offset & IPH_DF) { 24222 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24223 if (is_system_labeled()) { 24224 max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag, 24225 ire->ire_max_frag - max_frag, AF_INET); 24226 } 24227 /* 24228 * Need to compute hdr checksum if called from ip_wput_ire. 24229 * Note that ip_rput_forward verifies the checksum before 24230 * calling this routine so in that case this is a noop. 24231 */ 24232 ipha->ipha_hdr_checksum = 0; 24233 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24234 icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid, 24235 ipst); 24236 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24237 "ip_wput_frag_end:(%S)", 24238 "don't fragment"); 24239 return; 24240 } 24241 /* 24242 * Labeled systems adjust max_frag if they add a label 24243 * to send the correct path mtu. We need the real mtu since we 24244 * are fragmenting the packet after label adjustment. 24245 */ 24246 if (is_system_labeled()) 24247 max_frag = ire->ire_max_frag; 24248 if (mctl_present) 24249 freeb(first_mp); 24250 /* 24251 * Establish the starting offset. May not be zero if we are fragging 24252 * a fragment that is being forwarded. 24253 */ 24254 offset = offset & IPH_OFFSET; 24255 24256 /* TODO why is this test needed? */ 24257 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 24258 if (((max_frag - LENGTH) & ~7) < 8) { 24259 /* TODO: notify ulp somehow */ 24260 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24261 freemsg(mp); 24262 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24263 "ip_wput_frag_end:(%S)", 24264 "len < 8"); 24265 return; 24266 } 24267 24268 hdr_len = (V_HLEN & 0xF) << 2; 24269 24270 ipha->ipha_hdr_checksum = 0; 24271 24272 /* 24273 * Establish the number of bytes maximum per frag, after putting 24274 * in the header. 24275 */ 24276 len = (max_frag - hdr_len) & ~7; 24277 24278 /* Check if we can use MDT to send out the frags. */ 24279 ASSERT(!IRE_IS_LOCAL(ire)); 24280 if (hdr_len == IP_SIMPLE_HDR_LENGTH && 24281 ipst->ips_ip_multidata_outbound && 24282 !(ire->ire_flags & RTF_MULTIRT) && 24283 !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && 24284 ill != NULL && ILL_MDT_CAPABLE(ill) && 24285 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 24286 ASSERT(ill->ill_mdt_capab != NULL); 24287 if (!ill->ill_mdt_capab->ill_mdt_on) { 24288 /* 24289 * If MDT was previously turned off 24290 * and we currently can do MDT (due to IPQoS policy 24291 * removal, etc.)
then enable it for this interface. 24292 */ 24293 ill->ill_mdt_capab->ill_mdt_on = 1; 24294 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 24295 ill->ill_name)); 24296 } 24297 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 24298 offset); 24299 return; 24300 } 24301 24302 /* Get a copy of the header for the trailing frags */ 24303 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst); 24304 if (!hdr_mp) { 24305 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24306 freemsg(mp); 24307 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24308 "ip_wput_frag_end:(%S)", 24309 "couldn't copy hdr"); 24310 return; 24311 } 24312 if (DB_CRED(mp) != NULL) 24313 mblk_setcred(hdr_mp, DB_CRED(mp)); 24314 24315 /* Store the starting offset, with the MoreFrags flag. */ 24316 i1 = offset | IPH_MF | frag_flag; 24317 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 24318 24319 /* Establish the ending byte offset, based on the starting offset. */ 24320 offset <<= 3; 24321 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 24322 24323 /* Store the length of the first fragment in the IP header. */ 24324 i1 = len + hdr_len; 24325 ASSERT(i1 <= IP_MAXPACKET); 24326 ipha->ipha_length = htons((uint16_t)i1); 24327 24328 /* 24329 * Compute the IP header checksum for the first frag. We have to 24330 * watch out that we stop at the end of the header. 24331 */ 24332 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24333 24334 /* 24335 * Now carve off the first frag. Note that this will include the 24336 * original IP header. 24337 */ 24338 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 24339 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24340 freeb(hdr_mp); 24341 freemsg(mp_orig); 24342 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24343 "ip_wput_frag_end:(%S)", 24344 "couldn't carve first"); 24345 return; 24346 } 24347 24348 /* 24349 * Multirouting case. Each fragment is replicated 24350 * via all non-condemned RTF_MULTIRT routes 24351 * currently resolved. 24352 * We ensure that first_ire is the first RTF_MULTIRT 24353 * ire in the bucket. 24354 */ 24355 if (ire->ire_flags & RTF_MULTIRT) { 24356 irb = ire->ire_bucket; 24357 ASSERT(irb != NULL); 24358 24359 multirt_send = B_TRUE; 24360 24361 /* Make sure we do not omit any multiroute ire. */ 24362 IRB_REFHOLD(irb); 24363 for (first_ire = irb->irb_ire; 24364 first_ire != NULL; 24365 first_ire = first_ire->ire_next) { 24366 if ((first_ire->ire_flags & RTF_MULTIRT) && 24367 (first_ire->ire_addr == ire->ire_addr) && 24368 !(first_ire->ire_marks & 24369 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 24370 break; 24371 } 24372 } 24373 24374 if (first_ire != NULL) { 24375 if (first_ire != ire) { 24376 IRE_REFHOLD(first_ire); 24377 /* 24378 * Do not release the ire passed in 24379 * as the argument. 24380 */ 24381 ire = first_ire; 24382 } else { 24383 first_ire = NULL; 24384 } 24385 } 24386 IRB_REFRELE(irb); 24387 24388 /* 24389 * Save the first ire; we will need to restore it 24390 * for the trailing frags. 24391 * We REFHOLD save_ire, as each iterated ire will be 24392 * REFRELEd. 24393 */ 24394 save_ire = ire; 24395 IRE_REFHOLD(save_ire); 24396 } 24397 24398 /* 24399 * First fragment emission loop. 24400 * In most cases, the emission loop below is entered only 24401 * once. Only in the case where the ire holds the RTF_MULTIRT 24402 * flag, do we loop to process all RTF_MULTIRT ires in the 24403 * bucket, and send the fragment through all crossed 24404 * RTF_MULTIRT routes. 
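/*
 * Editor's worked example (illustrative) for the per-fragment payload
 * computation len = (max_frag - hdr_len) & ~7 used above: with
 * max_frag = 1500 and a 20-byte header, len = 1480; a 4000-byte
 * datagram (3980 data bytes) thus becomes fragments of 1480, 1480 and
 * 1020 data bytes. With 4 bytes of options (hdr_len = 24),
 * len = 1476 & ~7 = 1472, keeping every non-final fragment's data
 * length the multiple of 8 that the offset encoding requires.
 */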
24405 */ 24406 do { 24407 if (ire->ire_flags & RTF_MULTIRT) { 24408 /* 24409 * We are in a multiple send case, need to get 24410 * the next ire and make a copy of the packet. 24411 * ire1 holds here the next ire to process in the 24412 * bucket. If multirouting is expected, 24413 * any non-RTF_MULTIRT ire that has the 24414 * right destination address is ignored. 24415 * 24416 * We have to take into account the MTU of 24417 * each walked ire. max_frag is set by 24418 * the caller and generally refers to 24419 * the primary ire entry. Here we ensure that 24420 * no route with a lower MTU will be used, as 24421 * fragments are carved once for all ires, 24422 * then replicated. 24423 */ 24424 ASSERT(irb != NULL); 24425 IRB_REFHOLD(irb); 24426 for (ire1 = ire->ire_next; 24427 ire1 != NULL; 24428 ire1 = ire1->ire_next) { 24429 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24430 continue; 24431 if (ire1->ire_addr != ire->ire_addr) 24432 continue; 24433 if (ire1->ire_marks & 24434 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24435 continue; 24436 /* 24437 * Ensure we do not exceed the MTU 24438 * of the next route. 24439 */ 24440 if (ire1->ire_max_frag < max_frag) { 24441 ip_multirt_bad_mtu(ire1, max_frag); 24442 continue; 24443 } 24444 24445 /* Got one. */ 24446 IRE_REFHOLD(ire1); 24447 break; 24448 } 24449 IRB_REFRELE(irb); 24450 24451 if (ire1 != NULL) { 24452 next_mp = copyb(mp); 24453 if ((next_mp == NULL) || 24454 ((mp->b_cont != NULL) && 24455 ((next_mp->b_cont = 24456 dupmsg(mp->b_cont)) == NULL))) { 24457 freemsg(next_mp); 24458 next_mp = NULL; 24459 ire_refrele(ire1); 24460 ire1 = NULL; 24461 } 24462 } 24463 24464 /* Last multiroute ire; don't loop anymore. */ 24465 if (ire1 == NULL) { 24466 multirt_send = B_FALSE; 24467 } 24468 } 24469 24470 ll_hdr_len = 0; 24471 LOCK_IRE_FP_MP(ire); 24472 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24473 if (ll_hdr_mp != NULL) { 24474 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24475 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 24476 } else { 24477 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24478 } 24479 24480 /* If there is a transmit header, get a copy for this frag. */ 24481 /* 24482 * TODO: should check db_ref before calling ip_carve_mp since 24483 * it might give us a dup. 24484 */ 24485 if (!ll_hdr_mp) { 24486 /* No xmit header. */ 24487 xmit_mp = mp; 24488 24489 /* We have a link-layer header that can fit in our mblk. */ 24490 } else if (mp->b_datap->db_ref == 1 && 24491 ll_hdr_len != 0 && 24492 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24493 /* M_DATA fastpath */ 24494 mp->b_rptr -= ll_hdr_len; 24495 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 24496 xmit_mp = mp; 24497 24498 /* Corner case if copyb has failed */ 24499 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 24500 UNLOCK_IRE_FP_MP(ire); 24501 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24502 freeb(hdr_mp); 24503 freemsg(mp); 24504 freemsg(mp_orig); 24505 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24506 "ip_wput_frag_end:(%S)", 24507 "discard"); 24508 24509 if (multirt_send) { 24510 ASSERT(ire1); 24511 ASSERT(next_mp); 24512 24513 freemsg(next_mp); 24514 ire_refrele(ire1); 24515 } 24516 if (save_ire != NULL) 24517 IRE_REFRELE(save_ire); 24518 24519 if (first_ire != NULL) 24520 ire_refrele(first_ire); 24521 return; 24522 24523 /* 24524 * Case of res_mp OR the fastpath mp can't fit 24525 * in the mblk 24526 */ 24527 } else { 24528 xmit_mp->b_cont = mp; 24529 if (DB_CRED(mp) != NULL) 24530 mblk_setcred(xmit_mp, DB_CRED(mp)); 24531 /* 24532 * Get priority marking, if any.
24533 * We propagate the CoS marking from the 24534 * original packet that went to QoS processing 24535 * in ip_wput_ire to the newly carved mp. 24536 */ 24537 if (DB_TYPE(xmit_mp) == M_DATA) 24538 xmit_mp->b_band = mp->b_band; 24539 } 24540 UNLOCK_IRE_FP_MP(ire); 24541 24542 q = ire->ire_stq; 24543 out_ill = (ill_t *)q->q_ptr; 24544 24545 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24546 24547 DTRACE_PROBE4(ip4__physical__out__start, 24548 ill_t *, NULL, ill_t *, out_ill, 24549 ipha_t *, ipha, mblk_t *, xmit_mp); 24550 24551 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24552 ipst->ips_ipv4firewall_physical_out, 24553 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24554 24555 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp); 24556 24557 if (xmit_mp != NULL) { 24558 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL, 24559 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 24560 ipha_t *, ipha, ip6_t *, NULL, int, 0); 24561 24562 putnext(q, xmit_mp); 24563 24564 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 24565 UPDATE_MIB(out_ill->ill_ip_mib, 24566 ipIfStatsHCOutOctets, i1); 24567 24568 if (pkt_type != OB_PKT) { 24569 /* 24570 * Update the packet count and MIB stats 24571 * of trailing RTF_MULTIRT ires. 24572 */ 24573 UPDATE_OB_PKT_COUNT(ire); 24574 BUMP_MIB(out_ill->ill_ip_mib, 24575 ipIfStatsOutFragReqds); 24576 } 24577 } 24578 24579 if (multirt_send) { 24580 /* 24581 * We are in a multiple send case; look for 24582 * the next ire and re-enter the loop. 24583 */ 24584 ASSERT(ire1); 24585 ASSERT(next_mp); 24586 /* REFRELE the current ire before looping */ 24587 ire_refrele(ire); 24588 ire = ire1; 24589 ire1 = NULL; 24590 mp = next_mp; 24591 next_mp = NULL; 24592 } 24593 } while (multirt_send); 24594 24595 ASSERT(ire1 == NULL); 24596 24597 /* Restore the original ire; we need it for the trailing frags */ 24598 if (save_ire != NULL) { 24599 /* REFRELE the last iterated ire */ 24600 ire_refrele(ire); 24601 /* save_ire has been REFHOLDed */ 24602 ire = save_ire; 24603 save_ire = NULL; 24604 q = ire->ire_stq; 24605 } 24606 24607 if (pkt_type == OB_PKT) { 24608 UPDATE_OB_PKT_COUNT(ire); 24609 } else { 24610 out_ill = (ill_t *)q->q_ptr; 24611 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 24612 UPDATE_IB_PKT_COUNT(ire); 24613 } 24614 24615 /* Advance the offset to the second frag starting point. */ 24616 offset += len; 24617 /* 24618 * Update hdr_len from the copied header - there might be less options 24619 * in the later fragments. 24620 */ 24621 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 24622 /* Loop until done. */ 24623 for (;;) { 24624 uint16_t offset_and_flags; 24625 uint16_t ip_len; 24626 24627 if (ip_data_end - offset > len) { 24628 /* 24629 * Carve off the appropriate amount from the original 24630 * datagram. 24631 */ 24632 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24633 mp = NULL; 24634 break; 24635 } 24636 /* 24637 * More frags after this one. Get another copy 24638 * of the header. 24639 */ 24640 if (carve_mp->b_datap->db_ref == 1 && 24641 hdr_mp->b_wptr - hdr_mp->b_rptr < 24642 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24643 /* Inline IP header */ 24644 carve_mp->b_rptr -= hdr_mp->b_wptr - 24645 hdr_mp->b_rptr; 24646 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24647 hdr_mp->b_wptr - hdr_mp->b_rptr); 24648 mp = carve_mp; 24649 } else { 24650 if (!(mp = copyb(hdr_mp))) { 24651 freemsg(carve_mp); 24652 break; 24653 } 24654 /* Get priority marking, if any. 
*/ 24655 mp->b_band = carve_mp->b_band; 24656 mp->b_cont = carve_mp; 24657 } 24658 ipha = (ipha_t *)mp->b_rptr; 24659 offset_and_flags = IPH_MF; 24660 } else { 24661 /* 24662 * Last frag. Consume the header. Set len to 24663 * the length of this last piece. 24664 */ 24665 len = ip_data_end - offset; 24666 24667 /* 24668 * Carve off the appropriate amount from the original 24669 * datagram. 24670 */ 24671 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24672 mp = NULL; 24673 break; 24674 } 24675 if (carve_mp->b_datap->db_ref == 1 && 24676 hdr_mp->b_wptr - hdr_mp->b_rptr < 24677 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24678 /* Inline IP header */ 24679 carve_mp->b_rptr -= hdr_mp->b_wptr - 24680 hdr_mp->b_rptr; 24681 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24682 hdr_mp->b_wptr - hdr_mp->b_rptr); 24683 mp = carve_mp; 24684 freeb(hdr_mp); 24685 hdr_mp = mp; 24686 } else { 24687 mp = hdr_mp; 24688 /* Get priority marking, if any. */ 24689 mp->b_band = carve_mp->b_band; 24690 mp->b_cont = carve_mp; 24691 } 24692 ipha = (ipha_t *)mp->b_rptr; 24693 /* A frag of a frag might have IPH_MF non-zero */ 24694 offset_and_flags = 24695 ntohs(ipha->ipha_fragment_offset_and_flags) & 24696 IPH_MF; 24697 } 24698 offset_and_flags |= (uint16_t)(offset >> 3); 24699 offset_and_flags |= (uint16_t)frag_flag; 24700 /* Store the offset and flags in the IP header. */ 24701 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24702 24703 /* Store the length in the IP header. */ 24704 ip_len = (uint16_t)(len + hdr_len); 24705 ipha->ipha_length = htons(ip_len); 24706 24707 /* 24708 * Set the IP header checksum. Note that mp is just 24709 * the header, so this is easy to pass to ip_csum. 24710 */ 24711 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24712 24713 /* Attach a transmit header, if any, and ship it. */ 24714 if (pkt_type == OB_PKT) { 24715 UPDATE_OB_PKT_COUNT(ire); 24716 } else { 24717 out_ill = (ill_t *)q->q_ptr; 24718 BUMP_MIB(out_ill->ill_ip_mib, 24719 ipIfStatsHCOutForwDatagrams); 24720 UPDATE_IB_PKT_COUNT(ire); 24721 } 24722 24723 if (ire->ire_flags & RTF_MULTIRT) { 24724 irb = ire->ire_bucket; 24725 ASSERT(irb != NULL); 24726 24727 multirt_send = B_TRUE; 24728 24729 /* 24730 * Save the original ire; we will need to restore it 24731 * for the trailing frags. 24732 */ 24733 save_ire = ire; 24734 IRE_REFHOLD(save_ire); 24735 } 24736 /* 24737 * Emission loop for this fragment, similar 24738 * to what is done for the first fragment. 24739 */ 24740 do { 24741 if (multirt_send) { 24742 /* 24743 * We are in a multiple send case, need to get 24744 * the next ire and make a copy of the packet. 24745 */ 24746 ASSERT(irb != NULL); 24747 IRB_REFHOLD(irb); 24748 for (ire1 = ire->ire_next; 24749 ire1 != NULL; 24750 ire1 = ire1->ire_next) { 24751 if (!(ire1->ire_flags & RTF_MULTIRT)) 24752 continue; 24753 if (ire1->ire_addr != ire->ire_addr) 24754 continue; 24755 if (ire1->ire_marks & 24756 (IRE_MARK_CONDEMNED| 24757 IRE_MARK_HIDDEN)) { 24758 continue; 24759 } 24760 /* 24761 * Ensure we do not exceed the MTU 24762 * of the next route. 24763 */ 24764 if (ire1->ire_max_frag < max_frag) { 24765 ip_multirt_bad_mtu(ire1, 24766 max_frag); 24767 continue; 24768 } 24769 24770 /* Got one.
*/ 24771 IRE_REFHOLD(ire1); 24772 break; 24773 } 24774 IRB_REFRELE(irb); 24775 24776 if (ire1 != NULL) { 24777 next_mp = copyb(mp); 24778 if ((next_mp == NULL) || 24779 ((mp->b_cont != NULL) && 24780 ((next_mp->b_cont = 24781 dupmsg(mp->b_cont)) == NULL))) { 24782 freemsg(next_mp); 24783 next_mp = NULL; 24784 ire_refrele(ire1); 24785 ire1 = NULL; 24786 } 24787 } 24788 24789 /* Last multiroute ire; don't loop anymore. */ 24790 if (ire1 == NULL) { 24791 multirt_send = B_FALSE; 24792 } 24793 } 24794 24795 /* Update transmit header */ 24796 ll_hdr_len = 0; 24797 LOCK_IRE_FP_MP(ire); 24798 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24799 if (ll_hdr_mp != NULL) { 24800 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24801 ll_hdr_len = MBLKL(ll_hdr_mp); 24802 } else { 24803 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24804 } 24805 24806 if (!ll_hdr_mp) { 24807 xmit_mp = mp; 24808 24809 /* 24810 * We have link-layer header that can fit in 24811 * our mblk. 24812 */ 24813 } else if (mp->b_datap->db_ref == 1 && 24814 ll_hdr_len != 0 && 24815 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24816 /* M_DATA fastpath */ 24817 mp->b_rptr -= ll_hdr_len; 24818 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 24819 ll_hdr_len); 24820 xmit_mp = mp; 24821 24822 /* 24823 * Case of res_mp OR the fastpath mp can't fit 24824 * in the mblk 24825 */ 24826 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 24827 xmit_mp->b_cont = mp; 24828 if (DB_CRED(mp) != NULL) 24829 mblk_setcred(xmit_mp, DB_CRED(mp)); 24830 /* Get priority marking, if any. */ 24831 if (DB_TYPE(xmit_mp) == M_DATA) 24832 xmit_mp->b_band = mp->b_band; 24833 24834 /* Corner case if copyb failed */ 24835 } else { 24836 /* 24837 * Exit both the replication and 24838 * fragmentation loops. 24839 */ 24840 UNLOCK_IRE_FP_MP(ire); 24841 goto drop_pkt; 24842 } 24843 UNLOCK_IRE_FP_MP(ire); 24844 24845 mp1 = mp; 24846 out_ill = (ill_t *)q->q_ptr; 24847 24848 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24849 24850 DTRACE_PROBE4(ip4__physical__out__start, 24851 ill_t *, NULL, ill_t *, out_ill, 24852 ipha_t *, ipha, mblk_t *, xmit_mp); 24853 24854 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24855 ipst->ips_ipv4firewall_physical_out, 24856 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24857 24858 DTRACE_PROBE1(ip4__physical__out__end, 24859 mblk_t *, xmit_mp); 24860 24861 if (mp != mp1 && hdr_mp == mp1) 24862 hdr_mp = mp; 24863 if (mp != mp1 && mp_orig == mp1) 24864 mp_orig = mp; 24865 24866 if (xmit_mp != NULL) { 24867 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, 24868 NULL, void_ip_t *, ipha, 24869 __dtrace_ipsr_ill_t *, out_ill, ipha_t *, 24870 ipha, ip6_t *, NULL, int, 0); 24871 24872 putnext(q, xmit_mp); 24873 24874 BUMP_MIB(out_ill->ill_ip_mib, 24875 ipIfStatsHCOutTransmits); 24876 UPDATE_MIB(out_ill->ill_ip_mib, 24877 ipIfStatsHCOutOctets, ip_len); 24878 24879 if (pkt_type != OB_PKT) { 24880 /* 24881 * Update the packet count of trailing 24882 * RTF_MULTIRT ires. 24883 */ 24884 UPDATE_OB_PKT_COUNT(ire); 24885 } 24886 } 24887 24888 /* All done if we just consumed the hdr_mp. */ 24889 if (mp == hdr_mp) { 24890 last_frag = B_TRUE; 24891 BUMP_MIB(out_ill->ill_ip_mib, 24892 ipIfStatsOutFragOKs); 24893 } 24894 24895 if (multirt_send) { 24896 /* 24897 * We are in a multiple send case; look for 24898 * the next ire and re-enter the loop. 
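/*
 * Editor's sketch (hypothetical helper, not part of the original file)
 * of the prepend-or-copy pattern used by both emission loops above,
 * assuming an M_DATA fastpath header: reuse the data mblk's headroom
 * when it is exclusively owned and large enough, otherwise chain a
 * copy of the header mblk in front.
 */
static mblk_t *
ll_prepend(mblk_t *mp, mblk_t *ll_hdr_mp)
{
	ssize_t hlen = MBLKL(ll_hdr_mp);
	mblk_t *xmit_mp;

	if (mp->b_datap->db_ref == 1 && hlen != 0 &&
	    hlen <= mp->b_rptr - mp->b_datap->db_base) {
		/* Fastpath: copy the header into the existing headroom. */
		mp->b_rptr -= hlen;
		bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, hlen);
		return (mp);
	}
	/* Slow path: separate header block chained onto the data. */
	if ((xmit_mp = copyb(ll_hdr_mp)) == NULL)
		return (NULL);		/* caller frees mp and bails out */
	xmit_mp->b_cont = mp;
	xmit_mp->b_band = mp->b_band;	/* preserve any CoS marking */
	return (xmit_mp);
}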
24899 */ 24900 ASSERT(ire1); 24901 ASSERT(next_mp); 24902 /* REFRELE the current ire before looping */ 24903 ire_refrele(ire); 24904 ire = ire1; 24905 ire1 = NULL; 24906 q = ire->ire_stq; 24907 mp = next_mp; 24908 next_mp = NULL; 24909 } 24910 } while (multirt_send); 24911 /* 24912 * Restore the original ire; we need it for the 24913 * trailing frags 24914 */ 24915 if (save_ire != NULL) { 24916 ASSERT(ire1 == NULL); 24917 /* REFRELE the last iterated ire */ 24918 ire_refrele(ire); 24919 /* save_ire has been REFHOLDed */ 24920 ire = save_ire; 24921 q = ire->ire_stq; 24922 save_ire = NULL; 24923 } 24924 24925 if (last_frag) { 24926 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24927 "ip_wput_frag_end:(%S)", 24928 "consumed hdr_mp"); 24929 24930 if (first_ire != NULL) 24931 ire_refrele(first_ire); 24932 return; 24933 } 24934 /* Otherwise, advance and loop. */ 24935 offset += len; 24936 } 24937 24938 drop_pkt: 24939 /* Clean up following allocation failure. */ 24940 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24941 freemsg(mp); 24942 if (mp != hdr_mp) 24943 freeb(hdr_mp); 24944 if (mp != mp_orig) 24945 freemsg(mp_orig); 24946 24947 if (save_ire != NULL) 24948 IRE_REFRELE(save_ire); 24949 if (first_ire != NULL) 24950 ire_refrele(first_ire); 24951 24952 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24953 "ip_wput_frag_end:(%S)", 24954 "end--alloc failure"); 24955 } 24956 24957 /* 24958 * Copy the header plus those options which have the copy bit set 24959 */ 24960 static mblk_t * 24961 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst) 24962 { 24963 mblk_t *mp; 24964 uchar_t *up; 24965 24966 /* 24967 * Quick check if we need to look for options without the copy bit 24968 * set 24969 */ 24970 mp = allocb(ipst->ips_ip_wroff_extra + hdr_len, BPRI_HI); 24971 if (!mp) 24972 return (mp); 24973 mp->b_rptr += ipst->ips_ip_wroff_extra; 24974 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 24975 bcopy(rptr, mp->b_rptr, hdr_len); 24976 mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra; 24977 return (mp); 24978 } 24979 up = mp->b_rptr; 24980 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 24981 up += IP_SIMPLE_HDR_LENGTH; 24982 rptr += IP_SIMPLE_HDR_LENGTH; 24983 hdr_len -= IP_SIMPLE_HDR_LENGTH; 24984 while (hdr_len > 0) { 24985 uint32_t optval; 24986 uint32_t optlen; 24987 24988 optval = *rptr; 24989 if (optval == IPOPT_EOL) 24990 break; 24991 if (optval == IPOPT_NOP) 24992 optlen = 1; 24993 else 24994 optlen = rptr[1]; 24995 if (optval & IPOPT_COPY) { 24996 bcopy(rptr, up, optlen); 24997 up += optlen; 24998 } 24999 rptr += optlen; 25000 hdr_len -= optlen; 25001 } 25002 /* 25003 * Make sure that we drop an even number of words by filling 25004 * with EOL to the next word boundary. 25005 */ 25006 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH); 25007 hdr_len & 0x3; hdr_len++) 25008 *up++ = IPOPT_EOL; 25009 mp->b_wptr = up; 25010 /* Update header length */ 25011 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2)); 25012 return (mp); 25013 } 25014 25015 /* 25016 * Delivery to local recipients including fanout to multiple recipients. 25017 * Does not do checksumming of UDP/TCP. 25018 * Note: q should be the read side queue for either the ill or conn. 25019 * Note: rq should be the read side q for the lower (ill) stream. 25020 * We don't send packets to IPPF processing, thus the last argument 25021 * to all the fanout calls is B_FALSE.
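/*
 * Editor's worked example for ip_wput_frag_copyhdr() above
 * (illustrative). The copy bit is the high-order bit of the option
 * type, so (optval & IPOPT_COPY) selects the options that must be
 * replicated into every fragment. A header carrying LSRR (type 0x83,
 * copy bit set) and RR (type 0x07, copy bit clear) keeps only LSRR in
 * the trailing-fragment header; the tail is padded with IPOPT_EOL to
 * the next 4-byte boundary and the IHL nibble is rewritten to match
 * the shortened header.
 */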
25022 */ 25023 void 25024 ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, 25025 int fanout_flags, zoneid_t zoneid) 25026 { 25027 uint32_t protocol; 25028 mblk_t *first_mp; 25029 boolean_t mctl_present; 25030 int ire_type; 25031 #define rptr ((uchar_t *)ipha) 25032 ip_stack_t *ipst = ill->ill_ipst; 25033 25034 TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, 25035 "ip_wput_local_start: q %p", q); 25036 25037 if (ire != NULL) { 25038 ire_type = ire->ire_type; 25039 } else { 25040 /* 25041 * Only ip_multicast_loopback() calls us with a NULL ire. If the 25042 * packet is not multicast, we can't tell the ire type. 25043 */ 25044 ASSERT(CLASSD(ipha->ipha_dst)); 25045 ire_type = IRE_BROADCAST; 25046 } 25047 25048 first_mp = mp; 25049 if (first_mp->b_datap->db_type == M_CTL) { 25050 ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; 25051 if (!io->ipsec_out_secure) { 25052 /* 25053 * This ipsec_out_t was allocated in ip_wput 25054 * for multicast packets to store the ill_index. 25055 * As this is being delivered locally, we don't 25056 * need this anymore. 25057 */ 25058 mp = first_mp->b_cont; 25059 freeb(first_mp); 25060 first_mp = mp; 25061 mctl_present = B_FALSE; 25062 } else { 25063 /* 25064 * Convert IPSEC_OUT to IPSEC_IN, preserving all 25065 * security properties for the looped-back packet. 25066 */ 25067 mctl_present = B_TRUE; 25068 mp = first_mp->b_cont; 25069 ASSERT(mp != NULL); 25070 ipsec_out_to_in(first_mp); 25071 } 25072 } else { 25073 mctl_present = B_FALSE; 25074 } 25075 25076 DTRACE_PROBE4(ip4__loopback__in__start, 25077 ill_t *, ill, ill_t *, NULL, 25078 ipha_t *, ipha, mblk_t *, first_mp); 25079 25080 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 25081 ipst->ips_ipv4firewall_loopback_in, 25082 ill, NULL, ipha, first_mp, mp, 0, ipst); 25083 25084 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp); 25085 25086 if (first_mp == NULL) 25087 return; 25088 25089 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, 25090 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 25091 int, 1); 25092 25093 ipst->ips_loopback_packets++; 25094 25095 ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", 25096 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); 25097 if (!IS_SIMPLE_IPH(ipha)) { 25098 ip_wput_local_options(ipha, ipst); 25099 } 25100 25101 protocol = ipha->ipha_protocol; 25102 switch (protocol) { 25103 case IPPROTO_ICMP: { 25104 ire_t *ire_zone; 25105 ilm_t *ilm; 25106 mblk_t *mp1; 25107 zoneid_t last_zoneid; 25108 25109 if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { 25110 ASSERT(ire_type == IRE_BROADCAST); 25111 /* 25112 * In the multicast case, applications may have joined 25113 * the group from different zones, so we need to deliver 25114 * the packet to each of them. Loop through the 25115 * multicast memberships structures (ilm) on the receive 25116 * ill and send a copy of the packet up each matching 25117 * one. However, we don't do this for multicasts sent on 25118 * the loopback interface (PHYI_LOOPBACK flag set) as 25119 * they must stay in the sender's zone. 25120 * 25121 * ilm_add_v6() ensures that ilms in the same zone are 25122 * contiguous in the ill_ilm list. We use this property 25123 * to avoid sending duplicates when two 25124 * applications in the same zone join the same group on 25125 * different logical interfaces: we ignore the ilm if 25126 * its zoneid is the same as the last matching one.
25127 * In addition, the sending of the packet for 25128 * ire_zoneid is delayed until all of the other ilms 25129 * have been exhausted. 25130 */ 25131 last_zoneid = -1; 25132 ILM_WALKER_HOLD(ill); 25133 for (ilm = ill->ill_ilm; ilm != NULL; 25134 ilm = ilm->ilm_next) { 25135 if ((ilm->ilm_flags & ILM_DELETED) || 25136 ipha->ipha_dst != ilm->ilm_addr || 25137 ilm->ilm_zoneid == last_zoneid || 25138 ilm->ilm_zoneid == zoneid || 25139 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 25140 continue; 25141 mp1 = ip_copymsg(first_mp); 25142 if (mp1 == NULL) 25143 continue; 25144 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25145 mctl_present, B_FALSE, ill, 25146 ilm->ilm_zoneid); 25147 last_zoneid = ilm->ilm_zoneid; 25148 } 25149 ILM_WALKER_RELE(ill); 25150 /* 25151 * Loopback case: the sending endpoint has 25152 * IP_MULTICAST_LOOP disabled, therefore we don't 25153 * dispatch the multicast packet to the sending zone. 25154 */ 25155 if (fanout_flags & IP_FF_NO_MCAST_LOOP) { 25156 freemsg(first_mp); 25157 return; 25158 } 25159 } else if (ire_type == IRE_BROADCAST) { 25160 /* 25161 * In the broadcast case, there may be many zones 25162 * which need a copy of the packet delivered to them. 25163 * There is one IRE_BROADCAST per broadcast address 25164 * and per zone; we walk those using a helper function. 25165 * In addition, the sending of the packet for zoneid is 25166 * delayed until all of the other ires have been 25167 * processed. 25168 */ 25169 IRB_REFHOLD(ire->ire_bucket); 25170 ire_zone = NULL; 25171 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 25172 ire)) != NULL) { 25173 mp1 = ip_copymsg(first_mp); 25174 if (mp1 == NULL) 25175 continue; 25176 25177 UPDATE_IB_PKT_COUNT(ire_zone); 25178 ire_zone->ire_last_used_time = lbolt; 25179 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25180 mctl_present, B_FALSE, ill, 25181 ire_zone->ire_zoneid); 25182 } 25183 IRB_REFRELE(ire->ire_bucket); 25184 } 25185 icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, 25186 0, mctl_present, B_FALSE, ill, zoneid); 25187 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25188 "ip_wput_local_end: q %p (%S)", 25189 q, "icmp"); 25190 return; 25191 } 25192 case IPPROTO_IGMP: 25193 if ((mp = igmp_input(q, mp, ill)) == NULL) { 25194 /* Bad packet - discarded by igmp_input */ 25195 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25196 "ip_wput_local_end: q %p (%S)", 25197 q, "igmp_input--bad packet"); 25198 if (mctl_present) 25199 freeb(first_mp); 25200 return; 25201 } 25202 /* 25203 * igmp_input() may have returned the pulled up message. 25204 * So first_mp and ipha need to be reinitialized. 25205 */ 25206 ipha = (ipha_t *)mp->b_rptr; 25207 if (mctl_present) 25208 first_mp->b_cont = mp; 25209 else 25210 first_mp = mp; 25211 /* deliver to local raw users */ 25212 break; 25213 case IPPROTO_ENCAP: 25214 /* 25215 * This case is covered by either ip_fanout_proto, or by 25216 * the above security processing for self-tunneled packets. 25217 */ 25218 break; 25219 case IPPROTO_UDP: { 25220 uint16_t *up; 25221 uint32_t ports; 25222 25223 up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + 25224 UDP_PORTS_OFFSET); 25225 /* Force a 'valid' checksum. 
*/ 25226 up[3] = 0; 25227 25228 ports = *(uint32_t *)up; 25229 ip_fanout_udp(q, first_mp, ill, ipha, ports, 25230 (ire_type == IRE_BROADCAST), 25231 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25232 IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE, 25233 ill, zoneid); 25234 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25235 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); 25236 return; 25237 } 25238 case IPPROTO_TCP: { 25239 25240 /* 25241 * For TCP, discard broadcast packets. 25242 */ 25243 if ((ushort_t)ire_type == IRE_BROADCAST) { 25244 freemsg(first_mp); 25245 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 25246 ip2dbg(("ip_wput_local: discard broadcast\n")); 25247 return; 25248 } 25249 25250 if (mp->b_datap->db_type == M_DATA) { 25251 /* 25252 * M_DATA mblk, so init mblk (chain) for no struio(). 25253 */ 25254 mblk_t *mp1 = mp; 25255 25256 do { 25257 mp1->b_datap->db_struioflag = 0; 25258 } while ((mp1 = mp1->b_cont) != NULL); 25259 } 25260 ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) 25261 <= mp->b_wptr); 25262 ip_fanout_tcp(q, first_mp, ill, ipha, 25263 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25264 IP_FF_SYN_ADDIRE | IP_FF_IPINFO, 25265 mctl_present, B_FALSE, zoneid); 25266 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25267 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); 25268 return; 25269 } 25270 case IPPROTO_SCTP: 25271 { 25272 uint32_t ports; 25273 25274 bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); 25275 ip_fanout_sctp(first_mp, ill, ipha, ports, 25276 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25277 IP_FF_IPINFO, mctl_present, B_FALSE, zoneid); 25278 return; 25279 } 25280 25281 default: 25282 break; 25283 } 25284 /* 25285 * Find a client for some other protocol. We give 25286 * copies to multiple clients, if more than one is 25287 * bound. 25288 */ 25289 ip_fanout_proto(q, first_mp, ill, ipha, 25290 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, 25291 mctl_present, B_FALSE, ill, zoneid); 25292 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25293 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); 25294 #undef rptr 25295 } 25296 25297 /* 25298 * Update any source route, record route, or timestamp options. 25299 * Check that we are at end of strict source route. 25300 * The options have been sanity checked by ip_wput_options(). 
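/*
 * Editor's sketch (not part of the original file) of the option
 * iterator that ip_wput_local_options() below is built on:
 * ipoptp_first() primes the cursor from the header and ipoptp_next()
 * yields one option type per step until IPOPT_EOL.
 */
static void
walk_options(ipha_t *ipha)
{
	ipoptp_t opts;
	uint8_t optval;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		if (opts.ipoptp_flags & IPOPTP_ERROR)
			break;		/* malformed option */
		/*
		 * opts.ipoptp_cur points at the current option and
		 * opts.ipoptp_len gives its length in bytes.
		 */
	}
}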
25301 */ 25302 static void 25303 ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) 25304 { 25305 ipoptp_t opts; 25306 uchar_t *opt; 25307 uint8_t optval; 25308 uint8_t optlen; 25309 ipaddr_t dst; 25310 uint32_t ts; 25311 ire_t *ire; 25312 timestruc_t now; 25313 25314 ip2dbg(("ip_wput_local_options\n")); 25315 for (optval = ipoptp_first(&opts, ipha); 25316 optval != IPOPT_EOL; 25317 optval = ipoptp_next(&opts)) { 25318 opt = opts.ipoptp_cur; 25319 optlen = opts.ipoptp_len; 25320 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 25321 switch (optval) { 25322 uint32_t off; 25323 case IPOPT_SSRR: 25324 case IPOPT_LSRR: 25325 off = opt[IPOPT_OFFSET]; 25326 off--; 25327 if (optlen < IP_ADDR_LEN || 25328 off > optlen - IP_ADDR_LEN) { 25329 /* End of source route */ 25330 break; 25331 } 25332 /* 25333 * This will only happen if two consecutive entries 25334 * in the source route contain our address or if 25335 * it is a packet with a loose source route which 25336 * reaches us before consuming the whole source route. 25337 */ 25338 ip1dbg(("ip_wput_local_options: not end of SR\n")); 25339 if (optval == IPOPT_SSRR) { 25340 return; 25341 } 25342 /* 25343 * Hack: instead of dropping the packet truncate the 25344 * source route to what has been used by filling the 25345 * rest with IPOPT_NOP. 25346 */ 25347 opt[IPOPT_OLEN] = (uint8_t)off; 25348 while (off < optlen) { 25349 opt[off++] = IPOPT_NOP; 25350 } 25351 break; 25352 case IPOPT_RR: 25353 off = opt[IPOPT_OFFSET]; 25354 off--; 25355 if (optlen < IP_ADDR_LEN || 25356 off > optlen - IP_ADDR_LEN) { 25357 /* No more room - ignore */ 25358 ip1dbg(( 25359 "ip_wput_local_options: end of RR\n")); 25360 break; 25361 } 25362 dst = htonl(INADDR_LOOPBACK); 25363 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25364 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25365 break; 25366 case IPOPT_TS: 25367 /* Insert timestamp if there is room */ 25368 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25369 case IPOPT_TS_TSONLY: 25370 off = IPOPT_TS_TIMELEN; 25371 break; 25372 case IPOPT_TS_PRESPEC: 25373 case IPOPT_TS_PRESPEC_RFC791: 25374 /* Verify that the address matched */ 25375 off = opt[IPOPT_OFFSET] - 1; 25376 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 25377 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 25378 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 25379 ipst); 25380 if (ire == NULL) { 25381 /* Not for us */ 25382 break; 25383 } 25384 ire_refrele(ire); 25385 /* FALLTHRU */ 25386 case IPOPT_TS_TSANDADDR: 25387 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 25388 break; 25389 default: 25390 /* 25391 * ip_*put_options should have already 25392 * dropped this packet.
25393 */ 25394 cmn_err(CE_PANIC, "ip_wput_local_options: " 25395 "unknown IT - bug in ip_wput_options?\n"); 25396 return; /* Keep "lint" happy */ 25397 } 25398 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 25399 /* Increase overflow counter */ 25400 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 25401 opt[IPOPT_POS_OV_FLG] = (uint8_t) 25402 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 25403 (off << 4); 25404 break; 25405 } 25406 off = opt[IPOPT_OFFSET] - 1; 25407 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25408 case IPOPT_TS_PRESPEC: 25409 case IPOPT_TS_PRESPEC_RFC791: 25410 case IPOPT_TS_TSANDADDR: 25411 dst = htonl(INADDR_LOOPBACK); 25412 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25413 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25414 /* FALLTHRU */ 25415 case IPOPT_TS_TSONLY: 25416 off = opt[IPOPT_OFFSET] - 1; 25417 /* Compute # of milliseconds since midnight */ 25418 gethrestime(&now); 25419 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 25420 now.tv_nsec / (NANOSEC / MILLISEC); 25421 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 25422 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 25423 break; 25424 } 25425 break; 25426 } 25427 } 25428 } 25429 25430 /* 25431 * Send out a multicast packet on interface ipif. 25432 * The sender does not have a conn. 25433 * Caller verifies that this isn't a PHYI_LOOPBACK. 25434 */ 25435 void 25436 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) 25437 { 25438 ipha_t *ipha; 25439 ire_t *ire; 25440 ipaddr_t dst; 25441 mblk_t *first_mp; 25442 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 25443 25444 /* igmp_sendpkt always allocates an ipsec_out_t */ 25445 ASSERT(mp->b_datap->db_type == M_CTL); 25446 ASSERT(!ipif->ipif_isv6); 25447 ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); 25448 25449 first_mp = mp; 25450 mp = first_mp->b_cont; 25451 ASSERT(mp->b_datap->db_type == M_DATA); 25452 ipha = (ipha_t *)mp->b_rptr; 25453 25454 /* 25455 * Find an IRE which matches the destination and the outgoing 25456 * queue (i.e. the outgoing interface.) 25457 */ 25458 if (ipif->ipif_flags & IPIF_POINTOPOINT) 25459 dst = ipif->ipif_pp_dst_addr; 25460 else 25461 dst = ipha->ipha_dst; 25462 /* 25463 * The source address has already been initialized by the 25464 * caller and hence matching on ILL (MATCH_IRE_ILL) would 25465 * be sufficient rather than MATCH_IRE_IPIF. 25466 * 25467 * This function is used for sending IGMP packets. We need 25468 * to make sure that we send the packet out of the interface 25469 * (ipif->ipif_ill) where we joined the group. This is so 25470 * that switches doing IGMP snooping will send us multicast 25471 * packets for a given group on the interface we have joined. 25472 * If we can't find an ire, igmp_sendpkt has already initialized 25473 * ipsec_out_attach_if so that this will not be load spread in 25474 * ip_newroute_ipif. 25475 */ 25476 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, 25477 MATCH_IRE_ILL, ipst); 25478 if (!ire) { 25479 /* 25480 * Mark this packet so that it will be delivered to 25481 * ip_wput_ire after the new ire has been 25482 * created. 25483 */ 25484 mp->b_prev = NULL; 25485 mp->b_next = NULL; 25486 ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, 25487 zoneid, &zero_info); 25488 return; 25489 } 25490 25491 /* 25492 * Honor the RTF_SETSRC flag; this is the only case 25493 * where we force this addr whatever the current src addr is, 25494 * because this address is set by igmp_sendpkt(), and 25495 * cannot be specified by any user.
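/*
 * Editor's note (illustrative): the IPOPT_TS arithmetic above encodes
 * the RFC 791 timestamp as milliseconds since midnight UT. For
 * example, tv_sec = 86399 and tv_nsec = 500000000 (half a second
 * before midnight) give (86399 % 86400) * 1000 + 500 = 86399500 ms,
 * which wraps back to 0 on the next second.
 */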
25496 */ 25497 if (ire->ire_flags & RTF_SETSRC) { 25498 ipha->ipha_src = ire->ire_src_addr; 25499 } 25500 25501 ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); 25502 } 25503 25504 /* 25505 * NOTE : This function does not ire_refrele the ire argument passed in. 25506 * 25507 * Copy the link layer header and do IPQoS if needed. Frees the mblk on 25508 * failure. The nce_fp_mp can vanish any time in the case of 25509 * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold 25510 * the ire_lock to access the nce_fp_mp in this case. 25511 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are 25512 * prepending a fastpath message IPQoS processing must precede it, we also set 25513 * the b_band of the fastpath message to that of the mblk returned by IPQoS 25514 * (IPQoS might have set the b_band for CoS marking). 25515 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing 25516 * must follow it so that IPQoS can mark the dl_priority field for CoS 25517 * marking, if needed. 25518 */ 25519 static mblk_t * 25520 ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, 25521 uint32_t ill_index, ipha_t **iphap) 25522 { 25523 uint_t hlen; 25524 ipha_t *ipha; 25525 mblk_t *mp1; 25526 boolean_t qos_done = B_FALSE; 25527 uchar_t *ll_hdr; 25528 ip_stack_t *ipst = ire->ire_ipst; 25529 25530 #define rptr ((uchar_t *)ipha) 25531 25532 ipha = (ipha_t *)mp->b_rptr; 25533 hlen = 0; 25534 LOCK_IRE_FP_MP(ire); 25535 if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { 25536 ASSERT(DB_TYPE(mp1) == M_DATA); 25537 /* Initiate IPPF processing */ 25538 if ((proc != 0) && IPP_ENABLED(proc, ipst)) { 25539 UNLOCK_IRE_FP_MP(ire); 25540 ip_process(proc, &mp, ill_index); 25541 if (mp == NULL) 25542 return (NULL); 25543 25544 ipha = (ipha_t *)mp->b_rptr; 25545 LOCK_IRE_FP_MP(ire); 25546 if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { 25547 qos_done = B_TRUE; 25548 goto no_fp_mp; 25549 } 25550 ASSERT(DB_TYPE(mp1) == M_DATA); 25551 } 25552 hlen = MBLKL(mp1); 25553 /* 25554 * Check if we have enough room to prepend fastpath 25555 * header 25556 */ 25557 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { 25558 ll_hdr = rptr - hlen; 25559 bcopy(mp1->b_rptr, ll_hdr, hlen); 25560 /* 25561 * Set the b_rptr to the start of the link layer 25562 * header 25563 */ 25564 mp->b_rptr = ll_hdr; 25565 mp1 = mp; 25566 } else { 25567 mp1 = copyb(mp1); 25568 if (mp1 == NULL) 25569 goto unlock_err; 25570 mp1->b_band = mp->b_band; 25571 mp1->b_cont = mp; 25572 /* 25573 * certain system generated traffic may not 25574 * have cred/label in ip header block. This 25575 * is true even for a labeled system. But for 25576 * labeled traffic, inherit the label in the 25577 * new header. 25578 */ 25579 if (DB_CRED(mp) != NULL) 25580 mblk_setcred(mp1, DB_CRED(mp)); 25581 /* 25582 * XXX disable ICK_VALID and compute checksum 25583 * here; can happen if nce_fp_mp changes and 25584 * it can't be copied now due to insufficient 25585 * space. (unlikely, fp mp can change, but it 25586 * does not increase in length) 25587 */ 25588 } 25589 UNLOCK_IRE_FP_MP(ire); 25590 } else { 25591 no_fp_mp: 25592 mp1 = copyb(ire->ire_nce->nce_res_mp); 25593 if (mp1 == NULL) { 25594 unlock_err: 25595 UNLOCK_IRE_FP_MP(ire); 25596 freemsg(mp); 25597 return (NULL); 25598 } 25599 UNLOCK_IRE_FP_MP(ire); 25600 mp1->b_cont = mp; 25601 /* 25602 * certain system generated traffic may not 25603 * have cred/label in ip header block. This 25604 * is true even for a labeled system. 
But for 25605 * labeled traffic, inherit the label in the 25606 * new header. 25607 */ 25608 if (DB_CRED(mp) != NULL) 25609 mblk_setcred(mp1, DB_CRED(mp)); 25610 if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { 25611 ip_process(proc, &mp1, ill_index); 25612 if (mp1 == NULL) 25613 return (NULL); 25614 25615 if (mp1->b_cont == NULL) 25616 ipha = NULL; 25617 else 25618 ipha = (ipha_t *)mp1->b_cont->b_rptr; 25619 } 25620 } 25621 25622 *iphap = ipha; 25623 return (mp1); 25624 #undef rptr 25625 } 25626 25627 /* 25628 * Finish the outbound IPsec processing for an IPv6 packet. This function 25629 * is called from ipsec_out_process() if the IPsec packet was processed 25630 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25631 * asynchronously. 25632 */ 25633 void 25634 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 25635 ire_t *ire_arg) 25636 { 25637 in6_addr_t *v6dstp; 25638 ire_t *ire; 25639 mblk_t *mp; 25640 ip6_t *ip6h1; 25641 uint_t ill_index; 25642 ipsec_out_t *io; 25643 boolean_t attach_if, hwaccel; 25644 uint32_t flags = IP6_NO_IPPOLICY; 25645 int match_flags; 25646 zoneid_t zoneid; 25647 boolean_t ill_need_rele = B_FALSE; 25648 boolean_t ire_need_rele = B_FALSE; 25649 ip_stack_t *ipst; 25650 25651 mp = ipsec_mp->b_cont; 25652 ip6h1 = (ip6_t *)mp->b_rptr; 25653 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25654 ASSERT(io->ipsec_out_ns != NULL); 25655 ipst = io->ipsec_out_ns->netstack_ip; 25656 ill_index = io->ipsec_out_ill_index; 25657 if (io->ipsec_out_reachable) { 25658 flags |= IPV6_REACHABILITY_CONFIRMATION; 25659 } 25660 attach_if = io->ipsec_out_attach_if; 25661 hwaccel = io->ipsec_out_accelerated; 25662 zoneid = io->ipsec_out_zoneid; 25663 ASSERT(zoneid != ALL_ZONES); 25664 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 25665 /* Multicast addresses should have non-zero ill_index. */ 25666 v6dstp = &ip6h->ip6_dst; 25667 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 25668 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 25669 ASSERT(!attach_if || ill_index != 0); 25670 if (ill_index != 0) { 25671 if (ill == NULL) { 25672 ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, 25673 B_TRUE, ipst); 25674 25675 /* Failure case frees things for us. */ 25676 if (ill == NULL) 25677 return; 25678 25679 ill_need_rele = B_TRUE; 25680 } 25681 /* 25682 * If this packet needs to go out on a particular interface 25683 * honor it. 25684 */ 25685 if (attach_if) { 25686 match_flags = MATCH_IRE_ILL; 25687 25688 /* 25689 * Check if we need an ire that will not be 25690 * looked up by anybody else i.e. HIDDEN. 25691 */ 25692 if (ill_is_probeonly(ill)) { 25693 match_flags |= MATCH_IRE_MARK_HIDDEN; 25694 } 25695 } 25696 } 25697 ASSERT(mp != NULL); 25698 25699 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 25700 boolean_t unspec_src; 25701 ipif_t *ipif; 25702 25703 /* 25704 * Use the ill_index to get the right ill. 25705 */ 25706 unspec_src = io->ipsec_out_unspec_src; 25707 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 25708 if (ipif == NULL) { 25709 if (ill_need_rele) 25710 ill_refrele(ill); 25711 freemsg(ipsec_mp); 25712 return; 25713 } 25714 25715 if (ire_arg != NULL) { 25716 ire = ire_arg; 25717 } else { 25718 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25719 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25720 ire_need_rele = B_TRUE; 25721 } 25722 if (ire != NULL) { 25723 ipif_refrele(ipif); 25724 /* 25725 * XXX Do the multicast forwarding now, as the IPsec 25726 * processing has been done. 
25727 */ 25728 goto send; 25729 } 25730 25731 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 25732 mp->b_prev = NULL; 25733 mp->b_next = NULL; 25734 25735 /* 25736 * If the IPsec packet was processed asynchronously, 25737 * drop it now. 25738 */ 25739 if (q == NULL) { 25740 if (ill_need_rele) 25741 ill_refrele(ill); 25742 freemsg(ipsec_mp); 25743 return; 25744 } 25745 25746 ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, 25747 unspec_src, zoneid); 25748 ipif_refrele(ipif); 25749 } else { 25750 if (attach_if) { 25751 ipif_t *ipif; 25752 25753 ipif = ipif_get_next_ipif(NULL, ill); 25754 if (ipif == NULL) { 25755 if (ill_need_rele) 25756 ill_refrele(ill); 25757 freemsg(ipsec_mp); 25758 return; 25759 } 25760 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25761 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25762 ire_need_rele = B_TRUE; 25763 ipif_refrele(ipif); 25764 } else { 25765 if (ire_arg != NULL) { 25766 ire = ire_arg; 25767 } else { 25768 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, 25769 ipst); 25770 ire_need_rele = B_TRUE; 25771 } 25772 } 25773 if (ire != NULL) 25774 goto send; 25775 /* 25776 * ire disappeared underneath. 25777 * 25778 * What we need to do here is the ip_newroute 25779 * logic to get the ire without doing the IPsec 25780 * processing. Follow the same old path. But this 25781 * time, ip_wput or ire_add_then_send will call us 25782 * directly as all the IPsec operations are done. 25783 */ 25784 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 25785 mp->b_prev = NULL; 25786 mp->b_next = NULL; 25787 25788 /* 25789 * If the IPsec packet was processed asynchronously, 25790 * drop it now. 25791 */ 25792 if (q == NULL) { 25793 if (ill_need_rele) 25794 ill_refrele(ill); 25795 freemsg(ipsec_mp); 25796 return; 25797 } 25798 25799 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 25800 zoneid, ipst); 25801 } 25802 if (ill != NULL && ill_need_rele) 25803 ill_refrele(ill); 25804 return; 25805 send: 25806 if (ill != NULL && ill_need_rele) 25807 ill_refrele(ill); 25808 25809 /* Local delivery */ 25810 if (ire->ire_stq == NULL) { 25811 ill_t *out_ill; 25812 ASSERT(q != NULL); 25813 25814 /* PFHooks: LOOPBACK_OUT */ 25815 out_ill = ire_to_ill(ire); 25816 25817 /* 25818 * DTrace this as ip:::send. A blocked packet will fire the 25819 * send probe, but not the receive probe. 25820 */ 25821 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 25822 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, 25823 ipha_t *, NULL, ip6_t *, ip6h, int, 1); 25824 25825 DTRACE_PROBE4(ip6__loopback__out__start, 25826 ill_t *, NULL, ill_t *, out_ill, 25827 ip6_t *, ip6h1, mblk_t *, ipsec_mp); 25828 25829 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 25830 ipst->ips_ipv6firewall_loopback_out, 25831 NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); 25832 25833 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); 25834 25835 if (ipsec_mp != NULL) 25836 ip_wput_local_v6(RD(q), out_ill, 25837 ip6h, ipsec_mp, ire, 0); 25838 if (ire_need_rele) 25839 ire_refrele(ire); 25840 return; 25841 } 25842 /* 25843 * Everything is done. Send it out on the wire. 25844 * We force the insertion of a fragment header using the 25845 * IPH_FRAG_HDR flag in two cases: 25846 * - after reception of an ICMPv6 "packet too big" message 25847 * with an MTU < 1280 (cf. RFC 2460 section 5) 25848 * - for multirouted IPv6 packets, so that the receiver can 25849 * discard duplicates according to their fragment identifier 25850 */ 25851 /* XXX fix flow control problems.
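 *
 * As an aside, the fragmentation decision made just below can be
 * summarized by this sketch (illustrative only, not part of the
 * original source; the helper name is hypothetical):
 *
 *	static boolean_t
 *	ip6_needs_frag_hdr(const ip6_t *ip6h, const ire_t *ire)
 *	{
 *		uint_t len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 *
 *		if (len > ire->ire_max_frag ||
 *		    (ire->ire_frag_flag & IPH_FRAG_HDR))
 *			return (B_TRUE);
 *		return (B_FALSE);
 *	}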
*/ 25852 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 25853 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 25854 if (hwaccel) { 25855 /* 25856 * hardware acceleration does not handle these 25857 * "slow path" cases. 25858 */ 25859 /* IPsec KSTATS: should bump bean counter here. */ 25860 if (ire_need_rele) 25861 ire_refrele(ire); 25862 freemsg(ipsec_mp); 25863 return; 25864 } 25865 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 25866 (mp->b_cont ? msgdsize(mp) : 25867 mp->b_wptr - (uchar_t *)ip6h)) { 25868 /* IPsec KSTATS: should bump bean counter here. */ 25869 ip0dbg(("Packet length mismatch: %d, %ld\n", 25870 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 25871 msgdsize(mp))); 25872 if (ire_need_rele) 25873 ire_refrele(ire); 25874 freemsg(ipsec_mp); 25875 return; 25876 } 25877 ASSERT(mp->b_prev == NULL); 25878 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 25879 ntohs(ip6h->ip6_plen) + 25880 IPV6_HDR_LEN, ire->ire_max_frag)); 25881 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 25882 ire->ire_max_frag); 25883 } else { 25884 UPDATE_OB_PKT_COUNT(ire); 25885 ire->ire_last_used_time = lbolt; 25886 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); 25887 } 25888 if (ire_need_rele) 25889 ire_refrele(ire); 25890 freeb(ipsec_mp); 25891 } 25892 25893 void 25894 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 25895 { 25896 mblk_t *hada_mp; /* attributes M_CTL mblk */ 25897 da_ipsec_t *hada; /* data attributes */ 25898 ill_t *ill = (ill_t *)q->q_ptr; 25899 25900 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 25901 25902 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 25903 /* IPsec KSTATS: Bump loss counter here! */ 25904 freemsg(mp); 25905 return; 25906 } 25907 25908 /* 25909 * It's an IPsec packet that must be 25910 * accelerated by the Provider, and the 25911 * outbound ill is IPsec acceleration capable. 25912 * Prepend the mblk with an IPHADA_M_CTL and ship it 25913 * to the ill. 25914 * IPsec KSTATS: should bump packet counter here. 25915 */ 25916 25917 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 25918 if (hada_mp == NULL) { 25919 /* IPsec KSTATS: should bump packet counter here. */ 25920 freemsg(mp); 25921 return; 25922 } 25923 25924 hada_mp->b_datap->db_type = M_CTL; 25925 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 25926 hada_mp->b_cont = mp; 25927 25928 hada = (da_ipsec_t *)hada_mp->b_rptr; 25929 bzero(hada, sizeof (da_ipsec_t)); 25930 hada->da_type = IPHADA_M_CTL; 25931 25932 putnext(q, hada_mp); 25933 } 25934 25935 /* 25936 * Finish the outbound IPsec processing. This function is called from 25937 * ipsec_out_process() if the IPsec packet was processed 25938 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25939 * asynchronously.
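 *
 * A note on the LENGTH macro defined a few lines below (an
 * illustrative explanation, not authoritative): it extracts the
 * 16-bit total length from the first 32-bit word of the IPv4 header
 * without calling ntohs().  Given
 *
 *	uint32_t w = ((uint32_t *)ipha)[0];	(ver/hlen, tos, length)
 *
 * on a big-endian machine the length already occupies the low 16
 * bits, so (w & 0xFFFF) suffices; on a little-endian machine the two
 * length bytes are loaded byte-swapped into the upper half, so they
 * are reassembled as
 *
 *	uint16_t len = (w >> 24) | ((w >> 8) & 0xFF00);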
25940 */ 25941 void 25942 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 25943 ire_t *ire_arg) 25944 { 25945 uint32_t v_hlen_tos_len; 25946 ipaddr_t dst; 25947 ipif_t *ipif = NULL; 25948 ire_t *ire; 25949 ire_t *ire1 = NULL; 25950 mblk_t *next_mp = NULL; 25951 uint32_t max_frag; 25952 boolean_t multirt_send = B_FALSE; 25953 mblk_t *mp; 25954 ipha_t *ipha1; 25955 uint_t ill_index; 25956 ipsec_out_t *io; 25957 boolean_t attach_if; 25958 int match_flags; 25959 irb_t *irb = NULL; 25960 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 25961 zoneid_t zoneid; 25962 ipxmit_state_t pktxmit_state; 25963 ip_stack_t *ipst; 25964 25965 #ifdef _BIG_ENDIAN 25966 #define LENGTH (v_hlen_tos_len & 0xFFFF) 25967 #else 25968 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 25969 #endif 25970 25971 mp = ipsec_mp->b_cont; 25972 ASSERT(mp != NULL); 25973 ipha1 = (ipha_t *)mp->b_rptr; 25974 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 25975 dst = ipha->ipha_dst; 25976 25977 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25978 ill_index = io->ipsec_out_ill_index; 25979 attach_if = io->ipsec_out_attach_if; 25980 zoneid = io->ipsec_out_zoneid; 25981 ASSERT(zoneid != ALL_ZONES); 25982 ASSERT(io->ipsec_out_ns != NULL); 25983 ipst = io->ipsec_out_ns->netstack_ip; 25984 25985 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 25986 if (ill_index != 0) { 25987 if (ill == NULL) { 25988 ill = ip_grab_attach_ill(NULL, ipsec_mp, 25989 ill_index, B_FALSE, ipst); 25990 25991 /* Failure case frees things for us. */ 25992 if (ill == NULL) 25993 return; 25994 25995 ill_need_rele = B_TRUE; 25996 } 25997 /* 25998 * If this packet needs to go out on a particular interface 25999 * honor it. 26000 */ 26001 if (attach_if) { 26002 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 26003 26004 /* 26005 * Check if we need an ire that will not be 26006 * looked up by anybody else i.e. HIDDEN. 26007 */ 26008 if (ill_is_probeonly(ill)) { 26009 match_flags |= MATCH_IRE_MARK_HIDDEN; 26010 } 26011 } 26012 } 26013 26014 if (CLASSD(dst)) { 26015 boolean_t conn_dontroute; 26016 /* 26017 * Use the ill_index to get the right ipif. 26018 */ 26019 conn_dontroute = io->ipsec_out_dontroute; 26020 if (ill_index == 0) 26021 ipif = ipif_lookup_group(dst, zoneid, ipst); 26022 else 26023 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 26024 if (ipif == NULL) { 26025 ip1dbg(("ip_wput_ipsec_out: No ipif for" 26026 " multicast\n")); 26027 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 26028 freemsg(ipsec_mp); 26029 goto done; 26030 } 26031 /* 26032 * ipha_src has already been initialized with the 26033 * value of the ipif in ip_wput. All we need now is 26034 * an ire to send this downstream. 26035 */ 26036 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 26037 MBLK_GETLABEL(mp), match_flags, ipst); 26038 if (ire != NULL) { 26039 ill_t *ill1; 26040 /* 26041 * Do the multicast forwarding now, as the IPsec 26042 * processing has been done. 26043 */ 26044 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 26045 (ill1 = ire_to_ill(ire))) { 26046 if (ip_mforward(ill1, ipha, mp)) { 26047 freemsg(ipsec_mp); 26048 ip1dbg(("ip_wput_ipsec_out: mforward " 26049 "failed\n")); 26050 ire_refrele(ire); 26051 goto done; 26052 } 26053 } 26054 goto send; 26055 } 26056 26057 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 26058 mp->b_prev = NULL; 26059 mp->b_next = NULL; 26060 26061 /* 26062 * If the IPsec packet was processed asynchronously, 26063 * drop it now.
26064 */ 26065 if (q == NULL) { 26066 freemsg(ipsec_mp); 26067 goto done; 26068 } 26069 26070 /* 26071 * We may be using a wrong ipif to create the ire. 26072 * But it is okay as the source address is assigned 26073 * for the packet already. Next outbound packet would 26074 * create the IRE with the right IPIF in ip_wput. 26075 * 26076 * Also handle RTF_MULTIRT routes. 26077 */ 26078 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, 26079 zoneid, &zero_info); 26080 } else { 26081 if (attach_if) { 26082 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 26083 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 26084 } else { 26085 if (ire_arg != NULL) { 26086 ire = ire_arg; 26087 ire_need_rele = B_FALSE; 26088 } else { 26089 ire = ire_cache_lookup(dst, zoneid, 26090 MBLK_GETLABEL(mp), ipst); 26091 } 26092 } 26093 if (ire != NULL) { 26094 goto send; 26095 } 26096 26097 /* 26098 * ire disappeared underneath. 26099 * 26100 * What we need to do here is the ip_newroute 26101 * logic to get the ire without doing the IPsec 26102 * processing. Follow the same old path. But this 26103 * time, ip_wput or ire_add_then_put will call us 26104 * directly as all the IPsec operations are done. 26105 */ 26106 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 26107 mp->b_prev = NULL; 26108 mp->b_next = NULL; 26109 26110 /* 26111 * If the IPsec packet was processed asynchronously, 26112 * drop it now. 26113 */ 26114 if (q == NULL) { 26115 freemsg(ipsec_mp); 26116 goto done; 26117 } 26118 26119 /* 26120 * Since we're going through ip_newroute() again, we 26121 * need to make sure we don't: 26122 * 26123 * 1.) Trigger the ASSERT() with the ipha_ident 26124 * overloading. 26125 * 2.) Redo transport-layer checksumming, since we've 26126 * already done all that to get this far. 26127 * 26128 * The easiest way to avoid both of the above is to set 26129 * the ipha_ident field to IP_HDR_INCLUDED. 26130 */ 26131 ipha->ipha_ident = IP_HDR_INCLUDED; 26132 ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL), 26133 zoneid, ipst); 26134 } 26135 goto done; 26136 send: 26137 if (ire->ire_stq == NULL) { 26138 ill_t *out_ill; 26139 /* 26140 * Loopbacks go through ip_wput_local except for one case. 26141 * We come here if we generate an icmp_frag_needed message 26142 * after IPsec processing is over. When this function calls 26143 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 26144 * icmp_frag_needed. The message generated comes back here 26145 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 26146 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 26147 * source address as it is usually set in ip_wput_ire. As 26148 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 26149 * and we end up here. We can't enter ip_wput_ire once the 26150 * IPsec processing is over and hence we need to do it here. 26151 */ 26152 ASSERT(q != NULL); 26153 UPDATE_OB_PKT_COUNT(ire); 26154 ire->ire_last_used_time = lbolt; 26155 if (ipha->ipha_src == 0) 26156 ipha->ipha_src = ire->ire_src_addr; 26157 26158 /* PFHooks: LOOPBACK_OUT */ 26159 out_ill = ire_to_ill(ire); 26160 26161 /* 26162 * DTrace this as ip:::send. A blocked packet will fire the 26163 * send probe, but not the receive probe.
26164 */ 26165 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 26166 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 26167 ipha_t *, ipha, ip6_t *, NULL, int, 1); 26168 26169 DTRACE_PROBE4(ip4__loopback__out__start, 26170 ill_t *, NULL, ill_t *, out_ill, 26171 ipha_t *, ipha1, mblk_t *, ipsec_mp); 26172 26173 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 26174 ipst->ips_ipv4firewall_loopback_out, 26175 NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); 26176 26177 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); 26178 26179 if (ipsec_mp != NULL) 26180 ip_wput_local(RD(q), out_ill, 26181 ipha, ipsec_mp, ire, 0, zoneid); 26182 if (ire_need_rele) 26183 ire_refrele(ire); 26184 goto done; 26185 } 26186 26187 if (ire->ire_max_frag < (unsigned int)LENGTH) { 26188 /* 26189 * We are through with IPsec processing. 26190 * Fragment this and send it on the wire. 26191 */ 26192 if (io->ipsec_out_accelerated) { 26193 /* 26194 * The packet has been accelerated but must 26195 * be fragmented. This should not happen 26196 * since AH and ESP must not accelerate 26197 * packets that need fragmentation, however 26198 * the configuration could have changed 26199 * since the AH or ESP processing. 26200 * Drop packet. 26201 * IPsec KSTATS: bump bean counter here. 26202 */ 26203 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 26204 "fragmented accelerated packet!\n")); 26205 freemsg(ipsec_mp); 26206 } else { 26207 ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst); 26208 } 26209 if (ire_need_rele) 26210 ire_refrele(ire); 26211 goto done; 26212 } 26213 26214 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 26215 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 26216 (void *)ire->ire_ipif, (void *)ipif)); 26217 26218 /* 26219 * Multiroute the secured packet, unless IPsec really 26220 * requires the packet to go out only through a particular 26221 * interface. 26222 */ 26223 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 26224 ire_t *first_ire; 26225 irb = ire->ire_bucket; 26226 ASSERT(irb != NULL); 26227 /* 26228 * This ire has been looked up as the one that 26229 * goes through the given ipif; 26230 * make sure we do not omit any other multiroute ire 26231 * that may be present in the bucket before this one. 26232 */ 26233 IRB_REFHOLD(irb); 26234 for (first_ire = irb->irb_ire; 26235 first_ire != NULL; 26236 first_ire = first_ire->ire_next) { 26237 if ((first_ire->ire_flags & RTF_MULTIRT) && 26238 (first_ire->ire_addr == ire->ire_addr) && 26239 !(first_ire->ire_marks & 26240 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 26241 break; 26242 } 26243 } 26244 26245 if ((first_ire != NULL) && (first_ire != ire)) { 26246 /* 26247 * Don't change the ire if the packet would 26248 * have to be fragmented when sent via this new one. 26249 */ 26250 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 26251 IRE_REFHOLD(first_ire); 26252 if (ire_need_rele) 26253 ire_refrele(ire); 26254 else 26255 ire_need_rele = B_TRUE; 26256 ire = first_ire; 26257 } 26258 } 26259 IRB_REFRELE(irb); 26260 26261 multirt_send = B_TRUE; 26262 max_frag = ire->ire_max_frag; 26263 } else { 26264 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 26265 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 26266 "flag, attach_if %d\n", attach_if)); 26267 } 26268 } 26269 26270 /* 26271 * In most cases, the emission loop below is entered only once. 26272 * Only in the case where the ire holds the RTF_MULTIRT 26273 * flag, we loop to process all RTF_MULTIRT ires in the 26274 * bucket, and send the packet through all crossed 26275 * RTF_MULTIRT routes. 26276 */
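/*
 * Illustrative sketch (not part of the original source; the helper
 * name multirt_next_ire is hypothetical): the scan for the next
 * eligible RTF_MULTIRT ire performed inside the loop below, reduced
 * to a helper.  It omits the ip_multirt_bad_mtu() accounting that the
 * real loop performs when a route's MTU is too small.
 *
 *	static ire_t *
 *	multirt_next_ire(irb_t *irb, ire_t *ire, uint_t pkt_len)
 *	{
 *		ire_t *ire1;
 *
 *		IRB_REFHOLD(irb);
 *		for (ire1 = ire->ire_next; ire1 != NULL;
 *		    ire1 = ire1->ire_next) {
 *			if (!(ire1->ire_flags & RTF_MULTIRT) ||
 *			    ire1->ire_addr != ire->ire_addr ||
 *			    (ire1->ire_marks &
 *			    (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) ||
 *			    ire1->ire_stq == NULL ||
 *			    ire1->ire_max_frag < pkt_len)
 *				continue;
 *			IRE_REFHOLD(ire1);
 *			break;
 *		}
 *		IRB_REFRELE(irb);
 *		return (ire1);
 *	}
 */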
26277 do { 26278 if (multirt_send) { 26279 /* 26280 * Here, ire1 holds the next ire to process in the 26281 * bucket. If multirouting is expected, 26282 * any non-RTF_MULTIRT ire that has the 26283 * right destination address is ignored. 26284 */ 26285 ASSERT(irb != NULL); 26286 IRB_REFHOLD(irb); 26287 for (ire1 = ire->ire_next; 26288 ire1 != NULL; 26289 ire1 = ire1->ire_next) { 26290 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 26291 continue; 26292 if (ire1->ire_addr != ire->ire_addr) 26293 continue; 26294 if (ire1->ire_marks & 26295 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 26296 continue; 26297 /* No loopback here */ 26298 if (ire1->ire_stq == NULL) 26299 continue; 26300 /* 26301 * Ensure we do not exceed the MTU 26302 * of the next route. 26303 */ 26304 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 26305 ip_multirt_bad_mtu(ire1, max_frag); 26306 continue; 26307 } 26308 26309 IRE_REFHOLD(ire1); 26310 break; 26311 } 26312 IRB_REFRELE(irb); 26313 if (ire1 != NULL) { 26314 /* 26315 * We are in a multiple send case, need to 26316 * make a copy of the packet. 26317 */ 26318 next_mp = copymsg(ipsec_mp); 26319 if (next_mp == NULL) { 26320 ire_refrele(ire1); 26321 ire1 = NULL; 26322 } 26323 } 26324 } 26325 /* 26326 * Everything is done. Send it out on the wire 26327 * 26328 * ip_xmit_v4 will call ip_wput_attach_llhdr and then 26329 * either send it on the wire or, in the case of 26330 * HW acceleration, call ipsec_hw_putnext. 26331 */ 26332 if (ire->ire_nce && 26333 ire->ire_nce->nce_state != ND_REACHABLE) { 26334 DTRACE_PROBE2(ip__wput__ipsec__bail, 26335 (ire_t *), ire, (mblk_t *), ipsec_mp); 26336 /* 26337 * If ire's link-layer is unresolved (this 26338 * would only happen if the incomplete ire 26339 * was added to cachetable via forwarding path) 26340 * don't bother going to ip_xmit_v4. Just drop the 26341 * packet. 26342 * There is a slight risk here, in that, if we 26343 * have the forwarding path create an incomplete 26344 * IRE, then until the IRE is completed, any 26345 * transmitted IPsec packets will be dropped 26346 * instead of being queued waiting for resolution. 26347 * 26348 * But the likelihood of a forwarding packet and a wput 26349 * packet sending to the same dst at the same time 26350 * and there not yet being an ARP entry for it is small. 26351 * Furthermore, if this actually happens, it might 26352 * be likely that wput would generate multiple 26353 * packets (and forwarding would also have a train 26354 * of packets) for that destination. If this is 26355 * the case, some of them would have been dropped 26356 * anyway, since ARP only queues a few packets while 26357 * waiting for resolution. 26358 * 26359 * NOTE: We should really call ip_xmit_v4, 26360 * and let it queue the packet and send the 26361 * ARP query and have ARP come back thus: 26362 * <ARP> ip_wput->ip_output->ip_wput_nondata-> 26363 * ip_xmit_v4->ip_wput_attach_llhdr + ipsec 26364 * hw accel work. But it's too complex to get 26365 * the IPsec hw acceleration approach to fit 26366 * well with ip_xmit_v4 doing ARP without 26367 * doing IPsec simplification. For now, we just 26368 * poke ip_xmit_v4 to trigger the arp resolve, so 26369 * that we can continue with the send on the next 26370 * attempt.
26371 * 26372 * XXX This should be revisited, when 26373 * the IPsec/IP interaction is cleaned up 26374 */ 26375 ip1dbg(("ip_wput_ipsec_out: ire is incomplete" 26376 " - dropping packet\n")); 26377 freemsg(ipsec_mp); 26378 /* 26379 * Call ip_xmit_v4() to trigger ARP query 26380 * in case the nce_state is ND_INITIAL 26381 */ 26382 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 26383 goto drop_pkt; 26384 } 26385 26386 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 26387 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1, 26388 mblk_t *, ipsec_mp); 26389 FW_HOOKS(ipst->ips_ip4_physical_out_event, 26390 ipst->ips_ipv4firewall_physical_out, NULL, 26391 ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst); 26392 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp); 26393 if (ipsec_mp == NULL) 26394 goto drop_pkt; 26395 26396 ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); 26397 pktxmit_state = ip_xmit_v4(mp, ire, 26398 (io->ipsec_out_accelerated ? io : NULL), B_FALSE); 26399 26400 if ((pktxmit_state == SEND_FAILED) || 26401 (pktxmit_state == LLHDR_RESLV_FAILED)) { 26402 26403 freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */ 26404 drop_pkt: 26405 BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib, 26406 ipIfStatsOutDiscards); 26407 if (ire_need_rele) 26408 ire_refrele(ire); 26409 if (ire1 != NULL) { 26410 ire_refrele(ire1); 26411 freemsg(next_mp); 26412 } 26413 goto done; 26414 } 26415 26416 freeb(ipsec_mp); 26417 if (ire_need_rele) 26418 ire_refrele(ire); 26419 26420 if (ire1 != NULL) { 26421 ire = ire1; 26422 ire_need_rele = B_TRUE; 26423 ASSERT(next_mp); 26424 ipsec_mp = next_mp; 26425 mp = ipsec_mp->b_cont; 26426 ire1 = NULL; 26427 next_mp = NULL; 26428 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26429 } else { 26430 multirt_send = B_FALSE; 26431 } 26432 } while (multirt_send); 26433 done: 26434 if (ill != NULL && ill_need_rele) 26435 ill_refrele(ill); 26436 if (ipif != NULL) 26437 ipif_refrele(ipif); 26438 } 26439 26440 /* 26441 * Get the ill corresponding to the specified ire, and compare its 26442 * capabilities with the protocol and algorithms specified by 26443 * the SA obtained from ipsec_out. If they match, annotate the 26444 * ipsec_out structure to indicate that the packet needs acceleration. 26445 * 26446 * 26447 * A packet is eligible for outbound hardware acceleration if the 26448 * following conditions are satisfied: 26449 * 26450 * 1. the packet will not be fragmented 26451 * 2. the provider supports the algorithm 26452 * 3. there is no pending control message being exchanged 26453 * 4. snoop is not attached 26454 * 5. the destination address is not a broadcast or multicast address. 26455 * 26456 * Rationale: 26457 * - Hardware drivers do not support fragmentation with 26458 * the current interface. 26459 * - snoop, multicast, and broadcast may result in exposure of 26460 * a cleartext datagram. 26461 * We check all five of these conditions here. 26462 * 26463 * XXX would like to nuke "ire_t *" parameter here; problem is that 26464 * IRE is only way to figure out if a v4 address is a broadcast and 26465 * thus ineligible for acceleration... 26466 */ 26467 static void 26468 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 26469 { 26470 ipsec_out_t *io; 26471 mblk_t *data_mp; 26472 uint_t plen, overhead; 26473 ip_stack_t *ipst; 26474 26475 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 26476 return; 26477 26478 if (ill == NULL) 26479 return; 26480 ipst = ill->ill_ipst; 26481 /* 26482 * Destination address is a broadcast or multicast. Punt.
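 *
 * As a summary (an illustrative, hedged restatement of the checks
 * this function performs below, not a verbatim excerpt), the packet
 * is marked for acceleration only when all of the following hold:
 *
 *	(sa->ipsa_flags & IPSA_F_HW) != 0	hardware-capable SA
 *	destination not broadcast, multicast or local/loopback
 *	ill->ill_dlpi_pending == DL_PRIM_INVAL	no DLPI exchange
 *	ill->ill_promisc_on_phys == 0		no snoop exposure
 *	plen + overhead <= ill->ill_max_mtu	no fragmentation
 *	ipsec_capab_match() succeeds		algorithm supported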
26483 */ 26484 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 26485 IRE_LOCAL))) 26486 return; 26487 26488 data_mp = ipsec_mp->b_cont; 26489 26490 if (ill->ill_isv6) { 26491 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 26492 26493 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 26494 return; 26495 26496 plen = ip6h->ip6_plen; 26497 } else { 26498 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 26499 26500 if (CLASSD(ipha->ipha_dst)) 26501 return; 26502 26503 plen = ipha->ipha_length; 26504 } 26505 /* 26506 * Is there a pending DLPI control message being exchanged 26507 * between IP/IPsec and the DLS Provider? If there is, it 26508 * could be a SADB update, and the state of the DLS Provider 26509 * SADB might not be in sync with the SADB maintained by 26510 * IPsec. To avoid dropping packets or using the wrong keying 26511 * material, we do not accelerate this packet. 26512 */ 26513 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 26514 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26515 "ill_dlpi_pending! don't accelerate packet\n")); 26516 return; 26517 } 26518 26519 /* 26520 * Is the Provider in promiscuous mode? If it is, we don't 26521 * accelerate the packet since it will bounce back up to the 26522 * listeners in the clear. 26523 */ 26524 if (ill->ill_promisc_on_phys) { 26525 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26526 "ill in promiscuous mode, don't accelerate packet\n")); 26527 return; 26528 } 26529 26530 /* 26531 * Will the packet require fragmentation? 26532 */ 26533 26534 /* 26535 * IPsec ESP note: this is a pessimistic estimate, but the same 26536 * as is used elsewhere. 26537 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 26538 * + 2-byte trailer 26539 */ 26540 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : 26541 IPSEC_BASE_ESP_HDR_SIZE(sa); 26542 26543 if ((plen + overhead) > ill->ill_max_mtu) 26544 return; 26545 26546 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26547 26548 /* 26549 * Can the ill accelerate this IPsec protocol and algorithm 26550 * specified by the SA? 26551 */ 26552 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 26553 ill->ill_isv6, sa, ipst->ips_netstack)) { 26554 return; 26555 } 26556 26557 /* 26558 * Tell AH or ESP that the outbound ill is capable of 26559 * accelerating this packet. 26560 */ 26561 io->ipsec_out_is_capab_ill = B_TRUE; 26562 } 26563 26564 /* 26565 * Select which AH & ESP SA's to use (if any) for the outbound packet. 26566 * 26567 * If this function returns B_TRUE, the requested SA's have been filled 26568 * into the ipsec_out_*_sa pointers. 26569 * 26570 * If the function returns B_FALSE, the packet has been "consumed", most 26571 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 26572 * 26573 * The SA references created by the protocol-specific "select" 26574 * function will be released when the ipsec_mp is freed, thanks to the 26575 * ipsec_out_free destructor -- see spd.c. 26576 */ 26577 static boolean_t 26578 ipsec_out_select_sa(mblk_t *ipsec_mp) 26579 { 26580 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 26581 ipsec_out_t *io; 26582 ipsec_policy_t *pp; 26583 ipsec_action_t *ap; 26584 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26585 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26586 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26587 26588 if (!io->ipsec_out_secure) { 26589 /* 26590 * We came here by mistake. 26591 * Don't bother with ipsec processing. 26592 * We should "discourage" this path in the future.
26593 */ 26594 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26595 return (B_FALSE); 26596 } 26597 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26598 ASSERT((io->ipsec_out_policy != NULL) || 26599 (io->ipsec_out_act != NULL)); 26600 26601 ASSERT(io->ipsec_out_failed == B_FALSE); 26602 26603 /* 26604 * IPsec processing has started. 26605 */ 26606 io->ipsec_out_proc_begin = B_TRUE; 26607 ap = io->ipsec_out_act; 26608 if (ap == NULL) { 26609 pp = io->ipsec_out_policy; 26610 ASSERT(pp != NULL); 26611 ap = pp->ipsp_act; 26612 ASSERT(ap != NULL); 26613 } 26614 26615 /* 26616 * We have an action. Now, let's select SAs. 26617 * (In the future, we can cache this in the conn_t..) 26618 */ 26619 if (ap->ipa_want_esp) { 26620 if (io->ipsec_out_esp_sa == NULL) { 26621 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 26622 IPPROTO_ESP); 26623 } 26624 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 26625 } 26626 26627 if (ap->ipa_want_ah) { 26628 if (io->ipsec_out_ah_sa == NULL) { 26629 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 26630 IPPROTO_AH); 26631 } 26632 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 26633 /* 26634 * The ESP and AH processing order needs to be preserved 26635 * when both protocols are required (ESP should be applied 26636 * before AH for an outbound packet). Force an ESP ACQUIRE 26637 * when both ESP and AH are required, and an AH ACQUIRE 26638 * is needed. 26639 */ 26640 if (ap->ipa_want_esp && need_ah_acquire) 26641 need_esp_acquire = B_TRUE; 26642 } 26643 26644 /* 26645 * Send an ACQUIRE (extended, regular, or both) if we need one. 26646 * Release SAs that got referenced, but will not be used until we 26647 * acquire _all_ of the SAs we need. 26648 */ 26649 if (need_ah_acquire || need_esp_acquire) { 26650 if (io->ipsec_out_ah_sa != NULL) { 26651 IPSA_REFRELE(io->ipsec_out_ah_sa); 26652 io->ipsec_out_ah_sa = NULL; 26653 } 26654 if (io->ipsec_out_esp_sa != NULL) { 26655 IPSA_REFRELE(io->ipsec_out_esp_sa); 26656 io->ipsec_out_esp_sa = NULL; 26657 } 26658 26659 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 26660 return (B_FALSE); 26661 } 26662 26663 return (B_TRUE); 26664 } 26665 26666 /* 26667 * Process an IPSEC_OUT message and see what you can 26668 * do with it. 26669 * IPQoS Notes: 26670 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 26671 * IPsec. 26672 * XXX would like to nuke ire_t. 26673 * XXX ill_index better be "real" 26674 */ 26675 void 26676 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 26677 { 26678 ipsec_out_t *io; 26679 ipsec_policy_t *pp; 26680 ipsec_action_t *ap; 26681 ipha_t *ipha; 26682 ip6_t *ip6h; 26683 mblk_t *mp; 26684 ill_t *ill; 26685 zoneid_t zoneid; 26686 ipsec_status_t ipsec_rc; 26687 boolean_t ill_need_rele = B_FALSE; 26688 ip_stack_t *ipst; 26689 ipsec_stack_t *ipss; 26690 26691 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26692 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26693 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26694 ipst = io->ipsec_out_ns->netstack_ip; 26695 mp = ipsec_mp->b_cont; 26696 26697 /* 26698 * Initiate IPPF processing. We do it here to account for packets 26699 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 26700 * We can check for ipsec_out_proc_begin even for such packets, as 26701 * it will always be B_FALSE (asserted below). 26702 */ 26703 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { 26704 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ?
io->ipsec_out_ill_index : ill_index); 26706 if (mp == NULL) { 26707 ip2dbg(("ipsec_out_process: packet dropped "\ 26708 "during IPPF processing\n")); 26709 freeb(ipsec_mp); 26710 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26711 return; 26712 } 26713 } 26714 26715 if (!io->ipsec_out_secure) { 26716 /* 26717 * We came here by mistake. 26718 * Don't bother with ipsec processing. 26719 * Should "discourage" this path in the future. 26720 */ 26721 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26722 goto done; 26723 } 26724 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26725 ASSERT((io->ipsec_out_policy != NULL) || 26726 (io->ipsec_out_act != NULL)); 26727 ASSERT(io->ipsec_out_failed == B_FALSE); 26728 26729 ipss = ipst->ips_netstack->netstack_ipsec; 26730 if (!ipsec_loaded(ipss)) { 26731 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 26732 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26733 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26734 } else { 26735 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 26736 } 26737 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 26738 DROPPER(ipss, ipds_ip_ipsec_not_loaded), 26739 &ipss->ipsec_dropper); 26740 return; 26741 } 26742 26743 /* 26744 * IPsec processing has started. 26745 */ 26746 io->ipsec_out_proc_begin = B_TRUE; 26747 ap = io->ipsec_out_act; 26748 if (ap == NULL) { 26749 pp = io->ipsec_out_policy; 26750 ASSERT(pp != NULL); 26751 ap = pp->ipsp_act; 26752 ASSERT(ap != NULL); 26753 } 26754 26755 /* 26756 * Save the outbound ill index. When the packet comes back 26757 * from IPsec, we make sure the ill hasn't changed or disappeared 26758 * before sending it the accelerated packet. 26759 */ 26760 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 26761 int ifindex; 26762 ill = ire_to_ill(ire); 26763 ifindex = ill->ill_phyint->phyint_ifindex; 26764 io->ipsec_out_capab_ill_index = ifindex; 26765 } 26766 26767 /* 26768 * The order of processing is to first insert an IP header if needed. 26769 * Then insert the ESP header and then the AH header. 26770 */ 26771 if ((io->ipsec_out_se_done == B_FALSE) && 26772 (ap->ipa_want_se)) { 26773 /* 26774 * First get the outer IP header before sending 26775 * it to ESP.
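 *
 * The resulting self-encapsulated datagram looks like this
 * (illustrative layout, not a verbatim excerpt):
 *
 *	+--------------------+  outer ipha_t, built below:
 *	| outer IPv4 header  |    ipha_protocol = IPPROTO_ENCAP,
 *	+--------------------+    ipha_length grown by sizeof (ipha_t),
 *	| inner IPv4 header  |    header checksum recomputed with
 *	+--------------------+    ip_csum_hdr()
 *	| original payload   |
 *	+--------------------+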
26776 */ 26777 ipha_t *oipha, *iipha; 26778 mblk_t *outer_mp, *inner_mp; 26779 26780 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { 26781 (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, 26782 "ipsec_out_process: " 26783 "Self-Encapsulation failed: Out of memory\n"); 26784 freemsg(ipsec_mp); 26785 if (ill != NULL) { 26786 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26787 } else { 26788 BUMP_MIB(&ipst->ips_ip_mib, 26789 ipIfStatsOutDiscards); 26790 } 26791 return; 26792 } 26793 inner_mp = ipsec_mp->b_cont; 26794 ASSERT(inner_mp->b_datap->db_type == M_DATA); 26795 oipha = (ipha_t *)outer_mp->b_rptr; 26796 iipha = (ipha_t *)inner_mp->b_rptr; 26797 *oipha = *iipha; 26798 outer_mp->b_wptr += sizeof (ipha_t); 26799 oipha->ipha_length = htons(ntohs(iipha->ipha_length) + 26800 sizeof (ipha_t)); 26801 oipha->ipha_protocol = IPPROTO_ENCAP; 26802 oipha->ipha_version_and_hdr_length = 26803 IP_SIMPLE_HDR_VERSION; 26804 oipha->ipha_hdr_checksum = 0; 26805 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); 26806 outer_mp->b_cont = inner_mp; 26807 ipsec_mp->b_cont = outer_mp; 26808 26809 io->ipsec_out_se_done = B_TRUE; 26810 io->ipsec_out_tunnel = B_TRUE; 26811 } 26812 26813 if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || 26814 (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && 26815 !ipsec_out_select_sa(ipsec_mp)) 26816 return; 26817 26818 /* 26819 * By now, we know what SA's to use. Toss over to ESP & AH 26820 * to do the heavy lifting. 26821 */ 26822 zoneid = io->ipsec_out_zoneid; 26823 ASSERT(zoneid != ALL_ZONES); 26824 if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { 26825 ASSERT(io->ipsec_out_esp_sa != NULL); 26826 io->ipsec_out_esp_done = B_TRUE; 26827 /* 26828 * Note that since hw accel can only apply one transform, 26829 * not two, we skip hw accel for ESP if we also have AH. 26830 * This is a design limitation of the interface 26831 * which should be revisited.
26832 */ 26833 ASSERT(ire != NULL); 26834 if (io->ipsec_out_ah_sa == NULL) { 26835 ill = (ill_t *)ire->ire_stq->q_ptr; 26836 ipsec_out_is_accelerated(ipsec_mp, 26837 io->ipsec_out_esp_sa, ill, ire); 26838 } 26839 26840 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 26841 switch (ipsec_rc) { 26842 case IPSEC_STATUS_SUCCESS: 26843 break; 26844 case IPSEC_STATUS_FAILED: 26845 if (ill != NULL) { 26846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26847 } else { 26848 BUMP_MIB(&ipst->ips_ip_mib, 26849 ipIfStatsOutDiscards); 26850 } 26851 /* FALLTHRU */ 26852 case IPSEC_STATUS_PENDING: 26853 return; 26854 } 26855 } 26856 26857 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 26858 ASSERT(io->ipsec_out_ah_sa != NULL); 26859 io->ipsec_out_ah_done = B_TRUE; 26860 if (ire == NULL) { 26861 int idx = io->ipsec_out_capab_ill_index; 26862 ill = ill_lookup_on_ifindex(idx, B_FALSE, 26863 NULL, NULL, NULL, NULL, ipst); 26864 ill_need_rele = B_TRUE; 26865 } else { 26866 ill = (ill_t *)ire->ire_stq->q_ptr; 26867 } 26868 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 26869 ire); 26870 26871 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 26872 switch (ipsec_rc) { 26873 case IPSEC_STATUS_SUCCESS: 26874 break; 26875 case IPSEC_STATUS_FAILED: 26876 if (ill != NULL) { 26877 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26878 } else { 26879 BUMP_MIB(&ipst->ips_ip_mib, 26880 ipIfStatsOutDiscards); 26881 } 26882 /* FALLTHRU */ 26883 case IPSEC_STATUS_PENDING: 26884 if (ill != NULL && ill_need_rele) 26885 ill_refrele(ill); 26886 return; 26887 } 26888 } 26889 /* 26890 * We are done with IPsec processing. Send it over 26891 * the wire. 26892 */ 26893 done: 26894 mp = ipsec_mp->b_cont; 26895 ipha = (ipha_t *)mp->b_rptr; 26896 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26897 ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); 26898 } else { 26899 ip6h = (ip6_t *)ipha; 26900 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); 26901 } 26902 if (ill != NULL && ill_need_rele) 26903 ill_refrele(ill); 26904 } 26905 26906 /* ARGSUSED */ 26907 void 26908 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 26909 { 26910 opt_restart_t *or; 26911 int err; 26912 conn_t *connp; 26913 26914 ASSERT(CONN_Q(q)); 26915 connp = Q_TO_CONN(q); 26916 26917 ASSERT(first_mp->b_datap->db_type == M_CTL); 26918 or = (opt_restart_t *)first_mp->b_rptr; 26919 /* 26920 * We don't need to pass any credentials here since this is just 26921 * a restart. The credentials are passed in when svr4_optcom_req 26922 * is called the first time (from ip_wput_nondata). 26923 */ 26924 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 26925 err = svr4_optcom_req(q, first_mp, NULL, 26926 &ip_opt_obj, B_FALSE); 26927 } else { 26928 ASSERT(or->or_type == T_OPTMGMT_REQ); 26929 err = tpi_optcom_req(q, first_mp, NULL, 26930 &ip_opt_obj, B_FALSE); 26931 } 26932 if (err != EINPROGRESS) { 26933 /* operation is done */ 26934 CONN_OPER_PENDING_DONE(connp); 26935 } 26936 } 26937 26938 /* 26939 * ioctls that go through a down/up sequence may need to wait for the down 26940 * to complete. This involves waiting for the ire and ipif refcnts to go down 26941 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
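 *
 * Schematically (a hedged sketch of the sequence, not a verbatim
 * excerpt from this file):
 *
 *	ipif_down()			starts the quiesce; deletes ires
 *	...ire/ipif refcnts drain...	new references are prevented
 *	ipif_ill_refrele_tail()		the last refrele re-dispatches
 *					the ioctl to ip_reprocess_ioctl()
 *					below, which applies the change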
26942 */ 26943 /* ARGSUSED */ 26944 void 26945 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 26946 { 26947 struct iocblk *iocp; 26948 mblk_t *mp1; 26949 ip_ioctl_cmd_t *ipip; 26950 int err; 26951 sin_t *sin; 26952 struct lifreq *lifr; 26953 struct ifreq *ifr; 26954 26955 iocp = (struct iocblk *)mp->b_rptr; 26956 ASSERT(ipsq != NULL); 26957 /* Existence of mp1 verified in ip_wput_nondata */ 26958 mp1 = mp->b_cont->b_cont; 26959 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 26960 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { 26961 /* 26962 * Special case where ipsq_current_ipif is not set: 26963 * ill_phyint_reinit merged the v4 and v6 into a single ipsq. 26964 * The ill could also have become part of an ipmp group in the 26965 * process; we are here because we were not able to complete 26966 * the operation in ipif_set_values since we could not become 26967 * exclusive on the new ipsq. In such a case ipsq_current_ipif 26968 * will not be set, so we need to set it. 26969 */ 26970 ill_t *ill = q->q_ptr; 26971 ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); 26972 } 26973 ASSERT(ipsq->ipsq_current_ipif != NULL); 26974 26975 if (ipip->ipi_cmd_type == IF_CMD) { 26976 /* This is an old-style SIOC[GS]IF* command */ 26977 ifr = (struct ifreq *)mp1->b_rptr; 26978 sin = (sin_t *)&ifr->ifr_addr; 26979 } else if (ipip->ipi_cmd_type == LIF_CMD) { 26980 /* This is a new-style SIOC[GS]LIF* command */ 26981 lifr = (struct lifreq *)mp1->b_rptr; 26982 sin = (sin_t *)&lifr->lifr_addr; 26983 } else { 26984 sin = NULL; 26985 } 26986 26987 err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, 26988 ipip, mp1->b_rptr); 26989 26990 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 26991 } 26992 26993 /* 26994 * ioctl processing 26995 * 26996 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up 26997 * the ioctl command in the ioctl tables, determines the copyin data size 26998 * from the ipi_copyin_size field, and does an mi_copyin() of that size. 26999 * 27000 * ioctl processing then continues when the M_IOCDATA makes its way down to 27001 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its 27002 * associated 'conn' is refheld till the end of the ioctl and the general 27003 * ioctl processing function ip_process_ioctl() is called to extract the 27004 * arguments and process the ioctl. To simplify extraction, ioctl commands 27005 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a 27006 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq()) 27007 * is used to extract the ioctl's arguments. 27008 * 27009 * ip_process_ioctl determines if the ioctl needs to be serialized, and if 27010 * so goes thru the serialization primitive ipsq_try_enter. Then the 27011 * appropriate function to handle the ioctl is called based on the entry in 27012 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish 27013 * which also refreleases the 'conn' that was refheld at the start of the 27014 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq. 27015 * 27016 * Many exclusive ioctls go thru an internal down/up sequence as part of 27017 * the operation. For example an attempt to change the IP address of an 27018 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface 27019 * does all the cleanup such as deleting all ires that use this address. 27020 * Then we need to wait till all references to the interface go away.
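 *
 * For reference, a minimal user-level sequence that drives this path
 * with a get-style LIF_CMD ioctl (an illustrative sketch, not part of
 * the original source; the interface name is hypothetical):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0)
 *		perror("SIOCGLIFADDR");
 *	(void) close(s);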
27021 */ 27022 void 27023 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 27024 { 27025 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 27026 ip_ioctl_cmd_t *ipip = arg; 27027 ip_extract_func_t *extract_funcp; 27028 cmd_info_t ci; 27029 int err; 27030 boolean_t entered_ipsq = B_FALSE; 27031 27032 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); 27033 27034 if (ipip == NULL) 27035 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27036 27037 /* 27038 * SIOCLIFADDIF needs to go thru a special path since the 27039 * ill may not exist yet. This happens in the case of lo0 27040 * which is created using this ioctl. 27041 */ 27042 if (ipip->ipi_cmd == SIOCLIFADDIF) { 27043 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); 27044 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27045 return; 27046 } 27047 27048 ci.ci_ipif = NULL; 27049 if (ipip->ipi_cmd_type == MISC_CMD) { 27050 /* 27051 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 27052 */ 27053 if (ipip->ipi_cmd == IF_UNITSEL) { 27054 /* ioctl comes down the ill */ 27055 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif; 27056 ipif_refhold(ci.ci_ipif); 27057 } 27058 err = 0; 27059 ci.ci_sin = NULL; 27060 ci.ci_sin6 = NULL; 27061 ci.ci_lifr = NULL; 27062 } else { 27063 switch (ipip->ipi_cmd_type) { 27064 case IF_CMD: 27065 case LIF_CMD: 27066 extract_funcp = ip_extract_lifreq; 27067 break; 27068 27069 case ARP_CMD: 27070 case XARP_CMD: 27071 extract_funcp = ip_extract_arpreq; 27072 break; 27073 27074 case TUN_CMD: 27075 extract_funcp = ip_extract_tunreq; 27076 break; 27077 27078 case MSFILT_CMD: 27079 extract_funcp = ip_extract_msfilter; 27080 break; 27081 27082 default: 27083 ASSERT(0); 27084 } 27085 27086 err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); 27087 if (err != 0) { 27088 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27089 return; 27090 } 27091 27092 /* 27093 * All of the extraction functions return a refheld ipif. 27094 */ 27095 ASSERT(ci.ci_ipif != NULL); 27096 } 27097 27098 /* 27099 * If ipsq is non-null, we are already being called exclusively 27100 */ 27101 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 27102 if (!(ipip->ipi_flags & IPI_WR)) { 27103 /* 27104 * A return value of EINPROGRESS means the ioctl is 27105 * either queued and waiting for some reason or has 27106 * already completed. 27107 */ 27108 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 27109 ci.ci_lifr); 27110 if (ci.ci_ipif != NULL) 27111 ipif_refrele(ci.ci_ipif); 27112 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27113 return; 27114 } 27115 27116 ASSERT(ci.ci_ipif != NULL); 27117 27118 if (ipsq == NULL) { 27119 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, 27120 ip_process_ioctl, NEW_OP, B_TRUE); 27121 entered_ipsq = B_TRUE; 27122 } 27123 /* 27124 * Release the ipif so that ipif_down and friends that wait for 27125 * references to go away are not misled about the current ipif_refcnt 27126 * values. We are writer so we can access the ipif even after releasing 27127 * the ipif. 27128 */ 27129 ipif_refrele(ci.ci_ipif); 27130 if (ipsq == NULL) 27131 return; 27132 27133 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); 27134 27135 /* 27136 * For most set ioctls that come here, this serves as a single point 27137 * where we set the IPIF_CHANGING flag. This ensures that there won't 27138 * be any new references to the ipif. This helps functions that go 27139 * through this path and end up trying to wait for the refcnts 27140 * associated with the ipif to go down to zero. 
Some exceptions are 27141 * Failover, Failback, and Groupname commands that operate on more than 27142 * just the ci.ci_ipif. These commands internally determine the 27143 * set of ipif's they operate on and set and clear the IPIF_CHANGING 27144 * flags on that set. Another exception is the Removeif command that 27145 * sets the IPIF_CONDEMNED flag internally after identifying the right 27146 * ipif to operate on. 27147 */ 27148 mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); 27149 if (ipip->ipi_cmd != SIOCLIFREMOVEIF && 27150 ipip->ipi_cmd != SIOCLIFFAILOVER && 27151 ipip->ipi_cmd != SIOCLIFFAILBACK && 27152 ipip->ipi_cmd != SIOCSLIFGROUPNAME) 27153 (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; 27154 mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); 27155 27156 /* 27157 * A return value of EINPROGRESS means the ioctl is 27158 * either queued and waiting for some reason or has 27159 * already completed. 27160 */ 27161 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); 27162 27163 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 27164 27165 if (entered_ipsq) 27166 ipsq_exit(ipsq, B_TRUE, B_TRUE); 27167 } 27168 27169 /* 27170 * Complete the ioctl. Typically ioctls use the mi package and need to 27171 * do mi_copyout/mi_copy_done. 27172 */ 27173 void 27174 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) 27175 { 27176 conn_t *connp = NULL; 27177 27178 if (err == EINPROGRESS) 27179 return; 27180 27181 if (CONN_Q(q)) { 27182 connp = Q_TO_CONN(q); 27183 ASSERT(connp->conn_ref >= 2); 27184 } 27185 27186 switch (mode) { 27187 case COPYOUT: 27188 if (err == 0) 27189 mi_copyout(q, mp); 27190 else 27191 mi_copy_done(q, mp, err); 27192 break; 27193 27194 case NO_COPYOUT: 27195 mi_copy_done(q, mp, err); 27196 break; 27197 27198 default: 27199 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */ 27200 break; 27201 } 27202 27203 /* 27204 * The refhold placed at the start of the ioctl is released here. 27205 */ 27206 if (connp != NULL) 27207 CONN_OPER_PENDING_DONE(connp); 27208 27209 if (ipsq != NULL) 27210 ipsq_current_finish(ipsq); 27211 } 27212 27213 /* 27214 * This is called from ip_wput_nondata to resume a deferred TCP bind. 27215 */ 27216 /* ARGSUSED */ 27217 void 27218 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) 27219 { 27220 conn_t *connp = arg; 27221 tcp_t *tcp; 27222 27223 ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); 27224 tcp = connp->conn_tcp; 27225 27226 if (connp->conn_tcp->tcp_state == TCPS_CLOSED) 27227 freemsg(mp); 27228 else 27229 tcp_rput_other(tcp, mp); 27230 CONN_OPER_PENDING_DONE(connp); 27231 } 27232 27233 /* Called from ip_wput for all non data messages */ 27234 /* ARGSUSED */ 27235 void 27236 ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 27237 { 27238 mblk_t *mp1; 27239 ire_t *ire, *fake_ire; 27240 ill_t *ill; 27241 struct iocblk *iocp; 27242 ip_ioctl_cmd_t *ipip; 27243 cred_t *cr; 27244 conn_t *connp; 27245 int err; 27246 nce_t *nce; 27247 ipif_t *ipif; 27248 ip_stack_t *ipst; 27249 char *proto_str; 27250 27251 if (CONN_Q(q)) { 27252 connp = Q_TO_CONN(q); 27253 ipst = connp->conn_netstack->netstack_ip; 27254 } else { 27255 connp = NULL; 27256 ipst = ILLQ_TO_IPST(q); 27257 } 27258 27259 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q)); 27260 27261 switch (DB_TYPE(mp)) { 27262 case M_IOCTL: 27263 /* 27264 * IOCTL processing begins in ip_sioctl_copyin_setup which 27265 * will arrange to copy in associated control structures. 
27266 */ 27267 ip_sioctl_copyin_setup(q, mp); 27268 return; 27269 case M_IOCDATA: 27270 /* 27271 * Ensure that this is associated with one of our trans- 27272 * parent ioctls. If it's not ours, discard it if we're 27273 * running as a driver, or pass it on if we're a module. 27274 */ 27275 iocp = (struct iocblk *)mp->b_rptr; 27276 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27277 if (ipip == NULL) { 27278 if (q->q_next == NULL) { 27279 goto nak; 27280 } else { 27281 putnext(q, mp); 27282 } 27283 return; 27284 } 27285 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 27286 /* 27287 * the ioctl is one we recognise, but is not 27288 * consumed by IP as a module, pass M_IOCDATA 27289 * for processing downstream, but only for 27290 * common Streams ioctls. 27291 */ 27292 if (ipip->ipi_flags & IPI_PASS_DOWN) { 27293 putnext(q, mp); 27294 return; 27295 } else { 27296 goto nak; 27297 } 27298 } 27299 27300 /* IOCTL continuation following copyin or copyout. */ 27301 if (mi_copy_state(q, mp, NULL) == -1) { 27302 /* 27303 * The copy operation failed. mi_copy_state already 27304 * cleaned up, so we're out of here. 27305 */ 27306 return; 27307 } 27308 /* 27309 * If we just completed a copy in, we become writer and 27310 * continue processing in ip_sioctl_copyin_done. If it 27311 * was a copy out, we call mi_copyout again. If there is 27312 * nothing more to copy out, it will complete the IOCTL. 27313 */ 27314 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) { 27315 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { 27316 mi_copy_done(q, mp, EPROTO); 27317 return; 27318 } 27319 /* 27320 * Check for cases that need more copying. A return 27321 * value of 0 means a second copyin has been started, 27322 * so we return; a return value of 1 means no more 27323 * copying is needed, so we continue. 27324 */ 27325 if (ipip->ipi_cmd_type == MSFILT_CMD && 27326 MI_COPY_COUNT(mp) == 1) { 27327 if (ip_copyin_msfilter(q, mp) == 0) 27328 return; 27329 } 27330 /* 27331 * Refhold the conn, till the ioctl completes. This is 27332 * needed in case the ioctl ends up in the pending mp 27333 * list. Every mp in the ill_pending_mp list and 27334 * the ipsq_pending_mp must have a refhold on the conn 27335 * to resume processing. The refhold is released when 27336 * the ioctl completes. (normally or abnormally) 27337 * In all cases ip_ioctl_finish is called to finish 27338 * the ioctl. 27339 */ 27340 if (connp != NULL) { 27341 /* This is not a reentry */ 27342 ASSERT(ipsq == NULL); 27343 CONN_INC_REF(connp); 27344 } else { 27345 if (!(ipip->ipi_flags & IPI_MODOK)) { 27346 mi_copy_done(q, mp, EINVAL); 27347 return; 27348 } 27349 } 27350 27351 ip_process_ioctl(ipsq, q, mp, ipip); 27352 27353 } else { 27354 mi_copyout(q, mp); 27355 } 27356 return; 27357 nak: 27358 iocp->ioc_error = EINVAL; 27359 mp->b_datap->db_type = M_IOCNAK; 27360 iocp->ioc_count = 0; 27361 qreply(q, mp); 27362 return; 27363 27364 case M_IOCNAK: 27365 /* 27366 * The only way we could get here is if a resolver didn't like 27367 * an IOCTL we sent it. This shouldn't happen. 27368 */ 27369 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 27370 "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", 27371 ((struct iocblk *)mp->b_rptr)->ioc_cmd); 27372 freemsg(mp); 27373 return; 27374 case M_IOCACK: 27375 /* /dev/ip shouldn't see this */ 27376 if (CONN_Q(q)) 27377 goto nak; 27378 27379 /* Finish socket ioctls passed through to ARP. 
*/ 27380 ip_sioctl_iocack(q, mp); 27381 return; 27382 case M_FLUSH: 27383 if (*mp->b_rptr & FLUSHW) 27384 flushq(q, FLUSHALL); 27385 if (q->q_next) { 27386 putnext(q, mp); 27387 return; 27388 } 27389 if (*mp->b_rptr & FLUSHR) { 27390 *mp->b_rptr &= ~FLUSHW; 27391 qreply(q, mp); 27392 return; 27393 } 27394 freemsg(mp); 27395 return; 27396 case IRE_DB_REQ_TYPE: 27397 if (connp == NULL) { 27398 proto_str = "IRE_DB_REQ_TYPE"; 27399 goto protonak; 27400 } 27401 /* An Upper Level Protocol wants a copy of an IRE. */ 27402 ip_ire_req(q, mp); 27403 return; 27404 case M_CTL: 27405 if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) 27406 break; 27407 27408 if (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == 27409 TUN_HELLO) { 27410 ASSERT(connp != NULL); 27411 connp->conn_flags |= IPCL_IPTUN; 27412 freeb(mp); 27413 return; 27414 } 27415 27416 /* M_CTL messages are used by ARP to tell us things. */ 27417 if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) 27418 break; 27419 switch (((arc_t *)mp->b_rptr)->arc_cmd) { 27420 case AR_ENTRY_SQUERY: 27421 ip_wput_ctl(q, mp); 27422 return; 27423 case AR_CLIENT_NOTIFY: 27424 ip_arp_news(q, mp); 27425 return; 27426 case AR_DLPIOP_DONE: 27427 ASSERT(q->q_next != NULL); 27428 ill = (ill_t *)q->q_ptr; 27429 /* qwriter_ip releases the refhold */ 27430 /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ 27431 ill_refhold(ill); 27432 qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); 27433 return; 27434 case AR_ARP_CLOSING: 27435 /* 27436 * ARP (above us) is closing. If no ARP bringup is 27437 * currently pending, ack the message so that ARP 27438 * can complete its close. Also mark ill_arp_closing 27439 * so that new ARP bringups will fail. If any 27440 * ARP bringup is currently in progress, we will 27441 * ack this when the current ARP bringup completes. 27442 */ 27443 ASSERT(q->q_next != NULL); 27444 ill = (ill_t *)q->q_ptr; 27445 mutex_enter(&ill->ill_lock); 27446 ill->ill_arp_closing = 1; 27447 if (!ill->ill_arp_bringup_pending) { 27448 mutex_exit(&ill->ill_lock); 27449 qreply(q, mp); 27450 } else { 27451 mutex_exit(&ill->ill_lock); 27452 freemsg(mp); 27453 } 27454 return; 27455 case AR_ARP_EXTEND: 27456 /* 27457 * The ARP module above us is capable of duplicate 27458 * address detection. Old ATM drivers will not send 27459 * this message. 27460 */ 27461 ASSERT(q->q_next != NULL); 27462 ill = (ill_t *)q->q_ptr; 27463 ill->ill_arp_extend = B_TRUE; 27464 freemsg(mp); 27465 return; 27466 default: 27467 break; 27468 } 27469 break; 27470 case M_PROTO: 27471 case M_PCPROTO: 27472 /* 27473 * The only PROTO messages we expect are ULP binds and 27474 * copies of option negotiation acknowledgements. 27475 */ 27476 switch (((union T_primitives *)mp->b_rptr)->type) { 27477 case O_T_BIND_REQ: 27478 case T_BIND_REQ: { 27479 /* Request can get queued in bind */ 27480 if (connp == NULL) { 27481 proto_str = "O_T_BIND_REQ/T_BIND_REQ"; 27482 goto protonak; 27483 } 27484 /* 27485 * The transports except SCTP call ip_bind_{v4,v6}() 27486 * directly instead of a putnext. SCTP doesn't 27487 * generate any T_BIND_REQ since it has its own 27488 * fanout data structures. However, ESP and AH 27489 * come in for regular binds; all other cases are 27490 * bind retries. 27491 */ 27492 ASSERT(!IPCL_IS_SCTP(connp)); 27493 27494 /* Don't increment refcnt if this is a re-entry */ 27495 if (ipsq == NULL) 27496 CONN_INC_REF(connp); 27497 27498 mp = connp->conn_af_isv6 ?
ip_bind_v6(q, mp, 27499 connp, NULL) : ip_bind_v4(q, mp, connp); 27500 if (mp == NULL) 27501 return; 27502 if (IPCL_IS_TCP(connp)) { 27503 /* 27504 * In the case of TCP endpoint we 27505 * come here only for bind retries 27506 */ 27507 ASSERT(ipsq != NULL); 27508 CONN_INC_REF(connp); 27509 squeue_fill(connp->conn_sqp, mp, 27510 ip_resume_tcp_bind, connp, 27511 SQTAG_BIND_RETRY); 27512 } else if (IPCL_IS_UDP(connp)) { 27513 /* 27514 * In the case of UDP endpoint we 27515 * come here only for bind retries 27516 */ 27517 ASSERT(ipsq != NULL); 27518 udp_resume_bind(connp, mp); 27519 } else if (IPCL_IS_RAWIP(connp)) { 27520 /* 27521 * In the case of RAWIP endpoint we 27522 * come here only for bind retries 27523 */ 27524 ASSERT(ipsq != NULL); 27525 rawip_resume_bind(connp, mp); 27526 } else { 27527 /* The case of AH and ESP */ 27528 qreply(q, mp); 27529 CONN_OPER_PENDING_DONE(connp); 27530 } 27531 return; 27532 } 27533 case T_SVR4_OPTMGMT_REQ: 27534 ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", 27535 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); 27536 27537 if (connp == NULL) { 27538 proto_str = "T_SVR4_OPTMGMT_REQ"; 27539 goto protonak; 27540 } 27541 27542 if (!snmpcom_req(q, mp, ip_snmp_set, 27543 ip_snmp_get, cr)) { 27544 /* 27545 * Call svr4_optcom_req so that it can 27546 * generate the ack. We don't come here 27547 * if this operation is being restarted. 27548 * ip_restart_optmgmt will drop the conn ref. 27549 * In the case of ipsec option after the ipsec 27550 * load is complete conn_restart_ipsec_waiter 27551 * drops the conn ref. 27552 */ 27553 ASSERT(ipsq == NULL); 27554 CONN_INC_REF(connp); 27555 if (ip_check_for_ipsec_opt(q, mp)) 27556 return; 27557 err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, 27558 B_FALSE); 27559 if (err != EINPROGRESS) { 27560 /* Operation is done */ 27561 CONN_OPER_PENDING_DONE(connp); 27562 } 27563 } 27564 return; 27565 case T_OPTMGMT_REQ: 27566 ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); 27567 /* 27568 * Note: No snmpcom_req support through new 27569 * T_OPTMGMT_REQ. 27570 * Call tpi_optcom_req so that it can 27571 * generate the ack. 27572 */ 27573 if (connp == NULL) { 27574 proto_str = "T_OPTMGMT_REQ"; 27575 goto protonak; 27576 } 27577 27578 ASSERT(ipsq == NULL); 27579 /* 27580 * We don't come here for restart. ip_restart_optmgmt 27581 * will drop the conn ref. In the case of ipsec option 27582 * after the ipsec load is complete 27583 * conn_restart_ipsec_waiter drops the conn ref. 27584 */ 27585 CONN_INC_REF(connp); 27586 if (ip_check_for_ipsec_opt(q, mp)) 27587 return; 27588 err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); 27589 if (err != EINPROGRESS) { 27590 /* Operation is done */ 27591 CONN_OPER_PENDING_DONE(connp); 27592 } 27593 return; 27594 case T_UNBIND_REQ: 27595 if (connp == NULL) { 27596 proto_str = "T_UNBIND_REQ"; 27597 goto protonak; 27598 } 27599 mp = ip_unbind(q, mp); 27600 qreply(q, mp); 27601 return; 27602 default: 27603 /* 27604 * Have to drop any DLPI messages coming down from 27605 * arp (such as an info_req which would cause ip 27606 * to receive an extra info_ack if it was passed 27607 * through. 27608 */ 27609 ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", 27610 (int)*(uint_t *)mp->b_rptr)); 27611 freemsg(mp); 27612 return; 27613 } 27614 /* NOTREACHED */ 27615 case IRE_DB_TYPE: { 27616 nce_t *nce; 27617 ill_t *ill; 27618 in6_addr_t gw_addr_v6; 27619 27620 27621 /* 27622 * This is a response back from a resolver. 
It 27623 * consists of a message chain containing: 27624 * IRE_MBLK-->LL_HDR_MBLK->pkt 27625 * The IRE_MBLK is the one we allocated in ip_newroute. 27626 * The LL_HDR_MBLK is the DLPI header to use to get 27627 * the attached packet, and subsequent ones for the 27628 * same destination, transmitted. 27629 */ 27630 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ 27631 break; 27632 /* 27633 * First, check to make sure the resolution succeeded. 27634 * If it failed, the second mblk will be empty. 27635 * If it is, free the chain, dropping the packet. 27636 * (We must ire_delete the ire; that frees the ire mblk) 27637 * We're doing this now to support PVCs for ATM; it's 27638 * a partial xresolv implementation. When we fully implement 27639 * xresolv interfaces, instead of freeing everything here 27640 * we'll initiate neighbor discovery. 27641 * 27642 * For v4 (ARP and other external resolvers) the resolver 27643 * frees the message, so no check is needed. This check 27644 * is required, though, for a full xresolve implementation. 27645 * Including this code here now both shows how external 27646 * resolvers can NACK a resolution request using an 27647 * existing design that has no specific provisions for NACKs, 27648 * and also takes into account that the current non-ARP 27649 * external resolver has been coded to use this method of 27650 * NACKing for all IPv6 (xresolv) cases, 27651 * whether our xresolv implementation is complete or not. 27652 * 27653 */ 27654 ire = (ire_t *)mp->b_rptr; 27655 ill = ire_to_ill(ire); 27656 mp1 = mp->b_cont; /* dl_unitdata_req */ 27657 if (mp1->b_rptr == mp1->b_wptr) { 27658 if (ire->ire_ipversion == IPV6_VERSION) { 27659 /* 27660 * XRESOLV interface. 27661 */ 27662 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27663 mutex_enter(&ire->ire_lock); 27664 gw_addr_v6 = ire->ire_gateway_addr_v6; 27665 mutex_exit(&ire->ire_lock); 27666 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27667 nce = ndp_lookup_v6(ill, 27668 &ire->ire_addr_v6, B_FALSE); 27669 } else { 27670 nce = ndp_lookup_v6(ill, &gw_addr_v6, 27671 B_FALSE); 27672 } 27673 if (nce != NULL) { 27674 nce_resolv_failed(nce); 27675 ndp_delete(nce); 27676 NCE_REFRELE(nce); 27677 } 27678 } 27679 mp->b_cont = NULL; 27680 freemsg(mp1); /* frees the pkt as well */ 27681 ASSERT(ire->ire_nce == NULL); 27682 ire_delete((ire_t *)mp->b_rptr); 27683 return; 27684 } 27685 27686 /* 27687 * Split them into IRE_MBLK and pkt and feed it into 27688 * ire_add_then_send. Then in ire_add_then_send 27689 * the IRE will be added, and then the packet will be 27690 * run back through ip_wput. This time it will make 27691 * it to the wire. 27692 */ 27693 mp->b_cont = NULL; 27694 mp = mp1->b_cont; /* now, mp points to pkt */ 27695 mp1->b_cont = NULL; 27696 ip1dbg(("ip_wput_nondata: reply from external resolver \n")); 27697 if (ire->ire_ipversion == IPV6_VERSION) { 27698 /* 27699 * XRESOLV interface. Find the nce and put a copy 27700 * of the dl_unitdata_req in nce_res_mp 27701 */ 27702 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27703 mutex_enter(&ire->ire_lock); 27704 gw_addr_v6 = ire->ire_gateway_addr_v6; 27705 mutex_exit(&ire->ire_lock); 27706 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27707 nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, 27708 B_FALSE); 27709 } else { 27710 nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); 27711 } 27712 if (nce != NULL) { 27713 /* 27714 * We have to protect nce_res_mp here 27715 * from being accessed by other threads 27716 * while we change the mblk pointer. 
27717 * Other functions will also lock the nce when 27718 * accessing nce_res_mp. 27719 * 27720 * The reason we change the mblk pointer 27721 * here rather than copying the resolved address 27722 * into the template is that, unlike with 27723 * ethernet, we have no guarantee that the 27724 * resolved address length will be 27725 * smaller than or equal to the lla length 27726 * with which the template was allocated, 27727 * (for ethernet, they're equal) 27728 * so we have to use the actual resolved 27729 * address mblk - which holds the real 27730 * dl_unitdata_req with the resolved address. 27731 * 27732 * Doing this is the same behavior as was 27733 * previously used in the v4 ARP case. 27734 */ 27735 mutex_enter(&nce->nce_lock); 27736 if (nce->nce_res_mp != NULL) 27737 freemsg(nce->nce_res_mp); 27738 nce->nce_res_mp = mp1; 27739 mutex_exit(&nce->nce_lock); 27740 /* 27741 * We do a fastpath probe here because 27742 * we have resolved the address without 27743 * using Neighbor Discovery. 27744 * In the non-XRESOLV v6 case, the fastpath 27745 * probe is done right after neighbor 27746 * discovery completes. 27747 */ 27748 if (nce->nce_res_mp != NULL) { 27749 int res; 27750 nce_fastpath_list_add(nce); 27751 res = ill_fastpath_probe(ill, 27752 nce->nce_res_mp); 27753 if (res != 0 && res != EAGAIN) 27754 nce_fastpath_list_delete(nce); 27755 } 27756 27757 ire_add_then_send(q, ire, mp); 27758 /* 27759 * Now we have to clean out any packets 27760 * that may have been queued on the nce 27761 * while it was waiting for address resolution 27762 * to complete. 27763 */ 27764 mutex_enter(&nce->nce_lock); 27765 mp1 = nce->nce_qd_mp; 27766 nce->nce_qd_mp = NULL; 27767 mutex_exit(&nce->nce_lock); 27768 while (mp1 != NULL) { 27769 mblk_t *nxt_mp; 27770 queue_t *fwdq = NULL; 27771 ill_t *inbound_ill; 27772 uint_t ifindex; 27773 27774 nxt_mp = mp1->b_next; 27775 mp1->b_next = NULL; 27776 /* 27777 * Retrieve ifindex stored in 27778 * ip_rput_data_v6() 27779 */ 27780 ifindex = 27781 (uint_t)(uintptr_t)mp1->b_prev; 27782 inbound_ill = 27783 ill_lookup_on_ifindex(ifindex, 27784 B_TRUE, NULL, NULL, NULL, 27785 NULL, ipst); 27786 mp1->b_prev = NULL; 27787 if (inbound_ill != NULL) 27788 fwdq = inbound_ill->ill_rq; 27789 27790 if (fwdq != NULL) { 27791 put(fwdq, mp1); 27792 ill_refrele(inbound_ill); 27793 } else 27794 put(WR(ill->ill_rq), mp1); 27795 mp1 = nxt_mp; 27796 } 27797 NCE_REFRELE(nce); 27798 } else { /* nce is NULL; clean up */ 27799 ire_delete(ire); 27800 freemsg(mp); 27801 freemsg(mp1); 27802 return; 27803 } 27804 } else { 27805 nce_t *arpce; 27806 /* 27807 * Link layer resolution succeeded. Recompute the 27808 * ire_nce. 27809 */ 27810 ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); 27811 if ((arpce = ndp_lookup_v4(ill, 27812 (ire->ire_gateway_addr != INADDR_ANY ? 27813 &ire->ire_gateway_addr : &ire->ire_addr), 27814 B_FALSE)) == NULL) { 27815 freeb(ire->ire_mp); 27816 freeb(mp1); 27817 freemsg(mp); 27818 return; 27819 } 27820 mutex_enter(&arpce->nce_lock); 27821 arpce->nce_last = TICK_TO_MSEC(lbolt64); 27822 if (arpce->nce_state == ND_REACHABLE) { 27823 /* 27824 * Someone resolved this before us; 27825 * cleanup the res_mp. Since ire has 27826 * not been added yet, the call to ire_add_v4 27827 * from ire_add_then_send (when a dup is 27828 * detected) will clean up the ire. 
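 * (Put differently: when two resolutions race, the first one to mark
 * the nce ND_REACHABLE wins, and the loser merely discards its own
 * copy of the resolved link-layer header mblk.)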
27829 */ 27830 freeb(mp1); 27831 } else { 27832 ASSERT(arpce->nce_res_mp == NULL); 27833 arpce->nce_res_mp = mp1; 27834 arpce->nce_state = ND_REACHABLE; 27835 } 27836 mutex_exit(&arpce->nce_lock); 27837 if (ire->ire_marks & IRE_MARK_NOADD) { 27838 /* 27839 * This ire will not be added to the ire 27840 * cache table, so we can set the ire_nce 27841 * here, as there are no atomicity constraints. 27842 */ 27843 ire->ire_nce = arpce; 27844 /* 27845 * We are associating this nce with the ire 27846 * so change the nce ref taken in 27847 * ndp_lookup_v4() from 27848 * NCE_REFHOLD to NCE_REFHOLD_NOTR 27849 */ 27850 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 27851 } else { 27852 NCE_REFRELE(arpce); 27853 } 27854 ire_add_then_send(q, ire, mp); 27855 } 27856 return; /* All is well, the packet has been sent. */ 27857 } 27858 case IRE_ARPRESOLVE_TYPE: { 27859 27860 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ 27861 break; 27862 mp1 = mp->b_cont; /* dl_unitdata_req */ 27863 mp->b_cont = NULL; 27864 /* 27865 * First, check to make sure the resolution succeeded. 27866 * If it failed, the second mblk will be empty. 27867 */ 27868 if (mp1->b_rptr == mp1->b_wptr) { 27869 /* cleanup the incomplete ire, free queued packets */ 27870 freemsg(mp); /* fake ire */ 27871 freeb(mp1); /* dl_unitdata response */ 27872 return; 27873 } 27874 27875 /* 27876 * Update any incomplete nce_t found. We lookup the ctable 27877 * and find the nce from the ire->ire_nce because we need 27878 * to pass the ire to ip_xmit_v4 later, and can find both 27879 * ire and nce in one lookup from the ctable. 27880 */ 27881 fake_ire = (ire_t *)mp->b_rptr; 27882 /* 27883 * By the time we come back here from ARP 27884 * the logical outgoing interface of the incomplete ire 27885 * we added in ire_forward could have disappeared, 27886 * causing the incomplete ire to also have 27887 * disappeared. So we need to retrieve the 27888 * proper ipif for the ire before looking 27889 * in ctable; do the ctable lookup based on ire_ipif_seqid 27890 */ 27891 ill = q->q_ptr; 27892 27893 /* Get the outgoing ipif */ 27894 mutex_enter(&ill->ill_lock); 27895 if (ill->ill_state_flags & ILL_CONDEMNED) { 27896 mutex_exit(&ill->ill_lock); 27897 freemsg(mp); /* fake ire */ 27898 freeb(mp1); /* dl_unitdata response */ 27899 return; 27900 } 27901 ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); 27902 27903 if (ipif == NULL) { 27904 mutex_exit(&ill->ill_lock); 27905 ip1dbg(("logical intrf to incomplete ire vanished\n")); 27906 freemsg(mp); 27907 freeb(mp1); 27908 return; 27909 } 27910 ipif_refhold_locked(ipif); 27911 mutex_exit(&ill->ill_lock); 27912 ire = ire_ctable_lookup(fake_ire->ire_addr, 27913 fake_ire->ire_gateway_addr, IRE_CACHE, 27914 ipif, fake_ire->ire_zoneid, NULL, 27915 (MATCH_IRE_GW|MATCH_IRE_IPIF|MATCH_IRE_ZONEONLY| 27916 MATCH_IRE_TYPE), ipst); 27917 ipif_refrele(ipif); 27918 if (ire == NULL) { 27919 /* 27920 * No ire was found; check if there is an nce 27921 * for this lookup; if it has no ire's pointing at it, 27922 * clean up. 27923 */ 27924 if ((nce = ndp_lookup_v4(ill, 27925 (fake_ire->ire_gateway_addr != INADDR_ANY ? 27926 &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), 27927 B_FALSE)) != NULL) { 27928 /* 27929 * cleanup: 27930 * We check for refcnt 2 (one for the nce 27931 * hash list + 1 for the ref taken by 27932 * ndp_lookup_v4) to check that there are 27933 * no ire's pointing at the nce.
27934 */ 27935 if (nce->nce_refcnt == 2) 27936 ndp_delete(nce); 27937 NCE_REFRELE(nce); 27938 } 27939 freeb(mp1); /* dl_unitdata response */ 27940 freemsg(mp); /* fake ire */ 27941 return; 27942 } 27943 nce = ire->ire_nce; 27944 DTRACE_PROBE2(ire__arpresolve__type, 27945 ire_t *, ire, nce_t *, nce); 27946 ASSERT(nce->nce_state != ND_INITIAL); 27947 mutex_enter(&nce->nce_lock); 27948 nce->nce_last = TICK_TO_MSEC(lbolt64); 27949 if (nce->nce_state == ND_REACHABLE) { 27950 /* 27951 * Someone resolved this before us; 27952 * our response is not needed any more. 27953 */ 27954 mutex_exit(&nce->nce_lock); 27955 freeb(mp1); /* dl_unitdata response */ 27956 } else { 27957 ASSERT(nce->nce_res_mp == NULL); 27958 nce->nce_res_mp = mp1; 27959 nce->nce_state = ND_REACHABLE; 27960 mutex_exit(&nce->nce_lock); 27961 nce_fastpath(nce); 27962 } 27963 /* 27964 * The cached nce_t has been updated to be reachable; 27965 * Clear the IRE_MARK_UNCACHED flag and free the fake_ire. 27966 */ 27967 fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; 27968 freemsg(mp); 27969 /* 27970 * send out queued packets. 27971 */ 27972 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 27973 27974 IRE_REFRELE(ire); 27975 return; 27976 } 27977 default: 27978 break; 27979 } 27980 if (q->q_next) { 27981 putnext(q, mp); 27982 } else 27983 freemsg(mp); 27984 return; 27985 27986 protonak: 27987 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); 27988 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) 27989 qreply(q, mp); 27990 } 27991 27992 /* 27993 * Process IP options in an outbound packet. Modify the destination if there 27994 * is a source route option. 27995 * Returns non-zero if something fails in which case an ICMP error has been 27996 * sent and mp freed. 27997 */ 27998 static int 27999 ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, 28000 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 28001 { 28002 ipoptp_t opts; 28003 uchar_t *opt; 28004 uint8_t optval; 28005 uint8_t optlen; 28006 ipaddr_t dst; 28007 intptr_t code = 0; 28008 mblk_t *mp; 28009 ire_t *ire = NULL; 28010 28011 ip2dbg(("ip_wput_options\n")); 28012 mp = ipsec_mp; 28013 if (mctl_present) { 28014 mp = ipsec_mp->b_cont; 28015 } 28016 28017 dst = ipha->ipha_dst; 28018 for (optval = ipoptp_first(&opts, ipha); 28019 optval != IPOPT_EOL; 28020 optval = ipoptp_next(&opts)) { 28021 opt = opts.ipoptp_cur; 28022 optlen = opts.ipoptp_len; 28023 ip2dbg(("ip_wput_options: opt %d, len %d\n", 28024 optval, optlen)); 28025 switch (optval) { 28026 uint32_t off; 28027 case IPOPT_SSRR: 28028 case IPOPT_LSRR: 28029 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28030 ip1dbg(( 28031 "ip_wput_options: bad option offset\n")); 28032 code = (char *)&opt[IPOPT_OLEN] - 28033 (char *)ipha; 28034 goto param_prob; 28035 } 28036 off = opt[IPOPT_OFFSET]; 28037 ip1dbg(("ip_wput_options: next hop 0x%x\n", 28038 ntohl(dst))); 28039 /* 28040 * For strict: verify that dst is directly 28041 * reachable. 
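 * reachable.
 *
 * For reference (RFC 791), the source route options parsed here are
 * laid out as follows, indexed from the start of the option:
 *
 *	opt[IPOPT_OPTVAL]	option type: LSRR (0x83) or SSRR (0x89)
 *	opt[IPOPT_OLEN]		total length of the option in bytes
 *	opt[IPOPT_OFFSET]	pointer to the next address slot;
 *				the minimum legal value is 4
 *	opt[3] onwards		the list of 4-byte gateway addresses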
28042 */ 28043 if (optval == IPOPT_SSRR) { 28044 ire = ire_ftable_lookup(dst, 0, 0, 28045 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 28046 MBLK_GETLABEL(mp), 28047 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 28048 if (ire == NULL) { 28049 ip1dbg(("ip_wput_options: SSRR not" 28050 " directly reachable: 0x%x\n", 28051 ntohl(dst))); 28052 goto bad_src_route; 28053 } 28054 ire_refrele(ire); 28055 } 28056 break; 28057 case IPOPT_RR: 28058 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28059 ip1dbg(( 28060 "ip_wput_options: bad option offset\n")); 28061 code = (char *)&opt[IPOPT_OLEN] - 28062 (char *)ipha; 28063 goto param_prob; 28064 } 28065 break; 28066 case IPOPT_TS: 28067 /* 28068 * Verify that length >=5 and that there is either 28069 * room for another timestamp or that the overflow 28070 * counter is not maxed out. 28071 */ 28072 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 28073 if (optlen < IPOPT_MINLEN_IT) { 28074 goto param_prob; 28075 } 28076 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28077 ip1dbg(( 28078 "ip_wput_options: bad option offset\n")); 28079 code = (char *)&opt[IPOPT_OFFSET] - 28080 (char *)ipha; 28081 goto param_prob; 28082 } 28083 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 28084 case IPOPT_TS_TSONLY: 28085 off = IPOPT_TS_TIMELEN; 28086 break; 28087 case IPOPT_TS_TSANDADDR: 28088 case IPOPT_TS_PRESPEC: 28089 case IPOPT_TS_PRESPEC_RFC791: 28090 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 28091 break; 28092 default: 28093 code = (char *)&opt[IPOPT_POS_OV_FLG] - 28094 (char *)ipha; 28095 goto param_prob; 28096 } 28097 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 28098 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 28099 /* 28100 * No room and the overflow counter is 15 28101 * already. 28102 */ 28103 goto param_prob; 28104 } 28105 break; 28106 } 28107 } 28108 28109 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) 28110 return (0); 28111 28112 ip1dbg(("ip_wput_options: error processing IP options.")); 28113 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 28114 28115 param_prob: 28116 /* 28117 * Since ip_wput() isn't close to finished, we fill 28118 * in enough of the header for credible error reporting. 28119 */ 28120 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28121 /* Failed */ 28122 freemsg(ipsec_mp); 28123 return (-1); 28124 } 28125 icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); 28126 return (-1); 28127 28128 bad_src_route: 28129 /* 28130 * Since ip_wput() isn't close to finished, we fill 28131 * in enough of the header for credible error reporting. 28132 */ 28133 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28134 /* Failed */ 28135 freemsg(ipsec_mp); 28136 return (-1); 28137 } 28138 icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 28139 return (-1); 28140 } 28141 28142 /* 28143 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT. 28144 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads 28145 * thru /etc/system. 28146 */ 28147 #define CONN_MAXDRAINCNT 64 28148 28149 static void 28150 conn_drain_init(ip_stack_t *ipst) 28151 { 28152 int i; 28153 28154 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; 28155 28156 if ((ipst->ips_conn_drain_list_cnt == 0) || 28157 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) { 28158 /* 28159 * Default value of the number of drainers is the 28160 * number of cpus, subject to maximum of 8 drainers. 
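 *
 * For example, a system that boots with 4 CPUs gets 4 drain lists,
 * while anything with 8 or more CPUs is capped at 8
 * (CONN_MAXDRAINCNT only bounds an explicitly tuned value).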
28161 */ 28162 if (boot_max_ncpus != -1) 28163 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8); 28164 else 28165 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); 28166 } 28167 28168 ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt * 28169 sizeof (idl_t), KM_SLEEP); 28170 28171 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28172 mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL, 28173 MUTEX_DEFAULT, NULL); 28174 } 28175 } 28176 28177 static void 28178 conn_drain_fini(ip_stack_t *ipst) 28179 { 28180 int i; 28181 28182 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) 28183 mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock); 28184 kmem_free(ipst->ips_conn_drain_list, 28185 ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); 28186 ipst->ips_conn_drain_list = NULL; 28187 } 28188 28189 /* 28190 * Note: For an overview of how flowcontrol is handled in IP please see the 28191 * IP Flowcontrol notes at the top of this file. 28192 * 28193 * Flow control has blocked us from proceeding. Insert the given conn in one 28194 * of the conn drain lists. These conn wq's will be qenabled later on when 28195 * STREAMS flow control does a backenable. conn_walk_drain will enable 28196 * the first conn in each of these drain lists. Each of these qenabled conns 28197 * in turn enables the next in the list, after it runs, or when it closes, 28198 * thus sustaining the drain process. 28199 * 28200 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> 28201 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert 28202 * running at any time, on a given conn, since there can be only 1 service proc 28203 * running on a queue at any time. 28204 */ 28205 void 28206 conn_drain_insert(conn_t *connp) 28207 { 28208 idl_t *idl; 28209 uint_t index; 28210 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28211 28212 mutex_enter(&connp->conn_lock); 28213 if (connp->conn_state_flags & CONN_CLOSING) { 28214 /* 28215 * The conn is closing as a result of which CONN_CLOSING 28216 * is set. Return. 28217 */ 28218 mutex_exit(&connp->conn_lock); 28219 return; 28220 } else if (connp->conn_idl == NULL) { 28221 /* 28222 * Assign the next drain list round robin. We don't use 28223 * a lock, and thus it may not be strictly round robin. 28224 * Atomicity of load/stores is enough to make sure that 28225 * conn_drain_list_index is always within bounds. 28226 */ 28227 index = ipst->ips_conn_drain_list_index; 28228 ASSERT(index < ipst->ips_conn_drain_list_cnt); 28229 connp->conn_idl = &ipst->ips_conn_drain_list[index]; 28230 index++; 28231 if (index == ipst->ips_conn_drain_list_cnt) 28232 index = 0; 28233 ipst->ips_conn_drain_list_index = index; 28234 } 28235 mutex_exit(&connp->conn_lock); 28236 28237 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28238 if ((connp->conn_drain_prev != NULL) || 28239 (connp->conn_state_flags & CONN_CLOSING)) { 28240 /* 28241 * The conn is already in the drain list, OR 28242 * the conn is closing. We need to check for 28243 * the closing case again since close can happen 28244 * after we drop the conn_lock, and before we 28245 * acquire the CONN_DRAIN_LIST_LOCK. 28246 */ 28247 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28248 return; 28249 } else { 28250 idl = connp->conn_idl; 28251 } 28252 28253 /* 28254 * The conn is not in the drain list. Insert it at the 28255 * tail of the drain list. The drain list is circular 28256 * and doubly linked. idl_conn points to the 1st element 28257 * in the list.
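 *
 * For illustration, with three conns A, B and C on one list:
 *
 *	idl_conn --> A <--> B <--> C
 *	             ^             |
 *	             +-------------+
 *
 * (conn_drain_next/conn_drain_prev form the links; A is the head
 * and C, reachable as A->conn_drain_prev, is the tail.)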
28258 */ 28259 if (idl->idl_conn == NULL) { 28260 idl->idl_conn = connp; 28261 connp->conn_drain_next = connp; 28262 connp->conn_drain_prev = connp; 28263 } else { 28264 conn_t *head = idl->idl_conn; 28265 28266 connp->conn_drain_next = head; 28267 connp->conn_drain_prev = head->conn_drain_prev; 28268 head->conn_drain_prev->conn_drain_next = connp; 28269 head->conn_drain_prev = connp; 28270 } 28271 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28272 } 28273 28274 /* 28275 * This conn is closing, and we are called from ip_close. OR 28276 * This conn has been serviced by ip_wsrv, and we need to do the tail 28277 * processing. 28278 * If this conn is part of the drain list, we may need to sustain the drain 28279 * process by qenabling the next conn in the drain list. We may also need to 28280 * remove this conn from the list, if it is done. 28281 */ 28282 static void 28283 conn_drain_tail(conn_t *connp, boolean_t closing) 28284 { 28285 idl_t *idl; 28286 28287 /* 28288 * connp->conn_idl is stable at this point, and no lock is needed 28289 * to check it. If we are called from ip_close, close has already 28290 * set CONN_CLOSING, thus freezing the value of conn_idl, and 28291 * called us only because conn_idl is non-null. If we are called thru 28292 * service, conn_idl could be null, but it cannot change because 28293 * service is single-threaded per queue, and there cannot be another 28294 * instance of service trying to call conn_drain_insert on this conn 28295 * now. 28296 */ 28297 ASSERT(!closing || (connp->conn_idl != NULL)); 28298 28299 /* 28300 * If connp->conn_idl is null, the conn has not been inserted into any 28301 * drain list even once since creation of the conn. Just return. 28302 */ 28303 if (connp->conn_idl == NULL) 28304 return; 28305 28306 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28307 28308 if (connp->conn_drain_prev == NULL) { 28309 /* This conn is currently not in the drain list. */ 28310 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28311 return; 28312 } 28313 idl = connp->conn_idl; 28314 if (idl->idl_conn_draining == connp) { 28315 /* 28316 * This conn is the current drainer. If this is the last conn 28317 * in the drain list, we need to do more checks, in the 'if' 28318 * below. Otherwise we need to just qenable the next conn 28319 * to sustain the draining, which is handled in the 'else' 28320 * below. 28321 */ 28322 if (connp->conn_drain_next == idl->idl_conn) { 28323 /* 28324 * This conn is the last in this list. This round 28325 * of draining is complete. If idl_repeat is set, 28326 * it means another flow enabling has happened from 28327 * the driver/streams and we need to do another round 28328 * of draining. 28329 * If there are more than 2 conns in the drain list, 28330 * do a left rotate by 1, so that all conns except the 28331 * conn at the head move towards the head by 1, and 28332 * the conn at the head goes to the tail. This attempts 28333 * a more even share for all queues that are being 28334 * drained. 28335 */ 28336 if ((connp->conn_drain_next != connp) && 28337 (idl->idl_conn->conn_drain_next != connp)) { 28338 idl->idl_conn = idl->idl_conn->conn_drain_next; 28339 } 28340 if (idl->idl_repeat) { 28341 qenable(idl->idl_conn->conn_wq); 28342 idl->idl_conn_draining = idl->idl_conn; 28343 idl->idl_repeat = 0; 28344 } else { 28345 idl->idl_conn_draining = NULL; 28346 } 28347 } else { 28348 /* 28349 * If the next queue that we are now qenable'ing 28350 * is closing, it will remove itself from this list 28351 * and qenable the subsequent queue in ip_close().
28352 * Serialization is achieved thru idl_lock. 28353 */ 28354 qenable(connp->conn_drain_next->conn_wq); 28355 idl->idl_conn_draining = connp->conn_drain_next; 28356 } 28357 } 28358 if (!connp->conn_did_putbq || closing) { 28359 /* 28360 * Remove ourself from the drain list, if we did not do 28361 * a putbq, or if the conn is closing. 28362 * Note: It is possible that q->q_first is non-null. It means 28363 * that these messages landed after we did an enableok() in 28364 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to 28365 * service them. 28366 */ 28367 if (connp->conn_drain_next == connp) { 28368 /* Singleton in the list */ 28369 ASSERT(connp->conn_drain_prev == connp); 28370 idl->idl_conn = NULL; 28371 idl->idl_conn_draining = NULL; 28372 } else { 28373 connp->conn_drain_prev->conn_drain_next = 28374 connp->conn_drain_next; 28375 connp->conn_drain_next->conn_drain_prev = 28376 connp->conn_drain_prev; 28377 if (idl->idl_conn == connp) 28378 idl->idl_conn = connp->conn_drain_next; 28379 ASSERT(idl->idl_conn_draining != connp); 28380 28381 } 28382 connp->conn_drain_next = NULL; 28383 connp->conn_drain_prev = NULL; 28384 } 28385 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28386 } 28387 28388 /* 28389 * Write service routine. Shared perimeter entry point. 28390 * ip_wsrv can be called in any of the following ways. 28391 * 1. The device queue's messages have fallen below the low water mark 28392 * and STREAMS has backenabled the ill_wq. We walk thru all the 28393 * drain lists and backenable the first conn in each list. 28394 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the 28395 * qenabled non-tcp upper layers. We start dequeuing messages and call 28396 * ip_wput for each message. 28397 */ 28398 28399 void 28400 ip_wsrv(queue_t *q) 28401 { 28402 conn_t *connp; 28403 ill_t *ill; 28404 mblk_t *mp; 28405 28406 if (q->q_next) { 28407 ill = (ill_t *)q->q_ptr; 28408 if (ill->ill_state_flags == 0) { 28409 /* 28410 * The device flow control has opened up. 28411 * Walk through conn drain lists and qenable the 28412 * first conn in each list. This makes sense only 28413 * if the stream is fully plumbed and setup. 28414 * Hence the if check above. 28415 */ 28416 ip1dbg(("ip_wsrv: walking\n")); 28417 conn_walk_drain(ill->ill_ipst); 28418 } 28419 return; 28420 } 28421 28422 connp = Q_TO_CONN(q); 28423 ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); 28424 28425 /* 28426 * 1. Set conn_draining flag to signal that service is active. 28427 * 28428 * 2. ip_output determines whether it has been called from service, 28429 * based on the last parameter. If it is IP_WSRV it concludes it 28430 * has been called from service. 28431 * 28432 * 3. Message ordering is preserved by the following logic. 28433 * i. A directly called ip_output (i.e. not thru service) will queue 28434 * the message at the tail, if conn_draining is set (i.e. service 28435 * is running) or if q->q_first is non-null. 28436 * 28437 * ii. If ip_output is called from service, and if ip_output cannot 28438 * putnext due to flow control, it does a putbq. 28439 * 28440 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable 28441 * (causing an infinite loop).
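 *
 * In outline, the drain loop below is therefore:
 *
 *	noenable(q);			putbq must not reschedule us
 *	while ((mp = getq(q)) != NULL)
 *		ip_output(..., IP_WSRV);	may putbq on flow control
 *	enableok(q);			then recheck q->q_first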
28442 */ 28443 ASSERT(!connp->conn_did_putbq); 28444 while ((q->q_first != NULL) && !connp->conn_did_putbq) { 28445 connp->conn_draining = 1; 28446 noenable(q); 28447 while ((mp = getq(q)) != NULL) { 28448 ASSERT(CONN_Q(q)); 28449 28450 ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); 28451 if (connp->conn_did_putbq) { 28452 /* ip_wput did a putbq */ 28453 break; 28454 } 28455 } 28456 /* 28457 * At this point, a thread coming down from top, calling 28458 * ip_wput, may end up queueing the message. We have not yet 28459 * enabled the queue, so ip_wsrv won't be called again. 28460 * To avoid this race, check q->q_first again (in the loop). 28461 * If the other thread queued the message before we call 28462 * enableok(), we will catch it in the q->q_first check. 28463 * If the other thread queues the message after we call 28464 * enableok(), ip_wsrv will be called again by STREAMS. 28465 */ 28466 connp->conn_draining = 0; 28467 enableok(q); 28468 } 28469 28470 /* Enable the next conn for draining */ 28471 conn_drain_tail(connp, B_FALSE); 28472 28473 connp->conn_did_putbq = 0; 28474 } 28475 28476 /* 28477 * Walk the list of all conn's calling the function provided with the 28478 * specified argument for each. Note that this only walks conn's that 28479 * have been bound. 28480 * Applies to both IPv4 and IPv6. 28481 */ 28482 static void 28483 conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 28484 { 28485 conn_walk_fanout_table(ipst->ips_ipcl_udp_fanout, 28486 ipst->ips_ipcl_udp_fanout_size, 28487 func, arg, zoneid); 28488 conn_walk_fanout_table(ipst->ips_ipcl_conn_fanout, 28489 ipst->ips_ipcl_conn_fanout_size, 28490 func, arg, zoneid); 28491 conn_walk_fanout_table(ipst->ips_ipcl_bind_fanout, 28492 ipst->ips_ipcl_bind_fanout_size, 28493 func, arg, zoneid); 28494 conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout, 28495 IPPROTO_MAX, func, arg, zoneid); 28496 conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout_v6, 28497 IPPROTO_MAX, func, arg, zoneid); 28498 } 28499 28500 /* 28501 * Flowcontrol has been relieved, and STREAMS has backenabled us. For each list 28502 * of conns that need to be drained, check if drain is already in progress. 28503 * If so set the idl_repeat bit, indicating that the last conn in the list 28504 * needs to reinitiate the drain once again, for the list. If drain is not 28505 * in progress for the list, initiate the draining, by qenabling the 1st 28506 * conn in the list. The drain is self-sustaining, each qenabled conn will 28507 * in turn qenable the next conn, when it is done/blocked/closing. 28508 */ 28509 static void 28510 conn_walk_drain(ip_stack_t *ipst) 28511 { 28512 int i; 28513 idl_t *idl; 28514 28515 IP_STAT(ipst, ip_conn_walk_drain); 28516 28517 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28518 idl = &ipst->ips_conn_drain_list[i]; 28519 mutex_enter(&idl->idl_lock); 28520 if (idl->idl_conn == NULL) { 28521 mutex_exit(&idl->idl_lock); 28522 continue; 28523 } 28524 /* 28525 * If this list is not being drained currently by 28526 * an ip_wsrv thread, start the process. 28527 */ 28528 if (idl->idl_conn_draining == NULL) { 28529 ASSERT(idl->idl_repeat == 0); 28530 qenable(idl->idl_conn->conn_wq); 28531 idl->idl_conn_draining = idl->idl_conn; 28532 } else { 28533 idl->idl_repeat = 1; 28534 } 28535 mutex_exit(&idl->idl_lock); 28536 } 28537 } 28538 28539 /* 28540 * Walk a conn hash table of `count' buckets, calling func for each entry.
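 * The walker holds connf_lock only while stepping along a chain; it
 * takes a reference (CONN_INC_REF) on the current conn and drops the
 * lock across the callback, so func may block without stalling other
 * users of the bucket.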
28541 */ 28542 static void 28543 conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg, 28544 zoneid_t zoneid) 28545 { 28546 conn_t *connp; 28547 28548 while (count-- > 0) { 28549 mutex_enter(&connfp->connf_lock); 28550 for (connp = connfp->connf_head; connp != NULL; 28551 connp = connp->conn_next) { 28552 if (zoneid == GLOBAL_ZONEID || 28553 zoneid == connp->conn_zoneid) { 28554 CONN_INC_REF(connp); 28555 mutex_exit(&connfp->connf_lock); 28556 (*func)(connp, arg); 28557 mutex_enter(&connfp->connf_lock); 28558 CONN_DEC_REF(connp); 28559 } 28560 } 28561 mutex_exit(&connfp->connf_lock); 28562 connfp++; 28563 } 28564 } 28565 28566 /* conn_walk_fanout routine invoked for ip_conn_report for each conn. */ 28567 static void 28568 conn_report1(conn_t *connp, void *mp) 28569 { 28570 char buf1[INET6_ADDRSTRLEN]; 28571 char buf2[INET6_ADDRSTRLEN]; 28572 uint_t print_len, buf_len; 28573 28574 ASSERT(connp != NULL); 28575 28576 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 28577 if (buf_len <= 0) 28578 return; 28579 (void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)); 28580 (void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)); 28581 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 28582 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 28583 "%5d %s/%05d %s/%05d\n", 28584 (void *)connp, (void *)CONNP_TO_RQ(connp), 28585 (void *)CONNP_TO_WQ(connp), connp->conn_zoneid, 28586 buf1, connp->conn_lport, 28587 buf2, connp->conn_fport); 28588 if (print_len < buf_len) { 28589 ((mblk_t *)mp)->b_wptr += print_len; 28590 } else { 28591 ((mblk_t *)mp)->b_wptr += buf_len; 28592 } 28593 } 28594 28595 /* 28596 * Named Dispatch routine to produce a formatted report on all conns 28597 * that are listed in one of the fanout tables. 28598 * This report is accessed by using the ndd utility to "get" ND variable 28599 * "ip_conn_status". 28600 */ 28601 /* ARGSUSED */ 28602 static int 28603 ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 28604 { 28605 conn_t *connp = Q_TO_CONN(q); 28606 28607 (void) mi_mpprintf(mp, 28608 "CONN " MI_COL_HDRPAD_STR 28609 "rfq " MI_COL_HDRPAD_STR 28610 "stq " MI_COL_HDRPAD_STR 28611 " zone local remote"); 28612 28613 /* 28614 * Because of the ndd constraint, at most we can have 64K buffer 28615 * to put in all conn info. So to be more efficient, just 28616 * allocate a 64K buffer here, assuming we need that large buffer. 28617 * This should be OK as only privileged processes can do ndd /dev/ip. 28618 */ 28619 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 28620 /* The following may work even if we cannot get a large buf. */ 28621 (void) mi_mpprintf(mp, "<< Out of buffer >>\n"); 28622 return (0); 28623 } 28624 28625 conn_walk_fanout(conn_report1, mp->b_cont, connp->conn_zoneid, 28626 connp->conn_netstack->netstack_ip); 28627 return (0); 28628 } 28629 28630 /* 28631 * Determine if the ill and multicast aspects of the packet 28632 * "match" the conn. 28633 */ 28634 boolean_t 28635 conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, 28636 zoneid_t zoneid) 28637 { 28638 ill_t *in_ill; 28639 boolean_t found; 28640 ipif_t *ipif; 28641 ire_t *ire; 28642 ipaddr_t dst, src; 28643 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28644 28645 dst = ipha->ipha_dst; 28646 src = ipha->ipha_src; 28647 28648 /* 28649 * conn_incoming_ill is set by IP_BOUND_IF which limits 28650 * unicast, broadcast and multicast reception to 28651 * conn_incoming_ill.
conn_wantpacket itself is called 28652 * only for BROADCAST and multicast. 28653 * 28654 * 1) ip_rput suppresses duplicate broadcasts if the ill 28655 * is part of a group. Hence, we should be receiving 28656 * just one copy of broadcast for the whole group. 28657 * Thus, if it is part of the group the packet could 28658 * come on any ill of the group and hence we need a 28659 * match on the group. Otherwise, match on ill should 28660 * be sufficient. 28661 * 28662 * 2) ip_rput does not suppress duplicate multicast packets. 28663 * If there are two interfaces in an ill group and we have 28664 * 2 applications (conns) that have joined a multicast group G on 28665 * both the interfaces, ilm_lookup_ill filter in ip_rput 28666 * will give us two packets because we join G on both the 28667 * interfaces rather than nominating just one interface 28668 * for receiving multicast like broadcast above. So, 28669 * we have to call ilg_lookup_ill to filter out duplicate 28670 * copies, if ill is part of a group. 28671 */ 28672 in_ill = connp->conn_incoming_ill; 28673 if (in_ill != NULL) { 28674 if (in_ill->ill_group == NULL) { 28675 if (in_ill != ill) 28676 return (B_FALSE); 28677 } else if (in_ill->ill_group != ill->ill_group) { 28678 return (B_FALSE); 28679 } 28680 } 28681 28682 if (!CLASSD(dst)) { 28683 if (IPCL_ZONE_MATCH(connp, zoneid)) 28684 return (B_TRUE); 28685 /* 28686 * The conn is in a different zone; we need to check that this 28687 * broadcast address is configured in the application's zone and 28688 * on one ill in the group. 28689 */ 28690 ipif = ipif_get_next_ipif(NULL, ill); 28691 if (ipif == NULL) 28692 return (B_FALSE); 28693 ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, 28694 connp->conn_zoneid, NULL, 28695 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 28696 ipif_refrele(ipif); 28697 if (ire != NULL) { 28698 ire_refrele(ire); 28699 return (B_TRUE); 28700 } else { 28701 return (B_FALSE); 28702 } 28703 } 28704 28705 if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && 28706 connp->conn_zoneid == zoneid) { 28707 /* 28708 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP 28709 * disabled, therefore we don't dispatch the multicast packet to 28710 * the sending zone. 28711 */ 28712 return (B_FALSE); 28713 } 28714 28715 if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { 28716 /* 28717 * Multicast packet on the loopback interface: we only match 28718 * conns who joined the group in the specified zone. 28719 */ 28720 return (B_FALSE); 28721 } 28722 28723 if (connp->conn_multi_router) { 28724 /* multicast packet and multicast router socket: send up */ 28725 return (B_TRUE); 28726 } 28727 28728 mutex_enter(&connp->conn_lock); 28729 found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); 28730 mutex_exit(&connp->conn_lock); 28731 return (found); 28732 } 28733 28734 /* 28735 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
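 * The message arriving here is a two-mblk chain: mp carries the arc_t
 * command itself, and mp->b_cont (mp2 below) carries an int holding
 * the error code from the DL_BIND_REQ, zero on success. The original
 * ioctl mblk (mp1) is recovered from the ipsq pending list.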
28736 */ 28737 /* ARGSUSED */ 28738 static void 28739 ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) 28740 { 28741 ill_t *ill = (ill_t *)q->q_ptr; 28742 mblk_t *mp1, *mp2; 28743 ipif_t *ipif; 28744 int err = 0; 28745 conn_t *connp = NULL; 28746 ipsq_t *ipsq; 28747 arc_t *arc; 28748 28749 ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); 28750 28751 ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); 28752 ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); 28753 28754 ASSERT(IAM_WRITER_ILL(ill)); 28755 mp2 = mp->b_cont; 28756 mp->b_cont = NULL; 28757 28758 /* 28759 * We have now received the arp bringup completion message 28760 * from ARP. Mark the arp bringup as done. Also if the arp 28761 * stream has already started closing, send up the AR_ARP_CLOSING 28762 * ack now since ARP is waiting in close for this ack. 28763 */ 28764 mutex_enter(&ill->ill_lock); 28765 ill->ill_arp_bringup_pending = 0; 28766 if (ill->ill_arp_closing) { 28767 mutex_exit(&ill->ill_lock); 28768 /* Let's reuse the mp for sending the ack */ 28769 arc = (arc_t *)mp->b_rptr; 28770 mp->b_wptr = mp->b_rptr + sizeof (arc_t); 28771 arc->arc_cmd = AR_ARP_CLOSING; 28772 qreply(q, mp); 28773 } else { 28774 mutex_exit(&ill->ill_lock); 28775 freeb(mp); 28776 } 28777 28778 ipsq = ill->ill_phyint->phyint_ipsq; 28779 ipif = ipsq->ipsq_pending_ipif; 28780 mp1 = ipsq_pending_mp_get(ipsq, &connp); 28781 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 28782 if (mp1 == NULL) { 28783 /* bringup was aborted by the user */ 28784 freemsg(mp2); 28785 return; 28786 } 28787 28788 /* 28789 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 28790 * must have an associated conn_t. Otherwise, we're bringing this 28791 * interface back up as part of handling an asynchronous event (e.g., 28792 * physical address change). 28793 */ 28794 if (ipsq->ipsq_current_ioctl != 0) { 28795 ASSERT(connp != NULL); 28796 q = CONNP_TO_WQ(connp); 28797 } else { 28798 ASSERT(connp == NULL); 28799 q = ill->ill_rq; 28800 } 28801 28802 /* 28803 * If the DL_BIND_REQ fails, it is noted 28804 * in arc_name_offset. 28805 */ 28806 err = *((int *)mp2->b_rptr); 28807 if (err == 0) { 28808 if (ipif->ipif_isv6) { 28809 if ((err = ipif_up_done_v6(ipif)) != 0) 28810 ip0dbg(("ip_arp_done: init failed\n")); 28811 } else { 28812 if ((err = ipif_up_done(ipif)) != 0) 28813 ip0dbg(("ip_arp_done: init failed\n")); 28814 } 28815 } else { 28816 ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); 28817 } 28818 28819 freemsg(mp2); 28820 28821 if ((err == 0) && (ill->ill_up_ipifs)) { 28822 err = ill_up_ipifs(ill, q, mp1); 28823 if (err == EINPROGRESS) 28824 return; 28825 } 28826 28827 if (ill->ill_up_ipifs) 28828 ill_group_cleanup(ill); 28829 28830 /* 28831 * The operation must complete without EINPROGRESS since 28832 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 28833 * Otherwise, the operation will be stuck forever in the ipsq. 
28834 */ 28835 ASSERT(err != EINPROGRESS); 28836 if (ipsq->ipsq_current_ioctl != 0) 28837 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 28838 else 28839 ipsq_current_finish(ipsq); 28840 } 28841 28842 /* Allocate the private structure */ 28843 static int 28844 ip_priv_alloc(void **bufp) 28845 { 28846 void *buf; 28847 28848 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL) 28849 return (ENOMEM); 28850 28851 *bufp = buf; 28852 return (0); 28853 } 28854 28855 /* Function to delete the private structure */ 28856 void 28857 ip_priv_free(void *buf) 28858 { 28859 ASSERT(buf != NULL); 28860 kmem_free(buf, sizeof (ip_priv_t)); 28861 } 28862 28863 /* 28864 * The entry point for IPPF processing. 28865 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the 28866 * routine just returns. 28867 * 28868 * When called, ip_process generates an ipp_packet_t structure 28869 * which holds the state information for this packet and invokes 28870 * the classifier (via ipp_packet_process). The classification, depending on 28871 * configured filters, results in a list of actions for this packet. Invoking 28872 * an action may cause the packet to be dropped, in which case the resulting 28873 * mblk (*mpp) is NULL. proc indicates the callout position for 28874 * this packet and ill_index is the interface this packet arrived on or will 28875 * leave on (inbound and outbound resp.). 28876 */ 28877 void 28878 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 28879 { 28880 mblk_t *mp; 28881 ip_priv_t *priv; 28882 ipp_action_id_t aid; 28883 int rc = 0; 28884 ipp_packet_t *pp; 28885 #define IP_CLASS "ip" 28886 28887 /* If the classifier is not loaded, return */ 28888 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 28889 return; 28890 } 28891 28892 mp = *mpp; 28893 ASSERT(mp != NULL); 28894 28895 /* Allocate the packet structure */ 28896 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 28897 if (rc != 0) { 28898 *mpp = NULL; 28899 freemsg(mp); 28900 return; 28901 } 28902 28903 /* Allocate the private structure */ 28904 rc = ip_priv_alloc((void **)&priv); 28905 if (rc != 0) { 28906 *mpp = NULL; 28907 freemsg(mp); 28908 ipp_packet_free(pp); 28909 return; 28910 } 28911 priv->proc = proc; 28912 priv->ill_index = ill_index; 28913 ipp_packet_set_private(pp, priv, ip_priv_free); 28914 ipp_packet_set_data(pp, mp); 28915 28916 /* Invoke the classifier */ 28917 rc = ipp_packet_process(&pp); 28918 if (pp != NULL) { 28919 mp = ipp_packet_get_data(pp); 28920 ipp_packet_free(pp); 28921 if (rc != 0) { 28922 freemsg(mp); 28923 *mpp = NULL; 28924 } 28925 } else { 28926 *mpp = NULL; 28927 } 28928 #undef IP_CLASS 28929 } 28930 28931 /* 28932 * Propagate a multicast group membership operation (add/drop) on 28933 * all the interfaces crossed by the related multirt routes. 28934 * The call is considered successful if the operation succeeds 28935 * on at least one interface.
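 * 'fn' is the per-interface add/drop handler supplied by the caller;
 * it is invoked once per RTF_MULTIRT ire matching 'group', with the
 * source address of the resolved gateway route. Note the idempotency
 * requirement on 'fn' spelled out in the loop below.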
28936 */ 28937 static int 28938 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 28939 uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, 28940 boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, 28941 mblk_t *first_mp) 28942 { 28943 ire_t *ire_gw; 28944 irb_t *irb; 28945 int error = 0; 28946 opt_restart_t *or; 28947 ip_stack_t *ipst = ire->ire_ipst; 28948 28949 irb = ire->ire_bucket; 28950 ASSERT(irb != NULL); 28951 28952 ASSERT(DB_TYPE(first_mp) == M_CTL); 28953 28954 or = (opt_restart_t *)first_mp->b_rptr; 28955 IRB_REFHOLD(irb); 28956 for (; ire != NULL; ire = ire->ire_next) { 28957 if ((ire->ire_flags & RTF_MULTIRT) == 0) 28958 continue; 28959 if (ire->ire_addr != group) 28960 continue; 28961 28962 ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, 28963 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, 28964 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); 28965 /* No resolver exists for the gateway; skip this ire. */ 28966 if (ire_gw == NULL) 28967 continue; 28968 28969 /* 28970 * This function can return EINPROGRESS. If so the operation 28971 * will be restarted from ip_restart_optmgmt which will 28972 * call ip_opt_set and option processing will restart for 28973 * this option. So we may end up calling 'fn' more than once. 28974 * This requires that 'fn' is idempotent except for the 28975 * return value. The operation is considered a success if 28976 * it succeeds at least once on any one interface. 28977 */ 28978 error = fn(connp, checkonly, group, ire_gw->ire_src_addr, 28979 NULL, fmode, src, first_mp); 28980 if (error == 0) 28981 or->or_private = CGTP_MCAST_SUCCESS; 28982 28983 if (ip_debug > 0) { 28984 ulong_t off; 28985 char *ksym; 28986 ksym = kobj_getsymname((uintptr_t)fn, &off); 28987 ip2dbg(("ip_multirt_apply_membership: " 28988 "called %s, multirt group 0x%08x via itf 0x%08x, " 28989 "error %d [success %u]\n", 28990 ksym ? ksym : "?", 28991 ntohl(group), ntohl(ire_gw->ire_src_addr), 28992 error, or->or_private)); 28993 } 28994 28995 ire_refrele(ire_gw); 28996 if (error == EINPROGRESS) { 28997 IRB_REFRELE(irb); 28998 return (error); 28999 } 29000 } 29001 IRB_REFRELE(irb); 29002 /* 29003 * Consider the call as successful if we succeeded on at least 29004 * one interface. Otherwise, return the last encountered error. 29005 */ 29006 return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); 29007 } 29008 29009 29010 /* 29011 * Issue a warning regarding a route crossing an interface with an 29012 * incorrect MTU. Only one message every 'ip_multirt_log_interval' 29013 * amount of time is logged. 29014 */ 29015 static void 29016 ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) 29017 { 29018 hrtime_t current = gethrtime(); 29019 char buf[INET_ADDRSTRLEN]; 29020 ip_stack_t *ipst = ire->ire_ipst; 29021 29022 /* Convert interval in ms to hrtime in ns */ 29023 if (ipst->ips_multirt_bad_mtu_last_time + 29024 ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= 29025 current) { 29026 cmn_err(CE_WARN, "ip: ignoring multiroute " 29027 "to %s, incorrect MTU %u (expected %u)\n", 29028 ip_dot_addr(ire->ire_addr, buf), 29029 ire->ire_max_frag, max_frag); 29030 29031 ipst->ips_multirt_bad_mtu_last_time = current; 29032 } 29033 } 29034 29035 29036 /* 29037 * Get the CGTP (multirouting) filtering status. 29038 * If 0, the CGTP hooks are transparent. 
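 *
 * For example, assuming the variable is exposed under its usual ndd
 * name, the status can be read and changed from userland with:
 *
 *	ndd -get /dev/ip ip_cgtp_filter
 *	ndd -set /dev/ip ip_cgtp_filter 1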
29039 */ 29040 /* ARGSUSED */ 29041 static int 29042 ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 29043 { 29044 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29045 29046 (void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value); 29047 return (0); 29048 } 29049 29050 29051 /* 29052 * Set the CGTP (multirouting) filtering status. 29053 * If the status is changed from active to transparent 29054 * or from transparent to active, forward the new status 29055 * to the filtering module (if loaded). 29056 */ 29057 /* ARGSUSED */ 29058 static int 29059 ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 29060 cred_t *ioc_cr) 29061 { 29062 long new_value; 29063 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29064 ip_stack_t *ipst = CONNQ_TO_IPST(q); 29065 29066 if (secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 29067 return (EPERM); 29068 29069 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 29070 new_value < 0 || new_value > 1) { 29071 return (EINVAL); 29072 } 29073 29074 if ((!*ip_cgtp_filter_value) && new_value) { 29075 cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s", 29076 ipst->ips_ip_cgtp_filter_ops == NULL ? 29077 " (module not loaded)" : ""); 29078 } 29079 if (*ip_cgtp_filter_value && (!new_value)) { 29080 cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s", 29081 ipst->ips_ip_cgtp_filter_ops == NULL ? 29082 " (module not loaded)" : ""); 29083 } 29084 29085 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29086 int res; 29087 netstackid_t stackid; 29088 29089 stackid = ipst->ips_netstack->netstack_stackid; 29090 res = ipst->ips_ip_cgtp_filter_ops->cfo_change_state(stackid, 29091 new_value); 29092 if (res) 29093 return (res); 29094 } 29095 29096 *ip_cgtp_filter_value = (boolean_t)new_value; 29097 29098 return (0); 29099 } 29100 29101 29102 /* 29103 * Return the expected CGTP hooks version number. 29104 */ 29105 int 29106 ip_cgtp_filter_supported(void) 29107 { 29108 return (ip_cgtp_filter_rev); 29109 } 29110 29111 29112 /* 29113 * CGTP hooks can be registered by invoking this function. 29114 * Checks that the version number matches. 29115 */ 29116 int 29117 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) 29118 { 29119 netstack_t *ns; 29120 ip_stack_t *ipst; 29121 29122 if (ops->cfo_filter_rev != CGTP_FILTER_REV) 29123 return (ENOTSUP); 29124 29125 ns = netstack_find_by_stackid(stackid); 29126 if (ns == NULL) 29127 return (EINVAL); 29128 ipst = ns->netstack_ip; 29129 ASSERT(ipst != NULL); 29130 29131 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29132 netstack_rele(ns); 29133 return (EALREADY); 29134 } 29135 29136 ipst->ips_ip_cgtp_filter_ops = ops; 29137 netstack_rele(ns); 29138 return (0); 29139 } 29140 29141 /* 29142 * CGTP hooks can be unregistered by invoking this function. 29143 * Returns ENXIO if there was no registration. 29144 * Returns EBUSY if the ndd variable has not been turned off. 
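 *
 * A filter module would typically pair the two calls as sketched
 * below (my_ops is a hypothetical ops vector; only its cfo_filter_rev
 * field is checked at registration):
 *
 *	static cgtp_filter_ops_t my_ops = { CGTP_FILTER_REV, ... };
 *
 *	if (ip_cgtp_filter_register(stackid, &my_ops) == 0) {
 *		... filtering active ...
 *		(void) ip_cgtp_filter_unregister(stackid);
 *	}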
29145 */ 29146 int 29147 ip_cgtp_filter_unregister(netstackid_t stackid) 29148 { 29149 netstack_t *ns; 29150 ip_stack_t *ipst; 29151 29152 ns = netstack_find_by_stackid(stackid); 29153 if (ns == NULL) 29154 return (EINVAL); 29155 ipst = ns->netstack_ip; 29156 ASSERT(ipst != NULL); 29157 29158 if (ipst->ips_ip_cgtp_filter) { 29159 netstack_rele(ns); 29160 return (EBUSY); 29161 } 29162 29163 if (ipst->ips_ip_cgtp_filter_ops == NULL) { 29164 netstack_rele(ns); 29165 return (ENXIO); 29166 } 29167 ipst->ips_ip_cgtp_filter_ops = NULL; 29168 netstack_rele(ns); 29169 return (0); 29170 } 29171 29172 /* 29173 * Check whether there is a CGTP filter registration. 29174 * Returns non-zero if there is a registration, otherwise returns zero. 29175 * Note: returns zero if bad stackid. 29176 */ 29177 int 29178 ip_cgtp_filter_is_registered(netstackid_t stackid) 29179 { 29180 netstack_t *ns; 29181 ip_stack_t *ipst; 29182 int ret; 29183 29184 ns = netstack_find_by_stackid(stackid); 29185 if (ns == NULL) 29186 return (0); 29187 ipst = ns->netstack_ip; 29188 ASSERT(ipst != NULL); 29189 29190 if (ipst->ips_ip_cgtp_filter_ops != NULL) 29191 ret = 1; 29192 else 29193 ret = 0; 29194 29195 netstack_rele(ns); 29196 return (ret); 29197 } 29198 29199 static squeue_func_t 29200 ip_squeue_switch(int val) 29201 { 29202 squeue_func_t rval = squeue_fill; 29203 29204 switch (val) { 29205 case IP_SQUEUE_ENTER_NODRAIN: 29206 rval = squeue_enter_nodrain; 29207 break; 29208 case IP_SQUEUE_ENTER: 29209 rval = squeue_enter; 29210 break; 29211 default: 29212 break; 29213 } 29214 return (rval); 29215 } 29216 29217 /* ARGSUSED */ 29218 static int 29219 ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 29220 caddr_t addr, cred_t *cr) 29221 { 29222 int *v = (int *)addr; 29223 long new_value; 29224 29225 if (secpolicy_net_config(cr, B_FALSE) != 0) 29226 return (EPERM); 29227 29228 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29229 return (EINVAL); 29230 29231 ip_input_proc = ip_squeue_switch(new_value); 29232 *v = new_value; 29233 return (0); 29234 } 29235 29236 /* 29237 * Handle ndd set of variables which require PRIV_SYS_NET_CONFIG such as 29238 * ip_debug. 29239 */ 29240 /* ARGSUSED */ 29241 static int 29242 ip_int_set(queue_t *q, mblk_t *mp, char *value, 29243 caddr_t addr, cred_t *cr) 29244 { 29245 int *v = (int *)addr; 29246 long new_value; 29247 29248 if (secpolicy_net_config(cr, B_FALSE) != 0) 29249 return (EPERM); 29250 29251 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29252 return (EINVAL); 29253 29254 *v = new_value; 29255 return (0); 29256 } 29257 29258 /* 29259 * Handle changes to ipmp_hook_emulation ndd variable. 29260 * Need to update phyint_hook_ifindex. 29261 * Also generate a nic plumb event should a new ifindex be assigned to a group. 29262 */ 29263 static void 29264 ipmp_hook_emulation_changed(ip_stack_t *ipst) 29265 { 29266 phyint_t *phyi; 29267 phyint_t *phyi_tmp; 29268 char *groupname; 29269 int namelen; 29270 ill_t *ill; 29271 boolean_t new_group; 29272 29273 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 29274 /* 29275 * Group indices are stored in the phyint - a common structure 29276 * to both IPv4 and IPv6. 29277 */ 29278 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 29279 for (; phyi != NULL; 29280 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 29281 phyi, AVL_AFTER)) { 29282 /* Ignore the ones that do not have a group */ 29283 if (phyi->phyint_groupname_len == 0) 29284 continue; 29285 29286 /* 29287 * Look for other phyint in group.
29288 * Clear name/namelen so the lookup doesn't find ourselves. 29289 */ 29290 namelen = phyi->phyint_groupname_len; 29291 groupname = phyi->phyint_groupname; 29292 phyi->phyint_groupname_len = 0; 29293 phyi->phyint_groupname = NULL; 29294 29295 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); 29296 /* Restore */ 29297 phyi->phyint_groupname_len = namelen; 29298 phyi->phyint_groupname = groupname; 29299 29300 new_group = B_FALSE; 29301 if (ipst->ips_ipmp_hook_emulation) { 29302 /* 29303 * If the group already exists and has already 29304 * been assigned a group ifindex, we use the existing 29305 * group_ifindex, otherwise we pick a new group_ifindex 29306 * here. 29307 */ 29308 if (phyi_tmp != NULL && 29309 phyi_tmp->phyint_group_ifindex != 0) { 29310 phyi->phyint_group_ifindex = 29311 phyi_tmp->phyint_group_ifindex; 29312 } else { 29313 /* XXX We need a recovery strategy here. */ 29314 if (!ip_assign_ifindex( 29315 &phyi->phyint_group_ifindex, ipst)) 29316 cmn_err(CE_PANIC, 29317 "ip_assign_ifindex() failed"); 29318 new_group = B_TRUE; 29319 } 29320 } else { 29321 phyi->phyint_group_ifindex = 0; 29322 } 29323 if (ipst->ips_ipmp_hook_emulation) 29324 phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; 29325 else 29326 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 29327 29328 /* 29329 * For IP Filter to find out the relationship between 29330 * names and interface indices, we need to generate 29331 * a NE_PLUMB event when a new group appears. 29332 * We always generate events when a new interface appears 29333 * (even when ipmp_hook_emulation is set) so there 29334 * is no need to generate NE_PLUMB events when 29335 * ipmp_hook_emulation is turned off. 29336 * And since it isn't critical for IP Filter to get 29337 * the NE_UNPLUMB events we skip those here. 29338 */ 29339 if (new_group) { 29340 /* 29341 * First phyint in group - generate group PLUMB event. 29342 * Since we are not running inside the ipsq we do 29343 * the dispatch immediately.
/*
 * Handle changes to the ipmp_hook_emulation ndd variable.
 * Need to update phyint_hook_ifindex.
 * Also generate a nic plumb event should a new ifindex be assigned to a group.
 */
static void
ipmp_hook_emulation_changed(ip_stack_t *ipst)
{
	phyint_t *phyi;
	phyint_t *phyi_tmp;
	char *groupname;
	int namelen;
	ill_t *ill;
	boolean_t new_group;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	/*
	 * Group indices are stored in the phyint - a common structure
	 * to both IPv4 and IPv6.
	 */
	phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
	for (; phyi != NULL;
	    phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    phyi, AVL_AFTER)) {
		/* Ignore the ones that do not have a group */
		if (phyi->phyint_groupname_len == 0)
			continue;

		/*
		 * Look for another phyint in the group.
		 * Clear name/namelen so the lookup doesn't find ourselves.
		 */
		namelen = phyi->phyint_groupname_len;
		groupname = phyi->phyint_groupname;
		phyi->phyint_groupname_len = 0;
		phyi->phyint_groupname = NULL;

		phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
		/* Restore */
		phyi->phyint_groupname_len = namelen;
		phyi->phyint_groupname = groupname;

		new_group = B_FALSE;
		if (ipst->ips_ipmp_hook_emulation) {
			/*
			 * If the group already exists and has already
			 * been assigned a group ifindex, we use the existing
			 * group_ifindex, otherwise we pick a new group_ifindex
			 * here.
			 */
			if (phyi_tmp != NULL &&
			    phyi_tmp->phyint_group_ifindex != 0) {
				phyi->phyint_group_ifindex =
				    phyi_tmp->phyint_group_ifindex;
			} else {
				/* XXX We need a recovery strategy here. */
				if (!ip_assign_ifindex(
				    &phyi->phyint_group_ifindex, ipst))
					cmn_err(CE_PANIC,
					    "ip_assign_ifindex() failed");
				new_group = B_TRUE;
			}
		} else {
			phyi->phyint_group_ifindex = 0;
		}
		if (ipst->ips_ipmp_hook_emulation)
			phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
		else
			phyi->phyint_hook_ifindex = phyi->phyint_ifindex;

		/*
		 * For IP Filter to find out the relationship between
		 * names and interface indices, we need to generate
		 * a NE_PLUMB event when a new group appears.
		 * We always generate events when a new interface appears
		 * (even when ipmp_hook_emulation is set) so there
		 * is no need to generate NE_PLUMB events when
		 * ipmp_hook_emulation is turned off.
		 * And since it isn't critical for IP Filter to get
		 * the NE_UNPLUMB events we skip those here.
		 */
		if (new_group) {
			/*
			 * First phyint in group - generate group PLUMB event.
			 * Since we are not running inside the ipsq we do
			 * the dispatch immediately.
			 */
			if (phyi->phyint_illv4 != NULL)
				ill = phyi->phyint_illv4;
			else
				ill = phyi->phyint_illv6;

			if (ill != NULL) {
				mutex_enter(&ill->ill_lock);
				ill_nic_info_plumb(ill, B_TRUE);
				ill_nic_info_dispatch(ill);
				mutex_exit(&ill->ill_lock);
			}
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
}

/* ARGSUSED */
static int
ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (*v != new_value) {
		*v = new_value;
		ipmp_hook_emulation_changed(ipst);
	}
	return (0);
}
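/*
 * Illustrative note (not part of the original source): assuming the
 * handler above is registered under the ndd name "ipmp_hook_emulation",
 * the emulation mode would be toggled from userland with something like
 *
 *	# ndd -set /dev/ip ipmp_hook_emulation 1
 *
 * which lands in ipmp_hook_emulation_set() and, on a value change,
 * re-derives phyint_hook_ifindex for every phyint in the stack.
 */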
static void *
ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
{
	kstat_t *ksp;

	ip_stat_t template = {
		{ "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
		{ "ip_udp_fannorm", KSTAT_DATA_UINT64 },
		{ "ip_udp_fanmb", KSTAT_DATA_UINT64 },
		{ "ip_udp_fanothers", KSTAT_DATA_UINT64 },
		{ "ip_udp_fast_path", KSTAT_DATA_UINT64 },
		{ "ip_udp_slow_path", KSTAT_DATA_UINT64 },
		{ "ip_udp_input_err", KSTAT_DATA_UINT64 },
		{ "ip_tcppullup", KSTAT_DATA_UINT64 },
		{ "ip_tcpoptions", KSTAT_DATA_UINT64 },
		{ "ip_multipkttcp", KSTAT_DATA_UINT64 },
		{ "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
		{ "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
		{ "ip_tcp_input_error", KSTAT_DATA_UINT64 },
		{ "ip_db_ref", KSTAT_DATA_UINT64 },
		{ "ip_notaligned1", KSTAT_DATA_UINT64 },
		{ "ip_notaligned2", KSTAT_DATA_UINT64 },
		{ "ip_multimblk3", KSTAT_DATA_UINT64 },
		{ "ip_multimblk4", KSTAT_DATA_UINT64 },
		{ "ip_ipoptions", KSTAT_DATA_UINT64 },
		{ "ip_classify_fail", KSTAT_DATA_UINT64 },
		{ "ip_opt", KSTAT_DATA_UINT64 },
		{ "ip_udp_rput_local", KSTAT_DATA_UINT64 },
		{ "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
		{ "ip_conn_flputbq", KSTAT_DATA_UINT64 },
		{ "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
		{ "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
		{ "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
		{ "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
		{ "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 },
		{ "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
		{ "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 },
		{ "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
		{ "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
		{ "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
		{ "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 },
	};

	ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, stackid);

	if (ksp == NULL)
		return (NULL);

	bcopy(&template, ip_statisticsp, sizeof (template));
	ksp->ks_data = (void *)ip_statisticsp;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}
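/*
 * Illustrative sketch (not part of the original source): the "ipstat"
 * kstat created above is visible to userland through libkstat, e.g.:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "ip", 0, "ipstat");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "ip_opt");
 *		... kn->value.ui64 holds the counter ...
 *	}
 *	(void) kstat_close(kc);
 */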
static void *
ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
{
	kstat_t *ksp;

	ip_named_kstat_t template = {
		{ "forwarding", KSTAT_DATA_UINT32, 0 },
		{ "defaultTTL", KSTAT_DATA_UINT32, 0 },
		{ "inReceives", KSTAT_DATA_UINT64, 0 },
		{ "inHdrErrors", KSTAT_DATA_UINT32, 0 },
		{ "inAddrErrors", KSTAT_DATA_UINT32, 0 },
		{ "forwDatagrams", KSTAT_DATA_UINT64, 0 },
		{ "inUnknownProtos", KSTAT_DATA_UINT32, 0 },
		{ "inDiscards", KSTAT_DATA_UINT32, 0 },
		{ "inDelivers", KSTAT_DATA_UINT64, 0 },
		{ "outRequests", KSTAT_DATA_UINT64, 0 },
		{ "outDiscards", KSTAT_DATA_UINT32, 0 },
		{ "outNoRoutes", KSTAT_DATA_UINT32, 0 },
		{ "reasmTimeout", KSTAT_DATA_UINT32, 0 },
		{ "reasmReqds", KSTAT_DATA_UINT32, 0 },
		{ "reasmOKs", KSTAT_DATA_UINT32, 0 },
		{ "reasmFails", KSTAT_DATA_UINT32, 0 },
		{ "fragOKs", KSTAT_DATA_UINT32, 0 },
		{ "fragFails", KSTAT_DATA_UINT32, 0 },
		{ "fragCreates", KSTAT_DATA_UINT32, 0 },
		{ "addrEntrySize", KSTAT_DATA_INT32, 0 },
		{ "routeEntrySize", KSTAT_DATA_INT32, 0 },
		{ "netToMediaEntrySize", KSTAT_DATA_INT32, 0 },
		{ "routingDiscards", KSTAT_DATA_UINT32, 0 },
		{ "inErrs", KSTAT_DATA_UINT32, 0 },
		{ "noPorts", KSTAT_DATA_UINT32, 0 },
		{ "inCksumErrs", KSTAT_DATA_UINT32, 0 },
		{ "reasmDuplicates", KSTAT_DATA_UINT32, 0 },
		{ "reasmPartDups", KSTAT_DATA_UINT32, 0 },
		{ "forwProhibits", KSTAT_DATA_UINT32, 0 },
		{ "udpInCksumErrs", KSTAT_DATA_UINT32, 0 },
		{ "udpInOverflows", KSTAT_DATA_UINT32, 0 },
		{ "rawipInOverflows", KSTAT_DATA_UINT32, 0 },
		{ "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 },
		{ "ipsecInFailed", KSTAT_DATA_INT32, 0 },
		{ "memberEntrySize", KSTAT_DATA_INT32, 0 },
		{ "inIPv6", KSTAT_DATA_UINT32, 0 },
		{ "outIPv6", KSTAT_DATA_UINT32, 0 },
		{ "outSwitchIPv6", KSTAT_DATA_UINT32, 0 },
	};

	ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
	if (ksp == NULL || ksp->ks_data == NULL)
		return (NULL);

	/* MIB-2 ipForwarding (RFC 1213): forwarding(1), notForwarding(2) */
	template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1 : 2;
	template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
	template.reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout;
	template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
	template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);

	template.netToMediaEntrySize.value.i32 =
	    sizeof (mib2_ipNetToMediaEntry_t);

	template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);

	bcopy(&template, ksp->ks_data, sizeof (template));
	ksp->ks_update = ip_kstat_update;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}
static int
ip_kstat_update(kstat_t *kp, int rw)
{
	ip_named_kstat_t *ipkp;
	mib2_ipIfStatsEntry_t ipmib;
	ill_walk_context_t ctx;
	ill_t *ill;
	netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private;
	netstack_t *ns;
	ip_stack_t *ipst;

	if (kp == NULL || kp->ks_data == NULL)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ns = netstack_find_by_stackid(stackid);
	if (ns == NULL)
		return (-1);
	ipst = ns->netstack_ip;
	if (ipst == NULL) {
		netstack_rele(ns);
		return (-1);
	}
	ipkp = (ip_named_kstat_t *)kp->ks_data;

	bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill))
		ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
	rw_exit(&ipst->ips_ill_g_lock);

	ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding;
	ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL;
	ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives;
	ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors;
	ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors;
	ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
	ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos;
	ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards;
	ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers;
	ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests;
	ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards;
	ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes;
	ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout;
	ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds;
	ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs;
	ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails;
	ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs;
	ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails;
	ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates;

	ipkp->routingDiscards.value.ui32 = 0;
	ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs;
	ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts;
	ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs;
	ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates;
	ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups;
	ipkp->forwProhibits.value.ui32 = ipmib.ipIfStatsForwProhibits;
	ipkp->udpInCksumErrs.value.ui32 = ipmib.udpIfStatsInCksumErrs;
	ipkp->udpInOverflows.value.ui32 = ipmib.udpIfStatsInOverflows;
	ipkp->rawipInOverflows.value.ui32 = ipmib.rawipIfStatsInOverflows;
	ipkp->ipsecInSucceeded.value.ui32 = ipmib.ipsecIfStatsInSucceeded;
	ipkp->ipsecInFailed.value.i32 = ipmib.ipsecIfStatsInFailed;

	ipkp->inIPv6.value.ui32 = ipmib.ipIfStatsInWrongIPVersion;
	ipkp->outIPv6.value.ui32 = ipmib.ipIfStatsOutWrongIPVersion;
	ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;

	netstack_rele(ns);

	return (0);
}
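/*
 * Illustrative note (not part of the original source): the kstat
 * framework invokes ks_update (ip_kstat_update() above) whenever a
 * consumer snapshots the kstat, for example
 *
 *	$ kstat -m ip -n ip
 *
 * KSTAT_WRITE updates are rejected above with EACCES, so the MIB-2
 * counters are read-only from userland.
 */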
static void *
icmp_kstat_init(netstackid_t stackid)
{
	kstat_t *ksp;

	icmp_named_kstat_t template = {
		{ "inMsgs", KSTAT_DATA_UINT32 },
		{ "inErrors", KSTAT_DATA_UINT32 },
		{ "inDestUnreachs", KSTAT_DATA_UINT32 },
		{ "inTimeExcds", KSTAT_DATA_UINT32 },
		{ "inParmProbs", KSTAT_DATA_UINT32 },
		{ "inSrcQuenchs", KSTAT_DATA_UINT32 },
		{ "inRedirects", KSTAT_DATA_UINT32 },
		{ "inEchos", KSTAT_DATA_UINT32 },
		{ "inEchoReps", KSTAT_DATA_UINT32 },
		{ "inTimestamps", KSTAT_DATA_UINT32 },
		{ "inTimestampReps", KSTAT_DATA_UINT32 },
		{ "inAddrMasks", KSTAT_DATA_UINT32 },
		{ "inAddrMaskReps", KSTAT_DATA_UINT32 },
		{ "outMsgs", KSTAT_DATA_UINT32 },
		{ "outErrors", KSTAT_DATA_UINT32 },
		{ "outDestUnreachs", KSTAT_DATA_UINT32 },
		{ "outTimeExcds", KSTAT_DATA_UINT32 },
		{ "outParmProbs", KSTAT_DATA_UINT32 },
		{ "outSrcQuenchs", KSTAT_DATA_UINT32 },
		{ "outRedirects", KSTAT_DATA_UINT32 },
		{ "outEchos", KSTAT_DATA_UINT32 },
		{ "outEchoReps", KSTAT_DATA_UINT32 },
		{ "outTimestamps", KSTAT_DATA_UINT32 },
		{ "outTimestampReps", KSTAT_DATA_UINT32 },
		{ "outAddrMasks", KSTAT_DATA_UINT32 },
		{ "outAddrMaskReps", KSTAT_DATA_UINT32 },
		{ "inCksumErrs", KSTAT_DATA_UINT32 },
		{ "inUnknowns", KSTAT_DATA_UINT32 },
		{ "inFragNeeded", KSTAT_DATA_UINT32 },
		{ "outFragNeeded", KSTAT_DATA_UINT32 },
		{ "outDrops", KSTAT_DATA_UINT32 },
		{ "inOverflows", KSTAT_DATA_UINT32 },
		{ "inBadRedirects", KSTAT_DATA_UINT32 },
	};

	ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
	if (ksp == NULL || ksp->ks_data == NULL)
		return (NULL);

	bcopy(&template, ksp->ks_data, sizeof (template));

	ksp->ks_update = icmp_kstat_update;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}

static int
icmp_kstat_update(kstat_t *kp, int rw)
{
	icmp_named_kstat_t *icmpkp;
	netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private;
	netstack_t *ns;
	ip_stack_t *ipst;

	if ((kp == NULL) || (kp->ks_data == NULL))
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ns = netstack_find_by_stackid(stackid);
	if (ns == NULL)
		return (-1);
	ipst = ns->netstack_ip;
	if (ipst == NULL) {
		netstack_rele(ns);
		return (-1);
	}
	icmpkp = (icmp_named_kstat_t *)kp->ks_data;

	icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs;
	icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors;
	icmpkp->inDestUnreachs.value.ui32 =
	    ipst->ips_icmp_mib.icmpInDestUnreachs;
	icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds;
	icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs;
	icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs;
	icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects;
	icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos;
	icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps;
	icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps;
	icmpkp->inTimestampReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpInTimestampReps;
	icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks;
	icmpkp->inAddrMaskReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpInAddrMaskReps;
	icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs;
	icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors;
	icmpkp->outDestUnreachs.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutDestUnreachs;
	icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds;
	icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs;
	icmpkp->outSrcQuenchs.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutSrcQuenchs;
	icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects;
	icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos;
	icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps;
	icmpkp->outTimestamps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutTimestamps;
	icmpkp->outTimestampReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutTimestampReps;
	icmpkp->outAddrMasks.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutAddrMasks;
	icmpkp->outAddrMaskReps.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutAddrMaskReps;
	icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs;
	icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns;
	icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded;
	icmpkp->outFragNeeded.value.ui32 =
	    ipst->ips_icmp_mib.icmpOutFragNeeded;
	icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops;
	icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows;
	icmpkp->inBadRedirects.value.ui32 =
	    ipst->ips_icmp_mib.icmpInBadRedirects;

	netstack_rele(ns);
	return (0);
}
/*
 * This is the fanout function for raw sockets opened for SCTP. Note
 * that it is called after SCTP checks that there is no socket which
 * wants the packet. Then before SCTP handles this out-of-the-blue packet,
 * this function is called to see if there is any raw socket for SCTP.
 * If there is and it is bound to the correct address, the packet will
 * be sent to that socket. Note that only one raw socket can be bound to
 * a port. This is assured in ipcl_sctp_hash_insert().
 */
void
ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4,
    uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy,
    zoneid_t zoneid)
{
	conn_t *connp;
	queue_t *rq;
	mblk_t *first_mp;
	boolean_t secure;
	ip6_t *ip6h;
	ip_stack_t *ipst = recv_ill->ill_ipst;
	ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
	sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
	boolean_t sctp_csum_err = B_FALSE;

	if (flags & IP_FF_SCTP_CSUM_ERR) {
		sctp_csum_err = B_TRUE;
		flags &= ~IP_FF_SCTP_CSUM_ERR;
	}

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	ip6h = (isv4) ? NULL : (ip6_t *)ipha;

	connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst);
	if (connp == NULL) {
		/*
		 * Although raw sctp is not summed, OOB chunks must be.
		 * Drop the packet here if the sctp checksum failed.
		 */
		if (sctp_csum_err) {
			BUMP_MIB(&sctps->sctps_mib, sctpChecksumError);
			freemsg(first_mp);
			return;
		}
		sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present);
		return;
	}
	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		CONN_DEC_REF(connp);
		BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows);
		freemsg(first_mp);
		return;
	}
	if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) {
		first_mp = ipsec_check_inbound_policy(first_mp, connp,
		    (isv4 ? ipha : NULL), ip6h, mctl_present);
		if (first_mp == NULL) {
			BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
			CONN_DEC_REF(connp);
			return;
		}
	}
	/*
	 * We probably should not send M_CTL message up to
	 * raw socket.
	 */
	if (mctl_present)
		freeb(first_mp);

	/* Initiate IPPF processing here if needed. */
	if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) ||
	    (!isv4 && IP6_IN_IPP(flags, ipst))) {
		ip_process(IPP_LOCAL_IN, &mp,
		    recv_ill->ill_phyint->phyint_ifindex);
		if (mp == NULL) {
			CONN_DEC_REF(connp);
			return;
		}
	}

	if (connp->conn_recvif || connp->conn_recvslla ||
	    ((connp->conn_ip_recvpktinfo ||
	    (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) &&
	    (flags & IP_FF_IPINFO))) {
		int in_flags = 0;

		/*
		 * Since sctp does not support IP_RECVPKTINFO for v4, only pass
		 * IPF_RECVIF.
		 */
		if (connp->conn_recvif || connp->conn_ip_recvpktinfo) {
			in_flags = IPF_RECVIF;
		}
		if (connp->conn_recvslla) {
			in_flags |= IPF_RECVSLLA;
		}
		if (isv4) {
			mp = ip_add_info(mp, recv_ill, in_flags,
			    IPCL_ZONEID(connp), ipst);
		} else {
			mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst);
			if (mp == NULL) {
				BUMP_MIB(recv_ill->ill_ip_mib,
				    ipIfStatsInDiscards);
				CONN_DEC_REF(connp);
				return;
			}
		}
	}

	BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
	/*
	 * We are sending the IPSEC_IN message also up. Refer
	 * to comments above this function.
	 * This is the SOCK_RAW, IPPROTO_SCTP case.
	 */
	(connp->conn_recv)(connp, mp, NULL);
	CONN_DEC_REF(connp);
}
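/*
 * Illustrative note (not part of the original source): the fanout above
 * serves sockets opened from userland as
 *
 *	fd = socket(AF_INET, SOCK_RAW, IPPROTO_SCTP);
 *
 * (or AF_INET6); at most one such socket may be bound to a given port,
 * which ipcl_sctp_hash_insert() enforces.
 */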
#define	UPDATE_IP_MIB_OB_COUNTERS(ill, len)				\
{									\
	BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits);		\
	UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len));	\
}

/*
 * This function should be called only if all packet processing
 * including fragmentation is complete. Callers of this function
 * must set mp->b_prev to one of these values:
 *	{0, IPP_FWD_OUT, IPP_LOCAL_OUT}
 * prior to handing over the mp as first argument to this function.
 *
 * If the ire passed by caller is incomplete, this function
 * queues the packet and if necessary, sends ARP request and bails.
 * If the ire passed is fully resolved, we simply prepend
 * the link-layer header to the packet, do ipsec hw acceleration
 * work if necessary, and send the packet out on the wire.
 *
 * NOTE: IPsec will only call this function with fully resolved
 * ires if hw acceleration is involved.
 * TODO list:
 *	a. Handle M_MULTIDATA so that
 *	   tcp_multisend->tcp_multisend_data can
 *	   call ip_xmit_v4 directly
 *	b. Handle post-ARP work for fragments so that
 *	   ip_wput_frag can call this function.
 */
ipxmit_state_t
ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled)
{
	nce_t *arpce;
	ipha_t *ipha;
	queue_t *q;
	int ill_index;
	mblk_t *nxt_mp, *first_mp;
	boolean_t xmit_drop = B_FALSE;
	ip_proc_t proc;
	ill_t *out_ill;
	int pkt_len;

	arpce = ire->ire_nce;
	ASSERT(arpce != NULL);

	DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce);

	mutex_enter(&arpce->nce_lock);
	switch (arpce->nce_state) {
	case ND_REACHABLE:
		/* If there are other queued packets, queue this packet */
		if (arpce->nce_qd_mp != NULL) {
			if (mp != NULL)
				nce_queue_mp_common(arpce, mp, B_FALSE);
			mp = arpce->nce_qd_mp;
		}
		arpce->nce_qd_mp = NULL;
		mutex_exit(&arpce->nce_lock);

		/*
		 * Flush the queue. In the common case, where the
		 * ARP is already resolved, it will go through the
		 * while loop only once.
		 */
		while (mp != NULL) {

			nxt_mp = mp->b_next;
			mp->b_next = NULL;
			ASSERT(mp->b_datap->db_type != M_CTL);
			pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length);
			/*
			 * This info is needed for IPQOS to do COS marking
			 * in ip_wput_attach_llhdr->ip_process.
			 */
			proc = (ip_proc_t)(uintptr_t)mp->b_prev;
			mp->b_prev = NULL;

			/* set up ill index for outbound qos processing */
			out_ill = ire_to_ill(ire);
			ill_index = out_ill->ill_phyint->phyint_ifindex;
			first_mp = ip_wput_attach_llhdr(mp, ire, proc,
			    ill_index, &ipha);
			if (first_mp == NULL) {
				xmit_drop = B_TRUE;
				BUMP_MIB(out_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				goto next_mp;
			}

			/* non-ipsec hw accel case */
			if (io == NULL || !io->ipsec_out_accelerated) {
				/* send it */
				q = ire->ire_stq;
				if (proc == IPP_FWD_OUT) {
					UPDATE_IB_PKT_COUNT(ire);
				} else {
					UPDATE_OB_PKT_COUNT(ire);
				}
				ire->ire_last_used_time = lbolt;

				if (flow_ctl_enabled || canputnext(q)) {
					if (proc == IPP_FWD_OUT) {

						BUMP_MIB(out_ill->ill_ip_mib,
						    ipIfStatsHCOutForwDatagrams);

					}
					UPDATE_IP_MIB_OB_COUNTERS(out_ill,
					    pkt_len);

					DTRACE_IP7(send, mblk_t *, first_mp,
					    conn_t *, NULL, void_ip_t *, ipha,
					    __dtrace_ipsr_ill_t *, out_ill,
					    ipha_t *, ipha, ip6_t *, NULL, int,
					    0);

					putnext(q, first_mp);
				} else {
					BUMP_MIB(out_ill->ill_ip_mib,
					    ipIfStatsOutDiscards);
					xmit_drop = B_TRUE;
					freemsg(first_mp);
				}
			} else {
				/*
				 * Safety Pup says: make sure this
				 * is going to the right interface!
				 */
				ill_t *ill1 =
				    (ill_t *)ire->ire_stq->q_ptr;
				int ifindex =
				    ill1->ill_phyint->phyint_ifindex;
				if (ifindex !=
				    io->ipsec_out_capab_ill_index) {
					xmit_drop = B_TRUE;
					freemsg(mp);
				} else {
					UPDATE_IP_MIB_OB_COUNTERS(ill1,
					    pkt_len);

					DTRACE_IP7(send, mblk_t *, first_mp,
					    conn_t *, NULL, void_ip_t *, ipha,
					    __dtrace_ipsr_ill_t *, ill1,
					    ipha_t *, ipha, ip6_t *, NULL,
					    int, 0);

					ipsec_hw_putnext(ire->ire_stq, mp);
				}
			}
next_mp:
			mp = nxt_mp;
		} /* while (mp != NULL) */
		if (xmit_drop)
			return (SEND_FAILED);
		else
			return (SEND_PASSED);

	case ND_INITIAL:
	case ND_INCOMPLETE:

		/*
		 * While we do send off packets to dests that
		 * use fully-resolved CGTP routes, we do not
		 * handle unresolved CGTP routes.
		 */
		ASSERT(!(ire->ire_flags & RTF_MULTIRT));
		ASSERT(io == NULL || !io->ipsec_out_accelerated);

		if (mp != NULL) {
			/* queue the packet */
			nce_queue_mp_common(arpce, mp, B_FALSE);
		}

		if (arpce->nce_state == ND_INCOMPLETE) {
			mutex_exit(&arpce->nce_lock);
			DTRACE_PROBE3(ip__xmit__incomplete,
			    (ire_t *), ire, (mblk_t *), mp,
			    (ipsec_out_t *), io);
			return (LOOKUP_IN_PROGRESS);
		}

		arpce->nce_state = ND_INCOMPLETE;
		mutex_exit(&arpce->nce_lock);
		/*
		 * Note that ire_add() (called from ire_forward())
		 * holds a ref on the ire until ARP is completed.
		 */

		ire_arpresolve(ire, ire_to_ill(ire));
		return (LOOKUP_IN_PROGRESS);
	default:
		ASSERT(0);
		mutex_exit(&arpce->nce_lock);
		return (LLHDR_RESLV_FAILED);
	}
}

#undef UPDATE_IP_MIB_OB_COUNTERS
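/*
 * Illustrative sketch (not part of the original source): per the
 * contract above, a forwarding-path caller tags the packet's
 * disposition in b_prev before handing it to ip_xmit_v4();
 * SET_BPREV_FLAG() (defined earlier in this file) encodes the
 * ip_proc_t value in the pointer.
 *
 *	mp->b_prev = SET_BPREV_FLAG(IPP_FWD_OUT);
 *	if (ip_xmit_v4(mp, ire, NULL, B_FALSE) == SEND_FAILED) {
 *		... one or more packets were dropped (and freed) ...
 *	}
 */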
/*
 * Return B_TRUE if the buffers differ in length or content.
 * This is used for comparing extension header buffers.
 * Note that an extension header would be declared different
 * even if all that changed is the next header value in that header,
 * i.e., what really changed is the next extension header.
 */
boolean_t
ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
    uint_t blen)
{
	if (!b_valid)
		blen = 0;

	if (alen != blen)
		return (B_TRUE);
	if (alen == 0)
		return (B_FALSE);	/* Both zero length */
	return (bcmp(abuf, bbuf, alen));
}

/*
 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
 * Return B_FALSE if memory allocation fails - don't change any state!
 */
boolean_t
ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	void *dst;

	if (!src_valid)
		srclen = 0;

	ASSERT(*dstlenp == 0);
	if (src != NULL && srclen != 0) {
		dst = mi_alloc(srclen, BPRI_MED);
		if (dst == NULL)
			return (B_FALSE);
	} else {
		dst = NULL;
	}
	if (*dstp != NULL)
		mi_free(*dstp);
	*dstp = dst;
	*dstlenp = dst == NULL ? 0 : srclen;
	return (B_TRUE);
}

/*
 * Replace what is in *dst, *dstlen with the source.
 * Assumes ip_allocbuf has already been called.
 */
void
ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
    const void *src, uint_t srclen)
{
	if (!src_valid)
		srclen = 0;

	ASSERT(*dstlenp == srclen);
	if (src != NULL && srclen != 0)
		bcopy(src, *dstp, srclen);
}
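/*
 * Illustrative sketch (not part of the original source): the three
 * helpers above are used together to update a saved option buffer;
 * the variable names are hypothetical.  ip_allocbuf() requires the
 * destination length to be zero on entry (see the ASSERT above).
 *
 *	uint_t tmplen = 0;
 *	void *tmp = NULL;
 *
 *	if (ip_cmpbuf(saved, savedlen, src_valid, src, srclen)) {
 *		if (!ip_allocbuf(&tmp, &tmplen, src_valid, src, srclen))
 *			return (ENOMEM);	(no state has changed)
 *		ip_savebuf(&tmp, &tmplen, src_valid, src, srclen);
 *		... publish tmp/tmplen as the new saved buffer ...
 *	}
 */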
/*
 * Free the storage pointed to by the members of an ip6_pkt_t.
 */
void
ip6_pkt_free(ip6_pkt_t *ipp)
{
	ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU));

	if (ipp->ipp_fields & IPPF_HOPOPTS) {
		kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
		ipp->ipp_hopopts = NULL;
		ipp->ipp_hopoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
		kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
		ipp->ipp_rtdstopts = NULL;
		ipp->ipp_rtdstoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_DSTOPTS) {
		kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
		ipp->ipp_dstopts = NULL;
		ipp->ipp_dstoptslen = 0;
	}
	if (ipp->ipp_fields & IPPF_RTHDR) {
		kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
		ipp->ipp_rthdr = NULL;
		ipp->ipp_rthdrlen = 0;
	}
	ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
	    IPPF_RTHDR);
}
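/*
 * Illustrative sketch (not part of the original source): a caller that
 * attached hop-by-hop options to an ip6_pkt_t releases them through
 * ip6_pkt_free().
 *
 *	ipp->ipp_hopopts = kmem_alloc(optlen, KM_SLEEP);
 *	ipp->ipp_hopoptslen = optlen;
 *	ipp->ipp_fields |= IPPF_HOPOPTS;
 *	...
 *	ip6_pkt_free(ipp);	(frees the buffers, clears the flags)
 */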