1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/dlpi.h> 31 #include <sys/stropts.h> 32 #include <sys/sysmacros.h> 33 #include <sys/strsubr.h> 34 #include <sys/strlog.h> 35 #include <sys/strsun.h> 36 #include <sys/zone.h> 37 #define _SUN_TPI_VERSION 2 38 #include <sys/tihdr.h> 39 #include <sys/xti_inet.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/debug.h> 44 #include <sys/kobj.h> 45 #include <sys/modctl.h> 46 #include <sys/atomic.h> 47 #include <sys/policy.h> 48 #include <sys/priv.h> 49 50 #include <sys/systm.h> 51 #include <sys/param.h> 52 #include <sys/kmem.h> 53 #include <sys/sdt.h> 54 #include <sys/socket.h> 55 #include <sys/vtrace.h> 56 #include <sys/isa_defs.h> 57 #include <sys/mac.h> 58 #include <net/if.h> 59 #include <net/if_arp.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <net/if_dl.h> 64 65 #include <inet/common.h> 66 #include <inet/mi.h> 67 #include <inet/mib2.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/snmpcom.h> 71 #include <inet/optcom.h> 72 #include <inet/kstatcom.h> 73 74 #include <netinet/igmp_var.h> 75 #include <netinet/ip6.h> 76 #include <netinet/icmp6.h> 77 #include <netinet/sctp.h> 78 79 #include <inet/ip.h> 80 #include <inet/ip_impl.h> 81 #include <inet/ip6.h> 82 #include <inet/ip6_asp.h> 83 #include <inet/tcp.h> 84 #include <inet/tcp_impl.h> 85 #include <inet/ip_multi.h> 86 #include <inet/ip_if.h> 87 #include <inet/ip_ire.h> 88 #include <inet/ip_ftable.h> 89 #include <inet/ip_rts.h> 90 #include <inet/ip_ndp.h> 91 #include <inet/ip_listutils.h> 92 #include <netinet/igmp.h> 93 #include <netinet/ip_mroute.h> 94 #include <inet/ipp_common.h> 95 96 #include <net/pfkeyv2.h> 97 #include <inet/ipsec_info.h> 98 #include <inet/sadb.h> 99 #include <inet/ipsec_impl.h> 100 #include <sys/iphada.h> 101 #include <inet/tun.h> 102 #include <inet/ipdrop.h> 103 #include <inet/ip_netinfo.h> 104 105 #include 
<sys/ethernet.h> 106 #include <net/if_types.h> 107 #include <sys/cpuvar.h> 108 109 #include <ipp/ipp.h> 110 #include <ipp/ipp_impl.h> 111 #include <ipp/ipgpc/ipgpc.h> 112 113 #include <sys/multidata.h> 114 #include <sys/pattr.h> 115 116 #include <inet/ipclassifier.h> 117 #include <inet/sctp_ip.h> 118 #include <inet/sctp/sctp_impl.h> 119 #include <inet/udp_impl.h> 120 #include <inet/rawip_impl.h> 121 #include <inet/rts_impl.h> 122 #include <sys/sunddi.h> 123 124 #include <sys/tsol/label.h> 125 #include <sys/tsol/tnet.h> 126 127 #include <rpc/pmap_prot.h> 128 129 /* 130 * Values for squeue switch: 131 * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain 132 * IP_SQUEUE_ENTER: squeue_enter 133 * IP_SQUEUE_FILL: squeue_fill 134 */ 135 int ip_squeue_enter = 2; /* Settable in /etc/system */ 136 137 squeue_func_t ip_input_proc; 138 #define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) 139 140 /* 141 * Settable in /etc/system 142 */ 143 int ip_poll_normal_ms = 100; 144 int ip_poll_normal_ticks = 0; 145 int ip_modclose_ackwait_ms = 3000; 146 147 /* 148 * It would be nice to have these present only in DEBUG systems, but the 149 * current design of the global symbol checking logic requires them to be 150 * unconditionally present. 151 */ 152 uint_t ip_thread_data; /* TSD key for debug support */ 153 krwlock_t ip_thread_rwlock; 154 list_t ip_thread_list; 155 156 /* 157 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. 158 */ 159 160 struct listptr_s { 161 mblk_t *lp_head; /* pointer to the head of the list */ 162 mblk_t *lp_tail; /* pointer to the tail of the list */ 163 }; 164 165 typedef struct listptr_s listptr_t; 166 167 /* 168 * This is used by ip_snmp_get_mib2_ip_route_media and 169 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
170 */ 171 typedef struct iproutedata_s { 172 uint_t ird_idx; 173 listptr_t ird_route; /* ipRouteEntryTable */ 174 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 175 listptr_t ird_attrs; /* ipRouteAttributeTable */ 176 } iproutedata_t; 177 178 /* 179 * Cluster specific hooks. These should be NULL when booted as a non-clustered system. 180 */ 181 182 /* 183 * Hook functions to enable cluster networking 184 * On non-clustered systems these vectors must always be NULL. 185 * 186 * Hook function to check whether the specified IP address is a shared IP address 187 * in the cluster 188 * 189 */ 190 int (*cl_inet_isclusterwide)(uint8_t protocol, 191 sa_family_t addr_family, uint8_t *laddrp) = NULL; 192 193 /* 194 * Hook function to generate cluster wide ip fragment identifier 195 */ 196 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 197 uint8_t *laddrp, uint8_t *faddrp) = NULL; 198 199 /* 200 * Hook function to generate cluster wide SPI. 201 */ 202 void (*cl_inet_getspi)(uint8_t, uint8_t *, size_t) = NULL; 203 204 /* 205 * Hook function to verify if the SPI is already utilized. 206 */ 207 208 int (*cl_inet_checkspi)(uint8_t, uint32_t) = NULL; 209 210 /* 211 * Hook function to delete the SPI from the cluster wide repository. 212 */ 213 214 void (*cl_inet_deletespi)(uint8_t, uint32_t) = NULL; 215 216 /* 217 * Hook function to inform the cluster when a packet is received on an IDLE SA 218 */ 219 220 void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t, 221 in6_addr_t) = NULL; 222 223 /* 224 * Synchronization notes: 225 * 226 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 227 * MT level protection given by STREAMS. IP uses a combination of its own 228 * internal serialization mechanism and standard Solaris locking techniques. 229 * The internal serialization is per phyint (no IPMP) or per IPMP group.
230 * This is used to serialize plumbing operations, IPMP operations, certain 231 * multicast operations, most set ioctls, igmp/mld timers etc. 232 * 233 * Plumbing is a long sequence of operations involving message 234 * exchanges between IP, ARP and device drivers. Many set ioctls are typically 235 * involved in plumbing operations. A natural model is to serialize these 236 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 237 * parallel without any interference. But various set ioctls on hme0 are best 238 * serialized. However if the system uses IPMP, the operations are easier if 239 * they are serialized on a per IPMP group basis since IPMP operations 240 * happen across ill's of a group. Thus the lowest common denominator is to 241 * serialize most set ioctls, multicast join/leave operations, IPMP operations, 242 * igmp/mld timer operations, and processing of DLPI control messages received 243 * from drivers on a per IPMP group basis. If the system does not employ 244 * IPMP the serialization is on a per phyint basis. This serialization is 245 * provided by the ipsq_t and primitives operating on this. Details can 246 * be found in ip_if.c above the core primitives operating on ipsq_t. 247 * 248 * Lookups of an ipif or ill by a thread return a refheld ipif / ill. 249 * Similarly, lookup of an ire by a thread also returns a refheld ire. 250 * In addition ipif's and ill's referenced by the ire are also indirectly 251 * refheld. Thus no ipif or ill can vanish nor can critical parameters like 252 * the ipif's address or netmask change as long as an ipif is refheld 253 * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the 254 * address of an ipif has to go through the ipsq_t. This ensures that only 255 * 1 such exclusive operation proceeds at any time on the ipif. It then 256 * deletes all ires associated with this ipif, and waits for all refcnts 257 * associated with this ipif to come down to zero.
The address is changed 258 * only after the ipif has been quiesced. Then the ipif is brought up again. 259 * More details are described above the comment in ip_sioctl_flags. 260 * 261 * Packet processing is based mostly on IREs and are fully multi-threaded 262 * using standard Solaris MT techniques. 263 * 264 * There are explicit locks in IP to handle: 265 * - The ip_g_head list maintained by mi_open_link() and friends. 266 * 267 * - The reassembly data structures (one lock per hash bucket) 268 * 269 * - conn_lock is meant to protect conn_t fields. The fields actually 270 * protected by conn_lock are documented in the conn_t definition. 271 * 272 * - ire_lock to protect some of the fields of the ire, IRE tables 273 * (one lock per hash bucket). Refer to ip_ire.c for details. 274 * 275 * - ndp_g_lock and nce_lock for protecting NCEs. 276 * 277 * - ill_lock protects fields of the ill and ipif. Details in ip.h 278 * 279 * - ill_g_lock: This is a global reader/writer lock. Protects the following 280 * * The AVL tree based global multi list of all ills. 281 * * The linked list of all ipifs of an ill 282 * * The <ill-ipsq> mapping 283 * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next 284 * * The illgroup list threaded by ill_group_next. 285 * * <ill-phyint> association 286 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 287 * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion 288 * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill 289 * will all have to hold the ill_g_lock as writer for the actual duration 290 * of the insertion/deletion/change. More details about the <ill-ipsq> mapping 291 * may be found in the IPMP section. 292 * 293 * - ill_lock: This is a per ill mutex. 294 * It protects some members of the ill and is documented below. 295 * It also protects the <ill-ipsq> mapping 296 * It also protects the illgroup list threaded by ill_group_next. 
297 * It also protects the <ill-phyint> assoc. 298 * It also protects the list of ipifs hanging off the ill. 299 * 300 * - ipsq_lock: This is a per ipsq_t mutex lock. 301 * This protects all the other members of the ipsq struct except 302 * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock 303 * 304 * - illgrp_lock: This is a per ill_group mutex lock. 305 * The only thing it protects is the illgrp_ill_schednext member of ill_group 306 * which dictates which is the next ill in an ill_group that is to be chosen 307 * for sending outgoing packets, through creation of an IRE_CACHE that 308 * references this ill. 309 * 310 * - phyint_lock: This is a per phyint mutex lock. Protects just the 311 * phyint_flags 312 * 313 * - ip_g_nd_lock: This is a global reader/writer lock. 314 * Any call to nd_load to load a new parameter to the ND table must hold the 315 * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 316 * as reader. 317 * 318 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. 319 * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the 320 * uniqueness check also done atomically. 321 * 322 * - ipsec_capab_ills_lock: This readers/writer lock protects the global 323 * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken 324 * as a writer when adding or deleting elements from these lists, and 325 * as a reader when walking these lists to send a SADB update to the 326 * IPsec capable ills. 327 * 328 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc 329 * group list linked by ill_usesrc_grp_next. It also protects the 330 * ill_usesrc_ifindex field. It is taken as a writer when a member of the 331 * group is being added or deleted. This lock is taken as a reader when 332 * walking the list/group(eg: to get the number of members in a usesrc group). 
333 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next 334 * field is changing state i.e from NULL to non-NULL or vice-versa. For 335 * example, it is not necessary to take this lock in the initial portion 336 * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and 337 * ip_sioctl_flags since these operations are executed exclusively and 338 * that ensures that the "usesrc group state" cannot change. The "usesrc 339 * group state" change can happen only in the latter part of 340 * ip_sioctl_slifusesrc and in ill_delete. 341 * 342 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations. 343 * 344 * To change the <ill-phyint> association, the ill_g_lock must be held 345 * as writer, and the ill_locks of both the v4 and v6 instances of the ill 346 * must be held. 347 * 348 * To change the <ill-ipsq> association the ill_g_lock must be held as writer 349 * and the ill_lock of the ill in question must be held. 350 * 351 * To change the <ill-illgroup> association the ill_g_lock must be held as 352 * writer and the ill_lock of the ill in question must be held. 353 * 354 * To add or delete an ipif from the list of ipifs hanging off the ill, 355 * ill_g_lock (writer) and ill_lock must be held and the thread must be 356 * a writer on the associated ipsq. 357 * 358 * To add or delete an ill to the system, the ill_g_lock must be held as 359 * writer and the thread must be a writer on the associated ipsq. 360 * 361 * To add or delete an ilm to an ill, the ill_lock must be held and the thread 362 * must be a writer on the associated ipsq. 363 * 364 * Lock hierarchy 365 * 366 * Some lock hierarchy scenarios are listed below.
367 * 368 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 369 * ill_g_lock -> illgrp_lock -> ill_lock 370 * ill_g_lock -> ill_lock(s) -> phyint_lock 371 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock 372 * ill_g_lock -> ip_addr_avail_lock 373 * conn_lock -> irb_lock -> ill_lock -> ire_lock 374 * ill_g_lock -> ip_g_nd_lock 375 * 376 * When more than 1 ill lock is needed to be held, all ill lock addresses 377 * are sorted on address and locked starting from highest addressed lock 378 * downward. 379 * 380 * IPsec scenarios 381 * 382 * ipsa_lock -> ill_g_lock -> ill_lock 383 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock 384 * ipsec_capab_ills_lock -> ipsa_lock 385 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 386 * 387 * Trusted Solaris scenarios 388 * 389 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 390 * igsa_lock -> gcdb_lock 391 * gcgrp_rwlock -> ire_lock 392 * gcgrp_rwlock -> gcdb_lock 393 * 394 * 395 * Routing/forwarding table locking notes: 396 * 397 * Lock acquisition order: Radix tree lock, irb_lock. 398 * Requirements: 399 * i. Walker must not hold any locks during the walker callback. 400 * ii Walker must not see a truncated tree during the walk because of any node 401 * deletion. 402 * iii Existing code assumes ire_bucket is valid if it is non-null and is used 403 * in many places in the code to walk the irb list. Thus even if all the 404 * ires in a bucket have been deleted, we still can't free the radix node 405 * until the ires have actually been inactive'd (freed). 406 * 407 * Tree traversal - Need to hold the global tree lock in read mode. 408 * Before dropping the global tree lock, need to either increment the ire_refcnt 409 * to ensure that the radix node can't be deleted. 410 * 411 * Tree add - Need to hold the global tree lock in write mode to add a 412 * radix node. To prevent the node from being deleted, increment the 413 * irb_refcnt, after the node is added to the tree. 
The ire itself is 414 * added later while holding the irb_lock, but not the tree lock. 415 * 416 * Tree delete - Need to hold the global tree lock and irb_lock in write mode. 417 * All associated ires must be inactive (i.e. freed), and irb_refcnt 418 * must be zero. 419 * 420 * Walker - Increment irb_refcnt before calling the walker callback. Hold the 421 * global tree lock (read mode) for traversal. 422 * 423 * IPsec notes : 424 * 425 * IP interacts with the IPsec code (AH/ESP) by tagging an M_CTL message 426 * in front of the actual packet. For outbound datagrams, the M_CTL 427 * contains an ipsec_out_t (defined in ipsec_info.h), which has the 428 * information used by the IPsec code for applying the right level of 429 * protection. The information initialized by IP in the ipsec_out_t 430 * is determined by the per-socket policy or global policy in the system. 431 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in 432 * ipsec_info.h) which starts out with nothing in it. It gets filled 433 * with the right information if it goes through the AH/ESP code, which 434 * happens if the incoming packet is secure. The information initialized 435 * by AH/ESP is later used by IP (during fanouts to ULP) to see whether 436 * the policy requirements needed by per-socket policy or global policy 437 * are met or not. 438 * 439 * If there is both per-socket policy (set using setsockopt) and there 440 * is also global policy match for the 5 tuples of the socket, 441 * ipsec_override_policy() makes the decision of which one to use. 442 * 443 * For fully connected sockets i.e dst, src [addr, port] is known, 444 * conn_policy_cached is set indicating that policy has been cached. 445 * conn_in_enforce_policy may or may not be set depending on whether 446 * there is a global policy match or per-socket policy match. 447 * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
448 * Once the right policy is set on the conn_t, policy cannot change for 449 * this socket. This makes life simpler for TCP (UDP ?) where 450 * re-transmissions go out with the same policy. For symmetry, policy 451 * is cached for fully connected UDP sockets also. Thus if policy is cached, 452 * it also implies that policy is latched i.e policy cannot change 453 * on these sockets. As we have the right policy on the conn, we don't 454 * have to lookup global policy for every outbound and inbound datagram 455 * and thus serving as an optimization. Note that a global policy change 456 * does not affect fully connected sockets if they have policy. If fully 457 * connected sockets did not have any policy associated with it, global 458 * policy change may affect them. 459 * 460 * IP Flow control notes: 461 * 462 * Non-TCP streams are flow controlled by IP. On the send side, if the packet 463 * cannot be sent down to the driver by IP, because of a canput failure, IP 464 * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. 465 * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained 466 * when the flowcontrol condition subsides. Ultimately STREAMS backenables the 467 * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the 468 * first conn in the list of conn's to be drained. ip_wsrv on this conn drains 469 * the queued messages, and removes the conn from the drain list, if all 470 * messages were drained. It also qenables the next conn in the drain list to 471 * continue the drain process. 472 * 473 * In reality the drain list is not a single list, but a configurable number 474 * of lists. The ip_wsrv on the IP module, qenables the first conn in each 475 * list. If the ip_wsrv of the next qenabled conn does not run, because the 476 * stream closes, ip_close takes responsibility to qenable the next conn in 477 * the drain list. 
The directly called ip_wput path always does a putq, if 478 * it cannot putnext. Thus synchronization problems are handled between 479 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only 480 * functions that manipulate this drain list. Furthermore conn_drain_insert 481 * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv 482 * running on a queue at any time. conn_drain_tail can be simultaneously called 483 * from both ip_wsrv and ip_close. 484 * 485 * IPQOS notes: 486 * 487 * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 488 * and IPQoS modules. IPPF includes hooks in IP at different control points 489 * (callout positions) which direct packets to IPQoS modules for policy 490 * processing. Policies, if present, are global. 491 * 492 * The callout positions are located in the following paths: 493 * o local_in (packets destined for this host) 494 * o local_out (packets originating from this host) 495 * o fwd_in (packets forwarded by this m/c - inbound) 496 * o fwd_out (packets forwarded by this m/c - outbound) 497 * Hooks at these callout points can be enabled/disabled using the ndd variable 498 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). 499 * By default all the callout positions are enabled. 500 * 501 * Outbound (local_out) 502 * Hooks are placed in ip_wput_ire and ipsec_out_process. 503 * 504 * Inbound (local_in) 505 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and 506 * TCP and UDP fanout routines. 507 * 508 * Forwarding (in and out) 509 * Hooks are placed in ip_rput_forward. 510 * 511 * IP Policy Framework processing (IPPF processing) 512 * Policy processing for a packet is initiated by ip_process, which ascertains 513 * that the classifier (ipgpc) is loaded and configured, failing which the 514 * packet resumes normal processing in IP.
If the classifier is present, the 515 * packet is acted upon by one or more IPQoS modules (action instances), per 516 * filters configured in ipgpc and resumes normal IP processing thereafter. 517 * An action instance can drop a packet in the course of its processing. 518 * 519 * A boolean variable, ip_policy, is used in all the fanout routines that can 520 * invoke ip_process for a packet. This variable indicates if the packet should 521 * be sent for policy processing. The variable is set to B_TRUE by default, 522 * i.e. when the routines are invoked in the normal ip processing path for a 523 * packet. The two exceptions are ip_wput_local and icmp_inbound_error_fanout; 524 * ip_policy is set to B_FALSE for all the routines called in these two 525 * functions because, in the former case, we don't process loopback traffic 526 * currently while in the latter, the packets have already been processed in 527 * icmp_inbound. 528 * 529 * Zones notes: 530 * 531 * The partitioning rules for networking are as follows: 532 * 1) Packets coming from a zone must have a source address belonging to that 533 * zone. 534 * 2) Packets coming from a zone can only be sent on a physical interface on 535 * which the zone has an IP address. 536 * 3) Between two zones on the same machine, packet delivery is only allowed if 537 * there's a matching route for the destination and zone in the forwarding 538 * table. 539 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 540 * different zones can bind to the same port with the wildcard address 541 * (INADDR_ANY). 542 * 543 * The granularity of interface partitioning is at the logical interface level. 544 * Therefore, every zone has its own IP addresses, and incoming packets can be 545 * attributed to a zone unambiguously. A logical interface is placed into a zone 546 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 547 * structure.
Rule (1) is implemented by modifying the source address selection 548 * algorithm so that the list of eligible addresses is filtered based on the 549 * sending process's zone. 550 * 551 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 552 * across all zones, depending on their type. Here is the break-up: 553 * 554 * IRE type Shared/exclusive 555 * -------- ---------------- 556 * IRE_BROADCAST Exclusive 557 * IRE_DEFAULT (default routes) Shared (*) 558 * IRE_LOCAL Exclusive (x) 559 * IRE_LOOPBACK Exclusive 560 * IRE_PREFIX (net routes) Shared (*) 561 * IRE_CACHE Exclusive 562 * IRE_IF_NORESOLVER (interface routes) Exclusive 563 * IRE_IF_RESOLVER (interface routes) Exclusive 564 * IRE_HOST (host routes) Shared (*) 565 * 566 * (*) A zone can only use a default or off-subnet route if the gateway is 567 * directly reachable from the zone, that is, if the gateway's address matches 568 * one of the zone's logical interfaces. 569 * 570 * (x) IRE_LOCAL are handled a bit differently, since for all other entries 571 * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source 572 * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP 573 * address of the zone itself (the destination). Since IRE_LOCAL is used 574 * for communication between zones, ip_wput_ire has special logic to set 575 * the right source address when sending using an IRE_LOCAL. 576 * 577 * Furthermore, when ip_restrict_interzone_loopback is set (the default), 578 * ire_cache_lookup restricts loopback using an IRE_LOCAL 579 * between zones to the case when L2 would have conceptually looped the packet 580 * back, i.e. the loopback which is required since neither Ethernet drivers 581 * nor Ethernet hardware loops them back. This is the case when the normal 582 * routes (ignoring IREs with different zoneids) would send out the packet on 583 * the same ill (or ill group) as the ill with which the IRE_LOCAL is 584 * associated.
585 * 586 * Multiple zones can share a common broadcast address; typically all zones 587 * share the 255.255.255.255 address. Incoming as well as locally originated 588 * broadcast packets must be dispatched to all the zones on the broadcast 589 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial 590 * since some zones may not be on the 10.16.72/24 network. To handle this, each 591 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 592 * sent to every zone that has an IRE_BROADCAST entry for the destination 593 * address on the input ill, see conn_wantpacket(). 594 * 595 * Applications in different zones can join the same multicast group address. 596 * For IPv4, group memberships are per-logical interface, so they're already 597 * inherently part of a zone. For IPv6, group memberships are per-physical 598 * interface, so we distinguish IPv6 group memberships based on group address, 599 * interface and zoneid. In both cases, received multicast packets are sent to 600 * every zone for which a group membership entry exists. On IPv6 we need to 601 * check that the target zone still has an address on the receiving physical 602 * interface; it could have been removed since the application issued the 603 * IPV6_JOIN_GROUP. 604 */ 605 606 /* 607 * Squeue Fanout flags: 608 * 0: No fanout. 609 * 1: Fanout across all squeues 610 */ 611 boolean_t ip_squeue_fanout = 0; 612 613 /* 614 * Maximum dups allowed per packet. 
615 */ 616 uint_t ip_max_frag_dups = 10; 617 618 #define IS_SIMPLE_IPH(ipha) \ 619 ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) 620 621 /* RFC1122 Conformance */ 622 #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER 623 624 #define ILL_MAX_NAMELEN LIFNAMSIZ 625 626 static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); 627 628 static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, 629 cred_t *credp, boolean_t isv6); 630 static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t, 631 ipha_t **); 632 633 static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t, 634 ip_stack_t *); 635 static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, 636 uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); 637 static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp); 638 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t, 639 mblk_t *, int, ip_stack_t *); 640 static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, 641 icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, 642 ill_t *, zoneid_t); 643 static void icmp_options_update(ipha_t *); 644 static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t, 645 ip_stack_t *); 646 static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t, 647 zoneid_t zoneid, ip_stack_t *); 648 static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *); 649 static void icmp_redirect(ill_t *, mblk_t *); 650 static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, 651 ip_stack_t *); 652 653 static void ip_arp_news(queue_t *, mblk_t *); 654 static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *, 655 ip_stack_t *); 656 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); 657 char *ip_dot_addr(ipaddr_t, char *); 658 mblk_t *ip_carve_mp(mblk_t **, ssize_t); 659 int ip_close(queue_t *, int); 660 static char *ip_dot_saddr(uchar_t *, char *); 661 static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, 
uint_t, 662 boolean_t, boolean_t, ill_t *, zoneid_t); 663 static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 664 boolean_t, boolean_t, zoneid_t); 665 static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, 666 boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); 667 static void ip_lrput(queue_t *, mblk_t *); 668 ipaddr_t ip_net_mask(ipaddr_t); 669 void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, 670 ip_stack_t *); 671 static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, 672 conn_t *, uint32_t, zoneid_t, ip_opt_info_t *); 673 char *ip_nv_lookup(nv_t *, int); 674 static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); 675 static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); 676 static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); 677 static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, 678 ipndp_t *, size_t); 679 static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 680 void ip_rput(queue_t *, mblk_t *); 681 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 682 void *dummy_arg); 683 void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); 684 static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *, 685 ip_stack_t *); 686 static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, 687 ire_t *, ip_stack_t *); 688 static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, 689 mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); 690 static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, 691 ip_stack_t *); 692 static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, 693 uint16_t *); 694 int ip_snmp_get(queue_t *, mblk_t *, int); 695 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, 696 mib2_ipIfStatsEntry_t *, ip_stack_t *); 697 static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *, 698 ip_stack_t *); 699 static 
mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *); 700 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst); 701 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst); 702 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst); 703 static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst); 704 static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *, 705 ip_stack_t *ipst); 706 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *, 707 ip_stack_t *ipst); 708 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *, 709 ip_stack_t *ipst); 710 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *, 711 ip_stack_t *ipst); 712 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *, 713 ip_stack_t *ipst); 714 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *, 715 ip_stack_t *ipst); 716 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, 717 ip_stack_t *ipst); 718 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, 719 ip_stack_t *ipst); 720 static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, 721 ip_stack_t *ipst); 722 static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, 723 ip_stack_t *ipst); 724 static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); 725 static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); 726 static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); 727 int ip_snmp_set(queue_t *, int, int, uchar_t *, int); 728 static boolean_t ip_source_routed(ipha_t *, ip_stack_t *); 729 static boolean_t ip_source_route_included(ipha_t *); 730 static void ip_trash_ire_reclaim_stack(ip_stack_t *); 731 732 static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, 733 zoneid_t, ip_stack_t *); 734 static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *); 735 static void ip_wput_local_options(ipha_t *, ip_stack_t *); 736 
static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, 737 zoneid_t, ip_stack_t *); 738 739 static void conn_drain_init(ip_stack_t *); 740 static void conn_drain_fini(ip_stack_t *); 741 static void conn_drain_tail(conn_t *connp, boolean_t closing); 742 743 static void conn_walk_drain(ip_stack_t *); 744 static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, 745 zoneid_t); 746 747 static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); 748 static void ip_stack_shutdown(netstackid_t stackid, void *arg); 749 static void ip_stack_fini(netstackid_t stackid, void *arg); 750 751 static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, 752 zoneid_t); 753 static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 754 void *dummy_arg); 755 756 static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 757 758 static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, 759 ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, 760 conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); 761 static void ip_multirt_bad_mtu(ire_t *, uint32_t); 762 763 static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); 764 static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, 765 caddr_t, cred_t *); 766 extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, 767 caddr_t cp, cred_t *cr); 768 extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, 769 cred_t *); 770 static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 771 caddr_t cp, cred_t *cr); 772 static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, 773 cred_t *); 774 static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, 775 cred_t *); 776 static squeue_func_t ip_squeue_switch(int); 777 778 static void *ip_kstat_init(netstackid_t, ip_stack_t *); 779 static void ip_kstat_fini(netstackid_t, kstat_t *); 780 static int ip_kstat_update(kstat_t *kp, int 
rw); 781 static void *icmp_kstat_init(netstackid_t); 782 static void icmp_kstat_fini(netstackid_t, kstat_t *); 783 static int icmp_kstat_update(kstat_t *kp, int rw); 784 static void *ip_kstat2_init(netstackid_t, ip_stat_t *); 785 static void ip_kstat2_fini(netstackid_t, kstat_t *); 786 787 static int ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *); 788 789 static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, 790 ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); 791 792 static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, 793 ipha_t *, ill_t *, boolean_t); 794 ipaddr_t ip_g_all_ones = IP_HOST_MASK; 795 796 /* How long, in seconds, we allow frags to hang around. */ 797 #define IP_FRAG_TIMEOUT 60 798 799 /* 800 * Threshold which determines whether MDT should be used when 801 * generating IP fragments; payload size must be greater than 802 * this threshold for MDT to take place. 803 */ 804 #define IP_WPUT_FRAG_MDT_MIN 32768 805 806 /* Setable in /etc/system only */ 807 int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; 808 809 static long ip_rput_pullups; 810 int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ 811 812 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */ 813 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ 814 815 int ip_debug; 816 817 #ifdef DEBUG 818 uint32_t ipsechw_debug = 0; 819 #endif 820 821 /* 822 * Multirouting/CGTP stuff 823 */ 824 int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ 825 826 /* 827 * XXX following really should only be in a header. Would need more 828 * header and .c clean up first. 829 */ 830 extern optdb_obj_t ip_opt_obj; 831 832 ulong_t ip_squeue_enter_unbound = 0; 833 834 /* 835 * Named Dispatch Parameter Table. 836 * All of these are alterable, within the min/max values given, at run time. 
837 */ 838 static ipparam_t lcl_param_arr[] = { 839 /* min max value name */ 840 { 0, 1, 0, "ip_respond_to_address_mask_broadcast"}, 841 { 0, 1, 1, "ip_respond_to_echo_broadcast"}, 842 { 0, 1, 1, "ip_respond_to_echo_multicast"}, 843 { 0, 1, 0, "ip_respond_to_timestamp"}, 844 { 0, 1, 0, "ip_respond_to_timestamp_broadcast"}, 845 { 0, 1, 1, "ip_send_redirects"}, 846 { 0, 1, 0, "ip_forward_directed_broadcasts"}, 847 { 0, 10, 0, "ip_mrtdebug"}, 848 { 5000, 999999999, 60000, "ip_ire_timer_interval" }, 849 { 60000, 999999999, 1200000, "ip_ire_arp_interval" }, 850 { 60000, 999999999, 60000, "ip_ire_redirect_interval" }, 851 { 1, 255, 255, "ip_def_ttl" }, 852 { 0, 1, 0, "ip_forward_src_routed"}, 853 { 0, 256, 32, "ip_wroff_extra" }, 854 { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" }, 855 { 8, 65536, 64, "ip_icmp_return_data_bytes" }, 856 { 0, 1, 1, "ip_path_mtu_discovery" }, 857 { 0, 240, 30, "ip_ignore_delete_time" }, 858 { 0, 1, 0, "ip_ignore_redirect" }, 859 { 0, 1, 1, "ip_output_queue" }, 860 { 1, 254, 1, "ip_broadcast_ttl" }, 861 { 0, 99999, 100, "ip_icmp_err_interval" }, 862 { 1, 99999, 10, "ip_icmp_err_burst" }, 863 { 0, 999999999, 1000000, "ip_reass_queue_bytes" }, 864 { 0, 1, 0, "ip_strict_dst_multihoming" }, 865 { 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"}, 866 { 0, 1, 0, "ipsec_override_persocket_policy" }, 867 { 0, 1, 1, "icmp_accept_clear_messages" }, 868 { 0, 1, 1, "igmp_accept_clear_messages" }, 869 { 2, 999999999, ND_DELAY_FIRST_PROBE_TIME, 870 "ip_ndp_delay_first_probe_time"}, 871 { 1, 999999999, ND_MAX_UNICAST_SOLICIT, 872 "ip_ndp_max_unicast_solicit"}, 873 { 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" }, 874 { 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" }, 875 { 0, 1, 0, "ip6_forward_src_routed"}, 876 { 0, 1, 1, "ip6_respond_to_echo_multicast"}, 877 { 0, 1, 1, "ip6_send_redirects"}, 878 { 0, 1, 0, "ip6_ignore_redirect" }, 879 { 0, 1, 0, "ip6_strict_dst_multihoming" }, 880 881 { 1, 8, 3, "ip_ire_reclaim_fraction" }, 882 883 { 0, 999999, 
1000, "ipsec_policy_log_interval" }, 884 885 { 0, 1, 1, "pim_accept_clear_messages" }, 886 { 1000, 20000, 2000, "ip_ndp_unsolicit_interval" }, 887 { 1, 20, 3, "ip_ndp_unsolicit_count" }, 888 { 0, 1, 1, "ip6_ignore_home_address_opt" }, 889 { 0, 15, 0, "ip_policy_mask" }, 890 { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, 891 { 0, 255, 1, "ip_multirt_ttl" }, 892 { 0, 1, 1, "ip_multidata_outbound" }, 893 { 0, 3600000, 300000, "ip_ndp_defense_interval" }, 894 { 0, 999999, 60*60*24, "ip_max_temp_idle" }, 895 { 0, 1000, 1, "ip_max_temp_defend" }, 896 { 0, 1000, 3, "ip_max_defend" }, 897 { 0, 999999, 30, "ip_defend_interval" }, 898 { 0, 3600000, 300000, "ip_dup_recovery" }, 899 { 0, 1, 1, "ip_restrict_interzone_loopback" }, 900 { 0, 1, 1, "ip_lso_outbound" }, 901 { IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" }, 902 { MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" }, 903 #ifdef DEBUG 904 { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, 905 #else 906 { 0, 0, 0, "" }, 907 #endif 908 }; 909 910 /* 911 * Extended NDP table 912 * The addresses for the first two are filled in to be ips_ip_g_forward 913 * and ips_ipv6_forward at init time. 
914 */ 915 static ipndp_t lcl_ndp_arr[] = { 916 /* getf setf data name */ 917 #define IPNDP_IP_FORWARDING_OFFSET 0 918 { ip_param_generic_get, ip_forward_set, NULL, 919 "ip_forwarding" }, 920 #define IPNDP_IP6_FORWARDING_OFFSET 1 921 { ip_param_generic_get, ip_forward_set, NULL, 922 "ip6_forwarding" }, 923 { ip_ill_report, NULL, NULL, 924 "ip_ill_status" }, 925 { ip_ipif_report, NULL, NULL, 926 "ip_ipif_status" }, 927 { ip_conn_report, NULL, NULL, 928 "ip_conn_status" }, 929 { nd_get_long, nd_set_long, (caddr_t)&ip_rput_pullups, 930 "ip_rput_pullups" }, 931 { ip_srcid_report, NULL, NULL, 932 "ip_srcid_status" }, 933 { ip_param_generic_get, ip_squeue_profile_set, 934 (caddr_t)&ip_squeue_profile, "ip_squeue_profile" }, 935 { ip_param_generic_get, ip_squeue_bind_set, 936 (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, 937 { ip_param_generic_get, ip_input_proc_set, 938 (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, 939 { ip_param_generic_get, ip_int_set, 940 (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, 941 #define IPNDP_CGTP_FILTER_OFFSET 11 942 { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, 943 "ip_cgtp_filter" }, 944 { ip_param_generic_get, ip_int_set, 945 (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }, 946 #define IPNDP_IPMP_HOOK_OFFSET 13 947 { ip_param_generic_get, ipmp_hook_emulation_set, NULL, 948 "ipmp_hook_emulation" }, 949 { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, 950 "ip_debug" }, 951 }; 952 953 /* 954 * Table of IP ioctls encoding the various properties of the ioctl and 955 * indexed based on the last byte of the ioctl command. Occasionally there 956 * is a clash, and there is more than 1 ioctl with the same last byte. 957 * In such a case 1 ioctl is encoded in the ndx table and the remaining 958 * ioctls are encoded in the misc table. An entry in the ndx table is 959 * retrieved by indexing on the last byte of the ioctl command and comparing 960 * the ioctl command with the value in the ndx table. 
In the event of a 961 * mismatch the misc table is then searched sequentially for the desired 962 * ioctl command. 963 * 964 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func> 965 */ 966 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { 967 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 968 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 969 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 970 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 971 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 972 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 973 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 974 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 975 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 976 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 977 978 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV, 979 MISC_CMD, ip_siocaddrt, NULL }, 980 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV, 981 MISC_CMD, ip_siocdelrt, NULL }, 982 983 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 984 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 985 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 986 IF_CMD, ip_sioctl_get_addr, NULL }, 987 988 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 989 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 990 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), 991 IPI_GET_CMD | IPI_REPL, 992 IF_CMD, ip_sioctl_get_dstaddr, NULL }, 993 994 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), 995 IPI_PRIV | IPI_WR | IPI_REPL, 996 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 997 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), 998 IPI_MODOK | IPI_GET_CMD | IPI_REPL, 999 IF_CMD, ip_sioctl_get_flags, NULL }, 1000 1001 /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1002 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1003 1004 /* copyin size cannot be coded for SIOCGIFCONF */ 1005 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD, 1006 
MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1007 1008 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1009 IF_CMD, ip_sioctl_mtu, NULL }, 1010 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1011 IF_CMD, ip_sioctl_get_mtu, NULL }, 1012 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), 1013 IPI_GET_CMD | IPI_REPL, 1014 IF_CMD, ip_sioctl_get_brdaddr, NULL }, 1015 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1016 IF_CMD, ip_sioctl_brdaddr, NULL }, 1017 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), 1018 IPI_GET_CMD | IPI_REPL, 1019 IF_CMD, ip_sioctl_get_netmask, NULL }, 1020 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1021 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1022 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), 1023 IPI_GET_CMD | IPI_REPL, 1024 IF_CMD, ip_sioctl_get_metric, NULL }, 1025 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, 1026 IF_CMD, ip_sioctl_metric, NULL }, 1027 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1028 1029 /* See 166-168 below for extended SIOC*XARP ioctls */ 1030 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, 1031 ARP_CMD, ip_sioctl_arp, NULL }, 1032 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, 1033 ARP_CMD, ip_sioctl_arp, NULL }, 1034 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, 1035 ARP_CMD, ip_sioctl_arp, NULL }, 1036 1037 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1038 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1039 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1040 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1041 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1042 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1043 /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1044 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1045 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1046 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1047 /* 043 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1048 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1049 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1050 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1051 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1052 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1053 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1054 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1055 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1056 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1057 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1058 1059 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK, 1060 MISC_CMD, if_unitsel, if_unitsel_restart }, 1061 1062 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1063 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1064 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1065 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1066 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1067 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1068 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1069 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1070 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1071 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1072 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1073 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1074 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1075 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1076 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1077 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1078 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1079 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1080 1081 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq), 1082 IPI_PRIV | IPI_WR | IPI_MODOK, 1083 IF_CMD, ip_sioctl_sifname, NULL }, 1084 1085 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1086 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1087 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1088 /* 077 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1089 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1090 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1091 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1092 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1093 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1094 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1095 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1096 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1097 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1098 1099 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, 1100 MISC_CMD, ip_sioctl_get_ifnum, NULL }, 1101 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1102 IF_CMD, ip_sioctl_get_muxid, NULL }, 1103 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), 1104 IPI_PRIV | IPI_WR | IPI_REPL, 1105 IF_CMD, ip_sioctl_muxid, NULL }, 1106 1107 /* Both if and lif variants share same func */ 1108 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1109 IF_CMD, ip_sioctl_get_lifindex, NULL }, 1110 /* Both if and lif variants share same func */ 1111 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), 1112 IPI_PRIV | IPI_WR | IPI_REPL, 1113 IF_CMD, ip_sioctl_slifindex, NULL }, 1114 1115 /* copyin size cannot be coded for SIOCGIFCONF */ 1116 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, 1117 MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1118 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1119 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1120 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1121 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1122 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1123 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1124 /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1125 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1126 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1127 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1128 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1129 
/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1130 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1131 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1132 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1133 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1134 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1135 1136 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), 1137 IPI_PRIV | IPI_WR | IPI_REPL, 1138 LIF_CMD, ip_sioctl_removeif, 1139 ip_sioctl_removeif_restart }, 1140 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), 1141 IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, 1142 LIF_CMD, ip_sioctl_addif, NULL }, 1143 #define SIOCLIFADDR_NDX 112 1144 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1145 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 1146 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), 1147 IPI_GET_CMD | IPI_REPL, 1148 LIF_CMD, ip_sioctl_get_addr, NULL }, 1149 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1150 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 1151 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), 1152 IPI_GET_CMD | IPI_REPL, 1153 LIF_CMD, ip_sioctl_get_dstaddr, NULL }, 1154 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), 1155 IPI_PRIV | IPI_WR | IPI_REPL, 1156 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 1157 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), 1158 IPI_GET_CMD | IPI_MODOK | IPI_REPL, 1159 LIF_CMD, ip_sioctl_get_flags, NULL }, 1160 1161 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1162 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1163 1164 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1165 ip_sioctl_get_lifconf, NULL }, 1166 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1167 LIF_CMD, ip_sioctl_mtu, NULL }, 1168 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, 1169 LIF_CMD, ip_sioctl_get_mtu, NULL }, 1170 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), 1171 IPI_GET_CMD | 
IPI_REPL, 1172 LIF_CMD, ip_sioctl_get_brdaddr, NULL }, 1173 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1174 LIF_CMD, ip_sioctl_brdaddr, NULL }, 1175 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), 1176 IPI_GET_CMD | IPI_REPL, 1177 LIF_CMD, ip_sioctl_get_netmask, NULL }, 1178 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1179 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1180 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), 1181 IPI_GET_CMD | IPI_REPL, 1182 LIF_CMD, ip_sioctl_get_metric, NULL }, 1183 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1184 LIF_CMD, ip_sioctl_metric, NULL }, 1185 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), 1186 IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, 1187 LIF_CMD, ip_sioctl_slifname, 1188 ip_sioctl_slifname_restart }, 1189 1190 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, 1191 MISC_CMD, ip_sioctl_get_lifnum, NULL }, 1192 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), 1193 IPI_GET_CMD | IPI_REPL, 1194 LIF_CMD, ip_sioctl_get_muxid, NULL }, 1195 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), 1196 IPI_PRIV | IPI_WR | IPI_REPL, 1197 LIF_CMD, ip_sioctl_muxid, NULL }, 1198 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), 1199 IPI_GET_CMD | IPI_REPL, 1200 LIF_CMD, ip_sioctl_get_lifindex, 0 }, 1201 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), 1202 IPI_PRIV | IPI_WR | IPI_REPL, 1203 LIF_CMD, ip_sioctl_slifindex, 0 }, 1204 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1205 LIF_CMD, ip_sioctl_token, NULL }, 1206 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), 1207 IPI_GET_CMD | IPI_REPL, 1208 LIF_CMD, ip_sioctl_get_token, NULL }, 1209 /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1210 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, 1211 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), 1212 IPI_GET_CMD | IPI_REPL, 1213 LIF_CMD, 
ip_sioctl_get_subnet, NULL }, 1214 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1215 LIF_CMD, ip_sioctl_lnkinfo, NULL }, 1216 1217 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), 1218 IPI_GET_CMD | IPI_REPL, 1219 LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, 1220 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, 1221 LIF_CMD, ip_siocdelndp_v6, NULL }, 1222 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, 1223 LIF_CMD, ip_siocqueryndp_v6, NULL }, 1224 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV, 1225 LIF_CMD, ip_siocsetndp_v6, NULL }, 1226 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1227 MISC_CMD, ip_sioctl_tmyaddr, NULL }, 1228 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1229 MISC_CMD, ip_sioctl_tonlink, NULL }, 1230 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, 1231 MISC_CMD, ip_sioctl_tmysite, NULL }, 1232 /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, 1233 TUN_CMD, ip_sioctl_tunparam, NULL }, 1234 /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), 1235 IPI_PRIV | IPI_WR, 1236 TUN_CMD, ip_sioctl_tunparam, NULL }, 1237 1238 /* IPSECioctls handled in ip_sioctl_copyin_setup itself */ 1239 /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1240 /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1241 /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1242 /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1243 1244 /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), 1245 IPI_PRIV | IPI_WR | IPI_REPL, 1246 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1247 /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), 1248 IPI_PRIV | IPI_WR | IPI_REPL, 1249 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1250 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), 1251 IPI_PRIV | IPI_WR | IPI_REPL, 1252 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 1253 /* 156 */ { 
SIOCGLIFGROUPNAME, sizeof (struct lifreq), 1254 IPI_GET_CMD | IPI_REPL, 1255 LIF_CMD, ip_sioctl_get_groupname, NULL }, 1256 /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), 1257 IPI_GET_CMD | IPI_REPL, 1258 LIF_CMD, ip_sioctl_get_oindex, NULL }, 1259 1260 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ 1261 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1262 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1263 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1264 1265 /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1266 LIF_CMD, ip_sioctl_slifoindex, NULL }, 1267 1268 /* These are handled in ip_sioctl_copyin_setup itself */ 1269 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, 1270 MISC_CMD, NULL, NULL }, 1271 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, 1272 MISC_CMD, NULL, NULL }, 1273 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, 1274 1275 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1276 ip_sioctl_get_lifconf, NULL }, 1277 1278 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, 1279 XARP_CMD, ip_sioctl_arp, NULL }, 1280 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, 1281 XARP_CMD, ip_sioctl_arp, NULL }, 1282 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, 1283 XARP_CMD, ip_sioctl_arp, NULL }, 1284 1285 /* SIOCPOPSOCKFS is not handled by IP */ 1286 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, 1287 1288 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), 1289 IPI_GET_CMD | IPI_REPL, 1290 LIF_CMD, ip_sioctl_get_lifzone, NULL }, 1291 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), 1292 IPI_PRIV | IPI_WR | IPI_REPL, 1293 LIF_CMD, ip_sioctl_slifzone, 1294 ip_sioctl_slifzone_restart }, 1295 /* 172-174 are SCTP ioctls and not handled by IP */ 1296 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1297 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1298 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1299 /* 175 */ { 
SIOCGLIFUSESRC, sizeof (struct lifreq), 1300 IPI_GET_CMD, LIF_CMD, 1301 ip_sioctl_get_lifusesrc, 0 }, 1302 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), 1303 IPI_PRIV | IPI_WR, 1304 LIF_CMD, ip_sioctl_slifusesrc, 1305 NULL }, 1306 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, 1307 ip_sioctl_get_lifsrcof, NULL }, 1308 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, 1309 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1310 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, 1311 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1312 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, 1313 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1314 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, 1315 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1316 /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, 1317 ip_sioctl_set_ipmpfailback, NULL }, 1318 /* SIOCSENABLESDP is handled by SDP */ 1319 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, 1320 }; 1321 1322 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1323 1324 ip_ioctl_cmd_t ip_misc_ioctl_table[] = { 1325 { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), 1326 IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, 1327 { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, 1328 TUN_CMD, ip_sioctl_tunparam, NULL }, 1329 { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1330 { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1331 { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1332 { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1333 { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, 1334 { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1335 { IP_IOCTL, 0, 0, 0, NULL, NULL }, 1336 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD, 1337 MISC_CMD, mrt_ioctl}, 1338 { SIOCGETSGCNT, sizeof (struct 
sioc_sg_req), IPI_REPL | IPI_GET_CMD, 1339 MISC_CMD, mrt_ioctl}, 1340 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, 1341 MISC_CMD, mrt_ioctl} 1342 }; 1343 1344 int ip_misc_ioctl_count = 1345 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1346 1347 int conn_drain_nthreads; /* Number of drainers reqd. */ 1348 /* Settable in /etc/system */ 1349 /* Defined in ip_ire.c */ 1350 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1351 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1352 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1353 1354 static nv_t ire_nv_arr[] = { 1355 { IRE_BROADCAST, "BROADCAST" }, 1356 { IRE_LOCAL, "LOCAL" }, 1357 { IRE_LOOPBACK, "LOOPBACK" }, 1358 { IRE_CACHE, "CACHE" }, 1359 { IRE_DEFAULT, "DEFAULT" }, 1360 { IRE_PREFIX, "PREFIX" }, 1361 { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1362 { IRE_IF_RESOLVER, "IF_RESOLV" }, 1363 { IRE_HOST, "HOST" }, 1364 { 0 } 1365 }; 1366 1367 nv_t *ire_nv_tbl = ire_nv_arr; 1368 1369 /* Simple ICMP IP Header Template */ 1370 static ipha_t icmp_ipha = { 1371 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1372 }; 1373 1374 struct module_info ip_mod_info = { 1375 IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 1376 }; 1377 1378 /* 1379 * Duplicate static symbols within a module confuses mdb; so we avoid the 1380 * problem by making the symbols here distinct from those in udp.c. 1381 */ 1382 1383 /* 1384 * Entry points for IP as a device and as a module. 1385 * FIXME: down the road we might want a separate module and driver qinit. 1386 * We have separate open functions for the /dev/ip and /dev/ip6 devices. 
1387 */ 1388 static struct qinit iprinitv4 = { 1389 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL, 1390 &ip_mod_info 1391 }; 1392 1393 struct qinit iprinitv6 = { 1394 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL, 1395 &ip_mod_info 1396 }; 1397 1398 static struct qinit ipwinitv4 = { 1399 (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1400 &ip_mod_info 1401 }; 1402 1403 struct qinit ipwinitv6 = { 1404 (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1405 &ip_mod_info 1406 }; 1407 1408 static struct qinit iplrinit = { 1409 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL, 1410 &ip_mod_info 1411 }; 1412 1413 static struct qinit iplwinit = { 1414 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL, 1415 &ip_mod_info 1416 }; 1417 1418 /* For AF_INET aka /dev/ip */ 1419 struct streamtab ipinfov4 = { 1420 &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit 1421 }; 1422 1423 /* For AF_INET6 aka /dev/ip6 */ 1424 struct streamtab ipinfov6 = { 1425 &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit 1426 }; 1427 1428 #ifdef DEBUG 1429 static boolean_t skip_sctp_cksum = B_FALSE; 1430 #endif 1431 1432 /* 1433 * Prepend the zoneid using an ipsec_out_t for later use by functions like 1434 * ip_rput_v6(), ip_output(), etc. If the message 1435 * block already has a M_CTL at the front of it, then simply set the zoneid 1436 * appropriately. 
1437 */ 1438 mblk_t * 1439 ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1440 { 1441 mblk_t *first_mp; 1442 ipsec_out_t *io; 1443 1444 ASSERT(zoneid != ALL_ZONES); 1445 if (mp->b_datap->db_type == M_CTL) { 1446 io = (ipsec_out_t *)mp->b_rptr; 1447 ASSERT(io->ipsec_out_type == IPSEC_OUT); 1448 io->ipsec_out_zoneid = zoneid; 1449 return (mp); 1450 } 1451 1452 first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack); 1453 if (first_mp == NULL) 1454 return (NULL); 1455 io = (ipsec_out_t *)first_mp->b_rptr; 1456 /* This is not a secure packet */ 1457 io->ipsec_out_secure = B_FALSE; 1458 io->ipsec_out_zoneid = zoneid; 1459 first_mp->b_cont = mp; 1460 return (first_mp); 1461 } 1462 1463 /* 1464 * Copy an M_CTL-tagged message, preserving reference counts appropriately. 1465 */ 1466 mblk_t * 1467 ip_copymsg(mblk_t *mp) 1468 { 1469 mblk_t *nmp; 1470 ipsec_info_t *in; 1471 1472 if (mp->b_datap->db_type != M_CTL) 1473 return (copymsg(mp)); 1474 1475 in = (ipsec_info_t *)mp->b_rptr; 1476 1477 /* 1478 * Note that M_CTL is also used for delivering ICMP error messages 1479 * upstream to transport layers. 1480 */ 1481 if (in->ipsec_info_type != IPSEC_OUT && 1482 in->ipsec_info_type != IPSEC_IN) 1483 return (copymsg(mp)); 1484 1485 nmp = copymsg(mp->b_cont); 1486 1487 if (in->ipsec_info_type == IPSEC_OUT) { 1488 return (ipsec_out_tag(mp, nmp, 1489 ((ipsec_out_t *)in)->ipsec_out_ns)); 1490 } else { 1491 return (ipsec_in_tag(mp, nmp, 1492 ((ipsec_in_t *)in)->ipsec_in_ns)); 1493 } 1494 } 1495 1496 /* Generate an ICMP fragmentation needed message. 
*/
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t		hdr;
	mblk_t		*first_mp;
	boolean_t	mctl_present;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	mp = icmp_pkt_err_ok(mp, ipst);
	if (mp == NULL) {
		/*
		 * NOTE(review): only the leading M_CTL is released here,
		 * which suggests icmp_pkt_err_ok() disposes of the data
		 * mblk itself when it refuses the packet -- confirm.
		 */
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	/* Destination unreachable / fragmentation needed, carrying the MTU. */
	bzero(&hdr, sizeof (hdr));
	hdr.icmph_type = ICMP_DEST_UNREACHABLE;
	hdr.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	hdr.icmph_du_mtu = htons((uint16_t)mtu);

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);

	icmp_pkt(q, first_mp, &hdr, sizeof (hdr), mctl_present, zoneid, ipst);
}

/*
 * icmp_inbound deals with ICMP messages in the following ways.
 *
 * 1) It needs to send a reply back and possibly deliver it
 *    to the "interested" upper clients.
 * 2) It needs to send it to the upper clients only.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers, e.g. TCP.
 *
 * We need to accommodate icmp messages coming in clear until we get
 * everything secure from the wire. If icmp_accept_clear_messages
 * is zero we check with the global policy and act accordingly. If
 * it is non-zero, we accept the message without any checks. But
 * *this does not mean* that this will be delivered to the upper
 * clients. By accepting we might send replies back, change our MTU
 * value etc. but delivery to the ULP/clients depends on their policy
 * dispositions.
 *
 * We handle the above 4 cases in the context of IPsec in the
 * following way :
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent chosen
 *    plain text attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_clear_messages is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *	There are two cases.
 *
 *	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though they
 *	   are *willing* to accept in *clear*. This is fine as our global
 *	   disposition to icmp messages asks us to reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_clear_messages is non-zero), and not
 *	   able to deliver it to ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DEST_UNREACHABLE messages with following code
 *	   values :
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and Upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels.
Currently, when we are unable to deliver 1582 * to the Upper Layer (due to policy failures) while IP has 1583 * adjusted ire_max_frag, the next outbound datagram would 1584 * generate a local ICMP_FRAGMENTATION_NEEDED message - which 1585 * will be with the right level of protection. Thus the right 1586 * value will be communicated even if we are not able to 1587 * communicate when we get from the wire initially. But this 1588 * assumes there would be at least one outbound datagram after 1589 * IP has adjusted its ire_max_frag value. To make things 1590 * simpler, we accept in clear after the validation of 1591 * AH/ESP headers. 1592 * 1593 * - Other ICMP ERRORS : We may not be able to deliver it to the 1594 * upper layer depending on the level of protection the upper 1595 * layer expects and the disposition in ipsec_inbound_accept_clear(). 1596 * ipsec_inbound_accept_clear() decides whether a given ICMP error 1597 * should be accepted in clear when the Upper layer expects secure. 1598 * Thus the communication may get aborted by some bad ICMP 1599 * packets. 1600 * 1601 * IPQoS Notes: 1602 * The only instance when a packet is sent for processing is when there 1603 * isn't an ICMP client and if we are interested in it. 1604 * If there is a client, IPPF processing will take place in the 1605 * ip_fanout_proto routine. 1606 * 1607 * Zones notes: 1608 * The packet is only processed in the context of the specified zone: typically 1609 * only this zone will reply to an echo request, and only interested clients in 1610 * this zone will receive a copy of the packet. This means that the caller must 1611 * call icmp_inbound() for each relevant zone. 
1612 */ 1613 static void 1614 icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, 1615 int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, 1616 ill_t *recv_ill, zoneid_t zoneid) 1617 { 1618 icmph_t *icmph; 1619 ipha_t *ipha; 1620 int iph_hdr_length; 1621 int hdr_length; 1622 boolean_t interested; 1623 uint32_t ts; 1624 uchar_t *wptr; 1625 ipif_t *ipif; 1626 mblk_t *first_mp; 1627 ipsec_in_t *ii; 1628 ire_t *src_ire; 1629 boolean_t onlink; 1630 timestruc_t now; 1631 uint32_t ill_index; 1632 ip_stack_t *ipst; 1633 1634 ASSERT(ill != NULL); 1635 ipst = ill->ill_ipst; 1636 1637 first_mp = mp; 1638 if (mctl_present) { 1639 mp = first_mp->b_cont; 1640 ASSERT(mp != NULL); 1641 } 1642 1643 ipha = (ipha_t *)mp->b_rptr; 1644 if (ipst->ips_icmp_accept_clear_messages == 0) { 1645 first_mp = ipsec_check_global_policy(first_mp, NULL, 1646 ipha, NULL, mctl_present, ipst->ips_netstack); 1647 if (first_mp == NULL) 1648 return; 1649 } 1650 1651 /* 1652 * On a labeled system, we have to check whether the zone itself is 1653 * permitted to receive raw traffic. 1654 */ 1655 if (is_system_labeled()) { 1656 if (zoneid == ALL_ZONES) 1657 zoneid = tsol_packet_to_zoneid(mp); 1658 if (!tsol_can_accept_raw(mp, B_FALSE)) { 1659 ip1dbg(("icmp_inbound: zone %d can't receive raw", 1660 zoneid)); 1661 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1662 freemsg(first_mp); 1663 return; 1664 } 1665 } 1666 1667 /* 1668 * We have accepted the ICMP message. It means that we will 1669 * respond to the packet if needed. It may not be delivered 1670 * to the upper client depending on the policy constraints 1671 * and the disposition in ipsec_inbound_accept_clear. 1672 */ 1673 1674 ASSERT(ill != NULL); 1675 1676 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); 1677 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1678 if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { 1679 /* Last chance to get real. 
*/ 1680 if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { 1681 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1682 freemsg(first_mp); 1683 return; 1684 } 1685 /* Refresh iph following the pullup. */ 1686 ipha = (ipha_t *)mp->b_rptr; 1687 } 1688 /* ICMP header checksum, including checksum field, should be zero. */ 1689 if (sum_valid ? (sum != 0 && sum != 0xFFFF) : 1690 IP_CSUM(mp, iph_hdr_length, 0)) { 1691 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 1692 freemsg(first_mp); 1693 return; 1694 } 1695 /* The IP header will always be a multiple of four bytes */ 1696 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1697 ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, 1698 icmph->icmph_code)); 1699 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1700 /* We will set "interested" to "true" if we want a copy */ 1701 interested = B_FALSE; 1702 switch (icmph->icmph_type) { 1703 case ICMP_ECHO_REPLY: 1704 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps); 1705 break; 1706 case ICMP_DEST_UNREACHABLE: 1707 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) 1708 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded); 1709 interested = B_TRUE; /* Pass up to transport */ 1710 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs); 1711 break; 1712 case ICMP_SOURCE_QUENCH: 1713 interested = B_TRUE; /* Pass up to transport */ 1714 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs); 1715 break; 1716 case ICMP_REDIRECT: 1717 if (!ipst->ips_ip_ignore_redirect) 1718 interested = B_TRUE; 1719 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects); 1720 break; 1721 case ICMP_ECHO_REQUEST: 1722 /* 1723 * Whether to respond to echo requests that come in as IP 1724 * broadcasts or as IP multicast is subject to debate 1725 * (what isn't?). We aim to please, you pick it. 1726 * Default is do it. 
1727 */ 1728 if (!broadcast && !CLASSD(ipha->ipha_dst)) { 1729 /* unicast: always respond */ 1730 interested = B_TRUE; 1731 } else if (CLASSD(ipha->ipha_dst)) { 1732 /* multicast: respond based on tunable */ 1733 interested = ipst->ips_ip_g_resp_to_echo_mcast; 1734 } else if (broadcast) { 1735 /* broadcast: respond based on tunable */ 1736 interested = ipst->ips_ip_g_resp_to_echo_bcast; 1737 } 1738 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); 1739 break; 1740 case ICMP_ROUTER_ADVERTISEMENT: 1741 case ICMP_ROUTER_SOLICITATION: 1742 break; 1743 case ICMP_TIME_EXCEEDED: 1744 interested = B_TRUE; /* Pass up to transport */ 1745 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds); 1746 break; 1747 case ICMP_PARAM_PROBLEM: 1748 interested = B_TRUE; /* Pass up to transport */ 1749 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs); 1750 break; 1751 case ICMP_TIME_STAMP_REQUEST: 1752 /* Response to Time Stamp Requests is local policy. */ 1753 if (ipst->ips_ip_g_resp_to_timestamp && 1754 /* So is whether to respond if it was an IP broadcast. */ 1755 (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { 1756 int tstamp_len = 3 * sizeof (uint32_t); 1757 1758 if (wptr + tstamp_len > mp->b_wptr) { 1759 if (!pullupmsg(mp, wptr + tstamp_len - 1760 mp->b_rptr)) { 1761 BUMP_MIB(ill->ill_ip_mib, 1762 ipIfStatsInDiscards); 1763 freemsg(first_mp); 1764 return; 1765 } 1766 /* Refresh ipha following the pullup. */ 1767 ipha = (ipha_t *)mp->b_rptr; 1768 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1769 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1770 } 1771 interested = B_TRUE; 1772 } 1773 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); 1774 break; 1775 case ICMP_TIME_STAMP_REPLY: 1776 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); 1777 break; 1778 case ICMP_INFO_REQUEST: 1779 /* Per RFC 1122 3.2.2.7, ignore this. 
*/ 1780 case ICMP_INFO_REPLY: 1781 break; 1782 case ICMP_ADDRESS_MASK_REQUEST: 1783 if ((ipst->ips_ip_respond_to_address_mask_broadcast || 1784 !broadcast) && 1785 /* TODO m_pullup of complete header? */ 1786 (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) { 1787 interested = B_TRUE; 1788 } 1789 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); 1790 break; 1791 case ICMP_ADDRESS_MASK_REPLY: 1792 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); 1793 break; 1794 default: 1795 interested = B_TRUE; /* Pass up to transport */ 1796 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); 1797 break; 1798 } 1799 /* See if there is an ICMP client. */ 1800 if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) { 1801 /* If there is an ICMP client and we want one too, copy it. */ 1802 mblk_t *first_mp1; 1803 1804 if (!interested) { 1805 ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, 1806 ip_policy, recv_ill, zoneid); 1807 return; 1808 } 1809 first_mp1 = ip_copymsg(first_mp); 1810 if (first_mp1 != NULL) { 1811 ip_fanout_proto(q, first_mp1, ill, ipha, 1812 0, mctl_present, ip_policy, recv_ill, zoneid); 1813 } 1814 } else if (!interested) { 1815 freemsg(first_mp); 1816 return; 1817 } else { 1818 /* 1819 * Initiate policy processing for this packet if ip_policy 1820 * is true. 1821 */ 1822 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 1823 ill_index = ill->ill_phyint->phyint_ifindex; 1824 ip_process(IPP_LOCAL_IN, &mp, ill_index); 1825 if (mp == NULL) { 1826 if (mctl_present) { 1827 freeb(first_mp); 1828 } 1829 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1830 return; 1831 } 1832 } 1833 } 1834 /* We want to do something with it. */ 1835 /* Check db_ref to make sure we can modify the packet. 
*/ 1836 if (mp->b_datap->db_ref > 1) { 1837 mblk_t *first_mp1; 1838 1839 first_mp1 = ip_copymsg(first_mp); 1840 freemsg(first_mp); 1841 if (!first_mp1) { 1842 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 1843 return; 1844 } 1845 first_mp = first_mp1; 1846 if (mctl_present) { 1847 mp = first_mp->b_cont; 1848 ASSERT(mp != NULL); 1849 } else { 1850 mp = first_mp; 1851 } 1852 ipha = (ipha_t *)mp->b_rptr; 1853 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1854 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1855 } 1856 switch (icmph->icmph_type) { 1857 case ICMP_ADDRESS_MASK_REQUEST: 1858 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1859 if (ipif == NULL) { 1860 freemsg(first_mp); 1861 return; 1862 } 1863 /* 1864 * outging interface must be IPv4 1865 */ 1866 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1867 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 1868 bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); 1869 ipif_refrele(ipif); 1870 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); 1871 break; 1872 case ICMP_ECHO_REQUEST: 1873 icmph->icmph_type = ICMP_ECHO_REPLY; 1874 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); 1875 break; 1876 case ICMP_TIME_STAMP_REQUEST: { 1877 uint32_t *tsp; 1878 1879 icmph->icmph_type = ICMP_TIME_STAMP_REPLY; 1880 tsp = (uint32_t *)wptr; 1881 tsp++; /* Skip past 'originate time' */ 1882 /* Compute # of milliseconds since midnight */ 1883 gethrestime(&now); 1884 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 1885 now.tv_nsec / (NANOSEC / MILLISEC); 1886 *tsp++ = htonl(ts); /* Lay in 'receive time' */ 1887 *tsp++ = htonl(ts); /* Lay in 'send time' */ 1888 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); 1889 break; 1890 } 1891 default: 1892 ipha = (ipha_t *)&icmph[1]; 1893 if ((uchar_t *)&ipha[1] > mp->b_wptr) { 1894 if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { 1895 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1896 freemsg(first_mp); 1897 return; 1898 } 1899 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1900 ipha = (ipha_t 
*)&icmph[1]; 1901 } 1902 if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { 1903 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1904 freemsg(first_mp); 1905 return; 1906 } 1907 hdr_length = IPH_HDR_LENGTH(ipha); 1908 if (hdr_length < sizeof (ipha_t)) { 1909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1910 freemsg(first_mp); 1911 return; 1912 } 1913 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 1914 if (!pullupmsg(mp, 1915 (uchar_t *)ipha + hdr_length - mp->b_rptr)) { 1916 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1917 freemsg(first_mp); 1918 return; 1919 } 1920 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1921 ipha = (ipha_t *)&icmph[1]; 1922 } 1923 switch (icmph->icmph_type) { 1924 case ICMP_REDIRECT: 1925 /* 1926 * As there is no upper client to deliver, we don't 1927 * need the first_mp any more. 1928 */ 1929 if (mctl_present) { 1930 freeb(first_mp); 1931 } 1932 icmp_redirect(ill, mp); 1933 return; 1934 case ICMP_DEST_UNREACHABLE: 1935 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { 1936 if (!icmp_inbound_too_big(icmph, ipha, ill, 1937 zoneid, mp, iph_hdr_length, ipst)) { 1938 freemsg(first_mp); 1939 return; 1940 } 1941 /* 1942 * icmp_inbound_too_big() may alter mp. 1943 * Resynch ipha and icmph accordingly. 1944 */ 1945 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1946 ipha = (ipha_t *)&icmph[1]; 1947 } 1948 /* FALLTHRU */ 1949 default : 1950 /* 1951 * IPQoS notes: Since we have already done IPQoS 1952 * processing we don't want to do it again in 1953 * the fanout routines called by 1954 * icmp_inbound_error_fanout, hence the last 1955 * argument, ip_policy, is B_FALSE. 
1956 */ 1957 icmp_inbound_error_fanout(q, ill, first_mp, icmph, 1958 ipha, iph_hdr_length, hdr_length, mctl_present, 1959 B_FALSE, recv_ill, zoneid); 1960 } 1961 return; 1962 } 1963 /* Send out an ICMP packet */ 1964 icmph->icmph_checksum = 0; 1965 icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); 1966 if (broadcast || CLASSD(ipha->ipha_dst)) { 1967 ipif_t *ipif_chosen; 1968 /* 1969 * Make it look like it was directed to us, so we don't look 1970 * like a fool with a broadcast or multicast source address. 1971 */ 1972 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1973 /* 1974 * Make sure that we haven't grabbed an interface that's DOWN. 1975 */ 1976 if (ipif != NULL) { 1977 ipif_chosen = ipif_select_source(ipif->ipif_ill, 1978 ipha->ipha_src, zoneid); 1979 if (ipif_chosen != NULL) { 1980 ipif_refrele(ipif); 1981 ipif = ipif_chosen; 1982 } 1983 } 1984 if (ipif == NULL) { 1985 ip0dbg(("icmp_inbound: " 1986 "No source for broadcast/multicast:\n" 1987 "\tsrc 0x%x dst 0x%x ill %p " 1988 "ipif_lcl_addr 0x%x\n", 1989 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1990 (void *)ill, 1991 ill->ill_ipif->ipif_lcl_addr)); 1992 freemsg(first_mp); 1993 return; 1994 } 1995 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1996 ipha->ipha_dst = ipif->ipif_src_addr; 1997 ipif_refrele(ipif); 1998 } 1999 /* Reset time to live. */ 2000 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 2001 { 2002 /* Swap source and destination addresses */ 2003 ipaddr_t tmp; 2004 2005 tmp = ipha->ipha_src; 2006 ipha->ipha_src = ipha->ipha_dst; 2007 ipha->ipha_dst = tmp; 2008 } 2009 ipha->ipha_ident = 0; 2010 if (!IS_SIMPLE_IPH(ipha)) 2011 icmp_options_update(ipha); 2012 2013 /* 2014 * ICMP echo replies should go out on the same interface 2015 * the request came on as probes used by in.mpathd for detecting 2016 * NIC failures are ECHO packets. 
We turn-off load spreading 2017 * by setting ipsec_in_attach_if to B_TRUE, which is copied 2018 * to ipsec_out_attach_if by ipsec_in_to_out called later in this 2019 * function. This is in turn handled by ip_wput and ip_newroute 2020 * to make sure that the packet goes out on the interface it came 2021 * in on. If we don't turnoff load spreading, the packets might get 2022 * dropped if there are no non-FAILED/INACTIVE interfaces for it 2023 * to go out and in.mpathd would wrongly detect a failure or 2024 * mis-detect a NIC failure for link failure. As load spreading 2025 * can happen only if ill_group is not NULL, we do only for 2026 * that case and this does not affect the normal case. 2027 * 2028 * We turn off load spreading only on echo packets that came from 2029 * on-link hosts. If the interface route has been deleted, this will 2030 * not be enforced as we can't do much. For off-link hosts, as the 2031 * default routes in IPv4 does not typically have an ire_ipif 2032 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. 2033 * Moreover, expecting a default route through this interface may 2034 * not be correct. We use ipha_dst because of the swap above. 2035 */ 2036 onlink = B_FALSE; 2037 if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { 2038 /* 2039 * First, we need to make sure that it is not one of our 2040 * local addresses. If we set onlink when it is one of 2041 * our local addresses, we will end up creating IRE_CACHES 2042 * for one of our local addresses. Then, we will never 2043 * accept packets for them afterwards. 
2044 */ 2045 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, 2046 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 2047 if (src_ire == NULL) { 2048 ipif = ipif_get_next_ipif(NULL, ill); 2049 if (ipif == NULL) { 2050 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2051 freemsg(mp); 2052 return; 2053 } 2054 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 2055 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 2056 NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); 2057 ipif_refrele(ipif); 2058 if (src_ire != NULL) { 2059 onlink = B_TRUE; 2060 ire_refrele(src_ire); 2061 } 2062 } else { 2063 ire_refrele(src_ire); 2064 } 2065 } 2066 if (!mctl_present) { 2067 /* 2068 * This packet should go out the same way as it 2069 * came in i.e in clear. To make sure that global 2070 * policy will not be applied to this in ip_wput_ire, 2071 * we attach a IPSEC_IN mp and clear ipsec_in_secure. 2072 */ 2073 ASSERT(first_mp == mp); 2074 first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 2075 if (first_mp == NULL) { 2076 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2077 freemsg(mp); 2078 return; 2079 } 2080 ii = (ipsec_in_t *)first_mp->b_rptr; 2081 2082 /* This is not a secure packet */ 2083 ii->ipsec_in_secure = B_FALSE; 2084 if (onlink) { 2085 ii->ipsec_in_attach_if = B_TRUE; 2086 ii->ipsec_in_ill_index = 2087 ill->ill_phyint->phyint_ifindex; 2088 ii->ipsec_in_rill_index = 2089 recv_ill->ill_phyint->phyint_ifindex; 2090 } 2091 first_mp->b_cont = mp; 2092 } else if (onlink) { 2093 ii = (ipsec_in_t *)first_mp->b_rptr; 2094 ii->ipsec_in_attach_if = B_TRUE; 2095 ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; 2096 ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; 2097 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2098 } else { 2099 ii = (ipsec_in_t *)first_mp->b_rptr; 2100 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2101 } 2102 ii->ipsec_in_zoneid = zoneid; 2103 ASSERT(zoneid != ALL_ZONES); 2104 if (!ipsec_in_to_out(first_mp, ipha, 
NULL)) { 2105 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2106 return; 2107 } 2108 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 2109 put(WR(q), first_mp); 2110 } 2111 2112 static ipaddr_t 2113 icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) 2114 { 2115 conn_t *connp; 2116 connf_t *connfp; 2117 ipaddr_t nexthop_addr = INADDR_ANY; 2118 int hdr_length = IPH_HDR_LENGTH(ipha); 2119 uint16_t *up; 2120 uint32_t ports; 2121 ip_stack_t *ipst = ill->ill_ipst; 2122 2123 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2124 switch (ipha->ipha_protocol) { 2125 case IPPROTO_TCP: 2126 { 2127 tcph_t *tcph; 2128 2129 /* do a reverse lookup */ 2130 tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); 2131 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, 2132 TCPS_LISTEN, ipst); 2133 break; 2134 } 2135 case IPPROTO_UDP: 2136 { 2137 uint32_t dstport, srcport; 2138 2139 ((uint16_t *)&ports)[0] = up[1]; 2140 ((uint16_t *)&ports)[1] = up[0]; 2141 2142 /* Extract ports in net byte order */ 2143 dstport = htons(ntohl(ports) & 0xFFFF); 2144 srcport = htons(ntohl(ports) >> 16); 2145 2146 connfp = &ipst->ips_ipcl_udp_fanout[ 2147 IPCL_UDP_HASH(dstport, ipst)]; 2148 mutex_enter(&connfp->connf_lock); 2149 connp = connfp->connf_head; 2150 2151 /* do a reverse lookup */ 2152 while ((connp != NULL) && 2153 (!IPCL_UDP_MATCH(connp, dstport, 2154 ipha->ipha_src, srcport, ipha->ipha_dst) || 2155 !IPCL_ZONE_MATCH(connp, zoneid))) { 2156 connp = connp->conn_next; 2157 } 2158 if (connp != NULL) 2159 CONN_INC_REF(connp); 2160 mutex_exit(&connfp->connf_lock); 2161 break; 2162 } 2163 case IPPROTO_SCTP: 2164 { 2165 in6_addr_t map_src, map_dst; 2166 2167 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); 2168 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); 2169 ((uint16_t *)&ports)[0] = up[1]; 2170 ((uint16_t *)&ports)[1] = up[0]; 2171 2172 connp = sctp_find_conn(&map_src, &map_dst, ports, 2173 zoneid, ipst->ips_netstack->netstack_sctp); 2174 if (connp == NULL) { 2175 connp = 
ipcl_classify_raw(mp, IPPROTO_SCTP, 2176 zoneid, ports, ipha, ipst); 2177 } else { 2178 CONN_INC_REF(connp); 2179 SCTP_REFRELE(CONN2SCTP(connp)); 2180 } 2181 break; 2182 } 2183 default: 2184 { 2185 ipha_t ripha; 2186 2187 ripha.ipha_src = ipha->ipha_dst; 2188 ripha.ipha_dst = ipha->ipha_src; 2189 ripha.ipha_protocol = ipha->ipha_protocol; 2190 2191 connfp = &ipst->ips_ipcl_proto_fanout[ 2192 ipha->ipha_protocol]; 2193 mutex_enter(&connfp->connf_lock); 2194 connp = connfp->connf_head; 2195 for (connp = connfp->connf_head; connp != NULL; 2196 connp = connp->conn_next) { 2197 if (IPCL_PROTO_MATCH(connp, 2198 ipha->ipha_protocol, &ripha, ill, 2199 0, zoneid)) { 2200 CONN_INC_REF(connp); 2201 break; 2202 } 2203 } 2204 mutex_exit(&connfp->connf_lock); 2205 } 2206 } 2207 if (connp != NULL) { 2208 if (connp->conn_nexthop_set) 2209 nexthop_addr = connp->conn_nexthop_v4; 2210 CONN_DEC_REF(connp); 2211 } 2212 return (nexthop_addr); 2213 } 2214 2215 /* Table from RFC 1191 */ 2216 static int icmp_frag_size_table[] = 2217 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; 2218 2219 /* 2220 * Process received ICMP Packet too big. 2221 * After updating any IRE it does the fanout to any matching transport streams. 2222 * Assumes the message has been pulled up till the IP header that caused 2223 * the error. 2224 * 2225 * Returns B_FALSE on failure and B_TRUE on success. 
2226 */ 2227 static boolean_t 2228 icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, 2229 zoneid_t zoneid, mblk_t *mp, int iph_hdr_length, 2230 ip_stack_t *ipst) 2231 { 2232 ire_t *ire, *first_ire; 2233 int mtu; 2234 int hdr_length; 2235 ipaddr_t nexthop_addr; 2236 2237 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && 2238 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); 2239 ASSERT(ill != NULL); 2240 2241 hdr_length = IPH_HDR_LENGTH(ipha); 2242 2243 /* Drop if the original packet contained a source route */ 2244 if (ip_source_route_included(ipha)) { 2245 return (B_FALSE); 2246 } 2247 /* 2248 * Verify we have atleast ICMP_MIN_TP_HDR_LENGTH bytes of transport 2249 * header. 2250 */ 2251 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2252 mp->b_wptr) { 2253 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2254 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2255 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2256 ip1dbg(("icmp_inbound_too_big: insufficient hdr\n")); 2257 return (B_FALSE); 2258 } 2259 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2260 ipha = (ipha_t *)&icmph[1]; 2261 } 2262 nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp); 2263 if (nexthop_addr != INADDR_ANY) { 2264 /* nexthop set */ 2265 first_ire = ire_ctable_lookup(ipha->ipha_dst, 2266 nexthop_addr, 0, NULL, ALL_ZONES, MBLK_GETLABEL(mp), 2267 MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst); 2268 } else { 2269 /* nexthop not set */ 2270 first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, 2271 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 2272 } 2273 2274 if (!first_ire) { 2275 ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", 2276 ntohl(ipha->ipha_dst))); 2277 return (B_FALSE); 2278 } 2279 /* Check for MTU discovery advice as described in RFC 1191 */ 2280 mtu = ntohs(icmph->icmph_du_mtu); 2281 rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); 2282 for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; 2283 ire = ire->ire_next) { 2284 /* 
2285 * Look for the connection to which this ICMP message is 2286 * directed. If it has the IP_NEXTHOP option set, then the 2287 * search is limited to IREs with the MATCH_IRE_PRIVATE 2288 * option. Else the search is limited to regular IREs. 2289 */ 2290 if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2291 (nexthop_addr != ire->ire_gateway_addr)) || 2292 (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2293 (nexthop_addr != INADDR_ANY))) 2294 continue; 2295 2296 mutex_enter(&ire->ire_lock); 2297 if (icmph->icmph_du_zero == 0 && mtu > 68) { 2298 /* Reduce the IRE max frag value as advised. */ 2299 ip1dbg(("Received mtu from router: %d (was %d)\n", 2300 mtu, ire->ire_max_frag)); 2301 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2302 } else { 2303 uint32_t length; 2304 int i; 2305 2306 /* 2307 * Use the table from RFC 1191 to figure out 2308 * the next "plateau" based on the length in 2309 * the original IP packet. 2310 */ 2311 length = ntohs(ipha->ipha_length); 2312 if (ire->ire_max_frag <= length && 2313 ire->ire_max_frag >= length - hdr_length) { 2314 /* 2315 * Handle broken BSD 4.2 systems that 2316 * return the wrong iph_length in ICMP 2317 * errors. 2318 */ 2319 ip1dbg(("Wrong mtu: sent %d, ire %d\n", 2320 length, ire->ire_max_frag)); 2321 length -= hdr_length; 2322 } 2323 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { 2324 if (length > icmp_frag_size_table[i]) 2325 break; 2326 } 2327 if (i == A_CNT(icmp_frag_size_table)) { 2328 /* Smaller than 68! */ 2329 ip1dbg(("Too big for packet size %d\n", 2330 length)); 2331 ire->ire_max_frag = MIN(ire->ire_max_frag, 576); 2332 ire->ire_frag_flag = 0; 2333 } else { 2334 mtu = icmp_frag_size_table[i]; 2335 ip1dbg(("Calculated mtu %d, packet size %d, " 2336 "before %d", mtu, length, 2337 ire->ire_max_frag)); 2338 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2339 ip1dbg((", after %d\n", ire->ire_max_frag)); 2340 } 2341 /* Record the new max frag size for the ULP. 
*/ 2342 icmph->icmph_du_zero = 0; 2343 icmph->icmph_du_mtu = 2344 htons((uint16_t)ire->ire_max_frag); 2345 } 2346 mutex_exit(&ire->ire_lock); 2347 } 2348 rw_exit(&first_ire->ire_bucket->irb_lock); 2349 ire_refrele(first_ire); 2350 return (B_TRUE); 2351 } 2352 2353 /* 2354 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout 2355 * calls this function. 2356 */ 2357 static mblk_t * 2358 icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) 2359 { 2360 ipha_t *ipha; 2361 icmph_t *icmph; 2362 ipha_t *in_ipha; 2363 int length; 2364 2365 ASSERT(mp->b_datap->db_type == M_DATA); 2366 2367 /* 2368 * For Self-encapsulated packets, we added an extra IP header 2369 * without the options. Inner IP header is the one from which 2370 * the outer IP header was formed. Thus, we need to remove the 2371 * outer IP header. To do this, we pullup the whole message 2372 * and overlay whatever follows the outer IP header over the 2373 * outer IP header. 2374 */ 2375 2376 if (!pullupmsg(mp, -1)) 2377 return (NULL); 2378 2379 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2380 ipha = (ipha_t *)&icmph[1]; 2381 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2382 2383 /* 2384 * The length that we want to overlay is following the inner 2385 * IP header. Subtracting the IP header + icmp header + outer 2386 * IP header's length should give us the length that we want to 2387 * overlay. 2388 */ 2389 length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - 2390 hdr_length; 2391 /* 2392 * Overlay whatever follows the inner header over the 2393 * outer header. 2394 */ 2395 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); 2396 2397 /* Set the wptr to account for the outer header */ 2398 mp->b_wptr -= hdr_length; 2399 return (mp); 2400 } 2401 2402 /* 2403 * Try to pass the ICMP message upstream in case the ULP cares. 
2404 * 2405 * If the packet that caused the ICMP error is secure, we send 2406 * it to AH/ESP to make sure that the attached packet has a 2407 * valid association. ipha in the code below points to the 2408 * IP header of the packet that caused the error. 2409 * 2410 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently 2411 * in the context of IPsec. Normally we tell the upper layer 2412 * whenever we send the ire (including ip_bind), the IPsec header 2413 * length in ire_ipsec_overhead. TCP can deduce the MSS as it 2414 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead. 2415 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the 2416 * same thing. As TCP has the IPsec options size that needs to be 2417 * adjusted, we just pass the MTU unchanged. 2418 * 2419 * IFN could have been generated locally or by some router. 2420 * 2421 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. 2422 * This happens because IP adjusted its value of MTU on an 2423 * earlier IFN message and could not tell the upper layer, 2424 * the new adjusted value of MTU e.g. Packet was encrypted 2425 * or there was not enough information to fanout to upper 2426 * layers. Thus on the next outbound datagram, ip_wput_ire 2427 * generates the IFN, where IPsec processing has *not* been 2428 * done. 2429 * 2430 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2431 * could have generated this. This happens because ire_max_frag 2432 * value in IP was set to a new value, while the IPsec processing 2433 * was being done and after we made the fragmentation check in 2434 * ip_wput_ire. Thus on return from IPsec processing, 2435 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2436 * and generates the IFN. As IPsec processing is over, we fanout 2437 * to AH/ESP to remove the header. 2438 * 2439 * In both these cases, ipsec_in_loopback will be set indicating 2440 * that IFN was generated locally. 
 *
 * ROUTER : IFN could be secure or non-secure.
 *
 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
 *   packet in error has AH/ESP headers to validate the AH/ESP
 *   headers. AH/ESP will verify whether there is a valid SA or
 *   not and send it back. We will fanout again if we have more
 *   data in the packet.
 *
 *   If the packet in error does not have AH/ESP, we handle it
 *   like any other case.
 *
 * * NON_SECURE : If the packet in error has AH/ESP headers,
 *   we attach a dummy ipsec_in and send it up to AH/ESP
 *   for validation. AH/ESP will verify whether there is a
 *   valid SA or not and send it back. We will fanout again if
 *   we have more data in the packet.
 *
 *   If the packet in error does not have AH/ESP, we handle it
 *   like any other case.
 *
 * Arguments, as used by the code below:
 *   q              - passed through to the ip_fanout_* routines and
 *                    ip_proto_not_sup.
 *   ill, recv_ill  - must be non-NULL; recv_ill supplies the ip_stack_t,
 *                    ill is charged for discards; both interface indexes
 *                    are recorded in the IPSEC_IN for the AH/ESP path.
 *   mp             - the message; when mctl_present is set the first block
 *                    is an IPSEC_IN and the packet is in b_cont.
 *   icmph, ipha    - the ICMP header and the IP header of the packet that
 *                    caused the error; both are re-derived from mp->b_rptr
 *                    after every pullupmsg.
 *   iph_hdr_length - offset of the ICMP header from mp->b_rptr.
 *   hdr_length     - offset of the transport header from ipha.
 *   ip_policy, zoneid - passed through to the fanout routines.
 *
 * The message is handed to a fanout routine, queued to a TCP connection,
 * or freed here (the AH/ESP routines are assumed to dispose of it when
 * they return IPSEC_STATUS_FAILED -- not visible in this file section).
 */
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t *up;	/* Pointer to ports in ULP header */
	uint32_t ports;	/* reversed ports for fanout */
	ipha_t ripha;	/* With reversed addresses */
	mblk_t *first_mp;
	ipsec_in_t *ii;
	tcph_t	*tcph;
	conn_t	*connp;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

	/* first_mp is the head of the chain; mp is always the packet itself */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	} else {
		ii = NULL;
	}

	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			/* pullupmsg may have moved the data; re-derive */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the IP_UDP_MATCH and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];
		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
		    ntohl(ipha->ipha_src), ntohs(up[0]),
		    ntohl(ipha->ipha_dst), ntohs(up[1]),
		    icmph->icmph_type, icmph->icmph_code));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		squeue_fill(connp->conn_sqp, first_mp, tcp_input,
		    connp, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the matching and we
		 * only set the src and dst addresses, protocol, and version.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ripha.ipha_version_and_hdr_length =
		    ipha->ipha_version_and_hdr_length;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
		    mctl_present, ip_policy, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;
		ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

		/*
		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii != NULL) {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is a ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPsec
			 * headers.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		} else {
			/*
			 * IPSEC_IN is not present. We attach a ipsec_in
			 * message and send up to IPsec for validating
			 * and removing the IPsec headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPsec, we don't mistakenly think that this
			 * is a secure packet came from the network.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
			if (first_mp == NULL) {
				/* Nothing was prepended, so only mp to free */
				freemsg(mp);
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		ip2dbg(("icmp_inbound_error: ipsec\n"));

		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	default:
		/*
		 * The ripha header is only used for the lookup and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
		    ripha.ipha_protocol, ntohl(ipha->ipha_src),
		    ntohl(ipha->ipha_dst),
		    icmph->icmph_type, icmph->icmph_code));
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			/* Need a full inner IP header behind the outer one */
			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
			    mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    hdr_length + sizeof (ipha_t) -
				    mp->b_rptr)) {
					goto discard_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that length has to be
			 * at least the size of IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
				goto discard_pkt;
			}
			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
				goto discard_pkt;
			}
			/* Check for Self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto discard_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsulated.
				 * And we are finding it further encapsulated
				 * which we could not have possibly generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					goto discard_pkt;
				}
				/* Re-dispatch on the inner protocol */
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}
		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
		    ipha->ipha_protocol == IPPROTO_IPV6) &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    ii != NULL &&
		    ii->ipsec_in_loopback &&
		    ii->ipsec_in_secure) {
			/*
			 * For IP tunnels that get a looped-back
			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
			 * reported new MTU to take into account the IPsec
			 * headers protecting this configured tunnel.
			 *
			 * This allows the tunnel module (tun.c) to blindly
			 * accept the MTU reported in an ICMP "too big"
			 * message.
			 *
			 * Non-looped back ICMP messages will just be
			 * handled by the security protocols (if needed),
			 * and the first subsequent packet will hit this
			 * path.
			 */
			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
			    ipsec_in_extra_length(first_mp));
		}
		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
discard_pkt:
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	/* drop_pkt frees without counting; nothing in this function jumps to it */
drop_pkt:;
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}
2767 */ 2768 icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) - 2769 ipsec_in_extra_length(first_mp)); 2770 } 2771 /* Have to change db_type after any pullupmsg */ 2772 DB_TYPE(mp) = M_CTL; 2773 2774 ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present, 2775 ip_policy, recv_ill, zoneid); 2776 return; 2777 } 2778 /* NOTREACHED */ 2779 discard_pkt: 2780 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2781 drop_pkt:; 2782 ip1dbg(("icmp_inbound_error_fanout: drop pkt\n")); 2783 freemsg(first_mp); 2784 } 2785 2786 /* 2787 * Common IP options parser. 2788 * 2789 * Setup routine: fill in *optp with options-parsing state, then 2790 * tail-call ipoptp_next to return the first option. 2791 */ 2792 uint8_t 2793 ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2794 { 2795 uint32_t totallen; /* total length of all options */ 2796 2797 totallen = ipha->ipha_version_and_hdr_length - 2798 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2799 totallen <<= 2; 2800 optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2801 optp->ipoptp_end = optp->ipoptp_next + totallen; 2802 optp->ipoptp_flags = 0; 2803 return (ipoptp_next(optp)); 2804 } 2805 2806 /* 2807 * Common IP options parser: extract next option. 2808 */ 2809 uint8_t 2810 ipoptp_next(ipoptp_t *optp) 2811 { 2812 uint8_t *end = optp->ipoptp_end; 2813 uint8_t *cur = optp->ipoptp_next; 2814 uint8_t opt, len, pointer; 2815 2816 /* 2817 * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2818 * has been corrupted. 2819 */ 2820 ASSERT(cur <= end); 2821 2822 if (cur == end) 2823 return (IPOPT_EOL); 2824 2825 opt = cur[IPOPT_OPTVAL]; 2826 2827 /* 2828 * Skip any NOP options. 2829 */ 2830 while (opt == IPOPT_NOP) { 2831 cur++; 2832 if (cur == end) 2833 return (IPOPT_EOL); 2834 opt = cur[IPOPT_OPTVAL]; 2835 } 2836 2837 if (opt == IPOPT_EOL) 2838 return (IPOPT_EOL); 2839 2840 /* 2841 * Option requiring a length. 
2842 */ 2843 if ((cur + 1) >= end) { 2844 optp->ipoptp_flags |= IPOPTP_ERROR; 2845 return (IPOPT_EOL); 2846 } 2847 len = cur[IPOPT_OLEN]; 2848 if (len < 2) { 2849 optp->ipoptp_flags |= IPOPTP_ERROR; 2850 return (IPOPT_EOL); 2851 } 2852 optp->ipoptp_cur = cur; 2853 optp->ipoptp_len = len; 2854 optp->ipoptp_next = cur + len; 2855 if (cur + len > end) { 2856 optp->ipoptp_flags |= IPOPTP_ERROR; 2857 return (IPOPT_EOL); 2858 } 2859 2860 /* 2861 * For the options which require a pointer field, make sure 2862 * its there, and make sure it points to either something 2863 * inside this option, or the end of the option. 2864 */ 2865 switch (opt) { 2866 case IPOPT_RR: 2867 case IPOPT_TS: 2868 case IPOPT_LSRR: 2869 case IPOPT_SSRR: 2870 if (len <= IPOPT_OFFSET) { 2871 optp->ipoptp_flags |= IPOPTP_ERROR; 2872 return (opt); 2873 } 2874 pointer = cur[IPOPT_OFFSET]; 2875 if (pointer - 1 > len) { 2876 optp->ipoptp_flags |= IPOPTP_ERROR; 2877 return (opt); 2878 } 2879 break; 2880 } 2881 2882 /* 2883 * Sanity check the pointer field based on the type of the 2884 * option. 2885 */ 2886 switch (opt) { 2887 case IPOPT_RR: 2888 case IPOPT_SSRR: 2889 case IPOPT_LSRR: 2890 if (pointer < IPOPT_MINOFF_SR) 2891 optp->ipoptp_flags |= IPOPTP_ERROR; 2892 break; 2893 case IPOPT_TS: 2894 if (pointer < IPOPT_MINOFF_IT) 2895 optp->ipoptp_flags |= IPOPTP_ERROR; 2896 /* 2897 * Note that the Internet Timestamp option also 2898 * contains two four bit fields (the Overflow field, 2899 * and the Flag field), which follow the pointer 2900 * field. We don't need to check that these fields 2901 * fall within the length of the option because this 2902 * was implicitely done above. We've checked that the 2903 * pointer value is at least IPOPT_MINOFF_IT, and that 2904 * it falls within the option. Since IPOPT_MINOFF_IT > 2905 * IPOPT_POS_OV_FLG, we don't need the explicit check. 
2906 */ 2907 ASSERT(len > IPOPT_POS_OV_FLG); 2908 break; 2909 } 2910 2911 return (opt); 2912 } 2913 2914 /* 2915 * Use the outgoing IP header to create an IP_OPTIONS option the way 2916 * it was passed down from the application. 2917 */ 2918 int 2919 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) 2920 { 2921 ipoptp_t opts; 2922 const uchar_t *opt; 2923 uint8_t optval; 2924 uint8_t optlen; 2925 uint32_t len = 0; 2926 uchar_t *buf1 = buf; 2927 2928 buf += IP_ADDR_LEN; /* Leave room for final destination */ 2929 len += IP_ADDR_LEN; 2930 bzero(buf1, IP_ADDR_LEN); 2931 2932 /* 2933 * OK to cast away const here, as we don't store through the returned 2934 * opts.ipoptp_cur pointer. 2935 */ 2936 for (optval = ipoptp_first(&opts, (ipha_t *)ipha); 2937 optval != IPOPT_EOL; 2938 optval = ipoptp_next(&opts)) { 2939 int off; 2940 2941 opt = opts.ipoptp_cur; 2942 optlen = opts.ipoptp_len; 2943 switch (optval) { 2944 case IPOPT_SSRR: 2945 case IPOPT_LSRR: 2946 2947 /* 2948 * Insert ipha_dst as the first entry in the source 2949 * route and move down the entries on step. 2950 * The last entry gets placed at buf1. 
2951 */ 2952 buf[IPOPT_OPTVAL] = optval; 2953 buf[IPOPT_OLEN] = optlen; 2954 buf[IPOPT_OFFSET] = optlen; 2955 2956 off = optlen - IP_ADDR_LEN; 2957 if (off < 0) { 2958 /* No entries in source route */ 2959 break; 2960 } 2961 /* Last entry in source route */ 2962 bcopy(opt + off, buf1, IP_ADDR_LEN); 2963 off -= IP_ADDR_LEN; 2964 2965 while (off > 0) { 2966 bcopy(opt + off, 2967 buf + off + IP_ADDR_LEN, 2968 IP_ADDR_LEN); 2969 off -= IP_ADDR_LEN; 2970 } 2971 /* ipha_dst into first slot */ 2972 bcopy(&ipha->ipha_dst, 2973 buf + off + IP_ADDR_LEN, 2974 IP_ADDR_LEN); 2975 buf += optlen; 2976 len += optlen; 2977 break; 2978 2979 case IPOPT_COMSEC: 2980 case IPOPT_SECURITY: 2981 /* if passing up a label is not ok, then remove */ 2982 if (is_system_labeled()) 2983 break; 2984 /* FALLTHROUGH */ 2985 default: 2986 bcopy(opt, buf, optlen); 2987 buf += optlen; 2988 len += optlen; 2989 break; 2990 } 2991 } 2992 done: 2993 /* Pad the resulting options */ 2994 while (len & 0x3) { 2995 *buf++ = IPOPT_EOL; 2996 len++; 2997 } 2998 return (len); 2999 } 3000 3001 /* 3002 * Update any record route or timestamp options to include this host. 3003 * Reverse any source route option. 3004 * This routine assumes that the options are well formed i.e. that they 3005 * have already been checked. 3006 */ 3007 static void 3008 icmp_options_update(ipha_t *ipha) 3009 { 3010 ipoptp_t opts; 3011 uchar_t *opt; 3012 uint8_t optval; 3013 ipaddr_t src; /* Our local address */ 3014 ipaddr_t dst; 3015 3016 ip2dbg(("icmp_options_update\n")); 3017 src = ipha->ipha_src; 3018 dst = ipha->ipha_dst; 3019 3020 for (optval = ipoptp_first(&opts, ipha); 3021 optval != IPOPT_EOL; 3022 optval = ipoptp_next(&opts)) { 3023 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 3024 opt = opts.ipoptp_cur; 3025 ip2dbg(("icmp_options_update: opt %d, len %d\n", 3026 optval, opts.ipoptp_len)); 3027 switch (optval) { 3028 int off1, off2; 3029 case IPOPT_SSRR: 3030 case IPOPT_LSRR: 3031 /* 3032 * Reverse the source route. 
/*
 * Process received ICMP Redirect messages.
 *
 * mp holds the complete IP datagram carrying the redirect; it is freed on
 * every path through this function.  The redirect is accepted only if the
 * message is long enough to contain the ICMP header plus a simple IP
 * header, a route to the destination via the sender (the old gateway)
 * exists, and the new gateway is reachable through an interface route
 * that is not IRE_LOCAL.  On success a dynamic IRE_HOST route through the
 * new gateway is added, routing sockets are notified, and any older
 * dynamic host redirect through the old gateway is deleted.
 */
static void
icmp_redirect(ill_t *ill, mblk_t *mp)
{
	ipha_t	*ipha;
	int	iph_hdr_length;
	icmph_t	*icmph;
	ipha_t	*ipha_err;
	ire_t	*ire;
	ire_t	*prev_ire;
	ire_t	*save_ire;
	ipaddr_t  src, dst, gateway;
	iulp_t	ulp_info = { 0 };
	int	error;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	ipha = (ipha_t *)mp->b_rptr;
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	/* Only the first mblk is examined; no pullup is attempted here */
	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
		freemsg(mp);
		return;
	}
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha_err = (ipha_t *)&icmph[1];
	src = ipha->ipha_src;		/* sender of the redirect */
	dst = ipha_err->ipha_dst;	/* destination being redirected */
	gateway = icmph->icmph_rd_gateway;
	/* Make sure the new gateway is reachable somehow. */
	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	/*
	 * Make sure we had a route for the dest in question and that
	 * that route was pointing to the old gateway (the source of the
	 * redirect packet.)
	 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW, ipst);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway?  From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition.  If we do not do that, then new
	 * connections going thru the new gateway will have no route metrics,
	 * which is counter-intuitive to user.  From a network point of
	 * view, this may or may not make sense even though the new gateway
	 * is still directly connected to us so the route metrics should not
	 * change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway.  There may
	 * be a route to that.  If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
		    ipst);
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	/* A cached route via the old gateway is removed outright */
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		freemsg(mp);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association.  This will allow us to remember that
	 * someone we believe told us to use the particular gateway.
	 */
	save_ire = ire;		/* interface route to the new gateway */
	ire = ire_create(
	    (uchar_t *)&dst,			/* dest addr */
	    (uchar_t *)&ip_g_all_ones,		/* mask */
	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
	    (uchar_t *)&gateway,		/* gateway addr */
	    &save_ire->ire_max_frag,		/* max frag */
	    NULL,				/* no src nce */
	    NULL,				/* no rfq */
	    NULL,				/* no stq */
	    IRE_HOST,
	    NULL,				/* ipif */
	    0,					/* cmask */
	    0,					/* phandle */
	    0,					/* ihandle */
	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
	    &ulp_info,
	    NULL,				/* tsol_gc_t */
	    NULL,				/* gcgrp */
	    ipst);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
	ire_refrele(save_ire);
	/* Note: the counter is bumped whether or not ire_add succeeded */
	atomic_inc_32(&ipst->ips_ip_redirect_cnt);

	if (error == 0) {
		ire_refrele(ire);		/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
	}

	/*
	 * Delete any existing IRE_HOST type redirect ires for this destination.
	 * This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
	if (prev_ire != NULL) {
		if (prev_ire ->ire_flags & RTF_DYNAMIC)
			ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}
3245 */ 3246 static void 3247 icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, 3248 ip_stack_t *ipst) 3249 { 3250 icmph_t icmph; 3251 boolean_t mctl_present; 3252 mblk_t *first_mp; 3253 3254 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3255 3256 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3257 if (mctl_present) 3258 freeb(first_mp); 3259 return; 3260 } 3261 3262 bzero(&icmph, sizeof (icmph_t)); 3263 icmph.icmph_type = ICMP_PARAM_PROBLEM; 3264 icmph.icmph_pp_ptr = ptr; 3265 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs); 3266 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, 3267 ipst); 3268 } 3269 3270 /* 3271 * Build and ship an IPv4 ICMP message using the packet data in mp, and 3272 * the ICMP header pointed to by "stuff". (May be called as writer.) 3273 * Note: assumes that icmp_pkt_err_ok has been called to verify that 3274 * an icmp error packet can be sent. 3275 * Assigns an appropriate source address to the packet. If ipha_dst is 3276 * one of our addresses use it for source. Otherwise pick a source based 3277 * on a route lookup back to ipha_src. 3278 * Note that ipha_src must be set here since the 3279 * packet is likely to arrive on an ill queue in ip_wput() which will 3280 * not set a source address. 3281 */ 3282 static void 3283 icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, 3284 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 3285 { 3286 ipaddr_t dst; 3287 icmph_t *icmph; 3288 ipha_t *ipha; 3289 uint_t len_needed; 3290 size_t msg_len; 3291 mblk_t *mp1; 3292 ipaddr_t src; 3293 ire_t *ire; 3294 mblk_t *ipsec_mp; 3295 ipsec_out_t *io = NULL; 3296 3297 if (mctl_present) { 3298 /* 3299 * If it is : 3300 * 3301 * 1) a IPSEC_OUT, then this is caused by outbound 3302 * datagram originating on this host. IPsec processing 3303 * may or may not have been done. Refer to comments above 3304 * icmp_inbound_error_fanout for details. 
/*
 * Build and ship an IPv4 ICMP message using the packet data in mp, and
 * the ICMP header pointed to by "stuff".  (May be called as writer.)
 * Note: assumes that icmp_pkt_err_ok has been called to verify that
 * an icmp error packet can be sent.
 * Assigns an appropriate source address to the packet. If ipha_dst is
 * one of our addresses use it for source. Otherwise pick a source based
 * on a route lookup back to ipha_src.
 * Note that ipha_src must be set here since the
 * packet is likely to arrive on an ill queue in ip_wput() which will
 * not set a source address.
 *
 * "len" is the number of bytes of ICMP header copied from "stuff".
 * When mctl_present is set, mp is an IPSEC_IN or IPSEC_OUT control block
 * with the offending packet in b_cont; otherwise mp is the M_DATA packet
 * itself.  The outgoing message is always an IPSEC_OUT followed by the
 * new IP + ICMP header block followed by the (trimmed) offending packet,
 * and is sent with put(q, ...).
 */
static void
icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
    boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
{
	ipaddr_t dst;
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*mp1;
	ipaddr_t src;
	ire_t	*ire;
	mblk_t *ipsec_mp;
	ipsec_out_t	*io = NULL;

	if (mctl_present) {
		/*
		 * If it is :
		 *
		 * 1) a IPSEC_OUT, then this is caused by outbound
		 *    datagram originating on this host. IPsec processing
		 *    may or may not have been done. Refer to comments above
		 *    icmp_inbound_error_fanout for details.
		 *
		 * 2) a IPSEC_IN if we are generating a icmp_message
		 *    for an incoming datagram destined for us i.e called
		 *    from ip_fanout_send_icmp.
		 */
		ipsec_info_t *in;
		ipsec_mp = mp;
		mp = ipsec_mp->b_cont;

		in = (ipsec_info_t *)ipsec_mp->b_rptr;
		ipha = (ipha_t *)mp->b_rptr;

		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
		    in->ipsec_info_type == IPSEC_IN);

		if (in->ipsec_info_type == IPSEC_IN) {
			/*
			 * Convert the IPSEC_IN to IPSEC_OUT.
			 */
			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
				/*
				 * NOTE(review): nothing is freed here, so
				 * ipsec_in_to_out presumably disposes of the
				 * message on failure -- confirm.
				 */
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
				return;
			}
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			ASSERT(in->ipsec_info_type == IPSEC_OUT);
			io = (ipsec_out_t *)in;
			/*
			 * Clear out ipsec_out_proc_begin, so we do a fresh
			 * ire lookup.
			 */
			io->ipsec_out_proc_begin = B_FALSE;
		}
		ASSERT(zoneid == io->ipsec_out_zoneid);
		ASSERT(zoneid != ALL_ZONES);
	} else {
		/*
		 * This is in clear. The icmp message we are building
		 * here should go out in clear.
		 *
		 * Pardon the convolution of it all, but it's easier to
		 * allocate a "use cleartext" IPSEC_IN message and convert
		 * it than it is to allocate a new one.
		 */
		ipsec_in_t *ii;
		ASSERT(DB_TYPE(mp) == M_DATA);
		ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (ipsec_mp == NULL) {
			freemsg(mp);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		ii = (ipsec_in_t *)ipsec_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		/*
		 * For trusted extensions using a shared IP address we can
		 * send using any zoneid.
		 */
		if (zoneid == ALL_ZONES)
			ii->ipsec_in_zoneid = GLOBAL_ZONEID;
		else
			ii->ipsec_in_zoneid = zoneid;
		ipsec_mp->b_cont = mp;
		ipha = (ipha_t *)mp->b_rptr;
		/*
		 * Convert the IPSEC_IN to IPSEC_OUT.
		 */
		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
	}

	/* Remember our eventual destination */
	dst = ipha->ipha_src;

	/* Is the offending packet's destination one of our own addresses? */
	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
	if (ire != NULL &&
	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
		src = ipha->ipha_dst;
	} else {
		/* No: take the source address of the route back to dst */
		if (ire != NULL)
			ire_refrele(ire);
		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
		    ipst);
		if (ire == NULL) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ire->ire_src_addr;
	}

	if (ire != NULL)
		ire_refrele(ire);

	/*
	 * Check if we can send back more than 8 bytes in addition to
	 * the IP header.  We try to send 64 bytes of data and the internal
	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
	 */
	len_needed = IPH_HDR_LENGTH(ipha);
	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
	    ipha->ipha_protocol == IPPROTO_IPV6) {

		/* Make the whole packet contiguous to reach the inner header */
		if (!pullupmsg(mp, -1)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			freemsg(ipsec_mp);
			return;
		}
		ipha = (ipha_t *)mp->b_rptr;

		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
			    len_needed));
		} else {
			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);

			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
			len_needed += ip_hdr_length_v6(mp, ip6h);
		}
	}
	len_needed += ipst->ips_ip_icmp_return;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		/* Negative count: trim the excess from the tail */
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}
	/* New leading block for the outgoing IP header + ICMP header */
	mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
	if (mp1 == NULL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
		freemsg(ipsec_mp);
		return;
	}
	mp1->b_cont = mp;
	mp = mp1;
	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
	    ipsec_mp->b_rptr == (uint8_t *)io &&
	    io->ipsec_out_type == IPSEC_OUT);
	ipsec_mp->b_cont = mp;

	/*
	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for ipsec_out_icmp_loopback).
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	/* Start from the icmp_ipha template, then fill in the variable fields */
	*ipha = icmp_ipha;
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	/* The checksum field must be zero while the checksum is computed */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}
3458 */ 3459 io->ipsec_out_icmp_loopback = B_TRUE; 3460 3461 ipha = (ipha_t *)mp->b_rptr; 3462 mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len); 3463 *ipha = icmp_ipha; 3464 ipha->ipha_src = src; 3465 ipha->ipha_dst = dst; 3466 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 3467 msg_len += sizeof (icmp_ipha) + len; 3468 if (msg_len > IP_MAXPACKET) { 3469 (void) adjmsg(mp, IP_MAXPACKET - msg_len); 3470 msg_len = IP_MAXPACKET; 3471 } 3472 ipha->ipha_length = htons((uint16_t)msg_len); 3473 icmph = (icmph_t *)&ipha[1]; 3474 bcopy(stuff, icmph, len); 3475 icmph->icmph_checksum = 0; 3476 icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); 3477 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 3478 put(q, ipsec_mp); 3479 } 3480 3481 /* 3482 * Determine if an ICMP error packet can be sent given the rate limit. 3483 * The limit consists of an average frequency (icmp_pkt_err_interval measured 3484 * in milliseconds) and a burst size. Burst size number of packets can 3485 * be sent arbitrarely closely spaced. 3486 * The state is tracked using two variables to implement an approximate 3487 * token bucket filter: 3488 * icmp_pkt_err_last - lbolt value when the last burst started 3489 * icmp_pkt_err_sent - number of packets sent in current burst 3490 */ 3491 boolean_t 3492 icmp_err_rate_limit(ip_stack_t *ipst) 3493 { 3494 clock_t now = TICK_TO_MSEC(lbolt); 3495 uint_t refilled; /* Number of packets refilled in tbf since last */ 3496 /* Guard against changes by loading into local variable */ 3497 uint_t err_interval = ipst->ips_ip_icmp_err_interval; 3498 3499 if (err_interval == 0) 3500 return (B_FALSE); 3501 3502 if (ipst->ips_icmp_pkt_err_last > now) { 3503 /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */ 3504 ipst->ips_icmp_pkt_err_last = 0; 3505 ipst->ips_icmp_pkt_err_sent = 0; 3506 } 3507 /* 3508 * If we are in a burst update the token bucket filter. 3509 * Update the "last" time to be close to "now" but make sure 3510 * we don't loose precision. 
3511 */ 3512 if (ipst->ips_icmp_pkt_err_sent != 0) { 3513 refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval; 3514 if (refilled > ipst->ips_icmp_pkt_err_sent) { 3515 ipst->ips_icmp_pkt_err_sent = 0; 3516 } else { 3517 ipst->ips_icmp_pkt_err_sent -= refilled; 3518 ipst->ips_icmp_pkt_err_last += refilled * err_interval; 3519 } 3520 } 3521 if (ipst->ips_icmp_pkt_err_sent == 0) { 3522 /* Start of new burst */ 3523 ipst->ips_icmp_pkt_err_last = now; 3524 } 3525 if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) { 3526 ipst->ips_icmp_pkt_err_sent++; 3527 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n", 3528 ipst->ips_icmp_pkt_err_sent)); 3529 return (B_FALSE); 3530 } 3531 ip1dbg(("icmp_err_rate_limit: dropped\n")); 3532 return (B_TRUE); 3533 } 3534 3535 /* 3536 * Check if it is ok to send an IPv4 ICMP error packet in 3537 * response to the IPv4 packet in mp. 3538 * Free the message and return null if no 3539 * ICMP error packet should be sent. 3540 */ 3541 static mblk_t * 3542 icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) 3543 { 3544 icmph_t *icmph; 3545 ipha_t *ipha; 3546 uint_t len_needed; 3547 ire_t *src_ire; 3548 ire_t *dst_ire; 3549 3550 if (!mp) 3551 return (NULL); 3552 ipha = (ipha_t *)mp->b_rptr; 3553 if (ip_csum_hdr(ipha)) { 3554 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs); 3555 freemsg(mp); 3556 return (NULL); 3557 } 3558 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, 3559 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3560 dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, 3561 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3562 if (src_ire != NULL || dst_ire != NULL || 3563 CLASSD(ipha->ipha_dst) || 3564 CLASSD(ipha->ipha_src) || 3565 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { 3566 /* Note: only errors to the fragment with offset 0 */ 3567 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3568 freemsg(mp); 3569 if (src_ire != NULL) 3570 ire_refrele(src_ire); 3571 if (dst_ire != NULL) 3572 
ire_refrele(dst_ire); 3573 return (NULL); 3574 } 3575 if (ipha->ipha_protocol == IPPROTO_ICMP) { 3576 /* 3577 * Check the ICMP type. RFC 1122 sez: don't send ICMP 3578 * errors in response to any ICMP errors. 3579 */ 3580 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE; 3581 if (mp->b_wptr - mp->b_rptr < len_needed) { 3582 if (!pullupmsg(mp, len_needed)) { 3583 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3584 freemsg(mp); 3585 return (NULL); 3586 } 3587 ipha = (ipha_t *)mp->b_rptr; 3588 } 3589 icmph = (icmph_t *) 3590 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]); 3591 switch (icmph->icmph_type) { 3592 case ICMP_DEST_UNREACHABLE: 3593 case ICMP_SOURCE_QUENCH: 3594 case ICMP_TIME_EXCEEDED: 3595 case ICMP_PARAM_PROBLEM: 3596 case ICMP_REDIRECT: 3597 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3598 freemsg(mp); 3599 return (NULL); 3600 default: 3601 break; 3602 } 3603 } 3604 /* 3605 * If this is a labeled system, then check to see if we're allowed to 3606 * send a response to this particular sender. If not, then just drop. 3607 */ 3608 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 3609 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); 3610 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3611 freemsg(mp); 3612 return (NULL); 3613 } 3614 if (icmp_err_rate_limit(ipst)) { 3615 /* 3616 * Only send ICMP error packets every so often. 3617 * This should be done on a per port/source basis, 3618 * but for now this will suffice. 3619 */ 3620 freemsg(mp); 3621 return (NULL); 3622 } 3623 return (mp); 3624 } 3625 3626 /* 3627 * Generate an ICMP redirect message. 3628 */ 3629 static void 3630 icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst) 3631 { 3632 icmph_t icmph; 3633 3634 /* 3635 * We are called from ip_rput where we could 3636 * not have attached an IPSEC_IN. 
3637 */ 3638 ASSERT(mp->b_datap->db_type == M_DATA); 3639 3640 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3641 return; 3642 } 3643 3644 bzero(&icmph, sizeof (icmph_t)); 3645 icmph.icmph_type = ICMP_REDIRECT; 3646 icmph.icmph_code = 1; 3647 icmph.icmph_rd_gateway = gateway; 3648 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); 3649 /* Redirects sent by router, and router is global zone */ 3650 icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst); 3651 } 3652 3653 /* 3654 * Generate an ICMP time exceeded message. 3655 */ 3656 void 3657 icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, 3658 ip_stack_t *ipst) 3659 { 3660 icmph_t icmph; 3661 boolean_t mctl_present; 3662 mblk_t *first_mp; 3663 3664 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3665 3666 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3667 if (mctl_present) 3668 freeb(first_mp); 3669 return; 3670 } 3671 3672 bzero(&icmph, sizeof (icmph_t)); 3673 icmph.icmph_type = ICMP_TIME_EXCEEDED; 3674 icmph.icmph_code = code; 3675 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); 3676 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, 3677 ipst); 3678 } 3679 3680 /* 3681 * Generate an ICMP unreachable message. 
3682 */ 3683 void 3684 icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, 3685 ip_stack_t *ipst) 3686 { 3687 icmph_t icmph; 3688 mblk_t *first_mp; 3689 boolean_t mctl_present; 3690 3691 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3692 3693 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3694 if (mctl_present) 3695 freeb(first_mp); 3696 return; 3697 } 3698 3699 bzero(&icmph, sizeof (icmph_t)); 3700 icmph.icmph_type = ICMP_DEST_UNREACHABLE; 3701 icmph.icmph_code = code; 3702 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); 3703 ip2dbg(("send icmp destination unreachable code %d\n", code)); 3704 icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present, 3705 zoneid, ipst); 3706 } 3707 3708 /* 3709 * Attempt to start recovery of an IPv4 interface that's been shut down as a 3710 * duplicate. As long as someone else holds the address, the interface will 3711 * stay down. When that conflict goes away, the interface is brought back up. 3712 * This is done so that accidental shutdowns of addresses aren't made 3713 * permanent. Your server will recover from a failure. 3714 * 3715 * For DHCP, recovery is not done in the kernel. Instead, it's handled by a 3716 * user space process (dhcpagent). 3717 * 3718 * Recovery completes if ARP reports that the address is now ours (via 3719 * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. 3720 * 3721 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 3722 */ 3723 static void 3724 ipif_dup_recovery(void *arg) 3725 { 3726 ipif_t *ipif = arg; 3727 ill_t *ill = ipif->ipif_ill; 3728 mblk_t *arp_add_mp; 3729 mblk_t *arp_del_mp; 3730 area_t *area; 3731 ip_stack_t *ipst = ill->ill_ipst; 3732 3733 ipif->ipif_recovery_id = 0; 3734 3735 /* 3736 * No lock needed for moving or condemned check, as this is just an 3737 * optimization. 
3738 */ 3739 if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || 3740 (ipif->ipif_flags & IPIF_POINTOPOINT) || 3741 (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { 3742 /* No reason to try to bring this address back. */ 3743 return; 3744 } 3745 3746 if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) 3747 goto alloc_fail; 3748 3749 if (ipif->ipif_arp_del_mp == NULL) { 3750 if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) 3751 goto alloc_fail; 3752 ipif->ipif_arp_del_mp = arp_del_mp; 3753 } 3754 3755 /* Setting the 'unverified' flag restarts DAD */ 3756 area = (area_t *)arp_add_mp->b_rptr; 3757 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 3758 ACE_F_UNVERIFIED; 3759 putnext(ill->ill_rq, arp_add_mp); 3760 return; 3761 3762 alloc_fail: 3763 /* 3764 * On allocation failure, just restart the timer. Note that the ipif 3765 * is down here, so no other thread could be trying to start a recovery 3766 * timer. The ill_lock protects the condemned flag and the recovery 3767 * timer ID. 3768 */ 3769 freemsg(arp_add_mp); 3770 mutex_enter(&ill->ill_lock); 3771 if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && 3772 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 3773 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, 3774 MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3775 } 3776 mutex_exit(&ill->ill_lock); 3777 } 3778 3779 /* 3780 * This is for exclusive changes due to ARP. Either tear down an interface due 3781 * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 
3782 */ 3783 /* ARGSUSED */ 3784 static void 3785 ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 3786 { 3787 ill_t *ill = rq->q_ptr; 3788 arh_t *arh; 3789 ipaddr_t src; 3790 ipif_t *ipif; 3791 char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ 3792 char hbuf[MAC_STR_LEN]; 3793 char sbuf[INET_ADDRSTRLEN]; 3794 const char *failtype; 3795 boolean_t bring_up; 3796 ip_stack_t *ipst = ill->ill_ipst; 3797 3798 switch (((arcn_t *)mp->b_rptr)->arcn_code) { 3799 case AR_CN_READY: 3800 failtype = NULL; 3801 bring_up = B_TRUE; 3802 break; 3803 case AR_CN_FAILED: 3804 failtype = "in use"; 3805 bring_up = B_FALSE; 3806 break; 3807 default: 3808 failtype = "claimed"; 3809 bring_up = B_FALSE; 3810 break; 3811 } 3812 3813 arh = (arh_t *)mp->b_cont->b_rptr; 3814 bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); 3815 3816 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, 3817 sizeof (hbuf)); 3818 (void) ip_dot_addr(src, sbuf); 3819 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3820 3821 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || 3822 ipif->ipif_lcl_addr != src) { 3823 continue; 3824 } 3825 3826 /* 3827 * If we failed on a recovery probe, then restart the timer to 3828 * try again later. 3829 */ 3830 if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && 3831 !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 3832 ill->ill_net_type == IRE_IF_RESOLVER && 3833 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 3834 ipst->ips_ip_dup_recovery > 0 && 3835 ipif->ipif_recovery_id == 0) { 3836 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 3837 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3838 continue; 3839 } 3840 3841 /* 3842 * If what we're trying to do has already been done, then do 3843 * nothing. 
3844 */ 3845 if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) 3846 continue; 3847 3848 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 3849 3850 if (failtype == NULL) { 3851 cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, 3852 ibuf); 3853 } else { 3854 cmn_err(CE_WARN, "%s has duplicate address %s (%s " 3855 "by %s); disabled", ibuf, sbuf, failtype, hbuf); 3856 } 3857 3858 if (bring_up) { 3859 ASSERT(ill->ill_dl_up); 3860 /* 3861 * Free up the ARP delete message so we can allocate 3862 * a fresh one through the normal path. 3863 */ 3864 freemsg(ipif->ipif_arp_del_mp); 3865 ipif->ipif_arp_del_mp = NULL; 3866 if (ipif_resolver_up(ipif, Res_act_initial) != 3867 EINPROGRESS) { 3868 ipif->ipif_addr_ready = 1; 3869 (void) ipif_up_done(ipif); 3870 } 3871 continue; 3872 } 3873 3874 mutex_enter(&ill->ill_lock); 3875 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 3876 ipif->ipif_flags |= IPIF_DUPLICATE; 3877 ill->ill_ipif_dup_count++; 3878 mutex_exit(&ill->ill_lock); 3879 /* 3880 * Already exclusive on the ill; no need to handle deferred 3881 * processing here. 
3882 */ 3883 (void) ipif_down(ipif, NULL, NULL); 3884 ipif_down_tail(ipif); 3885 mutex_enter(&ill->ill_lock); 3886 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 3887 ill->ill_net_type == IRE_IF_RESOLVER && 3888 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 3889 ipst->ips_ip_dup_recovery > 0) { 3890 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 3891 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3892 } 3893 mutex_exit(&ill->ill_lock); 3894 } 3895 freemsg(mp); 3896 } 3897 3898 /* ARGSUSED */ 3899 static void 3900 ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 3901 { 3902 ill_t *ill = rq->q_ptr; 3903 arh_t *arh; 3904 ipaddr_t src; 3905 ipif_t *ipif; 3906 3907 arh = (arh_t *)mp->b_cont->b_rptr; 3908 bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); 3909 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3910 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src) 3911 (void) ipif_resolver_up(ipif, Res_act_defend); 3912 } 3913 freemsg(mp); 3914 } 3915 3916 /* 3917 * News from ARP. ARP sends notification of interesting events down 3918 * to its clients using M_CTL messages with the interesting ARP packet 3919 * attached via b_cont. 3920 * The interesting event from a device comes up the corresponding ARP-IP-DEV 3921 * queue as opposed to ARP sending the message to all the clients, i.e. all 3922 * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache 3923 * table if a cache IRE is found to delete all the entries for the address in 3924 * the packet. 
3925 */ 3926 static void 3927 ip_arp_news(queue_t *q, mblk_t *mp) 3928 { 3929 arcn_t *arcn; 3930 arh_t *arh; 3931 ire_t *ire = NULL; 3932 char hbuf[MAC_STR_LEN]; 3933 char sbuf[INET_ADDRSTRLEN]; 3934 ipaddr_t src; 3935 in6_addr_t v6src; 3936 boolean_t isv6 = B_FALSE; 3937 ipif_t *ipif; 3938 ill_t *ill; 3939 ip_stack_t *ipst; 3940 3941 if (CONN_Q(q)) { 3942 conn_t *connp = Q_TO_CONN(q); 3943 3944 ipst = connp->conn_netstack->netstack_ip; 3945 } else { 3946 ill_t *ill = (ill_t *)q->q_ptr; 3947 3948 ipst = ill->ill_ipst; 3949 } 3950 3951 if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { 3952 if (q->q_next) { 3953 putnext(q, mp); 3954 } else 3955 freemsg(mp); 3956 return; 3957 } 3958 arh = (arh_t *)mp->b_cont->b_rptr; 3959 /* Is it one we are interested in? */ 3960 if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) { 3961 isv6 = B_TRUE; 3962 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, 3963 IPV6_ADDR_LEN); 3964 } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { 3965 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, 3966 IP_ADDR_LEN); 3967 } else { 3968 freemsg(mp); 3969 return; 3970 } 3971 3972 ill = q->q_ptr; 3973 3974 arcn = (arcn_t *)mp->b_rptr; 3975 switch (arcn->arcn_code) { 3976 case AR_CN_BOGON: 3977 /* 3978 * Someone is sending ARP packets with a source protocol 3979 * address that we have published and for which we believe our 3980 * entry is authoritative and (when ill_arp_extend is set) 3981 * verified to be unique on the network. 3982 * 3983 * The ARP module internally handles the cases where the sender 3984 * is just probing (for DAD) and where the hardware address of 3985 * a non-authoritative entry has changed. Thus, these are the 3986 * real conflicts, and we have to do resolution. 3987 * 3988 * We back away quickly from the address if it's from DHCP or 3989 * otherwise temporary and hasn't been used recently (or at 3990 * all). 
We'd like to include "deprecated" addresses here as 3991 * well (as there's no real reason to defend something we're 3992 * discarding), but IPMP "reuses" this flag to mean something 3993 * other than the standard meaning. 3994 * 3995 * If the ARP module above is not extended (meaning that it 3996 * doesn't know how to defend the address), then we just log 3997 * the problem as we always did and continue on. It's not 3998 * right, but there's little else we can do, and those old ATM 3999 * users are going away anyway. 4000 */ 4001 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, 4002 hbuf, sizeof (hbuf)); 4003 (void) ip_dot_addr(src, sbuf); 4004 if (isv6) { 4005 ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL, 4006 ipst); 4007 } else { 4008 ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst); 4009 } 4010 if (ire != NULL && IRE_IS_LOCAL(ire)) { 4011 uint32_t now; 4012 uint32_t maxage; 4013 clock_t lused; 4014 uint_t maxdefense; 4015 uint_t defs; 4016 4017 /* 4018 * First, figure out if this address hasn't been used 4019 * in a while. If it hasn't, then it's a better 4020 * candidate for abandoning. 4021 */ 4022 ipif = ire->ire_ipif; 4023 ASSERT(ipif != NULL); 4024 now = gethrestime_sec(); 4025 maxage = now - ire->ire_create_time; 4026 if (maxage > ipst->ips_ip_max_temp_idle) 4027 maxage = ipst->ips_ip_max_temp_idle; 4028 lused = drv_hztousec(ddi_get_lbolt() - 4029 ire->ire_last_used_time) / MICROSEC + 1; 4030 if (lused >= maxage && (ipif->ipif_flags & 4031 (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) 4032 maxdefense = ipst->ips_ip_max_temp_defend; 4033 else 4034 maxdefense = ipst->ips_ip_max_defend; 4035 4036 /* 4037 * Now figure out how many times we've defended 4038 * ourselves. Ignore defenses that happened long in 4039 * the past. 
4040 */ 4041 mutex_enter(&ire->ire_lock); 4042 if ((defs = ire->ire_defense_count) > 0 && 4043 now - ire->ire_defense_time > 4044 ipst->ips_ip_defend_interval) { 4045 ire->ire_defense_count = defs = 0; 4046 } 4047 ire->ire_defense_count++; 4048 ire->ire_defense_time = now; 4049 mutex_exit(&ire->ire_lock); 4050 ill_refhold(ill); 4051 ire_refrele(ire); 4052 4053 /* 4054 * If we've defended ourselves too many times already, 4055 * then give up and tear down the interface(s) using 4056 * this address. Otherwise, defend by sending out a 4057 * gratuitous ARP. 4058 */ 4059 if (defs >= maxdefense && ill->ill_arp_extend) { 4060 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, 4061 B_FALSE); 4062 } else { 4063 cmn_err(CE_WARN, 4064 "node %s is using our IP address %s on %s", 4065 hbuf, sbuf, ill->ill_name); 4066 /* 4067 * If this is an old (ATM) ARP module, then 4068 * don't try to defend the address. Remain 4069 * compatible with the old behavior. Defend 4070 * only with new ARP. 4071 */ 4072 if (ill->ill_arp_extend) { 4073 qwriter_ip(ill, q, mp, ip_arp_defend, 4074 NEW_OP, B_FALSE); 4075 } else { 4076 ill_refrele(ill); 4077 } 4078 } 4079 return; 4080 } 4081 cmn_err(CE_WARN, 4082 "proxy ARP problem? Node '%s' is using %s on %s", 4083 hbuf, sbuf, ill->ill_name); 4084 if (ire != NULL) 4085 ire_refrele(ire); 4086 break; 4087 case AR_CN_ANNOUNCE: 4088 if (isv6) { 4089 /* 4090 * For XRESOLV interfaces. 4091 * Delete the IRE cache entry and NCE for this 4092 * v6 address 4093 */ 4094 ip_ire_clookup_and_delete_v6(&v6src, ipst); 4095 /* 4096 * If v6src is a non-zero, it's a router address 4097 * as below. Do the same sort of thing to clean 4098 * out off-net IRE_CACHE entries that go through 4099 * the router. 
4100 */ 4101 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4102 ire_walk_v6(ire_delete_cache_gw_v6, 4103 (char *)&v6src, ALL_ZONES, ipst); 4104 } 4105 } else { 4106 nce_hw_map_t hwm; 4107 4108 /* 4109 * ARP gives us a copy of any packet where it thinks 4110 * the address has changed, so that we can update our 4111 * caches. We're responsible for caching known answers 4112 * in the current design. We check whether the 4113 * hardware address really has changed in all of our 4114 * entries that have cached this mapping, and if so, we 4115 * blow them away. This way we will immediately pick 4116 * up the rare case of a host changing hardware 4117 * address. 4118 */ 4119 if (src == 0) 4120 break; 4121 hwm.hwm_addr = src; 4122 hwm.hwm_hwlen = arh->arh_hlen; 4123 hwm.hwm_hwaddr = (uchar_t *)(arh + 1); 4124 NDP_HW_CHANGE_INCR(ipst->ips_ndp4); 4125 ndp_walk_common(ipst->ips_ndp4, NULL, 4126 (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES); 4127 NDP_HW_CHANGE_DECR(ipst->ips_ndp4); 4128 } 4129 break; 4130 case AR_CN_READY: 4131 /* No external v6 resolver has a contract to use this */ 4132 if (isv6) 4133 break; 4134 /* If the link is down, we'll retry this later */ 4135 if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 4136 break; 4137 ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL, 4138 NULL, NULL, ipst); 4139 if (ipif != NULL) { 4140 /* 4141 * If this is a duplicate recovery, then we now need to 4142 * go exclusive to bring this thing back up. 4143 */ 4144 if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) == 4145 IPIF_DUPLICATE) { 4146 ipif_refrele(ipif); 4147 ill_refhold(ill); 4148 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, 4149 B_FALSE); 4150 return; 4151 } 4152 /* 4153 * If this is the first notice that this address is 4154 * ready, then let the user know now. 
4155 */ 4156 if ((ipif->ipif_flags & IPIF_UP) && 4157 !ipif->ipif_addr_ready) { 4158 ipif_mask_reply(ipif); 4159 ip_rts_ifmsg(ipif); 4160 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 4161 sctp_update_ipif(ipif, SCTP_IPIF_UP); 4162 } 4163 ipif->ipif_addr_ready = 1; 4164 ipif_refrele(ipif); 4165 } 4166 ire = ire_cache_lookup(src, ALL_ZONES, MBLK_GETLABEL(mp), ipst); 4167 if (ire != NULL) { 4168 ire->ire_defense_count = 0; 4169 ire_refrele(ire); 4170 } 4171 break; 4172 case AR_CN_FAILED: 4173 /* No external v6 resolver has a contract to use this */ 4174 if (isv6) 4175 break; 4176 ill_refhold(ill); 4177 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE); 4178 return; 4179 } 4180 freemsg(mp); 4181 } 4182 4183 /* 4184 * Create a mblk suitable for carrying the interface index and/or source link 4185 * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used 4186 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user 4187 * application. 4188 */ 4189 mblk_t * 4190 ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, 4191 ip_stack_t *ipst) 4192 { 4193 mblk_t *mp; 4194 ip_pktinfo_t *pinfo; 4195 ipha_t *ipha; 4196 struct ether_header *pether; 4197 4198 mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); 4199 if (mp == NULL) { 4200 ip1dbg(("ip_add_info: allocation failure.\n")); 4201 return (data_mp); 4202 } 4203 4204 ipha = (ipha_t *)data_mp->b_rptr; 4205 pinfo = (ip_pktinfo_t *)mp->b_rptr; 4206 bzero(pinfo, sizeof (ip_pktinfo_t)); 4207 pinfo->ip_pkt_flags = (uchar_t)flags; 4208 pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ 4209 4210 if (flags & (IPF_RECVIF | IPF_RECVADDR)) 4211 pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; 4212 if (flags & IPF_RECVADDR) { 4213 ipif_t *ipif; 4214 ire_t *ire; 4215 4216 /* 4217 * Only valid for V4 4218 */ 4219 ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) == 4220 (IPV4_VERSION << 4)); 4221 4222 ipif = ipif_get_next_ipif(NULL, ill); 4223 if (ipif != NULL) { 4224 /* 
4225 * Since a decision has already been made to deliver the 4226 * packet, there is no need to test for SECATTR and 4227 * ZONEONLY. 4228 * When a multicast packet is transmitted 4229 * a cache entry is created for the multicast address. 4230 * When delivering a copy of the packet or when new 4231 * packets are received we do not want to match on the 4232 * cached entry so explicitly match on 4233 * IRE_LOCAL and IRE_LOOPBACK 4234 */ 4235 ire = ire_ctable_lookup(ipha->ipha_dst, 0, 4236 IRE_LOCAL | IRE_LOOPBACK, 4237 ipif, zoneid, NULL, 4238 MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); 4239 if (ire == NULL) { 4240 /* 4241 * packet must have come on a different 4242 * interface. 4243 * Since a decision has already been made to 4244 * deliver the packet, there is no need to test 4245 * for SECATTR and ZONEONLY. 4246 * Only match on local and broadcast ire's. 4247 * See detailed comment above. 4248 */ 4249 ire = ire_ctable_lookup(ipha->ipha_dst, 0, 4250 IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, 4251 NULL, MATCH_IRE_TYPE, ipst); 4252 } 4253 4254 if (ire == NULL) { 4255 /* 4256 * This is either a multicast packet or 4257 * the address has been removed since 4258 * the packet was received. 4259 * Return INADDR_ANY so that normal source 4260 * selection occurs for the response. 4261 */ 4262 4263 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4264 } else { 4265 pinfo->ip_pkt_match_addr.s_addr = 4266 ire->ire_src_addr; 4267 ire_refrele(ire); 4268 } 4269 ipif_refrele(ipif); 4270 } else { 4271 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4272 } 4273 } 4274 4275 pether = (struct ether_header *)((char *)ipha 4276 - sizeof (struct ether_header)); 4277 /* 4278 * Make sure the interface is an ethernet type, since this option 4279 * is currently supported only on this type of interface. Also make 4280 * sure we are pointing correctly above db_base. 
4281 */ 4282 4283 if ((flags & IPF_RECVSLLA) && 4284 ((uchar_t *)pether >= data_mp->b_datap->db_base) && 4285 (ill->ill_type == IFT_ETHER) && 4286 (ill->ill_net_type == IRE_IF_RESOLVER)) { 4287 4288 pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; 4289 bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, 4290 (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); 4291 } else { 4292 /* 4293 * Clear the bit. Indicate to upper layer that IP is not 4294 * sending this ancillary info. 4295 */ 4296 pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; 4297 } 4298 4299 mp->b_datap->db_type = M_CTL; 4300 mp->b_wptr += sizeof (ip_pktinfo_t); 4301 mp->b_cont = data_mp; 4302 4303 return (mp); 4304 } 4305 4306 /* 4307 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as 4308 * part of the bind request. 4309 */ 4310 4311 boolean_t 4312 ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) 4313 { 4314 ipsec_in_t *ii; 4315 4316 ASSERT(policy_mp != NULL); 4317 ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); 4318 4319 ii = (ipsec_in_t *)policy_mp->b_rptr; 4320 ASSERT(ii->ipsec_in_type == IPSEC_IN); 4321 4322 connp->conn_policy = ii->ipsec_in_policy; 4323 ii->ipsec_in_policy = NULL; 4324 4325 if (ii->ipsec_in_action != NULL) { 4326 if (connp->conn_latch == NULL) { 4327 connp->conn_latch = iplatch_create(); 4328 if (connp->conn_latch == NULL) 4329 return (B_FALSE); 4330 } 4331 ipsec_latch_inbound(connp->conn_latch, ii); 4332 } 4333 return (B_TRUE); 4334 } 4335 4336 /* 4337 * Upper level protocols (ULP) pass through bind requests to IP for inspection 4338 * and to arrange for power-fanout assist. The ULP is identified by 4339 * adding a single byte at the end of the original bind message. 4340 * A ULP other than UDP or TCP that wishes to be recognized passes 4341 * down a bind with a zero length address. 4342 * 4343 * The binding works as follows: 4344 * - A zero byte address means just bind to the protocol. 
4345 * - A four byte address is treated as a request to validate 4346 * that the address is a valid local address, appropriate for 4347 * an application to bind to. This does not affect any fanout 4348 * information in IP. 4349 * - A sizeof sin_t byte address is used to bind to only the local address 4350 * and port. 4351 * - A sizeof ipa_conn_t byte address contains complete fanout information 4352 * consisting of local and remote addresses and ports. In 4353 * this case, the addresses are both validated as appropriate 4354 * for this operation, and, if so, the information is retained 4355 * for use in the inbound fanout. 4356 * 4357 * The ULP (except in the zero-length bind) can append an 4358 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the 4359 * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants 4360 * a copy of the source or destination IRE (source for local bind; 4361 * destination for complete bind). IPSEC_POLICY_SET indicates that the 4362 * policy information contained should be copied on to the conn. 4363 * 4364 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. 4365 */ 4366 mblk_t * 4367 ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) 4368 { 4369 ssize_t len; 4370 struct T_bind_req *tbr; 4371 sin_t *sin; 4372 ipa_conn_t *ac; 4373 uchar_t *ucp; 4374 mblk_t *mp1; 4375 boolean_t ire_requested; 4376 boolean_t ipsec_policy_set = B_FALSE; 4377 int error = 0; 4378 int protocol; 4379 ipa_conn_x_t *acx; 4380 4381 ASSERT(!connp->conn_af_isv6); 4382 connp->conn_pkt_isv6 = B_FALSE; 4383 4384 len = MBLKL(mp); 4385 if (len < (sizeof (*tbr) + 1)) { 4386 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 4387 "ip_bind: bogus msg, len %ld", len); 4388 /* XXX: Need to return something better */ 4389 goto bad_addr; 4390 } 4391 /* Back up and extract the protocol identifier. 
*/ 4392 mp->b_wptr--; 4393 protocol = *mp->b_wptr & 0xFF; 4394 tbr = (struct T_bind_req *)mp->b_rptr; 4395 /* Reset the message type in preparation for shipping it back. */ 4396 DB_TYPE(mp) = M_PCPROTO; 4397 4398 connp->conn_ulp = (uint8_t)protocol; 4399 4400 /* 4401 * Check for a zero length address. This is from a protocol that 4402 * wants to register to receive all packets of its type. 4403 */ 4404 if (tbr->ADDR_length == 0) { 4405 /* 4406 * These protocols are now intercepted in ip_bind_v6(). 4407 * Reject protocol-level binds here for now. 4408 * 4409 * For SCTP raw socket, ICMP sends down a bind with sin_t 4410 * so that the protocol type cannot be SCTP. 4411 */ 4412 if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || 4413 protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { 4414 goto bad_addr; 4415 } 4416 4417 /* 4418 * 4419 * The udp module never sends down a zero-length address, 4420 * and allowing this on a labeled system will break MLP 4421 * functionality. 4422 */ 4423 if (is_system_labeled() && protocol == IPPROTO_UDP) 4424 goto bad_addr; 4425 4426 if (connp->conn_mac_exempt) 4427 goto bad_addr; 4428 4429 /* No hash here really. The table is big enough. */ 4430 connp->conn_srcv6 = ipv6_all_zeros; 4431 4432 ipcl_proto_insert(connp, protocol); 4433 4434 tbr->PRIM_type = T_BIND_ACK; 4435 return (mp); 4436 } 4437 4438 /* Extract the address pointer from the message. */ 4439 ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, 4440 tbr->ADDR_length); 4441 if (ucp == NULL) { 4442 ip1dbg(("ip_bind: no address\n")); 4443 goto bad_addr; 4444 } 4445 if (!OK_32PTR(ucp)) { 4446 ip1dbg(("ip_bind: unaligned address\n")); 4447 goto bad_addr; 4448 } 4449 /* 4450 * Check for trailing mps. 
4451 */ 4452 4453 mp1 = mp->b_cont; 4454 ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); 4455 ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); 4456 4457 switch (tbr->ADDR_length) { 4458 default: 4459 ip1dbg(("ip_bind: bad address length %d\n", 4460 (int)tbr->ADDR_length)); 4461 goto bad_addr; 4462 4463 case IP_ADDR_LEN: 4464 /* Verification of local address only */ 4465 error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0, 4466 ire_requested, ipsec_policy_set, B_FALSE); 4467 break; 4468 4469 case sizeof (sin_t): 4470 sin = (sin_t *)ucp; 4471 error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, 4472 sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); 4473 break; 4474 4475 case sizeof (ipa_conn_t): 4476 ac = (ipa_conn_t *)ucp; 4477 /* For raw socket, the local port is not set. */ 4478 if (ac->ac_lport == 0) 4479 ac->ac_lport = connp->conn_lport; 4480 /* Always verify destination reachability. */ 4481 error = ip_bind_connected(connp, mp, &ac->ac_laddr, 4482 ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested, 4483 ipsec_policy_set, B_TRUE, B_TRUE); 4484 break; 4485 4486 case sizeof (ipa_conn_x_t): 4487 acx = (ipa_conn_x_t *)ucp; 4488 /* 4489 * Whether or not to verify destination reachability depends 4490 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. 4491 */ 4492 error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr, 4493 acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr, 4494 acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set, 4495 B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0); 4496 break; 4497 } 4498 if (error == EINPROGRESS) 4499 return (NULL); 4500 else if (error != 0) 4501 goto bad_addr; 4502 /* 4503 * Pass the IPsec headers size in ire_ipsec_overhead. 4504 * We can't do this in ip_bind_insert_ire because the policy 4505 * may not have been inherited at that point in time and hence 4506 * conn_out_enforce_policy may not be set. 
4507 */ 4508 mp1 = mp->b_cont; 4509 if (ire_requested && connp->conn_out_enforce_policy && 4510 mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) { 4511 ire_t *ire = (ire_t *)mp1->b_rptr; 4512 ASSERT(MBLKL(mp1) >= sizeof (ire_t)); 4513 ire->ire_ipsec_overhead = conn_ipsec_length(connp); 4514 } 4515 4516 /* Send it home. */ 4517 mp->b_datap->db_type = M_PCPROTO; 4518 tbr->PRIM_type = T_BIND_ACK; 4519 return (mp); 4520 4521 bad_addr: 4522 /* 4523 * If error = -1 then we generate a TBADADDR - otherwise error is 4524 * a unix errno. 4525 */ 4526 if (error > 0) 4527 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 4528 else 4529 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 4530 return (mp); 4531 } 4532 4533 /* 4534 * Here address is verified to be a valid local address. 4535 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast 4536 * address is also considered a valid local address. 4537 * In the case of a broadcast/multicast address, however, the 4538 * upper protocol is expected to reset the src address 4539 * to 0 if it sees a IRE_BROADCAST type returned so that 4540 * no packets are emitted with broadcast/multicast address as 4541 * source address (that violates hosts requirements RFC1122) 4542 * The addresses valid for bind are: 4543 * (1) - INADDR_ANY (0) 4544 * (2) - IP address of an UP interface 4545 * (3) - IP address of a DOWN interface 4546 * (4) - valid local IP broadcast addresses. In this case 4547 * the conn will only receive packets destined to 4548 * the specified broadcast address. 4549 * (5) - a multicast address. In this case 4550 * the conn will only receive packets destined to 4551 * the specified multicast address. Note: the 4552 * application still has to issue an 4553 * IP_ADD_MEMBERSHIP socket option. 4554 * 4555 * On error, return -1 for TBADADDR otherwise pass the 4556 * errno with TSYSERR reply. 4557 * 4558 * In all the above cases, the bound address must be valid in the current zone. 
 * When the address is loopback, multicast or broadcast, there might be many
 * matching IREs so bind has to look up based on the zone.
 *
 * Note: lport is in network byte order.
 *
 * Arguments, as used by the body below:
 *   ire_requested    - mp->b_cont is the IRE request mblk; on success it is
 *                      handed to ip_bind_insert_ire() together with src_ire.
 *   ipsec_policy_set - mp->b_cont is a policy mblk; it is passed to
 *                      ip_bind_ipsec_policy_set() on success and is always
 *                      freed (and mp->b_cont cleared) on the way out through
 *                      bad_addr.
 *   fanout_insert    - when false, only the address is verified and
 *                      ipcl_bind_insert() is not called.
 *
 * EINPROGRESS is returned directly (bypassing the bad_addr cleanup) when
 * ipif_lookup_addr() reports that it deferred processing of mp.
 */
int
ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
    boolean_t ire_requested, boolean_t ipsec_policy_set,
    boolean_t fanout_insert)
{
	int		error = 0;
	ire_t		*src_ire;
	mblk_t		*policy_mp;
	ipif_t		*ipif;
	zoneid_t	zoneid;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * policy_mp is assigned only when ipsec_policy_set is true; every
	 * later use of it is guarded by the same flag.
	 */
	if (ipsec_policy_set) {
		policy_mp = mp->b_cont;
	}

	/*
	 * If it was previously connected, conn_fully_bound would have
	 * been set.
	 */
	connp->conn_fully_bound = B_FALSE;

	src_ire = NULL;
	ipif = NULL;

	zoneid = IPCL_ZONEID(connp);

	/* A zero src_addr (case (1), INADDR_ANY) needs no verification. */
	if (src_addr) {
		src_ire = ire_route_lookup(src_addr, 0, 0, 0,
		    NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
		/*
		 * If an address other than 0.0.0.0 is requested,
		 * we verify that it is a valid address for bind
		 * Note: Following code is in if-else-if form for
		 * readability compared to a condition check.
		 */
		/* LINTED - statement has no consequent */
		if (IRE_IS_LOCAL(src_ire)) {
			/*
			 * (2) Bind to address of local UP interface
			 */
		} else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
			/*
			 * (4) Bind to broadcast address
			 * Note: permitted only from transports that
			 * request IRE
			 */
			if (!ire_requested)
				error = EADDRNOTAVAIL;
		} else {
			/*
			 * (3) Bind to address of local DOWN interface
			 * (ipif_lookup_addr() looks up all interfaces
			 * but we do not get here for UP interfaces
			 * - case (2) above)
			 * We put the protocol byte back into the mblk
			 * since we may come back via ip_wput_nondata()
			 * later with this mblk if ipif_lookup_addr chooses
			 * to defer processing.
			 */
			*mp->b_wptr++ = (char)connp->conn_ulp;
			if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid,
			    CONNP_TO_WQ(connp), mp, ip_wput_nondata,
			    &error, ipst)) != NULL) {
				/* Only existence matters; drop the hold. */
				ipif_refrele(ipif);
			} else if (error == EINPROGRESS) {
				/*
				 * Early return: b_wptr is deliberately left
				 * advanced past the protocol byte and the
				 * policy mblk (if any) is not freed here.
				 */
				if (src_ire != NULL)
					ire_refrele(src_ire);
				return (EINPROGRESS);
			} else if (CLASSD(src_addr)) {
				error = 0;
				if (src_ire != NULL)
					ire_refrele(src_ire);
				/*
				 * (5) bind to multicast address.
				 * Fake out the IRE returned to upper
				 * layer to be a broadcast IRE.
				 */
				src_ire = ire_ctable_lookup(
				    INADDR_BROADCAST, INADDR_ANY,
				    IRE_BROADCAST, NULL, zoneid, NULL,
				    (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY),
				    ipst);
				if (src_ire == NULL || !ire_requested)
					error = EADDRNOTAVAIL;
			} else {
				/*
				 * Not a valid address for bind
				 */
				error = EADDRNOTAVAIL;
			}
			/*
			 * Just to keep it consistent with the processing in
			 * ip_bind_v4()
			 */
			mp->b_wptr--;
		}
		if (error) {
			/* Red Alert!  Attempting to be a bogon! */
			ip1dbg(("ip_bind: bad src address 0x%x\n",
			    ntohl(src_addr)));
			goto bad_addr;
		}
	}

	/*
	 * Allow setting new policies. For example, disconnects come
	 * down as ipa_t bind. As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * If not fanout_insert this was just an address verification
	 */
	if (fanout_insert) {
		/*
		 * The addresses have been verified. Time to insert in
		 * the correct fanout list.
		 */
		IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6);
		connp->conn_lport = lport;
		connp->conn_fport = 0;
		/*
		 * Do we need to add a check to reject Multicast packets
		 */
		/*
		 * The byte at b_wptr is passed as the protocol argument.
		 * NOTE(review): presumably the caller stored the ULP protocol
		 * there (ip_bind_connected() reads the same byte) - confirm.
		 */
		error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport);
	}

	if (error == 0) {
		/* -1 is the "TBADADDR" return value described above. */
		if (ire_requested) {
			if (!ip_bind_insert_ire(mp, src_ire, NULL, ipst)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		} else if (ipsec_policy_set) {
			if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
				error = -1;
				/* Falls through to bad_addr */
			}
		}
	}
bad_addr:
	/* Common exit: reached on success as well as on failure. */
	if (error != 0) {
		/*
		 * On failure, call tsol_mlp_anon() with B_FALSE for an
		 * anonymous port and reset the conn to mlptSingle.
		 */
		if (connp->conn_anon_port) {
			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
			    connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
			    B_FALSE);
		}
		connp->conn_mlp_type = mlptSingle;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	if (ipsec_policy_set) {
		ASSERT(policy_mp == mp->b_cont);
		ASSERT(policy_mp != NULL);
		freeb(policy_mp);
		/*
		 * As of now assume that nothing else accompanies
		 * IPSEC_POLICY_SET.
		 */
		mp->b_cont = NULL;
	}
	return (error);
}

/*
 * Verify that both the source and destination addresses
 * are valid.  If verify_dst is false, then the destination address may be
 * unreachable, i.e. have no route to it.  Protocols like TCP want to verify
 * destination reachability, while tunnels do not.
 * Note that we allow connect to broadcast and multicast
 * addresses when ire_requested is set. Thus the ULP
 * has to check for IRE_BROADCAST and multicast.
 *
 * Returns zero if ok.
 * On error: returns -1 to mean TBADADDR otherwise returns an errno
 * (for use with TSYSERR reply).
 *
 * Note: lport and fport are in network byte order.
 */
/*
 * Implementation notes for ip_bind_connected():
 *  - *src_addrp is both input and output: it supplies the requested source
 *    address and is overwritten with the address finally chosen.
 *  - The protocol used for the fanout insert is the byte at mp->b_wptr.
 *  - ire_requested and ipsec_policy_set are mutually exclusive (ASSERTed).
 *  - Every path leaves through bad_addr, which frees the policy mblk (when
 *    ipsec_policy_set) and releases every IRE hold still outstanding.
 */
int
ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
    uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
    boolean_t ire_requested, boolean_t ipsec_policy_set,
    boolean_t fanout_insert, boolean_t verify_dst)
{
	ire_t		*src_ire;
	ire_t		*dst_ire;
	int		error = 0;
	int		protocol;
	mblk_t		*policy_mp;
	ire_t		*sire = NULL;
	ire_t		*md_dst_ire = NULL;
	ire_t		*lso_dst_ire = NULL;
	ill_t		*ill = NULL;
	zoneid_t	zoneid;
	ipaddr_t	src_addr = *src_addrp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	src_ire = dst_ire = NULL;
	protocol = *mp->b_wptr & 0xFF;

	/*
	 * If we never got a disconnect before, clear it now.
	 */
	connp->conn_fully_bound = B_FALSE;

	/* policy_mp is only assigned, and only used, under this flag. */
	if (ipsec_policy_set) {
		policy_mp = mp->b_cont;
	}

	zoneid = IPCL_ZONEID(connp);

	if (CLASSD(dst_addr)) {
		/* Pick up an IRE_BROADCAST */
		dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL,
		    NULL, zoneid, MBLK_GETLABEL(mp),
		    (MATCH_IRE_RECURSIVE |
		    MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE |
		    MATCH_IRE_SECATTR), ipst);
	} else {
		/*
		 * If conn_dontroute is set or if conn_nexthop_set is set,
		 * and onlink ipif is not found set ENETUNREACH error.
		 */
		if (connp->conn_dontroute || connp->conn_nexthop_set) {
			ipif_t *ipif;

			ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ?
			    dst_addr : connp->conn_nexthop_v4, zoneid, ipst);
			if (ipif == NULL) {
				error = ENETUNREACH;
				goto bad_addr;
			}
			ipif_refrele(ipif);
		}

		/*
		 * Only the non-nexthop lookup asks for the parent IRE
		 * (&sire with MATCH_IRE_PARENT); sire stays NULL otherwise.
		 */
		if (connp->conn_nexthop_set) {
			dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
			    0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp),
			    MATCH_IRE_SECATTR, ipst);
		} else {
			dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
			    &sire, zoneid, MBLK_GETLABEL(mp),
			    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
			    MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE |
			    MATCH_IRE_SECATTR), ipst);
		}
	}
	/*
	 * dst_ire can't be a broadcast when not ire_requested.
	 * We also prevent ire's with src address INADDR_ANY to
	 * be used, which are created temporarily for
	 * sending out packets from endpoints that have
	 * conn_unspec_src set.  If verify_dst is true, the destination must be
	 * reachable.  If verify_dst is false, the destination needn't be
	 * reachable.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure.  May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY ||
	    (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 */
		if (verify_dst || (dst_ire != NULL)) {
			if (ip_debug > 2) {
				pr_addr_dbg("ip_bind_connected: bad connected "
				    "dst %s\n", AF_INET, &dst_addr);
			}
			if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
			goto bad_addr;
		}
		/*
		 * Falling out of this block means !verify_dst and
		 * dst_ire == NULL; the code below must tolerate a NULL
		 * dst_ire.
		 */
	}

	/*
	 * We now know that routing will allow us to reach the destination.
	 * Check whether Trusted Solaris policy allows communication with this
	 * host, and pretend that the destination is unreachable if not.
	 *
	 * This is never a problem for TCP, since that transport is known to
	 * compute the label properly as part of the tcp_rput_other T_BIND_ACK
	 * handling.  If the remote is unreachable, it will be detected at that
	 * point, so there's no reason to check it here.
	 *
	 * Note that for sendto (and other datagram-oriented friends), this
	 * check is done as part of the data path label computation instead.
	 * The check here is just to make non-TCP connect() report the right
	 * error.
	 */
	if (dst_ire != NULL && is_system_labeled() &&
	    !IPCL_IS_TCP(connp) &&
	    tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL,
	    connp->conn_mac_exempt, ipst) != 0) {
		error = EHOSTUNREACH;
		if (ip_debug > 2) {
			pr_addr_dbg("ip_bind_connected: no label for dst %s\n",
			    AF_INET, &dst_addr);
		}
		goto bad_addr;
	}

	/*
	 * If the app does a connect(), it means that it will most likely
	 * send more than 1 packet to the destination.  It makes sense
	 * to clear the temporary flag.
	 */
	if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE &&
	    (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) {
		irb_t *irb = dst_ire->ire_bucket;

		rw_enter(&irb->irb_lock, RW_WRITER);
		/*
		 * We need to recheck for IRE_MARK_TEMPORARY after acquiring
		 * the lock to guarantee irb_tmp_ire_cnt.
		 */
		if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) {
			dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY;
			irb->irb_tmp_ire_cnt--;
		}
		rw_exit(&irb->irb_lock);
	}

	/*
	 * See if we should notify ULP about LSO/MDT; we do this whether or not
	 * ire_requested is TRUE, in order to handle active connects; LSO/MDT
	 * eligibility tests for passive connects are handled separately
	 * through tcp_adapt_ire().  We do this before the source address
	 * selection, because dst_ire may change after a call to
	 * ipif_select_source().  This is a best-effort check, as the
	 * packet for this connection may not actually go through
	 * dst_ire->ire_stq, and the exact IRE can only be known after
	 * calling ip_newroute().  This is why we further check on the
	 * IRE during LSO/Multidata packet transmission in
	 * tcp_lsosend()/tcp_multisend().
	 *
	 * At most one of lso_dst_ire/md_dst_ire is set, each with its own
	 * hold that is released at bad_addr.
	 */
	if (!ipsec_policy_set && dst_ire != NULL &&
	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
	    (ill = ire_to_ill(dst_ire), ill != NULL)) {
		if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) {
			lso_dst_ire = dst_ire;
			IRE_REFHOLD(lso_dst_ire);
		} else if (ipst->ips_ip_multidata_outbound &&
		    ILL_MDT_CAPABLE(ill)) {
			md_dst_ire = dst_ire;
			IRE_REFHOLD(md_dst_ire);
		}
	}

	if (dst_ire != NULL &&
	    dst_ire->ire_type == IRE_LOCAL &&
	    dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) {
		/*
		 * If the IRE belongs to a different zone, look for a matching
		 * route in the forwarding table and use the source address from
		 * that route.
		 */
		src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL,
		    zoneid, 0, NULL,
		    MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
		    MATCH_IRE_RJ_BHOLE, ipst);
		if (src_ire == NULL) {
			error = EHOSTUNREACH;
			goto bad_addr;
		} else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
			/* src_ire is still held; bad_addr releases it. */
			if (!(src_ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
			goto bad_addr;
		}
		if (src_addr == INADDR_ANY)
			src_addr = src_ire->ire_src_addr;
		/* Reset to NULL so the ASSERT before the lookup below holds. */
		ire_refrele(src_ire);
		src_ire = NULL;
	} else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) {
		if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
			/*
			 * The parent route carries RTF_SETSRC: use its source
			 * address and let it replace dst_ire.  The hold on
			 * sire moves to dst_ire, hence sire = NULL.
			 */
			src_addr = sire->ire_src_addr;
			ire_refrele(dst_ire);
			dst_ire = sire;
			sire = NULL;
		} else {
			/*
			 * Pick a source address so that a proper inbound
			 * load spreading would happen.
			 */
			ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill;
			ipif_t *src_ipif = NULL;
			ire_t *ipif_ire;

			/*
			 * Supply a local source address such that inbound
			 * load spreading happens.
			 *
			 * Determine the best source address on this ill for
			 * the destination.
			 *
			 * 1) For broadcast, we should return a broadcast ire
			 *    found above so that upper layers know that the
			 *    destination address is a broadcast address.
			 *
			 * 2) If this is part of a group, select a better
			 *    source address so that better inbound load
			 *    balancing happens. Do the same if the ipif
			 *    is DEPRECATED.
			 *
			 * 3) If the outgoing interface is part of a usesrc
			 *    group, then try selecting a source address from
			 *    the usesrc ILL.
			 *
			 * Note on the condition below: && binds tighter than
			 * ||, so it reads as
			 *   (IRE is in another zone) ||
			 *   (!RTF_SETSRC && !IRE_BROADCAST &&
			 *    (ill group || DEPRECATED || usesrc)).
			 */
			if ((dst_ire->ire_zoneid != zoneid &&
			    dst_ire->ire_zoneid != ALL_ZONES) ||
			    (!(dst_ire->ire_flags & RTF_SETSRC)) &&
			    (!(dst_ire->ire_type & IRE_BROADCAST) &&
			    ((dst_ill->ill_group != NULL) ||
			    (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
			    (dst_ill->ill_usesrc_ifindex != 0)))) {
				/*
				 * If the destination is reachable via a
				 * given gateway, the selected source address
				 * should be in the same subnet as the gateway.
				 * Otherwise, the destination is not reachable.
				 *
				 * If there are no interfaces on the same subnet
				 * as the destination, ipif_select_source gives
				 * first non-deprecated interface which might be
				 * on a different subnet than the gateway.
				 * This is not desirable. Hence pass the dst_ire
				 * source address to ipif_select_source.
				 * It is sure that the destination is reachable
				 * with the dst_ire source address subnet.
				 * So passing dst_ire source address to
				 * ipif_select_source will make sure that the
				 * selected source will be on the same subnet
				 * as dst_ire source address.
				 */
				ipaddr_t saddr =
				    dst_ire->ire_ipif->ipif_src_addr;
				src_ipif = ipif_select_source(dst_ill,
				    saddr, zoneid);
				if (src_ipif != NULL) {
					if (IS_VNI(src_ipif->ipif_ill)) {
						/*
						 * For VNI there is no
						 * interface route
						 */
						src_addr =
						    src_ipif->ipif_src_addr;
					} else {
						/*
						 * Switch dst_ire to the ipif's
						 * IRE when one exists;
						 * otherwise keep dst_ire.
						 */
						ipif_ire =
						    ipif_to_ire(src_ipif);
						if (ipif_ire != NULL) {
							IRE_REFRELE(dst_ire);
							dst_ire = ipif_ire;
						}
						src_addr =
						    dst_ire->ire_src_addr;
					}
					ipif_refrele(src_ipif);
				} else {
					src_addr = dst_ire->ire_src_addr;
				}
			} else {
				src_addr = dst_ire->ire_src_addr;
			}
		}
	}

	/*
	 * We do ire_route_lookup() here (and not an
	 * interface lookup) as we assert that
	 * src_addr should only come from an
	 * UP interface for hard binding.
	 */
	ASSERT(src_ire == NULL);
	src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL,
	    NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
	/* src_ire must be a local|loopback */
	if (!IRE_IS_LOCAL(src_ire)) {
		if (ip_debug > 2) {
			pr_addr_dbg("ip_bind_connected: bad connected "
			    "src %s\n", AF_INET, &src_addr);
		}
		error = EADDRNOTAVAIL;
		goto bad_addr;
	}

	/*
	 * If the source address is a loopback address, the
	 * destination had best be local or multicast.
	 * The transports that can't handle multicast will reject
	 * those addresses.
	 */
	if (src_ire->ire_type == IRE_LOOPBACK &&
	    !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) {
		ip1dbg(("ip_bind_connected: bad connected loopback\n"));
		error = -1;
		goto bad_addr;
	}

	/*
	 * Allow setting new policies. For example, disconnects come
	 * down as ipa_t bind. As we would have set conn_policy_cached
	 * to B_TRUE before, we should set it to B_FALSE, so that policy
	 * can change after the disconnect.
	 */
	connp->conn_policy_cached = B_FALSE;

	/*
	 * Set the conn addresses/ports immediately, so the IPsec policy calls
	 * can handle their passed-in conn's.
	 */

	IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
	IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6);
	connp->conn_lport = lport;
	connp->conn_fport = fport;
	/* Report the chosen source address back to the caller. */
	*src_addrp = src_addr;

	ASSERT(!(ipsec_policy_set && ire_requested));
	if (ire_requested) {
		iulp_t *ulp_info = NULL;

		/*
		 * Note that sire will not be NULL if this is an off-link
		 * connection and there is not cache for that dest yet.
		 *
		 * XXX Because of an existing bug, if there are multiple
		 * default routes, the IRE returned now may not be the actual
		 * default route used (default routes are chosen in a
		 * round robin fashion).  So if the metrics for different
		 * default routes are different, we may return the wrong
		 * metrics.  This will not be a problem if the existing
		 * bug is fixed.
		 */
		if (sire != NULL) {
			ulp_info = &(sire->ire_uinfo);
		}
		if (!ip_bind_insert_ire(mp, dst_ire, ulp_info, ipst)) {
			error = -1;
			goto bad_addr;
		}
	} else if (ipsec_policy_set) {
		if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
			error = -1;
			goto bad_addr;
		}
	}

	/*
	 * Cache IPsec policy in this conn.  If we have per-socket policy,
	 * we'll cache that.  If we don't, we'll inherit global policy.
	 *
	 * We can't insert until the conn reflects the policy. Note that
	 * conn_policy_cached is set by ipsec_conn_cache_policy() even for
	 * connections where we don't have a policy. This is to prevent
	 * global policy lookups in the inbound path.
	 *
	 * If we insert before we set conn_policy_cached,
	 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true
	 * because global policy could be non-empty. We normally call
	 * ipsec_check_policy() for conn_policy_cached connections only if
	 * ipc_in_enforce_policy is set. But in this case,
	 * conn_policy_cached can get set anytime since we made the
	 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is
	 * called, which will make the above assumption false.  Thus, we
	 * need to insert after we set conn_policy_cached.
	 */
	if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0)
		goto bad_addr;

	if (fanout_insert) {
		/*
		 * The addresses have been verified. Time to insert in
		 * the correct fanout list.
		 */
		error = ipcl_conn_insert(connp, protocol, src_addr,
		    dst_addr, connp->conn_ports);
	}

	if (error == 0) {
		connp->conn_fully_bound = B_TRUE;
		/*
		 * Our initial checks for LSO/MDT have passed; the IRE is not
		 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to
		 * be supporting LSO/MDT.  Pass the IRE, IPC and ILL into
		 * ip_xxinfo_return(), which performs further checks
		 * against them and upon success, returns the LSO/MDT info
		 * mblk which we will attach to the bind acknowledgment.
		 */
		if (lso_dst_ire != NULL) {
			mblk_t *lsoinfo_mp;

			ASSERT(ill->ill_lso_capab != NULL);
			if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp,
			    ill->ill_name, ill->ill_lso_capab)) != NULL)
				linkb(mp, lsoinfo_mp);
		} else if (md_dst_ire != NULL) {
			mblk_t *mdinfo_mp;

			ASSERT(ill->ill_mdt_capab != NULL);
			if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
			    ill->ill_name, ill->ill_mdt_capab)) != NULL)
				linkb(mp, mdinfo_mp);
		}
	}
bad_addr:
	/* Common exit: reached on success as well as on failure. */
	if (ipsec_policy_set) {
		ASSERT(policy_mp == mp->b_cont);
		ASSERT(policy_mp != NULL);
		freeb(policy_mp);
		/*
		 * As of now assume that nothing else accompanies
		 * IPSEC_POLICY_SET.
		 */
		mp->b_cont = NULL;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	if (dst_ire != NULL)
		IRE_REFRELE(dst_ire);
	if (sire != NULL)
		IRE_REFRELE(sire);
	if (md_dst_ire != NULL)
		IRE_REFRELE(md_dst_ire);
	if (lso_dst_ire != NULL)
		IRE_REFRELE(lso_dst_ire);
	return (error);
}

/*
 * Insert the ire in b_cont. Returns false if it fails (due to lack of space).
 * Prefers dst_ire over src_ire.
 */
/*
 * ip_bind_insert_ire() details:
 *   mp       - bind message; mp->b_cont must be non-NULL (ASSERTed).
 *   ire      - IRE to copy into mp->b_cont, or NULL to unlink and free that
 *              mblk instead.
 *   ulp_info - optional; when non-NULL it overwrites ire_uinfo in the copy.
 * Returns 0 only when ire != NULL and fewer than sizeof (ire_t) bytes lie
 * between b_rptr and db_lim of mp->b_cont; returns 1 otherwise.
 */
static boolean_t
ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
{
	mblk_t	*mp1;
	ire_t *ret_ire = NULL;

	mp1 = mp->b_cont;
	ASSERT(mp1 != NULL);

	if (ire != NULL) {
		/*
		 * mp1 initialized above to IRE_DB_REQ_TYPE
		 * appended mblk. Its <upper protocol>'s
		 * job to make sure there is room.
		 */
		if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t))
			return (0);

		/* Retype the mblk and copy the whole ire_t into it. */
		mp1->b_datap->db_type = IRE_DB_TYPE;
		mp1->b_wptr = mp1->b_rptr + sizeof (ire_t);
		bcopy(ire, mp1->b_rptr, sizeof (ire_t));
		ret_ire = (ire_t *)mp1->b_rptr;
		/*
		 * Pass the latest setting of the ip_path_mtu_discovery and
		 * copy the ulp info if any.
		 */
		ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ?
		    IPH_DF : 0;
		if (ulp_info != NULL) {
			bcopy(ulp_info, &(ret_ire->ire_uinfo),
			    sizeof (iulp_t));
		}
		/* The copy records the mblk it lives in. */
		ret_ire->ire_mp = mp1;
	} else {
		/*
		 * No IRE was found. Remove IRE mblk.
		 */
		mp->b_cont = mp1->b_cont;
		freeb(mp1);
	}

	return (1);
}

/*
 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
 * the final piece where we don't.  Return a pointer to the first mblk in the
 * result, and update the pointer to the next mblk to chew on.
 *
 * Returns NULL, touching nothing, when len is 0, mpp is NULL or *mpp is NULL.
 *
 * Failure handling differs by path:
 *  - When the first mblk alone holds more than len bytes and dupb() fails,
 *    NULL is returned but the chain is NOT freed and *mpp is left unchanged.
 *  - On every other failure (pullupmsg() failure, dupb() failure further
 *    down the chain, or a chain shorter than len) the messages are freed
 *    and *mpp is set to NULL before NULL is returned.
 *
 * On success in the first-mblk case *mpp is not written; the original head
 * mblk simply has its b_rptr advanced by len.
 */
mblk_t *
ip_carve_mp(mblk_t **mpp, ssize_t len)
{
	mblk_t	*mp0;
	mblk_t	*mp1;
	mblk_t	*mp2;

	if (!len || !mpp || !(mp0 = *mpp))
		return (NULL);
	/* If we aren't going to consume the first mblk, we need a dup. */
	if (mp0->b_wptr - mp0->b_rptr > len) {
		mp1 = dupb(mp0);
		if (mp1) {
			/* Partition the data between the two mblks. */
			mp1->b_wptr = mp1->b_rptr + len;
			mp0->b_rptr = mp1->b_wptr;
			/*
			 * after adjustments if mblk not consumed is now
			 * unaligned, try to align it. If this fails free
			 * all messages and let upper layer recover.
			 */
			if (!OK_32PTR(mp0->b_rptr)) {
				if (!pullupmsg(mp0, -1)) {
					freemsg(mp0);
					freemsg(mp1);
					*mpp = NULL;
					return (NULL);
				}
			}
		}
		/* mp1 is NULL here if dupb() failed; nothing was freed. */
		return (mp1);
	}
	/* Eat through as many mblks as we need to get len bytes. */
	len -= mp0->b_wptr - mp0->b_rptr;
	/*
	 * mp1 trails mp2 by one mblk.  The loop stops when the chain ends or
	 * when len reaches zero.
	 */
	for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
		if (mp2->b_wptr - mp2->b_rptr > len) {
			/*
			 * We won't consume the entire last mblk.  Like
			 * above, dup and partition it.
			 */
			mp1->b_cont = dupb(mp2);
			mp1 = mp1->b_cont;
			if (!mp1) {
				/*
				 * Trouble.  Rather than go to a lot of
				 * trouble to clean up, we free the messages.
				 * This won't be any worse than losing it on
				 * the wire.
				 */
				freemsg(mp0);
				freemsg(mp2);
				*mpp = NULL;
				return (NULL);
			}
			mp1->b_wptr = mp1->b_rptr + len;
			mp2->b_rptr = mp1->b_wptr;
			/*
			 * after adjustments if mblk not consumed is now
			 * unaligned, try to align it. If this fails free
			 * all messages and let upper layer recover.
			 */
			if (!OK_32PTR(mp2->b_rptr)) {
				if (!pullupmsg(mp2, -1)) {
					freemsg(mp0);
					freemsg(mp2);
					*mpp = NULL;
					return (NULL);
				}
			}
			*mpp = mp2;
			return (mp0);
		}
		/* Decrement len by the amount we just got. */
		len -= mp2->b_wptr - mp2->b_rptr;
	}
	/*
	 * len should be reduced to zero now.  If not our caller has
	 * screwed up.
	 */
	if (len) {
		/* Shouldn't happen! */
		freemsg(mp0);
		*mpp = NULL;
		return (NULL);
	}
	/*
	 * We consumed up to exactly the end of an mblk.  Detach the part
	 * we are returning from the rest of the chain.
	 */
	mp1->b_cont = NULL;
	*mpp = mp2;
	return (mp0);
}

/*
 * The ill stream is being unplumbed.  Called from ip_close.
 *
 * Waits (bounded) for pending DLPI acks, enters the ipsq, marks the ill and
 * its ipifs condemned, tears the ill down with ill_delete() and
 * ill_delete_tail(), then releases the remaining per-ill resources.
 * Always returns 0.
 */
int
ip_modclose(ill_t *ill)
{
	boolean_t success;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	queue_t	*q = ill->ill_rq;
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t timeout;

	/*
	 * Wait for the ACKs of all deferred control messages to be processed.
	 * In particular, we wait for a potential capability reset initiated
	 * in ip_sioctl_plink() to complete before proceeding.
	 *
	 * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms)
	 * in case the driver never replies.
	 */
	timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms);
	mutex_enter(&ill->ill_lock);
	while (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) {
			/* Timeout */
			break;
		}
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Forcibly enter the ipsq after some delay. This is to take
	 * care of the case when some ioctl does not complete because
	 * we sent a control message to the driver and it did not
	 * send us a reply. We want to be able to at least unplumb
	 * and replumb rather than force the user to reboot the system.
	 */
	success = ipsq_enter(ill, B_FALSE);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill.
	 * and cause ipsq_enter to return failure.
	 */
	ASSERT(success);
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns error if ILL_CONDEMNED is set
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Send all the deferred DLPI messages downstream which came in
	 * during the small window right before ipsq_enter(). We do this
	 * without waiting for the ACKs because all the ACKs for M_PROTO
	 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
	 */
	ill_dlpi_send_deferred(ill);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * If MOVE was in progress, clear the
	 * move_in_progress fields also.
	 */
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	}

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_freeable checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away. Then we call ill_delete_tail which does the
	 * DL_UNBIND_REQ with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	while (!ill_is_freeable(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/*
	 * ill_delete_tail drops reference on ill_ipst, but we need to keep
	 * it held until the end of the function since the cleanup
	 * below needs to be able to use the ip_stack_t.
	 */
	netstack_hold(ipst->ips_netstack);

	/* qprocsoff is called in ill_delete_tail */
	ill_delete_tail(ill);
	ASSERT(ill->ill_ipst == NULL);

	/*
	 * Walk through all upper (conn) streams and qenable
	 * those that have queued data.
	 * close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	conn_walk_drain(ipst);

	mutex_enter(&ipst->ips_ip_mi_lock);
	mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
	mutex_exit(&ipst->ips_ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mutex_enter(&ill->ill_lock);
	ill_nic_info_dispatch(ill);
	mutex_exit(&ill->ill_lock);

	/*
	 * Now we are done with the module close pieces that
	 * need the netstack_t.
	 */
	netstack_rele(ipst->ips_netstack);

	/* Free the ill, clear both queue pointers, then leave the ipsq. */
	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	ipsq_exit(ipsq);

	return (0);
}

/*
 * This is called as part of close() for IP, UDP, ICMP, and RTS
 * in order to quiesce the conn.
 */
/*
 * ip_quiesce_conn() must not be used for TCP conns (ASSERTed).  It marks the
 * conn CONN_CLOSING, runs whichever cleanups the conn's state requires,
 * then marks it CONN_CONDEMNED and blocks until conn_ref is exactly 1 before
 * setting CONN_QUIESCED.
 */
void
ip_quiesce_conn(conn_t *connp)
{
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;
	ip_stack_t	*ipst;

	ASSERT(!IPCL_IS_TCP(connp));
	ipst = connp->conn_netstack->netstack_ip;

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 * Similarly ill_pending_mp_add() will not add any mp to
	 * the pending mp list, after this conn has started closing.
	 *
	 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
	 * cannot get set henceforth.
	 *
	 * The conn's state is sampled under conn_lock; the cleanup calls
	 * themselves are made after the lock is dropped.
	 */
	mutex_enter(&connp->conn_lock);
	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
	connp->conn_state_flags |= CONN_CLOSING;
	if (connp->conn_idl != NULL)
		drain_cleanup_reqd = B_TRUE;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	if (connp->conn_dhcpinit_ill != NULL) {
		/* Drop this conn's count on the ill's ill_dhcpinit. */
		ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
		atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
		connp->conn_dhcpinit_ill = NULL;
	}
	if (connp->conn_ilg_inuse != 0)
		ilg_cleanup_reqd = B_TRUE;
	mutex_exit(&connp->conn_lock);

	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_ulp,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/*
	 * Remove this conn from any fanout list it is on.
	 * and then wait for any threads currently operating
	 * on this endpoint to finish
	 */
	ipcl_hash_remove(connp);

	/*
	 * Remove this conn from the drain list, and do
	 * any other cleanup that may be required.
	 * (Only non-tcp streams may have a non-null conn_idl.
	 * TCP streams are never flow controlled, and
	 * conn_idl will be null)
	 */
	if (drain_cleanup_reqd)
		conn_drain_tail(connp, B_TRUE);

	if (connp == ipst->ips_ip_g_mrouter)
		(void) ip_mrouter_done(NULL, ipst);

	if (ilg_cleanup_reqd)
		ilg_delete_all(connp);

	conn_delete_ire(connp, NULL);

	/*
	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
	 * callers from write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk
	 * which checks for the condemned flag.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	/* Wait until only one reference remains. */
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}

/*
 * STREAMS close routine for IP.  A module close (something is linked below
 * the write queue) is handed to ip_modclose(); otherwise the conn attached
 * to the queue is quiesced and destroyed.  The conn path always returns 0.
 */
/* ARGSUSED */
int
ip_close(queue_t *q, int flags)
{
	conn_t		*connp;

	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);

	/*
	 * Call the appropriate delete routine depending on whether this is
	 * a module or device.
	 */
	if (WR(q)->q_next != NULL) {
		/* This is a module close */
		return (ip_modclose((ill_t *)q->q_ptr));
	}

	connp = q->q_ptr;
	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and ip_quiesce_conn() already waited
	 * for conn_ref to drop to 1. We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);

	/* Drop the last reference before destroying the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Wrapper around putnext() so that ip_rts_request can merely use
 * conn_recv.  arg1 is the conn_t; mp is passed up its read queue
 * (conn_rq).  arg2 is unused.
 */
/*ARGSUSED2*/
static void
ip_conn_input(void *arg1, mblk_t *mp, void *arg2)
{
	conn_t	*connp = (conn_t *)arg1;

	putnext(connp->conn_rq, mp);
}

/*
 * Return the IP checksum for the IP header at "ipha".
 *
 * The header is summed as 16-bit words: the first ten words
 * unconditionally, then two more words for every unit by which the low
 * nibble of ipha_version_and_hdr_length exceeds
 * IP_SIMPLE_HDR_LENGTH_IN_WORDS.  The sum is folded to 16 bits and
 * complemented; a result of 0xffff is returned as 0.
 */
uint16_t
ip_csum_hdr(ipha_t *ipha)
{
	uint16_t	*uph;
	uint32_t	sum;
	int		opt_len;

	/* opt_len is the number of 32-bit words of options. */
	opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
	uph = (uint16_t *)ipha;
	sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
	    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
	if (opt_len > 0) {
		/* Each pass adds one 32-bit option word as two halves. */
		do {
			sum += uph[10];
			sum += uph[11];
			uph += 2;
		} while (--opt_len);
	}
	/* Fold the carries back in, then take the one's complement. */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = ~(sum + (sum >> 16)) & 0xFFFF;
	if (sum == 0xffff)
		sum = 0;
	return ((uint16_t)sum);
}

/*
 * Called when the module is about to be unloaded.  Calls the destroy
 * routines of the sub-modules, destroys the minor-number arena(s) and,
 * on DEBUG builds, the thread-tracking state; netstack_unregister(NS_IP)
 * is called last.
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();

	icmp_ddi_destroy();
	rts_ddi_destroy();
	udp_ddi_destroy();
	sctp_ddi_g_destroy();
	tcp_ddi_g_destroy();
	ipsec_policy_g_destroy();
	ipcl_g_destroy();
	ip_net_g_destroy();
	ip_ire_g_fini();
	inet_minor_destroy(ip_minor_arena_sa);
#if defined(_LP64)
	inet_minor_destroy(ip_minor_arena_la);
#endif

#ifdef DEBUG
	list_destroy(&ip_thread_list);
	rw_destroy(&ip_thread_rwlock);
	tsd_destroy(&ip_thread_data);
#endif

	netstack_unregister(NS_IP);
}

/*
 * First step in cleanup.
 * arg is the ip_stack_t of the stack being shut down; stackid is only
 * used by the NS_DEBUG printf.
 */
/* ARGSUSED */
static void
ip_stack_shutdown(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;

#ifdef NS_DEBUG
	printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
#endif

	/* Get rid of loopback interfaces and their IREs */
	ip_loopback_cleanup(ipst);

	/*
	 * The destroy functions here will end up causing notify callbacks
	 * in the hook framework and these need to be run before the shutdown
	 * of the hook framework is begun - that happens from netstack after
	 * IP shutdown has completed.  If we leave doing these actions until
	 * ip_stack_fini then the notify callbacks for the net_*_unregister
	 * are happening against a backdrop of shattered terrain.
	 */
	ipv4_hook_destroy(ipst);
	ipv6_hook_destroy(ipst);
	ip_net_destroy(ipst);
}

/*
 * Free the IP stack instance.
5753 */ 5754 static void 5755 ip_stack_fini(netstackid_t stackid, void *arg) 5756 { 5757 ip_stack_t *ipst = (ip_stack_t *)arg; 5758 int ret; 5759 5760 #ifdef NS_DEBUG 5761 printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); 5762 #endif 5763 rw_destroy(&ipst->ips_srcid_lock); 5764 5765 ip_kstat_fini(stackid, ipst->ips_ip_mibkp); 5766 ipst->ips_ip_mibkp = NULL; 5767 icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp); 5768 ipst->ips_icmp_mibkp = NULL; 5769 ip_kstat2_fini(stackid, ipst->ips_ip_kstat); 5770 ipst->ips_ip_kstat = NULL; 5771 bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics)); 5772 ip6_kstat_fini(stackid, ipst->ips_ip6_kstat); 5773 ipst->ips_ip6_kstat = NULL; 5774 bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics)); 5775 5776 nd_free(&ipst->ips_ip_g_nd); 5777 kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr)); 5778 ipst->ips_param_arr = NULL; 5779 kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 5780 ipst->ips_ndp_arr = NULL; 5781 5782 ip_mrouter_stack_destroy(ipst); 5783 5784 mutex_destroy(&ipst->ips_ip_mi_lock); 5785 rw_destroy(&ipst->ips_ipsec_capab_ills_lock); 5786 rw_destroy(&ipst->ips_ill_g_usesrc_lock); 5787 rw_destroy(&ipst->ips_ip_g_nd_lock); 5788 5789 ret = untimeout(ipst->ips_igmp_timeout_id); 5790 if (ret == -1) { 5791 ASSERT(ipst->ips_igmp_timeout_id == 0); 5792 } else { 5793 ASSERT(ipst->ips_igmp_timeout_id != 0); 5794 ipst->ips_igmp_timeout_id = 0; 5795 } 5796 ret = untimeout(ipst->ips_igmp_slowtimeout_id); 5797 if (ret == -1) { 5798 ASSERT(ipst->ips_igmp_slowtimeout_id == 0); 5799 } else { 5800 ASSERT(ipst->ips_igmp_slowtimeout_id != 0); 5801 ipst->ips_igmp_slowtimeout_id = 0; 5802 } 5803 ret = untimeout(ipst->ips_mld_timeout_id); 5804 if (ret == -1) { 5805 ASSERT(ipst->ips_mld_timeout_id == 0); 5806 } else { 5807 ASSERT(ipst->ips_mld_timeout_id != 0); 5808 ipst->ips_mld_timeout_id = 0; 5809 } 5810 ret = untimeout(ipst->ips_mld_slowtimeout_id); 5811 if (ret == -1) { 5812 
ASSERT(ipst->ips_mld_slowtimeout_id == 0); 5813 } else { 5814 ASSERT(ipst->ips_mld_slowtimeout_id != 0); 5815 ipst->ips_mld_slowtimeout_id = 0; 5816 } 5817 ret = untimeout(ipst->ips_ip_ire_expire_id); 5818 if (ret == -1) { 5819 ASSERT(ipst->ips_ip_ire_expire_id == 0); 5820 } else { 5821 ASSERT(ipst->ips_ip_ire_expire_id != 0); 5822 ipst->ips_ip_ire_expire_id = 0; 5823 } 5824 5825 mutex_destroy(&ipst->ips_igmp_timer_lock); 5826 mutex_destroy(&ipst->ips_mld_timer_lock); 5827 mutex_destroy(&ipst->ips_igmp_slowtimeout_lock); 5828 mutex_destroy(&ipst->ips_mld_slowtimeout_lock); 5829 mutex_destroy(&ipst->ips_ip_addr_avail_lock); 5830 rw_destroy(&ipst->ips_ill_g_lock); 5831 5832 ip_ire_fini(ipst); 5833 ip6_asp_free(ipst); 5834 conn_drain_fini(ipst); 5835 ipcl_destroy(ipst); 5836 5837 mutex_destroy(&ipst->ips_ndp4->ndp_g_lock); 5838 mutex_destroy(&ipst->ips_ndp6->ndp_g_lock); 5839 kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t)); 5840 ipst->ips_ndp4 = NULL; 5841 kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t)); 5842 ipst->ips_ndp6 = NULL; 5843 5844 if (ipst->ips_loopback_ksp != NULL) { 5845 kstat_delete_netstack(ipst->ips_loopback_ksp, stackid); 5846 ipst->ips_loopback_ksp = NULL; 5847 } 5848 5849 kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t)); 5850 ipst->ips_phyint_g_list = NULL; 5851 kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS); 5852 ipst->ips_ill_g_heads = NULL; 5853 5854 kmem_free(ipst, sizeof (*ipst)); 5855 } 5856 5857 /* 5858 * This function is called from the TSD destructor, and is used to debug 5859 * reference count issues in IP. See block comment in <inet/ip_if.h> for 5860 * details. 
5861 */ 5862 static void 5863 ip_thread_exit(void *phash) 5864 { 5865 th_hash_t *thh = phash; 5866 5867 rw_enter(&ip_thread_rwlock, RW_WRITER); 5868 list_remove(&ip_thread_list, thh); 5869 rw_exit(&ip_thread_rwlock); 5870 mod_hash_destroy_hash(thh->thh_hash); 5871 kmem_free(thh, sizeof (*thh)); 5872 } 5873 5874 /* 5875 * Called when the IP kernel module is loaded into the kernel 5876 */ 5877 void 5878 ip_ddi_init(void) 5879 { 5880 ip_input_proc = ip_squeue_switch(ip_squeue_enter); 5881 5882 /* 5883 * For IP and TCP the minor numbers should start from 2 since we have 4 5884 * initial devices: ip, ip6, tcp, tcp6. 5885 */ 5886 /* 5887 * If this is a 64-bit kernel, then create two separate arenas - 5888 * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the 5889 * other for socket apps in the range 2^^18 through 2^^32-1. 5890 */ 5891 ip_minor_arena_la = NULL; 5892 ip_minor_arena_sa = NULL; 5893 #if defined(_LP64) 5894 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa", 5895 INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) { 5896 cmn_err(CE_PANIC, 5897 "ip_ddi_init: ip_minor_arena_sa creation failed\n"); 5898 } 5899 if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la", 5900 MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) { 5901 cmn_err(CE_PANIC, 5902 "ip_ddi_init: ip_minor_arena_la creation failed\n"); 5903 } 5904 #else 5905 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa", 5906 INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) { 5907 cmn_err(CE_PANIC, 5908 "ip_ddi_init: ip_minor_arena_sa creation failed\n"); 5909 } 5910 #endif 5911 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms); 5912 5913 ipcl_g_init(); 5914 ip_ire_g_init(); 5915 ip_net_g_init(); 5916 5917 #ifdef DEBUG 5918 tsd_create(&ip_thread_data, ip_thread_exit); 5919 rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL); 5920 list_create(&ip_thread_list, sizeof (th_hash_t), 5921 offsetof(th_hash_t, thh_link)); 5922 #endif 5923 5924 /* 5925 * We want to be 
informed each time a stack is created or 5926 * destroyed in the kernel, so we can maintain the 5927 * set of udp_stack_t's. 5928 */ 5929 netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown, 5930 ip_stack_fini); 5931 5932 ipsec_policy_g_init(); 5933 tcp_ddi_g_init(); 5934 sctp_ddi_g_init(); 5935 5936 tnet_init(); 5937 5938 udp_ddi_init(); 5939 rts_ddi_init(); 5940 icmp_ddi_init(); 5941 } 5942 5943 /* 5944 * Initialize the IP stack instance. 5945 */ 5946 static void * 5947 ip_stack_init(netstackid_t stackid, netstack_t *ns) 5948 { 5949 ip_stack_t *ipst; 5950 ipparam_t *pa; 5951 ipndp_t *na; 5952 5953 #ifdef NS_DEBUG 5954 printf("ip_stack_init(stack %d)\n", stackid); 5955 #endif 5956 5957 ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP); 5958 ipst->ips_netstack = ns; 5959 5960 ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS, 5961 KM_SLEEP); 5962 ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t), 5963 KM_SLEEP); 5964 ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5965 ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP); 5966 mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5967 mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL); 5968 5969 rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL); 5970 mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5971 ipst->ips_igmp_deferred_next = INFINITY; 5972 mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5973 ipst->ips_mld_deferred_next = INFINITY; 5974 mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5975 mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5976 mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); 5977 mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); 5978 rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL); 5979 rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); 5980 
rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); 5981 5982 ipcl_init(ipst); 5983 ip_ire_init(ipst); 5984 ip6_asp_init(ipst); 5985 ipif_init(ipst); 5986 conn_drain_init(ipst); 5987 ip_mrouter_stack_init(ipst); 5988 5989 ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT; 5990 ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; 5991 5992 ipst->ips_ip_multirt_log_interval = 1000; 5993 5994 ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT; 5995 ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT; 5996 ipst->ips_ill_index = 1; 5997 5998 ipst->ips_saved_ip_g_forward = -1; 5999 ipst->ips_reg_vif_num = ALL_VIFS; /* Index to Register vif */ 6000 6001 pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP); 6002 ipst->ips_param_arr = pa; 6003 bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr)); 6004 6005 na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP); 6006 ipst->ips_ndp_arr = na; 6007 bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); 6008 ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data = 6009 (caddr_t)&ipst->ips_ip_g_forward; 6010 ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data = 6011 (caddr_t)&ipst->ips_ipv6_forward; 6012 ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name, 6013 "ip_cgtp_filter") == 0); 6014 ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data = 6015 (caddr_t)&ipst->ips_ip_cgtp_filter; 6016 ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name, 6017 "ipmp_hook_emulation") == 0); 6018 ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data = 6019 (caddr_t)&ipst->ips_ipmp_hook_emulation; 6020 6021 (void) ip_param_register(&ipst->ips_ip_g_nd, 6022 ipst->ips_param_arr, A_CNT(lcl_param_arr), 6023 ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr)); 6024 6025 ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst); 6026 ipst->ips_icmp_mibkp = icmp_kstat_init(stackid); 6027 ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics); 6028 ipst->ips_ip6_kstat = 6029 
ip6_kstat_init(stackid, &ipst->ips_ip6_statistics); 6030 6031 ipst->ips_ipmp_enable_failback = B_TRUE; 6032 6033 ipst->ips_ip_src_id = 1; 6034 rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); 6035 6036 ip_net_init(ipst, ns); 6037 ipv4_hook_init(ipst); 6038 ipv6_hook_init(ipst); 6039 6040 return (ipst); 6041 } 6042 6043 /* 6044 * Allocate and initialize a DLPI template of the specified length. (May be 6045 * called as writer.) 6046 */ 6047 mblk_t * 6048 ip_dlpi_alloc(size_t len, t_uscalar_t prim) 6049 { 6050 mblk_t *mp; 6051 6052 mp = allocb(len, BPRI_MED); 6053 if (!mp) 6054 return (NULL); 6055 6056 /* 6057 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter 6058 * of which we don't seem to use) are sent with M_PCPROTO, and 6059 * that other DLPI are M_PROTO. 6060 */ 6061 if (prim == DL_INFO_REQ) { 6062 mp->b_datap->db_type = M_PCPROTO; 6063 } else { 6064 mp->b_datap->db_type = M_PROTO; 6065 } 6066 6067 mp->b_wptr = mp->b_rptr + len; 6068 bzero(mp->b_rptr, len); 6069 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; 6070 return (mp); 6071 } 6072 6073 /* 6074 * Debug formatting routine. Returns a character string representation of the 6075 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 6076 * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 6077 * 6078 * Once the ndd table-printing interfaces are removed, this can be changed to 6079 * standard dotted-decimal form. 6080 */ 6081 char * 6082 ip_dot_addr(ipaddr_t addr, char *buf) 6083 { 6084 uint8_t *ap = (uint8_t *)&addr; 6085 6086 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", 6087 ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF); 6088 return (buf); 6089 } 6090 6091 /* 6092 * Write the given MAC address as a printable string in the usual colon- 6093 * separated format. 
/*
 * Write the given MAC address as a printable string in the usual colon-
 * separated format.
 *
 * addr/alen describe the address bytes; buf/buflen describe the output
 * buffer.  Returns buf on success.  Returns the constant string "?" (buf is
 * left untouched) when there is nothing to print or nowhere to print it:
 * addr or buf is NULL, alen is 0, or buflen is less than 4.  If the buffer
 * is too small for the whole address, as many bytes as fit are printed and
 * the string ends in "...".
 */
const char *
mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
{
	char *bp;

	if (addr == NULL || buf == NULL || alen == 0 || buflen < 4)
		return ("?");
	bp = buf;
	for (;;) {
		/*
		 * If there are more MAC address bytes available, but we won't
		 * have any room to print them, then add "..." to the string
		 * instead.  See below for the 'magic number' explanation.
		 */
		if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) {
			(void) snprintf(bp, buflen, "...");
			break;
		}
		/* buflen is the space left at bp, so the write is bounded. */
		(void) snprintf(bp, buflen, "%02x", *addr++);
		bp += 2;
		if (--alen == 0)
			break;
		*bp++ = ':';
		buflen -= 3;
		/*
		 * At this point, based on the first 'if' statement above,
		 * either alen == 1 and buflen >= 3, or alen > 1 and
		 * buflen >= 4.  The first case leaves room for the final "xx"
		 * number and trailing NUL byte.  The second leaves room for at
		 * least "...".  Thus the apparently 'magic' numbers chosen for
		 * that statement.
		 */
	}
	return (buf);
}
6134 */ 6135 static boolean_t 6136 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, 6137 uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, 6138 zoneid_t zoneid, ip_stack_t *ipst) 6139 { 6140 ipha_t *ipha; 6141 mblk_t *first_mp; 6142 boolean_t secure; 6143 unsigned char db_type; 6144 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6145 6146 first_mp = mp; 6147 if (mctl_present) { 6148 mp = mp->b_cont; 6149 secure = ipsec_in_is_secure(first_mp); 6150 ASSERT(mp != NULL); 6151 } else { 6152 /* 6153 * If this is an ICMP error being reported - which goes 6154 * up as M_CTLs, we need to convert them to M_DATA till 6155 * we finish checking with global policy because 6156 * ipsec_check_global_policy() assumes M_DATA as clear 6157 * and M_CTL as secure. 6158 */ 6159 db_type = DB_TYPE(mp); 6160 DB_TYPE(mp) = M_DATA; 6161 secure = B_FALSE; 6162 } 6163 /* 6164 * We are generating an icmp error for some inbound packet. 6165 * Called from all ip_fanout_(udp, tcp, proto) functions. 6166 * Before we generate an error, check with global policy 6167 * to see whether this is allowed to enter the system. As 6168 * there is no "conn", we are checking with global policy. 6169 */ 6170 ipha = (ipha_t *)mp->b_rptr; 6171 if (secure || ipss->ipsec_inbound_v4_policy_present) { 6172 first_mp = ipsec_check_global_policy(first_mp, NULL, 6173 ipha, NULL, mctl_present, ipst->ips_netstack); 6174 if (first_mp == NULL) 6175 return (B_FALSE); 6176 } 6177 6178 if (!mctl_present) 6179 DB_TYPE(mp) = db_type; 6180 6181 if (flags & IP_FF_SEND_ICMP) { 6182 if (flags & IP_FF_HDR_COMPLETE) { 6183 if (ip_hdr_complete(ipha, zoneid, ipst)) { 6184 freemsg(first_mp); 6185 return (B_TRUE); 6186 } 6187 } 6188 if (flags & IP_FF_CKSUM) { 6189 /* 6190 * Have to correct checksum since 6191 * the packet might have been 6192 * fragmented and the reassembly code in ip_rput 6193 * does not restore the IP checksum. 
6194 */ 6195 ipha->ipha_hdr_checksum = 0; 6196 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 6197 } 6198 switch (icmp_type) { 6199 case ICMP_DEST_UNREACHABLE: 6200 icmp_unreachable(WR(q), first_mp, icmp_code, zoneid, 6201 ipst); 6202 break; 6203 default: 6204 freemsg(first_mp); 6205 break; 6206 } 6207 } else { 6208 freemsg(first_mp); 6209 return (B_FALSE); 6210 } 6211 6212 return (B_TRUE); 6213 } 6214 6215 /* 6216 * Used to send an ICMP error message when a packet is received for 6217 * a protocol that is not supported. The mblk passed as argument 6218 * is consumed by this function. 6219 */ 6220 void 6221 ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid, 6222 ip_stack_t *ipst) 6223 { 6224 mblk_t *mp; 6225 ipha_t *ipha; 6226 ill_t *ill; 6227 ipsec_in_t *ii; 6228 6229 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 6230 ASSERT(ii->ipsec_in_type == IPSEC_IN); 6231 6232 mp = ipsec_mp->b_cont; 6233 ipsec_mp->b_cont = NULL; 6234 ipha = (ipha_t *)mp->b_rptr; 6235 /* Get ill from index in ipsec_in_t. */ 6236 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 6237 (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL, 6238 ipst); 6239 if (ill != NULL) { 6240 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 6241 if (ip_fanout_send_icmp(q, mp, flags, 6242 ICMP_DEST_UNREACHABLE, 6243 ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) { 6244 BUMP_MIB(ill->ill_ip_mib, 6245 ipIfStatsInUnknownProtos); 6246 } 6247 } else { 6248 if (ip_fanout_send_icmp_v6(q, mp, flags, 6249 ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, 6250 0, B_FALSE, zoneid, ipst)) { 6251 BUMP_MIB(ill->ill_ip_mib, 6252 ipIfStatsInUnknownProtos); 6253 } 6254 } 6255 ill_refrele(ill); 6256 } else { /* re-link for the freemsg() below. */ 6257 ipsec_mp->b_cont = mp; 6258 } 6259 6260 /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ 6261 freemsg(ipsec_mp); 6262 } 6263 6264 /* 6265 * See if the inbound datagram has had IPsec processing applied to it. 
/*
 * See if the inbound datagram has had IPsec processing applied to it.
 *
 * ipsec_mp must be an IPSEC_IN message.  For loopback the recorded
 * ipsec_in_secure value is returned; otherwise the datagram counts as
 * secure if an AH SA or an ESP SA is attached or ipsec_in_decaps is set.
 */
boolean_t
ipsec_in_is_secure(mblk_t *ipsec_mp)
{
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	if (ii->ipsec_in_loopback) {
		return (ii->ipsec_in_secure);
	} else {
		return (ii->ipsec_in_ah_sa != NULL ||
		    ii->ipsec_in_esp_sa != NULL ||
		    ii->ipsec_in_decaps);
	}
}

/*
 * Handle protocols with which IP is less intimate.  There
 * can be more than one stream bound to a particular
 * protocol.  When this is the case, normally each one gets a copy
 * of any incoming packets.
 *
 * IPsec NOTE :
 *
 * Don't allow a secure packet going up a non-secure connection.
 * We don't allow this because
 *
 * 1) Reply might go out in clear which will be dropped at
 *    the sending side.
 * 2) If the reply goes out in clear it will give the
 *    adversary enough information for getting the key in
 *    most of the cases.
 *
 * Moreover getting a secure packet when we expect clear
 * implies that SA's were added without checking for
 * policy on both ends.  This should not happen once ISAKMP
 * is used to negotiate SAs as SAs will be added only after
 * verifying the policy.
 *
 * NOTE : If the packet was tunneled and not multicast we only send
 * it to the first match.  Unlike TCP and UDP fanouts this doesn't fall
 * back to delivering packets to AF_INET6 raw sockets.
 *
 * IPQoS Notes:
 * Once we have determined the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled.  If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy will be false.
 *
 * Zones notes:
 * Currently only applications in the global zone can create raw sockets for
 * protocols other than ICMP.  So unlike the broadcast / multicast case of
 * ip_fanout_udp(), we only send a copy of the packet to streams in the
 * specified zone.  For ICMP, this is handled by the callers of icmp_inbound().
 *
 * Delivery outline: the first matching conn is held and remembered in
 * first_connp.  Every further match receives a duplicate of the message
 * inside the loop; the original message is delivered to first_connp after
 * the loop.  The message is consumed on every path.
 */
static void
ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	queue_t	*rq;
	mblk_t	*mp1, *first_mp1;
	uint_t	protocol = ipha->ipha_protocol;
	ipaddr_t dst;
	boolean_t one_only;
	mblk_t *first_mp = mp;
	boolean_t secure;
	uint32_t ill_index;
	conn_t	*connp, *first_connp, *next_connp;
	connf_t	*connfp;
	boolean_t shared_addr;
	mib2_ipIfStatsEntry_t *mibptr;
	ip_stack_t *ipst = recv_ill->ill_ipst;
	ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

	/* Without an ill, counters go to the stack-wide MIB. */
	mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib;
	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	dst = ipha->ipha_dst;
	/*
	 * If the packet was tunneled and not multicast we only send to it
	 * the first match.
	 */
	one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) &&
	    !CLASSD(dst));

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * We don't allow multilevel ports for raw IP, so no need to
		 * check for that here.
		 */
		zoneid = tsol_packet_to_zoneid(mp);
	}

	/* Find the first matching conn on this protocol's fanout list. */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags,
		    zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))) {
			break;
		}
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to these addresses.  Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);
		/*
		 * Check for IPPROTO_ENCAP...
		 */
		if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
			/*
			 * If an IPsec mblk is here on a multicast
			 * tunnel (using ip_mroute stuff), check policy here,
			 * THEN ship off to ip_mroute_decap().
			 *
			 * BTW,  If I match a configured IP-in-IP
			 * tunnel, this path will not be reached, and
			 * ip_mroute_decap will never be called.
			 */
			first_mp = ipsec_check_global_policy(first_mp, connp,
			    ipha, NULL, mctl_present, ipst->ips_netstack);
			if (first_mp != NULL) {
				if (mctl_present)
					freeb(first_mp);
				ip_mroute_decap(q, mp, ill);
			} /* Else we already freed everything! */
		} else {
			/*
			 * Otherwise send an ICMP protocol unreachable.
			 */
			if (ip_fanout_send_icmp(q, first_mp, flags,
			    ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE,
			    mctl_present, zoneid, ipst)) {
				BUMP_MIB(mibptr, ipIfStatsInUnknownProtos);
			}
		}
		return;
	}
	/* Hold the first match; it receives the original message last. */
	CONN_INC_REF(connp);
	first_connp = connp;

	/*
	 * Only send message to one tunnel driver by immediately
	 * terminating the loop.
	 */
	connp = one_only ? NULL : connp->conn_next;

	for (;;) {
		while (connp != NULL) {
			if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill,
			    flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}

		/*
		 * Copy the packet.
		 */
		if (connp == NULL || connp->conn_upq == NULL ||
		    (((first_mp1 = dupmsg(first_mp)) == NULL) &&
		    ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
		/* Hold the conn before the fanout lock is dropped. */
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		rq = connp->conn_rq;
		if (!canputnext(rq)) {
			if (flags & IP_FF_RAWIP) {
				BUMP_MIB(mibptr, rawipIfStatsInOverflows);
			} else {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
			}

			freemsg(first_mp1);
		} else {
			/*
			 * Don't enforce here if we're an actual tunnel -
			 * let "tun" do it instead.
			 */
			if (!IPCL_IS_IPTUN(connp) &&
			    (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
			    secure)) {
				first_mp1 = ipsec_check_inbound_policy
				    (first_mp1, connp, ipha, NULL,
				    mctl_present);
			}
			if (first_mp1 != NULL) {
				int in_flags = 0;
				/*
				 * ip_fanout_proto also gets called from
				 * icmp_inbound_error_fanout, in which case
				 * the msg type is M_CTL.  Don't add info
				 * in this case for the time being. In future
				 * when there is a need for knowing the
				 * inbound iface index for ICMP error msgs,
				 * then this can be changed.
				 */
				if (connp->conn_recvif)
					in_flags = IPF_RECVIF;
				/*
				 * The ULP may support IP_RECVPKTINFO for both
				 * IP v4 and v6 so pass the appropriate argument
				 * based on conn IP version.
				 */
				if (connp->conn_ip_recvpktinfo) {
					if (connp->conn_af_isv6) {
						/*
						 * V6 only needs index
						 */
						in_flags |= IPF_RECVIF;
					} else {
						/*
						 * V4 needs index +
						 * matching address.
						 */
						in_flags |= IPF_RECVADDR;
					}
				}
				if ((in_flags != 0) &&
				    (mp->b_datap->db_type != M_CTL)) {
					/*
					 * the actual data will be
					 * contained in b_cont upon
					 * successful return of the
					 * following call else
					 * original mblk is returned
					 */
					ASSERT(recv_ill != NULL);
					mp1 = ip_add_info(mp1, recv_ill,
					    in_flags, IPCL_ZONEID(connp), ipst);
				}
				BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
				/* Only the data goes up, not the M_CTL. */
				if (mctl_present)
					freeb(first_mp1);
				(connp->conn_recv)(connp, mp1, NULL);
			}
		}
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);

	/*
	 * If this packet is coming from icmp_inbound_error_fanout ip_policy
	 * will be set to false.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
		ill_index = ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			/* mp is gone; drop our hold and any leading M_CTL. */
			CONN_DEC_REF(connp);
			if (mctl_present) {
				freeb(first_mp);
			}
			return;
		}
	}

	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		if (flags & IP_FF_RAWIP) {
			BUMP_MIB(mibptr, rawipIfStatsInOverflows);
		} else {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
		}

		freemsg(first_mp);
	} else {
		if (IPCL_IS_IPTUN(connp)) {
			/*
			 * Tunneled packet.  We enforce policy in the tunnel
			 * module itself.
			 *
			 * Send the WHOLE packet up (incl. IPSEC_IN) without
			 * a policy check.
			 * FIXME to use conn_recv for tun later.
			 */
			putnext(rq, first_mp);
			CONN_DEC_REF(connp);
			return;
		}

		if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) {
			first_mp = ipsec_check_inbound_policy(first_mp, connp,
			    ipha, NULL, mctl_present);
		}

		if (first_mp != NULL) {
			int in_flags = 0;

			/*
			 * ip_fanout_proto also gets called
			 * from icmp_inbound_error_fanout, in
			 * which case the msg type is M_CTL.
			 * Don't add info in this case for time
			 * being. In future when there is a
			 * need for knowing the inbound iface
			 * index for ICMP error msgs, then this
			 * can be changed
			 */
			if (connp->conn_recvif)
				in_flags = IPF_RECVIF;
			if (connp->conn_ip_recvpktinfo) {
				if (connp->conn_af_isv6) {
					/*
					 * V6 only needs index
					 */
					in_flags |= IPF_RECVIF;
				} else {
					/*
					 * V4 needs index +
					 * matching address.
					 */
					in_flags |= IPF_RECVADDR;
				}
			}
			if ((in_flags != 0) &&
			    (mp->b_datap->db_type != M_CTL)) {

				/*
				 * the actual data will be contained in
				 * b_cont upon successful return
				 * of the following call else original
				 * mblk is returned
				 */
				ASSERT(recv_ill != NULL);
				mp = ip_add_info(mp, recv_ill,
				    in_flags, IPCL_ZONEID(connp), ipst);
			}
			BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
			(connp->conn_recv)(connp, mp, NULL);
			if (mctl_present)
				freeb(first_mp);
		}
	}
	CONN_DEC_REF(connp);
}

/*
 * Fanout for TCP packets
 * The caller puts <fport, lport> in the ports parameter.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy is false.
6644 */ 6645 static void 6646 ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, 6647 uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) 6648 { 6649 mblk_t *first_mp; 6650 boolean_t secure; 6651 uint32_t ill_index; 6652 int ip_hdr_len; 6653 tcph_t *tcph; 6654 boolean_t syn_present = B_FALSE; 6655 conn_t *connp; 6656 ip_stack_t *ipst = recv_ill->ill_ipst; 6657 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6658 6659 ASSERT(recv_ill != NULL); 6660 6661 first_mp = mp; 6662 if (mctl_present) { 6663 ASSERT(first_mp->b_datap->db_type == M_CTL); 6664 mp = first_mp->b_cont; 6665 secure = ipsec_in_is_secure(first_mp); 6666 ASSERT(mp != NULL); 6667 } else { 6668 secure = B_FALSE; 6669 } 6670 6671 ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); 6672 6673 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 6674 zoneid, ipst)) == NULL) { 6675 /* 6676 * No connected connection or listener. Send a 6677 * TH_RST via tcp_xmit_listeners_reset. 6678 */ 6679 6680 /* Initiate IPPf processing, if needed. */ 6681 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 6682 uint32_t ill_index; 6683 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6684 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 6685 if (first_mp == NULL) 6686 return; 6687 } 6688 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6689 ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", 6690 zoneid)); 6691 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6692 ipst->ips_netstack->netstack_tcp, NULL); 6693 return; 6694 } 6695 6696 /* 6697 * Allocate the SYN for the TCP connection here itself 6698 */ 6699 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6700 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 6701 if (IPCL_IS_TCP(connp)) { 6702 squeue_t *sqp; 6703 6704 /* 6705 * For fused tcp loopback, assign the eager's 6706 * squeue to be that of the active connect's. 
6707 * Note that we don't check for IP_FF_LOOPBACK 6708 * here since this routine gets called only 6709 * for loopback (unlike the IPv6 counterpart). 6710 */ 6711 ASSERT(Q_TO_CONN(q) != NULL); 6712 if (do_tcp_fusion && 6713 !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && 6714 !secure && 6715 !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy && 6716 IPCL_IS_TCP(Q_TO_CONN(q))) { 6717 ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); 6718 sqp = Q_TO_CONN(q)->conn_sqp; 6719 } else { 6720 sqp = IP_SQUEUE_GET(lbolt); 6721 } 6722 6723 mp->b_datap->db_struioflag |= STRUIO_EAGER; 6724 DB_CKSUMSTART(mp) = (intptr_t)sqp; 6725 syn_present = B_TRUE; 6726 } 6727 } 6728 6729 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 6730 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 6731 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6732 if ((flags & TH_RST) || (flags & TH_URG)) { 6733 CONN_DEC_REF(connp); 6734 freemsg(first_mp); 6735 return; 6736 } 6737 if (flags & TH_ACK) { 6738 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6739 ipst->ips_netstack->netstack_tcp, connp); 6740 CONN_DEC_REF(connp); 6741 return; 6742 } 6743 6744 CONN_DEC_REF(connp); 6745 freemsg(first_mp); 6746 return; 6747 } 6748 6749 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 6750 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6751 NULL, mctl_present); 6752 if (first_mp == NULL) { 6753 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6754 CONN_DEC_REF(connp); 6755 return; 6756 } 6757 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 6758 ASSERT(syn_present); 6759 if (mctl_present) { 6760 ASSERT(first_mp != mp); 6761 first_mp->b_datap->db_struioflag |= 6762 STRUIO_POLICY; 6763 } else { 6764 ASSERT(first_mp == mp); 6765 mp->b_datap->db_struioflag &= 6766 ~STRUIO_EAGER; 6767 mp->b_datap->db_struioflag |= 6768 STRUIO_POLICY; 6769 } 6770 } else { 6771 /* 6772 * Discard first_mp early since we're dealing with a 6773 * fully-connected conn_t and tcp doesn't do policy in 
6774 * this case. 6775 */ 6776 if (mctl_present) { 6777 freeb(first_mp); 6778 mctl_present = B_FALSE; 6779 } 6780 first_mp = mp; 6781 } 6782 } 6783 6784 /* 6785 * Initiate policy processing here if needed. If we get here from 6786 * icmp_inbound_error_fanout, ip_policy is false. 6787 */ 6788 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6789 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6790 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6791 if (mp == NULL) { 6792 CONN_DEC_REF(connp); 6793 if (mctl_present) 6794 freeb(first_mp); 6795 return; 6796 } else if (mctl_present) { 6797 ASSERT(first_mp != mp); 6798 first_mp->b_cont = mp; 6799 } else { 6800 first_mp = mp; 6801 } 6802 } 6803 6804 6805 6806 /* Handle socket options. */ 6807 if (!syn_present && 6808 connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 6809 /* Add header */ 6810 ASSERT(recv_ill != NULL); 6811 /* 6812 * Since tcp does not support IP_RECVPKTINFO for V4, only pass 6813 * IPF_RECVIF. 6814 */ 6815 mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), 6816 ipst); 6817 if (mp == NULL) { 6818 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6819 CONN_DEC_REF(connp); 6820 if (mctl_present) 6821 freeb(first_mp); 6822 return; 6823 } else if (mctl_present) { 6824 /* 6825 * ip_add_info might return a new mp. 6826 */ 6827 ASSERT(first_mp != mp); 6828 first_mp->b_cont = mp; 6829 } else { 6830 first_mp = mp; 6831 } 6832 } 6833 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6834 if (IPCL_IS_TCP(connp)) { 6835 /* do not drain, certain use cases can blow the stack */ 6836 squeue_enter_nodrain(connp->conn_sqp, first_mp, 6837 connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); 6838 } else { 6839 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 6840 (connp->conn_recv)(connp, first_mp, NULL); 6841 CONN_DEC_REF(connp); 6842 } 6843 } 6844 6845 /* 6846 * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or 6847 * pass it along to ESP if the SPI is non-zero. 
Returns TRUE if the mblk 6848 * is not consumed. 6849 * 6850 * One of four things can happen, all of which affect the passed-in mblk: 6851 * 6852 * 1.) ICMP messages that go through here just get returned TRUE. 6853 * 6854 * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. 6855 * 6856 * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent 6857 * ESP packet, and is passed along to ESP for consumption. Return FALSE. 6858 * 6859 * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. 6860 */ 6861 static boolean_t 6862 zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, 6863 ipsec_stack_t *ipss) 6864 { 6865 int shift, plen, iph_len; 6866 ipha_t *ipha; 6867 udpha_t *udpha; 6868 uint32_t *spi; 6869 uint32_t esp_ports; 6870 uint8_t *orptr; 6871 boolean_t free_ire; 6872 6873 if (DB_TYPE(mp) == M_CTL) { 6874 /* 6875 * ICMP message with UDP inside. Don't bother stripping, just 6876 * send it up. 6877 * 6878 * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going 6879 * to ignore errors set by ICMP anyway ('cause they might be 6880 * forged), but that's the app's decision, not ours. 6881 */ 6882 6883 /* Bunch of reality checks for DEBUG kernels... */ 6884 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 6885 ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); 6886 6887 return (B_TRUE); 6888 } 6889 6890 ipha = (ipha_t *)mp->b_rptr; 6891 iph_len = IPH_HDR_LENGTH(ipha); 6892 plen = ntohs(ipha->ipha_length); 6893 6894 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { 6895 /* 6896 * Most likely a keepalive for the benefit of an intervening 6897 * NAT. These aren't for us, per se, so drop it. 6898 * 6899 * RFC 3947/8 doesn't say for sure what to do for 2-3 6900 * byte packets (keepalives are 1-byte), but we'll drop them 6901 * also. 
6902 */ 6903 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6904 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); 6905 return (B_FALSE); 6906 } 6907 6908 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { 6909 /* might as well pull it all up - it might be ESP. */ 6910 if (!pullupmsg(mp, -1)) { 6911 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6912 DROPPER(ipss, ipds_esp_nomem), 6913 &ipss->ipsec_dropper); 6914 return (B_FALSE); 6915 } 6916 6917 ipha = (ipha_t *)mp->b_rptr; 6918 } 6919 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t)); 6920 if (*spi == 0) { 6921 /* UDP packet - remove 0-spi. */ 6922 shift = sizeof (uint32_t); 6923 } else { 6924 /* ESP-in-UDP packet - reduce to ESP. */ 6925 ipha->ipha_protocol = IPPROTO_ESP; 6926 shift = sizeof (udpha_t); 6927 } 6928 6929 /* Fix IP header */ 6930 ipha->ipha_length = htons(plen - shift); 6931 ipha->ipha_hdr_checksum = 0; 6932 6933 orptr = mp->b_rptr; 6934 mp->b_rptr += shift; 6935 6936 udpha = (udpha_t *)(orptr + iph_len); 6937 if (*spi == 0) { 6938 ASSERT((uint8_t *)ipha == orptr); 6939 udpha->uha_length = htons(plen - shift - iph_len); 6940 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */ 6941 esp_ports = 0; 6942 } else { 6943 esp_ports = *((uint32_t *)udpha); 6944 ASSERT(esp_ports != 0); 6945 } 6946 ovbcopy(orptr, orptr + shift, iph_len); 6947 if (esp_ports != 0) /* Punt up for ESP processing. */ { 6948 ipha = (ipha_t *)(orptr + shift); 6949 6950 free_ire = (ire == NULL); 6951 if (free_ire) { 6952 /* Re-acquire ire. */ 6953 ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, 6954 ipss->ipsec_netstack->netstack_ip); 6955 if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { 6956 if (ire != NULL) 6957 ire_refrele(ire); 6958 /* 6959 * Do a regular freemsg(), as this is an IP 6960 * error (no local route) not an IPsec one. 
6961 */ 6962 freemsg(mp); 6963 } 6964 } 6965 6966 ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports); 6967 if (free_ire) 6968 ire_refrele(ire); 6969 } 6970 6971 return (esp_ports == 0); 6972 } 6973 6974 /* 6975 * Deliver a udp packet to the given conn, possibly applying ipsec policy. 6976 * We are responsible for disposing of mp, such as by freemsg() or putnext() 6977 * Caller is responsible for dropping references to the conn, and freeing 6978 * first_mp. 6979 * 6980 * IPQoS Notes 6981 * Before sending it to the client, invoke IPPF processing. Policy processing 6982 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and 6983 * ip_policy is true. If we get here from icmp_inbound_error_fanout or 6984 * ip_wput_local, ip_policy is false. 6985 */ 6986 static void 6987 ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, 6988 boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, 6989 boolean_t ip_policy) 6990 { 6991 boolean_t mctl_present = (first_mp != NULL); 6992 uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ 6993 uint32_t ill_index; 6994 ip_stack_t *ipst = recv_ill->ill_ipst; 6995 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6996 6997 ASSERT(ill != NULL); 6998 6999 if (mctl_present) 7000 first_mp->b_cont = mp; 7001 else 7002 first_mp = mp; 7003 7004 if (CONN_UDP_FLOWCTLD(connp)) { 7005 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 7006 freemsg(first_mp); 7007 return; 7008 } 7009 7010 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 7011 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 7012 NULL, mctl_present); 7013 if (first_mp == NULL) { 7014 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 7015 return; /* Freed by ipsec_check_inbound_policy(). */ 7016 } 7017 } 7018 if (mctl_present) 7019 freeb(first_mp); 7020 7021 /* Let's hope the compilers utter "branch, predict-not-taken..." 
;) */ 7022 if (connp->conn_udp->udp_nat_t_endpoint) { 7023 if (mctl_present) { 7024 /* mctl_present *shouldn't* happen. */ 7025 ip_drop_packet(mp, B_TRUE, NULL, NULL, 7026 DROPPER(ipss, ipds_esp_nat_t_ipsec), 7027 &ipss->ipsec_dropper); 7028 return; 7029 } 7030 7031 if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) 7032 return; 7033 } 7034 7035 /* Handle options. */ 7036 if (connp->conn_recvif) 7037 in_flags = IPF_RECVIF; 7038 /* 7039 * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag 7040 * passed to ip_add_info is based on IP version of connp. 7041 */ 7042 if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 7043 if (connp->conn_af_isv6) { 7044 /* 7045 * V6 only needs index 7046 */ 7047 in_flags |= IPF_RECVIF; 7048 } else { 7049 /* 7050 * V4 needs index + matching address. 7051 */ 7052 in_flags |= IPF_RECVADDR; 7053 } 7054 } 7055 7056 if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) 7057 in_flags |= IPF_RECVSLLA; 7058 7059 /* 7060 * Initiate IPPF processing here, if needed. Note first_mp won't be 7061 * freed if the packet is dropped. The caller will do so. 7062 */ 7063 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 7064 ill_index = recv_ill->ill_phyint->phyint_ifindex; 7065 ip_process(IPP_LOCAL_IN, &mp, ill_index); 7066 if (mp == NULL) { 7067 return; 7068 } 7069 } 7070 if ((in_flags != 0) && 7071 (mp->b_datap->db_type != M_CTL)) { 7072 /* 7073 * The actual data will be contained in b_cont 7074 * upon successful return of the following call 7075 * else original mblk is returned 7076 */ 7077 ASSERT(recv_ill != NULL); 7078 mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), 7079 ipst); 7080 } 7081 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 7082 /* Send it upstream */ 7083 (connp->conn_recv)(connp, mp, NULL); 7084 } 7085 7086 /* 7087 * Fanout for UDP packets. 7088 * The caller puts <fport, lport> in the ports parameter. 
 *
 * If SO_REUSEADDR is set all multicast and broadcast packets
 * will be delivered to all streams bound to the same port.
 *
 * Zones notes:
 * Multicast and broadcast packets will be distributed to streams in all zones.
 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
 * packets. To maintain this behavior with multiple zones, the conns are grouped
 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in
 * each zone. If unset, all the following conns in the same zone are skipped.
 *
 * Outline of the code below:
 *  1. Search the UDP fanout bucket for dstport for an IPv4-style match
 *     (IPCL_UDP_MATCH).  Unicast goes to the first match only; broadcast
 *     and multicast go to every match that passes conn_wantpacket().
 *  2. If nothing matched ("notfound"), search the same bucket again for
 *     in6addr_any endpoints (IPCL_UDP_MATCH_V6 against ipv6_all_zeros).
 *  3. If still nothing, hand the packet to ip_fanout_proto() when a raw
 *     IPPROTO_UDP client exists; otherwise send an ICMP port unreachable
 *     (unicast) or drop the packet (multicast/broadcast).
 *
 * connf_lock is dropped around every call to ip_fanout_udp_conn(); the conn
 * being delivered to is kept alive across the call with CONN_INC_REF.
 */
static void
ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
    uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present,
    boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid)
{
	uint32_t	dstport, srcport;
	ipaddr_t	dst;
	mblk_t		*first_mp;
	boolean_t	secure;
	in6_addr_t	v6src;
	conn_t		*connp;
	connf_t		*connfp;
	conn_t		*first_connp;
	conn_t		*next_connp;
	mblk_t		*mp1, *first_mp1;
	ipaddr_t	src;
	zoneid_t	last_zoneid;
	boolean_t	reuseaddr;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	ip_stack_t	*ipst;

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

	/*
	 * Split the M_CTL (if any) from the data.  From here on first_mp is
	 * either the detached M_CTL or NULL, and mp is always the data.
	 */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		first_mp->b_cont = NULL;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		first_mp = NULL;
		secure = B_FALSE;
	}

	/* Extract ports in net byte order */
	dstport = htons(ntohl(ports) & 0xFFFF);
	srcport = htons(ntohl(ports) >> 16);
	dst = ipha->ipha_dst;
	src = ipha->ipha_src;

	unlabeled = B_FALSE;
	if (is_system_labeled())
		/* Cred cannot be null on IPv4 */
		unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
		    TSLF_UNLABELED) != 0;
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and
		 * search for the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a
		 * connection in it.  Otherwise, we look for a
		 * MAC-exempt connection in any zone whose label
		 * dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		/*
		 * Not broadcast or multicast. Send to the one (first)
		 * client we find. No need to check conn_wantpacket()
		 * since IP_BOUND_IF/conn_incoming_ill does not apply to
		 * IPv4 unicast packets.
		 */
		while ((connp != NULL) &&
		    (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) ||
		    (!IPCL_ZONE_MATCH(connp, zoneid) &&
		    !(unlabeled && connp->conn_mac_exempt)))) {
			/*
			 * We keep searching since the conn did not match,
			 * or its zone did not match and it is not either
			 * an allzones conn or a mac exempt conn (if the
			 * sender is unlabeled.)
			 */
			connp = connp->conn_next;
		}

		if (connp == NULL || connp->conn_upq == NULL)
			goto notfound;

		if (is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			goto notfound;

		/* Hold the conn so the bucket lock can be dropped. */
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha,
		    flags, recv_ill, ip_policy);
		IP_STAT(ipst, ip_udp_fannorm);
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * Broadcast and multicast case
	 *
	 * Need to check conn_wantpacket().
	 * If SO_REUSEADDR has been set on the first we send the
	 * packet to all clients that have joined the group and
	 * match the port.
	 */

	while (connp != NULL) {
		if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL)
		goto notfound;

	first_connp = connp;
	/*
	 * When SO_REUSEADDR is not set, send the packet only to the first
	 * matching connection in its zone by keeping track of the zoneid.
	 */
	reuseaddr = first_connp->conn_reuseaddr;
	last_zoneid = first_connp->conn_zoneid;

	/*
	 * The first match is held here and receives the original message
	 * last, after the loop; every later match gets its own copy.
	 */
	CONN_INC_REF(connp);
	connp = connp->conn_next;
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) &&
			    (reuseaddr || connp->conn_zoneid != last_zoneid) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (connp->conn_zoneid != last_zoneid) {
			/*
			 * Update the zoneid so that the packet isn't sent to
			 * any more conns in the same zone unless SO_REUSEADDR
			 * is set.
			 */
			reuseaddr = connp->conn_reuseaddr;
			last_zoneid = connp->conn_zoneid;
		}
		if (first_mp != NULL) {
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL,
			    ipst->ips_netstack);
			if (first_mp1 == NULL) {
				/* Could not get an M_CTL for the copy. */
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		IP_STAT(ipst, ip_udp_fanmb);
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags,
	    recv_ill, ip_policy);
	IP_STAT(ipst, ip_udp_fanmb);
	CONN_DEC_REF(connp);
	return;

notfound:

	/* Reached with connf_lock still held by the searches above. */
	mutex_exit(&connfp->connf_lock);
	IP_STAT(ipst, ip_udp_fanothers);
	/*
	 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses
	 * have already been matched above, since they live in the IPv4
	 * fanout tables. This implies we only need to
	 * check for IPv6 in6addr_any endpoints here.
	 * Thus we compare using ipv6_all_zeros instead of the destination
	 * address, except for the multicast group membership lookup which
	 * uses the IPv4 destination.
	 */
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	if (!broadcast && !CLASSD(dst)) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
			    srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    !connp->conn_ipv6_v6only)
				break;
			connp = connp->conn_next;
		}

		/* A label check failure is treated as "no match". */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))
			connp = NULL;

		if (connp == NULL || connp->conn_upq == NULL) {
			/*
			 * No one bound to this port.  Is
			 * there a client that wants all
			 * unclaimed datagrams?
			 */
			mutex_exit(&connfp->connf_lock);

			/* Re-attach the M_CTL before passing the packet on. */
			if (mctl_present)
				first_mp->b_cont = mp;
			else
				first_mp = mp;
			if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].
			    connf_head != NULL) {
				ip_fanout_proto(q, first_mp, ill, ipha,
				    flags | IP_FF_RAWIP, mctl_present,
				    ip_policy, recv_ill, zoneid);
			} else {
				if (ip_fanout_send_icmp(q, first_mp, flags,
				    ICMP_DEST_UNREACHABLE,
				    ICMP_PORT_UNREACHABLE,
				    mctl_present, zoneid, ipst)) {
					BUMP_MIB(ill->ill_ip_mib,
					    udpIfStatsNoPorts);
				}
			}
			return;
		}

		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha,
		    flags, recv_ill, ip_policy);
		CONN_DEC_REF(connp);
		return;
	}
	/*
	 * IPv4 multicast packet being delivered to an AF_INET6
	 * in6addr_any endpoint.
	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
	 * and not conn_wantpacket_v6() since any multicast membership is
	 * for an IPv4-mapped multicast address.
	 * The packet is sent to all clients in all zones that have joined the
	 * group and match the port.
	 */
	while (connp != NULL) {
		if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
		    srcport, v6src) &&
		    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to this port.  Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);

		/* Re-attach the M_CTL before passing the packet on. */
		if (mctl_present)
			first_mp->b_cont = mp;
		else
			first_mp = mp;
		if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head !=
		    NULL) {
			ip_fanout_proto(q, first_mp, ill, ipha,
			    flags | IP_FF_RAWIP, mctl_present, ip_policy,
			    recv_ill, zoneid);
		} else {
			/*
			 * We used to attempt to send an icmp error here, but
			 * since this is known to be a multicast packet
			 * and we don't send icmp errors in response to
			 * multicast, just drop the packet and give up sooner.
			 */
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
			freemsg(first_mp);
		}
		return;
	}

	first_connp = connp;

	/*
	 * As in the IPv4 loop above: the first match is held and served
	 * last with the original message; later matches receive copies.
	 */
	CONN_INC_REF(connp);
	connp = connp->conn_next;
	for (;;) {
		while (connp != NULL) {
			if (IPCL_UDP_MATCH_V6(connp, dstport,
			    ipv6_all_zeros, srcport, v6src) &&
			    conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}
		/*
		 * Just copy the data part alone. The mctl part is
		 * needed just for verifying policy and it is never
		 * sent up.
		 */
		if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
		    ((mp1 = copymsg(mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		if (first_mp != NULL) {
			ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
			    ipsec_info_type == IPSEC_IN);
			first_mp1 = ipsec_in_tag(first_mp, NULL,
			    ipst->ips_netstack);
			if (first_mp1 == NULL) {
				/* Could not get an M_CTL for the copy. */
				freemsg(mp1);
				connp = first_connp;
				break;
			}
		} else {
			first_mp1 = NULL;
		}
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		/*
		 * IPQoS notes: We don't send the packet for policy
		 * processing here, will do it for the last one (below).
		 * i.e. we do it per-packet now, but if we do policy
		 * processing per-conn, then we would need to do it
		 * here too.
		 */
		ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill,
		    ipha, flags, recv_ill, B_FALSE);
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);
	ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags,
	    recv_ill, ip_policy);
	CONN_DEC_REF(connp);
}

/*
 * Complete the ip_wput header so that it
 * is possible to generate ICMP
 * errors.
 *
 * Fills in ipha_src from ire_lookup_local() when it is INADDR_ANY, sets the
 * TTL to the stack default and recomputes the header checksum.
 * Returns 0 on success, 1 if no local source IRE could be found.
 */
int
ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst)
{
	ire_t	*ire;

	if (ipha->ipha_src == INADDR_ANY) {
		ire = ire_lookup_local(zoneid, ipst);
		if (ire == NULL) {
			ip1dbg(("ip_hdr_complete: no source IRE\n"));
			return (1);
		}
		ipha->ipha_src = ire->ire_addr;
		ire_refrele(ire);
	}
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	/* The checksum field must be zero while the sum is computed. */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (0);
}

/*
 * Nobody should be sending
 * packets up this stream
 *
 * An M_FLUSH with FLUSHW set is turned around with FLUSHR cleared; anything
 * else is freed.
 */
static void
ip_lrput(queue_t *q, mblk_t *mp)
{
	mblk_t *mp1;

	switch (mp->b_datap->db_type) {
	case M_FLUSH:
		/* Turn around */
		if (*mp->b_rptr & FLUSHW) {
			*mp->b_rptr &= ~FLUSHR;
			qreply(q, mp);
			return;
		}
		break;
	}
	/* Could receive messages that passed through ar_rput */
	/* Clear the link fields on every mblk before freeing the chain. */
	for (mp1 = mp; mp1; mp1 = mp1->b_cont)
		mp1->b_prev = mp1->b_next = NULL;
	freemsg(mp);
}

/* Nobody should be sending packets down this stream */
/* ARGSUSED */
void
ip_lwput(queue_t *q, mblk_t *mp)
{
	freemsg(mp);
}

/*
 * Move the first hop in any source route to ipha_dst and remove that part of
 * the source route.  Called by other protocols.  Errors in option formatting
 * are ignored - will be handled by ip_wput_options.  Return the final
 * destination (either ipha_dst or the last entry in a source route.)
 *
 * The header is modified in place: ipha_dst is overwritten with the first
 * non-local hop and the option is shifted down by one address, with the
 * vacated bytes filled with IPOPT_NOP.
 */
ipaddr_t
ip_massage_options(ipha_t *ipha, netstack_t *ns)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	ipaddr_t	dst;
	int		i;
	ire_t		*ire;
	ip_stack_t	*ipst = ns->netstack_ip;

	ip2dbg(("ip_massage_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		switch (optval) {
			/* Byte offset, from the start of the option, of the */
			/* source route entry currently being examined. */
			uint8_t off;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg(("ip_massage_options: bad src route\n"));
				break;
			}
			optlen = opts.ipoptp_len;
			/* The on-the-wire pointer is 1-based; make it 0-based. */
			off = opt[IPOPT_OFFSET];
			off--;
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_massage_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_massage_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
			if (ire != NULL) {
				/* This hop is one of ours; skip to the next. */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}
			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_massage_options: loopback addr in "
				    "source route!\n"));
				break;
			}
			/*
			 * Update ipha_dst to be the first hop and remove the
			 * first hop from the source route (by overwriting
			 * part of the option with NOP options).
			 */
			ipha->ipha_dst = dst;
			/* Put the last entry in dst */
			/* (3 = option type + length + pointer bytes.) */
			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
			    3;
			bcopy(&opt[off], &dst, IP_ADDR_LEN);

			ip1dbg(("ip_massage_options: last hop 0x%x\n",
			    ntohl(dst)));
			/* Move down and overwrite */
			/* New 3-byte option header, IP_ADDR_LEN bytes later. */
			opt[IP_ADDR_LEN] = opt[0];
			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
			for (i = 0; i < IP_ADDR_LEN; i++)
				opt[i] = IPOPT_NOP;
			break;
		}
	}
	return (dst);
}

/*
 * Return the network mask
 * associated with the specified address.
7651 */ 7652 ipaddr_t 7653 ip_net_mask(ipaddr_t addr) 7654 { 7655 uchar_t *up = (uchar_t *)&addr; 7656 ipaddr_t mask = 0; 7657 uchar_t *maskp = (uchar_t *)&mask; 7658 7659 #if defined(__i386) || defined(__amd64) 7660 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER 7661 #endif 7662 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER 7663 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0; 7664 #endif 7665 if (CLASSD(addr)) { 7666 maskp[0] = 0xF0; 7667 return (mask); 7668 } 7669 7670 /* We assume Class E default netmask to be 32 */ 7671 if (CLASSE(addr)) 7672 return (0xffffffffU); 7673 7674 if (addr == 0) 7675 return (0); 7676 maskp[0] = 0xFF; 7677 if ((up[0] & 0x80) == 0) 7678 return (mask); 7679 7680 maskp[1] = 0xFF; 7681 if ((up[0] & 0xC0) == 0x80) 7682 return (mask); 7683 7684 maskp[2] = 0xFF; 7685 if ((up[0] & 0xE0) == 0xC0) 7686 return (mask); 7687 7688 /* Otherwise return no mask */ 7689 return ((ipaddr_t)0); 7690 } 7691 7692 /* 7693 * Select an ill for the packet by considering load spreading across 7694 * a different ill in the group if dst_ill is part of some group. 7695 */ 7696 ill_t * 7697 ip_newroute_get_dst_ill(ill_t *dst_ill) 7698 { 7699 ill_t *ill; 7700 7701 /* 7702 * We schedule irrespective of whether the source address is 7703 * INADDR_ANY or not. illgrp_scheduler returns a held ill. 7704 */ 7705 ill = illgrp_scheduler(dst_ill); 7706 if (ill == NULL) 7707 return (NULL); 7708 7709 /* 7710 * For groups with names ip_sioctl_groupname ensures that all 7711 * ills are of same type. For groups without names, ifgrp_insert 7712 * ensures this. 7713 */ 7714 ASSERT(dst_ill->ill_type == ill->ill_type); 7715 7716 return (ill); 7717 } 7718 7719 /* 7720 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. 
7721 */ 7722 ill_t * 7723 ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, 7724 ip_stack_t *ipst) 7725 { 7726 ill_t *ret_ill; 7727 7728 ASSERT(ifindex != 0); 7729 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 7730 ipst); 7731 if (ret_ill == NULL || 7732 (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { 7733 if (isv6) { 7734 if (ill != NULL) { 7735 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7736 } else { 7737 BUMP_MIB(&ipst->ips_ip6_mib, 7738 ipIfStatsOutDiscards); 7739 } 7740 ip1dbg(("ip_grab_attach_ill (IPv6): " 7741 "bad ifindex %d.\n", ifindex)); 7742 } else { 7743 if (ill != NULL) { 7744 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7745 } else { 7746 BUMP_MIB(&ipst->ips_ip_mib, 7747 ipIfStatsOutDiscards); 7748 } 7749 ip1dbg(("ip_grab_attach_ill (IPv4): " 7750 "bad ifindex %d.\n", ifindex)); 7751 } 7752 if (ret_ill != NULL) 7753 ill_refrele(ret_ill); 7754 freemsg(first_mp); 7755 return (NULL); 7756 } 7757 7758 return (ret_ill); 7759 } 7760 7761 /* 7762 * IPv4 - 7763 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 7764 * out a packet to a destination address for which we do not have specific 7765 * (or sufficient) routing information. 7766 * 7767 * NOTE : These are the scopes of some of the variables that point at IRE, 7768 * which needs to be followed while making any future modifications 7769 * to avoid memory leaks. 7770 * 7771 * - ire and sire are the entries looked up initially by 7772 * ire_ftable_lookup. 7773 * - ipif_ire is used to hold the interface ire associated with 7774 * the new cache ire. But it's scope is limited, so we always REFRELE 7775 * it before branching out to error paths. 7776 * - save_ire is initialized before ire_create, so that ire returned 7777 * by ire_create will not over-write the ire. We REFRELE save_ire 7778 * before breaking out of the switch. 
7779 * 7780 * Thus on failures, we have to REFRELE only ire and sire, if they 7781 * are not NULL. 7782 */ 7783 void 7784 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, 7785 zoneid_t zoneid, ip_stack_t *ipst) 7786 { 7787 areq_t *areq; 7788 ipaddr_t gw = 0; 7789 ire_t *ire = NULL; 7790 mblk_t *res_mp; 7791 ipaddr_t *addrp; 7792 ipaddr_t nexthop_addr; 7793 ipif_t *src_ipif = NULL; 7794 ill_t *dst_ill = NULL; 7795 ipha_t *ipha; 7796 ire_t *sire = NULL; 7797 mblk_t *first_mp; 7798 ire_t *save_ire; 7799 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 7800 ushort_t ire_marks = 0; 7801 boolean_t mctl_present; 7802 ipsec_out_t *io; 7803 mblk_t *saved_mp; 7804 ire_t *first_sire = NULL; 7805 mblk_t *copy_mp = NULL; 7806 mblk_t *xmit_mp = NULL; 7807 ipaddr_t save_dst; 7808 uint32_t multirt_flags = 7809 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 7810 boolean_t multirt_is_resolvable; 7811 boolean_t multirt_resolve_next; 7812 boolean_t unspec_src; 7813 boolean_t do_attach_ill = B_FALSE; 7814 boolean_t ip_nexthop = B_FALSE; 7815 tsol_ire_gw_secattr_t *attrp = NULL; 7816 tsol_gcgrp_t *gcgrp = NULL; 7817 tsol_gcgrp_addr_t ga; 7818 7819 if (ip_debug > 2) { 7820 /* ip1dbg */ 7821 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7822 } 7823 7824 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7825 if (mctl_present) { 7826 io = (ipsec_out_t *)first_mp->b_rptr; 7827 ASSERT(io->ipsec_out_type == IPSEC_OUT); 7828 ASSERT(zoneid == io->ipsec_out_zoneid); 7829 ASSERT(zoneid != ALL_ZONES); 7830 } 7831 7832 ipha = (ipha_t *)mp->b_rptr; 7833 7834 /* All multicast lookups come through ip_newroute_ipif() */ 7835 if (CLASSD(dst)) { 7836 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7837 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7838 freemsg(first_mp); 7839 return; 7840 } 7841 7842 if (mctl_present && io->ipsec_out_attach_if) { 7843 /* ip_grab_attach_ill returns a held ill */ 7844 attach_ill = ip_grab_attach_ill(NULL, 
first_mp, 7845 io->ipsec_out_ill_index, B_FALSE, ipst); 7846 7847 /* Failure case frees things for us. */ 7848 if (attach_ill == NULL) 7849 return; 7850 7851 /* 7852 * Check if we need an ire that will not be 7853 * looked up by anybody else i.e. HIDDEN. 7854 */ 7855 if (ill_is_probeonly(attach_ill)) 7856 ire_marks = IRE_MARK_HIDDEN; 7857 } 7858 if (mctl_present && io->ipsec_out_ip_nexthop) { 7859 ip_nexthop = B_TRUE; 7860 nexthop_addr = io->ipsec_out_nexthop_addr; 7861 } 7862 /* 7863 * If this IRE is created for forwarding or it is not for 7864 * traffic for congestion controlled protocols, mark it as temporary. 7865 */ 7866 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7867 ire_marks |= IRE_MARK_TEMPORARY; 7868 7869 /* 7870 * Get what we can from ire_ftable_lookup which will follow an IRE 7871 * chain until it gets the most specific information available. 7872 * For example, we know that there is no IRE_CACHE for this dest, 7873 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7874 * ire_ftable_lookup will look up the gateway, etc. 7875 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7876 * to the destination, of equal netmask length in the forward table, 7877 * will be recursively explored. If no information is available 7878 * for the final gateway of that route, we force the returned ire 7879 * to be equal to sire using MATCH_IRE_PARENT. 7880 * At least, in this case we have a starting point (in the buckets) 7881 * to look for other routes to the destination in the forward table. 7882 * This is actually used only for multirouting, where a list 7883 * of routes has to be processed in sequence. 7884 * 7885 * In the process of coming up with the most specific information, 7886 * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry 7887 * for the gateway (i.e., one for which the ire_nce->nce_state is 7888 * not yet ND_REACHABLE, and is in the middle of arp resolution). 
7889 * Two caveats when handling incomplete ire's in ip_newroute: 7890 * - we should be careful when accessing its ire_nce (specifically 7891 * the nce_res_mp) ast it might change underneath our feet, and, 7892 * - not all legacy code path callers are prepared to handle 7893 * incomplete ire's, so we should not create/add incomplete 7894 * ire_cache entries here. (See discussion about temporary solution 7895 * further below). 7896 * 7897 * In order to minimize packet dropping, and to preserve existing 7898 * behavior, we treat this case as if there were no IRE_CACHE for the 7899 * gateway, and instead use the IF_RESOLVER ire to send out 7900 * another request to ARP (this is achieved by passing the 7901 * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the 7902 * arp response comes back in ip_wput_nondata, we will create 7903 * a per-dst ire_cache that has an ND_COMPLETE ire. 7904 * 7905 * Note that this is a temporary solution; the correct solution is 7906 * to create an incomplete per-dst ire_cache entry, and send the 7907 * packet out when the gw's nce is resolved. In order to achieve this, 7908 * all packet processing must have been completed prior to calling 7909 * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need 7910 * to be modified to accomodate this solution. 7911 */ 7912 if (ip_nexthop) { 7913 /* 7914 * The first time we come here, we look for an IRE_INTERFACE 7915 * entry for the specified nexthop, set the dst to be the 7916 * nexthop address and create an IRE_CACHE entry for the 7917 * nexthop. The next time around, we are able to find an 7918 * IRE_CACHE entry for the nexthop, set the gateway to be the 7919 * nexthop address and create an IRE_CACHE entry for the 7920 * destination address via the specified nexthop. 
7921 */ 7922 ire = ire_cache_lookup(nexthop_addr, zoneid, 7923 MBLK_GETLABEL(mp), ipst); 7924 if (ire != NULL) { 7925 gw = nexthop_addr; 7926 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7927 } else { 7928 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7929 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7930 MBLK_GETLABEL(mp), 7931 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 7932 ipst); 7933 if (ire != NULL) { 7934 dst = nexthop_addr; 7935 } 7936 } 7937 } else if (attach_ill == NULL) { 7938 ire = ire_ftable_lookup(dst, 0, 0, 0, 7939 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7940 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7941 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7942 MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, 7943 ipst); 7944 } else { 7945 /* 7946 * attach_ill is set only for communicating with 7947 * on-link hosts. So, don't look for DEFAULT. 7948 */ 7949 ipif_t *attach_ipif; 7950 7951 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7952 if (attach_ipif == NULL) { 7953 ill_refrele(attach_ill); 7954 goto icmp_err_ret; 7955 } 7956 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7957 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7958 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7959 MATCH_IRE_SECATTR, ipst); 7960 ipif_refrele(attach_ipif); 7961 } 7962 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7963 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7964 7965 /* 7966 * This loop is run only once in most cases. 7967 * We loop to resolve further routes only when the destination 7968 * can be reached through multiple RTF_MULTIRT-flagged ires. 7969 */ 7970 do { 7971 /* Clear the previous iteration's values */ 7972 if (src_ipif != NULL) { 7973 ipif_refrele(src_ipif); 7974 src_ipif = NULL; 7975 } 7976 if (dst_ill != NULL) { 7977 ill_refrele(dst_ill); 7978 dst_ill = NULL; 7979 } 7980 7981 multirt_resolve_next = B_FALSE; 7982 /* 7983 * We check if packets have to be multirouted. 7984 * In this case, given the current <ire, sire> couple, 7985 * we look for the next suitable <ire, sire>. 
7986 * This check is done in ire_multirt_lookup(), 7987 * which applies various criteria to find the next route 7988 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7989 * unchanged if it detects it has not been tried yet. 7990 */ 7991 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7992 ip3dbg(("ip_newroute: starting next_resolution " 7993 "with first_mp %p, tag %d\n", 7994 (void *)first_mp, 7995 MULTIRT_DEBUG_TAGGED(first_mp))); 7996 7997 ASSERT(sire != NULL); 7998 multirt_is_resolvable = 7999 ire_multirt_lookup(&ire, &sire, multirt_flags, 8000 MBLK_GETLABEL(mp), ipst); 8001 8002 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 8003 "ire %p, sire %p\n", 8004 multirt_is_resolvable, 8005 (void *)ire, (void *)sire)); 8006 8007 if (!multirt_is_resolvable) { 8008 /* 8009 * No more multirt route to resolve; give up 8010 * (all routes resolved or no more 8011 * resolvable routes). 8012 */ 8013 if (ire != NULL) { 8014 ire_refrele(ire); 8015 ire = NULL; 8016 } 8017 } else { 8018 ASSERT(sire != NULL); 8019 ASSERT(ire != NULL); 8020 /* 8021 * We simply use first_sire as a flag that 8022 * indicates if a resolvable multirt route 8023 * has already been found. 8024 * If it is not the case, we may have to send 8025 * an ICMP error to report that the 8026 * destination is unreachable. 8027 * We do not IRE_REFHOLD first_sire. 8028 */ 8029 if (first_sire == NULL) { 8030 first_sire = sire; 8031 } 8032 } 8033 } 8034 if (ire == NULL) { 8035 if (ip_debug > 3) { 8036 /* ip2dbg */ 8037 pr_addr_dbg("ip_newroute: " 8038 "can't resolve %s\n", AF_INET, &dst); 8039 } 8040 ip3dbg(("ip_newroute: " 8041 "ire %p, sire %p, first_sire %p\n", 8042 (void *)ire, (void *)sire, (void *)first_sire)); 8043 8044 if (sire != NULL) { 8045 ire_refrele(sire); 8046 sire = NULL; 8047 } 8048 8049 if (first_sire != NULL) { 8050 /* 8051 * At least one multirt route has been found 8052 * in the same call to ip_newroute(); 8053 * there is no need to report an ICMP error. 
8054 * first_sire was not IRE_REFHOLDed. 8055 */ 8056 MULTIRT_DEBUG_UNTAG(first_mp); 8057 freemsg(first_mp); 8058 return; 8059 } 8060 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 8061 RTA_DST, ipst); 8062 if (attach_ill != NULL) 8063 ill_refrele(attach_ill); 8064 goto icmp_err_ret; 8065 } 8066 8067 /* 8068 * Verify that the returned IRE does not have either 8069 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 8070 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 8071 */ 8072 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 8073 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 8074 if (attach_ill != NULL) 8075 ill_refrele(attach_ill); 8076 goto icmp_err_ret; 8077 } 8078 /* 8079 * Increment the ire_ob_pkt_count field for ire if it is an 8080 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 8081 * increment the same for the parent IRE, sire, if it is some 8082 * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) 8083 */ 8084 if ((ire->ire_type & IRE_INTERFACE) != 0) { 8085 UPDATE_OB_PKT_COUNT(ire); 8086 ire->ire_last_used_time = lbolt; 8087 } 8088 8089 if (sire != NULL) { 8090 gw = sire->ire_gateway_addr; 8091 ASSERT((sire->ire_type & (IRE_CACHETABLE | 8092 IRE_INTERFACE)) == 0); 8093 UPDATE_OB_PKT_COUNT(sire); 8094 sire->ire_last_used_time = lbolt; 8095 } 8096 /* 8097 * We have a route to reach the destination. 8098 * 8099 * 1) If the interface is part of ill group, try to get a new 8100 * ill taking load spreading into account. 8101 * 8102 * 2) After selecting the ill, get a source address that 8103 * might create good inbound load spreading. 8104 * ipif_select_source does this for us. 8105 * 8106 * If the application specified the ill (ifindex), we still 8107 * load spread. Only if the packets needs to go out 8108 * specifically on a given ill e.g. binding to 8109 * IPIF_NOFAILOVER address, then we don't try to use a 8110 * different ill for load spreading. 
8111 */ 8112 if (attach_ill == NULL) { 8113 /* 8114 * Don't perform outbound load spreading in the 8115 * case of an RTF_MULTIRT route, as we actually 8116 * typically want to replicate outgoing packets 8117 * through particular interfaces. 8118 */ 8119 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8120 dst_ill = ire->ire_ipif->ipif_ill; 8121 /* for uniformity */ 8122 ill_refhold(dst_ill); 8123 } else { 8124 /* 8125 * If we are here trying to create an IRE_CACHE 8126 * for an offlink destination and have the 8127 * IRE_CACHE for the next hop and the latter is 8128 * using virtual IP source address selection i.e 8129 * it's ire->ire_ipif is pointing to a virtual 8130 * network interface (vni) then 8131 * ip_newroute_get_dst_ll() will return the vni 8132 * interface as the dst_ill. Since the vni is 8133 * virtual i.e not associated with any physical 8134 * interface, it cannot be the dst_ill, hence 8135 * in such a case call ip_newroute_get_dst_ll() 8136 * with the stq_ill instead of the ire_ipif ILL. 8137 * The function returns a refheld ill. 8138 */ 8139 if ((ire->ire_type == IRE_CACHE) && 8140 IS_VNI(ire->ire_ipif->ipif_ill)) 8141 dst_ill = ip_newroute_get_dst_ill( 8142 ire->ire_stq->q_ptr); 8143 else 8144 dst_ill = ip_newroute_get_dst_ill( 8145 ire->ire_ipif->ipif_ill); 8146 } 8147 if (dst_ill == NULL) { 8148 if (ip_debug > 2) { 8149 pr_addr_dbg("ip_newroute: " 8150 "no dst ill for dst" 8151 " %s\n", AF_INET, &dst); 8152 } 8153 goto icmp_err_ret; 8154 } 8155 } else { 8156 dst_ill = ire->ire_ipif->ipif_ill; 8157 /* for uniformity */ 8158 ill_refhold(dst_ill); 8159 /* 8160 * We should have found a route matching ill as we 8161 * called ire_ftable_lookup with MATCH_IRE_ILL. 8162 * Rather than asserting, when there is a mismatch, 8163 * we just drop the packet. 
8164 */ 8165 if (dst_ill != attach_ill) { 8166 ip0dbg(("ip_newroute: Packet dropped as " 8167 "IPIF_NOFAILOVER ill is %s, " 8168 "ire->ire_ipif->ipif_ill is %s\n", 8169 attach_ill->ill_name, 8170 dst_ill->ill_name)); 8171 ill_refrele(attach_ill); 8172 goto icmp_err_ret; 8173 } 8174 } 8175 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 8176 if (attach_ill != NULL) { 8177 ill_refrele(attach_ill); 8178 attach_ill = NULL; 8179 do_attach_ill = B_TRUE; 8180 } 8181 ASSERT(dst_ill != NULL); 8182 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 8183 8184 /* 8185 * Pick the best source address from dst_ill. 8186 * 8187 * 1) If it is part of a multipathing group, we would 8188 * like to spread the inbound packets across different 8189 * interfaces. ipif_select_source picks a random source 8190 * across the different ills in the group. 8191 * 8192 * 2) If it is not part of a multipathing group, we try 8193 * to pick the source address from the destination 8194 * route. Clustering assumes that when we have multiple 8195 * prefixes hosted on an interface, the prefix of the 8196 * source address matches the prefix of the destination 8197 * route. We do this only if the address is not 8198 * DEPRECATED. 8199 * 8200 * 3) If the conn is in a different zone than the ire, we 8201 * need to pick a source address from the right zone. 8202 * 8203 * NOTE : If we hit case (1) above, the prefix of the source 8204 * address picked may not match the prefix of the 8205 * destination routes prefix as ipif_select_source 8206 * does not look at "dst" while picking a source 8207 * address. 8208 * If we want the same behavior as (2), we will need 8209 * to change the behavior of ipif_select_source. 8210 */ 8211 ASSERT(src_ipif == NULL); 8212 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 8213 /* 8214 * The RTF_SETSRC flag is set in the parent ire (sire). 8215 * Check that the ipif matching the requested source 8216 * address still exists. 
8217 */ 8218 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 8219 zoneid, NULL, NULL, NULL, NULL, ipst); 8220 } 8221 8222 unspec_src = (connp != NULL && connp->conn_unspec_src); 8223 8224 if (src_ipif == NULL && 8225 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 8226 ire_marks |= IRE_MARK_USESRC_CHECK; 8227 if ((dst_ill->ill_group != NULL) || 8228 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 8229 (connp != NULL && ire->ire_zoneid != zoneid && 8230 ire->ire_zoneid != ALL_ZONES) || 8231 (dst_ill->ill_usesrc_ifindex != 0)) { 8232 /* 8233 * If the destination is reachable via a 8234 * given gateway, the selected source address 8235 * should be in the same subnet as the gateway. 8236 * Otherwise, the destination is not reachable. 8237 * 8238 * If there are no interfaces on the same subnet 8239 * as the destination, ipif_select_source gives 8240 * first non-deprecated interface which might be 8241 * on a different subnet than the gateway. 8242 * This is not desirable. Hence pass the dst_ire 8243 * source address to ipif_select_source. 8244 * It is sure that the destination is reachable 8245 * with the dst_ire source address subnet. 8246 * So passing dst_ire source address to 8247 * ipif_select_source will make sure that the 8248 * selected source will be on the same subnet 8249 * as dst_ire source address. 8250 */ 8251 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 8252 src_ipif = ipif_select_source(dst_ill, saddr, 8253 zoneid); 8254 if (src_ipif == NULL) { 8255 if (ip_debug > 2) { 8256 pr_addr_dbg("ip_newroute: " 8257 "no src for dst %s ", 8258 AF_INET, &dst); 8259 printf("through interface %s\n", 8260 dst_ill->ill_name); 8261 } 8262 goto icmp_err_ret; 8263 } 8264 } else { 8265 src_ipif = ire->ire_ipif; 8266 ASSERT(src_ipif != NULL); 8267 /* hold src_ipif for uniformity */ 8268 ipif_refhold(src_ipif); 8269 } 8270 } 8271 8272 /* 8273 * Assign a source address while we have the conn. 
8274 * We can't have ip_wput_ire pick a source address when the 8275 * packet returns from arp since we need to look at 8276 * conn_unspec_src and conn_zoneid, and we lose the conn when 8277 * going through arp. 8278 * 8279 * NOTE : ip_newroute_v6 does not have this piece of code as 8280 * it uses ip6i to store this information. 8281 */ 8282 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 8283 ipha->ipha_src = src_ipif->ipif_src_addr; 8284 8285 if (ip_debug > 3) { 8286 /* ip2dbg */ 8287 pr_addr_dbg("ip_newroute: first hop %s\n", 8288 AF_INET, &gw); 8289 } 8290 ip2dbg(("\tire type %s (%d)\n", 8291 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 8292 8293 /* 8294 * The TTL of multirouted packets is bounded by the 8295 * ip_multirt_ttl ndd variable. 8296 */ 8297 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8298 /* Force TTL of multirouted packets */ 8299 if ((ipst->ips_ip_multirt_ttl > 0) && 8300 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 8301 ip2dbg(("ip_newroute: forcing multirt TTL " 8302 "to %d (was %d), dst 0x%08x\n", 8303 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 8304 ntohl(sire->ire_addr))); 8305 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 8306 } 8307 } 8308 /* 8309 * At this point in ip_newroute(), ire is either the 8310 * IRE_CACHE of the next-hop gateway for an off-subnet 8311 * destination or an IRE_INTERFACE type that should be used 8312 * to resolve an on-subnet destination or an on-subnet 8313 * next-hop gateway. 8314 * 8315 * In the IRE_CACHE case, we have the following : 8316 * 8317 * 1) src_ipif - used for getting a source address. 8318 * 8319 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8320 * means packets using this IRE_CACHE will go out on 8321 * dst_ill. 8322 * 8323 * 3) The IRE sire will point to the prefix that is the 8324 * longest matching route for the destination. These 8325 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. 
8326 * 8327 * The newly created IRE_CACHE entry for the off-subnet 8328 * destination is tied to both the prefix route and the 8329 * interface route used to resolve the next-hop gateway 8330 * via the ire_phandle and ire_ihandle fields, 8331 * respectively. 8332 * 8333 * In the IRE_INTERFACE case, we have the following : 8334 * 8335 * 1) src_ipif - used for getting a source address. 8336 * 8337 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8338 * means packets using the IRE_CACHE that we will build 8339 * here will go out on dst_ill. 8340 * 8341 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 8342 * to be created will only be tied to the IRE_INTERFACE 8343 * that was derived from the ire_ihandle field. 8344 * 8345 * If sire is non-NULL, it means the destination is 8346 * off-link and we will first create the IRE_CACHE for the 8347 * gateway. Next time through ip_newroute, we will create 8348 * the IRE_CACHE for the final destination as described 8349 * above. 8350 * 8351 * In both cases, after the current resolution has been 8352 * completed (or possibly initialised, in the IRE_INTERFACE 8353 * case), the loop may be re-entered to attempt the resolution 8354 * of another RTF_MULTIRT route. 8355 * 8356 * When an IRE_CACHE entry for the off-subnet destination is 8357 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 8358 * for further processing in emission loops. 8359 */ 8360 save_ire = ire; 8361 switch (ire->ire_type) { 8362 case IRE_CACHE: { 8363 ire_t *ipif_ire; 8364 8365 ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); 8366 if (gw == 0) 8367 gw = ire->ire_gateway_addr; 8368 /* 8369 * We need 3 ire's to create a new cache ire for an 8370 * off-link destination from the cache ire of the 8371 * gateway. 8372 * 8373 * 1. The prefix ire 'sire' (Note that this does 8374 * not apply to the conn_nexthop_set case) 8375 * 2. The cache ire of the gateway 'ire' 8376 * 3. 
The interface ire 'ipif_ire' 8377 * 8378 * We have (1) and (2). We lookup (3) below. 8379 * 8380 * If there is no interface route to the gateway, 8381 * it is a race condition, where we found the cache 8382 * but the interface route has been deleted. 8383 */ 8384 if (ip_nexthop) { 8385 ipif_ire = ire_ihandle_lookup_onlink(ire); 8386 } else { 8387 ipif_ire = 8388 ire_ihandle_lookup_offlink(ire, sire); 8389 } 8390 if (ipif_ire == NULL) { 8391 ip1dbg(("ip_newroute: " 8392 "ire_ihandle_lookup_offlink failed\n")); 8393 goto icmp_err_ret; 8394 } 8395 8396 /* 8397 * Check cached gateway IRE for any security 8398 * attributes; if found, associate the gateway 8399 * credentials group to the destination IRE. 8400 */ 8401 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 8402 mutex_enter(&attrp->igsa_lock); 8403 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 8404 GCGRP_REFHOLD(gcgrp); 8405 mutex_exit(&attrp->igsa_lock); 8406 } 8407 8408 /* 8409 * XXX For the source of the resolver mp, 8410 * we are using the same DL_UNITDATA_REQ 8411 * (from save_ire->ire_nce->nce_res_mp) 8412 * though the save_ire is not pointing at the same ill. 8413 * This is incorrect. We need to send it up to the 8414 * resolver to get the right res_mp. For ethernets 8415 * this may be okay (ill_type == DL_ETHER). 8416 */ 8417 8418 ire = ire_create( 8419 (uchar_t *)&dst, /* dest address */ 8420 (uchar_t *)&ip_g_all_ones, /* mask */ 8421 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8422 (uchar_t *)&gw, /* gateway address */ 8423 &save_ire->ire_max_frag, 8424 save_ire->ire_nce, /* src nce */ 8425 dst_ill->ill_rq, /* recv-from queue */ 8426 dst_ill->ill_wq, /* send-to queue */ 8427 IRE_CACHE, /* IRE type */ 8428 src_ipif, 8429 (sire != NULL) ? 8430 sire->ire_mask : 0, /* Parent mask */ 8431 (sire != NULL) ? 8432 sire->ire_phandle : 0, /* Parent handle */ 8433 ipif_ire->ire_ihandle, /* Interface handle */ 8434 (sire != NULL) ? 
(sire->ire_flags & 8435 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 8436 (sire != NULL) ? 8437 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 8438 NULL, 8439 gcgrp, 8440 ipst); 8441 8442 if (ire == NULL) { 8443 if (gcgrp != NULL) { 8444 GCGRP_REFRELE(gcgrp); 8445 gcgrp = NULL; 8446 } 8447 ire_refrele(ipif_ire); 8448 ire_refrele(save_ire); 8449 break; 8450 } 8451 8452 /* reference now held by IRE */ 8453 gcgrp = NULL; 8454 8455 ire->ire_marks |= ire_marks; 8456 8457 /* 8458 * Prevent sire and ipif_ire from getting deleted. 8459 * The newly created ire is tied to both of them via 8460 * the phandle and ihandle respectively. 8461 */ 8462 if (sire != NULL) { 8463 IRB_REFHOLD(sire->ire_bucket); 8464 /* Has it been removed already ? */ 8465 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 8466 IRB_REFRELE(sire->ire_bucket); 8467 ire_refrele(ipif_ire); 8468 ire_refrele(save_ire); 8469 break; 8470 } 8471 } 8472 8473 IRB_REFHOLD(ipif_ire->ire_bucket); 8474 /* Has it been removed already ? */ 8475 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 8476 IRB_REFRELE(ipif_ire->ire_bucket); 8477 if (sire != NULL) 8478 IRB_REFRELE(sire->ire_bucket); 8479 ire_refrele(ipif_ire); 8480 ire_refrele(save_ire); 8481 break; 8482 } 8483 8484 xmit_mp = first_mp; 8485 /* 8486 * In the case of multirouting, a copy 8487 * of the packet is done before its sending. 8488 * The copy is used to attempt another 8489 * route resolution, in a next loop. 8490 */ 8491 if (ire->ire_flags & RTF_MULTIRT) { 8492 copy_mp = copymsg(first_mp); 8493 if (copy_mp != NULL) { 8494 xmit_mp = copy_mp; 8495 MULTIRT_DEBUG_TAG(first_mp); 8496 } 8497 } 8498 ire_add_then_send(q, ire, xmit_mp); 8499 ire_refrele(save_ire); 8500 8501 /* Assert that sire is not deleted yet. */ 8502 if (sire != NULL) { 8503 ASSERT(sire->ire_ptpn != NULL); 8504 IRB_REFRELE(sire->ire_bucket); 8505 } 8506 8507 /* Assert that ipif_ire is not deleted yet. 
*/ 8508 ASSERT(ipif_ire->ire_ptpn != NULL); 8509 IRB_REFRELE(ipif_ire->ire_bucket); 8510 ire_refrele(ipif_ire); 8511 8512 /* 8513 * If copy_mp is not NULL, multirouting was 8514 * requested. We loop to initiate a next 8515 * route resolution attempt, starting from sire. 8516 */ 8517 if (copy_mp != NULL) { 8518 /* 8519 * Search for the next unresolved 8520 * multirt route. 8521 */ 8522 copy_mp = NULL; 8523 ipif_ire = NULL; 8524 ire = NULL; 8525 multirt_resolve_next = B_TRUE; 8526 continue; 8527 } 8528 if (sire != NULL) 8529 ire_refrele(sire); 8530 ipif_refrele(src_ipif); 8531 ill_refrele(dst_ill); 8532 return; 8533 } 8534 case IRE_IF_NORESOLVER: { 8535 8536 if (dst_ill->ill_phys_addr_length != IP_ADDR_LEN && 8537 dst_ill->ill_resolver_mp == NULL) { 8538 ip1dbg(("ip_newroute: dst_ill %p " 8539 "for IRE_IF_NORESOLVER ire %p has " 8540 "no ill_resolver_mp\n", 8541 (void *)dst_ill, (void *)ire)); 8542 break; 8543 } 8544 8545 /* 8546 * TSol note: We are creating the ire cache for the 8547 * destination 'dst'. If 'dst' is offlink, going 8548 * through the first hop 'gw', the security attributes 8549 * of 'dst' must be set to point to the gateway 8550 * credentials of gateway 'gw'. If 'dst' is onlink, it 8551 * is possible that 'dst' is a potential gateway that is 8552 * referenced by some route that has some security 8553 * attributes. Thus in the former case, we need to do a 8554 * gcgrp_lookup of 'gw' while in the latter case we 8555 * need to do gcgrp_lookup of 'dst' itself. 8556 */ 8557 ga.ga_af = AF_INET; 8558 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 8559 &ga.ga_addr); 8560 gcgrp = gcgrp_lookup(&ga, B_FALSE); 8561 8562 ire = ire_create( 8563 (uchar_t *)&dst, /* dest address */ 8564 (uchar_t *)&ip_g_all_ones, /* mask */ 8565 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8566 (uchar_t *)&gw, /* gateway address */ 8567 &save_ire->ire_max_frag, 8568 NULL, /* no src nce */ 8569 dst_ill->ill_rq, /* recv-from queue */ 8570 dst_ill->ill_wq, /* send-to queue */ 8571 IRE_CACHE, 8572 src_ipif, 8573 save_ire->ire_mask, /* Parent mask */ 8574 (sire != NULL) ? /* Parent handle */ 8575 sire->ire_phandle : 0, 8576 save_ire->ire_ihandle, /* Interface handle */ 8577 (sire != NULL) ? sire->ire_flags & 8578 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 8579 &(save_ire->ire_uinfo), 8580 NULL, 8581 gcgrp, 8582 ipst); 8583 8584 if (ire == NULL) { 8585 if (gcgrp != NULL) { 8586 GCGRP_REFRELE(gcgrp); 8587 gcgrp = NULL; 8588 } 8589 ire_refrele(save_ire); 8590 break; 8591 } 8592 8593 /* reference now held by IRE */ 8594 gcgrp = NULL; 8595 8596 ire->ire_marks |= ire_marks; 8597 8598 /* Prevent save_ire from getting deleted */ 8599 IRB_REFHOLD(save_ire->ire_bucket); 8600 /* Has it been removed already ? */ 8601 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8602 IRB_REFRELE(save_ire->ire_bucket); 8603 ire_refrele(save_ire); 8604 break; 8605 } 8606 8607 /* 8608 * In the case of multirouting, a copy 8609 * of the packet is made before it is sent. 8610 * The copy is used in the next 8611 * loop to attempt another resolution. 8612 */ 8613 xmit_mp = first_mp; 8614 if ((sire != NULL) && 8615 (sire->ire_flags & RTF_MULTIRT)) { 8616 copy_mp = copymsg(first_mp); 8617 if (copy_mp != NULL) { 8618 xmit_mp = copy_mp; 8619 MULTIRT_DEBUG_TAG(first_mp); 8620 } 8621 } 8622 ire_add_then_send(q, ire, xmit_mp); 8623 8624 /* Assert that it is not deleted yet. 
*/ 8625 ASSERT(save_ire->ire_ptpn != NULL); 8626 IRB_REFRELE(save_ire->ire_bucket); 8627 ire_refrele(save_ire); 8628 8629 if (copy_mp != NULL) { 8630 /* 8631 * If we found a (no)resolver, we ignore any 8632 * trailing top priority IRE_CACHE in further 8633 * loops. This ensures that we do not omit any 8634 * (no)resolver. 8635 * This IRE_CACHE, if any, will be processed 8636 * by another thread entering ip_newroute(). 8637 * IRE_CACHE entries, if any, will be processed 8638 * by another thread entering ip_newroute(), 8639 * (upon resolver response, for instance). 8640 * This aims to force parallel multirt 8641 * resolutions as soon as a packet must be sent. 8642 * In the best case, after the tx of only one 8643 * packet, all reachable routes are resolved. 8644 * Otherwise, the resolution of all RTF_MULTIRT 8645 * routes would require several emissions. 8646 */ 8647 multirt_flags &= ~MULTIRT_CACHEGW; 8648 8649 /* 8650 * Search for the next unresolved multirt 8651 * route. 8652 */ 8653 copy_mp = NULL; 8654 save_ire = NULL; 8655 ire = NULL; 8656 multirt_resolve_next = B_TRUE; 8657 continue; 8658 } 8659 8660 /* 8661 * Don't need sire anymore 8662 */ 8663 if (sire != NULL) 8664 ire_refrele(sire); 8665 8666 ipif_refrele(src_ipif); 8667 ill_refrele(dst_ill); 8668 return; 8669 } 8670 case IRE_IF_RESOLVER: 8671 /* 8672 * We can't build an IRE_CACHE yet, but at least we 8673 * found a resolver that can help. 8674 */ 8675 res_mp = dst_ill->ill_resolver_mp; 8676 if (!OK_RESOLVER_MP(res_mp)) 8677 break; 8678 8679 /* 8680 * To be at this point in the code with a non-zero gw 8681 * means that dst is reachable through a gateway that 8682 * we have never resolved. By changing dst to the gw 8683 * addr we resolve the gateway first. 8684 * When ire_add_then_send() tries to put the IP dg 8685 * to dst, it will reenter ip_newroute() at which 8686 * time we will find the IRE_CACHE for the gw and 8687 * create another IRE_CACHE in case IRE_CACHE above. 
8688 */ 8689 if (gw != INADDR_ANY) { 8690 /* 8691 * The source ipif that was determined above was 8692 * relative to the destination address, not the 8693 * gateway's. If src_ipif was not taken out of 8694 * the IRE_IF_RESOLVER entry, we'll need to call 8695 * ipif_select_source() again. 8696 */ 8697 if (src_ipif != ire->ire_ipif) { 8698 ipif_refrele(src_ipif); 8699 src_ipif = ipif_select_source(dst_ill, 8700 gw, zoneid); 8701 if (src_ipif == NULL) { 8702 if (ip_debug > 2) { 8703 pr_addr_dbg( 8704 "ip_newroute: no " 8705 "src for gw %s ", 8706 AF_INET, &gw); 8707 printf("through " 8708 "interface %s\n", 8709 dst_ill->ill_name); 8710 } 8711 goto icmp_err_ret; 8712 } 8713 } 8714 save_dst = dst; 8715 dst = gw; 8716 gw = INADDR_ANY; 8717 } 8718 8719 /* 8720 * We obtain a partial IRE_CACHE which we will pass 8721 * along with the resolver query. When the response 8722 * comes back it will be there ready for us to add. 8723 * The ire_max_frag is atomically set under the 8724 * irebucket lock in ire_add_v[46]. 
8725 */ 8726 8727 ire = ire_create_mp( 8728 (uchar_t *)&dst, /* dest address */ 8729 (uchar_t *)&ip_g_all_ones, /* mask */ 8730 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8731 (uchar_t *)&gw, /* gateway address */ 8732 NULL, /* ire_max_frag */ 8733 NULL, /* no src nce */ 8734 dst_ill->ill_rq, /* recv-from queue */ 8735 dst_ill->ill_wq, /* send-to queue */ 8736 IRE_CACHE, 8737 src_ipif, /* Interface ipif */ 8738 save_ire->ire_mask, /* Parent mask */ 8739 0, 8740 save_ire->ire_ihandle, /* Interface handle */ 8741 0, /* flags if any */ 8742 &(save_ire->ire_uinfo), 8743 NULL, 8744 NULL, 8745 ipst); 8746 8747 if (ire == NULL) { 8748 ire_refrele(save_ire); 8749 break; 8750 } 8751 8752 if ((sire != NULL) && 8753 (sire->ire_flags & RTF_MULTIRT)) { 8754 copy_mp = copymsg(first_mp); 8755 if (copy_mp != NULL) 8756 MULTIRT_DEBUG_TAG(copy_mp); 8757 } 8758 8759 ire->ire_marks |= ire_marks; 8760 8761 /* 8762 * Construct message chain for the resolver 8763 * of the form: 8764 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8765 * Packet could contain a IPSEC_OUT mp. 8766 * 8767 * NOTE : ire will be added later when the response 8768 * comes back from ARP. If the response does not 8769 * come back, ARP frees the packet. For this reason, 8770 * we can't REFHOLD the bucket of save_ire to prevent 8771 * deletions. We may not be able to REFRELE the bucket 8772 * if the response never comes back. Thus, before 8773 * adding the ire, ire_add_v4 will make sure that the 8774 * interface route does not get deleted. This is the 8775 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8776 * where we can always prevent deletions because of 8777 * the synchronous nature of adding IRES i.e 8778 * ire_add_then_send is called after creating the IRE. 
8779 */ 8780 ASSERT(ire->ire_mp != NULL); 8781 ire->ire_mp->b_cont = first_mp; 8782 /* Have saved_mp handy, for cleanup if canput fails */ 8783 saved_mp = mp; 8784 mp = copyb(res_mp); 8785 if (mp == NULL) { 8786 /* Prepare for cleanup */ 8787 mp = saved_mp; /* pkt */ 8788 ire_delete(ire); /* ire_mp */ 8789 ire = NULL; 8790 ire_refrele(save_ire); 8791 if (copy_mp != NULL) { 8792 MULTIRT_DEBUG_UNTAG(copy_mp); 8793 freemsg(copy_mp); 8794 copy_mp = NULL; 8795 } 8796 break; 8797 } 8798 linkb(mp, ire->ire_mp); 8799 8800 /* 8801 * Fill in the source and dest addrs for the resolver. 8802 * NOTE: this depends on memory layouts imposed by 8803 * ill_init(). 8804 */ 8805 areq = (areq_t *)mp->b_rptr; 8806 addrp = (ipaddr_t *)((char *)areq + 8807 areq->areq_sender_addr_offset); 8808 if (do_attach_ill) { 8809 /* 8810 * This is bind to no failover case. 8811 * arp packet also must go out on attach_ill. 8812 */ 8813 ASSERT(ipha->ipha_src != NULL); 8814 *addrp = ipha->ipha_src; 8815 } else { 8816 *addrp = save_ire->ire_src_addr; 8817 } 8818 8819 ire_refrele(save_ire); 8820 addrp = (ipaddr_t *)((char *)areq + 8821 areq->areq_target_addr_offset); 8822 *addrp = dst; 8823 /* Up to the resolver. */ 8824 if (canputnext(dst_ill->ill_rq) && 8825 !(dst_ill->ill_arp_closing)) { 8826 putnext(dst_ill->ill_rq, mp); 8827 ire = NULL; 8828 if (copy_mp != NULL) { 8829 /* 8830 * If we found a resolver, we ignore 8831 * any trailing top priority IRE_CACHE 8832 * in the further loops. This ensures 8833 * that we do not omit any resolver. 8834 * IRE_CACHE entries, if any, will be 8835 * processed next time we enter 8836 * ip_newroute(). 8837 */ 8838 multirt_flags &= ~MULTIRT_CACHEGW; 8839 /* 8840 * Search for the next unresolved 8841 * multirt route. 8842 */ 8843 first_mp = copy_mp; 8844 copy_mp = NULL; 8845 /* Prepare the next resolution loop. 
*/ 8846 mp = first_mp; 8847 EXTRACT_PKT_MP(mp, first_mp, 8848 mctl_present); 8849 if (mctl_present) 8850 io = (ipsec_out_t *) 8851 first_mp->b_rptr; 8852 ipha = (ipha_t *)mp->b_rptr; 8853 8854 ASSERT(sire != NULL); 8855 8856 dst = save_dst; 8857 multirt_resolve_next = B_TRUE; 8858 continue; 8859 } 8860 8861 if (sire != NULL) 8862 ire_refrele(sire); 8863 8864 /* 8865 * The response will come back in ip_wput 8866 * with db_type IRE_DB_TYPE. 8867 */ 8868 ipif_refrele(src_ipif); 8869 ill_refrele(dst_ill); 8870 return; 8871 } else { 8872 /* Prepare for cleanup */ 8873 DTRACE_PROBE1(ip__newroute__drop, mblk_t *, 8874 mp); 8875 mp->b_cont = NULL; 8876 freeb(mp); /* areq */ 8877 /* 8878 * this is an ire that is not added to the 8879 * cache. ire_freemblk will handle the release 8880 * of any resources associated with the ire. 8881 */ 8882 ire_delete(ire); /* ire_mp */ 8883 mp = saved_mp; /* pkt */ 8884 ire = NULL; 8885 if (copy_mp != NULL) { 8886 MULTIRT_DEBUG_UNTAG(copy_mp); 8887 freemsg(copy_mp); 8888 copy_mp = NULL; 8889 } 8890 break; 8891 } 8892 default: 8893 break; 8894 } 8895 } while (multirt_resolve_next); 8896 8897 ip1dbg(("ip_newroute: dropped\n")); 8898 /* Did this packet originate externally? 
*/ 8899 if (mp->b_prev) { 8900 mp->b_next = NULL; 8901 mp->b_prev = NULL; 8902 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 8903 } else { 8904 if (dst_ill != NULL) { 8905 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); 8906 } else { 8907 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 8908 } 8909 } 8910 ASSERT(copy_mp == NULL); 8911 MULTIRT_DEBUG_UNTAG(first_mp); 8912 freemsg(first_mp); 8913 if (ire != NULL) 8914 ire_refrele(ire); 8915 if (sire != NULL) 8916 ire_refrele(sire); 8917 if (src_ipif != NULL) 8918 ipif_refrele(src_ipif); 8919 if (dst_ill != NULL) 8920 ill_refrele(dst_ill); 8921 return; 8922 8923 icmp_err_ret: 8924 ip1dbg(("ip_newroute: no route\n")); 8925 if (src_ipif != NULL) 8926 ipif_refrele(src_ipif); 8927 if (dst_ill != NULL) 8928 ill_refrele(dst_ill); 8929 if (sire != NULL) 8930 ire_refrele(sire); 8931 /* Did this packet originate externally? */ 8932 if (mp->b_prev) { 8933 mp->b_next = NULL; 8934 mp->b_prev = NULL; 8935 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes); 8936 q = WR(q); 8937 } else { 8938 /* 8939 * There is no outgoing ill, so just increment the 8940 * system MIB. 8941 */ 8942 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 8943 /* 8944 * Since ip_wput() isn't close to finished, we fill 8945 * in enough of the header for credible error reporting. 8946 */ 8947 if (ip_hdr_complete(ipha, zoneid, ipst)) { 8948 /* Failed */ 8949 MULTIRT_DEBUG_UNTAG(first_mp); 8950 freemsg(first_mp); 8951 if (ire != NULL) 8952 ire_refrele(ire); 8953 return; 8954 } 8955 } 8956 8957 /* 8958 * At this point we will have ire only if RTF_BLACKHOLE 8959 * or RTF_REJECT flags are set on the IRE. It will not 8960 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 
8961 */ 8962 if (ire != NULL) { 8963 if (ire->ire_flags & RTF_BLACKHOLE) { 8964 ire_refrele(ire); 8965 MULTIRT_DEBUG_UNTAG(first_mp); 8966 freemsg(first_mp); 8967 return; 8968 } 8969 ire_refrele(ire); 8970 } 8971 if (ip_source_routed(ipha, ipst)) { 8972 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED, 8973 zoneid, ipst); 8974 return; 8975 } 8976 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 8977 } 8978 8979 ip_opt_info_t zero_info; 8980 8981 /* 8982 * IPv4 - 8983 * ip_newroute_ipif is called by ip_wput_multicast and 8984 * ip_rput_forward_multicast whenever we need to send 8985 * out a packet to a destination address for which we do not have specific 8986 * routing information. It is used when the packet will be sent out 8987 * on a specific interface. It is also called by ip_wput() when IP_BOUND_IF 8988 * socket option is set or icmp error message wants to go out on a particular 8989 * interface for a unicast packet. 8990 * 8991 * In most cases, the destination address is resolved thanks to the ipif 8992 * intrinsic resolver. However, there are some cases where the call to 8993 * ip_newroute_ipif must take into account the potential presence of 8994 * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire 8995 * that uses the interface. This is specified through flags, 8996 * which can be a combination of: 8997 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8998 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8999 * and flags. Additionally, the packet source address has to be set to 9000 * the specified address. The caller is thus expected to set this flag 9001 * if the packet has no specific source address yet. 9002 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT 9003 * flag, the resulting ire will inherit the flag. All unresolved routes 9004 * to the destination must be explored in the same call to 9005 * ip_newroute_ipif(). 
9006 */ 9007 static void 9008 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, 9009 conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop) 9010 { 9011 areq_t *areq; 9012 ire_t *ire = NULL; 9013 mblk_t *res_mp; 9014 ipaddr_t *addrp; 9015 mblk_t *first_mp; 9016 ire_t *save_ire = NULL; 9017 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ 9018 ipif_t *src_ipif = NULL; 9019 ushort_t ire_marks = 0; 9020 ill_t *dst_ill = NULL; 9021 boolean_t mctl_present; 9022 ipsec_out_t *io; 9023 ipha_t *ipha; 9024 int ihandle = 0; 9025 mblk_t *saved_mp; 9026 ire_t *fire = NULL; 9027 mblk_t *copy_mp = NULL; 9028 boolean_t multirt_resolve_next; 9029 boolean_t unspec_src; 9030 ipaddr_t ipha_dst; 9031 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 9032 9033 /* 9034 * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold 9035 * here for uniformity 9036 */ 9037 ipif_refhold(ipif); 9038 9039 /* 9040 * This loop is run only once in most cases. 9041 * We loop to resolve further routes only when the destination 9042 * can be reached through multiple RTF_MULTIRT-flagged ires. 9043 */ 9044 do { 9045 if (dst_ill != NULL) { 9046 ill_refrele(dst_ill); 9047 dst_ill = NULL; 9048 } 9049 if (src_ipif != NULL) { 9050 ipif_refrele(src_ipif); 9051 src_ipif = NULL; 9052 } 9053 multirt_resolve_next = B_FALSE; 9054 9055 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), 9056 ipif->ipif_ill->ill_name)); 9057 9058 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 9059 if (mctl_present) 9060 io = (ipsec_out_t *)first_mp->b_rptr; 9061 9062 ipha = (ipha_t *)mp->b_rptr; 9063 9064 /* 9065 * Save the packet destination address, we may need it after 9066 * the packet has been consumed. 9067 */ 9068 ipha_dst = ipha->ipha_dst; 9069 9070 /* 9071 * If the interface is a pt-pt interface we look for an 9072 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the 9073 * local_address and the pt-pt destination address. Otherwise 9074 * we just match the local address. 
9075 * NOTE: dst could be different than ipha->ipha_dst in case 9076 * of sending igmp multicast packets over a point-to-point 9077 * connection. 9078 * Thus we must be careful enough to check ipha_dst to be a 9079 * multicast address, otherwise it will take xmit_if path for 9080 * multicast packets resulting into kernel stack overflow by 9081 * repeated calls to ip_newroute_ipif from ire_send(). 9082 */ 9083 if (CLASSD(ipha_dst) && 9084 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { 9085 goto err_ret; 9086 } 9087 9088 /* 9089 * We check if an IRE_OFFSUBNET for the addr that goes through 9090 * ipif exists. We need it to determine if the RTF_SETSRC and/or 9091 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may 9092 * propagate its flags to the new ire. 9093 */ 9094 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { 9095 fire = ipif_lookup_multi_ire(ipif, ipha_dst); 9096 ip2dbg(("ip_newroute_ipif: " 9097 "ipif_lookup_multi_ire(" 9098 "ipif %p, dst %08x) = fire %p\n", 9099 (void *)ipif, ntohl(dst), (void *)fire)); 9100 } 9101 9102 if (mctl_present && io->ipsec_out_attach_if) { 9103 attach_ill = ip_grab_attach_ill(NULL, first_mp, 9104 io->ipsec_out_ill_index, B_FALSE, ipst); 9105 9106 /* Failure case frees things for us. */ 9107 if (attach_ill == NULL) { 9108 ipif_refrele(ipif); 9109 if (fire != NULL) 9110 ire_refrele(fire); 9111 return; 9112 } 9113 9114 /* 9115 * Check if we need an ire that will not be 9116 * looked up by anybody else i.e. HIDDEN. 9117 */ 9118 if (ill_is_probeonly(attach_ill)) { 9119 ire_marks = IRE_MARK_HIDDEN; 9120 } 9121 /* 9122 * ip_wput passes the right ipif for IPIF_NOFAILOVER 9123 * case. 9124 */ 9125 dst_ill = ipif->ipif_ill; 9126 /* attach_ill has been refheld by ip_grab_attach_ill */ 9127 ASSERT(dst_ill == attach_ill); 9128 } else { 9129 /* 9130 * If the interface belongs to an interface group, 9131 * make sure the next possible interface in the group 9132 * is used. 
This encourages load spreading among 9133 * peers in an interface group. 9134 * Note: load spreading is disabled for RTF_MULTIRT 9135 * routes. 9136 */ 9137 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9138 (fire->ire_flags & RTF_MULTIRT)) { 9139 /* 9140 * Don't perform outbound load spreading 9141 * in the case of an RTF_MULTIRT issued route, 9142 * we actually typically want to replicate 9143 * outgoing packets through particular 9144 * interfaces. 9145 */ 9146 dst_ill = ipif->ipif_ill; 9147 ill_refhold(dst_ill); 9148 } else { 9149 dst_ill = ip_newroute_get_dst_ill( 9150 ipif->ipif_ill); 9151 } 9152 if (dst_ill == NULL) { 9153 if (ip_debug > 2) { 9154 pr_addr_dbg("ip_newroute_ipif: " 9155 "no dst ill for dst %s\n", 9156 AF_INET, &dst); 9157 } 9158 goto err_ret; 9159 } 9160 } 9161 9162 /* 9163 * Pick a source address preferring non-deprecated ones. 9164 * Unlike ip_newroute, we don't do any source address 9165 * selection here since for multicast it really does not help 9166 * in inbound load spreading as in the unicast case. 9167 */ 9168 if ((flags & RTF_SETSRC) && (fire != NULL) && 9169 (fire->ire_flags & RTF_SETSRC)) { 9170 /* 9171 * As requested by flags, an IRE_OFFSUBNET was looked up 9172 * on that interface. This ire has RTF_SETSRC flag, so 9173 * the source address of the packet must be changed. 9174 * Check that the ipif matching the requested source 9175 * address still exists. 
9176 */ 9177 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, 9178 zoneid, NULL, NULL, NULL, NULL, ipst); 9179 } 9180 9181 unspec_src = (connp != NULL && connp->conn_unspec_src); 9182 9183 if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || 9184 (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || 9185 (connp != NULL && ipif->ipif_zoneid != zoneid && 9186 ipif->ipif_zoneid != ALL_ZONES)) && 9187 (src_ipif == NULL) && 9188 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 9189 src_ipif = ipif_select_source(dst_ill, dst, zoneid); 9190 if (src_ipif == NULL) { 9191 if (ip_debug > 2) { 9192 /* ip1dbg */ 9193 pr_addr_dbg("ip_newroute_ipif: " 9194 "no src for dst %s", 9195 AF_INET, &dst); 9196 } 9197 ip1dbg((" through interface %s\n", 9198 dst_ill->ill_name)); 9199 goto err_ret; 9200 } 9201 ipif_refrele(ipif); 9202 ipif = src_ipif; 9203 ipif_refhold(ipif); 9204 } 9205 if (src_ipif == NULL) { 9206 src_ipif = ipif; 9207 ipif_refhold(src_ipif); 9208 } 9209 9210 /* 9211 * Assign a source address while we have the conn. 9212 * We can't have ip_wput_ire pick a source address when the 9213 * packet returns from arp since conn_unspec_src might be set 9214 * and we lose the conn when going through arp. 9215 */ 9216 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 9217 ipha->ipha_src = src_ipif->ipif_src_addr; 9218 9219 /* 9220 * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible 9221 * that the outgoing interface does not have an interface ire. 9222 */ 9223 if (CLASSD(ipha_dst) && (connp == NULL || 9224 connp->conn_outgoing_ill == NULL) && 9225 infop->ip_opt_ill_index == 0) { 9226 /* ipif_to_ire returns an held ire */ 9227 ire = ipif_to_ire(ipif); 9228 if (ire == NULL) 9229 goto err_ret; 9230 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 9231 goto err_ret; 9232 /* 9233 * ihandle is needed when the ire is added to 9234 * cache table. 
9235 */ 9236 save_ire = ire; 9237 ihandle = save_ire->ire_ihandle; 9238 9239 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " 9240 "flags %04x\n", 9241 (void *)ire, (void *)ipif, flags)); 9242 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9243 (fire->ire_flags & RTF_MULTIRT)) { 9244 /* 9245 * As requested by flags, an IRE_OFFSUBNET was 9246 * looked up on that interface. This ire has 9247 * RTF_MULTIRT flag, so the resolution loop will 9248 * be re-entered to resolve additional routes on 9249 * other interfaces. For that purpose, a copy of 9250 * the packet is performed at this point. 9251 */ 9252 fire->ire_last_used_time = lbolt; 9253 copy_mp = copymsg(first_mp); 9254 if (copy_mp) { 9255 MULTIRT_DEBUG_TAG(copy_mp); 9256 } 9257 } 9258 if ((flags & RTF_SETSRC) && (fire != NULL) && 9259 (fire->ire_flags & RTF_SETSRC)) { 9260 /* 9261 * As requested by flags, an IRE_OFFSUBET was 9262 * looked up on that interface. This ire has 9263 * RTF_SETSRC flag, so the source address of the 9264 * packet must be changed. 9265 */ 9266 ipha->ipha_src = fire->ire_src_addr; 9267 } 9268 } else { 9269 ASSERT((connp == NULL) || 9270 (connp->conn_outgoing_ill != NULL) || 9271 (connp->conn_dontroute) || 9272 infop->ip_opt_ill_index != 0); 9273 /* 9274 * The only ways we can come here are: 9275 * 1) IP_BOUND_IF socket option is set 9276 * 2) SO_DONTROUTE socket option is set 9277 * 3) IP_PKTINFO option is passed in as ancillary data. 9278 * In all cases, the new ire will not be added 9279 * into cache table. 9280 */ 9281 ire_marks |= IRE_MARK_NOADD; 9282 } 9283 9284 switch (ipif->ipif_net_type) { 9285 case IRE_IF_NORESOLVER: { 9286 /* We have what we need to build an IRE_CACHE. 
*/ 9287 9288 if ((dst_ill->ill_phys_addr_length != IP_ADDR_LEN) && 9289 (dst_ill->ill_resolver_mp == NULL)) { 9290 ip1dbg(("ip_newroute_ipif: dst_ill %p " 9291 "for IRE_IF_NORESOLVER ire %p has " 9292 "no ill_resolver_mp\n", 9293 (void *)dst_ill, (void *)ire)); 9294 break; 9295 } 9296 9297 /* 9298 * The new ire inherits the IRE_OFFSUBNET flags 9299 * and source address, if this was requested. 9300 */ 9301 ire = ire_create( 9302 (uchar_t *)&dst, /* dest address */ 9303 (uchar_t *)&ip_g_all_ones, /* mask */ 9304 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9305 NULL, /* gateway address */ 9306 &ipif->ipif_mtu, 9307 NULL, /* no src nce */ 9308 dst_ill->ill_rq, /* recv-from queue */ 9309 dst_ill->ill_wq, /* send-to queue */ 9310 IRE_CACHE, 9311 src_ipif, 9312 (save_ire != NULL ? save_ire->ire_mask : 0), 9313 (fire != NULL) ? /* Parent handle */ 9314 fire->ire_phandle : 0, 9315 ihandle, /* Interface handle */ 9316 (fire != NULL) ? 9317 (fire->ire_flags & 9318 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9319 (save_ire == NULL ? &ire_uinfo_null : 9320 &save_ire->ire_uinfo), 9321 NULL, 9322 NULL, 9323 ipst); 9324 9325 if (ire == NULL) { 9326 if (save_ire != NULL) 9327 ire_refrele(save_ire); 9328 break; 9329 } 9330 9331 ire->ire_marks |= ire_marks; 9332 9333 /* 9334 * If IRE_MARK_NOADD is set then we need to convert 9335 * the max_fragp to a useable value now. This is 9336 * normally done in ire_add_v[46]. We also need to 9337 * associate the ire with an nce (normally would be 9338 * done in ip_wput_nondata()). 9339 * 9340 * Note that IRE_MARK_NOADD packets created here 9341 * do not have a non-null ire_mp pointer. The null 9342 * value of ire_bucket indicates that they were 9343 * never added. 
9344 */ 9345 if (ire->ire_marks & IRE_MARK_NOADD) { 9346 uint_t max_frag; 9347 9348 max_frag = *ire->ire_max_fragp; 9349 ire->ire_max_fragp = NULL; 9350 ire->ire_max_frag = max_frag; 9351 9352 if ((ire->ire_nce = ndp_lookup_v4( 9353 ire_to_ill(ire), 9354 (ire->ire_gateway_addr != INADDR_ANY ? 9355 &ire->ire_gateway_addr : &ire->ire_addr), 9356 B_FALSE)) == NULL) { 9357 if (save_ire != NULL) 9358 ire_refrele(save_ire); 9359 break; 9360 } 9361 ASSERT(ire->ire_nce->nce_state == 9362 ND_REACHABLE); 9363 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 9364 } 9365 9366 /* Prevent save_ire from getting deleted */ 9367 if (save_ire != NULL) { 9368 IRB_REFHOLD(save_ire->ire_bucket); 9369 /* Has it been removed already ? */ 9370 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 9371 IRB_REFRELE(save_ire->ire_bucket); 9372 ire_refrele(save_ire); 9373 break; 9374 } 9375 } 9376 9377 ire_add_then_send(q, ire, first_mp); 9378 9379 /* Assert that save_ire is not deleted yet. */ 9380 if (save_ire != NULL) { 9381 ASSERT(save_ire->ire_ptpn != NULL); 9382 IRB_REFRELE(save_ire->ire_bucket); 9383 ire_refrele(save_ire); 9384 save_ire = NULL; 9385 } 9386 if (fire != NULL) { 9387 ire_refrele(fire); 9388 fire = NULL; 9389 } 9390 9391 /* 9392 * the resolution loop is re-entered if this 9393 * was requested through flags and if we 9394 * actually are in a multirouting case. 9395 */ 9396 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9397 boolean_t need_resolve = 9398 ire_multirt_need_resolve(ipha_dst, 9399 MBLK_GETLABEL(copy_mp), ipst); 9400 if (!need_resolve) { 9401 MULTIRT_DEBUG_UNTAG(copy_mp); 9402 freemsg(copy_mp); 9403 copy_mp = NULL; 9404 } else { 9405 /* 9406 * ipif_lookup_group() calls 9407 * ire_lookup_multi() that uses 9408 * ire_ftable_lookup() to find 9409 * an IRE_INTERFACE for the group. 9410 * In the multirt case, 9411 * ire_lookup_multi() then invokes 9412 * ire_multirt_lookup() to find 9413 * the next resolvable ire. 
9414 * As a result, we obtain an new 9415 * interface, derived from the 9416 * next ire. 9417 */ 9418 ipif_refrele(ipif); 9419 ipif = ipif_lookup_group(ipha_dst, 9420 zoneid, ipst); 9421 ip2dbg(("ip_newroute_ipif: " 9422 "multirt dst %08x, ipif %p\n", 9423 htonl(dst), (void *)ipif)); 9424 if (ipif != NULL) { 9425 mp = copy_mp; 9426 copy_mp = NULL; 9427 multirt_resolve_next = B_TRUE; 9428 continue; 9429 } else { 9430 freemsg(copy_mp); 9431 } 9432 } 9433 } 9434 if (ipif != NULL) 9435 ipif_refrele(ipif); 9436 ill_refrele(dst_ill); 9437 ipif_refrele(src_ipif); 9438 return; 9439 } 9440 case IRE_IF_RESOLVER: 9441 /* 9442 * We can't build an IRE_CACHE yet, but at least 9443 * we found a resolver that can help. 9444 */ 9445 res_mp = dst_ill->ill_resolver_mp; 9446 if (!OK_RESOLVER_MP(res_mp)) 9447 break; 9448 9449 /* 9450 * We obtain a partial IRE_CACHE which we will pass 9451 * along with the resolver query. When the response 9452 * comes back it will be there ready for us to add. 9453 * The new ire inherits the IRE_OFFSUBNET flags 9454 * and source address, if this was requested. 9455 * The ire_max_frag is atomically set under the 9456 * irebucket lock in ire_add_v[46]. Only in the 9457 * case of IRE_MARK_NOADD, we set it here itself. 9458 */ 9459 ire = ire_create_mp( 9460 (uchar_t *)&dst, /* dest address */ 9461 (uchar_t *)&ip_g_all_ones, /* mask */ 9462 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9463 NULL, /* gateway address */ 9464 (ire_marks & IRE_MARK_NOADD) ? 9465 ipif->ipif_mtu : 0, /* max_frag */ 9466 NULL, /* no src nce */ 9467 dst_ill->ill_rq, /* recv-from queue */ 9468 dst_ill->ill_wq, /* send-to queue */ 9469 IRE_CACHE, 9470 src_ipif, 9471 (save_ire != NULL ? save_ire->ire_mask : 0), 9472 (fire != NULL) ? /* Parent handle */ 9473 fire->ire_phandle : 0, 9474 ihandle, /* Interface handle */ 9475 (fire != NULL) ? /* flags if any */ 9476 (fire->ire_flags & 9477 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9478 (save_ire == NULL ? 
&ire_uinfo_null : 9479 &save_ire->ire_uinfo), 9480 NULL, 9481 NULL, 9482 ipst); 9483 9484 if (save_ire != NULL) { 9485 ire_refrele(save_ire); 9486 save_ire = NULL; 9487 } 9488 if (ire == NULL) 9489 break; 9490 9491 ire->ire_marks |= ire_marks; 9492 /* 9493 * Construct message chain for the resolver of the 9494 * form: 9495 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 9496 * 9497 * NOTE : ire will be added later when the response 9498 * comes back from ARP. If the response does not 9499 * come back, ARP frees the packet. For this reason, 9500 * we can't REFHOLD the bucket of save_ire to prevent 9501 * deletions. We may not be able to REFRELE the 9502 * bucket if the response never comes back. 9503 * Thus, before adding the ire, ire_add_v4 will make 9504 * sure that the interface route does not get deleted. 9505 * This is the only case unlike ip_newroute_v6, 9506 * ip_newroute_ipif_v6 where we can always prevent 9507 * deletions because ire_add_then_send is called after 9508 * creating the IRE. 9509 * If IRE_MARK_NOADD is set, then ire_add_then_send 9510 * does not add this IRE into the IRE CACHE. 9511 */ 9512 ASSERT(ire->ire_mp != NULL); 9513 ire->ire_mp->b_cont = first_mp; 9514 /* Have saved_mp handy, for cleanup if canput fails */ 9515 saved_mp = mp; 9516 mp = copyb(res_mp); 9517 if (mp == NULL) { 9518 /* Prepare for cleanup */ 9519 mp = saved_mp; /* pkt */ 9520 ire_delete(ire); /* ire_mp */ 9521 ire = NULL; 9522 if (copy_mp != NULL) { 9523 MULTIRT_DEBUG_UNTAG(copy_mp); 9524 freemsg(copy_mp); 9525 copy_mp = NULL; 9526 } 9527 break; 9528 } 9529 linkb(mp, ire->ire_mp); 9530 9531 /* 9532 * Fill in the source and dest addrs for the resolver. 9533 * NOTE: this depends on memory layouts imposed by 9534 * ill_init(). 
9535 */ 9536 areq = (areq_t *)mp->b_rptr; 9537 addrp = (ipaddr_t *)((char *)areq + 9538 areq->areq_sender_addr_offset); 9539 *addrp = ire->ire_src_addr; 9540 addrp = (ipaddr_t *)((char *)areq + 9541 areq->areq_target_addr_offset); 9542 *addrp = dst; 9543 /* Up to the resolver. */ 9544 if (canputnext(dst_ill->ill_rq) && 9545 !(dst_ill->ill_arp_closing)) { 9546 putnext(dst_ill->ill_rq, mp); 9547 /* 9548 * The response will come back in ip_wput 9549 * with db_type IRE_DB_TYPE. 9550 */ 9551 } else { 9552 mp->b_cont = NULL; 9553 freeb(mp); /* areq */ 9554 ire_delete(ire); /* ire_mp */ 9555 saved_mp->b_next = NULL; 9556 saved_mp->b_prev = NULL; 9557 freemsg(first_mp); /* pkt */ 9558 ip2dbg(("ip_newroute_ipif: dropped\n")); 9559 } 9560 9561 if (fire != NULL) { 9562 ire_refrele(fire); 9563 fire = NULL; 9564 } 9565 9566 9567 /* 9568 * The resolution loop is re-entered if this was 9569 * requested through flags and we actually are 9570 * in a multirouting case. 9571 */ 9572 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9573 boolean_t need_resolve = 9574 ire_multirt_need_resolve(ipha_dst, 9575 MBLK_GETLABEL(copy_mp), ipst); 9576 if (!need_resolve) { 9577 MULTIRT_DEBUG_UNTAG(copy_mp); 9578 freemsg(copy_mp); 9579 copy_mp = NULL; 9580 } else { 9581 /* 9582 * ipif_lookup_group() calls 9583 * ire_lookup_multi() that uses 9584 * ire_ftable_lookup() to find 9585 * an IRE_INTERFACE for the group. 9586 * In the multirt case, 9587 * ire_lookup_multi() then invokes 9588 * ire_multirt_lookup() to find 9589 * the next resolvable ire. 9590 * As a result, we obtain an new 9591 * interface, derived from the 9592 * next ire. 
9593 */ 9594 ipif_refrele(ipif); 9595 ipif = ipif_lookup_group(ipha_dst, 9596 zoneid, ipst); 9597 if (ipif != NULL) { 9598 mp = copy_mp; 9599 copy_mp = NULL; 9600 multirt_resolve_next = B_TRUE; 9601 continue; 9602 } else { 9603 freemsg(copy_mp); 9604 } 9605 } 9606 } 9607 if (ipif != NULL) 9608 ipif_refrele(ipif); 9609 ill_refrele(dst_ill); 9610 ipif_refrele(src_ipif); 9611 return; 9612 default: 9613 break; 9614 } 9615 } while (multirt_resolve_next); 9616 9617 err_ret: 9618 ip2dbg(("ip_newroute_ipif: dropped\n")); 9619 if (fire != NULL) 9620 ire_refrele(fire); 9621 ipif_refrele(ipif); 9622 /* Did this packet originate externally? */ 9623 if (dst_ill != NULL) 9624 ill_refrele(dst_ill); 9625 if (src_ipif != NULL) 9626 ipif_refrele(src_ipif); 9627 if (mp->b_prev || mp->b_next) { 9628 mp->b_next = NULL; 9629 mp->b_prev = NULL; 9630 } else { 9631 /* 9632 * Since ip_wput() isn't close to finished, we fill 9633 * in enough of the header for credible error reporting. 9634 */ 9635 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 9636 /* Failed */ 9637 freemsg(first_mp); 9638 if (ire != NULL) 9639 ire_refrele(ire); 9640 return; 9641 } 9642 } 9643 /* 9644 * At this point we will have ire only if RTF_BLACKHOLE 9645 * or RTF_REJECT flags are set on the IRE. It will not 9646 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 9647 */ 9648 if (ire != NULL) { 9649 if (ire->ire_flags & RTF_BLACKHOLE) { 9650 ire_refrele(ire); 9651 freemsg(first_mp); 9652 return; 9653 } 9654 ire_refrele(ire); 9655 } 9656 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 9657 } 9658 9659 /* Name/Value Table Lookup Routine */ 9660 char * 9661 ip_nv_lookup(nv_t *nv, int value) 9662 { 9663 if (!nv) 9664 return (NULL); 9665 for (; nv->nv_name; nv++) { 9666 if (nv->nv_value == value) 9667 return (nv->nv_name); 9668 } 9669 return ("unknown"); 9670 } 9671 9672 /* 9673 * This is a module open, i.e. this is a control stream for access 9674 * to a DLPI device. 
We allocate an ill_t as the instance data in 9675 * this case. 9676 */ 9677 int 9678 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9679 { 9680 ill_t *ill; 9681 int err; 9682 zoneid_t zoneid; 9683 netstack_t *ns; 9684 ip_stack_t *ipst; 9685 9686 /* 9687 * Prevent unprivileged processes from pushing IP so that 9688 * they can't send raw IP. 9689 */ 9690 if (secpolicy_net_rawaccess(credp) != 0) 9691 return (EPERM); 9692 9693 ns = netstack_find_by_cred(credp); 9694 ASSERT(ns != NULL); 9695 ipst = ns->netstack_ip; 9696 ASSERT(ipst != NULL); 9697 9698 /* 9699 * For exclusive stacks we set the zoneid to zero 9700 * to make IP operate as if in the global zone. 9701 */ 9702 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9703 zoneid = GLOBAL_ZONEID; 9704 else 9705 zoneid = crgetzoneid(credp); 9706 9707 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t)); 9708 q->q_ptr = WR(q)->q_ptr = ill; 9709 ill->ill_ipst = ipst; 9710 ill->ill_zoneid = zoneid; 9711 9712 /* 9713 * ill_init initializes the ill fields and then sends down 9714 * down a DL_INFO_REQ after calling qprocson. 9715 */ 9716 err = ill_init(q, ill); 9717 if (err != 0) { 9718 mi_free(ill); 9719 netstack_rele(ipst->ips_netstack); 9720 q->q_ptr = NULL; 9721 WR(q)->q_ptr = NULL; 9722 return (err); 9723 } 9724 9725 /* ill_init initializes the ipsq marking this thread as writer */ 9726 ipsq_exit(ill->ill_phyint->phyint_ipsq); 9727 /* Wait for the DL_INFO_ACK */ 9728 mutex_enter(&ill->ill_lock); 9729 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { 9730 /* 9731 * Return value of 0 indicates a pending signal. 9732 */ 9733 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); 9734 if (err == 0) { 9735 mutex_exit(&ill->ill_lock); 9736 (void) ip_close(q, 0); 9737 return (EINTR); 9738 } 9739 } 9740 mutex_exit(&ill->ill_lock); 9741 9742 /* 9743 * ip_rput_other could have set an error in ill_error on 9744 * receipt of M_ERROR. 
9745 */ 9746 9747 err = ill->ill_error; 9748 if (err != 0) { 9749 (void) ip_close(q, 0); 9750 return (err); 9751 } 9752 9753 ill->ill_credp = credp; 9754 crhold(credp); 9755 9756 mutex_enter(&ipst->ips_ip_mi_lock); 9757 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, 9758 credp); 9759 mutex_exit(&ipst->ips_ip_mi_lock); 9760 if (err) { 9761 (void) ip_close(q, 0); 9762 return (err); 9763 } 9764 return (0); 9765 } 9766 9767 /* For /dev/ip aka AF_INET open */ 9768 int 9769 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9770 { 9771 return (ip_open(q, devp, flag, sflag, credp, B_FALSE)); 9772 } 9773 9774 /* For /dev/ip6 aka AF_INET6 open */ 9775 int 9776 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9777 { 9778 return (ip_open(q, devp, flag, sflag, credp, B_TRUE)); 9779 } 9780 9781 /* IP open routine. */ 9782 int 9783 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 9784 boolean_t isv6) 9785 { 9786 conn_t *connp; 9787 major_t maj; 9788 zoneid_t zoneid; 9789 netstack_t *ns; 9790 ip_stack_t *ipst; 9791 9792 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 9793 9794 /* Allow reopen. */ 9795 if (q->q_ptr != NULL) 9796 return (0); 9797 9798 if (sflag & MODOPEN) { 9799 /* This is a module open */ 9800 return (ip_modopen(q, devp, flag, sflag, credp)); 9801 } 9802 9803 ns = netstack_find_by_cred(credp); 9804 ASSERT(ns != NULL); 9805 ipst = ns->netstack_ip; 9806 ASSERT(ipst != NULL); 9807 9808 /* 9809 * For exclusive stacks we set the zoneid to zero 9810 * to make IP operate as if in the global zone. 9811 */ 9812 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9813 zoneid = GLOBAL_ZONEID; 9814 else 9815 zoneid = crgetzoneid(credp); 9816 9817 /* 9818 * We are opening as a device. This is an IP client stream, and we 9819 * allocate an conn_t as the instance data. 
9820 */ 9821 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack); 9822 9823 /* 9824 * ipcl_conn_create did a netstack_hold. Undo the hold that was 9825 * done by netstack_find_by_cred() 9826 */ 9827 netstack_rele(ipst->ips_netstack); 9828 9829 connp->conn_zoneid = zoneid; 9830 9831 connp->conn_upq = q; 9832 q->q_ptr = WR(q)->q_ptr = connp; 9833 9834 if (flag & SO_SOCKSTR) 9835 connp->conn_flags |= IPCL_SOCKET; 9836 9837 /* Minor tells us which /dev entry was opened */ 9838 if (isv6) { 9839 connp->conn_flags |= IPCL_ISV6; 9840 connp->conn_af_isv6 = B_TRUE; 9841 ip_setpktversion(connp, isv6, B_FALSE, ipst); 9842 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9843 } else { 9844 connp->conn_af_isv6 = B_FALSE; 9845 connp->conn_pkt_isv6 = B_FALSE; 9846 } 9847 9848 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 9849 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 9850 connp->conn_minor_arena = ip_minor_arena_la; 9851 } else { 9852 /* 9853 * Either minor numbers in the large arena were exhausted 9854 * or a non socket application is doing the open. 9855 * Try to allocate from the small arena. 9856 */ 9857 if ((connp->conn_dev = 9858 inet_minor_alloc(ip_minor_arena_sa)) == 0) { 9859 /* CONN_DEC_REF takes care of netstack_rele() */ 9860 q->q_ptr = WR(q)->q_ptr = NULL; 9861 CONN_DEC_REF(connp); 9862 return (EBUSY); 9863 } 9864 connp->conn_minor_arena = ip_minor_arena_sa; 9865 } 9866 9867 maj = getemajor(*devp); 9868 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9869 9870 /* 9871 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9872 */ 9873 connp->conn_cred = credp; 9874 9875 /* 9876 * Handle IP_RTS_REQUEST and other ioctls which use conn_recv 9877 */ 9878 connp->conn_recv = ip_conn_input; 9879 9880 crhold(connp->conn_cred); 9881 9882 /* 9883 * If the caller has the process-wide flag set, then default to MAC 9884 * exempt mode. This allows read-down to unlabeled hosts. 
9885 */ 9886 if (getpflags(NET_MAC_AWARE, credp) != 0) 9887 connp->conn_mac_exempt = B_TRUE; 9888 9889 connp->conn_rq = q; 9890 connp->conn_wq = WR(q); 9891 9892 /* Non-zero default values */ 9893 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9894 9895 /* 9896 * Make the conn globally visible to walkers 9897 */ 9898 ASSERT(connp->conn_ref == 1); 9899 mutex_enter(&connp->conn_lock); 9900 connp->conn_state_flags &= ~CONN_INCIPIENT; 9901 mutex_exit(&connp->conn_lock); 9902 9903 qprocson(q); 9904 9905 return (0); 9906 } 9907 9908 /* 9909 * Change the output format (IPv4 vs. IPv6) for a conn_t. 9910 * Note that there is no race since either ip_output function works - it 9911 * is just an optimization to enter the best ip_output routine directly. 9912 */ 9913 void 9914 ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, 9915 ip_stack_t *ipst) 9916 { 9917 if (isv6) { 9918 if (bump_mib) { 9919 BUMP_MIB(&ipst->ips_ip6_mib, 9920 ipIfStatsOutSwitchIPVersion); 9921 } 9922 connp->conn_send = ip_output_v6; 9923 connp->conn_pkt_isv6 = B_TRUE; 9924 } else { 9925 if (bump_mib) { 9926 BUMP_MIB(&ipst->ips_ip_mib, 9927 ipIfStatsOutSwitchIPVersion); 9928 } 9929 connp->conn_send = ip_output; 9930 connp->conn_pkt_isv6 = B_FALSE; 9931 } 9932 9933 } 9934 9935 /* 9936 * See if IPsec needs loading because of the options in mp. 9937 */ 9938 static boolean_t 9939 ipsec_opt_present(mblk_t *mp) 9940 { 9941 uint8_t *optcp, *next_optcp, *opt_endcp; 9942 struct opthdr *opt; 9943 struct T_opthdr *topt; 9944 int opthdr_len; 9945 t_uscalar_t optname, optlevel; 9946 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9947 ipsec_req_t *ipsr; 9948 9949 /* 9950 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9951 * return TRUE. 
9952 */ 9953 9954 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9955 opt_endcp = optcp + tor->OPT_length; 9956 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9957 opthdr_len = sizeof (struct T_opthdr); 9958 } else { /* O_OPTMGMT_REQ */ 9959 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9960 opthdr_len = sizeof (struct opthdr); 9961 } 9962 for (; optcp < opt_endcp; optcp = next_optcp) { 9963 if (optcp + opthdr_len > opt_endcp) 9964 return (B_FALSE); /* Not enough option header. */ 9965 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9966 topt = (struct T_opthdr *)optcp; 9967 optlevel = topt->level; 9968 optname = topt->name; 9969 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); 9970 } else { 9971 opt = (struct opthdr *)optcp; 9972 optlevel = opt->level; 9973 optname = opt->name; 9974 next_optcp = optcp + opthdr_len + 9975 _TPI_ALIGN_OPT(opt->len); 9976 } 9977 if ((next_optcp < optcp) || /* wraparound pointer space */ 9978 ((next_optcp >= opt_endcp) && /* last option bad len */ 9979 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) 9980 return (B_FALSE); /* bad option buffer */ 9981 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || 9982 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { 9983 /* 9984 * Check to see if it's an all-bypass or all-zeroes 9985 * IPsec request. Don't bother loading IPsec if 9986 * the socket doesn't want to use it. (A good example 9987 * is a bypass request.) 9988 * 9989 * Basically, if any of the non-NEVER bits are set, 9990 * load IPsec. 9991 */ 9992 ipsr = (ipsec_req_t *)(optcp + opthdr_len); 9993 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || 9994 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || 9995 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) 9996 != 0) 9997 return (B_TRUE); 9998 } 9999 } 10000 return (B_FALSE); 10001 } 10002 10003 /* 10004 * If conn is is waiting for ipsec to finish loading, kick it. 
10005 */ 10006 /* ARGSUSED */ 10007 static void 10008 conn_restart_ipsec_waiter(conn_t *connp, void *arg) 10009 { 10010 t_scalar_t optreq_prim; 10011 mblk_t *mp; 10012 cred_t *cr; 10013 int err = 0; 10014 10015 /* 10016 * This function is called, after ipsec loading is complete. 10017 * Since IP checks exclusively and atomically (i.e it prevents 10018 * ipsec load from completing until ip_optcom_req completes) 10019 * whether ipsec load is complete, there cannot be a race with IP 10020 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 10021 */ 10022 mutex_enter(&connp->conn_lock); 10023 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { 10024 ASSERT(connp->conn_ipsec_opt_mp != NULL); 10025 mp = connp->conn_ipsec_opt_mp; 10026 connp->conn_ipsec_opt_mp = NULL; 10027 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; 10028 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp))); 10029 mutex_exit(&connp->conn_lock); 10030 10031 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 10032 10033 optreq_prim = ((union T_primitives *)mp->b_rptr)->type; 10034 if (optreq_prim == T_OPTMGMT_REQ) { 10035 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10036 &ip_opt_obj, B_FALSE); 10037 } else { 10038 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); 10039 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10040 &ip_opt_obj, B_FALSE); 10041 } 10042 if (err != EINPROGRESS) 10043 CONN_OPER_PENDING_DONE(connp); 10044 return; 10045 } 10046 mutex_exit(&connp->conn_lock); 10047 } 10048 10049 /* 10050 * Called from the ipsec_loader thread, outside any perimeter, to tell 10051 * ip qenable any of the queues waiting for the ipsec loader to 10052 * complete. 10053 */ 10054 void 10055 ip_ipsec_load_complete(ipsec_stack_t *ipss) 10056 { 10057 netstack_t *ns = ipss->ipsec_netstack; 10058 10059 ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); 10060 } 10061 10062 /* 10063 * Can't be used. Need to call svr4* -> optset directly. 
the leaf routine 10064 * determines the grp on which it has to become exclusive, queues the mp 10065 * and sq draining restarts the optmgmt 10066 */ 10067 static boolean_t 10068 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 10069 { 10070 conn_t *connp = Q_TO_CONN(q); 10071 ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; 10072 10073 /* 10074 * Take IPsec requests and treat them special. 10075 */ 10076 if (ipsec_opt_present(mp)) { 10077 /* First check if IPsec is loaded. */ 10078 mutex_enter(&ipss->ipsec_loader_lock); 10079 if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { 10080 mutex_exit(&ipss->ipsec_loader_lock); 10081 return (B_FALSE); 10082 } 10083 mutex_enter(&connp->conn_lock); 10084 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 10085 10086 ASSERT(connp->conn_ipsec_opt_mp == NULL); 10087 connp->conn_ipsec_opt_mp = mp; 10088 mutex_exit(&connp->conn_lock); 10089 mutex_exit(&ipss->ipsec_loader_lock); 10090 10091 ipsec_loader_loadnow(ipss); 10092 return (B_TRUE); 10093 } 10094 return (B_FALSE); 10095 } 10096 10097 /* 10098 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 10099 * all of them are copied to the conn_t. If the req is "zero", the policy is 10100 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 10101 * fields. 10102 * We keep only the latest setting of the policy and thus policy setting 10103 * is not incremental/cumulative. 10104 * 10105 * Requests to set policies with multiple alternative actions will 10106 * go through a different API. 
10107 */ 10108 int 10109 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 10110 { 10111 uint_t ah_req = 0; 10112 uint_t esp_req = 0; 10113 uint_t se_req = 0; 10114 ipsec_selkey_t sel; 10115 ipsec_act_t *actp = NULL; 10116 uint_t nact; 10117 ipsec_policy_t *pin4 = NULL, *pout4 = NULL; 10118 ipsec_policy_t *pin6 = NULL, *pout6 = NULL; 10119 ipsec_policy_root_t *pr; 10120 ipsec_policy_head_t *ph; 10121 int fam; 10122 boolean_t is_pol_reset; 10123 int error = 0; 10124 netstack_t *ns = connp->conn_netstack; 10125 ip_stack_t *ipst = ns->netstack_ip; 10126 ipsec_stack_t *ipss = ns->netstack_ipsec; 10127 10128 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 10129 10130 /* 10131 * The IP_SEC_OPT option does not allow variable length parameters, 10132 * hence a request cannot be NULL. 10133 */ 10134 if (req == NULL) 10135 return (EINVAL); 10136 10137 ah_req = req->ipsr_ah_req; 10138 esp_req = req->ipsr_esp_req; 10139 se_req = req->ipsr_self_encap_req; 10140 10141 /* Don't allow setting self-encap without one or more of AH/ESP. */ 10142 if (se_req != 0 && esp_req == 0 && ah_req == 0) 10143 return (EINVAL); 10144 10145 /* 10146 * Are we dealing with a request to reset the policy (i.e. 10147 * zero requests). 10148 */ 10149 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 10150 (esp_req & REQ_MASK) == 0 && 10151 (se_req & REQ_MASK) == 0); 10152 10153 if (!is_pol_reset) { 10154 /* 10155 * If we couldn't load IPsec, fail with "protocol 10156 * not supported". 10157 * IPsec may not have been loaded for a request with zero 10158 * policies, so we don't fail in this case. 10159 */ 10160 mutex_enter(&ipss->ipsec_loader_lock); 10161 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 10162 mutex_exit(&ipss->ipsec_loader_lock); 10163 return (EPROTONOSUPPORT); 10164 } 10165 mutex_exit(&ipss->ipsec_loader_lock); 10166 10167 /* 10168 * Test for valid requests. 
Invalid algorithms 10169 * need to be tested by IPsec code because new 10170 * algorithms can be added dynamically. 10171 */ 10172 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10173 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10174 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 10175 return (EINVAL); 10176 } 10177 10178 /* 10179 * Only privileged users can issue these 10180 * requests. 10181 */ 10182 if (((ah_req & IPSEC_PREF_NEVER) || 10183 (esp_req & IPSEC_PREF_NEVER) || 10184 (se_req & IPSEC_PREF_NEVER)) && 10185 secpolicy_ip_config(cr, B_FALSE) != 0) { 10186 return (EPERM); 10187 } 10188 10189 /* 10190 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 10191 * are mutually exclusive. 10192 */ 10193 if (((ah_req & REQ_MASK) == REQ_MASK) || 10194 ((esp_req & REQ_MASK) == REQ_MASK) || 10195 ((se_req & REQ_MASK) == REQ_MASK)) { 10196 /* Both of them are set */ 10197 return (EINVAL); 10198 } 10199 } 10200 10201 mutex_enter(&connp->conn_lock); 10202 10203 /* 10204 * If we have already cached policies in ip_bind_connected*(), don't 10205 * let them change now. We cache policies for connections 10206 * whose src,dst [addr, port] is known. 10207 */ 10208 if (connp->conn_policy_cached) { 10209 mutex_exit(&connp->conn_lock); 10210 return (EINVAL); 10211 } 10212 10213 /* 10214 * We have a zero policies, reset the connection policy if already 10215 * set. This will cause the connection to inherit the 10216 * global policy, if any. 
10217 */ 10218 if (is_pol_reset) { 10219 if (connp->conn_policy != NULL) { 10220 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); 10221 connp->conn_policy = NULL; 10222 } 10223 connp->conn_flags &= ~IPCL_CHECK_POLICY; 10224 connp->conn_in_enforce_policy = B_FALSE; 10225 connp->conn_out_enforce_policy = B_FALSE; 10226 mutex_exit(&connp->conn_lock); 10227 return (0); 10228 } 10229 10230 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy, 10231 ipst->ips_netstack); 10232 if (ph == NULL) 10233 goto enomem; 10234 10235 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack); 10236 if (actp == NULL) 10237 goto enomem; 10238 10239 /* 10240 * Always allocate IPv4 policy entries, since they can also 10241 * apply to ipv6 sockets being used in ipv4-compat mode. 10242 */ 10243 bzero(&sel, sizeof (sel)); 10244 sel.ipsl_valid = IPSL_IPV4; 10245 10246 pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10247 ipst->ips_netstack); 10248 if (pin4 == NULL) 10249 goto enomem; 10250 10251 pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10252 ipst->ips_netstack); 10253 if (pout4 == NULL) 10254 goto enomem; 10255 10256 if (connp->conn_af_isv6) { 10257 /* 10258 * We're looking at a v6 socket, also allocate the 10259 * v6-specific entries... 10260 */ 10261 sel.ipsl_valid = IPSL_IPV6; 10262 pin6 = ipsec_policy_create(&sel, actp, nact, 10263 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10264 if (pin6 == NULL) 10265 goto enomem; 10266 10267 pout6 = ipsec_policy_create(&sel, actp, nact, 10268 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10269 if (pout6 == NULL) 10270 goto enomem; 10271 10272 /* 10273 * .. and file them away in the right place. 
10274 */ 10275 fam = IPSEC_AF_V6; 10276 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10277 HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]); 10278 ipsec_insert_always(&ph->iph_rulebyid, pin6); 10279 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10280 HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]); 10281 ipsec_insert_always(&ph->iph_rulebyid, pout6); 10282 } 10283 10284 ipsec_actvec_free(actp, nact); 10285 10286 /* 10287 * File the v4 policies. 10288 */ 10289 fam = IPSEC_AF_V4; 10290 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10291 HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]); 10292 ipsec_insert_always(&ph->iph_rulebyid, pin4); 10293 10294 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10295 HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]); 10296 ipsec_insert_always(&ph->iph_rulebyid, pout4); 10297 10298 /* 10299 * If the requests need security, set enforce_policy. 10300 * If the requests are IPSEC_PREF_NEVER, one should 10301 * still set conn_out_enforce_policy so that an ipsec_out 10302 * gets attached in ip_wput. This is needed so that 10303 * for connections that we don't cache policy in ip_bind, 10304 * if global policy matches in ip_wput_attach_policy, we 10305 * don't wrongly inherit global policy. Similarly, we need 10306 * to set conn_in_enforce_policy also so that we don't verify 10307 * policy wrongly. 10308 */ 10309 if ((ah_req & REQ_MASK) != 0 || 10310 (esp_req & REQ_MASK) != 0 || 10311 (se_req & REQ_MASK) != 0) { 10312 connp->conn_in_enforce_policy = B_TRUE; 10313 connp->conn_out_enforce_policy = B_TRUE; 10314 connp->conn_flags |= IPCL_CHECK_POLICY; 10315 } 10316 10317 mutex_exit(&connp->conn_lock); 10318 return (error); 10319 #undef REQ_MASK 10320 10321 /* 10322 * Common memory-allocation-failure exit path. 
10323 */ 10324 enomem: 10325 mutex_exit(&connp->conn_lock); 10326 if (actp != NULL) 10327 ipsec_actvec_free(actp, nact); 10328 if (pin4 != NULL) 10329 IPPOL_REFRELE(pin4, ipst->ips_netstack); 10330 if (pout4 != NULL) 10331 IPPOL_REFRELE(pout4, ipst->ips_netstack); 10332 if (pin6 != NULL) 10333 IPPOL_REFRELE(pin6, ipst->ips_netstack); 10334 if (pout6 != NULL) 10335 IPPOL_REFRELE(pout6, ipst->ips_netstack); 10336 return (ENOMEM); 10337 } 10338 10339 /* 10340 * Only for options that pass in an IP addr. Currently only V4 options 10341 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 10342 * So this function assumes level is IPPROTO_IP 10343 */ 10344 int 10345 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 10346 mblk_t *first_mp) 10347 { 10348 ipif_t *ipif = NULL; 10349 int error; 10350 ill_t *ill; 10351 int zoneid; 10352 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10353 10354 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 10355 10356 if (addr != INADDR_ANY || checkonly) { 10357 ASSERT(connp != NULL); 10358 zoneid = IPCL_ZONEID(connp); 10359 if (option == IP_NEXTHOP) { 10360 ipif = ipif_lookup_onlink_addr(addr, 10361 connp->conn_zoneid, ipst); 10362 } else { 10363 ipif = ipif_lookup_addr(addr, NULL, zoneid, 10364 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 10365 &error, ipst); 10366 } 10367 if (ipif == NULL) { 10368 if (error == EINPROGRESS) 10369 return (error); 10370 else if ((option == IP_MULTICAST_IF) || 10371 (option == IP_NEXTHOP)) 10372 return (EHOSTUNREACH); 10373 else 10374 return (EINVAL); 10375 } else if (checkonly) { 10376 if (option == IP_MULTICAST_IF) { 10377 ill = ipif->ipif_ill; 10378 /* not supported by the virtual network iface */ 10379 if (IS_VNI(ill)) { 10380 ipif_refrele(ipif); 10381 return (EINVAL); 10382 } 10383 } 10384 ipif_refrele(ipif); 10385 return (0); 10386 } 10387 ill = ipif->ipif_ill; 10388 mutex_enter(&connp->conn_lock); 10389 mutex_enter(&ill->ill_lock); 10390 if 
((ill->ill_state_flags & ILL_CONDEMNED) || 10391 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 10392 mutex_exit(&ill->ill_lock); 10393 mutex_exit(&connp->conn_lock); 10394 ipif_refrele(ipif); 10395 return (option == IP_MULTICAST_IF ? 10396 EHOSTUNREACH : EINVAL); 10397 } 10398 } else { 10399 mutex_enter(&connp->conn_lock); 10400 } 10401 10402 /* None of the options below are supported on the VNI */ 10403 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 10404 mutex_exit(&ill->ill_lock); 10405 mutex_exit(&connp->conn_lock); 10406 ipif_refrele(ipif); 10407 return (EINVAL); 10408 } 10409 10410 switch (option) { 10411 case IP_DONTFAILOVER_IF: 10412 /* 10413 * This option is used by in.mpathd to ensure 10414 * that IPMP probe packets only go out on the 10415 * test interfaces. in.mpathd sets this option 10416 * on the non-failover interfaces. 10417 * For backward compatibility, this option 10418 * implicitly sets IP_MULTICAST_IF, as used 10419 * be done in bind(), so that ip_wput gets 10420 * this ipif to send mcast packets. 10421 */ 10422 if (ipif != NULL) { 10423 ASSERT(addr != INADDR_ANY); 10424 connp->conn_nofailover_ill = ipif->ipif_ill; 10425 connp->conn_multicast_ipif = ipif; 10426 } else { 10427 ASSERT(addr == INADDR_ANY); 10428 connp->conn_nofailover_ill = NULL; 10429 connp->conn_multicast_ipif = NULL; 10430 } 10431 break; 10432 10433 case IP_MULTICAST_IF: 10434 connp->conn_multicast_ipif = ipif; 10435 break; 10436 case IP_NEXTHOP: 10437 connp->conn_nexthop_v4 = addr; 10438 connp->conn_nexthop_set = B_TRUE; 10439 break; 10440 } 10441 10442 if (ipif != NULL) { 10443 mutex_exit(&ill->ill_lock); 10444 mutex_exit(&connp->conn_lock); 10445 ipif_refrele(ipif); 10446 return (0); 10447 } 10448 mutex_exit(&connp->conn_lock); 10449 /* We succeded in cleared the option */ 10450 return (0); 10451 } 10452 10453 /* 10454 * For options that pass in an ifindex specifying the ill. V6 options always 10455 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
*/
/*
 * ip_opt_set_ill() summary:
 *
 * ifindex == 0 clears the option selected by level/option.  A non-zero
 * ifindex is looked up with ill_lookup_on_ifindex(); with checkonly set the
 * function only validates the ill (rejecting a VNI) and returns without
 * touching the conn.
 *
 * Locking as visible below: conn_lock is always held from "setit" onwards;
 * ill_lock is held as well, and an ill reference is outstanding, exactly
 * when ill is non-NULL at that point.  Every path that sets ill to NULL
 * first drops ill_lock and the reference.
 *
 * Returns 0 on success, EINPROGRESS when the lookup reports it, and EINVAL
 * for a VNI or when a non-zero ifindex did not yield a usable ill.
 */
int
ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
    int level, int option, mblk_t *first_mp)
{
	ill_t *ill = NULL;
	int error = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex));
	if (ifindex != 0) {
		ASSERT(connp != NULL);
		ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp),
		    first_mp, ip_restart_optmgmt, &error, ipst);
		if (ill != NULL) {
			if (checkonly) {
				/* not supported by the virtual network iface */
				if (IS_VNI(ill)) {
					ill_refrele(ill);
					return (EINVAL);
				}
				ill_refrele(ill);
				return (0);
			}
			/*
			 * The zone lookup failed for this ill: treat it as
			 * not found and continue with ill == NULL.
			 */
			if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid,
			    0, NULL)) {
				ill_refrele(ill);
				ill = NULL;
				mutex_enter(&connp->conn_lock);
				goto setit;
			}
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);
			if (ill->ill_state_flags & ILL_CONDEMNED) {
				/*
				 * Drop both locks before releasing the ill,
				 * then retake conn_lock for "setit".
				 */
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_refrele(ill);
				ill = NULL;
				mutex_enter(&connp->conn_lock);
			}
			goto setit;
		} else if (error == EINPROGRESS) {
			return (error);
		} else {
			error = 0;
		}
	}
	mutex_enter(&connp->conn_lock);
setit:
	ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6));

	/*
	 * The options below assume that the ILL (if any) transmits and/or
	 * receives traffic. Neither of which is true for the virtual network
	 * interface, so fail setting these on a VNI.
	 *
	 * NOTE(review): ill may be NULL here; this relies on IS_VNI()
	 * tolerating a NULL ill (the ASSERT below suggests it does) --
	 * confirm against the macro definition.
	 */
	if (IS_VNI(ill)) {
		ASSERT(ill != NULL);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ill_refrele(ill);
		return (EINVAL);
	}

	if (level == IPPROTO_IP) {
		switch (option) {
		case IP_BOUND_IF:
			connp->conn_incoming_ill = ill;
			connp->conn_outgoing_ill = ill;
			connp->conn_orig_bound_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IP_MULTICAST_IF:
			/*
			 * This option is an internal special. The socket
			 * level IP_MULTICAST_IF specifies an 'ipaddr' and
			 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF
			 * specifies an ifindex and we try first on V6 ill's.
			 * If we don't find one, we then try using v4 ill's
			 * internally and we come here.
			 */
			if (!checkonly && ill != NULL) {
				ipif_t	*ipif;
				ipif = ill->ill_ipif;

				if (ipif->ipif_state_flags & IPIF_CONDEMNED) {
					mutex_exit(&ill->ill_lock);
					mutex_exit(&connp->conn_lock);
					ill_refrele(ill);
					ill = NULL;
					mutex_enter(&connp->conn_lock);
				} else {
					connp->conn_multicast_ipif = ipif;
				}
			}
			break;

		case IP_DHCPINIT_IF:
			if (connp->conn_dhcpinit_ill != NULL) {
				/*
				 * We've locked the conn so conn_cleanup_ill()
				 * cannot clear conn_dhcpinit_ill -- so it's
				 * safe to access the ill.
				 */
				ill_t *oill = connp->conn_dhcpinit_ill;

				ASSERT(oill->ill_dhcpinit != 0);
				atomic_dec_32(&oill->ill_dhcpinit);
				connp->conn_dhcpinit_ill = NULL;
			}

			if (ill != NULL) {
				connp->conn_dhcpinit_ill = ill;
				atomic_inc_32(&ill->ill_dhcpinit);
			}
			break;
		}
	} else {
		switch (option) {
		case IPV6_BOUND_IF:
			connp->conn_incoming_ill = ill;
			connp->conn_outgoing_ill = ill;
			connp->conn_orig_bound_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IPV6_BOUND_PIF:
			/*
			 * Limit all transmit to this ill.
			 * Unlike IPV6_BOUND_IF, using this option
			 * prevents load spreading and failover from
			 * happening when the interface is part of the
			 * group. That's why we don't need to remember
			 * the ifindex in orig_bound_ifindex as in
			 * IPV6_BOUND_IF.
			 */
			connp->conn_outgoing_pill = ill;
			break;

		case IPV6_DONTFAILOVER_IF:
			/*
			 * This option is used by in.mpathd to ensure
			 * that IPMP probe packets only go out on the
			 * test interfaces. in.mpathd sets this option
			 * on the non-failover interfaces.
			 */
			connp->conn_nofailover_ill = ill;
			/*
			 * For backward compatibility, this option
			 * implicitly sets ip_multicast_ill as used in
			 * IPV6_MULTICAST_IF so that ip_wput gets
			 * this ill to send mcast packets.
			 */
			connp->conn_multicast_ill = ill;
			connp->conn_orig_multicast_ifindex = (ill == NULL) ?
			    0 : ifindex;
			break;

		case IPV6_MULTICAST_IF:
			/*
			 * Set conn_multicast_ill to be the IPv6 ill.
			 * Set conn_multicast_ipif to be an IPv4 ipif
			 * for ifindex to make IPv4 mapped addresses
			 * on PF_INET6 sockets honor IPV6_MULTICAST_IF.
			 * Even if no IPv6 ill exists for the ifindex
			 * we need to check for an IPv4 ifindex in order
			 * for this to work with mapped addresses. In that
			 * case only set conn_multicast_ipif.
			 */
			if (!checkonly) {
				if (ifindex == 0) {
					connp->conn_multicast_ill = NULL;
					connp->conn_orig_multicast_ifindex = 0;
					connp->conn_multicast_ipif = NULL;
				} else if (ill != NULL) {
					connp->conn_multicast_ill = ill;
					connp->conn_orig_multicast_ifindex =
					    ifindex;
				}
			}
			break;
		}
	}

	if (ill != NULL) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ill_refrele(ill);
		return (0);
	}
	mutex_exit(&connp->conn_lock);
	/*
	 * We succeeded in clearing the option (ifindex == 0) or failed to
	 * locate the ill and could not set the option (ifindex != 0)
	 */
	return (ifindex == 0 ? 0 : EINVAL);
}

/* This routine sets socket options. */
/* ARGSUSED */
int
ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *dummy, cred_t *cr, mblk_t *first_mp)
{
	int		*i1 = (int *)invalp;
	conn_t		*connp = Q_TO_CONN(q);
	int		error = 0;
	boolean_t	checkonly;
	ire_t		*ire;
	boolean_t	found;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (optset_context) {

	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
10681 */ 10682 if (inlen == 0) { 10683 *outlenp = 0; 10684 return (0); 10685 } 10686 break; 10687 case SETFN_OPTCOM_NEGOTIATE: 10688 case SETFN_UD_NEGOTIATE: 10689 case SETFN_CONN_NEGOTIATE: 10690 checkonly = B_FALSE; 10691 break; 10692 default: 10693 /* 10694 * We should never get here 10695 */ 10696 *outlenp = 0; 10697 return (EINVAL); 10698 } 10699 10700 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 10701 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 10702 10703 /* 10704 * For fixed length options, no sanity check 10705 * of passed in length is done. It is assumed *_optcom_req() 10706 * routines do the right thing. 10707 */ 10708 10709 switch (level) { 10710 case SOL_SOCKET: 10711 /* 10712 * conn_lock protects the bitfields, and is used to 10713 * set the fields atomically. 10714 */ 10715 switch (name) { 10716 case SO_BROADCAST: 10717 if (!checkonly) { 10718 /* TODO: use value someplace? */ 10719 mutex_enter(&connp->conn_lock); 10720 connp->conn_broadcast = *i1 ? 1 : 0; 10721 mutex_exit(&connp->conn_lock); 10722 } 10723 break; /* goto sizeof (int) option return */ 10724 case SO_USELOOPBACK: 10725 if (!checkonly) { 10726 /* TODO: use value someplace? */ 10727 mutex_enter(&connp->conn_lock); 10728 connp->conn_loopback = *i1 ? 1 : 0; 10729 mutex_exit(&connp->conn_lock); 10730 } 10731 break; /* goto sizeof (int) option return */ 10732 case SO_DONTROUTE: 10733 if (!checkonly) { 10734 mutex_enter(&connp->conn_lock); 10735 connp->conn_dontroute = *i1 ? 1 : 0; 10736 mutex_exit(&connp->conn_lock); 10737 } 10738 break; /* goto sizeof (int) option return */ 10739 case SO_REUSEADDR: 10740 if (!checkonly) { 10741 mutex_enter(&connp->conn_lock); 10742 connp->conn_reuseaddr = *i1 ? 
1 : 0; 10743 mutex_exit(&connp->conn_lock); 10744 } 10745 break; /* goto sizeof (int) option return */ 10746 case SO_PROTOTYPE: 10747 if (!checkonly) { 10748 mutex_enter(&connp->conn_lock); 10749 connp->conn_proto = *i1; 10750 mutex_exit(&connp->conn_lock); 10751 } 10752 break; /* goto sizeof (int) option return */ 10753 case SO_ALLZONES: 10754 if (!checkonly) { 10755 mutex_enter(&connp->conn_lock); 10756 if (IPCL_IS_BOUND(connp)) { 10757 mutex_exit(&connp->conn_lock); 10758 return (EINVAL); 10759 } 10760 connp->conn_allzones = *i1 != 0 ? 1 : 0; 10761 mutex_exit(&connp->conn_lock); 10762 } 10763 break; /* goto sizeof (int) option return */ 10764 case SO_ANON_MLP: 10765 if (!checkonly) { 10766 mutex_enter(&connp->conn_lock); 10767 connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; 10768 mutex_exit(&connp->conn_lock); 10769 } 10770 break; /* goto sizeof (int) option return */ 10771 case SO_MAC_EXEMPT: 10772 if (secpolicy_net_mac_aware(cr) != 0 || 10773 IPCL_IS_BOUND(connp)) 10774 return (EACCES); 10775 if (!checkonly) { 10776 mutex_enter(&connp->conn_lock); 10777 connp->conn_mac_exempt = *i1 != 0 ? 
1 : 0; 10778 mutex_exit(&connp->conn_lock); 10779 } 10780 break; /* goto sizeof (int) option return */ 10781 default: 10782 /* 10783 * "soft" error (negative) 10784 * option not handled at this level 10785 * Note: Do not modify *outlenp 10786 */ 10787 return (-EINVAL); 10788 } 10789 break; 10790 case IPPROTO_IP: 10791 switch (name) { 10792 case IP_NEXTHOP: 10793 if (secpolicy_ip_config(cr, B_FALSE) != 0) 10794 return (EPERM); 10795 /* FALLTHRU */ 10796 case IP_MULTICAST_IF: 10797 case IP_DONTFAILOVER_IF: { 10798 ipaddr_t addr = *i1; 10799 10800 error = ip_opt_set_ipif(connp, addr, checkonly, name, 10801 first_mp); 10802 if (error != 0) 10803 return (error); 10804 break; /* goto sizeof (int) option return */ 10805 } 10806 10807 case IP_MULTICAST_TTL: 10808 /* Recorded in transport above IP */ 10809 *outvalp = *invalp; 10810 *outlenp = sizeof (uchar_t); 10811 return (0); 10812 case IP_MULTICAST_LOOP: 10813 if (!checkonly) { 10814 mutex_enter(&connp->conn_lock); 10815 connp->conn_multicast_loop = *invalp ? 
1 : 0; 10816 mutex_exit(&connp->conn_lock); 10817 } 10818 *outvalp = *invalp; 10819 *outlenp = sizeof (uchar_t); 10820 return (0); 10821 case IP_ADD_MEMBERSHIP: 10822 case MCAST_JOIN_GROUP: 10823 case IP_DROP_MEMBERSHIP: 10824 case MCAST_LEAVE_GROUP: { 10825 struct ip_mreq *mreqp; 10826 struct group_req *greqp; 10827 ire_t *ire; 10828 boolean_t done = B_FALSE; 10829 ipaddr_t group, ifaddr; 10830 struct sockaddr_in *sin; 10831 uint32_t *ifindexp; 10832 boolean_t mcast_opt = B_TRUE; 10833 mcast_record_t fmode; 10834 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10835 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10836 10837 switch (name) { 10838 case IP_ADD_MEMBERSHIP: 10839 mcast_opt = B_FALSE; 10840 /* FALLTHRU */ 10841 case MCAST_JOIN_GROUP: 10842 fmode = MODE_IS_EXCLUDE; 10843 optfn = ip_opt_add_group; 10844 break; 10845 10846 case IP_DROP_MEMBERSHIP: 10847 mcast_opt = B_FALSE; 10848 /* FALLTHRU */ 10849 case MCAST_LEAVE_GROUP: 10850 fmode = MODE_IS_INCLUDE; 10851 optfn = ip_opt_delete_group; 10852 break; 10853 } 10854 10855 if (mcast_opt) { 10856 greqp = (struct group_req *)i1; 10857 sin = (struct sockaddr_in *)&greqp->gr_group; 10858 if (sin->sin_family != AF_INET) { 10859 *outlenp = 0; 10860 return (ENOPROTOOPT); 10861 } 10862 group = (ipaddr_t)sin->sin_addr.s_addr; 10863 ifaddr = INADDR_ANY; 10864 ifindexp = &greqp->gr_interface; 10865 } else { 10866 mreqp = (struct ip_mreq *)i1; 10867 group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; 10868 ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; 10869 ifindexp = NULL; 10870 } 10871 10872 /* 10873 * In the multirouting case, we need to replicate 10874 * the request on all interfaces that will take part 10875 * in replication. We do so because multirouting is 10876 * reflective, thus we will probably receive multi- 10877 * casts on those interfaces. 10878 * The ip_multirt_apply_membership() succeeds if the 10879 * operation succeeds on at least one interface. 
10880 */ 10881 ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, 10882 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10883 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10884 if (ire != NULL) { 10885 if (ire->ire_flags & RTF_MULTIRT) { 10886 error = ip_multirt_apply_membership( 10887 optfn, ire, connp, checkonly, group, 10888 fmode, INADDR_ANY, first_mp); 10889 done = B_TRUE; 10890 } 10891 ire_refrele(ire); 10892 } 10893 if (!done) { 10894 error = optfn(connp, checkonly, group, ifaddr, 10895 ifindexp, fmode, INADDR_ANY, first_mp); 10896 } 10897 if (error) { 10898 /* 10899 * EINPROGRESS is a soft error, needs retry 10900 * so don't make *outlenp zero. 10901 */ 10902 if (error != EINPROGRESS) 10903 *outlenp = 0; 10904 return (error); 10905 } 10906 /* OK return - copy input buffer into output buffer */ 10907 if (invalp != outvalp) { 10908 /* don't trust bcopy for identical src/dst */ 10909 bcopy(invalp, outvalp, inlen); 10910 } 10911 *outlenp = inlen; 10912 return (0); 10913 } 10914 case IP_BLOCK_SOURCE: 10915 case IP_UNBLOCK_SOURCE: 10916 case IP_ADD_SOURCE_MEMBERSHIP: 10917 case IP_DROP_SOURCE_MEMBERSHIP: 10918 case MCAST_BLOCK_SOURCE: 10919 case MCAST_UNBLOCK_SOURCE: 10920 case MCAST_JOIN_SOURCE_GROUP: 10921 case MCAST_LEAVE_SOURCE_GROUP: { 10922 struct ip_mreq_source *imreqp; 10923 struct group_source_req *gsreqp; 10924 in_addr_t grp, src, ifaddr = INADDR_ANY; 10925 uint32_t ifindex = 0; 10926 mcast_record_t fmode; 10927 struct sockaddr_in *sin; 10928 ire_t *ire; 10929 boolean_t mcast_opt = B_TRUE, done = B_FALSE; 10930 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10931 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10932 10933 switch (name) { 10934 case IP_BLOCK_SOURCE: 10935 mcast_opt = B_FALSE; 10936 /* FALLTHRU */ 10937 case MCAST_BLOCK_SOURCE: 10938 fmode = MODE_IS_EXCLUDE; 10939 optfn = ip_opt_add_group; 10940 break; 10941 10942 case IP_UNBLOCK_SOURCE: 10943 mcast_opt = B_FALSE; 10944 /* FALLTHRU */ 10945 case MCAST_UNBLOCK_SOURCE: 10946 fmode = 
MODE_IS_EXCLUDE; 10947 optfn = ip_opt_delete_group; 10948 break; 10949 10950 case IP_ADD_SOURCE_MEMBERSHIP: 10951 mcast_opt = B_FALSE; 10952 /* FALLTHRU */ 10953 case MCAST_JOIN_SOURCE_GROUP: 10954 fmode = MODE_IS_INCLUDE; 10955 optfn = ip_opt_add_group; 10956 break; 10957 10958 case IP_DROP_SOURCE_MEMBERSHIP: 10959 mcast_opt = B_FALSE; 10960 /* FALLTHRU */ 10961 case MCAST_LEAVE_SOURCE_GROUP: 10962 fmode = MODE_IS_INCLUDE; 10963 optfn = ip_opt_delete_group; 10964 break; 10965 } 10966 10967 if (mcast_opt) { 10968 gsreqp = (struct group_source_req *)i1; 10969 if (gsreqp->gsr_group.ss_family != AF_INET) { 10970 *outlenp = 0; 10971 return (ENOPROTOOPT); 10972 } 10973 sin = (struct sockaddr_in *)&gsreqp->gsr_group; 10974 grp = (ipaddr_t)sin->sin_addr.s_addr; 10975 sin = (struct sockaddr_in *)&gsreqp->gsr_source; 10976 src = (ipaddr_t)sin->sin_addr.s_addr; 10977 ifindex = gsreqp->gsr_interface; 10978 } else { 10979 imreqp = (struct ip_mreq_source *)i1; 10980 grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; 10981 src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; 10982 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; 10983 } 10984 10985 /* 10986 * In the multirouting case, we need to replicate 10987 * the request as noted in the mcast cases above. 10988 */ 10989 ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, 10990 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10991 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10992 if (ire != NULL) { 10993 if (ire->ire_flags & RTF_MULTIRT) { 10994 error = ip_multirt_apply_membership( 10995 optfn, ire, connp, checkonly, grp, 10996 fmode, src, first_mp); 10997 done = B_TRUE; 10998 } 10999 ire_refrele(ire); 11000 } 11001 if (!done) { 11002 error = optfn(connp, checkonly, grp, ifaddr, 11003 &ifindex, fmode, src, first_mp); 11004 } 11005 if (error != 0) { 11006 /* 11007 * EINPROGRESS is a soft error, needs retry 11008 * so don't make *outlenp zero. 
11009 */ 11010 if (error != EINPROGRESS) 11011 *outlenp = 0; 11012 return (error); 11013 } 11014 /* OK return - copy input buffer into output buffer */ 11015 if (invalp != outvalp) { 11016 bcopy(invalp, outvalp, inlen); 11017 } 11018 *outlenp = inlen; 11019 return (0); 11020 } 11021 case IP_SEC_OPT: 11022 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11023 if (error != 0) { 11024 *outlenp = 0; 11025 return (error); 11026 } 11027 break; 11028 case IP_HDRINCL: 11029 case IP_OPTIONS: 11030 case T_IP_OPTIONS: 11031 case IP_TOS: 11032 case T_IP_TOS: 11033 case IP_TTL: 11034 case IP_RECVDSTADDR: 11035 case IP_RECVOPTS: 11036 /* OK return - copy input buffer into output buffer */ 11037 if (invalp != outvalp) { 11038 /* don't trust bcopy for identical src/dst */ 11039 bcopy(invalp, outvalp, inlen); 11040 } 11041 *outlenp = inlen; 11042 return (0); 11043 case IP_RECVIF: 11044 /* Retrieve the inbound interface index */ 11045 if (!checkonly) { 11046 mutex_enter(&connp->conn_lock); 11047 connp->conn_recvif = *i1 ? 1 : 0; 11048 mutex_exit(&connp->conn_lock); 11049 } 11050 break; /* goto sizeof (int) option return */ 11051 case IP_RECVPKTINFO: 11052 if (!checkonly) { 11053 mutex_enter(&connp->conn_lock); 11054 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 11055 mutex_exit(&connp->conn_lock); 11056 } 11057 break; /* goto sizeof (int) option return */ 11058 case IP_RECVSLLA: 11059 /* Retrieve the source link layer address */ 11060 if (!checkonly) { 11061 mutex_enter(&connp->conn_lock); 11062 connp->conn_recvslla = *i1 ? 
1 : 0; 11063 mutex_exit(&connp->conn_lock); 11064 } 11065 break; /* goto sizeof (int) option return */ 11066 case MRT_INIT: 11067 case MRT_DONE: 11068 case MRT_ADD_VIF: 11069 case MRT_DEL_VIF: 11070 case MRT_ADD_MFC: 11071 case MRT_DEL_MFC: 11072 case MRT_ASSERT: 11073 if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { 11074 *outlenp = 0; 11075 return (error); 11076 } 11077 error = ip_mrouter_set((int)name, q, checkonly, 11078 (uchar_t *)invalp, inlen, first_mp); 11079 if (error) { 11080 *outlenp = 0; 11081 return (error); 11082 } 11083 /* OK return - copy input buffer into output buffer */ 11084 if (invalp != outvalp) { 11085 /* don't trust bcopy for identical src/dst */ 11086 bcopy(invalp, outvalp, inlen); 11087 } 11088 *outlenp = inlen; 11089 return (0); 11090 case IP_BOUND_IF: 11091 case IP_DHCPINIT_IF: 11092 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11093 level, name, first_mp); 11094 if (error != 0) 11095 return (error); 11096 break; /* goto sizeof (int) option return */ 11097 11098 case IP_UNSPEC_SRC: 11099 /* Allow sending with a zero source address */ 11100 if (!checkonly) { 11101 mutex_enter(&connp->conn_lock); 11102 connp->conn_unspec_src = *i1 ? 1 : 0; 11103 mutex_exit(&connp->conn_lock); 11104 } 11105 break; /* goto sizeof (int) option return */ 11106 default: 11107 /* 11108 * "soft" error (negative) 11109 * option not handled at this level 11110 * Note: Do not modify *outlenp 11111 */ 11112 return (-EINVAL); 11113 } 11114 break; 11115 case IPPROTO_IPV6: 11116 switch (name) { 11117 case IPV6_BOUND_IF: 11118 case IPV6_BOUND_PIF: 11119 case IPV6_DONTFAILOVER_IF: 11120 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11121 level, name, first_mp); 11122 if (error != 0) 11123 return (error); 11124 break; /* goto sizeof (int) option return */ 11125 11126 case IPV6_MULTICAST_IF: 11127 /* 11128 * The only possible errors are EINPROGRESS and 11129 * EINVAL. EINPROGRESS will be restarted and is not 11130 * a hard error. 
We call this option on both V4 and V6 11131 * If both return EINVAL, then this call returns 11132 * EINVAL. If at least one of them succeeds we 11133 * return success. 11134 */ 11135 found = B_FALSE; 11136 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11137 level, name, first_mp); 11138 if (error == EINPROGRESS) 11139 return (error); 11140 if (error == 0) 11141 found = B_TRUE; 11142 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11143 IPPROTO_IP, IP_MULTICAST_IF, first_mp); 11144 if (error == 0) 11145 found = B_TRUE; 11146 if (!found) 11147 return (error); 11148 break; /* goto sizeof (int) option return */ 11149 11150 case IPV6_MULTICAST_HOPS: 11151 /* Recorded in transport above IP */ 11152 break; /* goto sizeof (int) option return */ 11153 case IPV6_MULTICAST_LOOP: 11154 if (!checkonly) { 11155 mutex_enter(&connp->conn_lock); 11156 connp->conn_multicast_loop = *i1; 11157 mutex_exit(&connp->conn_lock); 11158 } 11159 break; /* goto sizeof (int) option return */ 11160 case IPV6_JOIN_GROUP: 11161 case MCAST_JOIN_GROUP: 11162 case IPV6_LEAVE_GROUP: 11163 case MCAST_LEAVE_GROUP: { 11164 struct ipv6_mreq *ip_mreqp; 11165 struct group_req *greqp; 11166 ire_t *ire; 11167 boolean_t done = B_FALSE; 11168 in6_addr_t groupv6; 11169 uint32_t ifindex; 11170 boolean_t mcast_opt = B_TRUE; 11171 mcast_record_t fmode; 11172 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11173 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11174 11175 switch (name) { 11176 case IPV6_JOIN_GROUP: 11177 mcast_opt = B_FALSE; 11178 /* FALLTHRU */ 11179 case MCAST_JOIN_GROUP: 11180 fmode = MODE_IS_EXCLUDE; 11181 optfn = ip_opt_add_group_v6; 11182 break; 11183 11184 case IPV6_LEAVE_GROUP: 11185 mcast_opt = B_FALSE; 11186 /* FALLTHRU */ 11187 case MCAST_LEAVE_GROUP: 11188 fmode = MODE_IS_INCLUDE; 11189 optfn = ip_opt_delete_group_v6; 11190 break; 11191 } 11192 11193 if (mcast_opt) { 11194 struct sockaddr_in *sin; 11195 struct sockaddr_in6 *sin6; 11196 greqp = (struct group_req 
*)i1; 11197 if (greqp->gr_group.ss_family == AF_INET) { 11198 sin = (struct sockaddr_in *) 11199 &(greqp->gr_group); 11200 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 11201 &groupv6); 11202 } else { 11203 sin6 = (struct sockaddr_in6 *) 11204 &(greqp->gr_group); 11205 groupv6 = sin6->sin6_addr; 11206 } 11207 ifindex = greqp->gr_interface; 11208 } else { 11209 ip_mreqp = (struct ipv6_mreq *)i1; 11210 groupv6 = ip_mreqp->ipv6mr_multiaddr; 11211 ifindex = ip_mreqp->ipv6mr_interface; 11212 } 11213 /* 11214 * In the multirouting case, we need to replicate 11215 * the request on all interfaces that will take part 11216 * in replication. We do so because multirouting is 11217 * reflective, thus we will probably receive multi- 11218 * casts on those interfaces. 11219 * The ip_multirt_apply_membership_v6() succeeds if 11220 * the operation succeeds on at least one interface. 11221 */ 11222 ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, 11223 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11224 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11225 if (ire != NULL) { 11226 if (ire->ire_flags & RTF_MULTIRT) { 11227 error = ip_multirt_apply_membership_v6( 11228 optfn, ire, connp, checkonly, 11229 &groupv6, fmode, &ipv6_all_zeros, 11230 first_mp); 11231 done = B_TRUE; 11232 } 11233 ire_refrele(ire); 11234 } 11235 if (!done) { 11236 error = optfn(connp, checkonly, &groupv6, 11237 ifindex, fmode, &ipv6_all_zeros, first_mp); 11238 } 11239 if (error) { 11240 /* 11241 * EINPROGRESS is a soft error, needs retry 11242 * so don't make *outlenp zero. 
11243 */ 11244 if (error != EINPROGRESS) 11245 *outlenp = 0; 11246 return (error); 11247 } 11248 /* OK return - copy input buffer into output buffer */ 11249 if (invalp != outvalp) { 11250 /* don't trust bcopy for identical src/dst */ 11251 bcopy(invalp, outvalp, inlen); 11252 } 11253 *outlenp = inlen; 11254 return (0); 11255 } 11256 case MCAST_BLOCK_SOURCE: 11257 case MCAST_UNBLOCK_SOURCE: 11258 case MCAST_JOIN_SOURCE_GROUP: 11259 case MCAST_LEAVE_SOURCE_GROUP: { 11260 struct group_source_req *gsreqp; 11261 in6_addr_t v6grp, v6src; 11262 uint32_t ifindex; 11263 mcast_record_t fmode; 11264 ire_t *ire; 11265 boolean_t done = B_FALSE; 11266 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11267 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11268 11269 switch (name) { 11270 case MCAST_BLOCK_SOURCE: 11271 fmode = MODE_IS_EXCLUDE; 11272 optfn = ip_opt_add_group_v6; 11273 break; 11274 case MCAST_UNBLOCK_SOURCE: 11275 fmode = MODE_IS_EXCLUDE; 11276 optfn = ip_opt_delete_group_v6; 11277 break; 11278 case MCAST_JOIN_SOURCE_GROUP: 11279 fmode = MODE_IS_INCLUDE; 11280 optfn = ip_opt_add_group_v6; 11281 break; 11282 case MCAST_LEAVE_SOURCE_GROUP: 11283 fmode = MODE_IS_INCLUDE; 11284 optfn = ip_opt_delete_group_v6; 11285 break; 11286 } 11287 11288 gsreqp = (struct group_source_req *)i1; 11289 ifindex = gsreqp->gsr_interface; 11290 if (gsreqp->gsr_group.ss_family == AF_INET) { 11291 struct sockaddr_in *s; 11292 s = (struct sockaddr_in *)&gsreqp->gsr_group; 11293 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); 11294 s = (struct sockaddr_in *)&gsreqp->gsr_source; 11295 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); 11296 } else { 11297 struct sockaddr_in6 *s6; 11298 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; 11299 v6grp = s6->sin6_addr; 11300 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; 11301 v6src = s6->sin6_addr; 11302 } 11303 11304 /* 11305 * In the multirouting case, we need to replicate 11306 * the request as noted in the mcast cases above. 
11307 */ 11308 ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, 11309 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11310 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11311 if (ire != NULL) { 11312 if (ire->ire_flags & RTF_MULTIRT) { 11313 error = ip_multirt_apply_membership_v6( 11314 optfn, ire, connp, checkonly, 11315 &v6grp, fmode, &v6src, first_mp); 11316 done = B_TRUE; 11317 } 11318 ire_refrele(ire); 11319 } 11320 if (!done) { 11321 error = optfn(connp, checkonly, &v6grp, 11322 ifindex, fmode, &v6src, first_mp); 11323 } 11324 if (error != 0) { 11325 /* 11326 * EINPROGRESS is a soft error, needs retry 11327 * so don't make *outlenp zero. 11328 */ 11329 if (error != EINPROGRESS) 11330 *outlenp = 0; 11331 return (error); 11332 } 11333 /* OK return - copy input buffer into output buffer */ 11334 if (invalp != outvalp) { 11335 bcopy(invalp, outvalp, inlen); 11336 } 11337 *outlenp = inlen; 11338 return (0); 11339 } 11340 case IPV6_UNICAST_HOPS: 11341 /* Recorded in transport above IP */ 11342 break; /* goto sizeof (int) option return */ 11343 case IPV6_UNSPEC_SRC: 11344 /* Allow sending with a zero source address */ 11345 if (!checkonly) { 11346 mutex_enter(&connp->conn_lock); 11347 connp->conn_unspec_src = *i1 ? 1 : 0; 11348 mutex_exit(&connp->conn_lock); 11349 } 11350 break; /* goto sizeof (int) option return */ 11351 case IPV6_RECVPKTINFO: 11352 if (!checkonly) { 11353 mutex_enter(&connp->conn_lock); 11354 connp->conn_ip_recvpktinfo = *i1 ? 
1 : 0; 11355 mutex_exit(&connp->conn_lock); 11356 } 11357 break; /* goto sizeof (int) option return */ 11358 case IPV6_RECVTCLASS: 11359 if (!checkonly) { 11360 if (*i1 < 0 || *i1 > 1) { 11361 return (EINVAL); 11362 } 11363 mutex_enter(&connp->conn_lock); 11364 connp->conn_ipv6_recvtclass = *i1; 11365 mutex_exit(&connp->conn_lock); 11366 } 11367 break; 11368 case IPV6_RECVPATHMTU: 11369 if (!checkonly) { 11370 if (*i1 < 0 || *i1 > 1) { 11371 return (EINVAL); 11372 } 11373 mutex_enter(&connp->conn_lock); 11374 connp->conn_ipv6_recvpathmtu = *i1; 11375 mutex_exit(&connp->conn_lock); 11376 } 11377 break; 11378 case IPV6_RECVHOPLIMIT: 11379 if (!checkonly) { 11380 mutex_enter(&connp->conn_lock); 11381 connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; 11382 mutex_exit(&connp->conn_lock); 11383 } 11384 break; /* goto sizeof (int) option return */ 11385 case IPV6_RECVHOPOPTS: 11386 if (!checkonly) { 11387 mutex_enter(&connp->conn_lock); 11388 connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; 11389 mutex_exit(&connp->conn_lock); 11390 } 11391 break; /* goto sizeof (int) option return */ 11392 case IPV6_RECVDSTOPTS: 11393 if (!checkonly) { 11394 mutex_enter(&connp->conn_lock); 11395 connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; 11396 mutex_exit(&connp->conn_lock); 11397 } 11398 break; /* goto sizeof (int) option return */ 11399 case IPV6_RECVRTHDR: 11400 if (!checkonly) { 11401 mutex_enter(&connp->conn_lock); 11402 connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; 11403 mutex_exit(&connp->conn_lock); 11404 } 11405 break; /* goto sizeof (int) option return */ 11406 case IPV6_RECVRTHDRDSTOPTS: 11407 if (!checkonly) { 11408 mutex_enter(&connp->conn_lock); 11409 connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; 11410 mutex_exit(&connp->conn_lock); 11411 } 11412 break; /* goto sizeof (int) option return */ 11413 case IPV6_PKTINFO: 11414 if (inlen == 0) 11415 return (-EINVAL); /* clearing option */ 11416 error = ip6_set_pktinfo(cr, connp, 11417 (struct in6_pktinfo *)invalp, first_mp); 11418 if (error != 0) 11419 *outlenp = 0; 11420 else 11421 *outlenp = inlen; 11422 return (error); 11423 case IPV6_NEXTHOP: { 11424 struct sockaddr_in6 *sin6; 11425 11426 /* Verify that the nexthop is reachable */ 11427 if (inlen == 0) 11428 return (-EINVAL); /* clearing option */ 11429 11430 sin6 = (struct sockaddr_in6 *)invalp; 11431 ire = ire_route_lookup_v6(&sin6->sin6_addr, 11432 0, 0, 0, NULL, NULL, connp->conn_zoneid, 11433 NULL, MATCH_IRE_DEFAULT, ipst); 11434 11435 if (ire == NULL) { 11436 *outlenp = 0; 11437 return (EHOSTUNREACH); 11438 } 11439 ire_refrele(ire); 11440 return (-EINVAL); 11441 } 11442 case IPV6_SEC_OPT: 11443 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11444 if (error != 0) { 11445 *outlenp = 0; 11446 return (error); 11447 } 11448 break; 11449 case IPV6_SRC_PREFERENCES: { 11450 /* 11451 * This is implemented strictly in the ip module 11452 * (here and in tcp_opt_*() to accomodate tcp 11453 * sockets). Modules above ip pass this option 11454 * down here since ip is the only one that needs to 11455 * be aware of source address preferences. 11456 * 11457 * This socket option only affects connected 11458 * sockets that haven't already bound to a specific 11459 * IPv6 address. In other words, sockets that 11460 * don't call bind() with an address other than the 11461 * unspecified address and that call connect(). 11462 * ip_bind_connected_v6() passes these preferences 11463 * to the ipif_select_source_v6() function. 
11464 */ 11465 if (inlen != sizeof (uint32_t)) 11466 return (EINVAL); 11467 error = ip6_set_src_preferences(connp, 11468 *(uint32_t *)invalp); 11469 if (error != 0) { 11470 *outlenp = 0; 11471 return (error); 11472 } else { 11473 *outlenp = sizeof (uint32_t); 11474 } 11475 break; 11476 } 11477 case IPV6_V6ONLY: 11478 if (*i1 < 0 || *i1 > 1) { 11479 return (EINVAL); 11480 } 11481 mutex_enter(&connp->conn_lock); 11482 connp->conn_ipv6_v6only = *i1; 11483 mutex_exit(&connp->conn_lock); 11484 break; 11485 default: 11486 return (-EINVAL); 11487 } 11488 break; 11489 default: 11490 /* 11491 * "soft" error (negative) 11492 * option not handled at this level 11493 * Note: Do not modify *outlenp 11494 */ 11495 return (-EINVAL); 11496 } 11497 /* 11498 * Common case of return from an option that is sizeof (int) 11499 */ 11500 *(int *)outvalp = *i1; 11501 *outlenp = sizeof (int); 11502 return (0); 11503 } 11504 11505 /* 11506 * This routine gets default values of certain options whose default 11507 * values are maintained by protocol specific code 11508 */ 11509 /* ARGSUSED */ 11510 int 11511 ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 11512 { 11513 int *i1 = (int *)ptr; 11514 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11515 11516 switch (level) { 11517 case IPPROTO_IP: 11518 switch (name) { 11519 case IP_MULTICAST_TTL: 11520 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 11521 return (sizeof (uchar_t)); 11522 case IP_MULTICAST_LOOP: 11523 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 11524 return (sizeof (uchar_t)); 11525 default: 11526 return (-1); 11527 } 11528 case IPPROTO_IPV6: 11529 switch (name) { 11530 case IPV6_UNICAST_HOPS: 11531 *i1 = ipst->ips_ipv6_def_hops; 11532 return (sizeof (int)); 11533 case IPV6_MULTICAST_HOPS: 11534 *i1 = IP_DEFAULT_MULTICAST_TTL; 11535 return (sizeof (int)); 11536 case IPV6_MULTICAST_LOOP: 11537 *i1 = IP_DEFAULT_MULTICAST_LOOP; 11538 return (sizeof (int)); 11539 case IPV6_V6ONLY: 11540 *i1 = 1; 11541 return (sizeof (int)); 11542 
default: 11543 return (-1); 11544 } 11545 default: 11546 return (-1); 11547 } 11548 /* NOTREACHED */ 11549 } 11550 11551 /* 11552 * Given a destination address and a pointer to where to put the information 11553 * this routine fills in the mtuinfo. 11554 */ 11555 int 11556 ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, 11557 struct ip6_mtuinfo *mtuinfo, netstack_t *ns) 11558 { 11559 ire_t *ire; 11560 ip_stack_t *ipst = ns->netstack_ip; 11561 11562 if (IN6_IS_ADDR_UNSPECIFIED(in6)) 11563 return (-1); 11564 11565 bzero(mtuinfo, sizeof (*mtuinfo)); 11566 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 11567 mtuinfo->ip6m_addr.sin6_port = port; 11568 mtuinfo->ip6m_addr.sin6_addr = *in6; 11569 11570 ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); 11571 if (ire != NULL) { 11572 mtuinfo->ip6m_mtu = ire->ire_max_frag; 11573 ire_refrele(ire); 11574 } else { 11575 mtuinfo->ip6m_mtu = IPV6_MIN_MTU; 11576 } 11577 return (sizeof (struct ip6_mtuinfo)); 11578 } 11579 11580 /* 11581 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error 11582 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and 11583 * isn't. This doesn't matter as the error checking is done properly for the 11584 * other MRT options coming in through ip_opt_set. 11585 */ 11586 int 11587 ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 11588 { 11589 conn_t *connp = Q_TO_CONN(q); 11590 ipsec_req_t *req = (ipsec_req_t *)ptr; 11591 11592 switch (level) { 11593 case IPPROTO_IP: 11594 switch (name) { 11595 case MRT_VERSION: 11596 case MRT_ASSERT: 11597 (void) ip_mrouter_get(name, q, ptr); 11598 return (sizeof (int)); 11599 case IP_SEC_OPT: 11600 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); 11601 case IP_NEXTHOP: 11602 if (connp->conn_nexthop_set) { 11603 *(ipaddr_t *)ptr = connp->conn_nexthop_v4; 11604 return (sizeof (ipaddr_t)); 11605 } else 11606 return (0); 11607 case IP_RECVPKTINFO: 11608 *(int *)ptr = connp->conn_ip_recvpktinfo ? 
1: 0; 11609 return (sizeof (int)); 11610 default: 11611 break; 11612 } 11613 break; 11614 case IPPROTO_IPV6: 11615 switch (name) { 11616 case IPV6_SEC_OPT: 11617 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); 11618 case IPV6_SRC_PREFERENCES: { 11619 return (ip6_get_src_preferences(connp, 11620 (uint32_t *)ptr)); 11621 } 11622 case IPV6_V6ONLY: 11623 *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; 11624 return (sizeof (int)); 11625 case IPV6_PATHMTU: 11626 return (ip_fill_mtuinfo(&connp->conn_remv6, 0, 11627 (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); 11628 default: 11629 break; 11630 } 11631 break; 11632 default: 11633 break; 11634 } 11635 return (-1); 11636 } 11637 11638 /* Named Dispatch routine to get a current value out of our parameter table. */ 11639 /* ARGSUSED */ 11640 static int 11641 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11642 { 11643 ipparam_t *ippa = (ipparam_t *)cp; 11644 11645 (void) mi_mpprintf(mp, "%d", ippa->ip_param_value); 11646 return (0); 11647 } 11648 11649 /* ARGSUSED */ 11650 static int 11651 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11652 { 11653 11654 (void) mi_mpprintf(mp, "%d", *(int *)cp); 11655 return (0); 11656 } 11657 11658 /* 11659 * Set ip{,6}_forwarding values. This means walking through all of the 11660 * ill's and toggling their forwarding values. 
11661 */ 11662 /* ARGSUSED */ 11663 static int 11664 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11665 { 11666 long new_value; 11667 int *forwarding_value = (int *)cp; 11668 ill_t *ill; 11669 boolean_t isv6; 11670 ill_walk_context_t ctx; 11671 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11672 11673 isv6 = (forwarding_value == &ipst->ips_ipv6_forward); 11674 11675 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11676 new_value < 0 || new_value > 1) { 11677 return (EINVAL); 11678 } 11679 11680 *forwarding_value = new_value; 11681 11682 /* 11683 * Regardless of the current value of ip_forwarding, set all per-ill 11684 * values of ip_forwarding to the value being set. 11685 * 11686 * Bring all the ill's up to date with the new global value. 11687 */ 11688 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11689 11690 if (isv6) 11691 ill = ILL_START_WALK_V6(&ctx, ipst); 11692 else 11693 ill = ILL_START_WALK_V4(&ctx, ipst); 11694 11695 for (; ill != NULL; ill = ill_next(&ctx, ill)) 11696 (void) ill_forward_set(ill, new_value != 0); 11697 11698 rw_exit(&ipst->ips_ill_g_lock); 11699 return (0); 11700 } 11701 11702 /* 11703 * Walk through the param array specified registering each element with the 11704 * Named Dispatch handler. This is called only during init. 
So it is ok 11705 * not to acquire any locks 11706 */ 11707 static boolean_t 11708 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt, 11709 ipndp_t *ipnd, size_t ipnd_cnt) 11710 { 11711 for (; ippa_cnt-- > 0; ippa++) { 11712 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 11713 if (!nd_load(ndp, ippa->ip_param_name, 11714 ip_param_get, ip_param_set, (caddr_t)ippa)) { 11715 nd_free(ndp); 11716 return (B_FALSE); 11717 } 11718 } 11719 } 11720 11721 for (; ipnd_cnt-- > 0; ipnd++) { 11722 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 11723 if (!nd_load(ndp, ipnd->ip_ndp_name, 11724 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 11725 ipnd->ip_ndp_data)) { 11726 nd_free(ndp); 11727 return (B_FALSE); 11728 } 11729 } 11730 } 11731 11732 return (B_TRUE); 11733 } 11734 11735 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 11736 /* ARGSUSED */ 11737 static int 11738 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11739 { 11740 long new_value; 11741 ipparam_t *ippa = (ipparam_t *)cp; 11742 11743 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11744 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 11745 return (EINVAL); 11746 } 11747 ippa->ip_param_value = new_value; 11748 return (0); 11749 } 11750 11751 /* 11752 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases, 11753 * When an ipf is passed here for the first time, if 11754 * we already have in-order fragments on the queue, we convert from the fast- 11755 * path reassembly scheme to the hard-case scheme. From then on, additional 11756 * fragments are reassembled here. We keep track of the start and end offsets 11757 * of each piece, and the number of holes in the chain. When the hole count 11758 * goes to zero, we are done! 
*
 * The ipf_count will be updated to account for any mblk(s) added (pointed to
 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
 * ipfb_count and ill_frag_count by the difference of ipf_count before and
 * after the call to ip_reassemble().
 *
 * Arguments:
 *	mp	- the new fragment; every mblk on its b_cont chain is
 *		  processed as a separate piece.
 *	ipf	- reassembly state; ipf_mp->b_cont heads the list of queued
 *		  pieces and ipf_tail_mp is its last element.
 *	start	- offset of the first byte carried by mp.
 *	more	- B_TRUE if further fragments follow this one.
 *	ill	- only used here to bump the reassembly MIB counters.
 *	msg_len	- byte count added to ipf_count on entry.
 *
 * Returns IP_REASS_FAILED when the fragment contradicts a previously seen
 * last fragment (different datagram end, or data beyond it),
 * IP_REASS_PARTIAL while holes remain, and IP_REASS_COMPLETE once the hole
 * count has dropped to zero.
 */
int
ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
    size_t msg_len)
{
	uint_t	end;
	mblk_t	*next_mp;
	mblk_t	*mp1;
	uint_t	offset;
	/* ipf_num_dups is bumped at most once per call. */
	boolean_t incr_dups = B_TRUE;
	boolean_t offset_zero_seen = B_FALSE;
	boolean_t pkt_boundary_checked = B_FALSE;

	/* If start == 0 then ipf_nf_hdr_len has to be set. */
	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);

	/* Add in byte count */
	ipf->ipf_count += msg_len;
	if (ipf->ipf_end) {
		/*
		 * We were part way through in-order reassembly, but now there
		 * is a hole. We walk through messages already queued, and
		 * mark them for hard case reassembly. We know that up till
		 * now they were in order starting from offset zero.
		 */
		offset = 0;
		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
			IP_REASS_SET_START(mp1, offset);
			if (offset == 0) {
				/*
				 * The first mblk also carries the
				 * unfragmentable header; discount it so the
				 * end offset computed below excludes it.
				 */
				ASSERT(ipf->ipf_nf_hdr_len != 0);
				offset = -ipf->ipf_nf_hdr_len;
			}
			offset += mp1->b_wptr - mp1->b_rptr;
			IP_REASS_SET_END(mp1, offset);
		}
		/* One hole at the end. */
		ipf->ipf_hole_cnt = 1;
		/* Brand it as a hard case, forever. */
		ipf->ipf_end = 0;
	}
	/* Walk through all the new pieces. */
	do {
		end = start + (mp->b_wptr - mp->b_rptr);
		/*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get wrong value in the
		 * second pass of the loop if first mblk is exactly the
		 * size of ipf_nf_hdr_len.
		 */
		if (start == 0 && !offset_zero_seen) {
			/* First segment */
			ASSERT(ipf->ipf_nf_hdr_len != 0);
			end -= ipf->ipf_nf_hdr_len;
			offset_zero_seen = B_TRUE;
		}
		/* Remember the successor now; mp->b_cont is rewritten below. */
		next_mp = mp->b_cont;
		/*
		 * We are checking to see if there is any interesting data
		 * to process. If there isn't and the mblk isn't the
		 * one which carries the unfragmentable header then we
		 * drop it. It's possible to have just the unfragmentable
		 * header come through without any data. That needs to be
		 * saved.
		 *
		 * If the assert at the top of this function holds then the
		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code
		 * is infrequently traveled enough that the test is left in
		 * to protect against future code changes which break that
		 * invariant.
		 */
		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
			/* Empty. Blast it. */
			IP_REASS_SET_START(mp, 0);
			IP_REASS_SET_END(mp, 0);
			/*
			 * If the ipf points to the mblk we are about to free,
			 * update ipf to point to the next mblk (or NULL
			 * if none).
			 */
			if (ipf->ipf_mp->b_cont == mp)
				ipf->ipf_mp->b_cont = next_mp;
			freeb(mp);
			continue;
		}
		/* Detach this piece and stamp it with its offsets. */
		mp->b_cont = NULL;
		IP_REASS_SET_START(mp, start);
		IP_REASS_SET_END(mp, end);
		if (!ipf->ipf_tail_mp) {
			/* Nothing queued yet: this piece becomes the list. */
			ipf->ipf_tail_mp = mp;
			ipf->ipf_mp->b_cont = mp;
			if (start == 0 || !more) {
				ipf->ipf_hole_cnt = 1;
				/*
				 * if the first fragment comes in more than one
				 * mblk, this loop will be executed for each
				 * mblk. Need to adjust hole count so exiting
				 * this routine will leave hole count at 1.
				 */
				if (next_mp)
					ipf->ipf_hole_cnt++;
			} else
				ipf->ipf_hole_cnt = 2;
			continue;
		} else if (ipf->ipf_last_frag_seen && !more &&
		    !pkt_boundary_checked) {
			/*
			 * We check datagram boundary only if this fragment
			 * claims to be the last fragment and we have seen a
			 * last fragment in the past too. We do this only
			 * once for a given fragment.
			 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 gets handled as a complete packet. These
			 * fragments should not reach here.
			 */

			if (start + msgdsize(mp) !=
			    IP_REASS_END(ipf->ipf_tail_mp)) {
				/*
				 * We have two fragments both of which claim
				 * to be the last fragment but gives conflicting
				 * information about the whole datagram size.
				 * Something fishy is going on. Drop the
				 * fragment and free up the reassembly list.
				 */
				return (IP_REASS_FAILED);
			}

			/*
			 * We shouldn't come to this code block again for this
			 * particular fragment.
			 */
			pkt_boundary_checked = B_TRUE;
		}

		/* New stuff at or beyond tail? */
		offset = IP_REASS_END(ipf->ipf_tail_mp);
		if (start >= offset) {
			if (ipf->ipf_last_frag_seen) {
				/* current fragment is beyond last fragment */
				return (IP_REASS_FAILED);
			}
			/* Link it on end. */
			ipf->ipf_tail_mp->b_cont = mp;
			ipf->ipf_tail_mp = mp;
			if (more) {
				/* A gap before this piece is a new hole. */
				if (start != offset)
					ipf->ipf_hole_cnt++;
			} else if (start == offset && next_mp == NULL)
					ipf->ipf_hole_cnt--;
			continue;
		}
		mp1 = ipf->ipf_mp->b_cont;
		offset = IP_REASS_START(mp1);
		/* New stuff at the front? */
		if (start < offset) {
			if (start == 0) {
				if (end >= offset) {
					/* Nailed the hole at the beginning. */
					ipf->ipf_hole_cnt--;
				}
			} else if (end < offset) {
				/*
				 * A hole, stuff, and a hole where there used
				 * to be just a hole.
				 */
				ipf->ipf_hole_cnt++;
			}
			mp->b_cont = mp1;
			/* Check for overlap. */
			while (end > offset) {
				if (end < IP_REASS_END(mp1)) {
					/*
					 * Partial overlap: trim the tail of
					 * the new piece back to where mp1
					 * starts.
					 */
					mp->b_wptr -= end - offset;
					IP_REASS_SET_END(mp, offset);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmPartDups);
					break;
				}
				/* Did we cover another hole? */
				if ((mp1->b_cont &&
				    IP_REASS_END(mp1) !=
				    IP_REASS_START(mp1->b_cont) &&
				    end >= IP_REASS_START(mp1->b_cont)) ||
				    (!ipf->ipf_last_frag_seen && !more)) {
					ipf->ipf_hole_cnt--;
				}
				/* Clip out mp1. */
				if ((mp->b_cont = mp1->b_cont) == NULL) {
					/*
					 * After clipping out mp1, this guy
					 * is now hanging off the end.
					 */
					ipf->ipf_tail_mp = mp;
				}
				IP_REASS_SET_START(mp1, 0);
				IP_REASS_SET_END(mp1, 0);
				/* Subtract byte count */
				ipf->ipf_count -= mp1->b_datap->db_lim -
				    mp1->b_datap->db_base;
				freeb(mp1);
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsReasmPartDups);
				mp1 = mp->b_cont;
				if (!mp1)
					break;
				offset = IP_REASS_START(mp1);
			}
			/* The new piece is now the head of the list. */
			ipf->ipf_mp->b_cont = mp;
			continue;
		}
		/*
		 * The new piece starts somewhere between the start of the head
		 * and before the end of the tail.
		 */
		for (; mp1; mp1 = mp1->b_cont) {
			offset = IP_REASS_END(mp1);
			if (start < offset) {
				if (end <= offset) {
					/* Nothing new. */
					IP_REASS_SET_START(mp, 0);
					IP_REASS_SET_END(mp, 0);
					/* Subtract byte count */
					ipf->ipf_count -= mp->b_datap->db_lim -
					    mp->b_datap->db_base;
					if (incr_dups) {
						ipf->ipf_num_dups++;
						incr_dups = B_FALSE;
					}
					freeb(mp);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmDuplicates);
					break;
				}
				/*
				 * Trim redundant stuff off beginning of new
				 * piece.
				 */
				IP_REASS_SET_START(mp, offset);
				mp->b_rptr += offset - start;
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsReasmPartDups);
				start = offset;
				if (!mp1->b_cont) {
					/*
					 * After trimming, this guy is now
					 * hanging off the end.
					 */
					mp1->b_cont = mp;
					ipf->ipf_tail_mp = mp;
					if (!more) {
						ipf->ipf_hole_cnt--;
					}
					break;
				}
			}
			/* Not before the next queued piece: keep walking. */
			if (start >= IP_REASS_START(mp1->b_cont))
				continue;
			/* Fill a hole */
			if (start > offset)
				ipf->ipf_hole_cnt++;
			/* Splice the new piece in right after mp1. */
			mp->b_cont = mp1->b_cont;
			mp1->b_cont = mp;
			mp1 = mp->b_cont;
			offset = IP_REASS_START(mp1);
			if (end >= offset) {
				ipf->ipf_hole_cnt--;
				/* Check for overlap. */
				while (end > offset) {
					if (end < IP_REASS_END(mp1)) {
						mp->b_wptr -= end - offset;
						IP_REASS_SET_END(mp, offset);
						/*
						 * TODO we might bump
						 * this up twice if there is
						 * overlap at both ends.
						 */
						BUMP_MIB(ill->ill_ip_mib,
						    ipIfStatsReasmPartDups);
						break;
					}
					/* Did we cover another hole? */
					if ((mp1->b_cont &&
					    IP_REASS_END(mp1)
					    != IP_REASS_START(mp1->b_cont) &&
					    end >=
					    IP_REASS_START(mp1->b_cont)) ||
					    (!ipf->ipf_last_frag_seen &&
					    !more)) {
						ipf->ipf_hole_cnt--;
					}
					/* Clip out mp1. */
					if ((mp->b_cont = mp1->b_cont) ==
					    NULL) {
						/*
						 * After clipping out mp1,
						 * this guy is now hanging
						 * off the end.
						 */
						ipf->ipf_tail_mp = mp;
					}
					IP_REASS_SET_START(mp1, 0);
					IP_REASS_SET_END(mp1, 0);
					/* Subtract byte count */
					ipf->ipf_count -=
					    mp1->b_datap->db_lim -
					    mp1->b_datap->db_base;
					freeb(mp1);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmPartDups);
					mp1 = mp->b_cont;
					if (!mp1)
						break;
					offset = IP_REASS_START(mp1);
				}
			}
			break;
		}
	/* The next mblk of this fragment starts where this one ended. */
	} while (start = end, mp = next_mp);

	/* Fragment just processed could be the last one. Remember this fact */
	if (!more)
		ipf->ipf_last_frag_seen = B_TRUE;

	/* Still got holes? */
	if (ipf->ipf_hole_cnt)
		return (IP_REASS_PARTIAL);
	/* Clean up overloaded fields to avoid upstream disasters. */
	for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
		IP_REASS_SET_START(mp1, 0);
		IP_REASS_SET_END(mp1, 0);
	}
	return (IP_REASS_COMPLETE);
}

/*
 * ipsec processing for the fast path, used for input UDP Packets
 * Returns true if ready for passup to UDP.
 * Return false if packet is not passable to UDP (e.g. it failed IPsec policy,
 * was an ESP-in-UDP packet, etc.).
12103 */ 12104 static boolean_t 12105 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 12106 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) 12107 { 12108 uint32_t ill_index; 12109 uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ 12110 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 12111 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 12112 udp_t *udp = connp->conn_udp; 12113 12114 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12115 /* The ill_index of the incoming ILL */ 12116 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 12117 12118 /* pass packet up to the transport */ 12119 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 12120 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 12121 NULL, mctl_present); 12122 if (*first_mpp == NULL) { 12123 return (B_FALSE); 12124 } 12125 } 12126 12127 /* Initiate IPPF processing for fastpath UDP */ 12128 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 12129 ip_process(IPP_LOCAL_IN, mpp, ill_index); 12130 if (*mpp == NULL) { 12131 ip2dbg(("ip_input_ipsec_process: UDP pkt " 12132 "deferred/dropped during IPPF processing\n")); 12133 return (B_FALSE); 12134 } 12135 } 12136 /* 12137 * Remove 0-spi if it's 0, or move everything behind 12138 * the UDP header over it and forward to ESP via 12139 * ip_proto_input(). 12140 */ 12141 if (udp->udp_nat_t_endpoint) { 12142 if (mctl_present) { 12143 /* mctl_present *shouldn't* happen. */ 12144 ip_drop_packet(*first_mpp, B_TRUE, NULL, 12145 NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), 12146 &ipss->ipsec_dropper); 12147 *first_mpp = NULL; 12148 return (B_FALSE); 12149 } 12150 12151 /* "ill" is "recv_ill" in actuality. */ 12152 if (!zero_spi_check(q, *mpp, ire, ill, ipss)) 12153 return (B_FALSE); 12154 12155 /* Else continue like a normal UDP packet. 
*/ 12156 } 12157 12158 /* 12159 * We make the checks as below since we are in the fast path 12160 * and want to minimize the number of checks if the IP_RECVIF and/or 12161 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set 12162 */ 12163 if (connp->conn_recvif || connp->conn_recvslla || 12164 connp->conn_ip_recvpktinfo) { 12165 if (connp->conn_recvif) { 12166 in_flags = IPF_RECVIF; 12167 } 12168 /* 12169 * UDP supports IP_RECVPKTINFO option for both v4 and v6 12170 * so the flag passed to ip_add_info is based on IP version 12171 * of connp. 12172 */ 12173 if (connp->conn_ip_recvpktinfo) { 12174 if (connp->conn_af_isv6) { 12175 /* 12176 * V6 only needs index 12177 */ 12178 in_flags |= IPF_RECVIF; 12179 } else { 12180 /* 12181 * V4 needs index + matching address. 12182 */ 12183 in_flags |= IPF_RECVADDR; 12184 } 12185 } 12186 if (connp->conn_recvslla) { 12187 in_flags |= IPF_RECVSLLA; 12188 } 12189 /* 12190 * since in_flags are being set ill will be 12191 * referenced in ip_add_info, so it better not 12192 * be NULL. 12193 */ 12194 /* 12195 * the actual data will be contained in b_cont 12196 * upon successful return of the following call. 12197 * If the call fails then the original mblk is 12198 * returned. 12199 */ 12200 *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), 12201 ipst); 12202 } 12203 12204 return (B_TRUE); 12205 } 12206 12207 /* 12208 * Fragmentation reassembly. Each ILL has a hash table for 12209 * queuing packets undergoing reassembly for all IPIFs 12210 * associated with the ILL. The hash is based on the packet 12211 * IP ident field. The ILL frag hash table was allocated 12212 * as a timer block at the time the ILL was created. Whenever 12213 * there is anything on the reassembly queue, the timer will 12214 * be running. Returns B_TRUE if successful else B_FALSE; 12215 * frees mp on failure. 
 */
/*
 * ip_rput_fragment() details, as implemented below:
 *
 *	q		queue whose q_ptr is the receiving ill.
 *	mpp		in/out.  On entry *mpp is the received fragment.  When
 *			reassembly completes, *mpp is set to the reassembled
 *			datagram (a copymsg() copy if the head dblk was
 *			shared).  If the packet carries neither MF nor an
 *			offset, *mpp is left untouched.
 *	ipha		IP header of the received fragment.
 *	cksum_val,
 *	cksum_flags	optional outputs (may be NULL).  Both are zeroed on
 *			entry and receive the accumulated hardware checksum
 *			state only when reassembly completes.
 *
 * Returns B_TRUE when the packet is not a fragment or when the datagram
 * is now complete.  Returns B_FALSE both when the fragment was dropped
 * (and freed) and when it was merely linked into the reassembly queue
 * to wait for more pieces.
 */
static boolean_t
ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
    uint32_t *cksum_val, uint16_t *cksum_flags)
{
	uint32_t	frag_offset_flags;
	ill_t		*ill = (ill_t *)q->q_ptr;
	mblk_t		*mp = *mpp;
	mblk_t		*t_mp;
	ipaddr_t	dst;
	uint8_t		proto = ipha->ipha_protocol;
	uint32_t	sum_val;
	uint16_t	sum_flags;
	ipf_t		*ipf;
	ipf_t		**ipfp;
	ipfb_t		*ipfb;
	uint16_t	ident;
	uint32_t	offset;
	ipaddr_t	src;
	uint_t		hdr_length;
	uint32_t	end;
	mblk_t		*mp1;
	mblk_t		*tail_mp;
	size_t		count;
	size_t		msg_len;
	uint8_t		ecn_info = 0;
	uint32_t	packet_size;
	boolean_t	pruned = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Default the optional outputs so every early return is defined. */
	if (cksum_val != NULL)
		*cksum_val = 0;
	if (cksum_flags != NULL)
		*cksum_flags = 0;

	/*
	 * Drop the fragment as early as possible, if
	 * we don't have resource(s) to re-assemble.
	 */
	if (ipst->ips_ip_reass_queue_bytes == 0) {
		freemsg(mp);
		return (B_FALSE);
	}

	/* Check for fragmentation offset; return if there's none */
	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
	    (IPH_MF | IPH_OFFSET)) == 0)
		return (B_TRUE);

	/*
	 * We utilize hardware computed checksum info only for UDP since
	 * IP fragmentation is a normal occurrence for the protocol.  In
	 * addition, checksum offload support for IP fragments carrying
	 * UDP payload is commonly implemented across network adapters.
	 */
	ASSERT(ill != NULL);
	if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
		/* NOTE: this mp1 shadows the function-scope mp1. */
		mblk_t *mp1 = mp->b_cont;
		int32_t len;

		/* Record checksum information from the packet */
		sum_val = (uint32_t)DB_CKSUM16(mp);
		sum_flags = DB_CKSUMFLAGS(mp);

		/* IP payload offset from beginning of mblk */
		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;

		/*
		 * The adjustment is only attempted for messages of at most
		 * two mblks whose checksum start lies at or before the IP
		 * payload by an even number of bytes.
		 */
		if ((sum_flags & HCK_PARTIALCKSUM) &&
		    (mp1 == NULL || mp1->b_cont == NULL) &&
		    offset >= DB_CKSUMSTART(mp) &&
		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
			uint32_t adj;
			/*
			 * Partial checksum has been calculated by hardware
			 * and attached to the packet; in addition, any
			 * prepended extraneous data is even byte aligned.
			 * If any such data exists, we adjust the checksum;
			 * this would also handle any postpended data.
			 */
			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
			    mp, mp1, len, adj);

			/* One's complement subtract extraneous checksum */
			if (adj >= sum_val)
				sum_val = ~(adj - sum_val) & 0xFFFF;
			else
				sum_val -= adj;
		}
	} else {
		sum_val = 0;
		sum_flags = 0;
	}

	/* Clear hardware checksumming flag */
	DB_CKSUMFLAGS(mp) = 0;

	ident = ipha->ipha_ident;
	/*
	 * Fragment offset in bytes (the header field counts 8-byte units).
	 * NOTE(review): the 0xFFFF mask presumably discards the shifted MF
	 * bit - confirm against the IPH_MF definition.
	 */
	offset = (frag_offset_flags << 3) & 0xFFFF;
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	hdr_length = IPH_HDR_LENGTH(ipha);
	/* For now "end" is just the payload length of this fragment. */
	end = ntohs(ipha->ipha_length) - hdr_length;

	/* If end == 0 then we have a packet with no data, so just free it */
	if (end == 0) {
		freemsg(mp);
		return (B_FALSE);
	}

	/* Record the ECN field info. */
	ecn_info = (ipha->ipha_type_of_service & 0x3);
	if (offset != 0) {
		/*
		 * If this isn't the first piece, strip the header, and
		 * add the offset to the end value.
		 */
		mp->b_rptr += hdr_length;
		end += offset;
	}

	/* Sum MBLKSIZE() over the whole chain and remember its last mblk. */
	msg_len = MBLKSIZE(mp);
	tail_mp = mp;
	while (tail_mp->b_cont != NULL) {
		tail_mp = tail_mp->b_cont;
		msg_len += MBLKSIZE(tail_mp);
	}

	/* If the reassembly list for this ILL will get too big, prune it */
	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
	    ipst->ips_ip_reass_queue_bytes) {
		ill_frag_prune(ill,
		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
		    (ipst->ips_ip_reass_queue_bytes - msg_len));
		pruned = B_TRUE;
	}

	/* From here until mutex_exit() the hash bucket lock is held. */
	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
	mutex_enter(&ipfb->ipfb_lock);

	ipfp = &ipfb->ipfb_ipf;
	/* Try to find an existing fragment queue for this packet. */
	for (;;) {
		ipf = ipfp[0];
		if (ipf != NULL) {
			/*
			 * It has to match on ident, src/dst address
			 * and protocol.
			 */
			if (ipf->ipf_ident == ident &&
			    ipf->ipf_src == src &&
			    ipf->ipf_dst == dst &&
			    ipf->ipf_protocol == proto) {
				/*
				 * If we have received too many
				 * duplicate fragments for this packet
				 * free it.
				 */
				if (ipf->ipf_num_dups > ip_max_frag_dups) {
					ill_frag_free_pkts(ill, ipfb, ipf, 1);
					freemsg(mp);
					mutex_exit(&ipfb->ipfb_lock);
					return (B_FALSE);
				}
				/* Found it. */
				break;
			}
			ipfp = &ipf->ipf_hash_next;
			continue;
		}

		/*
		 * End of the bucket chain: no queue exists for this packet,
		 * and ipfp points at the slot where a new one is linked.
		 *
		 * If we pruned the list, do we want to store this new
		 * fragment?  We apply an optimization here based on the
		 * fact that most fragments will be received in order.
		 * So if the offset of this incoming fragment is zero,
		 * it is the first fragment of a new packet.  We will
		 * keep it.  Otherwise drop the fragment, as we have
		 * probably pruned the packet already (since the
		 * packet cannot be found).
		 */
		if (pruned && offset != 0) {
			mutex_exit(&ipfb->ipfb_lock);
			freemsg(mp);
			return (B_FALSE);
		}

		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) {
			/*
			 * Too many fragmented packets in this hash
			 * bucket.  Free the oldest.
			 */
			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
		}

		/* New guy.  Allocate a frag message. */
		mp1 = allocb(sizeof (*ipf), BPRI_MED);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(mp);
			/*
			 * Shared exit for every "not complete yet" path in
			 * this function: drop the bucket lock and report
			 * B_FALSE.  Paths that jump here have either freed
			 * mp or linked it into the reassembly queue.
			 */
reass_done:
			mutex_exit(&ipfb->ipfb_lock);
			return (B_FALSE);
		}

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
		mp1->b_cont = mp;

		/* Initialize the fragment header. */
		ipf = (ipf_t *)mp1->b_rptr;
		ipf->ipf_mp = mp1;
		ipf->ipf_ptphn = ipfp;
		ipfp[0] = ipf;
		ipf->ipf_hash_next = NULL;
		ipf->ipf_ident = ident;
		ipf->ipf_protocol = proto;
		ipf->ipf_src = src;
		ipf->ipf_dst = dst;
		ipf->ipf_nf_hdr_len = 0;
		/* Record reassembly start time. */
		ipf->ipf_timestamp = gethrestime_sec();
		/* Record ipf generation and account for frag header */
		ipf->ipf_gen = ill->ill_ipf_gen++;
		ipf->ipf_count = MBLKSIZE(mp1);
		ipf->ipf_last_frag_seen = B_FALSE;
		ipf->ipf_ecn = ecn_info;
		ipf->ipf_num_dups = 0;
		ipfb->ipfb_frag_pkts++;
		ipf->ipf_checksum = 0;
		ipf->ipf_checksum_flags = 0;

		/* Store checksum value in fragment header */
		if (sum_flags != 0) {
			/* Fold the 32-bit sum down to 16 bits (twice). */
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			ipf->ipf_checksum = sum_val;
			ipf->ipf_checksum_flags = sum_flags;
		}

		/*
		 * We handle reassembly two ways.  In the easy case,
		 * where all the fragments show up in order, we do
		 * minimal bookkeeping, and just clip new pieces on
		 * the end.  If we ever see a hole, then we go off
		 * to ip_reassemble which has to mark the pieces and
		 * keep track of the number of holes, etc.  Obviously,
		 * the point of having both mechanisms is so we can
		 * handle the easy case as efficiently as possible.
		 */
		if (offset == 0) {
			/* Easy case, in-order reassembly so far. */
			ipf->ipf_count += msg_len;
			ipf->ipf_tail_mp = tail_mp;
			/*
			 * Keep track of next expected offset in
			 * ipf_end.
			 */
			ipf->ipf_end = end;
			ipf->ipf_nf_hdr_len = hdr_length;
		} else {
			/* Hard case, hole at the beginning. */
			ipf->ipf_tail_mp = NULL;
			/*
			 * ipf_end == 0 means that we have given up
			 * on easy reassembly.
			 */
			ipf->ipf_end = 0;

			/* Forget checksum offload from now on */
			ipf->ipf_checksum_flags = 0;

			/*
			 * ipf_hole_cnt is set by ip_reassemble.
			 * ipf_count is updated by ip_reassemble.
			 * No need to check for return value here
			 * as we don't expect reassembly to complete
			 * or fail for the first fragment itself.
			 */
			(void) ip_reassemble(mp, ipf,
			    (frag_offset_flags & IPH_OFFSET) << 3,
			    (frag_offset_flags & IPH_MF), ill, msg_len);
		}
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += ipf->ipf_count;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
		/* If the frag timer wasn't already going, start it. */
		mutex_enter(&ill->ill_lock);
		ill_frag_timer_start(ill);
		mutex_exit(&ill->ill_lock);
		/* First fragment of a new datagram is queued; wait for more. */
		goto reass_done;
	}

	/*
	 * If the packet's flag has changed (it could be coming up
	 * from an interface different than the previous, therefore
	 * possibly different checksum capability), then forget about
	 * any stored checksum states.  Otherwise add the value to
	 * the existing one stored in the fragment header.
	 */
	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
		sum_val += ipf->ipf_checksum;
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		ipf->ipf_checksum = sum_val;
	} else if (ipf->ipf_checksum_flags != 0) {
		/* Forget checksum offload from now on */
		ipf->ipf_checksum_flags = 0;
	}

	/*
	 * We have a new piece of a datagram which is already being
	 * reassembled.  Update the ECN info if all IP fragments
	 * are ECN capable.  If there is one which is not, clear
	 * all the info.  If there is at least one which has CE
	 * code point, IP needs to report that up to transport.
	 */
	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
		if (ecn_info == IPH_ECN_CE)
			ipf->ipf_ecn = IPH_ECN_CE;
	} else {
		ipf->ipf_ecn = IPH_ECN_NECT;
	}
	if (offset && ipf->ipf_end == offset) {
		/* The new fragment fits at the end */
		ipf->ipf_tail_mp->b_cont = mp;
		/* Update the byte count */
		ipf->ipf_count += msg_len;
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += msg_len;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		atomic_add_32(&ill->ill_frag_count, msg_len);
		if (frag_offset_flags & IPH_MF) {
			/* More to come. */
			ipf->ipf_end = end;
			ipf->ipf_tail_mp = tail_mp;
			goto reass_done;
		}
		/* MF clear: that was the last piece, fall through to finish. */
	} else {
		/* Go do the hard cases. */
		int ret;

		if (offset == 0)
			ipf->ipf_nf_hdr_len = hdr_length;

		/* Save current byte count */
		count = ipf->ipf_count;
		ret = ip_reassemble(mp, ipf,
		    (frag_offset_flags & IPH_OFFSET) << 3,
		    (frag_offset_flags & IPH_MF), ill, msg_len);
		/* Count of bytes added and subtracted (freeb()ed) */
		count = ipf->ipf_count - count;
		if (count) {
			/* Update per ipfb and ill byte counts */
			ipfb->ipfb_count += count;
			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
			atomic_add_32(&ill->ill_frag_count, count);
		}
		if (ret == IP_REASS_PARTIAL) {
			goto reass_done;
		} else if (ret == IP_REASS_FAILED) {
			/* Reassembly failed.  Free up all resources */
			ill_frag_free_pkts(ill, ipfb, ipf, 1);
			/* Zero the overloaded start/end fields before freeing. */
			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
				IP_REASS_SET_START(t_mp, 0);
				IP_REASS_SET_END(t_mp, 0);
			}
			freemsg(mp);
			goto reass_done;
		}
		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
	}
	/*
	 * We have completed reassembly.  Unhook the frag header from
	 * the reassembly list.
	 *
	 * Before we free the frag header, record the ECN info
	 * to report back to the transport.
	 */
	ecn_info = ipf->ipf_ecn;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
	ipfp = ipf->ipf_ptphn;

	/* We need to supply these to caller */
	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
		sum_val = ipf->ipf_checksum;
	else
		sum_val = 0;

	/* Unlink ipf from the bucket chain and give back its byte count. */
	mp1 = ipf->ipf_mp;
	count = ipf->ipf_count;
	ipf = ipf->ipf_hash_next;
	if (ipf != NULL)
		ipf->ipf_ptphn = ipfp;
	ipfp[0] = ipf;
	atomic_add_32(&ill->ill_frag_count, -count);
	ASSERT(ipfb->ipfb_count >= count);
	ipfb->ipfb_count -= count;
	ipfb->ipfb_frag_pkts--;
	mutex_exit(&ipfb->ipfb_lock);
	/* Ditch the frag header. */
	mp = mp1->b_cont;

	freeb(mp1);

	/* Restore original IP length in header. */
	packet_size = (uint32_t)msgdsize(mp);
	if (packet_size > IP_MAXPACKET) {
		freemsg(mp);
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
		return (B_FALSE);
	}

	/* The header is rewritten below, so work on a private copy. */
	if (DB_REF(mp) > 1) {
		mblk_t *mp2 = copymsg(mp);

		freemsg(mp);
		if (mp2 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			return (B_FALSE);
		}
		mp = mp2;
	}
	ipha = (ipha_t *)mp->b_rptr;

	ipha->ipha_length = htons((uint16_t)packet_size);
	/* We're now complete, zip the frag state */
	ipha->ipha_fragment_offset_and_flags = 0;
	/* Record the ECN info. */
	ipha->ipha_type_of_service &= 0xFC;
	ipha->ipha_type_of_service |= ecn_info;
	*mpp = mp;

	/* Reassembly is successful; return checksum information if needed */
	if (cksum_val != NULL)
		*cksum_val = sum_val;
	if (cksum_flags != NULL)
		*cksum_flags = sum_flags;

	return (B_TRUE);
}

/*
 * Verify the IP header checksum (unless an IPsec M_CTL is present) and
 * then process the local IP options via ip_rput_local_options().
 * Returns B_TRUE if all is well, else returns B_FALSE and releases
 * the mp.  The caller is responsible for decrementing the ire ref cnt.
 */
static boolean_t
ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ip_stack_t *ipst)
{
	mblk_t		*first_mp;
	boolean_t	mctl_present;
	uint16_t	sum;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	/*
	 * Don't do the checksum if it has gone through AH/ESP
	 * processing.
	 */
	if (!mctl_present) {
		sum = ip_csum_hdr(ipha);
		if (sum != 0) {
			/* Charge the error to the ill if known, else the stack. */
			if (ill != NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
			} else {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsInCksumErrs);
			}
			freemsg(first_mp);
			return (B_FALSE);
		}
	}

	if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) {
		/*
		 * Only the M_CTL is released here.
		 * NOTE(review): presumably ip_rput_local_options() has
		 * already disposed of mp on failure - confirm.
		 */
		if (mctl_present)
			freeb(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * All UDP packets are delivered to the local host via this routine.
 *
 *	q		read queue; q->q_ptr is the ill used for MIB
 *			accounting and hardware checksum capability checks.
 *	mp		incoming message, possibly preceded by an IPsec M_CTL
 *			(split into first_mp/mp by EXTRACT_PKT_MP).
 *	ipha		IP header of the datagram; must carry IPPROTO_UDP.
 *	ire		matching ire; its packet count, last-used time,
 *			type and zoneid are consulted.
 *	recv_ill	ill supplying the ip_stack_t; it is also passed on to
 *			ip_udp_check() and ip_fanout_udp().
 *
 * A datagram with no IP options, no fragmentation, both headers in the
 * first mblk and a conn found by the classifier is handed straight to
 * that conn.  Everything else goes to ip_fanout_udp(), after option
 * processing, reassembly and/or pullup as required.  The message is
 * consumed on all paths.
 */
void
ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ill_t *recv_ill)
{
	uint32_t	sum;
	uint32_t	u1;
	boolean_t	mctl_present;
	conn_t		*connp;
	mblk_t		*first_mp;
	uint16_t	*up;
	ill_t		*ill = (ill_t *)q->q_ptr;
	uint16_t	reass_hck_flags = 0;
	ip_stack_t	*ipst;

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

/* rptr always tracks the current value of ipha. */
#define	rptr	((uchar_t *)ipha)

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
	ASSERT(ipha->ipha_protocol == IPPROTO_UDP);
	ASSERT(ill != NULL);

	/*
	 * FAST PATH for udp packets
	 */

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1 != 0)
		goto ipoptions;

	/* Check the IP header checksum. */
	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
		/* Clear the IP header h/w cksum flag */
		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
	} else if (!mctl_present) {
		/*
		 * Don't verify header checksum if this packet is coming
		 * back from AH/ESP as we already did it.
		 */
#define	uph	((uint16_t *)ipha)
		/* Sum the ten 16-bit words of the option-less header. */
		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
		    uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
		/* finish doing IP checksum */
		sum = (sum & 0xFFFF) + (sum >> 16);
		sum = ~(sum + (sum >> 16)) & 0xFFFF;
		if (sum != 0 && sum != 0xFFFF) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * Count for SNMP of inbound packets for ire.
	 * if mctl is present this might be a secure packet and
	 * has already been counted for in ip_proto_input().
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet? */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		goto fragmented;
	}

	/* u1 = IP header length (20 bytes) */
	u1 = IP_SIMPLE_HDR_LENGTH;

	/* packet does not contain complete IP & UDP headers */
	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
		goto udppullup;

	/* up points to UDP header */
	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
/* iphs stays defined until the end of the function; the slow path uses it. */
#define	iphs	((uint16_t *)ipha)

	/* if udp hdr cksum != 0, then need to checksum udp packet */
	if (up[3] != 0) {
		mblk_t *mp1 = mp->b_cont;
		boolean_t cksum_err;
		uint16_t hck_flags = 0;

		/* Pseudo-header checksum */
		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2];

		/*
		 * Revert to software checksum calculation if the interface
		 * isn't capable of checksum offload or if IPsec is present.
		 */
		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
			hck_flags = DB_CKSUMFLAGS(mp);

		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ipst, ip_in_sw_cksum);

		IP_CKSUM_RECV(hck_flags, u1,
		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
		    (int32_t)((uchar_t *)up - rptr),
		    mp, mp1, cksum_err);

		if (cksum_err) {
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
			if (hck_flags & HCK_FULLCKSUM)
				IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
			else if (hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ipst, ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			return;
		}
	}

	/* Non-fragmented broadcast or multicast packet? */
	if (ire->ire_type == IRE_BROADCAST)
		goto udpslowpath;

	if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
	    ire->ire_zoneid, ipst)) != NULL) {
		ASSERT(connp->conn_upq != NULL);
		IP_STAT(ipst, ip_udp_fast_path);

		if (CONN_UDP_FLOWCTLD(connp)) {
			/* Receiver is flow controlled: drop the data mblk. */
			freemsg(mp);
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
		} else {
			if (!mctl_present) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsHCInDelivers);
			}
			/*
			 * mp and first_mp can change.
			 */
			if (ip_udp_check(q, connp, recv_ill,
			    ipha, &mp, &first_mp, mctl_present, ire)) {
				/* Send it upstream */
				(connp->conn_recv)(connp, mp, NULL);
			}
		}
		/*
		 * freeb() cannot deal with null mblk being passed
		 * in and first_mp can be set to null by the
		 * ip_udp_check() call above (it stores the result of
		 * ipsec_check_inbound_policy() through &first_mp).
		 */
		if (mctl_present && first_mp != NULL) {
			freeb(first_mp);
		}
		/* Release the conn reference that is dropped on this path. */
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * if we got here we know the packet is not fragmented and
	 * has no options.  The classifier could not find a conn_t and
	 * most likely it's an icmp packet so send it through slow path.
	 */

	goto udpslowpath;

ipoptions:
	/* On failure ip_options_cksum() has already released the message. */
	if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) {
		goto slow_done;
	}

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		/* The fast path jumps in here for option-less fragments. */
fragmented:
		/*
		 * "sum" and "reass_hck_flags" are non-zero if the
		 * reassembled packet has a valid hardware computed
		 * checksum information associated with it.
		 */
		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
			goto slow_done;
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ASSERT(!mctl_present);
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/* Now we have a complete datagram, destined for this machine. */
	u1 = IPH_HDR_LENGTH(ipha);
	/* Pull up the UDP header, if necessary. */
	if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
		/* The fast path jumps in here with u1 == IP_SIMPLE_HDR_LENGTH. */
udppullup:
		if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			goto slow_done;
		}
		/* Re-read the header pointer after the pullup. */
		ipha = (ipha_t *)mp->b_rptr;
	}

	/*
	 * Validate the checksum for the reassembled packet; for the
	 * pullup case we calculate the payload checksum in software.
	 */
	up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
	if (up[3] != 0) {
		boolean_t cksum_err;

		if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ipst, ip_in_sw_cksum);

		IP_CKSUM_RECV_REASS(reass_hck_flags,
		    (int32_t)((uchar_t *)up - (uchar_t *)ipha),
		    IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2], sum, cksum_err);

		if (cksum_err) {
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);

			if (reass_hck_flags & HCK_FULLCKSUM)
				IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
			else if (reass_hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ipst, ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			goto slow_done;
		}
	}
udpslowpath:

	/* Clear hardware checksum flag to be safe */
	DB_CKSUMFLAGS(mp) = 0;

	/* The 32 bits at "up" are passed on as the UDP port pair. */
	ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
	    (ire->ire_type == IRE_BROADCAST),
	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO,
	    mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);

slow_done:
	IP_STAT(ipst, ip_udp_slow_path);
	return;

#undef	iphs
#undef	rptr
}

/* ARGSUSED */
static mblk_t *
ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
    ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q,
    ill_rx_ring_t *ill_ring)
{
	conn_t		*connp;
	uint32_t	sum;
	uint32_t	u1;
	uint16_t	*up;
	int		offset;
	ssize_t		len;
	mblk_t		*mp1;
	boolean_t	syn_present = B_FALSE;
	tcph_t		*tcph;
	uint_t		ip_hdr_len;
	ill_t		*ill = (ill_t *)q->q_ptr;
	zoneid_t	zoneid = ire->ire_zoneid;
	boolean_t	cksum_err;
	uint16_t	hck_flags = 0;
	ip_stack_t	*ipst = recv_ill->ill_ipst;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;

12983 #define rptr ((uchar_t *)ipha) 12984 12985 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12986 ASSERT(ill != NULL); 12987 12988 /* 12989 * FAST PATH for tcp packets 12990 */ 12991 12992 /* u1 is # words of IP options */ 12993 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12994 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12995 12996 /* IP options present */ 12997 if (u1) { 12998 goto ipoptions; 12999 } else if (!mctl_present) { 13000 /* Check the IP header checksum. */ 13001 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 13002 /* Clear the IP header h/w cksum flag */ 13003 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 13004 } else if (!mctl_present) { 13005 /* 13006 * Don't verify header checksum if this packet 13007 * is coming back from AH/ESP as we already did it. 13008 */ 13009 #define uph ((uint16_t *)ipha) 13010 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 13011 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 13012 #undef uph 13013 /* finish doing IP checksum */ 13014 sum = (sum & 0xFFFF) + (sum >> 16); 13015 sum = ~(sum + (sum >> 16)) & 0xFFFF; 13016 if (sum != 0 && sum != 0xFFFF) { 13017 BUMP_MIB(ill->ill_ip_mib, 13018 ipIfStatsInCksumErrs); 13019 goto error; 13020 } 13021 } 13022 } 13023 13024 if (!mctl_present) { 13025 UPDATE_IB_PKT_COUNT(ire); 13026 ire->ire_last_used_time = lbolt; 13027 } 13028 13029 /* packet part of fragmented IP packet? */ 13030 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13031 if (u1 & (IPH_MF | IPH_OFFSET)) { 13032 goto fragmented; 13033 } 13034 13035 /* u1 = IP header length (20 bytes) */ 13036 u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; 13037 13038 /* does packet contain IP+TCP headers? */ 13039 len = mp->b_wptr - rptr; 13040 if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { 13041 IP_STAT(ipst, ip_tcppullup); 13042 goto tcppullup; 13043 } 13044 13045 /* TCP options present? 
*/ 13046 offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; 13047 13048 /* 13049 * If options need to be pulled up, then goto tcpoptions. 13050 * otherwise we are still in the fast path 13051 */ 13052 if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { 13053 IP_STAT(ipst, ip_tcpoptions); 13054 goto tcpoptions; 13055 } 13056 13057 /* multiple mblks of tcp data? */ 13058 if ((mp1 = mp->b_cont) != NULL) { 13059 /* more then two? */ 13060 if (mp1->b_cont != NULL) { 13061 IP_STAT(ipst, ip_multipkttcp); 13062 goto multipkttcp; 13063 } 13064 len += mp1->b_wptr - mp1->b_rptr; 13065 } 13066 13067 up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); 13068 13069 /* part of pseudo checksum */ 13070 13071 /* TCP datagram length */ 13072 u1 = len - IP_SIMPLE_HDR_LENGTH; 13073 13074 #define iphs ((uint16_t *)ipha) 13075 13076 #ifdef _BIG_ENDIAN 13077 u1 += IPPROTO_TCP; 13078 #else 13079 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 13080 #endif 13081 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 13082 13083 /* 13084 * Revert to software checksum calculation if the interface 13085 * isn't capable of checksum offload or if IPsec is present. 
13086 */ 13087 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 13088 hck_flags = DB_CKSUMFLAGS(mp); 13089 13090 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 13091 IP_STAT(ipst, ip_in_sw_cksum); 13092 13093 IP_CKSUM_RECV(hck_flags, u1, 13094 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 13095 (int32_t)((uchar_t *)up - rptr), 13096 mp, mp1, cksum_err); 13097 13098 if (cksum_err) { 13099 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 13100 13101 if (hck_flags & HCK_FULLCKSUM) 13102 IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); 13103 else if (hck_flags & HCK_PARTIALCKSUM) 13104 IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); 13105 else 13106 IP_STAT(ipst, ip_tcp_in_sw_cksum_err); 13107 13108 goto error; 13109 } 13110 13111 try_again: 13112 13113 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 13114 zoneid, ipst)) == NULL) { 13115 /* Send the TH_RST */ 13116 goto no_conn; 13117 } 13118 13119 /* 13120 * TCP FAST PATH for AF_INET socket. 13121 * 13122 * TCP fast path to avoid extra work. An AF_INET socket type 13123 * does not have facility to receive extra information via 13124 * ip_process or ip_add_info. Also, when the connection was 13125 * established, we made a check if this connection is impacted 13126 * by any global IPsec policy or per connection policy (a 13127 * policy that comes in effect later will not apply to this 13128 * connection). Since all this can be determined at the 13129 * connection establishment time, a quick check of flags 13130 * can avoid extra work. 
13131 */ 13132 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 13133 !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13134 ASSERT(first_mp == mp); 13135 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13136 SET_SQUEUE(mp, tcp_rput_data, connp); 13137 return (mp); 13138 } 13139 13140 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 13141 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 13142 if (IPCL_IS_TCP(connp)) { 13143 mp->b_datap->db_struioflag |= STRUIO_EAGER; 13144 DB_CKSUMSTART(mp) = 13145 (intptr_t)ip_squeue_get(ill_ring); 13146 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 13147 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13148 BUMP_MIB(ill->ill_ip_mib, 13149 ipIfStatsHCInDelivers); 13150 SET_SQUEUE(mp, connp->conn_recv, connp); 13151 return (mp); 13152 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 13153 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13154 BUMP_MIB(ill->ill_ip_mib, 13155 ipIfStatsHCInDelivers); 13156 ip_squeue_enter_unbound++; 13157 SET_SQUEUE(mp, tcp_conn_request_unbound, 13158 connp); 13159 return (mp); 13160 } 13161 syn_present = B_TRUE; 13162 } 13163 13164 } 13165 13166 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 13167 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 13168 13169 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13170 /* No need to send this packet to TCP */ 13171 if ((flags & TH_RST) || (flags & TH_URG)) { 13172 CONN_DEC_REF(connp); 13173 freemsg(first_mp); 13174 return (NULL); 13175 } 13176 if (flags & TH_ACK) { 13177 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 13178 ipst->ips_netstack->netstack_tcp, connp); 13179 CONN_DEC_REF(connp); 13180 return (NULL); 13181 } 13182 13183 CONN_DEC_REF(connp); 13184 freemsg(first_mp); 13185 return (NULL); 13186 } 13187 13188 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 13189 first_mp = ipsec_check_inbound_policy(first_mp, connp, 13190 ipha, NULL, mctl_present); 13191 if (first_mp == NULL) { 13192 
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13193 CONN_DEC_REF(connp); 13194 return (NULL); 13195 } 13196 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 13197 ASSERT(syn_present); 13198 if (mctl_present) { 13199 ASSERT(first_mp != mp); 13200 first_mp->b_datap->db_struioflag |= 13201 STRUIO_POLICY; 13202 } else { 13203 ASSERT(first_mp == mp); 13204 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 13205 mp->b_datap->db_struioflag |= STRUIO_POLICY; 13206 } 13207 } else { 13208 /* 13209 * Discard first_mp early since we're dealing with a 13210 * fully-connected conn_t and tcp doesn't do policy in 13211 * this case. 13212 */ 13213 if (mctl_present) { 13214 freeb(first_mp); 13215 mctl_present = B_FALSE; 13216 } 13217 first_mp = mp; 13218 } 13219 } 13220 13221 /* Initiate IPPF processing for fastpath */ 13222 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13223 uint32_t ill_index; 13224 13225 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13226 ip_process(IPP_LOCAL_IN, &mp, ill_index); 13227 if (mp == NULL) { 13228 ip2dbg(("ip_input_ipsec_process: TCP pkt " 13229 "deferred/dropped during IPPF processing\n")); 13230 CONN_DEC_REF(connp); 13231 if (mctl_present) 13232 freeb(first_mp); 13233 return (NULL); 13234 } else if (mctl_present) { 13235 /* 13236 * ip_process might return a new mp. 13237 */ 13238 ASSERT(first_mp != mp); 13239 first_mp->b_cont = mp; 13240 } else { 13241 first_mp = mp; 13242 } 13243 13244 } 13245 13246 if (!syn_present && connp->conn_ip_recvpktinfo) { 13247 /* 13248 * TCP does not support IP_RECVPKTINFO for v4 so lets 13249 * make sure IPF_RECVIF is passed to ip_add_info. 13250 */ 13251 mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, 13252 IPCL_ZONEID(connp), ipst); 13253 if (mp == NULL) { 13254 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13255 CONN_DEC_REF(connp); 13256 if (mctl_present) 13257 freeb(first_mp); 13258 return (NULL); 13259 } else if (mctl_present) { 13260 /* 13261 * ip_add_info might return a new mp. 
13262 */ 13263 ASSERT(first_mp != mp); 13264 first_mp->b_cont = mp; 13265 } else { 13266 first_mp = mp; 13267 } 13268 } 13269 13270 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13271 if (IPCL_IS_TCP(connp)) { 13272 SET_SQUEUE(first_mp, connp->conn_recv, connp); 13273 return (first_mp); 13274 } else { 13275 /* SOCK_RAW, IPPROTO_TCP case */ 13276 (connp->conn_recv)(connp, first_mp, NULL); 13277 CONN_DEC_REF(connp); 13278 return (NULL); 13279 } 13280 13281 no_conn: 13282 /* Initiate IPPf processing, if needed. */ 13283 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13284 uint32_t ill_index; 13285 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13286 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 13287 if (first_mp == NULL) { 13288 return (NULL); 13289 } 13290 } 13291 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13292 13293 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, 13294 ipst->ips_netstack->netstack_tcp, NULL); 13295 return (NULL); 13296 ipoptions: 13297 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { 13298 goto slow_done; 13299 } 13300 13301 UPDATE_IB_PKT_COUNT(ire); 13302 ire->ire_last_used_time = lbolt; 13303 13304 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13305 if (u1 & (IPH_MF | IPH_OFFSET)) { 13306 fragmented: 13307 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 13308 if (mctl_present) 13309 freeb(first_mp); 13310 goto slow_done; 13311 } 13312 /* 13313 * Make sure that first_mp points back to mp as 13314 * the mp we came in with could have changed in 13315 * ip_rput_fragment(). 13316 */ 13317 ASSERT(!mctl_present); 13318 ipha = (ipha_t *)mp->b_rptr; 13319 first_mp = mp; 13320 } 13321 13322 /* Now we have a complete datagram, destined for this machine. */ 13323 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 13324 13325 len = mp->b_wptr - mp->b_rptr; 13326 /* Pull up a minimal TCP header, if necessary. 
*/ 13327 if (len < (u1 + 20)) { 13328 tcppullup: 13329 if (!pullupmsg(mp, u1 + 20)) { 13330 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13331 goto error; 13332 } 13333 ipha = (ipha_t *)mp->b_rptr; 13334 len = mp->b_wptr - mp->b_rptr; 13335 } 13336 13337 /* 13338 * Extract the offset field from the TCP header. As usual, we 13339 * try to help the compiler more than the reader. 13340 */ 13341 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 13342 if (offset != 5) { 13343 tcpoptions: 13344 if (offset < 5) { 13345 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13346 goto error; 13347 } 13348 /* 13349 * There must be TCP options. 13350 * Make sure we can grab them. 13351 */ 13352 offset <<= 2; 13353 offset += u1; 13354 if (len < offset) { 13355 if (!pullupmsg(mp, offset)) { 13356 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13357 goto error; 13358 } 13359 ipha = (ipha_t *)mp->b_rptr; 13360 len = mp->b_wptr - rptr; 13361 } 13362 } 13363 13364 /* Get the total packet length in len, including headers. */ 13365 if (mp->b_cont) { 13366 multipkttcp: 13367 len = msgdsize(mp); 13368 } 13369 13370 /* 13371 * Check the TCP checksum by pulling together the pseudo- 13372 * header checksum, and passing it to ip_csum to be added in 13373 * with the TCP datagram. 13374 * 13375 * Since we are not using the hwcksum if available we must 13376 * clear the flag. We may come here via tcppullup or tcpoptions. 13377 * If either of these fails along the way the mblk is freed. 13378 * If this logic ever changes and mblk is reused to say send 13379 * ICMP's back, then this flag may need to be cleared in 13380 * other places as well. 13381 */ 13382 DB_CKSUMFLAGS(mp) = 0; 13383 13384 up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); 13385 13386 u1 = (uint32_t)(len - u1); /* TCP datagram length. 
*/ 13387 #ifdef _BIG_ENDIAN 13388 u1 += IPPROTO_TCP; 13389 #else 13390 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 13391 #endif 13392 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 13393 /* 13394 * Not M_DATA mblk or its a dup, so do the checksum now. 13395 */ 13396 IP_STAT(ipst, ip_in_sw_cksum); 13397 if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { 13398 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 13399 goto error; 13400 } 13401 13402 IP_STAT(ipst, ip_tcp_slow_path); 13403 goto try_again; 13404 #undef iphs 13405 #undef rptr 13406 13407 error: 13408 freemsg(first_mp); 13409 slow_done: 13410 return (NULL); 13411 } 13412 13413 /* ARGSUSED */ 13414 static void 13415 ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 13416 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) 13417 { 13418 conn_t *connp; 13419 uint32_t sum; 13420 uint32_t u1; 13421 ssize_t len; 13422 sctp_hdr_t *sctph; 13423 zoneid_t zoneid = ire->ire_zoneid; 13424 uint32_t pktsum; 13425 uint32_t calcsum; 13426 uint32_t ports; 13427 in6_addr_t map_src, map_dst; 13428 ill_t *ill = (ill_t *)q->q_ptr; 13429 ip_stack_t *ipst; 13430 sctp_stack_t *sctps; 13431 boolean_t sctp_csum_err = B_FALSE; 13432 13433 ASSERT(recv_ill != NULL); 13434 ipst = recv_ill->ill_ipst; 13435 sctps = ipst->ips_netstack->netstack_sctp; 13436 13437 #define rptr ((uchar_t *)ipha) 13438 13439 ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); 13440 ASSERT(ill != NULL); 13441 13442 /* u1 is # words of IP options */ 13443 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 13444 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 13445 13446 /* IP options present */ 13447 if (u1 > 0) { 13448 goto ipoptions; 13449 } else { 13450 /* Check the IP header checksum. 
*/ 13451 if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) && 13452 !mctl_present) { 13453 #define uph ((uint16_t *)ipha) 13454 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 13455 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 13456 #undef uph 13457 /* finish doing IP checksum */ 13458 sum = (sum & 0xFFFF) + (sum >> 16); 13459 sum = ~(sum + (sum >> 16)) & 0xFFFF; 13460 /* 13461 * Don't verify header checksum if this packet 13462 * is coming back from AH/ESP as we already did it. 13463 */ 13464 if (sum != 0 && sum != 0xFFFF) { 13465 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 13466 goto error; 13467 } 13468 } 13469 /* 13470 * Since there is no SCTP h/w cksum support yet, just 13471 * clear the flag. 13472 */ 13473 DB_CKSUMFLAGS(mp) = 0; 13474 } 13475 13476 /* 13477 * Don't verify header checksum if this packet is coming 13478 * back from AH/ESP as we already did it. 13479 */ 13480 if (!mctl_present) { 13481 UPDATE_IB_PKT_COUNT(ire); 13482 ire->ire_last_used_time = lbolt; 13483 } 13484 13485 /* packet part of fragmented IP packet? */ 13486 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13487 if (u1 & (IPH_MF | IPH_OFFSET)) 13488 goto fragmented; 13489 13490 /* u1 = IP header length (20 bytes) */ 13491 u1 = IP_SIMPLE_HDR_LENGTH; 13492 13493 find_sctp_client: 13494 /* Pullup if we don't have the sctp common header. 
*/ 13495 len = MBLKL(mp); 13496 if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { 13497 if (mp->b_cont == NULL || 13498 !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { 13499 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13500 goto error; 13501 } 13502 ipha = (ipha_t *)mp->b_rptr; 13503 len = MBLKL(mp); 13504 } 13505 13506 sctph = (sctp_hdr_t *)(rptr + u1); 13507 #ifdef DEBUG 13508 if (!skip_sctp_cksum) { 13509 #endif 13510 pktsum = sctph->sh_chksum; 13511 sctph->sh_chksum = 0; 13512 calcsum = sctp_cksum(mp, u1); 13513 sctph->sh_chksum = pktsum; 13514 if (calcsum != pktsum) 13515 sctp_csum_err = B_TRUE; 13516 #ifdef DEBUG /* skip_sctp_cksum */ 13517 } 13518 #endif 13519 /* get the ports */ 13520 ports = *(uint32_t *)&sctph->sh_sport; 13521 13522 IRE_REFRELE(ire); 13523 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 13524 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 13525 if (sctp_csum_err) { 13526 /* 13527 * No potential sctp checksum errors go to the Sun 13528 * sctp stack however they might be Adler-32 summed 13529 * packets a userland stack bound to a raw IP socket 13530 * could reasonably use. Note though that Adler-32 is 13531 * a long deprecated algorithm and customer sctp 13532 * networks should eventually migrate to CRC-32 at 13533 * which time this facility should be removed. 
13534 */ 13535 flags |= IP_FF_SCTP_CSUM_ERR; 13536 goto no_conn; 13537 } 13538 if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, 13539 sctps)) == NULL) { 13540 /* Check for raw socket or OOTB handling */ 13541 goto no_conn; 13542 } 13543 13544 /* Found a client; up it goes */ 13545 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13546 sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); 13547 return; 13548 13549 no_conn: 13550 ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, 13551 ports, mctl_present, flags, B_TRUE, zoneid); 13552 return; 13553 13554 ipoptions: 13555 DB_CKSUMFLAGS(mp) = 0; 13556 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) 13557 goto slow_done; 13558 13559 UPDATE_IB_PKT_COUNT(ire); 13560 ire->ire_last_used_time = lbolt; 13561 13562 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13563 if (u1 & (IPH_MF | IPH_OFFSET)) { 13564 fragmented: 13565 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) 13566 goto slow_done; 13567 /* 13568 * Make sure that first_mp points back to mp as 13569 * the mp we came in with could have changed in 13570 * ip_rput_fragment(). 13571 */ 13572 ASSERT(!mctl_present); 13573 ipha = (ipha_t *)mp->b_rptr; 13574 first_mp = mp; 13575 } 13576 13577 /* Now we have a complete datagram, destined for this machine. 
*/ 13578 u1 = IPH_HDR_LENGTH(ipha); 13579 goto find_sctp_client; 13580 #undef iphs 13581 #undef rptr 13582 13583 error: 13584 freemsg(first_mp); 13585 slow_done: 13586 IRE_REFRELE(ire); 13587 } 13588 13589 #define VER_BITS 0xF0 13590 #define VERSION_6 0x60 13591 13592 static boolean_t 13593 ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, 13594 ipaddr_t *dstp, ip_stack_t *ipst) 13595 { 13596 uint_t opt_len; 13597 ipha_t *ipha; 13598 ssize_t len; 13599 uint_t pkt_len; 13600 13601 ASSERT(ill != NULL); 13602 IP_STAT(ipst, ip_ipoptions); 13603 ipha = *iphapp; 13604 13605 #define rptr ((uchar_t *)ipha) 13606 /* Assume no IPv6 packets arrive over the IPv4 queue */ 13607 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 13608 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); 13609 freemsg(mp); 13610 return (B_FALSE); 13611 } 13612 13613 /* multiple mblk or too short */ 13614 pkt_len = ntohs(ipha->ipha_length); 13615 13616 /* Get the number of words of IP options in the IP header. */ 13617 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 13618 if (opt_len) { 13619 /* IP Options present! Validate and process. */ 13620 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 13621 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13622 goto done; 13623 } 13624 /* 13625 * Recompute complete header length and make sure we 13626 * have access to all of it. 13627 */ 13628 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 13629 if (len > (mp->b_wptr - rptr)) { 13630 if (len > pkt_len) { 13631 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13632 goto done; 13633 } 13634 if (!pullupmsg(mp, len)) { 13635 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13636 goto done; 13637 } 13638 ipha = (ipha_t *)mp->b_rptr; 13639 } 13640 /* 13641 * Go off to ip_rput_options which returns the next hop 13642 * destination address, which may have been affected 13643 * by source routing. 
13644 */ 13645 IP_STAT(ipst, ip_opt); 13646 if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { 13647 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13648 return (B_FALSE); 13649 } 13650 } 13651 *iphapp = ipha; 13652 return (B_TRUE); 13653 done: 13654 /* clear b_prev - used by ip_mroute_decap */ 13655 mp->b_prev = NULL; 13656 freemsg(mp); 13657 return (B_FALSE); 13658 #undef rptr 13659 } 13660 13661 /* 13662 * Deal with the fact that there is no ire for the destination. 13663 */ 13664 static ire_t * 13665 ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) 13666 { 13667 ipha_t *ipha; 13668 ill_t *ill; 13669 ire_t *ire; 13670 ip_stack_t *ipst; 13671 enum ire_forward_action ret_action; 13672 13673 ipha = (ipha_t *)mp->b_rptr; 13674 ill = (ill_t *)q->q_ptr; 13675 13676 ASSERT(ill != NULL); 13677 ipst = ill->ill_ipst; 13678 13679 /* 13680 * No IRE for this destination, so it can't be for us. 13681 * Unless we are forwarding, drop the packet. 13682 * We have to let source routed packets through 13683 * since we don't yet know if they are 'ping -l' 13684 * packets i.e. if they will go out over the 13685 * same interface as they came in on. 13686 */ 13687 if (ll_multicast) { 13688 freemsg(mp); 13689 return (NULL); 13690 } 13691 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 13692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13693 freemsg(mp); 13694 return (NULL); 13695 } 13696 13697 /* 13698 * Mark this packet as having originated externally. 13699 * 13700 * For non-forwarding code path, ire_send later double 13701 * checks this interface to see if it is still exists 13702 * post-ARP resolution. 13703 * 13704 * Also, IPQOS uses this to differentiate between 13705 * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP 13706 * QOS packet processing in ip_wput_attach_llhdr(). 13707 * The QoS module can mark the b_band for a fastpath message 13708 * or the dl_priority field in a unitdata_req header for 13709 * CoS marking. 
This info can only be found in 13710 * ip_wput_attach_llhdr(). 13711 */ 13712 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 13713 /* 13714 * Clear the indication that this may have a hardware checksum 13715 * as we are not using it 13716 */ 13717 DB_CKSUMFLAGS(mp) = 0; 13718 13719 ire = ire_forward(dst, &ret_action, NULL, NULL, 13720 MBLK_GETLABEL(mp), ipst); 13721 13722 if (ire == NULL && ret_action == Forward_check_multirt) { 13723 /* Let ip_newroute handle CGTP */ 13724 ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); 13725 return (NULL); 13726 } 13727 13728 if (ire != NULL) 13729 return (ire); 13730 13731 mp->b_prev = mp->b_next = 0; 13732 13733 if (ret_action == Forward_blackhole) { 13734 freemsg(mp); 13735 return (NULL); 13736 } 13737 /* send icmp unreachable */ 13738 q = WR(q); 13739 /* Sent by forwarding path, and router is global zone */ 13740 if (ip_source_routed(ipha, ipst)) { 13741 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, 13742 GLOBAL_ZONEID, ipst); 13743 } else { 13744 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, 13745 ipst); 13746 } 13747 13748 return (NULL); 13749 13750 } 13751 13752 /* 13753 * check ip header length and align it. 
13754 */ 13755 static boolean_t 13756 ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) 13757 { 13758 ssize_t len; 13759 ill_t *ill; 13760 ipha_t *ipha; 13761 13762 len = MBLKL(mp); 13763 13764 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13765 ill = (ill_t *)q->q_ptr; 13766 13767 if (!OK_32PTR(mp->b_rptr)) 13768 IP_STAT(ipst, ip_notaligned1); 13769 else 13770 IP_STAT(ipst, ip_notaligned2); 13771 /* Guard against bogus device drivers */ 13772 if (len < 0) { 13773 /* clear b_prev - used by ip_mroute_decap */ 13774 mp->b_prev = NULL; 13775 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13776 freemsg(mp); 13777 return (B_FALSE); 13778 } 13779 13780 if (ip_rput_pullups++ == 0) { 13781 ipha = (ipha_t *)mp->b_rptr; 13782 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 13783 "ip_check_and_align_header: %s forced us to " 13784 " pullup pkt, hdr len %ld, hdr addr %p", 13785 ill->ill_name, len, (void *)ipha); 13786 } 13787 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 13788 /* clear b_prev - used by ip_mroute_decap */ 13789 mp->b_prev = NULL; 13790 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13791 freemsg(mp); 13792 return (B_FALSE); 13793 } 13794 } 13795 return (B_TRUE); 13796 } 13797 13798 ire_t * 13799 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) 13800 { 13801 ire_t *new_ire; 13802 ill_t *ire_ill; 13803 uint_t ifindex; 13804 ip_stack_t *ipst = ill->ill_ipst; 13805 boolean_t strict_check = B_FALSE; 13806 13807 /* 13808 * This packet came in on an interface other than the one associated 13809 * with the first ire we found for the destination address. We do 13810 * another ire lookup here, using the ingress ill, to see if the 13811 * interface is in an interface group. 13812 * As long as the ills belong to the same group, we don't consider 13813 * them to be arriving on the wrong interface. Thus, if the switch 13814 * is doing inbound load spreading, we won't drop packets when the 13815 * ip*_strict_dst_multihoming switch is on. 
Note, the same holds true 13816 * for 'usesrc groups' where the destination address may belong to 13817 * another interface to allow multipathing to happen. 13818 * We also need to check for IPIF_UNNUMBERED point2point interfaces 13819 * where the local address may not be unique. In this case we were 13820 * at the mercy of the initial ire cache lookup and the IRE_LOCAL it 13821 * actually returned. The new lookup, which is more specific, should 13822 * only find the IRE_LOCAL associated with the ingress ill if one 13823 * exists. 13824 */ 13825 13826 if (ire->ire_ipversion == IPV4_VERSION) { 13827 if (ipst->ips_ip_strict_dst_multihoming) 13828 strict_check = B_TRUE; 13829 new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, 13830 ill->ill_ipif, ALL_ZONES, NULL, 13831 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13832 } else { 13833 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); 13834 if (ipst->ips_ipv6_strict_dst_multihoming) 13835 strict_check = B_TRUE; 13836 new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, 13837 IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, 13838 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13839 } 13840 /* 13841 * If the same ire that was returned in ip_input() is found then this 13842 * is an indication that interface groups are in use. The packet 13843 * arrived on a different ill in the group than the one associated with 13844 * the destination address. If a different ire was found then the same 13845 * IP address must be hosted on multiple ills. This is possible with 13846 * unnumbered point2point interfaces. We switch to use this new ire in 13847 * order to have accurate interface statistics. 
13848 */ 13849 if (new_ire != NULL) { 13850 if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { 13851 ire_refrele(ire); 13852 ire = new_ire; 13853 } else { 13854 ire_refrele(new_ire); 13855 } 13856 return (ire); 13857 } else if ((ire->ire_rfq == NULL) && 13858 (ire->ire_ipversion == IPV4_VERSION)) { 13859 /* 13860 * The best match could have been the original ire which 13861 * was created against an IRE_LOCAL on lo0. In the IPv4 case 13862 * the strict multihoming checks are irrelevant as we consider 13863 * local addresses hosted on lo0 to be interface agnostic. We 13864 * only expect a null ire_rfq on IREs which are associated with 13865 * lo0 hence we can return now. 13866 */ 13867 return (ire); 13868 } 13869 13870 /* 13871 * Chase pointers once and store locally. 13872 */ 13873 ire_ill = (ire->ire_rfq == NULL) ? NULL : 13874 (ill_t *)(ire->ire_rfq->q_ptr); 13875 ifindex = ill->ill_usesrc_ifindex; 13876 13877 /* 13878 * Check if it's a legal address on the 'usesrc' interface. 13879 */ 13880 if ((ifindex != 0) && (ire_ill != NULL) && 13881 (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { 13882 return (ire); 13883 } 13884 13885 /* 13886 * If the ip*_strict_dst_multihoming switch is on then we can 13887 * only accept this packet if the interface is marked as routing. 
13888 */ 13889 if (!(strict_check)) 13890 return (ire); 13891 13892 if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & 13893 ILLF_ROUTER) != 0) { 13894 return (ire); 13895 } 13896 13897 ire_refrele(ire); 13898 return (NULL); 13899 } 13900 13901 ire_t * 13902 ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) 13903 { 13904 ipha_t *ipha; 13905 ire_t *src_ire; 13906 ill_t *stq_ill; 13907 uint_t hlen; 13908 uint_t pkt_len; 13909 uint32_t sum; 13910 queue_t *dev_q; 13911 ip_stack_t *ipst = ill->ill_ipst; 13912 mblk_t *fpmp; 13913 enum ire_forward_action ret_action; 13914 13915 ipha = (ipha_t *)mp->b_rptr; 13916 13917 if (ire != NULL && 13918 ire->ire_zoneid != GLOBAL_ZONEID && 13919 ire->ire_zoneid != ALL_ZONES) { 13920 /* 13921 * Should only use IREs that are visible to the global 13922 * zone for forwarding. 13923 */ 13924 ire_refrele(ire); 13925 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); 13926 } 13927 13928 /* 13929 * Martian Address Filtering [RFC 1812, Section 5.3.7] 13930 * The loopback address check for both src and dst has already 13931 * been checked in ip_input 13932 */ 13933 13934 if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) { 13935 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13936 goto drop; 13937 } 13938 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 13939 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 13940 13941 if (src_ire != NULL) { 13942 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13943 ire_refrele(src_ire); 13944 goto drop; 13945 } 13946 13947 /* No ire cache of nexthop. So first create one */ 13948 if (ire == NULL) { 13949 13950 ire = ire_forward(dst, &ret_action, NULL, NULL, 13951 NULL, ipst); 13952 /* 13953 * We only come to ip_fast_forward if ip_cgtp_filter 13954 * is not set. So ire_forward() should not return with 13955 * Forward_check_multirt as the next action. 
13956 */ 13957 ASSERT(ret_action != Forward_check_multirt); 13958 if (ire == NULL) { 13959 /* An attempt was made to forward the packet */ 13960 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 13961 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13962 mp->b_prev = mp->b_next = 0; 13963 /* send icmp unreachable */ 13964 /* Sent by forwarding path, and router is global zone */ 13965 if (ret_action == Forward_ret_icmp_err) { 13966 if (ip_source_routed(ipha, ipst)) { 13967 icmp_unreachable(ill->ill_wq, mp, 13968 ICMP_SOURCE_ROUTE_FAILED, 13969 GLOBAL_ZONEID, ipst); 13970 } else { 13971 icmp_unreachable(ill->ill_wq, mp, 13972 ICMP_HOST_UNREACHABLE, 13973 GLOBAL_ZONEID, ipst); 13974 } 13975 } else { 13976 freemsg(mp); 13977 } 13978 return (NULL); 13979 } 13980 } 13981 13982 /* 13983 * Forwarding fastpath exception case: 13984 * If either of the follwoing case is true, we take 13985 * the slowpath 13986 * o forwarding is not enabled 13987 * o incoming and outgoing interface are the same, or the same 13988 * IPMP group 13989 * o corresponding ire is in incomplete state 13990 * o packet needs fragmentation 13991 * o ARP cache is not resolved 13992 * 13993 * The codeflow from here on is thus: 13994 * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4 13995 */ 13996 pkt_len = ntohs(ipha->ipha_length); 13997 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 13998 if (!(stq_ill->ill_flags & ILLF_ROUTER) || 13999 !(ill->ill_flags & ILLF_ROUTER) || 14000 (ill == stq_ill) || 14001 (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || 14002 (ire->ire_nce == NULL) || 14003 (pkt_len > ire->ire_max_frag) || 14004 ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || 14005 ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || 14006 ipha->ipha_ttl <= 1) { 14007 ip_rput_process_forward(ill->ill_rq, mp, ire, 14008 ipha, ill, B_FALSE); 14009 return (ire); 14010 } 14011 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 14012 14013 DTRACE_PROBE4(ip4__forwarding__start, 14014 ill_t *, ill, 
ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); 14015 14016 FW_HOOKS(ipst->ips_ip4_forwarding_event, 14017 ipst->ips_ipv4firewall_forwarding, 14018 ill, stq_ill, ipha, mp, mp, 0, ipst); 14019 14020 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 14021 14022 if (mp == NULL) 14023 goto drop; 14024 14025 mp->b_datap->db_struioun.cksum.flags = 0; 14026 /* Adjust the checksum to reflect the ttl decrement. */ 14027 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 14028 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 14029 ipha->ipha_ttl--; 14030 14031 /* 14032 * Write the link layer header. We can do this safely here, 14033 * because we have already tested to make sure that the IP 14034 * policy is not set, and that we have a fast path destination 14035 * header. 14036 */ 14037 mp->b_rptr -= hlen; 14038 bcopy(fpmp->b_rptr, mp->b_rptr, hlen); 14039 14040 UPDATE_IB_PKT_COUNT(ire); 14041 ire->ire_last_used_time = lbolt; 14042 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 14043 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 14044 UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); 14045 14046 dev_q = ire->ire_stq->q_next; 14047 if ((dev_q->q_next != NULL || dev_q->q_first != NULL) && 14048 !canputnext(ire->ire_stq)) { 14049 goto indiscard; 14050 } 14051 if (ILL_DLS_CAPABLE(stq_ill)) { 14052 /* 14053 * Send the packet directly to DLD, where it 14054 * may be queued depending on the availability 14055 * of transmit resources at the media layer. 
14056 */ 14057 IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst); 14058 } else { 14059 DTRACE_PROBE4(ip4__physical__out__start, 14060 ill_t *, NULL, ill_t *, stq_ill, 14061 ipha_t *, ipha, mblk_t *, mp); 14062 FW_HOOKS(ipst->ips_ip4_physical_out_event, 14063 ipst->ips_ipv4firewall_physical_out, 14064 NULL, stq_ill, ipha, mp, mp, 0, ipst); 14065 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 14066 if (mp == NULL) 14067 goto drop; 14068 14069 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 14070 ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, 14071 ip6_t *, NULL, int, 0); 14072 14073 putnext(ire->ire_stq, mp); 14074 } 14075 return (ire); 14076 14077 indiscard: 14078 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14079 drop: 14080 if (mp != NULL) 14081 freemsg(mp); 14082 return (ire); 14083 14084 } 14085 14086 /* 14087 * This function is called in the forwarding slowpath, when 14088 * either the ire lacks the link-layer address, or the packet needs 14089 * further processing(eg. fragmentation), before transmission. 14090 */ 14091 14092 static void 14093 ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, 14094 ill_t *ill, boolean_t ll_multicast) 14095 { 14096 ill_group_t *ill_group; 14097 ill_group_t *ire_group; 14098 queue_t *dev_q; 14099 ire_t *src_ire; 14100 ip_stack_t *ipst = ill->ill_ipst; 14101 14102 ASSERT(ire->ire_stq != NULL); 14103 14104 mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ 14105 mp->b_next = NULL; /* ip_rput_noire sets dst here */ 14106 14107 if (ll_multicast != 0) { 14108 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14109 goto drop_pkt; 14110 } 14111 14112 /* 14113 * check if ipha_src is a broadcast address. Note that this 14114 * check is redundant when we get here from ip_fast_forward() 14115 * which has already done this check. 
However, since we can 14116 * also get here from ip_rput_process_broadcast() or, for 14117 * for the slow path through ip_fast_forward(), we perform 14118 * the check again for code-reusability 14119 */ 14120 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 14121 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 14122 if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) { 14123 if (src_ire != NULL) 14124 ire_refrele(src_ire); 14125 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 14126 ip2dbg(("ip_rput_process_forward: Received packet with" 14127 " bad src/dst address on %s\n", ill->ill_name)); 14128 goto drop_pkt; 14129 } 14130 14131 ill_group = ill->ill_group; 14132 ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; 14133 /* 14134 * Check if we want to forward this one at this time. 14135 * We allow source routed packets on a host provided that 14136 * they go out the same interface or same interface group 14137 * as they came in on. 14138 * 14139 * XXX To be quicker, we may wish to not chase pointers to 14140 * get the ILLF_ROUTER flag and instead store the 14141 * forwarding policy in the ire. An unfortunate 14142 * side-effect of that would be requiring an ire flush 14143 * whenever the ILLF_ROUTER flag changes. 14144 */ 14145 if (((ill->ill_flags & 14146 ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & 14147 ILLF_ROUTER) == 0) && 14148 !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q || 14149 (ill_group != NULL && ill_group == ire_group)))) { 14150 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 14151 if (ip_source_routed(ipha, ipst)) { 14152 q = WR(q); 14153 /* 14154 * Clear the indication that this may have 14155 * hardware checksum as we are not using it. 
14156 */ 14157 DB_CKSUMFLAGS(mp) = 0; 14158 /* Sent by forwarding path, and router is global zone */ 14159 icmp_unreachable(q, mp, 14160 ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst); 14161 return; 14162 } 14163 goto drop_pkt; 14164 } 14165 14166 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 14167 14168 /* Packet is being forwarded. Turning off hwcksum flag. */ 14169 DB_CKSUMFLAGS(mp) = 0; 14170 if (ipst->ips_ip_g_send_redirects) { 14171 /* 14172 * Check whether the incoming interface and outgoing 14173 * interface is part of the same group. If so, 14174 * send redirects. 14175 * 14176 * Check the source address to see if it originated 14177 * on the same logical subnet it is going back out on. 14178 * If so, we should be able to send it a redirect. 14179 * Avoid sending a redirect if the destination 14180 * is directly connected (i.e., ipha_dst is the same 14181 * as ire_gateway_addr or the ire_addr of the 14182 * nexthop IRE_CACHE ), or if the packet was source 14183 * routed out this interface. 14184 */ 14185 ipaddr_t src, nhop; 14186 mblk_t *mp1; 14187 ire_t *nhop_ire = NULL; 14188 14189 /* 14190 * Check whether ire_rfq and q are from the same ill 14191 * or if they are not same, they at least belong 14192 * to the same group. If so, send redirects. 14193 */ 14194 if ((ire->ire_rfq == q || 14195 (ill_group != NULL && ill_group == ire_group)) && 14196 !ip_source_routed(ipha, ipst)) { 14197 14198 nhop = (ire->ire_gateway_addr != 0 ? 14199 ire->ire_gateway_addr : ire->ire_addr); 14200 14201 if (ipha->ipha_dst == nhop) { 14202 /* 14203 * We avoid sending a redirect if the 14204 * destination is directly connected 14205 * because it is possible that multiple 14206 * IP subnets may have been configured on 14207 * the link, and the source may not 14208 * be on the same subnet as ip destination, 14209 * even though they are on the same 14210 * physical link. 
14211 */ 14212 goto sendit; 14213 } 14214 14215 src = ipha->ipha_src; 14216 14217 /* 14218 * We look up the interface ire for the nexthop, 14219 * to see if ipha_src is in the same subnet 14220 * as the nexthop. 14221 * 14222 * Note that, if, in the future, IRE_CACHE entries 14223 * are obsoleted, this lookup will not be needed, 14224 * as the ire passed to this function will be the 14225 * same as the nhop_ire computed below. 14226 */ 14227 nhop_ire = ire_ftable_lookup(nhop, 0, 0, 14228 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 14229 0, NULL, MATCH_IRE_TYPE, ipst); 14230 14231 if (nhop_ire != NULL) { 14232 if ((src & nhop_ire->ire_mask) == 14233 (nhop & nhop_ire->ire_mask)) { 14234 /* 14235 * The source is directly connected. 14236 * Just copy the ip header (which is 14237 * in the first mblk) 14238 */ 14239 mp1 = copyb(mp); 14240 if (mp1 != NULL) { 14241 icmp_send_redirect(WR(q), mp1, 14242 nhop, ipst); 14243 } 14244 } 14245 ire_refrele(nhop_ire); 14246 } 14247 } 14248 } 14249 sendit: 14250 dev_q = ire->ire_stq->q_next; 14251 if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) { 14252 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14253 freemsg(mp); 14254 return; 14255 } 14256 14257 ip_rput_forward(ire, ipha, mp, ill); 14258 return; 14259 14260 drop_pkt: 14261 ip2dbg(("ip_rput_process_forward: drop pkt\n")); 14262 freemsg(mp); 14263 } 14264 14265 ire_t * 14266 ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, 14267 ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast) 14268 { 14269 queue_t *q; 14270 uint16_t hcksumflags; 14271 ip_stack_t *ipst = ill->ill_ipst; 14272 14273 q = *qp; 14274 14275 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); 14276 14277 /* 14278 * Clear the indication that this may have hardware 14279 * checksum as we are not using it for forwarding. 
14280 */ 14281 hcksumflags = DB_CKSUMFLAGS(mp); 14282 DB_CKSUMFLAGS(mp) = 0; 14283 14284 /* 14285 * Directed broadcast forwarding: if the packet came in over a 14286 * different interface then it is routed out over we can forward it. 14287 */ 14288 if (ipha->ipha_protocol == IPPROTO_TCP) { 14289 ire_refrele(ire); 14290 freemsg(mp); 14291 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14292 return (NULL); 14293 } 14294 /* 14295 * For multicast we have set dst to be INADDR_BROADCAST 14296 * for delivering to all STREAMS. IRE_MARK_NORECV is really 14297 * only for broadcast packets. 14298 */ 14299 if (!CLASSD(ipha->ipha_dst)) { 14300 ire_t *new_ire; 14301 ipif_t *ipif; 14302 /* 14303 * For ill groups, as the switch duplicates broadcasts 14304 * across all the ports, we need to filter out and 14305 * send up only one copy. There is one copy for every 14306 * broadcast address on each ill. Thus, we look for a 14307 * specific IRE on this ill and look at IRE_MARK_NORECV 14308 * later to see whether this ill is eligible to receive 14309 * them or not. ill_nominate_bcast_rcv() nominates only 14310 * one set of IREs for receiving. 14311 */ 14312 14313 ipif = ipif_get_next_ipif(NULL, ill); 14314 if (ipif == NULL) { 14315 ire_refrele(ire); 14316 freemsg(mp); 14317 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14318 return (NULL); 14319 } 14320 new_ire = ire_ctable_lookup(dst, 0, 0, 14321 ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst); 14322 ipif_refrele(ipif); 14323 14324 if (new_ire != NULL) { 14325 if (new_ire->ire_marks & IRE_MARK_NORECV) { 14326 ire_refrele(ire); 14327 ire_refrele(new_ire); 14328 freemsg(mp); 14329 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14330 return (NULL); 14331 } 14332 /* 14333 * In the special case of multirouted broadcast 14334 * packets, we unconditionally need to "gateway" 14335 * them to the appropriate interface here. 
14336 * In the normal case, this cannot happen, because 14337 * there is no broadcast IRE tagged with the 14338 * RTF_MULTIRT flag. 14339 */ 14340 if (new_ire->ire_flags & RTF_MULTIRT) { 14341 ire_refrele(new_ire); 14342 if (ire->ire_rfq != NULL) { 14343 q = ire->ire_rfq; 14344 *qp = q; 14345 } 14346 } else { 14347 ire_refrele(ire); 14348 ire = new_ire; 14349 } 14350 } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { 14351 if (!ipst->ips_ip_g_forward_directed_bcast) { 14352 /* 14353 * Free the message if 14354 * ip_g_forward_directed_bcast is turned 14355 * off for non-local broadcast. 14356 */ 14357 ire_refrele(ire); 14358 freemsg(mp); 14359 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14360 return (NULL); 14361 } 14362 } else { 14363 /* 14364 * This CGTP packet successfully passed the 14365 * CGTP filter, but the related CGTP 14366 * broadcast IRE has not been found, 14367 * meaning that the redundant ipif is 14368 * probably down. However, if we discarded 14369 * this packet, its duplicate would be 14370 * filtered out by the CGTP filter so none 14371 * of them would get through. So we keep 14372 * going with this one. 14373 */ 14374 ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); 14375 if (ire->ire_rfq != NULL) { 14376 q = ire->ire_rfq; 14377 *qp = q; 14378 } 14379 } 14380 } 14381 if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) { 14382 /* 14383 * Verify that there are not more then one 14384 * IRE_BROADCAST with this broadcast address which 14385 * has ire_stq set. 
14386 * TODO: simplify, loop over all IRE's 14387 */ 14388 ire_t *ire1; 14389 int num_stq = 0; 14390 mblk_t *mp1; 14391 14392 /* Find the first one with ire_stq set */ 14393 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 14394 for (ire1 = ire; ire1 && 14395 !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; 14396 ire1 = ire1->ire_next) 14397 ; 14398 if (ire1) { 14399 ire_refrele(ire); 14400 ire = ire1; 14401 IRE_REFHOLD(ire); 14402 } 14403 14404 /* Check if there are additional ones with stq set */ 14405 for (ire1 = ire; ire1; ire1 = ire1->ire_next) { 14406 if (ire->ire_addr != ire1->ire_addr) 14407 break; 14408 if (ire1->ire_stq) { 14409 num_stq++; 14410 break; 14411 } 14412 } 14413 rw_exit(&ire->ire_bucket->irb_lock); 14414 if (num_stq == 1 && ire->ire_stq != NULL) { 14415 ip1dbg(("ip_rput_process_broadcast: directed " 14416 "broadcast to 0x%x\n", 14417 ntohl(ire->ire_addr))); 14418 mp1 = copymsg(mp); 14419 if (mp1) { 14420 switch (ipha->ipha_protocol) { 14421 case IPPROTO_UDP: 14422 ip_udp_input(q, mp1, ipha, ire, ill); 14423 break; 14424 default: 14425 ip_proto_input(q, mp1, ipha, ire, ill, 14426 0); 14427 break; 14428 } 14429 } 14430 /* 14431 * Adjust ttl to 2 (1+1 - the forward engine 14432 * will decrement it by one. 
14433 */ 14434 if (ip_csum_hdr(ipha)) { 14435 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 14436 ip2dbg(("ip_rput_broadcast:drop pkt\n")); 14437 freemsg(mp); 14438 ire_refrele(ire); 14439 return (NULL); 14440 } 14441 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; 14442 ipha->ipha_hdr_checksum = 0; 14443 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 14444 ip_rput_process_forward(q, mp, ire, ipha, 14445 ill, ll_multicast); 14446 ire_refrele(ire); 14447 return (NULL); 14448 } 14449 ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", 14450 ntohl(ire->ire_addr))); 14451 } 14452 14453 14454 /* Restore any hardware checksum flags */ 14455 DB_CKSUMFLAGS(mp) = hcksumflags; 14456 return (ire); 14457 } 14458 14459 /* ARGSUSED */ 14460 static boolean_t 14461 ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 14462 int *ll_multicast, ipaddr_t *dstp) 14463 { 14464 ip_stack_t *ipst = ill->ill_ipst; 14465 14466 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); 14467 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, 14468 ntohs(ipha->ipha_length)); 14469 14470 /* 14471 * Forward packets only if we have joined the allmulti 14472 * group on this interface. 14473 */ 14474 if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) { 14475 int retval; 14476 14477 /* 14478 * Clear the indication that this may have hardware 14479 * checksum as we are not using it. 14480 */ 14481 DB_CKSUMFLAGS(mp) = 0; 14482 retval = ip_mforward(ill, ipha, mp); 14483 /* ip_mforward updates mib variables if needed */ 14484 /* clear b_prev - used by ip_mroute_decap */ 14485 mp->b_prev = NULL; 14486 14487 switch (retval) { 14488 case 0: 14489 /* 14490 * pkt is okay and arrived on phyint. 14491 * 14492 * If we are running as a multicast router 14493 * we need to see all IGMP and/or PIM packets. 
14494 */ 14495 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 14496 (ipha->ipha_protocol == IPPROTO_PIM)) { 14497 goto done; 14498 } 14499 break; 14500 case -1: 14501 /* pkt is mal-formed, toss it */ 14502 goto drop_pkt; 14503 case 1: 14504 /* pkt is okay and arrived on a tunnel */ 14505 /* 14506 * If we are running a multicast router 14507 * we need to see all igmp packets. 14508 */ 14509 if (ipha->ipha_protocol == IPPROTO_IGMP) { 14510 *dstp = INADDR_BROADCAST; 14511 *ll_multicast = 1; 14512 return (B_FALSE); 14513 } 14514 14515 goto drop_pkt; 14516 } 14517 } 14518 14519 ILM_WALKER_HOLD(ill); 14520 if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { 14521 /* 14522 * This might just be caused by the fact that 14523 * multiple IP Multicast addresses map to the same 14524 * link layer multicast - no need to increment counter! 14525 */ 14526 ILM_WALKER_RELE(ill); 14527 freemsg(mp); 14528 return (B_TRUE); 14529 } 14530 ILM_WALKER_RELE(ill); 14531 done: 14532 ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); 14533 /* 14534 * This assumes the we deliver to all streams for multicast 14535 * and broadcast packets. 14536 */ 14537 *dstp = INADDR_BROADCAST; 14538 *ll_multicast = 1; 14539 return (B_FALSE); 14540 drop_pkt: 14541 ip2dbg(("ip_rput: drop pkt\n")); 14542 freemsg(mp); 14543 return (B_TRUE); 14544 } 14545 14546 /* 14547 * This function is used to both return an indication of whether or not 14548 * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND) 14549 * and in doing so, determine whether or not it is broadcast vs multicast. 14550 * For it to be a broadcast packet, we must have the appropriate mblk_t 14551 * hanging off the ill_t. If this is either not present or doesn't match 14552 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed 14553 * to be multicast. 
Thus NICs that have no broadcast address (or no 14554 * capability for one, such as point to point links) cannot return as 14555 * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as 14556 * the return values simplifies the current use of the return value of this 14557 * function, which is to pass through the multicast/broadcast characteristic 14558 * to consumers of the netinfo/pfhooks API. While this is not cast in stone, 14559 * changing the return value to some other symbol demands the appropriate 14560 * "translation" when hpe_flags is set prior to calling hook_run() for 14561 * packet events. 14562 */ 14563 int 14564 ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) 14565 { 14566 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; 14567 mblk_t *bmp; 14568 14569 if (ind->dl_group_address) { 14570 if (ind->dl_dest_addr_offset > sizeof (*ind) && 14571 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < 14572 MBLKL(mb) && 14573 (bmp = ill->ill_bcast_mp) != NULL) { 14574 dl_unitdata_req_t *dlur; 14575 uint8_t *bphys_addr; 14576 14577 dlur = (dl_unitdata_req_t *)bmp->b_rptr; 14578 if (ill->ill_sap_length < 0) 14579 bphys_addr = (uchar_t *)dlur + 14580 dlur->dl_dest_addr_offset; 14581 else 14582 bphys_addr = (uchar_t *)dlur + 14583 dlur->dl_dest_addr_offset + 14584 ill->ill_sap_length; 14585 14586 if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, 14587 bphys_addr, ind->dl_dest_addr_length) == 0) { 14588 return (HPE_BROADCAST); 14589 } 14590 return (HPE_MULTICAST); 14591 } 14592 return (HPE_MULTICAST); 14593 } 14594 return (0); 14595 } 14596 14597 static boolean_t 14598 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 14599 int *ll_multicast, mblk_t **mpp) 14600 { 14601 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 14602 boolean_t must_copy = B_FALSE; 14603 struct iocblk *iocp; 14604 ipha_t *ipha; 14605 ip_stack_t *ipst = ill->ill_ipst; 14606 14607 #define rptr ((uchar_t *)ipha) 14608 14609 first_mp = *first_mpp; 14610 mp = *mpp; 
14611 14612 ASSERT(first_mp == mp); 14613 14614 /* 14615 * if db_ref > 1 then copymsg and free original. Packet may be 14616 * changed and do not want other entity who has a reference to this 14617 * message to trip over the changes. This is a blind change because 14618 * trying to catch all places that might change packet is too 14619 * difficult (since it may be a module above this one) 14620 * 14621 * This corresponds to the non-fast path case. We walk down the full 14622 * chain in this case, and check the db_ref count of all the dblks, 14623 * and do a copymsg if required. It is possible that the db_ref counts 14624 * of the data blocks in the mblk chain can be different. 14625 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 14626 * count of 1, followed by a M_DATA block with a ref count of 2, if 14627 * 'snoop' is running. 14628 */ 14629 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 14630 if (mp1->b_datap->db_ref > 1) { 14631 must_copy = B_TRUE; 14632 break; 14633 } 14634 } 14635 14636 if (must_copy) { 14637 mp1 = copymsg(mp); 14638 if (mp1 == NULL) { 14639 for (mp1 = mp; mp1 != NULL; 14640 mp1 = mp1->b_cont) { 14641 mp1->b_next = NULL; 14642 mp1->b_prev = NULL; 14643 } 14644 freemsg(mp); 14645 if (ill != NULL) { 14646 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14647 } else { 14648 BUMP_MIB(&ipst->ips_ip_mib, 14649 ipIfStatsInDiscards); 14650 } 14651 return (B_TRUE); 14652 } 14653 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 14654 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 14655 /* Copy b_prev - used by ip_mroute_decap */ 14656 to_mp->b_prev = from_mp->b_prev; 14657 from_mp->b_prev = NULL; 14658 } 14659 *first_mpp = first_mp = mp1; 14660 freemsg(mp); 14661 mp = mp1; 14662 *mpp = mp1; 14663 } 14664 14665 ipha = (ipha_t *)mp->b_rptr; 14666 14667 /* 14668 * previous code has a case for M_DATA. 14669 * We want to check how that happens. 
14670 */ 14671 ASSERT(first_mp->b_datap->db_type != M_DATA); 14672 switch (first_mp->b_datap->db_type) { 14673 case M_PROTO: 14674 case M_PCPROTO: 14675 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 14676 DL_UNITDATA_IND) { 14677 /* Go handle anything other than data elsewhere. */ 14678 ip_rput_dlpi(q, mp); 14679 return (B_TRUE); 14680 } 14681 14682 *ll_multicast = ip_get_dlpi_mbcast(ill, mp); 14683 /* Ditch the DLPI header. */ 14684 mp1 = mp->b_cont; 14685 ASSERT(first_mp == mp); 14686 *first_mpp = mp1; 14687 freeb(mp); 14688 *mpp = mp1; 14689 return (B_FALSE); 14690 case M_IOCACK: 14691 ip1dbg(("got iocack ")); 14692 iocp = (struct iocblk *)mp->b_rptr; 14693 switch (iocp->ioc_cmd) { 14694 case DL_IOC_HDR_INFO: 14695 ill = (ill_t *)q->q_ptr; 14696 ill_fastpath_ack(ill, mp); 14697 return (B_TRUE); 14698 case SIOCSTUNPARAM: 14699 case OSIOCSTUNPARAM: 14700 /* Go through qwriter_ip */ 14701 break; 14702 case SIOCGTUNPARAM: 14703 case OSIOCGTUNPARAM: 14704 ip_rput_other(NULL, q, mp, NULL); 14705 return (B_TRUE); 14706 default: 14707 putnext(q, mp); 14708 return (B_TRUE); 14709 } 14710 /* FALLTHRU */ 14711 case M_ERROR: 14712 case M_HANGUP: 14713 /* 14714 * Since this is on the ill stream we unconditionally 14715 * bump up the refcount 14716 */ 14717 ill_refhold(ill); 14718 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14719 return (B_TRUE); 14720 case M_CTL: 14721 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 14722 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 14723 IPHADA_M_CTL)) { 14724 /* 14725 * It's an IPsec accelerated packet. 14726 * Make sure that the ill from which we received the 14727 * packet has enabled IPsec hardware acceleration. 14728 */ 14729 if (!(ill->ill_capabilities & 14730 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 14731 /* IPsec kstats: bean counter */ 14732 freemsg(mp); 14733 return (B_TRUE); 14734 } 14735 14736 /* 14737 * Make mp point to the mblk following the M_CTL, 14738 * then process according to type of mp. 
14739 * After this processing, first_mp will point to 14740 * the data-attributes and mp to the pkt following 14741 * the M_CTL. 14742 */ 14743 mp = first_mp->b_cont; 14744 if (mp == NULL) { 14745 freemsg(first_mp); 14746 return (B_TRUE); 14747 } 14748 /* 14749 * A Hardware Accelerated packet can only be M_DATA 14750 * ESP or AH packet. 14751 */ 14752 if (mp->b_datap->db_type != M_DATA) { 14753 /* non-M_DATA IPsec accelerated packet */ 14754 IPSECHW_DEBUG(IPSECHW_PKT, 14755 ("non-M_DATA IPsec accelerated pkt\n")); 14756 freemsg(first_mp); 14757 return (B_TRUE); 14758 } 14759 ipha = (ipha_t *)mp->b_rptr; 14760 if (ipha->ipha_protocol != IPPROTO_AH && 14761 ipha->ipha_protocol != IPPROTO_ESP) { 14762 IPSECHW_DEBUG(IPSECHW_PKT, 14763 ("non-M_DATA IPsec accelerated pkt\n")); 14764 freemsg(first_mp); 14765 return (B_TRUE); 14766 } 14767 *mpp = mp; 14768 return (B_FALSE); 14769 } 14770 putnext(q, mp); 14771 return (B_TRUE); 14772 case M_IOCNAK: 14773 ip1dbg(("got iocnak ")); 14774 iocp = (struct iocblk *)mp->b_rptr; 14775 switch (iocp->ioc_cmd) { 14776 case SIOCSTUNPARAM: 14777 case OSIOCSTUNPARAM: 14778 /* 14779 * Since this is on the ill stream we unconditionally 14780 * bump up the refcount 14781 */ 14782 ill_refhold(ill); 14783 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14784 return (B_TRUE); 14785 case DL_IOC_HDR_INFO: 14786 case SIOCGTUNPARAM: 14787 case OSIOCGTUNPARAM: 14788 ip_rput_other(NULL, q, mp, NULL); 14789 return (B_TRUE); 14790 default: 14791 break; 14792 } 14793 /* FALLTHRU */ 14794 default: 14795 putnext(q, mp); 14796 return (B_TRUE); 14797 } 14798 } 14799 14800 /* Read side put procedure. Packets coming from the wire arrive here. 
*/ 14801 void 14802 ip_rput(queue_t *q, mblk_t *mp) 14803 { 14804 ill_t *ill; 14805 union DL_primitives *dl; 14806 14807 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); 14808 14809 ill = (ill_t *)q->q_ptr; 14810 14811 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { 14812 /* 14813 * If things are opening or closing, only accept high-priority 14814 * DLPI messages. (On open ill->ill_ipif has not yet been 14815 * created; on close, things hanging off the ill may have been 14816 * freed already.) 14817 */ 14818 dl = (union DL_primitives *)mp->b_rptr; 14819 if (DB_TYPE(mp) != M_PCPROTO || 14820 dl->dl_primitive == DL_UNITDATA_IND) { 14821 /* 14822 * SIOC[GS]TUNPARAM ioctls can come here. 14823 */ 14824 inet_freemsg(mp); 14825 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14826 "ip_rput_end: q %p (%S)", q, "uninit"); 14827 return; 14828 } 14829 } 14830 14831 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14832 "ip_rput_end: q %p (%S)", q, "end"); 14833 14834 ip_input(ill, NULL, mp, NULL); 14835 } 14836 14837 static mblk_t * 14838 ip_fix_dbref(ill_t *ill, mblk_t *mp) 14839 { 14840 mblk_t *mp1; 14841 boolean_t adjusted = B_FALSE; 14842 ip_stack_t *ipst = ill->ill_ipst; 14843 14844 IP_STAT(ipst, ip_db_ref); 14845 /* 14846 * The IP_RECVSLLA option depends on having the 14847 * link layer header. First check that: 14848 * a> the underlying device is of type ether, 14849 * since this option is currently supported only 14850 * over ethernet. 14851 * b> there is enough room to copy over the link 14852 * layer header. 14853 * 14854 * Once the checks are done, adjust rptr so that 14855 * the link layer header will be copied via 14856 * copymsg. Note that, IFT_ETHER may be returned 14857 * by some non-ethernet drivers but in this case 14858 * the second check will fail. 
14859 */ 14860 if (ill->ill_type == IFT_ETHER && 14861 (mp->b_rptr - mp->b_datap->db_base) >= 14862 sizeof (struct ether_header)) { 14863 mp->b_rptr -= sizeof (struct ether_header); 14864 adjusted = B_TRUE; 14865 } 14866 mp1 = copymsg(mp); 14867 14868 if (mp1 == NULL) { 14869 mp->b_next = NULL; 14870 /* clear b_prev - used by ip_mroute_decap */ 14871 mp->b_prev = NULL; 14872 freemsg(mp); 14873 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14874 return (NULL); 14875 } 14876 14877 if (adjusted) { 14878 /* 14879 * Copy is done. Restore the pointer in 14880 * the _new_ mblk 14881 */ 14882 mp1->b_rptr += sizeof (struct ether_header); 14883 } 14884 14885 /* Copy b_prev - used by ip_mroute_decap */ 14886 mp1->b_prev = mp->b_prev; 14887 mp->b_prev = NULL; 14888 14889 /* preserve the hardware checksum flags and data, if present */ 14890 if (DB_CKSUMFLAGS(mp) != 0) { 14891 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 14892 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 14893 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 14894 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 14895 DB_CKSUM16(mp1) = DB_CKSUM16(mp); 14896 } 14897 14898 freemsg(mp); 14899 return (mp1); 14900 } 14901 14902 /* 14903 * Direct read side procedure capable of dealing with chains. GLDv3 based 14904 * drivers call this function directly with mblk chains while STREAMS 14905 * read side procedure ip_rput() calls this for single packet with ip_ring 14906 * set to NULL to process one packet at a time. 14907 * 14908 * The ill will always be valid if this function is called directly from 14909 * the driver. 14910 * 14911 * If ip_input() is called from GLDv3: 14912 * 14913 * - This must be a non-VLAN IP stream. 14914 * - 'mp' is either an untagged or a special priority-tagged packet. 14915 * - Any VLAN tag that was in the MAC header has been stripped. 14916 * 14917 * If the IP header in packet is not 32-bit aligned, every message in the 14918 * chain will be aligned before further operations. This is required on SPARC 14919 * platform. 
14920 */ 14921 /* ARGSUSED */ 14922 void 14923 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 14924 struct mac_header_info_s *mhip) 14925 { 14926 ipaddr_t dst = NULL; 14927 ipaddr_t prev_dst; 14928 ire_t *ire = NULL; 14929 ipha_t *ipha; 14930 uint_t pkt_len; 14931 ssize_t len; 14932 uint_t opt_len; 14933 int ll_multicast; 14934 int cgtp_flt_pkt; 14935 queue_t *q = ill->ill_rq; 14936 squeue_t *curr_sqp = NULL; 14937 mblk_t *head = NULL; 14938 mblk_t *tail = NULL; 14939 mblk_t *first_mp; 14940 mblk_t *mp; 14941 mblk_t *dmp; 14942 int cnt = 0; 14943 ip_stack_t *ipst = ill->ill_ipst; 14944 14945 ASSERT(mp_chain != NULL); 14946 ASSERT(ill != NULL); 14947 14948 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); 14949 14950 #define rptr ((uchar_t *)ipha) 14951 14952 while (mp_chain != NULL) { 14953 first_mp = mp = mp_chain; 14954 mp_chain = mp_chain->b_next; 14955 mp->b_next = NULL; 14956 ll_multicast = 0; 14957 14958 /* 14959 * We do ire caching from one iteration to 14960 * another. In the event the packet chain contains 14961 * all packets from the same dst, this caching saves 14962 * an ire_cache_lookup for each of the succeeding 14963 * packets in a packet chain. 14964 */ 14965 prev_dst = dst; 14966 14967 /* 14968 * if db_ref > 1 then copymsg and free original. Packet 14969 * may be changed and we do not want the other entity 14970 * who has a reference to this message to trip over the 14971 * changes. This is a blind change because trying to 14972 * catch all places that might change the packet is too 14973 * difficult. 14974 * 14975 * This corresponds to the fast path case, where we have 14976 * a chain of M_DATA mblks. We check the db_ref count 14977 * of only the 1st data block in the mblk chain. There 14978 * doesn't seem to be a reason why a device driver would 14979 * send up data with varying db_ref counts in the mblk 14980 * chain. In any case the Fast path is a private 14981 * interface, and our drivers don't do such a thing. 
14982 * Given the above assumption, there is no need to walk 14983 * down the entire mblk chain (which could have a 14984 * potential performance problem) 14985 */ 14986 14987 if (DB_REF(mp) > 1) { 14988 if ((mp = ip_fix_dbref(ill, mp)) == NULL) 14989 continue; 14990 } 14991 14992 /* 14993 * Check and align the IP header. 14994 */ 14995 first_mp = mp; 14996 if (DB_TYPE(mp) == M_DATA) { 14997 dmp = mp; 14998 } else if (DB_TYPE(mp) == M_PROTO && 14999 *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { 15000 dmp = mp->b_cont; 15001 } else { 15002 dmp = NULL; 15003 } 15004 if (dmp != NULL) { 15005 /* 15006 * IP header ptr not aligned? 15007 * OR IP header not complete in first mblk 15008 */ 15009 if (!OK_32PTR(dmp->b_rptr) || 15010 MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { 15011 if (!ip_check_and_align_header(q, dmp, ipst)) 15012 continue; 15013 } 15014 } 15015 15016 /* 15017 * ip_input fast path 15018 */ 15019 15020 /* mblk type is not M_DATA */ 15021 if (DB_TYPE(mp) != M_DATA) { 15022 if (ip_rput_process_notdata(q, &first_mp, ill, 15023 &ll_multicast, &mp)) 15024 continue; 15025 15026 /* 15027 * The only way we can get here is if we had a 15028 * packet that was either a DL_UNITDATA_IND or 15029 * an M_CTL for an IPsec accelerated packet. 15030 * 15031 * In either case, the first_mp will point to 15032 * the leading M_PROTO or M_CTL. 15033 */ 15034 ASSERT(first_mp != NULL); 15035 } else if (mhip != NULL) { 15036 /* 15037 * ll_multicast is set here so that it is ready 15038 * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast 15039 * manipulates ll_multicast in the same fashion when 15040 * called from ip_rput_process_notdata. 
15041 */ 15042 switch (mhip->mhi_dsttype) { 15043 case MAC_ADDRTYPE_MULTICAST : 15044 ll_multicast = HPE_MULTICAST; 15045 break; 15046 case MAC_ADDRTYPE_BROADCAST : 15047 ll_multicast = HPE_BROADCAST; 15048 break; 15049 default : 15050 break; 15051 } 15052 } 15053 15054 /* Make sure its an M_DATA and that its aligned */ 15055 ASSERT(DB_TYPE(mp) == M_DATA); 15056 ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); 15057 15058 ipha = (ipha_t *)mp->b_rptr; 15059 len = mp->b_wptr - rptr; 15060 pkt_len = ntohs(ipha->ipha_length); 15061 15062 /* 15063 * We must count all incoming packets, even if they end 15064 * up being dropped later on. 15065 */ 15066 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 15067 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); 15068 15069 /* multiple mblk or too short */ 15070 len -= pkt_len; 15071 if (len != 0) { 15072 /* 15073 * Make sure we have data length consistent 15074 * with the IP header. 15075 */ 15076 if (mp->b_cont == NULL) { 15077 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15078 BUMP_MIB(ill->ill_ip_mib, 15079 ipIfStatsInHdrErrors); 15080 ip2dbg(("ip_input: drop pkt\n")); 15081 freemsg(mp); 15082 continue; 15083 } 15084 mp->b_wptr = rptr + pkt_len; 15085 } else if ((len += msgdsize(mp->b_cont)) != 0) { 15086 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15087 BUMP_MIB(ill->ill_ip_mib, 15088 ipIfStatsInHdrErrors); 15089 ip2dbg(("ip_input: drop pkt\n")); 15090 freemsg(mp); 15091 continue; 15092 } 15093 (void) adjmsg(mp, -len); 15094 IP_STAT(ipst, ip_multimblk3); 15095 } 15096 } 15097 15098 /* Obtain the dst of the current packet */ 15099 dst = ipha->ipha_dst; 15100 15101 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, 15102 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, 15103 ipha, ip6_t *, NULL, int, 0); 15104 15105 /* 15106 * The following test for loopback is faster than 15107 * IP_LOOPBACK_ADDR(), because it avoids any bitwise 15108 * operations. 
15109 * Note that these addresses are always in network byte order 15110 */ 15111 if (((*(uchar_t *)&ipha->ipha_dst) == 127) || 15112 ((*(uchar_t *)&ipha->ipha_src) == 127)) { 15113 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 15114 freemsg(mp); 15115 continue; 15116 } 15117 15118 /* 15119 * The event for packets being received from a 'physical' 15120 * interface is placed after validation of the source and/or 15121 * destination address as being local so that packets can be 15122 * redirected to loopback addresses using ipnat. 15123 */ 15124 DTRACE_PROBE4(ip4__physical__in__start, 15125 ill_t *, ill, ill_t *, NULL, 15126 ipha_t *, ipha, mblk_t *, first_mp); 15127 15128 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15129 ipst->ips_ipv4firewall_physical_in, 15130 ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); 15131 15132 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); 15133 15134 if (first_mp == NULL) { 15135 continue; 15136 } 15137 dst = ipha->ipha_dst; 15138 15139 /* 15140 * Attach any necessary label information to 15141 * this packet 15142 */ 15143 if (is_system_labeled() && 15144 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 15145 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 15146 freemsg(mp); 15147 continue; 15148 } 15149 15150 /* 15151 * Reuse the cached ire only if the ipha_dst of the previous 15152 * packet is the same as the current packet AND it is not 15153 * INADDR_ANY. 15154 */ 15155 if (!(dst == prev_dst && dst != INADDR_ANY) && 15156 (ire != NULL)) { 15157 ire_refrele(ire); 15158 ire = NULL; 15159 } 15160 opt_len = ipha->ipha_version_and_hdr_length - 15161 IP_SIMPLE_HDR_VERSION; 15162 15163 /* 15164 * Check to see if we can take the fastpath. 
15165 * That is possible if the following conditions are met 15166 * o Tsol disabled 15167 * o CGTP disabled 15168 * o ipp_action_count is 0 15169 * o no options in the packet 15170 * o not a RSVP packet 15171 * o not a multicast packet 15172 * o ill not in IP_DHCPINIT_IF mode 15173 */ 15174 if (!is_system_labeled() && 15175 !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 && 15176 opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && 15177 !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { 15178 if (ire == NULL) 15179 ire = ire_cache_lookup(dst, ALL_ZONES, NULL, 15180 ipst); 15181 15182 /* incoming packet is for forwarding */ 15183 if (ire == NULL || (ire->ire_type & IRE_CACHE)) { 15184 ire = ip_fast_forward(ire, dst, ill, mp); 15185 continue; 15186 } 15187 /* incoming packet is for local consumption */ 15188 if (ire->ire_type & IRE_LOCAL) 15189 goto local; 15190 } 15191 15192 /* 15193 * Disable ire caching for anything more complex 15194 * than the simple fast path case we checked for above. 15195 */ 15196 if (ire != NULL) { 15197 ire_refrele(ire); 15198 ire = NULL; 15199 } 15200 15201 /* 15202 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP 15203 * server to unicast DHCP packets to a DHCP client using the 15204 * IP address it is offering to the client. This can be 15205 * disabled through the "broadcast bit", but not all DHCP 15206 * servers honor that bit. Therefore, to interoperate with as 15207 * many DHCP servers as possible, the DHCP client allows the 15208 * server to unicast, but we treat those packets as broadcast 15209 * here. Note that we don't rewrite the packet itself since 15210 * (a) that would mess up the checksums and (b) the DHCP 15211 * client conn is bound to INADDR_ANY so ip_fanout_udp() will 15212 * hand it the packet regardless. 
15213 */ 15214 if (ill->ill_dhcpinit != 0 && 15215 IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP && 15216 pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) { 15217 udpha_t *udpha; 15218 15219 /* 15220 * Reload ipha since pullupmsg() can change b_rptr. 15221 */ 15222 ipha = (ipha_t *)mp->b_rptr; 15223 udpha = (udpha_t *)&ipha[1]; 15224 15225 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { 15226 DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, 15227 mblk_t *, mp); 15228 dst = INADDR_BROADCAST; 15229 } 15230 } 15231 15232 /* Full-blown slow path */ 15233 if (opt_len != 0) { 15234 if (len != 0) 15235 IP_STAT(ipst, ip_multimblk4); 15236 else 15237 IP_STAT(ipst, ip_ipoptions); 15238 if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha, 15239 &dst, ipst)) 15240 continue; 15241 } 15242 15243 /* 15244 * Invoke the CGTP (multirouting) filtering module to process 15245 * the incoming packet. Packets identified as duplicates 15246 * must be discarded. Filtering is active only if the 15247 * the ip_cgtp_filter ndd variable is non-zero. 15248 */ 15249 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 15250 if (ipst->ips_ip_cgtp_filter && 15251 ipst->ips_ip_cgtp_filter_ops != NULL) { 15252 netstackid_t stackid; 15253 15254 stackid = ipst->ips_netstack->netstack_stackid; 15255 cgtp_flt_pkt = 15256 ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 15257 ill->ill_phyint->phyint_ifindex, mp); 15258 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 15259 freemsg(first_mp); 15260 continue; 15261 } 15262 } 15263 15264 /* 15265 * If rsvpd is running, let RSVP daemon handle its processing 15266 * and forwarding of RSVP multicast/unicast packets. 15267 * If rsvpd is not running but mrouted is running, RSVP 15268 * multicast packets are forwarded as multicast traffic 15269 * and RSVP unicast packets are forwarded by unicast router. 
15270 * If neither rsvpd nor mrouted is running, RSVP multicast 15271 * packets are not forwarded, but the unicast packets are 15272 * forwarded like unicast traffic. 15273 */ 15274 if (ipha->ipha_protocol == IPPROTO_RSVP && 15275 ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != 15276 NULL) { 15277 /* RSVP packet and rsvpd running. Treat as ours */ 15278 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 15279 /* 15280 * This assumes that we deliver to all streams for 15281 * multicast and broadcast packets. 15282 * We have to force ll_multicast to 1 to handle the 15283 * M_DATA messages passed in from ip_mroute_decap. 15284 */ 15285 dst = INADDR_BROADCAST; 15286 ll_multicast = 1; 15287 } else if (CLASSD(dst)) { 15288 /* packet is multicast */ 15289 mp->b_next = NULL; 15290 if (ip_rput_process_multicast(q, mp, ill, ipha, 15291 &ll_multicast, &dst)) 15292 continue; 15293 } 15294 15295 if (ire == NULL) { 15296 ire = ire_cache_lookup(dst, ALL_ZONES, 15297 MBLK_GETLABEL(mp), ipst); 15298 } 15299 15300 if (ire != NULL && ire->ire_stq != NULL && 15301 ire->ire_zoneid != GLOBAL_ZONEID && 15302 ire->ire_zoneid != ALL_ZONES) { 15303 /* 15304 * Should only use IREs that are visible from the 15305 * global zone for forwarding. 15306 */ 15307 ire_refrele(ire); 15308 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, 15309 MBLK_GETLABEL(mp), ipst); 15310 } 15311 15312 if (ire == NULL) { 15313 /* 15314 * No IRE for this destination, so it can't be for us. 15315 * Unless we are forwarding, drop the packet. 15316 * We have to let source routed packets through 15317 * since we don't yet know if they are 'ping -l' 15318 * packets i.e. if they will go out over the 15319 * same interface as they came in on. 
15320 */ 15321 ire = ip_rput_noire(q, mp, ll_multicast, dst); 15322 if (ire == NULL) 15323 continue; 15324 } 15325 15326 /* 15327 * Broadcast IRE may indicate either broadcast or 15328 * multicast packet 15329 */ 15330 if (ire->ire_type == IRE_BROADCAST) { 15331 /* 15332 * Skip broadcast checks if packet is UDP multicast; 15333 * we'd rather not enter ip_rput_process_broadcast() 15334 * unless the packet is broadcast for real, since 15335 * that routine is a no-op for multicast. 15336 */ 15337 if (ipha->ipha_protocol != IPPROTO_UDP || 15338 !CLASSD(ipha->ipha_dst)) { 15339 ire = ip_rput_process_broadcast(&q, mp, 15340 ire, ipha, ill, dst, cgtp_flt_pkt, 15341 ll_multicast); 15342 if (ire == NULL) 15343 continue; 15344 } 15345 } else if (ire->ire_stq != NULL) { 15346 /* fowarding? */ 15347 ip_rput_process_forward(q, mp, ire, ipha, ill, 15348 ll_multicast); 15349 /* ip_rput_process_forward consumed the packet */ 15350 continue; 15351 } 15352 15353 local: 15354 /* 15355 * If the queue in the ire is different to the ingress queue 15356 * then we need to check to see if we can accept the packet. 15357 * Note that for multicast packets and broadcast packets sent 15358 * to a broadcast address which is shared between multiple 15359 * interfaces we should not do this since we just got a random 15360 * broadcast ire. 
15361 */ 15362 if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { 15363 if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, 15364 ill)) == NULL) { 15365 /* Drop packet */ 15366 BUMP_MIB(ill->ill_ip_mib, 15367 ipIfStatsForwProhibits); 15368 freemsg(mp); 15369 continue; 15370 } 15371 if (ire->ire_rfq != NULL) 15372 q = ire->ire_rfq; 15373 } 15374 15375 switch (ipha->ipha_protocol) { 15376 case IPPROTO_TCP: 15377 ASSERT(first_mp == mp); 15378 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, 15379 mp, 0, q, ip_ring)) != NULL) { 15380 if (curr_sqp == NULL) { 15381 curr_sqp = GET_SQUEUE(mp); 15382 ASSERT(cnt == 0); 15383 cnt++; 15384 head = tail = mp; 15385 } else if (curr_sqp == GET_SQUEUE(mp)) { 15386 ASSERT(tail != NULL); 15387 cnt++; 15388 tail->b_next = mp; 15389 tail = mp; 15390 } else { 15391 /* 15392 * A different squeue. Send the 15393 * chain for the previous squeue on 15394 * its way. This shouldn't happen 15395 * often unless interrupt binding 15396 * changes. 15397 */ 15398 IP_STAT(ipst, ip_input_multi_squeue); 15399 squeue_enter_chain(curr_sqp, head, 15400 tail, cnt, SQTAG_IP_INPUT); 15401 curr_sqp = GET_SQUEUE(mp); 15402 head = mp; 15403 tail = mp; 15404 cnt = 1; 15405 } 15406 } 15407 continue; 15408 case IPPROTO_UDP: 15409 ASSERT(first_mp == mp); 15410 ip_udp_input(q, mp, ipha, ire, ill); 15411 continue; 15412 case IPPROTO_SCTP: 15413 ASSERT(first_mp == mp); 15414 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, 15415 q, dst); 15416 /* ire has been released by ip_sctp_input */ 15417 ire = NULL; 15418 continue; 15419 default: 15420 ip_proto_input(q, first_mp, ipha, ire, ill, 0); 15421 continue; 15422 } 15423 } 15424 15425 if (ire != NULL) 15426 ire_refrele(ire); 15427 15428 if (head != NULL) 15429 squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); 15430 15431 /* 15432 * This code is there just to make netperf/ttcp look good. 
15433 * 15434 * Its possible that after being in polling mode (and having cleared 15435 * the backlog), squeues have turned the interrupt frequency higher 15436 * to improve latency at the expense of more CPU utilization (less 15437 * packets per interrupts or more number of interrupts). Workloads 15438 * like ttcp/netperf do manage to tickle polling once in a while 15439 * but for the remaining time, stay in higher interrupt mode since 15440 * their packet arrival rate is pretty uniform and this shows up 15441 * as higher CPU utilization. Since people care about CPU utilization 15442 * while running netperf/ttcp, turn the interrupt frequency back to 15443 * normal/default if polling has not been used in ip_poll_normal_ticks. 15444 */ 15445 if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { 15446 if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { 15447 ip_ring->rr_poll_state &= ~ILL_POLLING; 15448 ip_ring->rr_blank(ip_ring->rr_handle, 15449 ip_ring->rr_normal_blank_time, 15450 ip_ring->rr_normal_pkt_cnt); 15451 } 15452 } 15453 15454 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 15455 "ip_input_end: q %p (%S)", q, "end"); 15456 #undef rptr 15457 } 15458 15459 static void 15460 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, 15461 t_uscalar_t err) 15462 { 15463 if (dl_err == DL_SYSERR) { 15464 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15465 "%s: %s failed: DL_SYSERR (errno %u)\n", 15466 ill->ill_name, dl_primstr(prim), err); 15467 return; 15468 } 15469 15470 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15471 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim), 15472 dl_errstr(dl_err)); 15473 } 15474 15475 /* 15476 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other 15477 * than DL_UNITDATA_IND messages. 
If we need to process this message 15478 * exclusively, we call qwriter_ip, in which case we also need to call 15479 * ill_refhold before that, since qwriter_ip does an ill_refrele. 15480 */ 15481 void 15482 ip_rput_dlpi(queue_t *q, mblk_t *mp) 15483 { 15484 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15485 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15486 ill_t *ill = q->q_ptr; 15487 t_uscalar_t prim = dloa->dl_primitive; 15488 t_uscalar_t reqprim = DL_PRIM_INVAL; 15489 15490 ip1dbg(("ip_rput_dlpi")); 15491 15492 /* 15493 * If we received an ACK but didn't send a request for it, then it 15494 * can't be part of any pending operation; discard up-front. 15495 */ 15496 switch (prim) { 15497 case DL_ERROR_ACK: 15498 reqprim = dlea->dl_error_primitive; 15499 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s " 15500 "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim), 15501 reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno, 15502 dlea->dl_unix_errno)); 15503 break; 15504 case DL_OK_ACK: 15505 reqprim = dloa->dl_correct_primitive; 15506 break; 15507 case DL_INFO_ACK: 15508 reqprim = DL_INFO_REQ; 15509 break; 15510 case DL_BIND_ACK: 15511 reqprim = DL_BIND_REQ; 15512 break; 15513 case DL_PHYS_ADDR_ACK: 15514 reqprim = DL_PHYS_ADDR_REQ; 15515 break; 15516 case DL_NOTIFY_ACK: 15517 reqprim = DL_NOTIFY_REQ; 15518 break; 15519 case DL_CONTROL_ACK: 15520 reqprim = DL_CONTROL_REQ; 15521 break; 15522 case DL_CAPABILITY_ACK: 15523 reqprim = DL_CAPABILITY_REQ; 15524 break; 15525 } 15526 15527 if (prim != DL_NOTIFY_IND) { 15528 if (reqprim == DL_PRIM_INVAL || 15529 !ill_dlpi_pending(ill, reqprim)) { 15530 /* Not a DLPI message we support or expected */ 15531 freemsg(mp); 15532 return; 15533 } 15534 ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim), 15535 dl_primstr(reqprim))); 15536 } 15537 15538 switch (reqprim) { 15539 case DL_UNBIND_REQ: 15540 /* 15541 * NOTE: we mark the unbind as complete even if we got a 15542 * DL_ERROR_ACK, since there's not much 
else we can do. 15543 */ 15544 mutex_enter(&ill->ill_lock); 15545 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 15546 cv_signal(&ill->ill_cv); 15547 mutex_exit(&ill->ill_lock); 15548 break; 15549 15550 case DL_ENABMULTI_REQ: 15551 if (prim == DL_OK_ACK) { 15552 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15553 ill->ill_dlpi_multicast_state = IDS_OK; 15554 } 15555 break; 15556 } 15557 15558 /* 15559 * The message is one we're waiting for (or DL_NOTIFY_IND), but we 15560 * need to become writer to continue to process it. Because an 15561 * exclusive operation doesn't complete until replies to all queued 15562 * DLPI messages have been received, we know we're in the middle of an 15563 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND). 15564 * 15565 * As required by qwriter_ip(), we refhold the ill; it will refrele. 15566 * Since this is on the ill stream we unconditionally bump up the 15567 * refcount without doing ILL_CAN_LOOKUP(). 15568 */ 15569 ill_refhold(ill); 15570 if (prim == DL_NOTIFY_IND) 15571 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE); 15572 else 15573 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE); 15574 } 15575 15576 /* 15577 * Handling of DLPI messages that require exclusive access to the ipsq. 15578 * 15579 * Need to do ill_pending_mp_release on ioctl completion, which could 15580 * happen here. 
(along with mi_copy_done) 15581 */ 15582 /* ARGSUSED */ 15583 static void 15584 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15585 { 15586 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15587 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15588 int err = 0; 15589 ill_t *ill; 15590 ipif_t *ipif = NULL; 15591 mblk_t *mp1 = NULL; 15592 conn_t *connp = NULL; 15593 t_uscalar_t paddrreq; 15594 mblk_t *mp_hw; 15595 boolean_t success; 15596 boolean_t ioctl_aborted = B_FALSE; 15597 boolean_t log = B_TRUE; 15598 ip_stack_t *ipst; 15599 15600 ip1dbg(("ip_rput_dlpi_writer ..")); 15601 ill = (ill_t *)q->q_ptr; 15602 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 15603 15604 ASSERT(IAM_WRITER_ILL(ill)); 15605 15606 ipst = ill->ill_ipst; 15607 15608 /* 15609 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. 15610 * both are null or non-null. However we can assert that only 15611 * after grabbing the ipsq_lock. So we don't make any assertion 15612 * here and in other places in the code. 15613 */ 15614 ipif = ipsq->ipsq_pending_ipif; 15615 /* 15616 * The current ioctl could have been aborted by the user and a new 15617 * ioctl to bring up another ill could have started. We could still 15618 * get a response from the driver later. 
15619 */ 15620 if (ipif != NULL && ipif->ipif_ill != ill) 15621 ioctl_aborted = B_TRUE; 15622 15623 switch (dloa->dl_primitive) { 15624 case DL_ERROR_ACK: 15625 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n", 15626 dl_primstr(dlea->dl_error_primitive))); 15627 15628 switch (dlea->dl_error_primitive) { 15629 case DL_DISABMULTI_REQ: 15630 if (!ill->ill_isv6) 15631 ipsq_current_finish(ipsq); 15632 ill_dlpi_done(ill, dlea->dl_error_primitive); 15633 break; 15634 case DL_PROMISCON_REQ: 15635 case DL_PROMISCOFF_REQ: 15636 case DL_UNBIND_REQ: 15637 case DL_ATTACH_REQ: 15638 case DL_INFO_REQ: 15639 ill_dlpi_done(ill, dlea->dl_error_primitive); 15640 break; 15641 case DL_NOTIFY_REQ: 15642 ill_dlpi_done(ill, DL_NOTIFY_REQ); 15643 log = B_FALSE; 15644 break; 15645 case DL_PHYS_ADDR_REQ: 15646 /* 15647 * For IPv6 only, there are two additional 15648 * phys_addr_req's sent to the driver to get the 15649 * IPv6 token and lla. This allows IP to acquire 15650 * the hardware address format for a given interface 15651 * without having built in knowledge of the hardware 15652 * address. ill_phys_addr_pend keeps track of the last 15653 * DL_PAR sent so we know which response we are 15654 * dealing with. ill_dlpi_done will update 15655 * ill_phys_addr_pend when it sends the next req. 15656 * We don't complete the IOCTL until all three DL_PARs 15657 * have been attempted, so set *_len to 0 and break. 15658 */ 15659 paddrreq = ill->ill_phys_addr_pend; 15660 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 15661 if (paddrreq == DL_IPV6_TOKEN) { 15662 ill->ill_token_length = 0; 15663 log = B_FALSE; 15664 break; 15665 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) { 15666 ill->ill_nd_lla_len = 0; 15667 log = B_FALSE; 15668 break; 15669 } 15670 /* 15671 * Something went wrong with the DL_PHYS_ADDR_REQ. 15672 * We presumably have an IOCTL hanging out waiting 15673 * for completion. Find it and complete the IOCTL 15674 * with the error noted. 
15675 * However, ill_dl_phys was called on an ill queue 15676 * (from SIOCSLIFNAME), thus conn_pending_ill is not 15677 * set. But the ioctl is known to be pending on ill_wq. 15678 */ 15679 if (!ill->ill_ifname_pending) 15680 break; 15681 ill->ill_ifname_pending = 0; 15682 if (!ioctl_aborted) 15683 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15684 if (mp1 != NULL) { 15685 /* 15686 * This operation (SIOCSLIFNAME) must have 15687 * happened on the ill. Assert there is no conn 15688 */ 15689 ASSERT(connp == NULL); 15690 q = ill->ill_wq; 15691 } 15692 break; 15693 case DL_BIND_REQ: 15694 ill_dlpi_done(ill, DL_BIND_REQ); 15695 if (ill->ill_ifname_pending) 15696 break; 15697 /* 15698 * Something went wrong with the bind. We presumably 15699 * have an IOCTL hanging out waiting for completion. 15700 * Find it, take down the interface that was coming 15701 * up, and complete the IOCTL with the error noted. 15702 */ 15703 if (!ioctl_aborted) 15704 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15705 if (mp1 != NULL) { 15706 /* 15707 * This operation (SIOCSLIFFLAGS) must have 15708 * happened from a conn. 15709 */ 15710 ASSERT(connp != NULL); 15711 q = CONNP_TO_WQ(connp); 15712 if (ill->ill_move_in_progress) { 15713 ILL_CLEAR_MOVE(ill); 15714 } 15715 (void) ipif_down(ipif, NULL, NULL); 15716 /* error is set below the switch */ 15717 } 15718 break; 15719 case DL_ENABMULTI_REQ: 15720 if (!ill->ill_isv6) 15721 ipsq_current_finish(ipsq); 15722 ill_dlpi_done(ill, DL_ENABMULTI_REQ); 15723 15724 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15725 ill->ill_dlpi_multicast_state = IDS_FAILED; 15726 if (ill->ill_dlpi_multicast_state == IDS_FAILED) { 15727 ipif_t *ipif; 15728 15729 printf("ip: joining multicasts failed (%d)" 15730 " on %s - will use link layer " 15731 "broadcasts for multicast\n", 15732 dlea->dl_errno, ill->ill_name); 15733 15734 /* 15735 * Set up the multicast mapping alone. 15736 * writer, so ok to access ill->ill_ipif 15737 * without any lock. 
15738 */ 15739 ipif = ill->ill_ipif; 15740 mutex_enter(&ill->ill_phyint->phyint_lock); 15741 ill->ill_phyint->phyint_flags |= 15742 PHYI_MULTI_BCAST; 15743 mutex_exit(&ill->ill_phyint->phyint_lock); 15744 15745 if (!ill->ill_isv6) { 15746 (void) ipif_arp_setup_multicast(ipif, 15747 NULL); 15748 } else { 15749 (void) ipif_ndp_setup_multicast(ipif, 15750 NULL); 15751 } 15752 } 15753 freemsg(mp); /* Don't want to pass this up */ 15754 return; 15755 15756 case DL_CAPABILITY_REQ: 15757 case DL_CONTROL_REQ: 15758 ill_dlpi_done(ill, dlea->dl_error_primitive); 15759 ill->ill_dlpi_capab_state = IDS_FAILED; 15760 freemsg(mp); 15761 return; 15762 } 15763 /* 15764 * Note the error for IOCTL completion (mp1 is set when 15765 * ready to complete ioctl). If ill_ifname_pending_err is 15766 * set, an error occured during plumbing (ill_ifname_pending), 15767 * so we want to report that error. 15768 * 15769 * NOTE: there are two addtional DL_PHYS_ADDR_REQ's 15770 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are 15771 * expected to get errack'd if the driver doesn't support 15772 * these flags (e.g. ethernet). log will be set to B_FALSE 15773 * if these error conditions are encountered. 15774 */ 15775 if (mp1 != NULL) { 15776 if (ill->ill_ifname_pending_err != 0) { 15777 err = ill->ill_ifname_pending_err; 15778 ill->ill_ifname_pending_err = 0; 15779 } else { 15780 err = dlea->dl_unix_errno ? 15781 dlea->dl_unix_errno : ENXIO; 15782 } 15783 /* 15784 * If we're plumbing an interface and an error hasn't already 15785 * been saved, set ill_ifname_pending_err to the error passed 15786 * up. Ignore the error if log is B_FALSE (see comment above). 15787 */ 15788 } else if (log && ill->ill_ifname_pending && 15789 ill->ill_ifname_pending_err == 0) { 15790 ill->ill_ifname_pending_err = dlea->dl_unix_errno ? 
15791 dlea->dl_unix_errno : ENXIO; 15792 } 15793 15794 if (log) 15795 ip_dlpi_error(ill, dlea->dl_error_primitive, 15796 dlea->dl_errno, dlea->dl_unix_errno); 15797 break; 15798 case DL_CAPABILITY_ACK: 15799 /* Call a routine to handle this one. */ 15800 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 15801 ill_capability_ack(ill, mp); 15802 15803 /* 15804 * If the ack is due to renegotiation, we will need to send 15805 * a new CAPABILITY_REQ to start the renegotiation. 15806 */ 15807 if (ill->ill_capab_reneg) { 15808 ill->ill_capab_reneg = B_FALSE; 15809 ill_capability_probe(ill); 15810 } 15811 break; 15812 case DL_CONTROL_ACK: 15813 /* We treat all of these as "fire and forget" */ 15814 ill_dlpi_done(ill, DL_CONTROL_REQ); 15815 break; 15816 case DL_INFO_ACK: 15817 /* Call a routine to handle this one. */ 15818 ill_dlpi_done(ill, DL_INFO_REQ); 15819 ip_ll_subnet_defaults(ill, mp); 15820 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 15821 return; 15822 case DL_BIND_ACK: 15823 /* 15824 * We should have an IOCTL waiting on this unless 15825 * sent by ill_dl_phys, in which case just return 15826 */ 15827 ill_dlpi_done(ill, DL_BIND_REQ); 15828 if (ill->ill_ifname_pending) 15829 break; 15830 15831 if (!ioctl_aborted) 15832 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15833 if (mp1 == NULL) 15834 break; 15835 /* 15836 * Because mp1 was added by ill_dl_up(), and it always 15837 * passes a valid connp, connp must be valid here. 15838 */ 15839 ASSERT(connp != NULL); 15840 q = CONNP_TO_WQ(connp); 15841 15842 /* 15843 * We are exclusive. So nothing can change even after 15844 * we get the pending mp. If need be we can put it back 15845 * and restart, as in calling ipif_arp_up() below. 
15846 */ 15847 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 15848 15849 mutex_enter(&ill->ill_lock); 15850 ill->ill_dl_up = 1; 15851 (void) ill_hook_event_create(ill, 0, NE_UP, NULL, 0); 15852 mutex_exit(&ill->ill_lock); 15853 15854 /* 15855 * Now bring up the resolver; when that is complete, we'll 15856 * create IREs. Note that we intentionally mirror what 15857 * ipif_up() would have done, because we got here by way of 15858 * ill_dl_up(), which stopped ipif_up()'s processing. 15859 */ 15860 if (ill->ill_isv6) { 15861 /* 15862 * v6 interfaces. 15863 * Unlike ARP which has to do another bind 15864 * and attach, once we get here we are 15865 * done with NDP. Except in the case of 15866 * ILLF_XRESOLV, in which case we send an 15867 * AR_INTERFACE_UP to the external resolver. 15868 * If all goes well, the ioctl will complete 15869 * in ip_rput(). If there's an error, we 15870 * complete it here. 15871 */ 15872 if ((err = ipif_ndp_up(ipif)) == 0) { 15873 if (ill->ill_flags & ILLF_XRESOLV) { 15874 mutex_enter(&connp->conn_lock); 15875 mutex_enter(&ill->ill_lock); 15876 success = ipsq_pending_mp_add( 15877 connp, ipif, q, mp1, 0); 15878 mutex_exit(&ill->ill_lock); 15879 mutex_exit(&connp->conn_lock); 15880 if (success) { 15881 err = ipif_resolver_up(ipif, 15882 Res_act_initial); 15883 if (err == EINPROGRESS) { 15884 freemsg(mp); 15885 return; 15886 } 15887 ASSERT(err != 0); 15888 mp1 = ipsq_pending_mp_get(ipsq, 15889 &connp); 15890 ASSERT(mp1 != NULL); 15891 } else { 15892 /* conn has started closing */ 15893 err = EINTR; 15894 } 15895 } else { /* Non XRESOLV interface */ 15896 (void) ipif_resolver_up(ipif, 15897 Res_act_initial); 15898 err = ipif_up_done_v6(ipif); 15899 } 15900 } 15901 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 15902 /* 15903 * ARP and other v4 external resolvers. 15904 * Leave the pending mblk intact so that 15905 * the ioctl completes in ip_rput(). 
15906 */ 15907 mutex_enter(&connp->conn_lock); 15908 mutex_enter(&ill->ill_lock); 15909 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 15910 mutex_exit(&ill->ill_lock); 15911 mutex_exit(&connp->conn_lock); 15912 if (success) { 15913 err = ipif_resolver_up(ipif, Res_act_initial); 15914 if (err == EINPROGRESS) { 15915 freemsg(mp); 15916 return; 15917 } 15918 ASSERT(err != 0); 15919 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15920 } else { 15921 /* The conn has started closing */ 15922 err = EINTR; 15923 } 15924 } else { 15925 /* 15926 * This one is complete. Reply to pending ioctl. 15927 */ 15928 (void) ipif_resolver_up(ipif, Res_act_initial); 15929 err = ipif_up_done(ipif); 15930 } 15931 15932 if ((err == 0) && (ill->ill_up_ipifs)) { 15933 err = ill_up_ipifs(ill, q, mp1); 15934 if (err == EINPROGRESS) { 15935 freemsg(mp); 15936 return; 15937 } 15938 } 15939 15940 if (ill->ill_up_ipifs) { 15941 ill_group_cleanup(ill); 15942 } 15943 15944 break; 15945 case DL_NOTIFY_IND: { 15946 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 15947 ire_t *ire; 15948 boolean_t need_ire_walk_v4 = B_FALSE; 15949 boolean_t need_ire_walk_v6 = B_FALSE; 15950 15951 switch (notify->dl_notification) { 15952 case DL_NOTE_PHYS_ADDR: 15953 err = ill_set_phys_addr(ill, mp); 15954 break; 15955 15956 case DL_NOTE_FASTPATH_FLUSH: 15957 ill_fastpath_flush(ill); 15958 break; 15959 15960 case DL_NOTE_SDU_SIZE: 15961 /* 15962 * Change the MTU size of the interface, of all 15963 * attached ipif's, and of all relevant ire's. The 15964 * new value's a uint32_t at notify->dl_data. 15965 * Mtu change Vs. new ire creation - protocol below. 15966 * 15967 * a Mark the ipif as IPIF_CHANGING. 15968 * b Set the new mtu in the ipif. 
15969 * c Change the ire_max_frag on all affected ires 15970 * d Unmark the IPIF_CHANGING 15971 * 15972 * To see how the protocol works, assume an interface 15973 * route is also being added simultaneously by 15974 * ip_rt_add and let 'ipif' be the ipif referenced by 15975 * the ire. If the ire is created before step a, 15976 * it will be cleaned up by step c. If the ire is 15977 * created after step d, it will see the new value of 15978 * ipif_mtu. Any attempt to create the ire between 15979 * steps a to d will fail because of the IPIF_CHANGING 15980 * flag. Note that ire_create() is passed a pointer to 15981 * the ipif_mtu, and not the value. During ire_add 15982 * under the bucket lock, the ire_max_frag of the 15983 * new ire being created is set from the ipif/ire from 15984 * which it is being derived. 15985 */ 15986 mutex_enter(&ill->ill_lock); 15987 ill->ill_max_frag = (uint_t)notify->dl_data; 15988 15989 /* 15990 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu 15991 * leave it alone 15992 */ 15993 if (ill->ill_mtu_userspecified) { 15994 mutex_exit(&ill->ill_lock); 15995 break; 15996 } 15997 ill->ill_max_mtu = ill->ill_max_frag; 15998 if (ill->ill_isv6) { 15999 if (ill->ill_max_mtu < IPV6_MIN_MTU) 16000 ill->ill_max_mtu = IPV6_MIN_MTU; 16001 } else { 16002 if (ill->ill_max_mtu < IP_MIN_MTU) 16003 ill->ill_max_mtu = IP_MIN_MTU; 16004 } 16005 for (ipif = ill->ill_ipif; ipif != NULL; 16006 ipif = ipif->ipif_next) { 16007 /* 16008 * Don't override the mtu if the user 16009 * has explicitly set it. 
16010 */ 16011 if (ipif->ipif_flags & IPIF_FIXEDMTU) 16012 continue; 16013 ipif->ipif_mtu = (uint_t)notify->dl_data; 16014 if (ipif->ipif_isv6) 16015 ire = ipif_to_ire_v6(ipif); 16016 else 16017 ire = ipif_to_ire(ipif); 16018 if (ire != NULL) { 16019 ire->ire_max_frag = ipif->ipif_mtu; 16020 ire_refrele(ire); 16021 } 16022 if (ipif->ipif_flags & IPIF_UP) { 16023 if (ill->ill_isv6) 16024 need_ire_walk_v6 = B_TRUE; 16025 else 16026 need_ire_walk_v4 = B_TRUE; 16027 } 16028 } 16029 mutex_exit(&ill->ill_lock); 16030 if (need_ire_walk_v4) 16031 ire_walk_v4(ill_mtu_change, (char *)ill, 16032 ALL_ZONES, ipst); 16033 if (need_ire_walk_v6) 16034 ire_walk_v6(ill_mtu_change, (char *)ill, 16035 ALL_ZONES, ipst); 16036 break; 16037 case DL_NOTE_LINK_UP: 16038 case DL_NOTE_LINK_DOWN: { 16039 /* 16040 * We are writer. ill / phyint / ipsq assocs stable. 16041 * The RUNNING flag reflects the state of the link. 16042 */ 16043 phyint_t *phyint = ill->ill_phyint; 16044 uint64_t new_phyint_flags; 16045 boolean_t changed = B_FALSE; 16046 boolean_t went_up; 16047 16048 went_up = notify->dl_notification == DL_NOTE_LINK_UP; 16049 mutex_enter(&phyint->phyint_lock); 16050 new_phyint_flags = went_up ? 16051 phyint->phyint_flags | PHYI_RUNNING : 16052 phyint->phyint_flags & ~PHYI_RUNNING; 16053 if (new_phyint_flags != phyint->phyint_flags) { 16054 phyint->phyint_flags = new_phyint_flags; 16055 changed = B_TRUE; 16056 } 16057 mutex_exit(&phyint->phyint_lock); 16058 /* 16059 * ill_restart_dad handles the DAD restart and routing 16060 * socket notification logic. 
16061 */ 16062 if (changed) { 16063 ill_restart_dad(phyint->phyint_illv4, went_up); 16064 ill_restart_dad(phyint->phyint_illv6, went_up); 16065 } 16066 break; 16067 } 16068 case DL_NOTE_PROMISC_ON_PHYS: 16069 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16070 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 16071 mutex_enter(&ill->ill_lock); 16072 ill->ill_promisc_on_phys = B_TRUE; 16073 mutex_exit(&ill->ill_lock); 16074 break; 16075 case DL_NOTE_PROMISC_OFF_PHYS: 16076 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 16077 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 16078 mutex_enter(&ill->ill_lock); 16079 ill->ill_promisc_on_phys = B_FALSE; 16080 mutex_exit(&ill->ill_lock); 16081 break; 16082 case DL_NOTE_CAPAB_RENEG: 16083 /* 16084 * Something changed on the driver side. 16085 * It wants us to renegotiate the capabilities 16086 * on this ill. One possible cause is the aggregation 16087 * interface under us where a port got added or 16088 * went away. 16089 * 16090 * If the capability negotiation is already done 16091 * or is in progress, reset the capabilities and 16092 * mark the ill's ill_capab_reneg to be B_TRUE, 16093 * so that when the ack comes back, we can start 16094 * the renegotiation process. 16095 * 16096 * Note that if ill_capab_reneg is already B_TRUE 16097 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case), 16098 * the capability resetting request has been sent 16099 * and the renegotiation has not been started yet; 16100 * nothing needs to be done in this case. 
16101 */ 16102 if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) { 16103 ill_capability_reset(ill); 16104 ill->ill_capab_reneg = B_TRUE; 16105 } 16106 break; 16107 default: 16108 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 16109 "type 0x%x for DL_NOTIFY_IND\n", 16110 notify->dl_notification)); 16111 break; 16112 } 16113 16114 /* 16115 * As this is an asynchronous operation, we 16116 * should not call ill_dlpi_done 16117 */ 16118 break; 16119 } 16120 case DL_NOTIFY_ACK: { 16121 dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr; 16122 16123 if (noteack->dl_notifications & DL_NOTE_LINK_UP) 16124 ill->ill_note_link = 1; 16125 ill_dlpi_done(ill, DL_NOTIFY_REQ); 16126 break; 16127 } 16128 case DL_PHYS_ADDR_ACK: { 16129 /* 16130 * As part of plumbing the interface via SIOCSLIFNAME, 16131 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs, 16132 * whose answers we receive here. As each answer is received, 16133 * we call ill_dlpi_done() to dispatch the next request as 16134 * we're processing the current one. Once all answers have 16135 * been received, we use ipsq_pending_mp_get() to dequeue the 16136 * outstanding IOCTL and reply to it. (Because ill_dl_phys() 16137 * is invoked from an ill queue, conn_oper_pending_ill is not 16138 * available, but we know the ioctl is pending on ill_wq.) 16139 */ 16140 uint_t paddrlen, paddroff; 16141 16142 paddrreq = ill->ill_phys_addr_pend; 16143 paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length; 16144 paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset; 16145 16146 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 16147 if (paddrreq == DL_IPV6_TOKEN) { 16148 /* 16149 * bcopy to low-order bits of ill_token 16150 * 16151 * XXX Temporary hack - currently, all known tokens 16152 * are 64 bits, so I'll cheat for the moment. 
16153 */ 16154 bcopy(mp->b_rptr + paddroff, 16155 &ill->ill_token.s6_addr32[2], paddrlen); 16156 ill->ill_token_length = paddrlen; 16157 break; 16158 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) { 16159 ASSERT(ill->ill_nd_lla_mp == NULL); 16160 ill_set_ndmp(ill, mp, paddroff, paddrlen); 16161 mp = NULL; 16162 break; 16163 } 16164 16165 ASSERT(paddrreq == DL_CURR_PHYS_ADDR); 16166 ASSERT(ill->ill_phys_addr_mp == NULL); 16167 if (!ill->ill_ifname_pending) 16168 break; 16169 ill->ill_ifname_pending = 0; 16170 if (!ioctl_aborted) 16171 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16172 if (mp1 != NULL) { 16173 ASSERT(connp == NULL); 16174 q = ill->ill_wq; 16175 } 16176 /* 16177 * If any error acks received during the plumbing sequence, 16178 * ill_ifname_pending_err will be set. Break out and send up 16179 * the error to the pending ioctl. 16180 */ 16181 if (ill->ill_ifname_pending_err != 0) { 16182 err = ill->ill_ifname_pending_err; 16183 ill->ill_ifname_pending_err = 0; 16184 break; 16185 } 16186 16187 ill->ill_phys_addr_mp = mp; 16188 ill->ill_phys_addr = mp->b_rptr + paddroff; 16189 mp = NULL; 16190 16191 /* 16192 * If paddrlen is zero, the DLPI provider doesn't support 16193 * physical addresses. The other two tests were historical 16194 * workarounds for bugs in our former PPP implementation, but 16195 * now other things have grown dependencies on them -- e.g., 16196 * the tun module specifies a dl_addr_length of zero in its 16197 * DL_BIND_ACK, but then specifies an incorrect value in its 16198 * DL_PHYS_ADDR_ACK. These bogus checks need to be removed, 16199 * but only after careful testing ensures that all dependent 16200 * broken DLPI providers have been fixed. 
16201 */ 16202 if (paddrlen == 0 || ill->ill_phys_addr_length == 0 || 16203 ill->ill_phys_addr_length == IP_ADDR_LEN) { 16204 ill->ill_phys_addr = NULL; 16205 } else if (paddrlen != ill->ill_phys_addr_length) { 16206 ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d", 16207 paddrlen, ill->ill_phys_addr_length)); 16208 err = EINVAL; 16209 break; 16210 } 16211 16212 if (ill->ill_nd_lla_mp == NULL) { 16213 if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) { 16214 err = ENOMEM; 16215 break; 16216 } 16217 ill_set_ndmp(ill, mp_hw, paddroff, paddrlen); 16218 } 16219 16220 /* 16221 * Set the interface token. If the zeroth interface address 16222 * is unspecified, then set it to the link local address. 16223 */ 16224 if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 16225 (void) ill_setdefaulttoken(ill); 16226 16227 ASSERT(ill->ill_ipif->ipif_id == 0); 16228 if (ipif != NULL && 16229 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 16230 (void) ipif_setlinklocal(ipif); 16231 } 16232 break; 16233 } 16234 case DL_OK_ACK: 16235 ip2dbg(("DL_OK_ACK %s (0x%x)\n", 16236 dl_primstr((int)dloa->dl_correct_primitive), 16237 dloa->dl_correct_primitive)); 16238 switch (dloa->dl_correct_primitive) { 16239 case DL_ENABMULTI_REQ: 16240 case DL_DISABMULTI_REQ: 16241 if (!ill->ill_isv6) 16242 ipsq_current_finish(ipsq); 16243 ill_dlpi_done(ill, dloa->dl_correct_primitive); 16244 break; 16245 case DL_PROMISCON_REQ: 16246 case DL_PROMISCOFF_REQ: 16247 case DL_UNBIND_REQ: 16248 case DL_ATTACH_REQ: 16249 ill_dlpi_done(ill, dloa->dl_correct_primitive); 16250 break; 16251 } 16252 break; 16253 default: 16254 break; 16255 } 16256 16257 freemsg(mp); 16258 if (mp1 != NULL) { 16259 /* 16260 * The operation must complete without EINPROGRESS 16261 * since ipsq_pending_mp_get() has removed the mblk 16262 * from ipsq_pending_mp. Otherwise, the operation 16263 * will be stuck forever in the ipsq. 
16264 */ 16265 ASSERT(err != EINPROGRESS); 16266 16267 switch (ipsq->ipsq_current_ioctl) { 16268 case 0: 16269 ipsq_current_finish(ipsq); 16270 break; 16271 16272 case SIOCLIFADDIF: 16273 case SIOCSLIFNAME: 16274 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq); 16275 break; 16276 16277 default: 16278 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 16279 break; 16280 } 16281 } 16282 } 16283 16284 /* 16285 * ip_rput_other is called by ip_rput to handle messages modifying the global 16286 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared) 16287 */ 16288 /* ARGSUSED */ 16289 void 16290 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 16291 { 16292 ill_t *ill; 16293 struct iocblk *iocp; 16294 mblk_t *mp1; 16295 conn_t *connp = NULL; 16296 16297 ip1dbg(("ip_rput_other ")); 16298 ill = (ill_t *)q->q_ptr; 16299 /* 16300 * This routine is not a writer in the case of SIOCGTUNPARAM 16301 * in which case ipsq is NULL. 16302 */ 16303 if (ipsq != NULL) { 16304 ASSERT(IAM_WRITER_IPSQ(ipsq)); 16305 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 16306 } 16307 16308 switch (mp->b_datap->db_type) { 16309 case M_ERROR: 16310 case M_HANGUP: 16311 /* 16312 * The device has a problem. We force the ILL down. It can 16313 * be brought up again manually using SIOCSIFFLAGS (via 16314 * ifconfig or equivalent). 16315 */ 16316 ASSERT(ipsq != NULL); 16317 if (mp->b_rptr < mp->b_wptr) 16318 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 16319 if (ill->ill_error == 0) 16320 ill->ill_error = ENXIO; 16321 if (!ill_down_start(q, mp)) 16322 return; 16323 ipif_all_down_tail(ipsq, q, mp, NULL); 16324 break; 16325 case M_IOCACK: 16326 iocp = (struct iocblk *)mp->b_rptr; 16327 ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO); 16328 switch (iocp->ioc_cmd) { 16329 case SIOCSTUNPARAM: 16330 case OSIOCSTUNPARAM: 16331 ASSERT(ipsq != NULL); 16332 /* 16333 * Finish socket ioctl passed through to tun. 16334 * We should have an IOCTL waiting on this. 
16335 */ 16336 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16337 if (ill->ill_isv6) { 16338 struct iftun_req *ta; 16339 16340 /* 16341 * if a source or destination is 16342 * being set, try and set the link 16343 * local address for the tunnel 16344 */ 16345 ta = (struct iftun_req *)mp->b_cont-> 16346 b_cont->b_rptr; 16347 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 16348 ipif_set_tun_llink(ill, ta); 16349 } 16350 16351 } 16352 if (mp1 != NULL) { 16353 /* 16354 * Now copy back the b_next/b_prev used by 16355 * mi code for the mi_copy* functions. 16356 * See ip_sioctl_tunparam() for the reason. 16357 * Also protect against missing b_cont. 16358 */ 16359 if (mp->b_cont != NULL) { 16360 mp->b_cont->b_next = 16361 mp1->b_cont->b_next; 16362 mp->b_cont->b_prev = 16363 mp1->b_cont->b_prev; 16364 } 16365 inet_freemsg(mp1); 16366 ASSERT(connp != NULL); 16367 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16368 iocp->ioc_error, NO_COPYOUT, ipsq); 16369 } else { 16370 ASSERT(connp == NULL); 16371 putnext(q, mp); 16372 } 16373 break; 16374 case SIOCGTUNPARAM: 16375 case OSIOCGTUNPARAM: 16376 /* 16377 * This is really M_IOCDATA from the tunnel driver. 16378 * convert back and complete the ioctl. 16379 * We should have an IOCTL waiting on this. 16380 */ 16381 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 16382 if (mp1) { 16383 /* 16384 * Now copy back the b_next/b_prev used by 16385 * mi code for the mi_copy* functions. 16386 * See ip_sioctl_tunparam() for the reason. 16387 * Also protect against missing b_cont. 
16388 */ 16389 if (mp->b_cont != NULL) { 16390 mp->b_cont->b_next = 16391 mp1->b_cont->b_next; 16392 mp->b_cont->b_prev = 16393 mp1->b_cont->b_prev; 16394 } 16395 inet_freemsg(mp1); 16396 if (iocp->ioc_error == 0) 16397 mp->b_datap->db_type = M_IOCDATA; 16398 ASSERT(connp != NULL); 16399 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16400 iocp->ioc_error, COPYOUT, NULL); 16401 } else { 16402 ASSERT(connp == NULL); 16403 putnext(q, mp); 16404 } 16405 break; 16406 default: 16407 break; 16408 } 16409 break; 16410 case M_IOCNAK: 16411 iocp = (struct iocblk *)mp->b_rptr; 16412 16413 switch (iocp->ioc_cmd) { 16414 int mode; 16415 16416 case DL_IOC_HDR_INFO: 16417 /* 16418 * If this was the first attempt turn of the 16419 * fastpath probing. 16420 */ 16421 mutex_enter(&ill->ill_lock); 16422 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { 16423 ill->ill_dlpi_fastpath_state = IDS_FAILED; 16424 mutex_exit(&ill->ill_lock); 16425 ill_fastpath_nack(ill); 16426 ip1dbg(("ip_rput: DLPI fastpath off on " 16427 "interface %s\n", 16428 ill->ill_name)); 16429 } else { 16430 mutex_exit(&ill->ill_lock); 16431 } 16432 freemsg(mp); 16433 break; 16434 case SIOCSTUNPARAM: 16435 case OSIOCSTUNPARAM: 16436 ASSERT(ipsq != NULL); 16437 /* 16438 * Finish socket ioctl passed through to tun 16439 * We should have an IOCTL waiting on this. 16440 */ 16441 /* FALLTHRU */ 16442 case SIOCGTUNPARAM: 16443 case OSIOCGTUNPARAM: 16444 /* 16445 * This is really M_IOCDATA from the tunnel driver. 16446 * convert back and complete the ioctl. 16447 * We should have an IOCTL waiting on this. 16448 */ 16449 if (iocp->ioc_cmd == SIOCGTUNPARAM || 16450 iocp->ioc_cmd == OSIOCGTUNPARAM) { 16451 mp1 = ill_pending_mp_get(ill, &connp, 16452 iocp->ioc_id); 16453 mode = COPYOUT; 16454 ipsq = NULL; 16455 } else { 16456 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16457 mode = NO_COPYOUT; 16458 } 16459 if (mp1 != NULL) { 16460 /* 16461 * Now copy back the b_next/b_prev used by 16462 * mi code for the mi_copy* functions. 
16463 * See ip_sioctl_tunparam() for the reason. 16464 * Also protect against missing b_cont. 16465 */ 16466 if (mp->b_cont != NULL) { 16467 mp->b_cont->b_next = 16468 mp1->b_cont->b_next; 16469 mp->b_cont->b_prev = 16470 mp1->b_cont->b_prev; 16471 } 16472 inet_freemsg(mp1); 16473 if (iocp->ioc_error == 0) 16474 iocp->ioc_error = EINVAL; 16475 ASSERT(connp != NULL); 16476 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16477 iocp->ioc_error, mode, ipsq); 16478 } else { 16479 ASSERT(connp == NULL); 16480 putnext(q, mp); 16481 } 16482 break; 16483 default: 16484 break; 16485 } 16486 default: 16487 break; 16488 } 16489 } 16490 16491 /* 16492 * NOTE : This function does not ire_refrele the ire argument passed in. 16493 * 16494 * IPQoS notes 16495 * IP policy is invoked twice for a forwarded packet, once on the read side 16496 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are 16497 * enabled. An additional parameter, in_ill, has been added for this purpose. 16498 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 16499 * because ip_mroute drops this information. 16500 * 16501 */ 16502 void 16503 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 16504 { 16505 uint32_t old_pkt_len; 16506 uint32_t pkt_len; 16507 queue_t *q; 16508 uint32_t sum; 16509 #define rptr ((uchar_t *)ipha) 16510 uint32_t max_frag; 16511 uint32_t ill_index; 16512 ill_t *out_ill; 16513 mib2_ipIfStatsEntry_t *mibptr; 16514 ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; 16515 16516 /* Get the ill_index of the incoming ILL */ 16517 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 16518 mibptr = (in_ill != NULL) ? 
in_ill->ill_ip_mib : &ipst->ips_ip_mib; 16519 16520 /* Initiate Read side IPPF processing */ 16521 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 16522 ip_process(IPP_FWD_IN, &mp, ill_index); 16523 if (mp == NULL) { 16524 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 16525 "during IPPF processing\n")); 16526 return; 16527 } 16528 } 16529 16530 /* Adjust the checksum to reflect the ttl decrement. */ 16531 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 16532 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 16533 16534 if (ipha->ipha_ttl-- <= 1) { 16535 if (ip_csum_hdr(ipha)) { 16536 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16537 goto drop_pkt; 16538 } 16539 /* 16540 * Note: ire_stq this will be NULL for multicast 16541 * datagrams using the long path through arp (the IRE 16542 * is not an IRE_CACHE). This should not cause 16543 * problems since we don't generate ICMP errors for 16544 * multicast packets. 16545 */ 16546 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16547 q = ire->ire_stq; 16548 if (q != NULL) { 16549 /* Sent by forwarding path, and router is global zone */ 16550 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, 16551 GLOBAL_ZONEID, ipst); 16552 } else 16553 freemsg(mp); 16554 return; 16555 } 16556 16557 /* 16558 * Don't forward if the interface is down 16559 */ 16560 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 16561 BUMP_MIB(mibptr, ipIfStatsInDiscards); 16562 ip2dbg(("ip_rput_forward:interface is down\n")); 16563 goto drop_pkt; 16564 } 16565 16566 /* Get the ill_index of the outgoing ILL */ 16567 out_ill = ire_to_ill(ire); 16568 ill_index = out_ill->ill_phyint->phyint_ifindex; 16569 16570 DTRACE_PROBE4(ip4__forwarding__start, 16571 ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16572 16573 FW_HOOKS(ipst->ips_ip4_forwarding_event, 16574 ipst->ips_ipv4firewall_forwarding, 16575 in_ill, out_ill, ipha, mp, mp, 0, ipst); 16576 16577 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 16578 16579 if (mp == NULL) 16580 
return; 16581 old_pkt_len = pkt_len = ntohs(ipha->ipha_length); 16582 16583 if (is_system_labeled()) { 16584 mblk_t *mp1; 16585 16586 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 16587 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16588 goto drop_pkt; 16589 } 16590 /* Size may have changed */ 16591 mp = mp1; 16592 ipha = (ipha_t *)mp->b_rptr; 16593 pkt_len = ntohs(ipha->ipha_length); 16594 } 16595 16596 /* Check if there are options to update */ 16597 if (!IS_SIMPLE_IPH(ipha)) { 16598 if (ip_csum_hdr(ipha)) { 16599 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16600 goto drop_pkt; 16601 } 16602 if (ip_rput_forward_options(mp, ipha, ire, ipst)) { 16603 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16604 return; 16605 } 16606 16607 ipha->ipha_hdr_checksum = 0; 16608 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 16609 } 16610 max_frag = ire->ire_max_frag; 16611 if (pkt_len > max_frag) { 16612 /* 16613 * It needs fragging on its way out. We haven't 16614 * verified the header checksum yet. Since we 16615 * are going to put a surely good checksum in the 16616 * outgoing header, we have to make sure that it 16617 * was good coming in. 16618 */ 16619 if (ip_csum_hdr(ipha)) { 16620 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16621 goto drop_pkt; 16622 } 16623 /* Initiate Write side IPPF processing */ 16624 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 16625 ip_process(IPP_FWD_OUT, &mp, ill_index); 16626 if (mp == NULL) { 16627 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 16628 " during IPPF processing\n")); 16629 return; 16630 } 16631 } 16632 /* 16633 * Handle labeled packet resizing. 16634 * 16635 * If we have added a label, inform ip_wput_frag() of its 16636 * effect on the MTU for ICMP messages. 
16637 */ 16638 if (pkt_len > old_pkt_len) { 16639 uint32_t secopt_size; 16640 16641 secopt_size = pkt_len - old_pkt_len; 16642 if (secopt_size < max_frag) 16643 max_frag -= secopt_size; 16644 } 16645 16646 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst); 16647 ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); 16648 return; 16649 } 16650 16651 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 16652 ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16653 FW_HOOKS(ipst->ips_ip4_physical_out_event, 16654 ipst->ips_ipv4firewall_physical_out, 16655 NULL, out_ill, ipha, mp, mp, 0, ipst); 16656 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 16657 if (mp == NULL) 16658 return; 16659 16660 mp->b_prev = (mblk_t *)IPP_FWD_OUT; 16661 ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); 16662 (void) ip_xmit_v4(mp, ire, NULL, B_FALSE); 16663 /* ip_xmit_v4 always consumes the packet */ 16664 return; 16665 16666 drop_pkt:; 16667 ip1dbg(("ip_rput_forward: drop pkt\n")); 16668 freemsg(mp); 16669 #undef rptr 16670 } 16671 16672 void 16673 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 16674 { 16675 ire_t *ire; 16676 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16677 16678 ASSERT(!ipif->ipif_isv6); 16679 /* 16680 * Find an IRE which matches the destination and the outgoing 16681 * queue in the cache table. All we need is an IRE_CACHE which 16682 * is pointing at ipif->ipif_ill. If it is part of some ill group, 16683 * then it is enough to have some IRE_CACHE in the group. 16684 */ 16685 if (ipif->ipif_flags & IPIF_POINTOPOINT) 16686 dst = ipif->ipif_pp_dst_addr; 16687 16688 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), 16689 MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); 16690 if (ire == NULL) { 16691 /* 16692 * Mark this packet to make it be delivered to 16693 * ip_rput_forward after the new ire has been 16694 * created. 
16695 */ 16696 mp->b_prev = NULL; 16697 mp->b_next = mp; 16698 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 16699 NULL, 0, GLOBAL_ZONEID, &zero_info); 16700 } else { 16701 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 16702 IRE_REFRELE(ire); 16703 } 16704 } 16705 16706 /* Update any source route, record route or timestamp options */ 16707 static int 16708 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) 16709 { 16710 ipoptp_t opts; 16711 uchar_t *opt; 16712 uint8_t optval; 16713 uint8_t optlen; 16714 ipaddr_t dst; 16715 uint32_t ts; 16716 ire_t *dst_ire = NULL; 16717 ire_t *tmp_ire = NULL; 16718 timestruc_t now; 16719 16720 ip2dbg(("ip_rput_forward_options\n")); 16721 dst = ipha->ipha_dst; 16722 for (optval = ipoptp_first(&opts, ipha); 16723 optval != IPOPT_EOL; 16724 optval = ipoptp_next(&opts)) { 16725 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 16726 opt = opts.ipoptp_cur; 16727 optlen = opts.ipoptp_len; 16728 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 16729 optval, opts.ipoptp_len)); 16730 switch (optval) { 16731 uint32_t off; 16732 case IPOPT_SSRR: 16733 case IPOPT_LSRR: 16734 /* Check if adminstratively disabled */ 16735 if (!ipst->ips_ip_forward_src_routed) { 16736 if (ire->ire_stq != NULL) { 16737 /* 16738 * Sent by forwarding path, and router 16739 * is global zone 16740 */ 16741 icmp_unreachable(ire->ire_stq, mp, 16742 ICMP_SOURCE_ROUTE_FAILED, 16743 GLOBAL_ZONEID, ipst); 16744 } else { 16745 ip0dbg(("ip_rput_forward_options: " 16746 "unable to send unreach\n")); 16747 freemsg(mp); 16748 } 16749 return (-1); 16750 } 16751 16752 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16753 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16754 if (dst_ire == NULL) { 16755 /* 16756 * Must be partial since ip_rput_options 16757 * checked for strict. 
16758 */ 16759 break; 16760 } 16761 off = opt[IPOPT_OFFSET]; 16762 off--; 16763 redo_srr: 16764 if (optlen < IP_ADDR_LEN || 16765 off > optlen - IP_ADDR_LEN) { 16766 /* End of source route */ 16767 ip1dbg(( 16768 "ip_rput_forward_options: end of SR\n")); 16769 ire_refrele(dst_ire); 16770 break; 16771 } 16772 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16773 bcopy(&ire->ire_src_addr, (char *)opt + off, 16774 IP_ADDR_LEN); 16775 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 16776 ntohl(dst))); 16777 16778 /* 16779 * Check if our address is present more than 16780 * once as consecutive hops in source route. 16781 */ 16782 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16783 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16784 if (tmp_ire != NULL) { 16785 ire_refrele(tmp_ire); 16786 off += IP_ADDR_LEN; 16787 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16788 goto redo_srr; 16789 } 16790 ipha->ipha_dst = dst; 16791 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16792 ire_refrele(dst_ire); 16793 break; 16794 case IPOPT_RR: 16795 off = opt[IPOPT_OFFSET]; 16796 off--; 16797 if (optlen < IP_ADDR_LEN || 16798 off > optlen - IP_ADDR_LEN) { 16799 /* No more room - ignore */ 16800 ip1dbg(( 16801 "ip_rput_forward_options: end of RR\n")); 16802 break; 16803 } 16804 bcopy(&ire->ire_src_addr, (char *)opt + off, 16805 IP_ADDR_LEN); 16806 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16807 break; 16808 case IPOPT_TS: 16809 /* Insert timestamp if there is room */ 16810 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16811 case IPOPT_TS_TSONLY: 16812 off = IPOPT_TS_TIMELEN; 16813 break; 16814 case IPOPT_TS_PRESPEC: 16815 case IPOPT_TS_PRESPEC_RFC791: 16816 /* Verify that the address matched */ 16817 off = opt[IPOPT_OFFSET] - 1; 16818 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16819 dst_ire = ire_ctable_lookup(dst, 0, 16820 IRE_LOCAL, NULL, ALL_ZONES, NULL, 16821 MATCH_IRE_TYPE, ipst); 16822 if (dst_ire == NULL) { 16823 /* Not for us */ 16824 break; 16825 } 16826 ire_refrele(dst_ire); 16827 /* FALLTHRU */ 16828 case 
IPOPT_TS_TSANDADDR: 16829 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16830 break; 16831 default: 16832 /* 16833 * ip_*put_options should have already 16834 * dropped this packet. 16835 */ 16836 cmn_err(CE_PANIC, "ip_rput_forward_options: " 16837 "unknown IT - bug in ip_rput_options?\n"); 16838 return (0); /* Keep "lint" happy */ 16839 } 16840 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 16841 /* Increase overflow counter */ 16842 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 16843 opt[IPOPT_POS_OV_FLG] = 16844 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 16845 (off << 4)); 16846 break; 16847 } 16848 off = opt[IPOPT_OFFSET] - 1; 16849 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16850 case IPOPT_TS_PRESPEC: 16851 case IPOPT_TS_PRESPEC_RFC791: 16852 case IPOPT_TS_TSANDADDR: 16853 bcopy(&ire->ire_src_addr, 16854 (char *)opt + off, IP_ADDR_LEN); 16855 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16856 /* FALLTHRU */ 16857 case IPOPT_TS_TSONLY: 16858 off = opt[IPOPT_OFFSET] - 1; 16859 /* Compute # of milliseconds since midnight */ 16860 gethrestime(&now); 16861 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 16862 now.tv_nsec / (NANOSEC / MILLISEC); 16863 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 16864 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 16865 break; 16866 } 16867 break; 16868 } 16869 } 16870 return (0); 16871 } 16872 16873 /* 16874 * This is called after processing at least one of AH/ESP headers. 16875 * 16876 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 16877 * the actual, physical interface on which the packet was received, 16878 * but, when ip_strict_dst_multihoming is set to 1, could be the 16879 * interface which had the ipha_dst configured when the packet went 16880 * through ip_rput. The ill_index corresponding to the recv_ill 16881 * is saved in ipsec_in_rill_index 16882 * 16883 * NOTE2: The "ire" argument is only used in IPv4 cases. This function 16884 * cannot assume "ire" points to valid data for any IPv6 cases. 
16885 */
void
ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
{
	ipsec_in_t	*ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	mblk_t		*mp;
	ipha_t		*ipha;
	ip6_t		*ip6h;
	ipaddr_t	dst;
	boolean_t	ill_held = B_FALSE;	/* we own a hold on ill */
	boolean_t	rill_held = B_FALSE;	/* we own a hold on recv_ill */
	boolean_t	ire_held = B_FALSE;	/* we own a hold on ire */
	netstack_t	*ns;
	ip_stack_t	*ipst;

	ASSERT(ii->ipsec_in_ill_index != 0);
	ns = ii->ipsec_in_ns;
	ASSERT(ii->ipsec_in_ns != NULL);
	ipst = ns->netstack_ip;

	mp = ipsec_mp->b_cont;
	ASSERT(mp != NULL);

	if (ill == NULL) {
		boolean_t is_v6 = !ii->ipsec_in_v4;

		ASSERT(recv_ill == NULL);
		/*
		 * We need to get the original queue on which ip_rput_local
		 * or ip_rput_data_v6 was called.
		 */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    is_v6, NULL, NULL, NULL, NULL, ipst);
		ill_held = B_TRUE;

		if (ii->ipsec_in_ill_index == ii->ipsec_in_rill_index) {
			/* Same interface: share the single hold. */
			recv_ill = ill;
		} else {
			recv_ill = ill_lookup_on_ifindex(
			    ii->ipsec_in_rill_index, is_v6,
			    NULL, NULL, NULL, NULL, ipst);
			rill_held = B_TRUE;
		}

		if (ill == NULL || recv_ill == NULL) {
			ip0dbg(("ip_fanout_proto_again: interface "
			    "disappeared\n"));
			if (ill != NULL)
				ill_refrele(ill);
			if (recv_ill != NULL)
				ill_refrele(recv_ill);
			freemsg(ipsec_mp);
			return;
		}
	}

	ASSERT(ill != NULL && recv_ill != NULL);

	if (mp->b_datap->db_type == M_CTL) {
		int	hdr_length;

		/*
		 * AH/ESP is returning the ICMP message after
		 * removing their headers. Fanout again till
		 * it gets to the right protocol.
		 */
		if (ii->ipsec_in_v4) {
			ipha_t	*outer_ipha = (ipha_t *)mp->b_rptr;
			int	iph_hdr_length = IPH_HDR_LENGTH(outer_ipha);
			icmph_t	*icmph;

			/* The quoted IP header follows the ICMP header. */
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
			hdr_length = IPH_HDR_LENGTH(ipha);
			/*
			 * icmp_inbound_error_fanout may need to do pullupmsg.
			 * Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp,
			    icmph, ipha, iph_hdr_length, hdr_length, B_TRUE,
			    B_FALSE, ill, ii->ipsec_in_zoneid);
		} else {
			icmp6_t	*icmp6;

			ip6h = (ip6_t *)mp->b_rptr;
			/* Don't call hdr_length_v6() unless you have to. */
			if (ip6h->ip6_nxt == IPPROTO_ICMPV6)
				hdr_length = IPV6_HDR_LEN;
			else
				hdr_length = ip_hdr_length_v6(mp, ip6h);

			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
			/*
			 * icmp_inbound_error_fanout_v6 may need to do
			 * pullupmsg.  Reset the type to M_DATA.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
			    ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
		}
		goto rele_ills;
	}

	if (!ii->ipsec_in_v4) {
		uint32_t rput_flags;

		ip6h = (ip6_t *)mp->b_rptr;
		/*
		 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast
		 * address.
		 *
		 * Currently, we don't store that state in the IPSEC_IN
		 * message, and we may need to.
		 */
		rput_flags = IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) ?
		    IP6_IN_LLMCAST : 0;
		ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags,
		    NULL, NULL);
		goto rele_ills;
	}

	/* IPv4 from here on. */
	ipha = (ipha_t *)mp->b_rptr;
	dst = ipha->ipha_dst;
	if (CLASSD(dst)) {
		/*
		 * Multicast has to be delivered to all streams.
		 */
		dst = INADDR_BROADCAST;
	}

	if (ire == NULL) {
		ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid,
		    MBLK_GETLABEL(mp), ipst);
		if (ire == NULL) {
			if (ill_held)
				ill_refrele(ill);
			if (rill_held)
				ill_refrele(recv_ill);
			ip1dbg(("ip_fanout_proto_again: "
			    "IRE not found"));
			freemsg(ipsec_mp);
			return;
		}
		ire_held = B_TRUE;
	}

	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
		    recv_ill);
		if (ire_held)
			ire_refrele(ire);
		break;
	case IPPROTO_TCP:
		/* Take a hold unless the lookup above already gave us one. */
		if (!ire_held)
			IRE_REFHOLD(ire);
		mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
		    ire, ipsec_mp, 0, ill->ill_rq, NULL);
		IRE_REFRELE(ire);
		if (mp != NULL)
			squeue_enter_chain(GET_SQUEUE(mp), mp,
			    mp, 1, SQTAG_IP_PROTO_AGAIN);
		break;
	case IPPROTO_SCTP:
		/* No release here; the hold goes to ip_sctp_input(). */
		if (!ire_held)
			IRE_REFHOLD(ire);
		ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
		    ipsec_mp, 0, ill->ill_rq, dst);
		break;
	default:
		ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
		    recv_ill, 0);
		if (ire_held)
			ire_refrele(ire);
		break;
	}

rele_ills:
	/* Drop only the ill holds that were taken in this function. */
	if (ill_held)
		ill_refrele(ill);
	if (rill_held)
		ill_refrele(recv_ill);
}
17073 17074 /* 17075 * Call ill_frag_timeout to do garbage collection. 
ill_frag_timeout 17076 * returns 'true' if there are still fragments left on the queue, in 17077 * which case we restart the timer. 17078 */ 17079 void 17080 ill_frag_timer(void *arg) 17081 { 17082 ill_t *ill = (ill_t *)arg; 17083 boolean_t frag_pending; 17084 ip_stack_t *ipst = ill->ill_ipst; 17085 17086 mutex_enter(&ill->ill_lock); 17087 ASSERT(!ill->ill_fragtimer_executing); 17088 if (ill->ill_state_flags & ILL_CONDEMNED) { 17089 ill->ill_frag_timer_id = 0; 17090 mutex_exit(&ill->ill_lock); 17091 return; 17092 } 17093 ill->ill_fragtimer_executing = 1; 17094 mutex_exit(&ill->ill_lock); 17095 17096 frag_pending = ill_frag_timeout(ill, ipst->ips_ip_g_frag_timeout); 17097 17098 /* 17099 * Restart the timer, if we have fragments pending or if someone 17100 * wanted us to be scheduled again. 17101 */ 17102 mutex_enter(&ill->ill_lock); 17103 ill->ill_fragtimer_executing = 0; 17104 ill->ill_frag_timer_id = 0; 17105 if (frag_pending || ill->ill_fragtimer_needrestart) 17106 ill_frag_timer_start(ill); 17107 mutex_exit(&ill->ill_lock); 17108 } 17109 17110 void 17111 ill_frag_timer_start(ill_t *ill) 17112 { 17113 ip_stack_t *ipst = ill->ill_ipst; 17114 17115 ASSERT(MUTEX_HELD(&ill->ill_lock)); 17116 17117 /* If the ill is closing or opening don't proceed */ 17118 if (ill->ill_state_flags & ILL_CONDEMNED) 17119 return; 17120 17121 if (ill->ill_fragtimer_executing) { 17122 /* 17123 * ill_frag_timer is currently executing. Just record the 17124 * the fact that we want the timer to be restarted. 17125 * ill_frag_timer will post a timeout before it returns, 17126 * ensuring it will be called again. 17127 */ 17128 ill->ill_fragtimer_needrestart = 1; 17129 return; 17130 } 17131 17132 if (ill->ill_frag_timer_id == 0) { 17133 /* 17134 * The timer is neither running nor is the timeout handler 17135 * executing. 
Post a timeout so that ill_frag_timer will be 17136 * called 17137 */ 17138 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 17139 MSEC_TO_TICK(ipst->ips_ip_g_frag_timo_ms >> 1)); 17140 ill->ill_fragtimer_needrestart = 0; 17141 } 17142 } 17143 17144 /* 17145 * This routine is needed for loopback when forwarding multicasts. 17146 * 17147 * IPQoS Notes: 17148 * IPPF processing is done in fanout routines. 17149 * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, 17150 * processing for IPsec packets is done when it comes back in clear. 17151 * NOTE : The callers of this function need to do the ire_refrele for the 17152 * ire that is being passed in. 17153 */ 17154 void 17155 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17156 ill_t *recv_ill, uint32_t esp_udp_ports) 17157 { 17158 boolean_t esp_in_udp_packet = (esp_udp_ports != 0); 17159 ill_t *ill = (ill_t *)q->q_ptr; 17160 uint32_t sum; 17161 uint32_t u1; 17162 uint32_t u2; 17163 int hdr_length; 17164 boolean_t mctl_present; 17165 mblk_t *first_mp = mp; 17166 mblk_t *hada_mp = NULL; 17167 ipha_t *inner_ipha; 17168 ip_stack_t *ipst; 17169 17170 ASSERT(recv_ill != NULL); 17171 ipst = recv_ill->ill_ipst; 17172 17173 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 17174 "ip_rput_locl_start: q %p", q); 17175 17176 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17177 ASSERT(ill != NULL); 17178 17179 17180 #define rptr ((uchar_t *)ipha) 17181 #define iphs ((uint16_t *)ipha) 17182 17183 /* 17184 * no UDP or TCP packet should come here anymore. 17185 */ 17186 ASSERT(ipha->ipha_protocol != IPPROTO_TCP && 17187 ipha->ipha_protocol != IPPROTO_UDP); 17188 17189 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 17190 if (mctl_present && 17191 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 17192 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 17193 17194 /* 17195 * It's an IPsec accelerated packet. 
17196 * Keep a pointer to the data attributes around until 17197 * we allocate the ipsec_info_t. 17198 */ 17199 IPSECHW_DEBUG(IPSECHW_PKT, 17200 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 17201 hada_mp = first_mp; 17202 hada_mp->b_cont = NULL; 17203 /* 17204 * Since it is accelerated, it comes directly from 17205 * the ill and the data attributes is followed by 17206 * the packet data. 17207 */ 17208 ASSERT(mp->b_datap->db_type != M_CTL); 17209 first_mp = mp; 17210 mctl_present = B_FALSE; 17211 } 17212 17213 /* 17214 * IF M_CTL is not present, then ipsec_in_is_secure 17215 * should return B_TRUE. There is a case where loopback 17216 * packets has an M_CTL in the front with all the 17217 * IPsec options set to IPSEC_PREF_NEVER - which means 17218 * ipsec_in_is_secure will return B_FALSE. As loopback 17219 * packets never comes here, it is safe to ASSERT the 17220 * following. 17221 */ 17222 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 17223 17224 /* 17225 * Also, we should never have an mctl_present if this is an 17226 * ESP-in-UDP packet. 17227 */ 17228 ASSERT(!mctl_present || !esp_in_udp_packet); 17229 17230 17231 /* u1 is # words of IP options */ 17232 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 17233 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 17234 17235 /* 17236 * Don't verify header checksum if we just removed UDP header or 17237 * packet is coming back from AH/ESP. 17238 */ 17239 if (!esp_in_udp_packet && !mctl_present) { 17240 if (u1) { 17241 if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { 17242 if (hada_mp != NULL) 17243 freemsg(hada_mp); 17244 return; 17245 } 17246 } else { 17247 /* Check the IP header checksum. 
*/ 17248 #define uph ((uint16_t *)ipha) 17249 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 17250 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 17251 #undef uph 17252 /* finish doing IP checksum */ 17253 sum = (sum & 0xFFFF) + (sum >> 16); 17254 sum = ~(sum + (sum >> 16)) & 0xFFFF; 17255 if (sum && sum != 0xFFFF) { 17256 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 17257 goto drop_pkt; 17258 } 17259 } 17260 } 17261 17262 /* 17263 * Count for SNMP of inbound packets for ire. As ip_proto_input 17264 * might be called more than once for secure packets, count only 17265 * the first time. 17266 */ 17267 if (!mctl_present) { 17268 UPDATE_IB_PKT_COUNT(ire); 17269 ire->ire_last_used_time = lbolt; 17270 } 17271 17272 /* Check for fragmentation offset. */ 17273 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 17274 u1 = u2 & (IPH_MF | IPH_OFFSET); 17275 if (u1) { 17276 /* 17277 * We re-assemble fragments before we do the AH/ESP 17278 * processing. Thus, M_CTL should not be present 17279 * while we are re-assembling. 17280 */ 17281 ASSERT(!mctl_present); 17282 ASSERT(first_mp == mp); 17283 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 17284 return; 17285 } 17286 /* 17287 * Make sure that first_mp points back to mp as 17288 * the mp we came in with could have changed in 17289 * ip_rput_fragment(). 17290 */ 17291 ipha = (ipha_t *)mp->b_rptr; 17292 first_mp = mp; 17293 } 17294 17295 /* 17296 * Clear hardware checksumming flag as it is currently only 17297 * used by TCP and UDP. 17298 */ 17299 DB_CKSUMFLAGS(mp) = 0; 17300 17301 /* Now we have a complete datagram, destined for this machine. 
*/ 17302 u1 = IPH_HDR_LENGTH(ipha); 17303 switch (ipha->ipha_protocol) { 17304 case IPPROTO_ICMP: { 17305 ire_t *ire_zone; 17306 ilm_t *ilm; 17307 mblk_t *mp1; 17308 zoneid_t last_zoneid; 17309 17310 if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { 17311 ASSERT(ire->ire_type == IRE_BROADCAST); 17312 /* 17313 * Inactive/Failed interfaces are not supposed to 17314 * respond to the multicast packets. 17315 */ 17316 if (ill_is_probeonly(ill)) { 17317 freemsg(first_mp); 17318 return; 17319 } 17320 17321 /* 17322 * In the multicast case, applications may have joined 17323 * the group from different zones, so we need to deliver 17324 * the packet to each of them. Loop through the 17325 * multicast memberships structures (ilm) on the receive 17326 * ill and send a copy of the packet up each matching 17327 * one. However, we don't do this for multicasts sent on 17328 * the loopback interface (PHYI_LOOPBACK flag set) as 17329 * they must stay in the sender's zone. 17330 * 17331 * ilm_add_v6() ensures that ilms in the same zone are 17332 * contiguous in the ill_ilm list. We use this property 17333 * to avoid sending duplicates needed when two 17334 * applications in the same zone join the same group on 17335 * different logical interfaces: we ignore the ilm if 17336 * its zoneid is the same as the last matching one. 17337 * In addition, the sending of the packet for 17338 * ire_zoneid is delayed until all of the other ilms 17339 * have been exhausted. 
17340 */ 17341 last_zoneid = -1; 17342 ILM_WALKER_HOLD(recv_ill); 17343 for (ilm = recv_ill->ill_ilm; ilm != NULL; 17344 ilm = ilm->ilm_next) { 17345 if ((ilm->ilm_flags & ILM_DELETED) || 17346 ipha->ipha_dst != ilm->ilm_addr || 17347 ilm->ilm_zoneid == last_zoneid || 17348 ilm->ilm_zoneid == ire->ire_zoneid || 17349 ilm->ilm_zoneid == ALL_ZONES || 17350 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 17351 continue; 17352 mp1 = ip_copymsg(first_mp); 17353 if (mp1 == NULL) 17354 continue; 17355 icmp_inbound(q, mp1, B_TRUE, ill, 17356 0, sum, mctl_present, B_TRUE, 17357 recv_ill, ilm->ilm_zoneid); 17358 last_zoneid = ilm->ilm_zoneid; 17359 } 17360 ILM_WALKER_RELE(recv_ill); 17361 } else if (ire->ire_type == IRE_BROADCAST) { 17362 /* 17363 * In the broadcast case, there may be many zones 17364 * which need a copy of the packet delivered to them. 17365 * There is one IRE_BROADCAST per broadcast address 17366 * and per zone; we walk those using a helper function. 17367 * In addition, the sending of the packet for ire is 17368 * delayed until all of the other ires have been 17369 * processed. 17370 */ 17371 IRB_REFHOLD(ire->ire_bucket); 17372 ire_zone = NULL; 17373 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 17374 ire)) != NULL) { 17375 mp1 = ip_copymsg(first_mp); 17376 if (mp1 == NULL) 17377 continue; 17378 17379 UPDATE_IB_PKT_COUNT(ire_zone); 17380 ire_zone->ire_last_used_time = lbolt; 17381 icmp_inbound(q, mp1, B_TRUE, ill, 17382 0, sum, mctl_present, B_TRUE, 17383 recv_ill, ire_zone->ire_zoneid); 17384 } 17385 IRB_REFRELE(ire->ire_bucket); 17386 } 17387 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 17388 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 17389 ire->ire_zoneid); 17390 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17391 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 17392 return; 17393 } 17394 case IPPROTO_IGMP: 17395 /* 17396 * If we are not willing to accept IGMP packets in clear, 17397 * then check with global policy. 
17398 */ 17399 if (ipst->ips_igmp_accept_clear_messages == 0) { 17400 first_mp = ipsec_check_global_policy(first_mp, NULL, 17401 ipha, NULL, mctl_present, ipst->ips_netstack); 17402 if (first_mp == NULL) 17403 return; 17404 } 17405 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17406 freemsg(first_mp); 17407 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 17408 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17409 return; 17410 } 17411 if ((mp = igmp_input(q, mp, ill)) == NULL) { 17412 /* Bad packet - discarded by igmp_input */ 17413 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17414 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 17415 if (mctl_present) 17416 freeb(first_mp); 17417 return; 17418 } 17419 /* 17420 * igmp_input() may have returned the pulled up message. 17421 * So first_mp and ipha need to be reinitialized. 17422 */ 17423 ipha = (ipha_t *)mp->b_rptr; 17424 if (mctl_present) 17425 first_mp->b_cont = mp; 17426 else 17427 first_mp = mp; 17428 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17429 connf_head != NULL) { 17430 /* No user-level listener for IGMP packets */ 17431 goto drop_pkt; 17432 } 17433 /* deliver to local raw users */ 17434 break; 17435 case IPPROTO_PIM: 17436 /* 17437 * If we are not willing to accept PIM packets in clear, 17438 * then check with global policy. 
17439 */ 17440 if (ipst->ips_pim_accept_clear_messages == 0) { 17441 first_mp = ipsec_check_global_policy(first_mp, NULL, 17442 ipha, NULL, mctl_present, ipst->ips_netstack); 17443 if (first_mp == NULL) 17444 return; 17445 } 17446 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17447 freemsg(first_mp); 17448 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 17449 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17450 return; 17451 } 17452 if (pim_input(q, mp, ill) != 0) { 17453 /* Bad packet - discarded by pim_input */ 17454 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17455 "ip_rput_locl_end: q %p (%S)", q, "pim"); 17456 if (mctl_present) 17457 freeb(first_mp); 17458 return; 17459 } 17460 17461 /* 17462 * pim_input() may have pulled up the message so ipha needs to 17463 * be reinitialized. 17464 */ 17465 ipha = (ipha_t *)mp->b_rptr; 17466 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17467 connf_head != NULL) { 17468 /* No user-level listener for PIM packets */ 17469 goto drop_pkt; 17470 } 17471 /* deliver to local raw users */ 17472 break; 17473 case IPPROTO_ENCAP: 17474 /* 17475 * Handle self-encapsulated packets (IP-in-IP where 17476 * the inner addresses == the outer addresses). 17477 */ 17478 hdr_length = IPH_HDR_LENGTH(ipha); 17479 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 17480 mp->b_wptr) { 17481 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 17482 sizeof (ipha_t) - mp->b_rptr)) { 17483 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17484 freemsg(first_mp); 17485 return; 17486 } 17487 ipha = (ipha_t *)mp->b_rptr; 17488 } 17489 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 17490 /* 17491 * Check the sanity of the inner IP header. 
17492 */ 17493 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 17494 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17495 freemsg(first_mp); 17496 return; 17497 } 17498 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 17499 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17500 freemsg(first_mp); 17501 return; 17502 } 17503 if (inner_ipha->ipha_src == ipha->ipha_src && 17504 inner_ipha->ipha_dst == ipha->ipha_dst) { 17505 ipsec_in_t *ii; 17506 17507 /* 17508 * Self-encapsulated tunnel packet. Remove 17509 * the outer IP header and fanout again. 17510 * We also need to make sure that the inner 17511 * header is pulled up until options. 17512 */ 17513 mp->b_rptr = (uchar_t *)inner_ipha; 17514 ipha = inner_ipha; 17515 hdr_length = IPH_HDR_LENGTH(ipha); 17516 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 17517 if (!pullupmsg(mp, (uchar_t *)ipha + 17518 + hdr_length - mp->b_rptr)) { 17519 freemsg(first_mp); 17520 return; 17521 } 17522 ipha = (ipha_t *)mp->b_rptr; 17523 } 17524 if (hdr_length > sizeof (ipha_t)) { 17525 /* We got options on the inner packet. */ 17526 ipaddr_t dst = ipha->ipha_dst; 17527 17528 if (ip_rput_options(q, mp, ipha, &dst, ipst) == 17529 -1) { 17530 /* Bad options! */ 17531 return; 17532 } 17533 if (dst != ipha->ipha_dst) { 17534 /* 17535 * Someone put a source-route in 17536 * the inside header of a self- 17537 * encapsulated packet. Drop it 17538 * with extreme prejudice and let 17539 * the sender know. 17540 */ 17541 icmp_unreachable(q, first_mp, 17542 ICMP_SOURCE_ROUTE_FAILED, 17543 recv_ill->ill_zoneid, ipst); 17544 return; 17545 } 17546 } 17547 if (!mctl_present) { 17548 ASSERT(first_mp == mp); 17549 /* 17550 * This means that somebody is sending 17551 * Self-encapsualted packets without AH/ESP. 17552 * If AH/ESP was present, we would have already 17553 * allocated the first_mp. 17554 * 17555 * Send this packet to find a tunnel endpoint. 17556 * if I can't find one, an ICMP 17557 * PROTOCOL_UNREACHABLE will get sent. 
17558 */ 17559 goto fanout; 17560 } 17561 /* 17562 * We generally store the ill_index if we need to 17563 * do IPsec processing as we lose the ill queue when 17564 * we come back. But in this case, we never should 17565 * have to store the ill_index here as it should have 17566 * been stored previously when we processed the 17567 * AH/ESP header in this routine or for non-ipsec 17568 * cases, we still have the queue. But for some bad 17569 * packets from the wire, we can get to IPsec after 17570 * this and we better store the index for that case. 17571 */ 17572 ill = (ill_t *)q->q_ptr; 17573 ii = (ipsec_in_t *)first_mp->b_rptr; 17574 ii->ipsec_in_ill_index = 17575 ill->ill_phyint->phyint_ifindex; 17576 ii->ipsec_in_rill_index = 17577 recv_ill->ill_phyint->phyint_ifindex; 17578 if (ii->ipsec_in_decaps) { 17579 /* 17580 * This packet is self-encapsulated multiple 17581 * times. We don't want to recurse infinitely. 17582 * To keep it simple, drop the packet. 17583 */ 17584 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17585 freemsg(first_mp); 17586 return; 17587 } 17588 ii->ipsec_in_decaps = B_TRUE; 17589 ip_fanout_proto_again(first_mp, recv_ill, recv_ill, 17590 ire); 17591 return; 17592 } 17593 break; 17594 case IPPROTO_AH: 17595 case IPPROTO_ESP: { 17596 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 17597 ipsa_t *assoc; 17598 17599 /* 17600 * Fast path for AH/ESP. If this is the first time 17601 * we are sending a datagram to AH/ESP, allocate 17602 * a IPSEC_IN message and prepend it. Otherwise, 17603 * just fanout. 
17604 */ 17605 17606 int ipsec_rc; 17607 ipsec_in_t *ii; 17608 netstack_t *ns = ipst->ips_netstack; 17609 17610 IP_STAT(ipst, ipsec_proto_ahesp); 17611 if (!mctl_present) { 17612 ASSERT(first_mp == mp); 17613 first_mp = ipsec_in_alloc(B_TRUE, ns); 17614 if (first_mp == NULL) { 17615 ip1dbg(("ip_proto_input: IPSEC_IN " 17616 "allocation failure.\n")); 17617 freemsg(hada_mp); /* okay ifnull */ 17618 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17619 freemsg(mp); 17620 return; 17621 } 17622 /* 17623 * Store the ill_index so that when we come back 17624 * from IPsec we ride on the same queue. 17625 */ 17626 ill = (ill_t *)q->q_ptr; 17627 ii = (ipsec_in_t *)first_mp->b_rptr; 17628 ii->ipsec_in_ill_index = 17629 ill->ill_phyint->phyint_ifindex; 17630 ii->ipsec_in_rill_index = 17631 recv_ill->ill_phyint->phyint_ifindex; 17632 first_mp->b_cont = mp; 17633 /* 17634 * Cache hardware acceleration info. 17635 */ 17636 if (hada_mp != NULL) { 17637 IPSECHW_DEBUG(IPSECHW_PKT, 17638 ("ip_rput_local: caching data attr.\n")); 17639 ii->ipsec_in_accelerated = B_TRUE; 17640 ii->ipsec_in_da = hada_mp; 17641 hada_mp = NULL; 17642 } 17643 } else { 17644 ii = (ipsec_in_t *)first_mp->b_rptr; 17645 } 17646 17647 ii->ipsec_in_esp_udp_ports = esp_udp_ports; 17648 17649 if (!ipsec_loaded(ipss)) { 17650 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 17651 ire->ire_zoneid, ipst); 17652 return; 17653 } 17654 17655 ns = ipst->ips_netstack; 17656 /* select inbound SA and have IPsec process the pkt */ 17657 if (ipha->ipha_protocol == IPPROTO_ESP) { 17658 esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); 17659 boolean_t esp_in_udp_sa; 17660 if (esph == NULL) 17661 return; 17662 ASSERT(ii->ipsec_in_esp_sa != NULL); 17663 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 17664 esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & 17665 IPSA_F_NATT) != 0); 17666 /* 17667 * The following is a fancy, but quick, way of saying: 17668 * ESP-in-UDP SA and Raw ESP packet --> drop 17669 * OR 17670 * ESP 
SA and ESP-in-UDP packet --> drop 17671 */ 17672 if (esp_in_udp_sa != esp_in_udp_packet) { 17673 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17674 ip_drop_packet(first_mp, B_TRUE, ill, NULL, 17675 DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), 17676 &ns->netstack_ipsec->ipsec_dropper); 17677 return; 17678 } 17679 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 17680 first_mp, esph); 17681 assoc = ii->ipsec_in_esp_sa; 17682 } else { 17683 ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); 17684 if (ah == NULL) 17685 return; 17686 ASSERT(ii->ipsec_in_ah_sa != NULL); 17687 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 17688 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 17689 first_mp, ah); 17690 assoc = ii->ipsec_in_ah_sa; 17691 } 17692 17693 switch (ipsec_rc) { 17694 case IPSEC_STATUS_SUCCESS: 17695 /* 17696 * The packet is successfully processed but 17697 * received on an SA which is in IDLE state. 17698 * We queue the packet for subsequent 17699 * processing after the SA moves to MATURE 17700 * state. 17701 */ 17702 if ((assoc != NULL) && 17703 (assoc->ipsa_state == IPSA_STATE_IDLE)) { 17704 ASSERT(cl_inet_idlesa != NULL); 17705 in6_addr_t srcaddr, dstaddr; 17706 uint8_t protocol; 17707 protocol = (assoc->ipsa_type == SADB_SATYPE_AH) 17708 ? 
IPPROTO_AH : IPPROTO_ESP; 17709 IPSA_COPY_ADDR(&srcaddr, assoc->ipsa_srcaddr, 17710 assoc->ipsa_addrfam); 17711 IPSA_COPY_ADDR(&dstaddr, assoc->ipsa_dstaddr, 17712 assoc->ipsa_addrfam); 17713 cl_inet_idlesa(protocol, assoc->ipsa_spi, 17714 assoc->ipsa_addrfam, srcaddr, 17715 dstaddr); 17716 sadb_buf_pkt(assoc, first_mp, ns); 17717 return; 17718 } 17719 break; 17720 case IPSEC_STATUS_FAILED: 17721 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17722 /* FALLTHRU */ 17723 case IPSEC_STATUS_PENDING: 17724 return; 17725 } 17726 /* we're done with IPsec processing, send it up */ 17727 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 17728 return; 17729 } 17730 default: 17731 break; 17732 } 17733 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 17734 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 17735 ire->ire_zoneid)); 17736 goto drop_pkt; 17737 } 17738 /* 17739 * Handle protocols with which IP is less intimate. There 17740 * can be more than one stream bound to a particular 17741 * protocol. When this is the case, each one gets a copy 17742 * of any incoming packets. 17743 */ 17744 fanout: 17745 ip_fanout_proto(q, first_mp, ill, ipha, 17746 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, 17747 B_TRUE, recv_ill, ire->ire_zoneid); 17748 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17749 "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); 17750 return; 17751 17752 drop_pkt: 17753 freemsg(first_mp); 17754 if (hada_mp != NULL) 17755 freeb(hada_mp); 17756 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17757 "ip_rput_locl_end: q %p (%S)", q, "droppkt"); 17758 #undef rptr 17759 #undef iphs 17760 17761 } 17762 17763 /* 17764 * Update any source route, record route or timestamp options. 17765 * Check that we are at end of strict source route. 17766 * The options have already been checked for sanity in ip_rput_options(). 
17767 */ 17768 static boolean_t 17769 ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17770 ip_stack_t *ipst) 17771 { 17772 ipoptp_t opts; 17773 uchar_t *opt; 17774 uint8_t optval; 17775 uint8_t optlen; 17776 ipaddr_t dst; 17777 uint32_t ts; 17778 ire_t *dst_ire; 17779 timestruc_t now; 17780 zoneid_t zoneid; 17781 ill_t *ill; 17782 17783 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17784 17785 ip2dbg(("ip_rput_local_options\n")); 17786 17787 for (optval = ipoptp_first(&opts, ipha); 17788 optval != IPOPT_EOL; 17789 optval = ipoptp_next(&opts)) { 17790 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 17791 opt = opts.ipoptp_cur; 17792 optlen = opts.ipoptp_len; 17793 ip2dbg(("ip_rput_local_options: opt %d, len %d\n", 17794 optval, optlen)); 17795 switch (optval) { 17796 uint32_t off; 17797 case IPOPT_SSRR: 17798 case IPOPT_LSRR: 17799 off = opt[IPOPT_OFFSET]; 17800 off--; 17801 if (optlen < IP_ADDR_LEN || 17802 off > optlen - IP_ADDR_LEN) { 17803 /* End of source route */ 17804 ip1dbg(("ip_rput_local_options: end of SR\n")); 17805 break; 17806 } 17807 /* 17808 * This will only happen if two consecutive entries 17809 * in the source route contains our address or if 17810 * it is a packet with a loose source route which 17811 * reaches us before consuming the whole source route 17812 */ 17813 ip1dbg(("ip_rput_local_options: not end of SR\n")); 17814 if (optval == IPOPT_SSRR) { 17815 goto bad_src_route; 17816 } 17817 /* 17818 * Hack: instead of dropping the packet truncate the 17819 * source route to what has been used by filling the 17820 * rest with IPOPT_NOP. 
17821 */ 17822 opt[IPOPT_OLEN] = (uint8_t)off; 17823 while (off < optlen) { 17824 opt[off++] = IPOPT_NOP; 17825 } 17826 break; 17827 case IPOPT_RR: 17828 off = opt[IPOPT_OFFSET]; 17829 off--; 17830 if (optlen < IP_ADDR_LEN || 17831 off > optlen - IP_ADDR_LEN) { 17832 /* No more room - ignore */ 17833 ip1dbg(( 17834 "ip_rput_local_options: end of RR\n")); 17835 break; 17836 } 17837 bcopy(&ire->ire_src_addr, (char *)opt + off, 17838 IP_ADDR_LEN); 17839 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17840 break; 17841 case IPOPT_TS: 17842 /* Insert timestamp if there is romm */ 17843 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17844 case IPOPT_TS_TSONLY: 17845 off = IPOPT_TS_TIMELEN; 17846 break; 17847 case IPOPT_TS_PRESPEC: 17848 case IPOPT_TS_PRESPEC_RFC791: 17849 /* Verify that the address matched */ 17850 off = opt[IPOPT_OFFSET] - 1; 17851 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17852 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 17853 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 17854 ipst); 17855 if (dst_ire == NULL) { 17856 /* Not for us */ 17857 break; 17858 } 17859 ire_refrele(dst_ire); 17860 /* FALLTHRU */ 17861 case IPOPT_TS_TSANDADDR: 17862 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 17863 break; 17864 default: 17865 /* 17866 * ip_*put_options should have already 17867 * dropped this packet. 
17868 */ 17869 cmn_err(CE_PANIC, "ip_rput_local_options: " 17870 "unknown IT - bug in ip_rput_options?\n"); 17871 return (B_TRUE); /* Keep "lint" happy */ 17872 } 17873 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 17874 /* Increase overflow counter */ 17875 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 17876 opt[IPOPT_POS_OV_FLG] = 17877 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 17878 (off << 4)); 17879 break; 17880 } 17881 off = opt[IPOPT_OFFSET] - 1; 17882 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17883 case IPOPT_TS_PRESPEC: 17884 case IPOPT_TS_PRESPEC_RFC791: 17885 case IPOPT_TS_TSANDADDR: 17886 bcopy(&ire->ire_src_addr, (char *)opt + off, 17887 IP_ADDR_LEN); 17888 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17889 /* FALLTHRU */ 17890 case IPOPT_TS_TSONLY: 17891 off = opt[IPOPT_OFFSET] - 1; 17892 /* Compute # of milliseconds since midnight */ 17893 gethrestime(&now); 17894 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 17895 now.tv_nsec / (NANOSEC / MILLISEC); 17896 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 17897 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 17898 break; 17899 } 17900 break; 17901 } 17902 } 17903 return (B_TRUE); 17904 17905 bad_src_route: 17906 q = WR(q); 17907 if (q->q_next != NULL) 17908 ill = q->q_ptr; 17909 else 17910 ill = NULL; 17911 17912 /* make sure we clear any indication of a hardware checksum */ 17913 DB_CKSUMFLAGS(mp) = 0; 17914 zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); 17915 if (zoneid == ALL_ZONES) 17916 freemsg(mp); 17917 else 17918 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 17919 return (B_FALSE); 17920 17921 } 17922 17923 /* 17924 * Process IP options in an inbound packet. If an option affects the 17925 * effective destination address, return the next hop address via dstp. 17926 * Returns -1 if something fails in which case an ICMP error has been sent 17927 * and mp freed. 
17928 */ 17929 static int 17930 ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, 17931 ip_stack_t *ipst) 17932 { 17933 ipoptp_t opts; 17934 uchar_t *opt; 17935 uint8_t optval; 17936 uint8_t optlen; 17937 ipaddr_t dst; 17938 intptr_t code = 0; 17939 ire_t *ire = NULL; 17940 zoneid_t zoneid; 17941 ill_t *ill; 17942 17943 ip2dbg(("ip_rput_options\n")); 17944 dst = ipha->ipha_dst; 17945 for (optval = ipoptp_first(&opts, ipha); 17946 optval != IPOPT_EOL; 17947 optval = ipoptp_next(&opts)) { 17948 opt = opts.ipoptp_cur; 17949 optlen = opts.ipoptp_len; 17950 ip2dbg(("ip_rput_options: opt %d, len %d\n", 17951 optval, optlen)); 17952 /* 17953 * Note: we need to verify the checksum before we 17954 * modify anything thus this routine only extracts the next 17955 * hop dst from any source route. 17956 */ 17957 switch (optval) { 17958 uint32_t off; 17959 case IPOPT_SSRR: 17960 case IPOPT_LSRR: 17961 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 17962 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 17963 if (ire == NULL) { 17964 if (optval == IPOPT_SSRR) { 17965 ip1dbg(("ip_rput_options: not next" 17966 " strict source route 0x%x\n", 17967 ntohl(dst))); 17968 code = (char *)&ipha->ipha_dst - 17969 (char *)ipha; 17970 goto param_prob; /* RouterReq's */ 17971 } 17972 ip2dbg(("ip_rput_options: " 17973 "not next source route 0x%x\n", 17974 ntohl(dst))); 17975 break; 17976 } 17977 ire_refrele(ire); 17978 17979 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 17980 ip1dbg(( 17981 "ip_rput_options: bad option offset\n")); 17982 code = (char *)&opt[IPOPT_OLEN] - 17983 (char *)ipha; 17984 goto param_prob; 17985 } 17986 off = opt[IPOPT_OFFSET]; 17987 off--; 17988 redo_srr: 17989 if (optlen < IP_ADDR_LEN || 17990 off > optlen - IP_ADDR_LEN) { 17991 /* End of source route */ 17992 ip1dbg(("ip_rput_options: end of SR\n")); 17993 break; 17994 } 17995 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17996 ip1dbg(("ip_rput_options: next hop 0x%x\n", 17997 ntohl(dst))); 17998 17999 /* 
18000 * Check if our address is present more than 18001 * once as consecutive hops in source route. 18002 * XXX verify per-interface ip_forwarding 18003 * for source route? 18004 */ 18005 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 18006 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 18007 18008 if (ire != NULL) { 18009 ire_refrele(ire); 18010 off += IP_ADDR_LEN; 18011 goto redo_srr; 18012 } 18013 18014 if (dst == htonl(INADDR_LOOPBACK)) { 18015 ip1dbg(("ip_rput_options: loopback addr in " 18016 "source route!\n")); 18017 goto bad_src_route; 18018 } 18019 /* 18020 * For strict: verify that dst is directly 18021 * reachable. 18022 */ 18023 if (optval == IPOPT_SSRR) { 18024 ire = ire_ftable_lookup(dst, 0, 0, 18025 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 18026 MBLK_GETLABEL(mp), 18027 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 18028 if (ire == NULL) { 18029 ip1dbg(("ip_rput_options: SSRR not " 18030 "directly reachable: 0x%x\n", 18031 ntohl(dst))); 18032 goto bad_src_route; 18033 } 18034 ire_refrele(ire); 18035 } 18036 /* 18037 * Defer update of the offset and the record route 18038 * until the packet is forwarded. 18039 */ 18040 break; 18041 case IPOPT_RR: 18042 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18043 ip1dbg(( 18044 "ip_rput_options: bad option offset\n")); 18045 code = (char *)&opt[IPOPT_OLEN] - 18046 (char *)ipha; 18047 goto param_prob; 18048 } 18049 break; 18050 case IPOPT_TS: 18051 /* 18052 * Verify that length >= 5 and that there is either 18053 * room for another timestamp or that the overflow 18054 * counter is not maxed out. 
18055 */ 18056 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 18057 if (optlen < IPOPT_MINLEN_IT) { 18058 goto param_prob; 18059 } 18060 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 18061 ip1dbg(( 18062 "ip_rput_options: bad option offset\n")); 18063 code = (char *)&opt[IPOPT_OFFSET] - 18064 (char *)ipha; 18065 goto param_prob; 18066 } 18067 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 18068 case IPOPT_TS_TSONLY: 18069 off = IPOPT_TS_TIMELEN; 18070 break; 18071 case IPOPT_TS_TSANDADDR: 18072 case IPOPT_TS_PRESPEC: 18073 case IPOPT_TS_PRESPEC_RFC791: 18074 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 18075 break; 18076 default: 18077 code = (char *)&opt[IPOPT_POS_OV_FLG] - 18078 (char *)ipha; 18079 goto param_prob; 18080 } 18081 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 18082 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 18083 /* 18084 * No room and the overflow counter is 15 18085 * already. 18086 */ 18087 goto param_prob; 18088 } 18089 break; 18090 } 18091 } 18092 18093 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { 18094 *dstp = dst; 18095 return (0); 18096 } 18097 18098 ip1dbg(("ip_rput_options: error processing IP options.")); 18099 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 18100 18101 param_prob: 18102 q = WR(q); 18103 if (q->q_next != NULL) 18104 ill = q->q_ptr; 18105 else 18106 ill = NULL; 18107 18108 /* make sure we clear any indication of a hardware checksum */ 18109 DB_CKSUMFLAGS(mp) = 0; 18110 /* Don't know whether this is for non-global or global/forwarding */ 18111 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18112 if (zoneid == ALL_ZONES) 18113 freemsg(mp); 18114 else 18115 icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst); 18116 return (-1); 18117 18118 bad_src_route: 18119 q = WR(q); 18120 if (q->q_next != NULL) 18121 ill = q->q_ptr; 18122 else 18123 ill = NULL; 18124 18125 /* make sure we clear any indication of a hardware checksum */ 18126 DB_CKSUMFLAGS(mp) = 0; 18127 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 18128 if (zoneid == 
ALL_ZONES) 18129 freemsg(mp); 18130 else 18131 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 18132 return (-1); 18133 } 18134 18135 /* 18136 * IP & ICMP info in >=14 msg's ... 18137 * - ip fixed part (mib2_ip_t) 18138 * - icmp fixed part (mib2_icmp_t) 18139 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 18140 * - ipRouteEntryTable (ip 21) all IPv4 IREs 18141 * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] 18142 * - ipRouteAttributeTable (ip 102) labeled routes 18143 * - ip multicast membership (ip_member_t) 18144 * - ip multicast source filtering (ip_grpsrc_t) 18145 * - igmp fixed part (struct igmpstat) 18146 * - multicast routing stats (struct mrtstat) 18147 * - multicast routing vifs (array of struct vifctl) 18148 * - multicast routing routes (array of struct mfcctl) 18149 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 18150 * One per ill plus one generic 18151 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 18152 * One per ill plus one generic 18153 * - ipv6RouteEntry all IPv6 IREs 18154 * - ipv6RouteAttributeTable (ip6 102) labeled routes 18155 * - ipv6NetToMediaEntry all Neighbor Cache entries 18156 * - ipv6AddrEntry all IPv6 ipifs 18157 * - ipv6 multicast membership (ipv6_member_t) 18158 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 18159 * 18160 * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. 18161 * 18162 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 18163 * already filled in by the caller. 18164 * Return value of 0 indicates that no messages were sent and caller 18165 * should free mpctl. 
18166 */ 18167 int 18168 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) 18169 { 18170 ip_stack_t *ipst; 18171 sctp_stack_t *sctps; 18172 18173 if (q->q_next != NULL) { 18174 ipst = ILLQ_TO_IPST(q); 18175 } else { 18176 ipst = CONNQ_TO_IPST(q); 18177 } 18178 ASSERT(ipst != NULL); 18179 sctps = ipst->ips_netstack->netstack_sctp; 18180 18181 if (mpctl == NULL || mpctl->b_cont == NULL) { 18182 return (0); 18183 } 18184 18185 /* 18186 * For the purposes of the (broken) packet shell use 18187 * of the level we make sure MIB2_TCP/MIB2_UDP can be used 18188 * to make TCP and UDP appear first in the list of mib items. 18189 * TBD: We could expand this and use it in netstat so that 18190 * the kernel doesn't have to produce large tables (connections, 18191 * routes, etc) when netstat only wants the statistics or a particular 18192 * table. 18193 */ 18194 if (!(level == MIB2_TCP || level == MIB2_UDP)) { 18195 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) { 18196 return (1); 18197 } 18198 } 18199 18200 if (level != MIB2_TCP) { 18201 if ((mpctl = udp_snmp_get(q, mpctl)) == NULL) { 18202 return (1); 18203 } 18204 } 18205 18206 if (level != MIB2_UDP) { 18207 if ((mpctl = tcp_snmp_get(q, mpctl)) == NULL) { 18208 return (1); 18209 } 18210 } 18211 18212 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl, 18213 ipst)) == NULL) { 18214 return (1); 18215 } 18216 18217 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst)) == NULL) { 18218 return (1); 18219 } 18220 18221 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) { 18222 return (1); 18223 } 18224 18225 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) { 18226 return (1); 18227 } 18228 18229 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) { 18230 return (1); 18231 } 18232 18233 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) { 18234 return (1); 18235 } 18236 18237 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst)) == NULL) { 18238 return (1); 18239 } 18240 18241 if ((mpctl 
= ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst)) == NULL) { 18242 return (1); 18243 } 18244 18245 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) { 18246 return (1); 18247 } 18248 18249 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) { 18250 return (1); 18251 } 18252 18253 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) { 18254 return (1); 18255 } 18256 18257 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) { 18258 return (1); 18259 } 18260 18261 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) { 18262 return (1); 18263 } 18264 18265 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) { 18266 return (1); 18267 } 18268 18269 if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { 18270 return (1); 18271 } 18272 18273 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); 18274 if (mpctl == NULL) { 18275 return (1); 18276 } 18277 18278 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { 18279 return (1); 18280 } 18281 freemsg(mpctl); 18282 return (1); 18283 } 18284 18285 18286 /* Get global (legacy) IPv4 statistics */ 18287 static mblk_t * 18288 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, 18289 ip_stack_t *ipst) 18290 { 18291 mib2_ip_t old_ip_mib; 18292 struct opthdr *optp; 18293 mblk_t *mp2ctl; 18294 18295 /* 18296 * make a copy of the original message 18297 */ 18298 mp2ctl = copymsg(mpctl); 18299 18300 /* fixed length IP structure... */ 18301 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18302 optp->level = MIB2_IP; 18303 optp->name = 0; 18304 SET_MIB(old_ip_mib.ipForwarding, 18305 (WE_ARE_FORWARDING(ipst) ? 
1 : 2)); 18306 SET_MIB(old_ip_mib.ipDefaultTTL, 18307 (uint32_t)ipst->ips_ip_def_ttl); 18308 SET_MIB(old_ip_mib.ipReasmTimeout, 18309 ipst->ips_ip_g_frag_timeout); 18310 SET_MIB(old_ip_mib.ipAddrEntrySize, 18311 sizeof (mib2_ipAddrEntry_t)); 18312 SET_MIB(old_ip_mib.ipRouteEntrySize, 18313 sizeof (mib2_ipRouteEntry_t)); 18314 SET_MIB(old_ip_mib.ipNetToMediaEntrySize, 18315 sizeof (mib2_ipNetToMediaEntry_t)); 18316 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t)); 18317 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t)); 18318 SET_MIB(old_ip_mib.ipRouteAttributeSize, 18319 sizeof (mib2_ipAttributeEntry_t)); 18320 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); 18321 18322 /* 18323 * Grab the statistics from the new IP MIB 18324 */ 18325 SET_MIB(old_ip_mib.ipInReceives, 18326 (uint32_t)ipmib->ipIfStatsHCInReceives); 18327 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors); 18328 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors); 18329 SET_MIB(old_ip_mib.ipForwDatagrams, 18330 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams); 18331 SET_MIB(old_ip_mib.ipInUnknownProtos, 18332 ipmib->ipIfStatsInUnknownProtos); 18333 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards); 18334 SET_MIB(old_ip_mib.ipInDelivers, 18335 (uint32_t)ipmib->ipIfStatsHCInDelivers); 18336 SET_MIB(old_ip_mib.ipOutRequests, 18337 (uint32_t)ipmib->ipIfStatsHCOutRequests); 18338 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards); 18339 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes); 18340 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds); 18341 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs); 18342 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails); 18343 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs); 18344 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails); 18345 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates); 18346 18347 
/* ipRoutingDiscards is not being used */ 18348 SET_MIB(old_ip_mib.ipRoutingDiscards, 0); 18349 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs); 18350 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts); 18351 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs); 18352 SET_MIB(old_ip_mib.ipReasmDuplicates, 18353 ipmib->ipIfStatsReasmDuplicates); 18354 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups); 18355 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits); 18356 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs); 18357 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows); 18358 SET_MIB(old_ip_mib.rawipInOverflows, 18359 ipmib->rawipIfStatsInOverflows); 18360 18361 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded); 18362 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed); 18363 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion); 18364 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion); 18365 SET_MIB(old_ip_mib.ipOutSwitchIPv6, 18366 ipmib->ipIfStatsOutSwitchIPVersion); 18367 18368 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib, 18369 (int)sizeof (old_ip_mib))) { 18370 ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n", 18371 (uint_t)sizeof (old_ip_mib))); 18372 } 18373 18374 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18375 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n", 18376 (int)optp->level, (int)optp->name, (int)optp->len)); 18377 qreply(q, mpctl); 18378 return (mp2ctl); 18379 } 18380 18381 /* Per interface IPv4 statistics */ 18382 static mblk_t * 18383 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18384 { 18385 struct opthdr *optp; 18386 mblk_t *mp2ctl; 18387 ill_t *ill; 18388 ill_walk_context_t ctx; 18389 mblk_t *mp_tail = NULL; 18390 mib2_ipIfStatsEntry_t global_ip_mib; 18391 18392 /* 18393 * Make a copy of the original message 18394 */ 18395 mp2ctl = 
copymsg(mpctl); 18396 18397 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18398 optp->level = MIB2_IP; 18399 optp->name = MIB2_IP_TRAFFIC_STATS; 18400 /* Include "unknown interface" ip_mib */ 18401 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 18402 ipst->ips_ip_mib.ipIfStatsIfIndex = 18403 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 18404 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding, 18405 (ipst->ips_ip_g_forward ? 1 : 2)); 18406 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL, 18407 (uint32_t)ipst->ips_ip_def_ttl); 18408 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize, 18409 sizeof (mib2_ipIfStatsEntry_t)); 18410 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize, 18411 sizeof (mib2_ipAddrEntry_t)); 18412 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize, 18413 sizeof (mib2_ipRouteEntry_t)); 18414 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize, 18415 sizeof (mib2_ipNetToMediaEntry_t)); 18416 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize, 18417 sizeof (ip_member_t)); 18418 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize, 18419 sizeof (ip_grpsrc_t)); 18420 18421 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18422 (char *)&ipst->ips_ip_mib, (int)sizeof (ipst->ips_ip_mib))) { 18423 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18424 "failed to allocate %u bytes\n", 18425 (uint_t)sizeof (ipst->ips_ip_mib))); 18426 } 18427 18428 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib)); 18429 18430 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18431 ill = ILL_START_WALK_V4(&ctx, ipst); 18432 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18433 ill->ill_ip_mib->ipIfStatsIfIndex = 18434 ill->ill_phyint->phyint_ifindex; 18435 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 18436 (ipst->ips_ip_g_forward ? 
1 : 2)); 18437 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL, 18438 (uint32_t)ipst->ips_ip_def_ttl); 18439 18440 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib); 18441 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18442 (char *)ill->ill_ip_mib, 18443 (int)sizeof (*ill->ill_ip_mib))) { 18444 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18445 "failed to allocate %u bytes\n", 18446 (uint_t)sizeof (*ill->ill_ip_mib))); 18447 } 18448 } 18449 rw_exit(&ipst->ips_ill_g_lock); 18450 18451 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18452 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18453 "level %d, name %d, len %d\n", 18454 (int)optp->level, (int)optp->name, (int)optp->len)); 18455 qreply(q, mpctl); 18456 18457 if (mp2ctl == NULL) 18458 return (NULL); 18459 18460 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst)); 18461 } 18462 18463 /* Global IPv4 ICMP statistics */ 18464 static mblk_t * 18465 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18466 { 18467 struct opthdr *optp; 18468 mblk_t *mp2ctl; 18469 18470 /* 18471 * Make a copy of the original message 18472 */ 18473 mp2ctl = copymsg(mpctl); 18474 18475 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18476 optp->level = MIB2_ICMP; 18477 optp->name = 0; 18478 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib, 18479 (int)sizeof (ipst->ips_icmp_mib))) { 18480 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n", 18481 (uint_t)sizeof (ipst->ips_icmp_mib))); 18482 } 18483 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18484 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n", 18485 (int)optp->level, (int)optp->name, (int)optp->len)); 18486 qreply(q, mpctl); 18487 return (mp2ctl); 18488 } 18489 18490 /* Global IPv4 IGMP statistics */ 18491 static mblk_t * 18492 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18493 { 18494 struct opthdr *optp; 18495 mblk_t *mp2ctl; 18496 18497 /* 18498 * make a copy of the 
original message 18499 */ 18500 mp2ctl = copymsg(mpctl); 18501 18502 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18503 optp->level = EXPER_IGMP; 18504 optp->name = 0; 18505 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat, 18506 (int)sizeof (ipst->ips_igmpstat))) { 18507 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n", 18508 (uint_t)sizeof (ipst->ips_igmpstat))); 18509 } 18510 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18511 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n", 18512 (int)optp->level, (int)optp->name, (int)optp->len)); 18513 qreply(q, mpctl); 18514 return (mp2ctl); 18515 } 18516 18517 /* Global IPv4 Multicast Routing statistics */ 18518 static mblk_t * 18519 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18520 { 18521 struct opthdr *optp; 18522 mblk_t *mp2ctl; 18523 18524 /* 18525 * make a copy of the original message 18526 */ 18527 mp2ctl = copymsg(mpctl); 18528 18529 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18530 optp->level = EXPER_DVMRP; 18531 optp->name = 0; 18532 if (!ip_mroute_stats(mpctl->b_cont, ipst)) { 18533 ip0dbg(("ip_mroute_stats: failed\n")); 18534 } 18535 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18536 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n", 18537 (int)optp->level, (int)optp->name, (int)optp->len)); 18538 qreply(q, mpctl); 18539 return (mp2ctl); 18540 } 18541 18542 /* IPv4 address information */ 18543 static mblk_t * 18544 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18545 { 18546 struct opthdr *optp; 18547 mblk_t *mp2ctl; 18548 mblk_t *mp_tail = NULL; 18549 ill_t *ill; 18550 ipif_t *ipif; 18551 uint_t bitval; 18552 mib2_ipAddrEntry_t mae; 18553 zoneid_t zoneid; 18554 ill_walk_context_t ctx; 18555 18556 /* 18557 * make a copy of the original message 18558 */ 18559 mp2ctl = copymsg(mpctl); 18560 18561 /* ipAddrEntryTable */ 18562 18563 optp = (struct opthdr 
*)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18564 optp->level = MIB2_IP; 18565 optp->name = MIB2_IP_ADDR; 18566 zoneid = Q_TO_CONN(q)->conn_zoneid; 18567 18568 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18569 ill = ILL_START_WALK_V4(&ctx, ipst); 18570 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18571 for (ipif = ill->ill_ipif; ipif != NULL; 18572 ipif = ipif->ipif_next) { 18573 if (ipif->ipif_zoneid != zoneid && 18574 ipif->ipif_zoneid != ALL_ZONES) 18575 continue; 18576 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18577 mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18578 mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18579 18580 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, 18581 OCTET_LENGTH); 18582 mae.ipAdEntIfIndex.o_length = 18583 mi_strlen(mae.ipAdEntIfIndex.o_bytes); 18584 mae.ipAdEntAddr = ipif->ipif_lcl_addr; 18585 mae.ipAdEntNetMask = ipif->ipif_net_mask; 18586 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; 18587 mae.ipAdEntInfo.ae_subnet_len = 18588 ip_mask_to_plen(ipif->ipif_net_mask); 18589 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; 18590 for (bitval = 1; 18591 bitval && 18592 !(bitval & ipif->ipif_brd_addr); 18593 bitval <<= 1) 18594 noop; 18595 mae.ipAdEntBcastAddr = bitval; 18596 mae.ipAdEntReasmMaxSize = IP_MAXPACKET; 18597 mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; 18598 mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; 18599 mae.ipAdEntInfo.ae_broadcast_addr = 18600 ipif->ipif_brd_addr; 18601 mae.ipAdEntInfo.ae_pp_dst_addr = 18602 ipif->ipif_pp_dst_addr; 18603 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | 18604 ill->ill_flags | ill->ill_phyint->phyint_flags; 18605 mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; 18606 18607 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18608 (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { 18609 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to " 18610 "allocate %u bytes\n", 18611 (uint_t)sizeof (mib2_ipAddrEntry_t))); 18612 } 18613 } 18614 } 18615 
rw_exit(&ipst->ips_ill_g_lock); 18616 18617 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18618 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n", 18619 (int)optp->level, (int)optp->name, (int)optp->len)); 18620 qreply(q, mpctl); 18621 return (mp2ctl); 18622 } 18623 18624 /* IPv6 address information */ 18625 static mblk_t * 18626 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18627 { 18628 struct opthdr *optp; 18629 mblk_t *mp2ctl; 18630 mblk_t *mp_tail = NULL; 18631 ill_t *ill; 18632 ipif_t *ipif; 18633 mib2_ipv6AddrEntry_t mae6; 18634 zoneid_t zoneid; 18635 ill_walk_context_t ctx; 18636 18637 /* 18638 * make a copy of the original message 18639 */ 18640 mp2ctl = copymsg(mpctl); 18641 18642 /* ipv6AddrEntryTable */ 18643 18644 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18645 optp->level = MIB2_IP6; 18646 optp->name = MIB2_IP6_ADDR; 18647 zoneid = Q_TO_CONN(q)->conn_zoneid; 18648 18649 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18650 ill = ILL_START_WALK_V6(&ctx, ipst); 18651 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18652 for (ipif = ill->ill_ipif; ipif != NULL; 18653 ipif = ipif->ipif_next) { 18654 if (ipif->ipif_zoneid != zoneid && 18655 ipif->ipif_zoneid != ALL_ZONES) 18656 continue; 18657 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18658 mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18659 mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18660 18661 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, 18662 OCTET_LENGTH); 18663 mae6.ipv6AddrIfIndex.o_length = 18664 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes); 18665 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr; 18666 mae6.ipv6AddrPfxLength = 18667 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 18668 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; 18669 mae6.ipv6AddrInfo.ae_subnet_len = 18670 mae6.ipv6AddrPfxLength; 18671 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; 18672 18673 /* Type: stateless(1), 
stateful(2), unknown(3) */ 18674 if (ipif->ipif_flags & IPIF_ADDRCONF) 18675 mae6.ipv6AddrType = 1; 18676 else 18677 mae6.ipv6AddrType = 2; 18678 /* Anycast: true(1), false(2) */ 18679 if (ipif->ipif_flags & IPIF_ANYCAST) 18680 mae6.ipv6AddrAnycastFlag = 1; 18681 else 18682 mae6.ipv6AddrAnycastFlag = 2; 18683 18684 /* 18685 * Address status: preferred(1), deprecated(2), 18686 * invalid(3), inaccessible(4), unknown(5) 18687 */ 18688 if (ipif->ipif_flags & IPIF_NOLOCAL) 18689 mae6.ipv6AddrStatus = 3; 18690 else if (ipif->ipif_flags & IPIF_DEPRECATED) 18691 mae6.ipv6AddrStatus = 2; 18692 else 18693 mae6.ipv6AddrStatus = 1; 18694 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; 18695 mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; 18696 mae6.ipv6AddrInfo.ae_pp_dst_addr = 18697 ipif->ipif_v6pp_dst_addr; 18698 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags | 18699 ill->ill_flags | ill->ill_phyint->phyint_flags; 18700 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET; 18701 mae6.ipv6AddrIdentifier = ill->ill_token; 18702 mae6.ipv6AddrIdentifierLen = ill->ill_token_length; 18703 mae6.ipv6AddrReachableTime = ill->ill_reachable_time; 18704 mae6.ipv6AddrRetransmitTime = 18705 ill->ill_reachable_retrans_time; 18706 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18707 (char *)&mae6, 18708 (int)sizeof (mib2_ipv6AddrEntry_t))) { 18709 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to " 18710 "allocate %u bytes\n", 18711 (uint_t)sizeof (mib2_ipv6AddrEntry_t))); 18712 } 18713 } 18714 } 18715 rw_exit(&ipst->ips_ill_g_lock); 18716 18717 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18718 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n", 18719 (int)optp->level, (int)optp->name, (int)optp->len)); 18720 qreply(q, mpctl); 18721 return (mp2ctl); 18722 } 18723 18724 /* IPv4 multicast group membership. 
*/ 18725 static mblk_t * 18726 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18727 { 18728 struct opthdr *optp; 18729 mblk_t *mp2ctl; 18730 ill_t *ill; 18731 ipif_t *ipif; 18732 ilm_t *ilm; 18733 ip_member_t ipm; 18734 mblk_t *mp_tail = NULL; 18735 ill_walk_context_t ctx; 18736 zoneid_t zoneid; 18737 18738 /* 18739 * make a copy of the original message 18740 */ 18741 mp2ctl = copymsg(mpctl); 18742 zoneid = Q_TO_CONN(q)->conn_zoneid; 18743 18744 /* ipGroupMember table */ 18745 optp = (struct opthdr *)&mpctl->b_rptr[ 18746 sizeof (struct T_optmgmt_ack)]; 18747 optp->level = MIB2_IP; 18748 optp->name = EXPER_IP_GROUP_MEMBERSHIP; 18749 18750 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18751 ill = ILL_START_WALK_V4(&ctx, ipst); 18752 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18753 ILM_WALKER_HOLD(ill); 18754 for (ipif = ill->ill_ipif; ipif != NULL; 18755 ipif = ipif->ipif_next) { 18756 if (ipif->ipif_zoneid != zoneid && 18757 ipif->ipif_zoneid != ALL_ZONES) 18758 continue; /* not this zone */ 18759 ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, 18760 OCTET_LENGTH); 18761 ipm.ipGroupMemberIfIndex.o_length = 18762 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); 18763 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18764 ASSERT(ilm->ilm_ipif != NULL); 18765 ASSERT(ilm->ilm_ill == NULL); 18766 if (ilm->ilm_ipif != ipif) 18767 continue; 18768 ipm.ipGroupMemberAddress = ilm->ilm_addr; 18769 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; 18770 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; 18771 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18772 (char *)&ipm, (int)sizeof (ipm))) { 18773 ip1dbg(("ip_snmp_get_mib2_ip_group: " 18774 "failed to allocate %u bytes\n", 18775 (uint_t)sizeof (ipm))); 18776 } 18777 } 18778 } 18779 ILM_WALKER_RELE(ill); 18780 } 18781 rw_exit(&ipst->ips_ill_g_lock); 18782 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18783 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18784 (int)optp->level, 
(int)optp->name, (int)optp->len)); 18785 qreply(q, mpctl); 18786 return (mp2ctl); 18787 } 18788 18789 /* IPv6 multicast group membership. */ 18790 static mblk_t * 18791 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18792 { 18793 struct opthdr *optp; 18794 mblk_t *mp2ctl; 18795 ill_t *ill; 18796 ilm_t *ilm; 18797 ipv6_member_t ipm6; 18798 mblk_t *mp_tail = NULL; 18799 ill_walk_context_t ctx; 18800 zoneid_t zoneid; 18801 18802 /* 18803 * make a copy of the original message 18804 */ 18805 mp2ctl = copymsg(mpctl); 18806 zoneid = Q_TO_CONN(q)->conn_zoneid; 18807 18808 /* ip6GroupMember table */ 18809 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18810 optp->level = MIB2_IP6; 18811 optp->name = EXPER_IP6_GROUP_MEMBERSHIP; 18812 18813 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18814 ill = ILL_START_WALK_V6(&ctx, ipst); 18815 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18816 ILM_WALKER_HOLD(ill); 18817 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; 18818 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18819 ASSERT(ilm->ilm_ipif == NULL); 18820 ASSERT(ilm->ilm_ill != NULL); 18821 if (ilm->ilm_zoneid != zoneid) 18822 continue; /* not this zone */ 18823 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; 18824 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; 18825 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode; 18826 if (!snmp_append_data2(mpctl->b_cont, 18827 &mp_tail, 18828 (char *)&ipm6, (int)sizeof (ipm6))) { 18829 ip1dbg(("ip_snmp_get_mib2_ip6_group: " 18830 "failed to allocate %u bytes\n", 18831 (uint_t)sizeof (ipm6))); 18832 } 18833 } 18834 ILM_WALKER_RELE(ill); 18835 } 18836 rw_exit(&ipst->ips_ill_g_lock); 18837 18838 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18839 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18840 (int)optp->level, (int)optp->name, (int)optp->len)); 18841 qreply(q, mpctl); 18842 return (mp2ctl); 18843 } 18844 18845 /* IP multicast filtered sources */ 18846 
static mblk_t * 18847 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18848 { 18849 struct opthdr *optp; 18850 mblk_t *mp2ctl; 18851 ill_t *ill; 18852 ipif_t *ipif; 18853 ilm_t *ilm; 18854 ip_grpsrc_t ips; 18855 mblk_t *mp_tail = NULL; 18856 ill_walk_context_t ctx; 18857 zoneid_t zoneid; 18858 int i; 18859 slist_t *sl; 18860 18861 /* 18862 * make a copy of the original message 18863 */ 18864 mp2ctl = copymsg(mpctl); 18865 zoneid = Q_TO_CONN(q)->conn_zoneid; 18866 18867 /* ipGroupSource table */ 18868 optp = (struct opthdr *)&mpctl->b_rptr[ 18869 sizeof (struct T_optmgmt_ack)]; 18870 optp->level = MIB2_IP; 18871 optp->name = EXPER_IP_GROUP_SOURCES; 18872 18873 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18874 ill = ILL_START_WALK_V4(&ctx, ipst); 18875 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18876 ILM_WALKER_HOLD(ill); 18877 for (ipif = ill->ill_ipif; ipif != NULL; 18878 ipif = ipif->ipif_next) { 18879 if (ipif->ipif_zoneid != zoneid) 18880 continue; /* not this zone */ 18881 ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, 18882 OCTET_LENGTH); 18883 ips.ipGroupSourceIfIndex.o_length = 18884 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); 18885 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18886 ASSERT(ilm->ilm_ipif != NULL); 18887 ASSERT(ilm->ilm_ill == NULL); 18888 sl = ilm->ilm_filter; 18889 if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) 18890 continue; 18891 ips.ipGroupSourceGroup = ilm->ilm_addr; 18892 for (i = 0; i < sl->sl_numsrc; i++) { 18893 if (!IN6_IS_ADDR_V4MAPPED( 18894 &sl->sl_addr[i])) 18895 continue; 18896 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], 18897 ips.ipGroupSourceAddress); 18898 if (snmp_append_data2(mpctl->b_cont, 18899 &mp_tail, (char *)&ips, 18900 (int)sizeof (ips)) == 0) { 18901 ip1dbg(("ip_snmp_get_mib2_" 18902 "ip_group_src: failed to " 18903 "allocate %u bytes\n", 18904 (uint_t)sizeof (ips))); 18905 } 18906 } 18907 } 18908 } 18909 ILM_WALKER_RELE(ill); 18910 } 18911 
rw_exit(&ipst->ips_ill_g_lock); 18912 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18913 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18914 (int)optp->level, (int)optp->name, (int)optp->len)); 18915 qreply(q, mpctl); 18916 return (mp2ctl); 18917 } 18918 18919 /* IPv6 multicast filtered sources. */ 18920 static mblk_t * 18921 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18922 { 18923 struct opthdr *optp; 18924 mblk_t *mp2ctl; 18925 ill_t *ill; 18926 ilm_t *ilm; 18927 ipv6_grpsrc_t ips6; 18928 mblk_t *mp_tail = NULL; 18929 ill_walk_context_t ctx; 18930 zoneid_t zoneid; 18931 int i; 18932 slist_t *sl; 18933 18934 /* 18935 * make a copy of the original message 18936 */ 18937 mp2ctl = copymsg(mpctl); 18938 zoneid = Q_TO_CONN(q)->conn_zoneid; 18939 18940 /* ip6GroupMember table */ 18941 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18942 optp->level = MIB2_IP6; 18943 optp->name = EXPER_IP6_GROUP_SOURCES; 18944 18945 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18946 ill = ILL_START_WALK_V6(&ctx, ipst); 18947 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18948 ILM_WALKER_HOLD(ill); 18949 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; 18950 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18951 ASSERT(ilm->ilm_ipif == NULL); 18952 ASSERT(ilm->ilm_ill != NULL); 18953 sl = ilm->ilm_filter; 18954 if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) 18955 continue; 18956 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; 18957 for (i = 0; i < sl->sl_numsrc; i++) { 18958 ips6.ipv6GroupSourceAddress = sl->sl_addr[i]; 18959 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18960 (char *)&ips6, (int)sizeof (ips6))) { 18961 ip1dbg(("ip_snmp_get_mib2_ip6_" 18962 "group_src: failed to allocate " 18963 "%u bytes\n", 18964 (uint_t)sizeof (ips6))); 18965 } 18966 } 18967 } 18968 ILM_WALKER_RELE(ill); 18969 } 18970 rw_exit(&ipst->ips_ill_g_lock); 18971 18972 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 
18973 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18974 (int)optp->level, (int)optp->name, (int)optp->len)); 18975 qreply(q, mpctl); 18976 return (mp2ctl); 18977 } 18978 18979 /* Multicast routing virtual interface table. */ 18980 static mblk_t * 18981 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18982 { 18983 struct opthdr *optp; 18984 mblk_t *mp2ctl; 18985 18986 /* 18987 * make a copy of the original message 18988 */ 18989 mp2ctl = copymsg(mpctl); 18990 18991 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18992 optp->level = EXPER_DVMRP; 18993 optp->name = EXPER_DVMRP_VIF; 18994 if (!ip_mroute_vif(mpctl->b_cont, ipst)) { 18995 ip0dbg(("ip_mroute_vif: failed\n")); 18996 } 18997 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18998 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n", 18999 (int)optp->level, (int)optp->name, (int)optp->len)); 19000 qreply(q, mpctl); 19001 return (mp2ctl); 19002 } 19003 19004 /* Multicast routing table. */ 19005 static mblk_t * 19006 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19007 { 19008 struct opthdr *optp; 19009 mblk_t *mp2ctl; 19010 19011 /* 19012 * make a copy of the original message 19013 */ 19014 mp2ctl = copymsg(mpctl); 19015 19016 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19017 optp->level = EXPER_DVMRP; 19018 optp->name = EXPER_DVMRP_MRT; 19019 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) { 19020 ip0dbg(("ip_mroute_mrt: failed\n")); 19021 } 19022 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19023 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n", 19024 (int)optp->level, (int)optp->name, (int)optp->len)); 19025 qreply(q, mpctl); 19026 return (mp2ctl); 19027 } 19028 19029 /* 19030 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable 19031 * in one IRE walk. 
19032 */ 19033 static mblk_t * 19034 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19035 { 19036 struct opthdr *optp; 19037 mblk_t *mp2ctl; /* Returned */ 19038 mblk_t *mp3ctl; /* nettomedia */ 19039 mblk_t *mp4ctl; /* routeattrs */ 19040 iproutedata_t ird; 19041 zoneid_t zoneid; 19042 19043 /* 19044 * make copies of the original message 19045 * - mp2ctl is returned unchanged to the caller for his use 19046 * - mpctl is sent upstream as ipRouteEntryTable 19047 * - mp3ctl is sent upstream as ipNetToMediaEntryTable 19048 * - mp4ctl is sent upstream as ipRouteAttributeTable 19049 */ 19050 mp2ctl = copymsg(mpctl); 19051 mp3ctl = copymsg(mpctl); 19052 mp4ctl = copymsg(mpctl); 19053 if (mp3ctl == NULL || mp4ctl == NULL) { 19054 freemsg(mp4ctl); 19055 freemsg(mp3ctl); 19056 freemsg(mp2ctl); 19057 freemsg(mpctl); 19058 return (NULL); 19059 } 19060 19061 bzero(&ird, sizeof (ird)); 19062 19063 ird.ird_route.lp_head = mpctl->b_cont; 19064 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19065 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19066 19067 zoneid = Q_TO_CONN(q)->conn_zoneid; 19068 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); 19069 19070 /* ipRouteEntryTable in mpctl */ 19071 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19072 optp->level = MIB2_IP; 19073 optp->name = MIB2_IP_ROUTE; 19074 optp->len = msgdsize(ird.ird_route.lp_head); 19075 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19076 (int)optp->level, (int)optp->name, (int)optp->len)); 19077 qreply(q, mpctl); 19078 19079 /* ipNetToMediaEntryTable in mp3ctl */ 19080 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19081 optp->level = MIB2_IP; 19082 optp->name = MIB2_IP_MEDIA; 19083 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19084 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19085 (int)optp->level, (int)optp->name, (int)optp->len)); 19086 qreply(q, mp3ctl); 19087 19088 /* 
ipRouteAttributeTable in mp4ctl */ 19089 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19090 optp->level = MIB2_IP; 19091 optp->name = EXPER_IP_RTATTR; 19092 optp->len = msgdsize(ird.ird_attrs.lp_head); 19093 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 19094 (int)optp->level, (int)optp->name, (int)optp->len)); 19095 if (optp->len == 0) 19096 freemsg(mp4ctl); 19097 else 19098 qreply(q, mp4ctl); 19099 19100 return (mp2ctl); 19101 } 19102 19103 /* 19104 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and 19105 * ipv6NetToMediaEntryTable in an NDP walk. 19106 */ 19107 static mblk_t * 19108 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19109 { 19110 struct opthdr *optp; 19111 mblk_t *mp2ctl; /* Returned */ 19112 mblk_t *mp3ctl; /* nettomedia */ 19113 mblk_t *mp4ctl; /* routeattrs */ 19114 iproutedata_t ird; 19115 zoneid_t zoneid; 19116 19117 /* 19118 * make copies of the original message 19119 * - mp2ctl is returned unchanged to the caller for his use 19120 * - mpctl is sent upstream as ipv6RouteEntryTable 19121 * - mp3ctl is sent upstream as ipv6NetToMediaEntryTable 19122 * - mp4ctl is sent upstream as ipv6RouteAttributeTable 19123 */ 19124 mp2ctl = copymsg(mpctl); 19125 mp3ctl = copymsg(mpctl); 19126 mp4ctl = copymsg(mpctl); 19127 if (mp3ctl == NULL || mp4ctl == NULL) { 19128 freemsg(mp4ctl); 19129 freemsg(mp3ctl); 19130 freemsg(mp2ctl); 19131 freemsg(mpctl); 19132 return (NULL); 19133 } 19134 19135 bzero(&ird, sizeof (ird)); 19136 19137 ird.ird_route.lp_head = mpctl->b_cont; 19138 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 19139 ird.ird_attrs.lp_head = mp4ctl->b_cont; 19140 19141 zoneid = Q_TO_CONN(q)->conn_zoneid; 19142 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); 19143 19144 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19145 optp->level = MIB2_IP6; 19146 optp->name = MIB2_IP6_ROUTE; 19147 optp->len = 
msgdsize(ird.ird_route.lp_head); 19148 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19149 (int)optp->level, (int)optp->name, (int)optp->len)); 19150 qreply(q, mpctl); 19151 19152 /* ipv6NetToMediaEntryTable in mp3ctl */ 19153 ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); 19154 19155 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19156 optp->level = MIB2_IP6; 19157 optp->name = MIB2_IP6_MEDIA; 19158 optp->len = msgdsize(ird.ird_netmedia.lp_head); 19159 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19160 (int)optp->level, (int)optp->name, (int)optp->len)); 19161 qreply(q, mp3ctl); 19162 19163 /* ipv6RouteAttributeTable in mp4ctl */ 19164 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19165 optp->level = MIB2_IP6; 19166 optp->name = EXPER_IP_RTATTR; 19167 optp->len = msgdsize(ird.ird_attrs.lp_head); 19168 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 19169 (int)optp->level, (int)optp->name, (int)optp->len)); 19170 if (optp->len == 0) 19171 freemsg(mp4ctl); 19172 else 19173 qreply(q, mp4ctl); 19174 19175 return (mp2ctl); 19176 } 19177 19178 /* 19179 * IPv6 mib: One per ill 19180 */ 19181 static mblk_t * 19182 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19183 { 19184 struct opthdr *optp; 19185 mblk_t *mp2ctl; 19186 ill_t *ill; 19187 ill_walk_context_t ctx; 19188 mblk_t *mp_tail = NULL; 19189 19190 /* 19191 * Make a copy of the original message 19192 */ 19193 mp2ctl = copymsg(mpctl); 19194 19195 /* fixed length IPv6 structure ... 
*/ 19196 19197 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19198 optp->level = MIB2_IP6; 19199 optp->name = 0; 19200 /* Include "unknown interface" ip6_mib */ 19201 ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 19202 ipst->ips_ip6_mib.ipIfStatsIfIndex = 19203 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 19204 SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding, 19205 ipst->ips_ipv6_forward ? 1 : 2); 19206 SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit, 19207 ipst->ips_ipv6_def_hops); 19208 SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize, 19209 sizeof (mib2_ipIfStatsEntry_t)); 19210 SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize, 19211 sizeof (mib2_ipv6AddrEntry_t)); 19212 SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize, 19213 sizeof (mib2_ipv6RouteEntry_t)); 19214 SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize, 19215 sizeof (mib2_ipv6NetToMediaEntry_t)); 19216 SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize, 19217 sizeof (ipv6_member_t)); 19218 SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize, 19219 sizeof (ipv6_grpsrc_t)); 19220 19221 /* 19222 * Synchronize 64- and 32-bit counters 19223 */ 19224 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives, 19225 ipIfStatsHCInReceives); 19226 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers, 19227 ipIfStatsHCInDelivers); 19228 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests, 19229 ipIfStatsHCOutRequests); 19230 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams, 19231 ipIfStatsHCOutForwDatagrams); 19232 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts, 19233 ipIfStatsHCOutMcastPkts); 19234 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts, 19235 ipIfStatsHCInMcastPkts); 19236 19237 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19238 (char *)&ipst->ips_ip6_mib, (int)sizeof (ipst->ips_ip6_mib))) { 19239 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n", 19240 (uint_t)sizeof (ipst->ips_ip6_mib))); 19241 } 19242 19243 
rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19244 ill = ILL_START_WALK_V6(&ctx, ipst); 19245 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19246 ill->ill_ip_mib->ipIfStatsIfIndex = 19247 ill->ill_phyint->phyint_ifindex; 19248 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 19249 ipst->ips_ipv6_forward ? 1 : 2); 19250 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit, 19251 ill->ill_max_hops); 19252 19253 /* 19254 * Synchronize 64- and 32-bit counters 19255 */ 19256 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives, 19257 ipIfStatsHCInReceives); 19258 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers, 19259 ipIfStatsHCInDelivers); 19260 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests, 19261 ipIfStatsHCOutRequests); 19262 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams, 19263 ipIfStatsHCOutForwDatagrams); 19264 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts, 19265 ipIfStatsHCOutMcastPkts); 19266 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts, 19267 ipIfStatsHCInMcastPkts); 19268 19269 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19270 (char *)ill->ill_ip_mib, 19271 (int)sizeof (*ill->ill_ip_mib))) { 19272 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate " 19273 "%u bytes\n", (uint_t)sizeof (*ill->ill_ip_mib))); 19274 } 19275 } 19276 rw_exit(&ipst->ips_ill_g_lock); 19277 19278 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19279 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n", 19280 (int)optp->level, (int)optp->name, (int)optp->len)); 19281 qreply(q, mpctl); 19282 return (mp2ctl); 19283 } 19284 19285 /* 19286 * ICMPv6 mib: One per ill 19287 */ 19288 static mblk_t * 19289 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 19290 { 19291 struct opthdr *optp; 19292 mblk_t *mp2ctl; 19293 ill_t *ill; 19294 ill_walk_context_t ctx; 19295 mblk_t *mp_tail = NULL; 19296 /* 19297 * Make a copy of the original message 19298 */ 19299 mp2ctl = copymsg(mpctl); 19300 19301 /* fixed length ICMPv6 structure ... 
*/ 19302 19303 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 19304 optp->level = MIB2_ICMP6; 19305 optp->name = 0; 19306 /* Include "unknown interface" icmp6_mib */ 19307 ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex = 19308 MIB2_UNKNOWN_INTERFACE; /* netstat flag */ 19309 ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize = 19310 sizeof (mib2_ipv6IfIcmpEntry_t); 19311 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19312 (char *)&ipst->ips_icmp6_mib, 19313 (int)sizeof (ipst->ips_icmp6_mib))) { 19314 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n", 19315 (uint_t)sizeof (ipst->ips_icmp6_mib))); 19316 } 19317 19318 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 19319 ill = ILL_START_WALK_V6(&ctx, ipst); 19320 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19321 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 19322 ill->ill_phyint->phyint_ifindex; 19323 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 19324 (char *)ill->ill_icmp6_mib, 19325 (int)sizeof (*ill->ill_icmp6_mib))) { 19326 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate " 19327 "%u bytes\n", 19328 (uint_t)sizeof (*ill->ill_icmp6_mib))); 19329 } 19330 } 19331 rw_exit(&ipst->ips_ill_g_lock); 19332 19333 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 19334 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n", 19335 (int)optp->level, (int)optp->name, (int)optp->len)); 19336 qreply(q, mpctl); 19337 return (mp2ctl); 19338 } 19339 19340 /* 19341 * ire_walk routine to create both ipRouteEntryTable and 19342 * ipRouteAttributeTable in one IRE walk 19343 */ 19344 static void 19345 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) 19346 { 19347 ill_t *ill; 19348 ipif_t *ipif; 19349 mib2_ipRouteEntry_t *re; 19350 mib2_ipAttributeEntry_t *iae, *iaeptr; 19351 ipaddr_t gw_addr; 19352 tsol_ire_gw_secattr_t *attrp; 19353 tsol_gc_t *gc = NULL; 19354 tsol_gcgrp_t *gcgrp = NULL; 19355 uint_t sacnt = 0; 19356 int i; 19357 19358 ASSERT(ire->ire_ipversion == IPV4_VERSION); 19359 19360 if ((re = 
kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19361 return; 19362 19363 if ((attrp = ire->ire_gw_secattr) != NULL) { 19364 mutex_enter(&attrp->igsa_lock); 19365 if ((gc = attrp->igsa_gc) != NULL) { 19366 gcgrp = gc->gc_grp; 19367 ASSERT(gcgrp != NULL); 19368 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19369 sacnt = 1; 19370 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19371 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19372 gc = gcgrp->gcgrp_head; 19373 sacnt = gcgrp->gcgrp_count; 19374 } 19375 mutex_exit(&attrp->igsa_lock); 19376 19377 /* do nothing if there's no gc to report */ 19378 if (gc == NULL) { 19379 ASSERT(sacnt == 0); 19380 if (gcgrp != NULL) { 19381 /* we might as well drop the lock now */ 19382 rw_exit(&gcgrp->gcgrp_rwlock); 19383 gcgrp = NULL; 19384 } 19385 attrp = NULL; 19386 } 19387 19388 ASSERT(gc == NULL || (gcgrp != NULL && 19389 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19390 } 19391 ASSERT(sacnt == 0 || gc != NULL); 19392 19393 if (sacnt != 0 && 19394 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19395 kmem_free(re, sizeof (*re)); 19396 rw_exit(&gcgrp->gcgrp_rwlock); 19397 return; 19398 } 19399 19400 /* 19401 * Return all IRE types for route table... let caller pick and choose 19402 */ 19403 re->ipRouteDest = ire->ire_addr; 19404 ipif = ire->ire_ipif; 19405 re->ipRouteIfIndex.o_length = 0; 19406 if (ire->ire_type == IRE_CACHE) { 19407 ill = (ill_t *)ire->ire_stq->q_ptr; 19408 re->ipRouteIfIndex.o_length = 19409 ill->ill_name_length == 0 ? 
0 : 19410 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19411 bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes, 19412 re->ipRouteIfIndex.o_length); 19413 } else if (ipif != NULL) { 19414 ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); 19415 re->ipRouteIfIndex.o_length = 19416 mi_strlen(re->ipRouteIfIndex.o_bytes); 19417 } 19418 re->ipRouteMetric1 = -1; 19419 re->ipRouteMetric2 = -1; 19420 re->ipRouteMetric3 = -1; 19421 re->ipRouteMetric4 = -1; 19422 19423 gw_addr = ire->ire_gateway_addr; 19424 19425 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) 19426 re->ipRouteNextHop = ire->ire_src_addr; 19427 else 19428 re->ipRouteNextHop = gw_addr; 19429 /* indirect(4), direct(3), or invalid(2) */ 19430 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19431 re->ipRouteType = 2; 19432 else 19433 re->ipRouteType = (gw_addr != 0) ? 4 : 3; 19434 re->ipRouteProto = -1; 19435 re->ipRouteAge = gethrestime_sec() - ire->ire_create_time; 19436 re->ipRouteMask = ire->ire_mask; 19437 re->ipRouteMetric5 = -1; 19438 re->ipRouteInfo.re_max_frag = ire->ire_max_frag; 19439 re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag; 19440 re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19441 re->ipRouteInfo.re_ref = ire->ire_refcnt; 19442 re->ipRouteInfo.re_src_addr = ire->ire_src_addr; 19443 re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19444 re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19445 re->ipRouteInfo.re_flags = ire->ire_flags; 19446 19447 if (ire->ire_flags & RTF_DYNAMIC) { 19448 re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19449 } else { 19450 re->ipRouteInfo.re_ire_type = ire->ire_type; 19451 } 19452 19453 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19454 (char *)re, (int)sizeof (*re))) { 19455 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 19456 (uint_t)sizeof (*re))); 19457 } 19458 19459 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19460 iaeptr->iae_routeidx = ird->ird_idx; 19461 
iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19462 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19463 } 19464 19465 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19466 (char *)iae, sacnt * sizeof (*iae))) { 19467 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 19468 (unsigned)(sacnt * sizeof (*iae)))); 19469 } 19470 19471 /* bump route index for next pass */ 19472 ird->ird_idx++; 19473 19474 kmem_free(re, sizeof (*re)); 19475 if (sacnt != 0) 19476 kmem_free(iae, sacnt * sizeof (*iae)); 19477 19478 if (gcgrp != NULL) 19479 rw_exit(&gcgrp->gcgrp_rwlock); 19480 } 19481 19482 /* 19483 * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable. 19484 */ 19485 static void 19486 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) 19487 { 19488 ill_t *ill; 19489 ipif_t *ipif; 19490 mib2_ipv6RouteEntry_t *re; 19491 mib2_ipAttributeEntry_t *iae, *iaeptr; 19492 in6_addr_t gw_addr_v6; 19493 tsol_ire_gw_secattr_t *attrp; 19494 tsol_gc_t *gc = NULL; 19495 tsol_gcgrp_t *gcgrp = NULL; 19496 uint_t sacnt = 0; 19497 int i; 19498 19499 ASSERT(ire->ire_ipversion == IPV6_VERSION); 19500 19501 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19502 return; 19503 19504 if ((attrp = ire->ire_gw_secattr) != NULL) { 19505 mutex_enter(&attrp->igsa_lock); 19506 if ((gc = attrp->igsa_gc) != NULL) { 19507 gcgrp = gc->gc_grp; 19508 ASSERT(gcgrp != NULL); 19509 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19510 sacnt = 1; 19511 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19512 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19513 gc = gcgrp->gcgrp_head; 19514 sacnt = gcgrp->gcgrp_count; 19515 } 19516 mutex_exit(&attrp->igsa_lock); 19517 19518 /* do nothing if there's no gc to report */ 19519 if (gc == NULL) { 19520 ASSERT(sacnt == 0); 19521 if (gcgrp != NULL) { 19522 /* we might as well drop the lock now */ 19523 rw_exit(&gcgrp->gcgrp_rwlock); 19524 gcgrp = NULL; 19525 } 19526 attrp = NULL; 19527 } 19528 19529 ASSERT(gc == NULL || (gcgrp != 
NULL && 19530 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19531 } 19532 ASSERT(sacnt == 0 || gc != NULL); 19533 19534 if (sacnt != 0 && 19535 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19536 kmem_free(re, sizeof (*re)); 19537 rw_exit(&gcgrp->gcgrp_rwlock); 19538 return; 19539 } 19540 19541 /* 19542 * Return all IRE types for route table... let caller pick and choose 19543 */ 19544 re->ipv6RouteDest = ire->ire_addr_v6; 19545 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); 19546 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ 19547 re->ipv6RouteIfIndex.o_length = 0; 19548 ipif = ire->ire_ipif; 19549 if (ire->ire_type == IRE_CACHE) { 19550 ill = (ill_t *)ire->ire_stq->q_ptr; 19551 re->ipv6RouteIfIndex.o_length = 19552 ill->ill_name_length == 0 ? 0 : 19553 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19554 bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, 19555 re->ipv6RouteIfIndex.o_length); 19556 } else if (ipif != NULL) { 19557 ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); 19558 re->ipv6RouteIfIndex.o_length = 19559 mi_strlen(re->ipv6RouteIfIndex.o_bytes); 19560 } 19561 19562 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19563 19564 mutex_enter(&ire->ire_lock); 19565 gw_addr_v6 = ire->ire_gateway_addr_v6; 19566 mutex_exit(&ire->ire_lock); 19567 19568 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) 19569 re->ipv6RouteNextHop = ire->ire_src_addr_v6; 19570 else 19571 re->ipv6RouteNextHop = gw_addr_v6; 19572 19573 /* remote(4), local(3), or discard(2) */ 19574 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19575 re->ipv6RouteType = 2; 19576 else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) 19577 re->ipv6RouteType = 3; 19578 else 19579 re->ipv6RouteType = 4; 19580 19581 re->ipv6RouteProtocol = -1; 19582 re->ipv6RoutePolicy = 0; 19583 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time; 19584 re->ipv6RouteNextHopRDI = 0; 19585 re->ipv6RouteWeight = 0; 19586 re->ipv6RouteMetric = 0; 19587 
re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; 19588 re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; 19589 re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19590 re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; 19591 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19592 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19593 re->ipv6RouteInfo.re_ref = ire->ire_refcnt; 19594 re->ipv6RouteInfo.re_flags = ire->ire_flags; 19595 19596 if (ire->ire_flags & RTF_DYNAMIC) { 19597 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19598 } else { 19599 re->ipv6RouteInfo.re_ire_type = ire->ire_type; 19600 } 19601 19602 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19603 (char *)re, (int)sizeof (*re))) { 19604 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19605 (uint_t)sizeof (*re))); 19606 } 19607 19608 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19609 iaeptr->iae_routeidx = ird->ird_idx; 19610 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19611 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19612 } 19613 19614 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19615 (char *)iae, sacnt * sizeof (*iae))) { 19616 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19617 (unsigned)(sacnt * sizeof (*iae)))); 19618 } 19619 19620 /* bump route index for next pass */ 19621 ird->ird_idx++; 19622 19623 kmem_free(re, sizeof (*re)); 19624 if (sacnt != 0) 19625 kmem_free(iae, sacnt * sizeof (*iae)); 19626 19627 if (gcgrp != NULL) 19628 rw_exit(&gcgrp->gcgrp_rwlock); 19629 } 19630 19631 /* 19632 * ndp_walk routine to create ipv6NetToMediaEntryTable 19633 */ 19634 static int 19635 ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) 19636 { 19637 ill_t *ill; 19638 mib2_ipv6NetToMediaEntry_t ntme; 19639 dl_unitdata_req_t *dl; 19640 19641 ill = nce->nce_ill; 19642 if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ 19643 return (0); 19644 19645 /* 19646 * Neighbor cache entry 
attached to IRE with on-link 19647 * destination. 19648 */ 19649 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; 19650 ntme.ipv6NetToMediaNetAddress = nce->nce_addr; 19651 if ((ill->ill_flags & ILLF_XRESOLV) && 19652 (nce->nce_res_mp != NULL)) { 19653 dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); 19654 ntme.ipv6NetToMediaPhysAddress.o_length = 19655 dl->dl_dest_addr_length; 19656 } else { 19657 ntme.ipv6NetToMediaPhysAddress.o_length = 19658 ill->ill_phys_addr_length; 19659 } 19660 if (nce->nce_res_mp != NULL) { 19661 bcopy((char *)nce->nce_res_mp->b_rptr + 19662 NCE_LL_ADDR_OFFSET(ill), 19663 ntme.ipv6NetToMediaPhysAddress.o_bytes, 19664 ntme.ipv6NetToMediaPhysAddress.o_length); 19665 } else { 19666 bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, 19667 ill->ill_phys_addr_length); 19668 } 19669 /* 19670 * Note: Returns ND_* states. Should be: 19671 * reachable(1), stale(2), delay(3), probe(4), 19672 * invalid(5), unknown(6) 19673 */ 19674 ntme.ipv6NetToMediaState = nce->nce_state; 19675 ntme.ipv6NetToMediaLastUpdated = 0; 19676 19677 /* other(1), dynamic(2), static(3), local(4) */ 19678 if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { 19679 ntme.ipv6NetToMediaType = 4; 19680 } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { 19681 ntme.ipv6NetToMediaType = 1; 19682 } else { 19683 ntme.ipv6NetToMediaType = 2; 19684 } 19685 19686 if (!snmp_append_data2(ird->ird_netmedia.lp_head, 19687 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { 19688 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n", 19689 (uint_t)sizeof (ntme))); 19690 } 19691 return (0); 19692 } 19693 19694 /* 19695 * return (0) if invalid set request, 1 otherwise, including non-tcp requests 19696 */ 19697 /* ARGSUSED */ 19698 int 19699 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 19700 { 19701 switch (level) { 19702 case MIB2_IP: 19703 case MIB2_ICMP: 19704 switch (name) { 19705 default: 19706 break; 19707 } 19708 return (1); 19709 default: 19710 
return (1); 19711 } 19712 } 19713 19714 /* 19715 * When there exists both a 64- and 32-bit counter of a particular type 19716 * (i.e., InReceives), only the 64-bit counters are added. 19717 */ 19718 void 19719 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2) 19720 { 19721 UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors); 19722 UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors); 19723 UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes); 19724 UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors); 19725 UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos); 19726 UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts); 19727 UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards); 19728 UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards); 19729 UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs); 19730 UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails); 19731 UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates); 19732 UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds); 19733 UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs); 19734 UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails); 19735 UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes); 19736 UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates); 19737 UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups); 19738 UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits); 19739 UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs); 19740 UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows); 19741 UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows); 19742 UPDATE_MIB(o1, ipIfStatsInWrongIPVersion, 19743 o2->ipIfStatsInWrongIPVersion); 19744 UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion, 19745 o2->ipIfStatsInWrongIPVersion); 19746 UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion, 19747 
o2->ipIfStatsOutSwitchIPVersion); 19748 UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives); 19749 UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets); 19750 UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams, 19751 o2->ipIfStatsHCInForwDatagrams); 19752 UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers); 19753 UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests); 19754 UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams, 19755 o2->ipIfStatsHCOutForwDatagrams); 19756 UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds); 19757 UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits); 19758 UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets); 19759 UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts); 19760 UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets); 19761 UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts); 19762 UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets, 19763 o2->ipIfStatsHCOutMcastOctets); 19764 UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts); 19765 UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts); 19766 UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded); 19767 UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed); 19768 UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs); 19769 UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs); 19770 UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts); 19771 } 19772 19773 void 19774 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2) 19775 { 19776 UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs); 19777 UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors); 19778 UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs); 19779 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs); 19780 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds); 19781 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, 
o2->ipv6IfIcmpInParmProblems); 19782 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs); 19783 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos); 19784 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies); 19785 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits, 19786 o2->ipv6IfIcmpInRouterSolicits); 19787 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements, 19788 o2->ipv6IfIcmpInRouterAdvertisements); 19789 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits, 19790 o2->ipv6IfIcmpInNeighborSolicits); 19791 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements, 19792 o2->ipv6IfIcmpInNeighborAdvertisements); 19793 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects); 19794 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries, 19795 o2->ipv6IfIcmpInGroupMembQueries); 19796 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses, 19797 o2->ipv6IfIcmpInGroupMembResponses); 19798 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions, 19799 o2->ipv6IfIcmpInGroupMembReductions); 19800 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs); 19801 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors); 19802 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs, 19803 o2->ipv6IfIcmpOutDestUnreachs); 19804 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs, 19805 o2->ipv6IfIcmpOutAdminProhibs); 19806 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds); 19807 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems, 19808 o2->ipv6IfIcmpOutParmProblems); 19809 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs); 19810 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos); 19811 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies); 19812 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits, 19813 o2->ipv6IfIcmpOutRouterSolicits); 19814 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements, 19815 o2->ipv6IfIcmpOutRouterAdvertisements); 19816 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits, 19817 o2->ipv6IfIcmpOutNeighborSolicits); 19818 UPDATE_MIB(o1, 
ipv6IfIcmpOutNeighborAdvertisements, 19819 o2->ipv6IfIcmpOutNeighborAdvertisements); 19820 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects); 19821 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries, 19822 o2->ipv6IfIcmpOutGroupMembQueries); 19823 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses, 19824 o2->ipv6IfIcmpOutGroupMembResponses); 19825 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions, 19826 o2->ipv6IfIcmpOutGroupMembReductions); 19827 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows); 19828 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit); 19829 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements, 19830 o2->ipv6IfIcmpInBadNeighborAdvertisements); 19831 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations, 19832 o2->ipv6IfIcmpInBadNeighborSolicitations); 19833 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects); 19834 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal, 19835 o2->ipv6IfIcmpInGroupMembTotal); 19836 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries, 19837 o2->ipv6IfIcmpInGroupMembBadQueries); 19838 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports, 19839 o2->ipv6IfIcmpInGroupMembBadReports); 19840 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports, 19841 o2->ipv6IfIcmpInGroupMembOurReports); 19842 } 19843 19844 /* 19845 * Called before the options are updated to check if this packet will 19846 * be source routed from here. 19847 * This routine assumes that the options are well formed i.e. that they 19848 * have already been checked. 
19849 */ 19850 static boolean_t 19851 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) 19852 { 19853 ipoptp_t opts; 19854 uchar_t *opt; 19855 uint8_t optval; 19856 uint8_t optlen; 19857 ipaddr_t dst; 19858 ire_t *ire; 19859 19860 if (IS_SIMPLE_IPH(ipha)) { 19861 ip2dbg(("not source routed\n")); 19862 return (B_FALSE); 19863 } 19864 dst = ipha->ipha_dst; 19865 for (optval = ipoptp_first(&opts, ipha); 19866 optval != IPOPT_EOL; 19867 optval = ipoptp_next(&opts)) { 19868 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 19869 opt = opts.ipoptp_cur; 19870 optlen = opts.ipoptp_len; 19871 ip2dbg(("ip_source_routed: opt %d, len %d\n", 19872 optval, optlen)); 19873 switch (optval) { 19874 uint32_t off; 19875 case IPOPT_SSRR: 19876 case IPOPT_LSRR: 19877 /* 19878 * If dst is one of our addresses and there are some 19879 * entries left in the source route return (true). 19880 */ 19881 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 19882 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19883 if (ire == NULL) { 19884 ip2dbg(("ip_source_routed: not next" 19885 " source route 0x%x\n", 19886 ntohl(dst))); 19887 return (B_FALSE); 19888 } 19889 ire_refrele(ire); 19890 off = opt[IPOPT_OFFSET]; 19891 off--; 19892 if (optlen < IP_ADDR_LEN || 19893 off > optlen - IP_ADDR_LEN) { 19894 /* End of source route */ 19895 ip1dbg(("ip_source_routed: end of SR\n")); 19896 return (B_FALSE); 19897 } 19898 return (B_TRUE); 19899 } 19900 } 19901 ip2dbg(("not source routed\n")); 19902 return (B_FALSE); 19903 } 19904 19905 /* 19906 * Check if the packet contains any source route. 
19907 */ 19908 static boolean_t 19909 ip_source_route_included(ipha_t *ipha) 19910 { 19911 ipoptp_t opts; 19912 uint8_t optval; 19913 19914 if (IS_SIMPLE_IPH(ipha)) 19915 return (B_FALSE); 19916 for (optval = ipoptp_first(&opts, ipha); 19917 optval != IPOPT_EOL; 19918 optval = ipoptp_next(&opts)) { 19919 switch (optval) { 19920 case IPOPT_SSRR: 19921 case IPOPT_LSRR: 19922 return (B_TRUE); 19923 } 19924 } 19925 return (B_FALSE); 19926 } 19927 19928 /* 19929 * Called when the IRE expiration timer fires. 19930 */ 19931 void 19932 ip_trash_timer_expire(void *args) 19933 { 19934 int flush_flag = 0; 19935 ire_expire_arg_t iea; 19936 ip_stack_t *ipst = (ip_stack_t *)args; 19937 19938 iea.iea_ipst = ipst; /* No netstack_hold */ 19939 19940 /* 19941 * ip_ire_expire_id is protected by ip_trash_timer_lock. 19942 * This lock makes sure that a new invocation of this function 19943 * that occurs due to an almost immediate timer firing will not 19944 * progress beyond this point until the current invocation is done 19945 */ 19946 mutex_enter(&ipst->ips_ip_trash_timer_lock); 19947 ipst->ips_ip_ire_expire_id = 0; 19948 mutex_exit(&ipst->ips_ip_trash_timer_lock); 19949 19950 /* Periodic timer */ 19951 if (ipst->ips_ip_ire_arp_time_elapsed >= 19952 ipst->ips_ip_ire_arp_interval) { 19953 /* 19954 * Remove all IRE_CACHE entries since they might 19955 * contain arp information. 
19956 */ 19957 flush_flag |= FLUSH_ARP_TIME; 19958 ipst->ips_ip_ire_arp_time_elapsed = 0; 19959 IP_STAT(ipst, ip_ire_arp_timer_expired); 19960 } 19961 if (ipst->ips_ip_ire_rd_time_elapsed >= 19962 ipst->ips_ip_ire_redir_interval) { 19963 /* Remove all redirects */ 19964 flush_flag |= FLUSH_REDIRECT_TIME; 19965 ipst->ips_ip_ire_rd_time_elapsed = 0; 19966 IP_STAT(ipst, ip_ire_redirect_timer_expired); 19967 } 19968 if (ipst->ips_ip_ire_pmtu_time_elapsed >= 19969 ipst->ips_ip_ire_pathmtu_interval) { 19970 /* Increase path mtu */ 19971 flush_flag |= FLUSH_MTU_TIME; 19972 ipst->ips_ip_ire_pmtu_time_elapsed = 0; 19973 IP_STAT(ipst, ip_ire_pmtu_timer_expired); 19974 } 19975 19976 /* 19977 * Optimize for the case when there are no redirects in the 19978 * ftable, that is, no need to walk the ftable in that case. 19979 */ 19980 if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { 19981 iea.iea_flush_flag = flush_flag; 19982 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, 19983 (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, 19984 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 19985 NULL, ALL_ZONES, ipst); 19986 } 19987 if ((flush_flag & FLUSH_REDIRECT_TIME) && 19988 ipst->ips_ip_redirect_cnt > 0) { 19989 iea.iea_flush_flag = flush_flag; 19990 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, 19991 ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 19992 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); 19993 } 19994 if (flush_flag & FLUSH_MTU_TIME) { 19995 /* 19996 * Walk all IPv6 IRE's and update them 19997 * Note that ARP and redirect timers are not 19998 * needed since NUD handles stale entries. 
19999 */ 20000 flush_flag = FLUSH_MTU_TIME; 20001 iea.iea_flush_flag = flush_flag; 20002 ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, 20003 ALL_ZONES, ipst); 20004 } 20005 20006 ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; 20007 ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; 20008 ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; 20009 20010 /* 20011 * Hold the lock to serialize timeout calls and prevent 20012 * stale values in ip_ire_expire_id. Otherwise it is possible 20013 * for the timer to fire and a new invocation of this function 20014 * to start before the return value of timeout has been stored 20015 * in ip_ire_expire_id by the current invocation. 20016 */ 20017 mutex_enter(&ipst->ips_ip_trash_timer_lock); 20018 ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, 20019 (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 20020 mutex_exit(&ipst->ips_ip_trash_timer_lock); 20021 } 20022 20023 /* 20024 * Called by the memory allocator subsystem directly, when the system 20025 * is running low on memory. 20026 */ 20027 /* ARGSUSED */ 20028 void 20029 ip_trash_ire_reclaim(void *args) 20030 { 20031 netstack_handle_t nh; 20032 netstack_t *ns; 20033 20034 netstack_next_init(&nh); 20035 while ((ns = netstack_next(&nh)) != NULL) { 20036 ip_trash_ire_reclaim_stack(ns->netstack_ip); 20037 netstack_rele(ns); 20038 } 20039 netstack_next_fini(&nh); 20040 } 20041 20042 static void 20043 ip_trash_ire_reclaim_stack(ip_stack_t *ipst) 20044 { 20045 ire_cache_count_t icc; 20046 ire_cache_reclaim_t icr; 20047 ncc_cache_count_t ncc; 20048 nce_cache_reclaim_t ncr; 20049 uint_t delete_cnt; 20050 /* 20051 * Memory reclaim call back. 20052 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. 
20053 * Then, with a target of freeing 1/Nth of IRE_CACHE 20054 * entries, determine what fraction to free for 20055 * each category of IRE_CACHE entries giving absolute priority 20056 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu 20057 * entry will be freed unless all offlink entries are freed). 20058 */ 20059 icc.icc_total = 0; 20060 icc.icc_unused = 0; 20061 icc.icc_offlink = 0; 20062 icc.icc_pmtu = 0; 20063 icc.icc_onlink = 0; 20064 ire_walk(ire_cache_count, (char *)&icc, ipst); 20065 20066 /* 20067 * Free NCEs for IPv6 like the onlink ires. 20068 */ 20069 ncc.ncc_total = 0; 20070 ncc.ncc_host = 0; 20071 ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); 20072 20073 ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + 20074 icc.icc_pmtu + icc.icc_onlink); 20075 delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; 20076 IP_STAT(ipst, ip_trash_ire_reclaim_calls); 20077 if (delete_cnt == 0) 20078 return; 20079 IP_STAT(ipst, ip_trash_ire_reclaim_success); 20080 /* Always delete all unused offlink entries */ 20081 icr.icr_ipst = ipst; 20082 icr.icr_unused = 1; 20083 if (delete_cnt <= icc.icc_unused) { 20084 /* 20085 * Only need to free unused entries. In other words, 20086 * there are enough unused entries to free to meet our 20087 * target number of freed ire cache entries. 20088 */ 20089 icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; 20090 ncr.ncr_host = 0; 20091 } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { 20092 /* 20093 * Only need to free unused entries, plus a fraction of offlink 20094 * entries. It follows from the first if statement that 20095 * icc_offlink is non-zero, and that delete_cnt != icc_unused. 
20096 */ 20097 delete_cnt -= icc.icc_unused; 20098 /* Round up # deleted by truncating fraction */ 20099 icr.icr_offlink = icc.icc_offlink / delete_cnt; 20100 icr.icr_pmtu = icr.icr_onlink = 0; 20101 ncr.ncr_host = 0; 20102 } else if (delete_cnt <= 20103 icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { 20104 /* 20105 * Free all unused and offlink entries, plus a fraction of 20106 * pmtu entries. It follows from the previous if statement 20107 * that icc_pmtu is non-zero, and that 20108 * delete_cnt != icc_unused + icc_offlink. 20109 */ 20110 icr.icr_offlink = 1; 20111 delete_cnt -= icc.icc_unused + icc.icc_offlink; 20112 /* Round up # deleted by truncating fraction */ 20113 icr.icr_pmtu = icc.icc_pmtu / delete_cnt; 20114 icr.icr_onlink = 0; 20115 ncr.ncr_host = 0; 20116 } else { 20117 /* 20118 * Free all unused, offlink, and pmtu entries, plus a fraction 20119 * of onlink entries. If we're here, then we know that 20120 * icc_onlink is non-zero, and that 20121 * delete_cnt != icc_unused + icc_offlink + icc_pmtu. 
20122 */ 20123 icr.icr_offlink = icr.icr_pmtu = 1; 20124 delete_cnt -= icc.icc_unused + icc.icc_offlink + 20125 icc.icc_pmtu; 20126 /* Round up # deleted by truncating fraction */ 20127 icr.icr_onlink = icc.icc_onlink / delete_cnt; 20128 /* Using the same delete fraction as for onlink IREs */ 20129 ncr.ncr_host = ncc.ncc_host / delete_cnt; 20130 } 20131 #ifdef DEBUG 20132 ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " 20133 "fractions %d/%d/%d/%d\n", 20134 icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, 20135 icc.icc_unused, icc.icc_offlink, 20136 icc.icc_pmtu, icc.icc_onlink, 20137 icr.icr_unused, icr.icr_offlink, 20138 icr.icr_pmtu, icr.icr_onlink)); 20139 #endif 20140 ire_walk(ire_cache_reclaim, (char *)&icr, ipst); 20141 if (ncr.ncr_host != 0) 20142 ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, 20143 (uchar_t *)&ncr, ipst); 20144 #ifdef DEBUG 20145 icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; 20146 icc.icc_pmtu = 0; icc.icc_onlink = 0; 20147 ire_walk(ire_cache_count, (char *)&icc, ipst); 20148 ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", 20149 icc.icc_total, icc.icc_unused, icc.icc_offlink, 20150 icc.icc_pmtu, icc.icc_onlink)); 20151 #endif 20152 } 20153 20154 /* 20155 * ip_unbind is called when a copy of an unbind request is received from the 20156 * upper level protocol. We remove this conn from any fanout hash list it is 20157 * on, and zero out the bind information. No reply is expected up above. 
20158 */ 20159 mblk_t * 20160 ip_unbind(queue_t *q, mblk_t *mp) 20161 { 20162 conn_t *connp = Q_TO_CONN(q); 20163 20164 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 20165 20166 if (is_system_labeled() && connp->conn_anon_port) { 20167 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 20168 connp->conn_mlp_type, connp->conn_ulp, 20169 ntohs(connp->conn_lport), B_FALSE); 20170 connp->conn_anon_port = 0; 20171 } 20172 connp->conn_mlp_type = mlptSingle; 20173 20174 ipcl_hash_remove(connp); 20175 20176 ASSERT(mp->b_cont == NULL); 20177 /* 20178 * Convert mp into a T_OK_ACK 20179 */ 20180 mp = mi_tpi_ok_ack_alloc(mp); 20181 20182 /* 20183 * should not happen in practice... T_OK_ACK is smaller than the 20184 * original message. 20185 */ 20186 if (mp == NULL) 20187 return (NULL); 20188 20189 return (mp); 20190 } 20191 20192 /* 20193 * Write side put procedure. Outbound data, IOCTLs, responses from 20194 * resolvers, etc, come down through here. 20195 * 20196 * arg2 is always a queue_t *. 20197 * When that queue is an ill_t (i.e. q_next != NULL), then arg must be 20198 * the zoneid. 20199 * When that queue is not an ill_t, then arg must be a conn_t pointer. 20200 */ 20201 void 20202 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 20203 { 20204 ip_output_options(arg, mp, arg2, caller, &zero_info); 20205 } 20206 20207 void 20208 ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, 20209 ip_opt_info_t *infop) 20210 { 20211 conn_t *connp = NULL; 20212 queue_t *q = (queue_t *)arg2; 20213 ipha_t *ipha; 20214 #define rptr ((uchar_t *)ipha) 20215 ire_t *ire = NULL; 20216 ire_t *sctp_ire = NULL; 20217 uint32_t v_hlen_tos_len; 20218 ipaddr_t dst; 20219 mblk_t *first_mp = NULL; 20220 boolean_t mctl_present; 20221 ipsec_out_t *io; 20222 int match_flags; 20223 ill_t *attach_ill = NULL; 20224 /* Bind to IPIF_NOFAILOVER ill etc. */ 20225 ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. 
*/ 20226 ipif_t *dst_ipif; 20227 boolean_t multirt_need_resolve = B_FALSE; 20228 mblk_t *copy_mp = NULL; 20229 int err; 20230 zoneid_t zoneid; 20231 boolean_t need_decref = B_FALSE; 20232 boolean_t ignore_dontroute = B_FALSE; 20233 boolean_t ignore_nexthop = B_FALSE; 20234 boolean_t ip_nexthop = B_FALSE; 20235 ipaddr_t nexthop_addr; 20236 ip_stack_t *ipst; 20237 20238 #ifdef _BIG_ENDIAN 20239 #define V_HLEN (v_hlen_tos_len >> 24) 20240 #else 20241 #define V_HLEN (v_hlen_tos_len & 0xFF) 20242 #endif 20243 20244 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 20245 "ip_wput_start: q %p", q); 20246 20247 /* 20248 * ip_wput fast path 20249 */ 20250 20251 /* is packet from ARP ? */ 20252 if (q->q_next != NULL) { 20253 zoneid = (zoneid_t)(uintptr_t)arg; 20254 goto qnext; 20255 } 20256 20257 connp = (conn_t *)arg; 20258 ASSERT(connp != NULL); 20259 zoneid = connp->conn_zoneid; 20260 ipst = connp->conn_netstack->netstack_ip; 20261 20262 /* is queue flow controlled? */ 20263 if ((q->q_first != NULL || connp->conn_draining) && 20264 (caller == IP_WPUT)) { 20265 ASSERT(!need_decref); 20266 (void) putq(q, mp); 20267 return; 20268 } 20269 20270 /* Multidata transmit? */ 20271 if (DB_TYPE(mp) == M_MULTIDATA) { 20272 /* 20273 * We should never get here, since all Multidata messages 20274 * originating from tcp should have been directed over to 20275 * tcp_multisend() in the first place. 
20276 */ 20277 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20278 freemsg(mp); 20279 return; 20280 } else if (DB_TYPE(mp) != M_DATA) 20281 goto notdata; 20282 20283 if (mp->b_flag & MSGHASREF) { 20284 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20285 mp->b_flag &= ~MSGHASREF; 20286 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 20287 need_decref = B_TRUE; 20288 } 20289 ipha = (ipha_t *)mp->b_rptr; 20290 20291 /* is IP header non-aligned or mblk smaller than basic IP header */ 20292 #ifndef SAFETY_BEFORE_SPEED 20293 if (!OK_32PTR(rptr) || 20294 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 20295 goto hdrtoosmall; 20296 #endif 20297 20298 ASSERT(OK_32PTR(ipha)); 20299 20300 /* 20301 * This function assumes that mp points to an IPv4 packet. If it's the 20302 * wrong version, we'll catch it again in ip_output_v6. 20303 * 20304 * Note that this is *only* locally-generated output here, and never 20305 * forwarded data, and that we need to deal only with transports that 20306 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 20307 * label.) 20308 */ 20309 if (is_system_labeled() && 20310 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 20311 !connp->conn_ulp_labeled) { 20312 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20313 connp->conn_mac_exempt, ipst); 20314 ipha = (ipha_t *)mp->b_rptr; 20315 if (err != 0) { 20316 first_mp = mp; 20317 if (err == EINVAL) 20318 goto icmp_parameter_problem; 20319 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 20320 goto discard_pkt; 20321 } 20322 } 20323 20324 ASSERT(infop != NULL); 20325 20326 if (infop->ip_opt_flags & IP_VERIFY_SRC) { 20327 /* 20328 * IP_PKTINFO ancillary option is present. 20329 * IPCL_ZONEID is used to honor IP_ALLZONES option which 20330 * allows using address of any zone as the source address. 
20331 */ 20332 ire = ire_ctable_lookup(ipha->ipha_src, 0, 20333 (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), 20334 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 20335 if (ire == NULL) 20336 goto drop_pkt; 20337 ire_refrele(ire); 20338 ire = NULL; 20339 } 20340 20341 /* 20342 * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index 20343 * passed in IP_PKTINFO. 20344 */ 20345 if (infop->ip_opt_ill_index != 0 && 20346 connp->conn_outgoing_ill == NULL && 20347 connp->conn_nofailover_ill == NULL) { 20348 20349 xmit_ill = ill_lookup_on_ifindex( 20350 infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, 20351 ipst); 20352 20353 if (xmit_ill == NULL || IS_VNI(xmit_ill)) 20354 goto drop_pkt; 20355 /* 20356 * check that there is an ipif belonging 20357 * to our zone. IPCL_ZONEID is not used because 20358 * IP_ALLZONES option is valid only when the ill is 20359 * accessible from all zones i.e has a valid ipif in 20360 * all zones. 20361 */ 20362 if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { 20363 goto drop_pkt; 20364 } 20365 } 20366 20367 /* 20368 * If there is a policy, try to attach an ipsec_out in 20369 * the front. At the end, first_mp either points to a 20370 * M_DATA message or IPSEC_OUT message linked to a 20371 * M_DATA message. We have to do it now as we might 20372 * lose the "conn" if we go through ip_newroute. 
20373 */ 20374 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 20375 if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, 20376 ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { 20377 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20378 if (need_decref) 20379 CONN_DEC_REF(connp); 20380 return; 20381 } else { 20382 ASSERT(mp->b_datap->db_type == M_CTL); 20383 first_mp = mp; 20384 mp = mp->b_cont; 20385 mctl_present = B_TRUE; 20386 } 20387 } else { 20388 first_mp = mp; 20389 mctl_present = B_FALSE; 20390 } 20391 20392 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20393 20394 /* is wrong version or IP options present */ 20395 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 20396 goto version_hdrlen_check; 20397 dst = ipha->ipha_dst; 20398 20399 if (connp->conn_nofailover_ill != NULL) { 20400 attach_ill = conn_get_held_ill(connp, 20401 &connp->conn_nofailover_ill, &err); 20402 if (err == ILL_LOOKUP_FAILED) { 20403 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20404 if (need_decref) 20405 CONN_DEC_REF(connp); 20406 freemsg(first_mp); 20407 return; 20408 } 20409 } 20410 20411 /* If IP_BOUND_IF has been set, use that ill. */ 20412 if (connp->conn_outgoing_ill != NULL) { 20413 xmit_ill = conn_get_held_ill(connp, 20414 &connp->conn_outgoing_ill, &err); 20415 if (err == ILL_LOOKUP_FAILED) 20416 goto drop_pkt; 20417 20418 goto send_from_ill; 20419 } 20420 20421 /* is packet multicast? */ 20422 if (CLASSD(dst)) 20423 goto multicast; 20424 20425 /* 20426 * If xmit_ill is set above due to index passed in ip_pkt_info. It 20427 * takes precedence over conn_dontroute and conn_nexthop_set 20428 */ 20429 if (xmit_ill != NULL) 20430 goto send_from_ill; 20431 20432 if (connp->conn_dontroute || connp->conn_nexthop_set) { 20433 /* 20434 * If the destination is a broadcast, local, or loopback 20435 * address, SO_DONTROUTE and IP_NEXTHOP go through the 20436 * standard path. 
20437 */ 20438 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20439 if ((ire == NULL) || (ire->ire_type & 20440 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { 20441 if (ire != NULL) { 20442 ire_refrele(ire); 20443 /* No more access to ire */ 20444 ire = NULL; 20445 } 20446 /* 20447 * bypass routing checks and go directly to interface. 20448 */ 20449 if (connp->conn_dontroute) 20450 goto dontroute; 20451 20452 ASSERT(connp->conn_nexthop_set); 20453 ip_nexthop = B_TRUE; 20454 nexthop_addr = connp->conn_nexthop_v4; 20455 goto send_from_ill; 20456 } 20457 20458 /* Must be a broadcast, a loopback or a local ire */ 20459 ire_refrele(ire); 20460 /* No more access to ire */ 20461 ire = NULL; 20462 } 20463 20464 if (attach_ill != NULL) 20465 goto send_from_ill; 20466 20467 /* 20468 * We cache IRE_CACHEs to avoid lookups. We don't do 20469 * this for the tcp global queue and listen end point 20470 * as it does not really have a real destination to 20471 * talk to. This is also true for SCTP. 20472 */ 20473 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 20474 !connp->conn_fully_bound) { 20475 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20476 if (ire == NULL) 20477 goto noirefound; 20478 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20479 "ip_wput_end: q %p (%S)", q, "end"); 20480 20481 /* 20482 * Check if the ire has the RTF_MULTIRT flag, inherited 20483 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20484 */ 20485 if (ire->ire_flags & RTF_MULTIRT) { 20486 20487 /* 20488 * Force the TTL of multirouted packets if required. 20489 * The TTL of such packets is bounded by the 20490 * ip_multirt_ttl ndd variable. 
20491 */ 20492 if ((ipst->ips_ip_multirt_ttl > 0) && 20493 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20494 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20495 "(was %d), dst 0x%08x\n", 20496 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20497 ntohl(ire->ire_addr))); 20498 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20499 } 20500 /* 20501 * We look at this point if there are pending 20502 * unresolved routes. ire_multirt_resolvable() 20503 * checks in O(n) that all IRE_OFFSUBNET ire 20504 * entries for the packet's destination and 20505 * flagged RTF_MULTIRT are currently resolved. 20506 * If some remain unresolved, we make a copy 20507 * of the current message. It will be used 20508 * to initiate additional route resolutions. 20509 */ 20510 multirt_need_resolve = 20511 ire_multirt_need_resolve(ire->ire_addr, 20512 MBLK_GETLABEL(first_mp), ipst); 20513 ip2dbg(("ip_wput[TCP]: ire %p, " 20514 "multirt_need_resolve %d, first_mp %p\n", 20515 (void *)ire, multirt_need_resolve, 20516 (void *)first_mp)); 20517 if (multirt_need_resolve) { 20518 copy_mp = copymsg(first_mp); 20519 if (copy_mp != NULL) { 20520 MULTIRT_DEBUG_TAG(copy_mp); 20521 } 20522 } 20523 } 20524 20525 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20526 20527 /* 20528 * Try to resolve another multiroute if 20529 * ire_multirt_need_resolve() deemed it necessary. 20530 */ 20531 if (copy_mp != NULL) 20532 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20533 if (need_decref) 20534 CONN_DEC_REF(connp); 20535 return; 20536 } 20537 20538 /* 20539 * Access to conn_ire_cache. (protected by conn_lock) 20540 * 20541 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 20542 * the ire bucket lock here to check for CONDEMNED as it is okay to 20543 * send a packet or two with the IRE_CACHE that is going away. 20544 * Access to the ire requires an ire refhold on the ire prior to 20545 * its use since an interface unplumb thread may delete the cached 20546 * ire and release the refhold at any time. 
20547 * 20548 * Caching an ire in the conn_ire_cache 20549 * 20550 * o Caching an ire pointer in the conn requires a strict check for 20551 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 20552 * ires before cleaning up the conns. So the caching of an ire pointer 20553 * in the conn is done after making sure under the bucket lock that the 20554 * ire has not yet been marked CONDEMNED. Otherwise we will end up 20555 * caching an ire after the unplumb thread has cleaned up the conn. 20556 * If the conn does not send a packet subsequently the unplumb thread 20557 * will be hanging waiting for the ire count to drop to zero. 20558 * 20559 * o We also need to atomically test for a null conn_ire_cache and 20560 * set the conn_ire_cache under the the protection of the conn_lock 20561 * to avoid races among concurrent threads trying to simultaneously 20562 * cache an ire in the conn_ire_cache. 20563 */ 20564 mutex_enter(&connp->conn_lock); 20565 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 20566 20567 if (ire != NULL && ire->ire_addr == dst && 20568 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20569 20570 IRE_REFHOLD(ire); 20571 mutex_exit(&connp->conn_lock); 20572 20573 } else { 20574 boolean_t cached = B_FALSE; 20575 connp->conn_ire_cache = NULL; 20576 mutex_exit(&connp->conn_lock); 20577 /* Release the old ire */ 20578 if (ire != NULL && sctp_ire == NULL) 20579 IRE_REFRELE_NOTR(ire); 20580 20581 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20582 if (ire == NULL) 20583 goto noirefound; 20584 IRE_REFHOLD_NOTR(ire); 20585 20586 mutex_enter(&connp->conn_lock); 20587 if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) { 20588 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20589 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20590 if (connp->conn_ulp == IPPROTO_TCP) 20591 TCP_CHECK_IREINFO(connp->conn_tcp, ire); 20592 connp->conn_ire_cache = ire; 20593 cached = B_TRUE; 20594 } 20595 
rw_exit(&ire->ire_bucket->irb_lock); 20596 } 20597 mutex_exit(&connp->conn_lock); 20598 20599 /* 20600 * We can continue to use the ire but since it was 20601 * not cached, we should drop the extra reference. 20602 */ 20603 if (!cached) 20604 IRE_REFRELE_NOTR(ire); 20605 } 20606 20607 20608 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20609 "ip_wput_end: q %p (%S)", q, "end"); 20610 20611 /* 20612 * Check if the ire has the RTF_MULTIRT flag, inherited 20613 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20614 */ 20615 if (ire->ire_flags & RTF_MULTIRT) { 20616 20617 /* 20618 * Force the TTL of multirouted packets if required. 20619 * The TTL of such packets is bounded by the 20620 * ip_multirt_ttl ndd variable. 20621 */ 20622 if ((ipst->ips_ip_multirt_ttl > 0) && 20623 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20624 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20625 "(was %d), dst 0x%08x\n", 20626 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20627 ntohl(ire->ire_addr))); 20628 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20629 } 20630 20631 /* 20632 * At this point, we check to see if there are any pending 20633 * unresolved routes. ire_multirt_resolvable() 20634 * checks in O(n) that all IRE_OFFSUBNET ire 20635 * entries for the packet's destination and 20636 * flagged RTF_MULTIRT are currently resolved. 20637 * If some remain unresolved, we make a copy 20638 * of the current message. It will be used 20639 * to initiate additional route resolutions. 
20640 */ 20641 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 20642 MBLK_GETLABEL(first_mp), ipst); 20643 ip2dbg(("ip_wput[not TCP]: ire %p, " 20644 "multirt_need_resolve %d, first_mp %p\n", 20645 (void *)ire, multirt_need_resolve, (void *)first_mp)); 20646 if (multirt_need_resolve) { 20647 copy_mp = copymsg(first_mp); 20648 if (copy_mp != NULL) { 20649 MULTIRT_DEBUG_TAG(copy_mp); 20650 } 20651 } 20652 } 20653 20654 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20655 20656 /* 20657 * Try to resolve another multiroute if 20658 * ire_multirt_resolvable() deemed it necessary 20659 */ 20660 if (copy_mp != NULL) 20661 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20662 if (need_decref) 20663 CONN_DEC_REF(connp); 20664 return; 20665 20666 qnext: 20667 /* 20668 * Upper Level Protocols pass down complete IP datagrams 20669 * as M_DATA messages. Everything else is a sideshow. 20670 * 20671 * 1) We could be re-entering ip_wput because of ip_neworute 20672 * in which case we could have a IPSEC_OUT message. We 20673 * need to pass through ip_wput like other datagrams and 20674 * hence cannot branch to ip_wput_nondata. 20675 * 20676 * 2) ARP, AH, ESP, and other clients who are on the module 20677 * instance of IP stream, give us something to deal with. 20678 * We will handle AH and ESP here and rest in ip_wput_nondata. 20679 * 20680 * 3) ICMP replies also could come here. 20681 */ 20682 ipst = ILLQ_TO_IPST(q); 20683 20684 if (DB_TYPE(mp) != M_DATA) { 20685 notdata: 20686 if (DB_TYPE(mp) == M_CTL) { 20687 /* 20688 * M_CTL messages are used by ARP, AH and ESP to 20689 * communicate with IP. We deal with IPSEC_IN and 20690 * IPSEC_OUT here. ip_wput_nondata handles other 20691 * cases. 
20692 */ 20693 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 20694 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 20695 first_mp = mp->b_cont; 20696 first_mp->b_flag &= ~MSGHASREF; 20697 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20698 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 20699 CONN_DEC_REF(connp); 20700 connp = NULL; 20701 } 20702 if (ii->ipsec_info_type == IPSEC_IN) { 20703 /* 20704 * Either this message goes back to 20705 * IPsec for further processing or to 20706 * ULP after policy checks. 20707 */ 20708 ip_fanout_proto_again(mp, NULL, NULL, NULL); 20709 return; 20710 } else if (ii->ipsec_info_type == IPSEC_OUT) { 20711 io = (ipsec_out_t *)ii; 20712 if (io->ipsec_out_proc_begin) { 20713 /* 20714 * IPsec processing has already started. 20715 * Complete it. 20716 * IPQoS notes: We don't care what is 20717 * in ipsec_out_ill_index since this 20718 * won't be processed for IPQoS policies 20719 * in ipsec_out_process. 20720 */ 20721 ipsec_out_process(q, mp, NULL, 20722 io->ipsec_out_ill_index); 20723 return; 20724 } else { 20725 connp = (q->q_next != NULL) ? 20726 NULL : Q_TO_CONN(q); 20727 first_mp = mp; 20728 mp = mp->b_cont; 20729 mctl_present = B_TRUE; 20730 } 20731 zoneid = io->ipsec_out_zoneid; 20732 ASSERT(zoneid != ALL_ZONES); 20733 } else if (ii->ipsec_info_type == IPSEC_CTL) { 20734 /* 20735 * It's an IPsec control message requesting 20736 * an SADB update to be sent to the IPsec 20737 * hardware acceleration capable ills. 20738 */ 20739 ipsec_ctl_t *ipsec_ctl = 20740 (ipsec_ctl_t *)mp->b_rptr; 20741 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 20742 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 20743 mblk_t *cmp = mp->b_cont; 20744 20745 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 20746 ASSERT(cmp != NULL); 20747 20748 freeb(mp); 20749 ill_ipsec_capab_send_all(satype, cmp, sa, 20750 ipst->ips_netstack); 20751 return; 20752 } else { 20753 /* 20754 * This must be ARP or special TSOL signaling. 
20755 */ 20756 ip_wput_nondata(NULL, q, mp, NULL); 20757 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20758 "ip_wput_end: q %p (%S)", q, "nondata"); 20759 return; 20760 } 20761 } else { 20762 /* 20763 * This must be non-(ARP/AH/ESP) messages. 20764 */ 20765 ASSERT(!need_decref); 20766 ip_wput_nondata(NULL, q, mp, NULL); 20767 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20768 "ip_wput_end: q %p (%S)", q, "nondata"); 20769 return; 20770 } 20771 } else { 20772 first_mp = mp; 20773 mctl_present = B_FALSE; 20774 } 20775 20776 ASSERT(first_mp != NULL); 20777 /* 20778 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 20779 * to make sure that this packet goes out on the same interface it 20780 * came in. We handle that here. 20781 */ 20782 if (mctl_present) { 20783 uint_t ifindex; 20784 20785 io = (ipsec_out_t *)first_mp->b_rptr; 20786 if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { 20787 /* 20788 * We may have lost the conn context if we are 20789 * coming here from ip_newroute(). Copy the 20790 * nexthop information. 20791 */ 20792 if (io->ipsec_out_ip_nexthop) { 20793 ip_nexthop = B_TRUE; 20794 nexthop_addr = io->ipsec_out_nexthop_addr; 20795 20796 ipha = (ipha_t *)mp->b_rptr; 20797 dst = ipha->ipha_dst; 20798 goto send_from_ill; 20799 } else { 20800 ASSERT(io->ipsec_out_ill_index != 0); 20801 ifindex = io->ipsec_out_ill_index; 20802 attach_ill = ill_lookup_on_ifindex(ifindex, 20803 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20804 if (attach_ill == NULL) { 20805 ASSERT(xmit_ill == NULL); 20806 ip1dbg(("ip_output: bad ifindex for " 20807 "(BIND TO IPIF_NOFAILOVER) %d\n", 20808 ifindex)); 20809 freemsg(first_mp); 20810 BUMP_MIB(&ipst->ips_ip_mib, 20811 ipIfStatsOutDiscards); 20812 ASSERT(!need_decref); 20813 return; 20814 } 20815 } 20816 } 20817 } 20818 20819 ASSERT(xmit_ill == NULL); 20820 20821 /* We have a complete IP datagram heading outbound. 
*/ 20822 ipha = (ipha_t *)mp->b_rptr; 20823 20824 #ifndef SPEED_BEFORE_SAFETY 20825 /* 20826 * Make sure we have a full-word aligned message and that at least 20827 * a simple IP header is accessible in the first message. If not, 20828 * try a pullup. For labeled systems we need to always take this 20829 * path as M_CTLs are "notdata" but have trailing data to process. 20830 */ 20831 if (!OK_32PTR(rptr) || 20832 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { 20833 hdrtoosmall: 20834 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 20835 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20836 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 20837 if (first_mp == NULL) 20838 first_mp = mp; 20839 goto discard_pkt; 20840 } 20841 20842 /* This function assumes that mp points to an IPv4 packet. */ 20843 if (is_system_labeled() && q->q_next == NULL && 20844 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 20845 !connp->conn_ulp_labeled) { 20846 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20847 connp->conn_mac_exempt, ipst); 20848 ipha = (ipha_t *)mp->b_rptr; 20849 if (first_mp != NULL) 20850 first_mp->b_cont = mp; 20851 if (err != 0) { 20852 if (first_mp == NULL) 20853 first_mp = mp; 20854 if (err == EINVAL) 20855 goto icmp_parameter_problem; 20856 ip2dbg(("ip_wput: label check failed (%d)\n", 20857 err)); 20858 goto discard_pkt; 20859 } 20860 } 20861 20862 ipha = (ipha_t *)mp->b_rptr; 20863 if (first_mp == NULL) { 20864 ASSERT(attach_ill == NULL && xmit_ill == NULL); 20865 /* 20866 * If we got here because of "goto hdrtoosmall" 20867 * We need to attach a IPSEC_OUT. 
20868 */ 20869 if (connp->conn_out_enforce_policy) { 20870 if (((mp = ipsec_attach_ipsec_out(&mp, connp, 20871 NULL, ipha->ipha_protocol, 20872 ipst->ips_netstack)) == NULL)) { 20873 BUMP_MIB(&ipst->ips_ip_mib, 20874 ipIfStatsOutDiscards); 20875 if (need_decref) 20876 CONN_DEC_REF(connp); 20877 return; 20878 } else { 20879 ASSERT(mp->b_datap->db_type == M_CTL); 20880 first_mp = mp; 20881 mp = mp->b_cont; 20882 mctl_present = B_TRUE; 20883 } 20884 } else { 20885 first_mp = mp; 20886 mctl_present = B_FALSE; 20887 } 20888 } 20889 } 20890 #endif 20891 20892 /* Most of the code below is written for speed, not readability */ 20893 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20894 20895 /* 20896 * If ip_newroute() fails, we're going to need a full 20897 * header for the icmp wraparound. 20898 */ 20899 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 20900 uint_t v_hlen; 20901 version_hdrlen_check: 20902 ASSERT(first_mp != NULL); 20903 v_hlen = V_HLEN; 20904 /* 20905 * siphon off IPv6 packets coming down from transport 20906 * layer modules here. 20907 * Note: high-order bit carries NUD reachability confirmation 20908 */ 20909 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 20910 /* 20911 * FIXME: assume that callers of ip_output* call 20912 * the right version? 20913 */ 20914 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); 20915 ASSERT(xmit_ill == NULL); 20916 if (attach_ill != NULL) 20917 ill_refrele(attach_ill); 20918 if (need_decref) 20919 mp->b_flag |= MSGHASREF; 20920 (void) ip_output_v6(arg, first_mp, arg2, caller); 20921 return; 20922 } 20923 20924 if ((v_hlen >> 4) != IP_VERSION) { 20925 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20926 "ip_wput_end: q %p (%S)", q, "badvers"); 20927 goto discard_pkt; 20928 } 20929 /* 20930 * Is the header length at least 20 bytes? 20931 * 20932 * Are there enough bytes accessible in the header? If 20933 * not, try a pullup. 
20934 */ 20935 v_hlen &= 0xF; 20936 v_hlen <<= 2; 20937 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 20938 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20939 "ip_wput_end: q %p (%S)", q, "badlen"); 20940 goto discard_pkt; 20941 } 20942 if (v_hlen > (mp->b_wptr - rptr)) { 20943 if (!pullupmsg(mp, v_hlen)) { 20944 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20945 "ip_wput_end: q %p (%S)", q, "badpullup2"); 20946 goto discard_pkt; 20947 } 20948 ipha = (ipha_t *)mp->b_rptr; 20949 } 20950 /* 20951 * Move first entry from any source route into ipha_dst and 20952 * verify the options 20953 */ 20954 if (ip_wput_options(q, first_mp, ipha, mctl_present, 20955 zoneid, ipst)) { 20956 ASSERT(xmit_ill == NULL); 20957 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20958 if (attach_ill != NULL) 20959 ill_refrele(attach_ill); 20960 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20961 "ip_wput_end: q %p (%S)", q, "badopts"); 20962 if (need_decref) 20963 CONN_DEC_REF(connp); 20964 return; 20965 } 20966 } 20967 dst = ipha->ipha_dst; 20968 20969 /* 20970 * Try to get an IRE_CACHE for the destination address. If we can't, 20971 * we have to run the packet through ip_newroute which will take 20972 * the appropriate action to arrange for an IRE_CACHE, such as querying 20973 * a resolver, or assigning a default gateway, etc. 20974 */ 20975 if (CLASSD(dst)) { 20976 ipif_t *ipif; 20977 uint32_t setsrc = 0; 20978 20979 multicast: 20980 ASSERT(first_mp != NULL); 20981 ip2dbg(("ip_wput: CLASSD\n")); 20982 if (connp == NULL) { 20983 /* 20984 * Use the first good ipif on the ill. 20985 * XXX Should this ever happen? (Appears 20986 * to show up with just ppp and no ethernet due 20987 * to in.rdisc.) 20988 * However, ire_send should be able to 20989 * call ip_wput_ire directly. 20990 * 20991 * XXX Also, this can happen for ICMP and other packets 20992 * with multicast source addresses. Perhaps we should 20993 * fix things so that we drop the packet in question, 20994 * but for now, just run with it. 
20995 */ 20996 ill_t *ill = (ill_t *)q->q_ptr; 20997 20998 /* 20999 * Don't honor attach_if for this case. If ill 21000 * is part of the group, ipif could belong to 21001 * any ill and we cannot maintain attach_ill 21002 * and ipif_ill same anymore and the assert 21003 * below would fail. 21004 */ 21005 if (mctl_present && io->ipsec_out_attach_if) { 21006 io->ipsec_out_ill_index = 0; 21007 io->ipsec_out_attach_if = B_FALSE; 21008 ASSERT(attach_ill != NULL); 21009 ill_refrele(attach_ill); 21010 attach_ill = NULL; 21011 } 21012 21013 ASSERT(attach_ill == NULL); 21014 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 21015 if (ipif == NULL) { 21016 if (need_decref) 21017 CONN_DEC_REF(connp); 21018 freemsg(first_mp); 21019 return; 21020 } 21021 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 21022 ntohl(dst), ill->ill_name)); 21023 } else { 21024 /* 21025 * The order of precedence is IP_BOUND_IF, IP_PKTINFO 21026 * and IP_MULTICAST_IF. The block comment above this 21027 * function explains the locking mechanism used here. 
21028 */ 21029 if (xmit_ill == NULL) { 21030 xmit_ill = conn_get_held_ill(connp, 21031 &connp->conn_outgoing_ill, &err); 21032 if (err == ILL_LOOKUP_FAILED) { 21033 ip1dbg(("ip_wput: No ill for " 21034 "IP_BOUND_IF\n")); 21035 BUMP_MIB(&ipst->ips_ip_mib, 21036 ipIfStatsOutNoRoutes); 21037 goto drop_pkt; 21038 } 21039 } 21040 21041 if (xmit_ill == NULL) { 21042 ipif = conn_get_held_ipif(connp, 21043 &connp->conn_multicast_ipif, &err); 21044 if (err == IPIF_LOOKUP_FAILED) { 21045 ip1dbg(("ip_wput: No ipif for " 21046 "multicast\n")); 21047 BUMP_MIB(&ipst->ips_ip_mib, 21048 ipIfStatsOutNoRoutes); 21049 goto drop_pkt; 21050 } 21051 } 21052 if (xmit_ill != NULL) { 21053 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21054 if (ipif == NULL) { 21055 ip1dbg(("ip_wput: No ipif for " 21056 "xmit_ill\n")); 21057 BUMP_MIB(&ipst->ips_ip_mib, 21058 ipIfStatsOutNoRoutes); 21059 goto drop_pkt; 21060 } 21061 } else if (ipif == NULL || ipif->ipif_isv6) { 21062 /* 21063 * We must do this ipif determination here 21064 * else we could pass through ip_newroute 21065 * and come back here without the conn context. 21066 * 21067 * Note: we do late binding i.e. we bind to 21068 * the interface when the first packet is sent. 21069 * For performance reasons we do not rebind on 21070 * each packet but keep the binding until the 21071 * next IP_MULTICAST_IF option. 21072 * 21073 * conn_multicast_{ipif,ill} are shared between 21074 * IPv4 and IPv6 and AF_INET6 sockets can 21075 * send both IPv4 and IPv6 packets. Hence 21076 * we have to check that "isv6" matches above. 
21077 */ 21078 if (ipif != NULL) 21079 ipif_refrele(ipif); 21080 ipif = ipif_lookup_group(dst, zoneid, ipst); 21081 if (ipif == NULL) { 21082 ip1dbg(("ip_wput: No ipif for " 21083 "multicast\n")); 21084 BUMP_MIB(&ipst->ips_ip_mib, 21085 ipIfStatsOutNoRoutes); 21086 goto drop_pkt; 21087 } 21088 err = conn_set_held_ipif(connp, 21089 &connp->conn_multicast_ipif, ipif); 21090 if (err == IPIF_LOOKUP_FAILED) { 21091 ipif_refrele(ipif); 21092 ip1dbg(("ip_wput: No ipif for " 21093 "multicast\n")); 21094 BUMP_MIB(&ipst->ips_ip_mib, 21095 ipIfStatsOutNoRoutes); 21096 goto drop_pkt; 21097 } 21098 } 21099 } 21100 ASSERT(!ipif->ipif_isv6); 21101 /* 21102 * As we may lose the conn by the time we reach ip_wput_ire, 21103 * we copy conn_multicast_loop and conn_dontroute on to an 21104 * ipsec_out. In case if this datagram goes out secure, 21105 * we need the ill_index also. Copy that also into the 21106 * ipsec_out. 21107 */ 21108 if (mctl_present) { 21109 io = (ipsec_out_t *)first_mp->b_rptr; 21110 ASSERT(first_mp->b_datap->db_type == M_CTL); 21111 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21112 } else { 21113 ASSERT(mp == first_mp); 21114 if ((first_mp = allocb(sizeof (ipsec_info_t), 21115 BPRI_HI)) == NULL) { 21116 ipif_refrele(ipif); 21117 first_mp = mp; 21118 goto discard_pkt; 21119 } 21120 first_mp->b_datap->db_type = M_CTL; 21121 first_mp->b_wptr += sizeof (ipsec_info_t); 21122 /* ipsec_out_secure is B_FALSE now */ 21123 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 21124 io = (ipsec_out_t *)first_mp->b_rptr; 21125 io->ipsec_out_type = IPSEC_OUT; 21126 io->ipsec_out_len = sizeof (ipsec_out_t); 21127 io->ipsec_out_use_global_policy = B_TRUE; 21128 io->ipsec_out_ns = ipst->ips_netstack; 21129 first_mp->b_cont = mp; 21130 mctl_present = B_TRUE; 21131 } 21132 if (attach_ill != NULL) { 21133 ASSERT(attach_ill == ipif->ipif_ill); 21134 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21135 21136 /* 21137 * Check if we need an ire that will not be 21138 * looked up by anybody 
else i.e. HIDDEN. 21139 */ 21140 if (ill_is_probeonly(attach_ill)) { 21141 match_flags |= MATCH_IRE_MARK_HIDDEN; 21142 } 21143 io->ipsec_out_ill_index = 21144 attach_ill->ill_phyint->phyint_ifindex; 21145 io->ipsec_out_attach_if = B_TRUE; 21146 } else { 21147 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21148 io->ipsec_out_ill_index = 21149 ipif->ipif_ill->ill_phyint->phyint_ifindex; 21150 } 21151 if (connp != NULL) { 21152 io->ipsec_out_multicast_loop = 21153 connp->conn_multicast_loop; 21154 io->ipsec_out_dontroute = connp->conn_dontroute; 21155 io->ipsec_out_zoneid = connp->conn_zoneid; 21156 } 21157 /* 21158 * If the application uses IP_MULTICAST_IF with 21159 * different logical addresses of the same ILL, we 21160 * need to make sure that the soruce address of 21161 * the packet matches the logical IP address used 21162 * in the option. We do it by initializing ipha_src 21163 * here. This should keep IPsec also happy as 21164 * when we return from IPsec processing, we don't 21165 * have to worry about getting the right address on 21166 * the packet. Thus it is sufficient to look for 21167 * IRE_CACHE using MATCH_IRE_ILL rathen than 21168 * MATCH_IRE_IPIF. 21169 * 21170 * NOTE : We need to do it for non-secure case also as 21171 * this might go out secure if there is a global policy 21172 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 21173 * address, the source should be initialized already and 21174 * hence we won't be initializing here. 21175 * 21176 * As we do not have the ire yet, it is possible that 21177 * we set the source address here and then later discover 21178 * that the ire implies the source address to be assigned 21179 * through the RTF_SETSRC flag. 21180 * In that case, the setsrc variable will remind us 21181 * that overwritting the source address by the one 21182 * of the RTF_SETSRC-flagged ire is allowed. 
21183 */ 21184 if (ipha->ipha_src == INADDR_ANY && 21185 (connp == NULL || !connp->conn_unspec_src)) { 21186 ipha->ipha_src = ipif->ipif_src_addr; 21187 setsrc = RTF_SETSRC; 21188 } 21189 /* 21190 * Find an IRE which matches the destination and the outgoing 21191 * queue (i.e. the outgoing interface.) 21192 * For loopback use a unicast IP address for 21193 * the ire lookup. 21194 */ 21195 if (IS_LOOPBACK(ipif->ipif_ill)) 21196 dst = ipif->ipif_lcl_addr; 21197 21198 /* 21199 * If xmit_ill is set, we branch out to ip_newroute_ipif. 21200 * We don't need to lookup ire in ctable as the packet 21201 * needs to be sent to the destination through the specified 21202 * ill irrespective of ires in the cache table. 21203 */ 21204 ire = NULL; 21205 if (xmit_ill == NULL) { 21206 ire = ire_ctable_lookup(dst, 0, 0, ipif, 21207 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21208 } 21209 21210 /* 21211 * refrele attach_ill as its not needed anymore. 21212 */ 21213 if (attach_ill != NULL) { 21214 ill_refrele(attach_ill); 21215 attach_ill = NULL; 21216 } 21217 21218 if (ire == NULL) { 21219 /* 21220 * Multicast loopback and multicast forwarding is 21221 * done in ip_wput_ire. 21222 * 21223 * Mark this packet to make it be delivered to 21224 * ip_wput_ire after the new ire has been 21225 * created. 21226 * 21227 * The call to ip_newroute_ipif takes into account 21228 * the setsrc reminder. In any case, we take care 21229 * of the RTF_MULTIRT flag. 
21230 */ 21231 mp->b_prev = mp->b_next = NULL; 21232 if (xmit_ill == NULL || 21233 xmit_ill->ill_ipif_up_count > 0) { 21234 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 21235 setsrc | RTF_MULTIRT, zoneid, infop); 21236 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21237 "ip_wput_end: q %p (%S)", q, "noire"); 21238 } else { 21239 freemsg(first_mp); 21240 } 21241 ipif_refrele(ipif); 21242 if (xmit_ill != NULL) 21243 ill_refrele(xmit_ill); 21244 if (need_decref) 21245 CONN_DEC_REF(connp); 21246 return; 21247 } 21248 21249 ipif_refrele(ipif); 21250 ipif = NULL; 21251 ASSERT(xmit_ill == NULL); 21252 21253 /* 21254 * Honor the RTF_SETSRC flag for multicast packets, 21255 * if allowed by the setsrc reminder. 21256 */ 21257 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 21258 ipha->ipha_src = ire->ire_src_addr; 21259 } 21260 21261 /* 21262 * Unconditionally force the TTL to 1 for 21263 * multirouted multicast packets: 21264 * multirouted multicast should not cross 21265 * multicast routers. 21266 */ 21267 if (ire->ire_flags & RTF_MULTIRT) { 21268 if (ipha->ipha_ttl > 1) { 21269 ip2dbg(("ip_wput: forcing multicast " 21270 "multirt TTL to 1 (was %d), dst 0x%08x\n", 21271 ipha->ipha_ttl, ntohl(ire->ire_addr))); 21272 ipha->ipha_ttl = 1; 21273 } 21274 } 21275 } else { 21276 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 21277 if ((ire != NULL) && (ire->ire_type & 21278 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 21279 ignore_dontroute = B_TRUE; 21280 ignore_nexthop = B_TRUE; 21281 } 21282 if (ire != NULL) { 21283 ire_refrele(ire); 21284 ire = NULL; 21285 } 21286 /* 21287 * Guard against coming in from arp in which case conn is NULL. 21288 * Also guard against non M_DATA with dontroute set but 21289 * destined to local, loopback or broadcast addresses. 
21290 */ 21291 if (connp != NULL && connp->conn_dontroute && 21292 !ignore_dontroute) { 21293 dontroute: 21294 /* 21295 * Set TTL to 1 if SO_DONTROUTE is set to prevent 21296 * routing protocols from seeing false direct 21297 * connectivity. 21298 */ 21299 ipha->ipha_ttl = 1; 21300 21301 /* If suitable ipif not found, drop packet */ 21302 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); 21303 if (dst_ipif == NULL) { 21304 noroute: 21305 ip1dbg(("ip_wput: no route for dst using" 21306 " SO_DONTROUTE\n")); 21307 BUMP_MIB(&ipst->ips_ip_mib, 21308 ipIfStatsOutNoRoutes); 21309 mp->b_prev = mp->b_next = NULL; 21310 if (first_mp == NULL) 21311 first_mp = mp; 21312 goto drop_pkt; 21313 } else { 21314 /* 21315 * If suitable ipif has been found, set 21316 * xmit_ill to the corresponding 21317 * ipif_ill because we'll be using the 21318 * send_from_ill logic below. 21319 */ 21320 ASSERT(xmit_ill == NULL); 21321 xmit_ill = dst_ipif->ipif_ill; 21322 mutex_enter(&xmit_ill->ill_lock); 21323 if (!ILL_CAN_LOOKUP(xmit_ill)) { 21324 mutex_exit(&xmit_ill->ill_lock); 21325 xmit_ill = NULL; 21326 ipif_refrele(dst_ipif); 21327 goto noroute; 21328 } 21329 ill_refhold_locked(xmit_ill); 21330 mutex_exit(&xmit_ill->ill_lock); 21331 ipif_refrele(dst_ipif); 21332 } 21333 } 21334 /* 21335 * If we are bound to IPIF_NOFAILOVER address, look for 21336 * an IRE_CACHE matching the ill. 21337 */ 21338 send_from_ill: 21339 if (attach_ill != NULL) { 21340 ipif_t *attach_ipif; 21341 21342 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21343 21344 /* 21345 * Check if we need an ire that will not be 21346 * looked up by anybody else i.e. HIDDEN. 
21347 */ 21348 if (ill_is_probeonly(attach_ill)) { 21349 match_flags |= MATCH_IRE_MARK_HIDDEN; 21350 } 21351 21352 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 21353 if (attach_ipif == NULL) { 21354 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 21355 goto discard_pkt; 21356 } 21357 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 21358 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21359 ipif_refrele(attach_ipif); 21360 } else if (xmit_ill != NULL) { 21361 ipif_t *ipif; 21362 21363 /* 21364 * Mark this packet as originated locally 21365 */ 21366 mp->b_prev = mp->b_next = NULL; 21367 21368 /* 21369 * Could be SO_DONTROUTE case also. 21370 * Verify that at least one ipif is up on the ill. 21371 */ 21372 if (xmit_ill->ill_ipif_up_count == 0) { 21373 ip1dbg(("ip_output: xmit_ill %s is down\n", 21374 xmit_ill->ill_name)); 21375 goto drop_pkt; 21376 } 21377 21378 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21379 if (ipif == NULL) { 21380 ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", 21381 xmit_ill->ill_name)); 21382 goto drop_pkt; 21383 } 21384 21385 /* 21386 * Look for a ire that is part of the group, 21387 * if found use it else call ip_newroute_ipif. 21388 * IPCL_ZONEID is not used for matching because 21389 * IP_ALLZONES option is valid only when the 21390 * ill is accessible from all zones i.e has a 21391 * valid ipif in all zones. 21392 */ 21393 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21394 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 21395 MBLK_GETLABEL(mp), match_flags, ipst); 21396 /* 21397 * If an ire exists use it or else create 21398 * an ire but don't add it to the cache. 21399 * Adding an ire may cause issues with 21400 * asymmetric routing. 21401 * In case of multiroute always act as if 21402 * ire does not exist. 
21403 */ 21404 if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { 21405 if (ire != NULL) 21406 ire_refrele(ire); 21407 ip_newroute_ipif(q, first_mp, ipif, 21408 dst, connp, 0, zoneid, infop); 21409 ipif_refrele(ipif); 21410 ip1dbg(("ip_output: xmit_ill via %s\n", 21411 xmit_ill->ill_name)); 21412 ill_refrele(xmit_ill); 21413 if (need_decref) 21414 CONN_DEC_REF(connp); 21415 return; 21416 } 21417 ipif_refrele(ipif); 21418 } else if (ip_nexthop || (connp != NULL && 21419 (connp->conn_nexthop_set)) && !ignore_nexthop) { 21420 if (!ip_nexthop) { 21421 ip_nexthop = B_TRUE; 21422 nexthop_addr = connp->conn_nexthop_v4; 21423 } 21424 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 21425 MATCH_IRE_GW; 21426 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 21427 NULL, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21428 } else { 21429 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), 21430 ipst); 21431 } 21432 if (!ire) { 21433 /* 21434 * Make sure we don't load spread if this 21435 * is IPIF_NOFAILOVER case. 
21436 */ 21437 if ((attach_ill != NULL) || 21438 (ip_nexthop && !ignore_nexthop)) { 21439 if (mctl_present) { 21440 io = (ipsec_out_t *)first_mp->b_rptr; 21441 ASSERT(first_mp->b_datap->db_type == 21442 M_CTL); 21443 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21444 } else { 21445 ASSERT(mp == first_mp); 21446 first_mp = allocb( 21447 sizeof (ipsec_info_t), BPRI_HI); 21448 if (first_mp == NULL) { 21449 first_mp = mp; 21450 goto discard_pkt; 21451 } 21452 first_mp->b_datap->db_type = M_CTL; 21453 first_mp->b_wptr += 21454 sizeof (ipsec_info_t); 21455 /* ipsec_out_secure is B_FALSE now */ 21456 bzero(first_mp->b_rptr, 21457 sizeof (ipsec_info_t)); 21458 io = (ipsec_out_t *)first_mp->b_rptr; 21459 io->ipsec_out_type = IPSEC_OUT; 21460 io->ipsec_out_len = 21461 sizeof (ipsec_out_t); 21462 io->ipsec_out_use_global_policy = 21463 B_TRUE; 21464 io->ipsec_out_ns = ipst->ips_netstack; 21465 first_mp->b_cont = mp; 21466 mctl_present = B_TRUE; 21467 } 21468 if (attach_ill != NULL) { 21469 io->ipsec_out_ill_index = attach_ill-> 21470 ill_phyint->phyint_ifindex; 21471 io->ipsec_out_attach_if = B_TRUE; 21472 } else { 21473 io->ipsec_out_ip_nexthop = ip_nexthop; 21474 io->ipsec_out_nexthop_addr = 21475 nexthop_addr; 21476 } 21477 } 21478 noirefound: 21479 /* 21480 * Mark this packet as having originated on 21481 * this machine. This will be noted in 21482 * ire_add_then_send, which needs to know 21483 * whether to run it back through ip_wput or 21484 * ip_rput following successful resolution. 21485 */ 21486 mp->b_prev = NULL; 21487 mp->b_next = NULL; 21488 ip_newroute(q, first_mp, dst, connp, zoneid, ipst); 21489 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21490 "ip_wput_end: q %p (%S)", q, "newroute"); 21491 if (attach_ill != NULL) 21492 ill_refrele(attach_ill); 21493 if (xmit_ill != NULL) 21494 ill_refrele(xmit_ill); 21495 if (need_decref) 21496 CONN_DEC_REF(connp); 21497 return; 21498 } 21499 } 21500 21501 /* We now know where we are going with it. 
*/ 21502 21503 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21504 "ip_wput_end: q %p (%S)", q, "end"); 21505 21506 /* 21507 * Check if the ire has the RTF_MULTIRT flag, inherited 21508 * from an IRE_OFFSUBNET ire entry in ip_newroute. 21509 */ 21510 if (ire->ire_flags & RTF_MULTIRT) { 21511 /* 21512 * Force the TTL of multirouted packets if required. 21513 * The TTL of such packets is bounded by the 21514 * ip_multirt_ttl ndd variable. 21515 */ 21516 if ((ipst->ips_ip_multirt_ttl > 0) && 21517 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 21518 ip2dbg(("ip_wput: forcing multirt TTL to %d " 21519 "(was %d), dst 0x%08x\n", 21520 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 21521 ntohl(ire->ire_addr))); 21522 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 21523 } 21524 /* 21525 * At this point, we check to see if there are any pending 21526 * unresolved routes. ire_multirt_resolvable() 21527 * checks in O(n) that all IRE_OFFSUBNET ire 21528 * entries for the packet's destination and 21529 * flagged RTF_MULTIRT are currently resolved. 21530 * If some remain unresolved, we make a copy 21531 * of the current message. It will be used 21532 * to initiate additional route resolutions. 21533 */ 21534 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 21535 MBLK_GETLABEL(first_mp), ipst); 21536 ip2dbg(("ip_wput[noirefound]: ire %p, " 21537 "multirt_need_resolve %d, first_mp %p\n", 21538 (void *)ire, multirt_need_resolve, (void *)first_mp)); 21539 if (multirt_need_resolve) { 21540 copy_mp = copymsg(first_mp); 21541 if (copy_mp != NULL) { 21542 MULTIRT_DEBUG_TAG(copy_mp); 21543 } 21544 } 21545 } 21546 21547 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 21548 /* 21549 * Try to resolve another multiroute if 21550 * ire_multirt_resolvable() deemed it necessary. 21551 * At this point, we need to distinguish 21552 * multicasts from other packets. For multicasts, 21553 * we call ip_newroute_ipif() and request that both 21554 * multirouting and setsrc flags are checked. 
21555 */ 21556 if (copy_mp != NULL) { 21557 if (CLASSD(dst)) { 21558 ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); 21559 if (ipif) { 21560 ASSERT(infop->ip_opt_ill_index == 0); 21561 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 21562 RTF_SETSRC | RTF_MULTIRT, zoneid, infop); 21563 ipif_refrele(ipif); 21564 } else { 21565 MULTIRT_DEBUG_UNTAG(copy_mp); 21566 freemsg(copy_mp); 21567 copy_mp = NULL; 21568 } 21569 } else { 21570 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 21571 } 21572 } 21573 if (attach_ill != NULL) 21574 ill_refrele(attach_ill); 21575 if (xmit_ill != NULL) 21576 ill_refrele(xmit_ill); 21577 if (need_decref) 21578 CONN_DEC_REF(connp); 21579 return; 21580 21581 icmp_parameter_problem: 21582 /* could not have originated externally */ 21583 ASSERT(mp->b_prev == NULL); 21584 if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { 21585 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 21586 /* it's the IP header length that's in trouble */ 21587 icmp_param_problem(q, first_mp, 0, zoneid, ipst); 21588 first_mp = NULL; 21589 } 21590 21591 discard_pkt: 21592 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 21593 drop_pkt: 21594 ip1dbg(("ip_wput: dropped packet\n")); 21595 if (ire != NULL) 21596 ire_refrele(ire); 21597 if (need_decref) 21598 CONN_DEC_REF(connp); 21599 freemsg(first_mp); 21600 if (attach_ill != NULL) 21601 ill_refrele(attach_ill); 21602 if (xmit_ill != NULL) 21603 ill_refrele(xmit_ill); 21604 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21605 "ip_wput_end: q %p (%S)", q, "droppkt"); 21606 } 21607 21608 /* 21609 * If this is a conn_t queue, then we pass in the conn. This includes the 21610 * zoneid. 21611 * Otherwise, this is a message coming back from ARP or for an ill_t queue, 21612 * in which case we use the global zoneid since those are all part of 21613 * the global zone. 
21614 */ 21615 void 21616 ip_wput(queue_t *q, mblk_t *mp) 21617 { 21618 if (CONN_Q(q)) 21619 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 21620 else 21621 ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); 21622 } 21623 21624 /* 21625 * 21626 * The following rules must be observed when accessing any ipif or ill 21627 * that has been cached in the conn. Typically conn_nofailover_ill, 21628 * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. 21629 * 21630 * Access: The ipif or ill pointed to from the conn can be accessed under 21631 * the protection of the conn_lock or after it has been refheld under the 21632 * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or 21633 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 21634 * The reason for this is that a concurrent unplumb could actually be 21635 * cleaning up these cached pointers by walking the conns and might have 21636 * finished cleaning up the conn in question. The macros check that an 21637 * unplumb has not yet started on the ipif or ill. 21638 * 21639 * Caching: An ipif or ill pointer may be cached in the conn only after 21640 * making sure that an unplumb has not started. So the caching is done 21641 * while holding both the conn_lock and the ill_lock and after using the 21642 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 21643 * flag before starting the cleanup of conns. 21644 * 21645 * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock 21646 * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock 21647 * or a reference to the ipif or a reference to an ire that references the 21648 * ipif. An ipif does not change its ill except for failover/failback. Since 21649 * failover/failback happens only after bringing down the ipif and making sure 21650 * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock 21651 * the above holds. 
21652 */ 21653 ipif_t * 21654 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 21655 { 21656 ipif_t *ipif; 21657 ill_t *ill; 21658 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 21659 21660 *err = 0; 21661 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21662 mutex_enter(&connp->conn_lock); 21663 ipif = *ipifp; 21664 if (ipif != NULL) { 21665 ill = ipif->ipif_ill; 21666 mutex_enter(&ill->ill_lock); 21667 if (IPIF_CAN_LOOKUP(ipif)) { 21668 ipif_refhold_locked(ipif); 21669 mutex_exit(&ill->ill_lock); 21670 mutex_exit(&connp->conn_lock); 21671 rw_exit(&ipst->ips_ill_g_lock); 21672 return (ipif); 21673 } else { 21674 *err = IPIF_LOOKUP_FAILED; 21675 } 21676 mutex_exit(&ill->ill_lock); 21677 } 21678 mutex_exit(&connp->conn_lock); 21679 rw_exit(&ipst->ips_ill_g_lock); 21680 return (NULL); 21681 } 21682 21683 ill_t * 21684 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 21685 { 21686 ill_t *ill; 21687 21688 *err = 0; 21689 mutex_enter(&connp->conn_lock); 21690 ill = *illp; 21691 if (ill != NULL) { 21692 mutex_enter(&ill->ill_lock); 21693 if (ILL_CAN_LOOKUP(ill)) { 21694 ill_refhold_locked(ill); 21695 mutex_exit(&ill->ill_lock); 21696 mutex_exit(&connp->conn_lock); 21697 return (ill); 21698 } else { 21699 *err = ILL_LOOKUP_FAILED; 21700 } 21701 mutex_exit(&ill->ill_lock); 21702 } 21703 mutex_exit(&connp->conn_lock); 21704 return (NULL); 21705 } 21706 21707 static int 21708 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 21709 { 21710 ill_t *ill; 21711 21712 ill = ipif->ipif_ill; 21713 mutex_enter(&connp->conn_lock); 21714 mutex_enter(&ill->ill_lock); 21715 if (IPIF_CAN_LOOKUP(ipif)) { 21716 *ipifp = ipif; 21717 mutex_exit(&ill->ill_lock); 21718 mutex_exit(&connp->conn_lock); 21719 return (0); 21720 } 21721 mutex_exit(&ill->ill_lock); 21722 mutex_exit(&connp->conn_lock); 21723 return (IPIF_LOOKUP_FAILED); 21724 } 21725 21726 /* 21727 * This is called if the outbound datagram needs fragmentation. 
21728 * 21729 * NOTE : This function does not ire_refrele the ire argument passed in. 21730 */ 21731 static void 21732 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, 21733 ip_stack_t *ipst) 21734 { 21735 ipha_t *ipha; 21736 mblk_t *mp; 21737 uint32_t v_hlen_tos_len; 21738 uint32_t max_frag; 21739 uint32_t frag_flag; 21740 boolean_t dont_use; 21741 21742 if (ipsec_mp->b_datap->db_type == M_CTL) { 21743 mp = ipsec_mp->b_cont; 21744 } else { 21745 mp = ipsec_mp; 21746 } 21747 21748 ipha = (ipha_t *)mp->b_rptr; 21749 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 21750 21751 #ifdef _BIG_ENDIAN 21752 #define V_HLEN (v_hlen_tos_len >> 24) 21753 #define LENGTH (v_hlen_tos_len & 0xFFFF) 21754 #else 21755 #define V_HLEN (v_hlen_tos_len & 0xFF) 21756 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 21757 #endif 21758 21759 #ifndef SPEED_BEFORE_SAFETY 21760 /* 21761 * Check that ipha_length is consistent with 21762 * the mblk length 21763 */ 21764 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 21765 ip0dbg(("Packet length mismatch: %d, %ld\n", 21766 LENGTH, msgdsize(mp))); 21767 freemsg(ipsec_mp); 21768 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21769 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 21770 "packet length mismatch"); 21771 return; 21772 } 21773 #endif 21774 /* 21775 * Don't use frag_flag if pre-built packet or source 21776 * routed or if multicast (since multicast packets do not solicit 21777 * ICMP "packet too big" messages). Get the values of 21778 * max_frag and frag_flag atomically by acquiring the 21779 * ire_lock. 21780 */ 21781 mutex_enter(&ire->ire_lock); 21782 max_frag = ire->ire_max_frag; 21783 frag_flag = ire->ire_frag_flag; 21784 mutex_exit(&ire->ire_lock); 21785 21786 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 21787 (V_HLEN != IP_SIMPLE_HDR_VERSION && 21788 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 21789 21790 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 21791 (dont_use ? 
0 : frag_flag), zoneid, ipst); 21792 } 21793 21794 /* 21795 * Used for deciding the MSS size for the upper layer. Thus 21796 * we need to check the outbound policy values in the conn. 21797 */ 21798 int 21799 conn_ipsec_length(conn_t *connp) 21800 { 21801 ipsec_latch_t *ipl; 21802 21803 ipl = connp->conn_latch; 21804 if (ipl == NULL) 21805 return (0); 21806 21807 if (ipl->ipl_out_policy == NULL) 21808 return (0); 21809 21810 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 21811 } 21812 21813 /* 21814 * Returns an estimate of the IPsec headers size. This is used if 21815 * we don't want to call into IPsec to get the exact size. 21816 */ 21817 int 21818 ipsec_out_extra_length(mblk_t *ipsec_mp) 21819 { 21820 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 21821 ipsec_action_t *a; 21822 21823 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21824 if (!io->ipsec_out_secure) 21825 return (0); 21826 21827 a = io->ipsec_out_act; 21828 21829 if (a == NULL) { 21830 ASSERT(io->ipsec_out_policy != NULL); 21831 a = io->ipsec_out_policy->ipsp_act; 21832 } 21833 ASSERT(a != NULL); 21834 21835 return (a->ipa_ovhd); 21836 } 21837 21838 /* 21839 * Returns an estimate of the IPsec headers size. This is used if 21840 * we don't want to call into IPsec to get the exact size. 21841 */ 21842 int 21843 ipsec_in_extra_length(mblk_t *ipsec_mp) 21844 { 21845 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21846 ipsec_action_t *a; 21847 21848 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21849 21850 a = ii->ipsec_in_action; 21851 return (a == NULL ? 0 : a->ipa_ovhd); 21852 } 21853 21854 /* 21855 * If there are any source route options, return the true final 21856 * destination. Otherwise, return the destination. 
21857 */ 21858 ipaddr_t 21859 ip_get_dst(ipha_t *ipha) 21860 { 21861 ipoptp_t opts; 21862 uchar_t *opt; 21863 uint8_t optval; 21864 uint8_t optlen; 21865 ipaddr_t dst; 21866 uint32_t off; 21867 21868 dst = ipha->ipha_dst; 21869 21870 if (IS_SIMPLE_IPH(ipha)) 21871 return (dst); 21872 21873 for (optval = ipoptp_first(&opts, ipha); 21874 optval != IPOPT_EOL; 21875 optval = ipoptp_next(&opts)) { 21876 opt = opts.ipoptp_cur; 21877 optlen = opts.ipoptp_len; 21878 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 21879 switch (optval) { 21880 case IPOPT_SSRR: 21881 case IPOPT_LSRR: 21882 off = opt[IPOPT_OFFSET]; 21883 /* 21884 * If one of the conditions is true, it means 21885 * end of options and dst already has the right 21886 * value. 21887 */ 21888 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 21889 off = optlen - IP_ADDR_LEN; 21890 bcopy(&opt[off], &dst, IP_ADDR_LEN); 21891 } 21892 return (dst); 21893 default: 21894 break; 21895 } 21896 } 21897 21898 return (dst); 21899 } 21900 21901 mblk_t * 21902 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 21903 conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) 21904 { 21905 ipsec_out_t *io; 21906 mblk_t *first_mp; 21907 boolean_t policy_present; 21908 ip_stack_t *ipst; 21909 ipsec_stack_t *ipss; 21910 21911 ASSERT(ire != NULL); 21912 ipst = ire->ire_ipst; 21913 ipss = ipst->ips_netstack->netstack_ipsec; 21914 21915 first_mp = mp; 21916 if (mp->b_datap->db_type == M_CTL) { 21917 io = (ipsec_out_t *)first_mp->b_rptr; 21918 /* 21919 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 21920 * 21921 * 1) There is per-socket policy (including cached global 21922 * policy) or a policy on the IP-in-IP tunnel. 21923 * 2) There is no per-socket policy, but it is 21924 * a multicast packet that needs to go out 21925 * on a specific interface. This is the case 21926 * where (ip_wput and ip_wput_multicast) attaches 21927 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 
21928 * 21929 * In case (2) we check with global policy to 21930 * see if there is a match and set the ill_index 21931 * appropriately so that we can lookup the ire 21932 * properly in ip_wput_ipsec_out. 21933 */ 21934 21935 /* 21936 * ipsec_out_use_global_policy is set to B_FALSE 21937 * in ipsec_in_to_out(). Refer to that function for 21938 * details. 21939 */ 21940 if ((io->ipsec_out_latch == NULL) && 21941 (io->ipsec_out_use_global_policy)) { 21942 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 21943 ire, connp, unspec_src, zoneid)); 21944 } 21945 if (!io->ipsec_out_secure) { 21946 /* 21947 * If this is not a secure packet, drop 21948 * the IPSEC_OUT mp and treat it as a clear 21949 * packet. This happens when we are sending 21950 * a ICMP reply back to a clear packet. See 21951 * ipsec_in_to_out() for details. 21952 */ 21953 mp = first_mp->b_cont; 21954 freeb(first_mp); 21955 } 21956 return (mp); 21957 } 21958 /* 21959 * See whether we need to attach a global policy here. We 21960 * don't depend on the conn (as it could be null) for deciding 21961 * what policy this datagram should go through because it 21962 * should have happened in ip_wput if there was some 21963 * policy. This normally happens for connections which are not 21964 * fully bound preventing us from caching policies in 21965 * ip_bind. Packets coming from the TCP listener/global queue 21966 * - which are non-hard_bound - could also be affected by 21967 * applying policy here. 21968 * 21969 * If this packet is coming from tcp global queue or listener, 21970 * we will be applying policy here. This may not be *right* 21971 * if these packets are coming from the detached connection as 21972 * it could have gone in clear before. This happens only if a 21973 * TCP connection started when there is no policy and somebody 21974 * added policy before it became detached. 
Thus packets of the 21975 * detached connection could go out secure and the other end 21976 * would drop it because it will be expecting in clear. The 21977 * converse is not true i.e if somebody starts a TCP 21978 * connection and deletes the policy, all the packets will 21979 * still go out with the policy that existed before deleting 21980 * because ip_unbind sends up policy information which is used 21981 * by TCP on subsequent ip_wputs. The right solution is to fix 21982 * TCP to attach a dummy IPSEC_OUT and set 21983 * ipsec_out_use_global_policy to B_FALSE. As this might 21984 * affect performance for normal cases, we are not doing it. 21985 * Thus, set policy before starting any TCP connections. 21986 * 21987 * NOTE - We might apply policy even for a hard bound connection 21988 * - for which we cached policy in ip_bind - if somebody added 21989 * global policy after we inherited the policy in ip_bind. 21990 * This means that the packets that were going out in clear 21991 * previously would start going secure and hence get dropped 21992 * on the other side. To fix this, TCP attaches a dummy 21993 * ipsec_out and make sure that we don't apply global policy. 
21994 */ 21995 if (ipha != NULL) 21996 policy_present = ipss->ipsec_outbound_v4_policy_present; 21997 else 21998 policy_present = ipss->ipsec_outbound_v6_policy_present; 21999 if (!policy_present) 22000 return (mp); 22001 22002 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, 22003 zoneid)); 22004 } 22005 22006 ire_t * 22007 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 22008 { 22009 ipaddr_t addr; 22010 ire_t *save_ire; 22011 irb_t *irb; 22012 ill_group_t *illgrp; 22013 int err; 22014 22015 save_ire = ire; 22016 addr = ire->ire_addr; 22017 22018 ASSERT(ire->ire_type == IRE_BROADCAST); 22019 22020 illgrp = connp->conn_outgoing_ill->ill_group; 22021 if (illgrp == NULL) { 22022 *conn_outgoing_ill = conn_get_held_ill(connp, 22023 &connp->conn_outgoing_ill, &err); 22024 if (err == ILL_LOOKUP_FAILED) { 22025 ire_refrele(save_ire); 22026 return (NULL); 22027 } 22028 return (save_ire); 22029 } 22030 /* 22031 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 22032 * If it is part of the group, we need to send on the ire 22033 * that has been cleared of IRE_MARK_NORECV and that belongs 22034 * to this group. This is okay as IP_BOUND_IF really means 22035 * any ill in the group. We depend on the fact that the 22036 * first ire in the group is always cleared of IRE_MARK_NORECV 22037 * if such an ire exists. This is possible only if you have 22038 * at least one ill in the group that has not failed. 22039 * 22040 * First get to the ire that matches the address and group. 22041 * 22042 * We don't look for an ire with a matching zoneid because a given zone 22043 * won't always have broadcast ires on all ills in the group. 
22044 */ 22045 irb = ire->ire_bucket; 22046 rw_enter(&irb->irb_lock, RW_READER); 22047 if (ire->ire_marks & IRE_MARK_NORECV) { 22048 /* 22049 * If the current zone only has an ire broadcast for this 22050 * address marked NORECV, the ire we want is ahead in the 22051 * bucket, so we look it up deliberately ignoring the zoneid. 22052 */ 22053 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 22054 if (ire->ire_addr != addr) 22055 continue; 22056 /* skip over deleted ires */ 22057 if (ire->ire_marks & IRE_MARK_CONDEMNED) 22058 continue; 22059 } 22060 } 22061 while (ire != NULL) { 22062 /* 22063 * If a new interface is coming up, we could end up 22064 * seeing the loopback ire and the non-loopback ire 22065 * may not have been added yet. So check for ire_stq 22066 */ 22067 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 22068 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 22069 break; 22070 } 22071 ire = ire->ire_next; 22072 } 22073 if (ire != NULL && ire->ire_addr == addr && 22074 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 22075 IRE_REFHOLD(ire); 22076 rw_exit(&irb->irb_lock); 22077 ire_refrele(save_ire); 22078 *conn_outgoing_ill = ire_to_ill(ire); 22079 /* 22080 * Refhold the ill to make the conn_outgoing_ill 22081 * independent of the ire. ip_wput_ire goes in a loop 22082 * and may refrele the ire. Since we have an ire at this 22083 * point we don't need to use ILL_CAN_LOOKUP on the ill. 22084 */ 22085 ill_refhold(*conn_outgoing_ill); 22086 return (ire); 22087 } 22088 rw_exit(&irb->irb_lock); 22089 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 22090 /* 22091 * If we can't find a suitable ire, return the original ire. 22092 */ 22093 return (save_ire); 22094 } 22095 22096 /* 22097 * This function does the ire_refrele of the ire passed in as the 22098 * argument. As this function looks up more ires i.e broadcast ires, 22099 * it needs to REFRELE them. 
Currently, for simplicity we don't 22100 * differentiate the one passed in and looked up here. We always 22101 * REFRELE. 22102 * IPQoS Notes: 22103 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 22104 * IPsec packets are done in ipsec_out_process. 22105 * 22106 */ 22107 void 22108 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, 22109 zoneid_t zoneid) 22110 { 22111 ipha_t *ipha; 22112 #define rptr ((uchar_t *)ipha) 22113 queue_t *stq; 22114 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 22115 uint32_t v_hlen_tos_len; 22116 uint32_t ttl_protocol; 22117 ipaddr_t src; 22118 ipaddr_t dst; 22119 uint32_t cksum; 22120 ipaddr_t orig_src; 22121 ire_t *ire1; 22122 mblk_t *next_mp; 22123 uint_t hlen; 22124 uint16_t *up; 22125 uint32_t max_frag = ire->ire_max_frag; 22126 ill_t *ill = ire_to_ill(ire); 22127 int clusterwide; 22128 uint16_t ip_hdr_included; /* IP header included by ULP? */ 22129 int ipsec_len; 22130 mblk_t *first_mp; 22131 ipsec_out_t *io; 22132 boolean_t conn_dontroute; /* conn value for multicast */ 22133 boolean_t conn_multicast_loop; /* conn value for multicast */ 22134 boolean_t multicast_forward; /* Should we forward ? */ 22135 boolean_t unspec_src; 22136 ill_t *conn_outgoing_ill = NULL; 22137 ill_t *ire_ill; 22138 ill_t *ire1_ill; 22139 ill_t *out_ill; 22140 uint32_t ill_index = 0; 22141 boolean_t multirt_send = B_FALSE; 22142 int err; 22143 ipxmit_state_t pktxmit_state; 22144 ip_stack_t *ipst = ire->ire_ipst; 22145 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 22146 22147 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 22148 "ip_wput_ire_start: q %p", q); 22149 22150 multicast_forward = B_FALSE; 22151 unspec_src = (connp != NULL && connp->conn_unspec_src); 22152 22153 if (ire->ire_flags & RTF_MULTIRT) { 22154 /* 22155 * Multirouting case. The bucket where ire is stored 22156 * probably holds other RTF_MULTIRT flagged ire 22157 * to the destination. 
In this call to ip_wput_ire, 22158 * we attempt to send the packet through all 22159 * those ires. Thus, we first ensure that ire is the 22160 * first RTF_MULTIRT ire in the bucket, 22161 * before walking the ire list. 22162 */ 22163 ire_t *first_ire; 22164 irb_t *irb = ire->ire_bucket; 22165 ASSERT(irb != NULL); 22166 22167 /* Make sure we do not omit any multiroute ire. */ 22168 IRB_REFHOLD(irb); 22169 for (first_ire = irb->irb_ire; 22170 first_ire != NULL; 22171 first_ire = first_ire->ire_next) { 22172 if ((first_ire->ire_flags & RTF_MULTIRT) && 22173 (first_ire->ire_addr == ire->ire_addr) && 22174 !(first_ire->ire_marks & 22175 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 22176 break; 22177 } 22178 } 22179 22180 if ((first_ire != NULL) && (first_ire != ire)) { 22181 IRE_REFHOLD(first_ire); 22182 ire_refrele(ire); 22183 ire = first_ire; 22184 ill = ire_to_ill(ire); 22185 } 22186 IRB_REFRELE(irb); 22187 } 22188 22189 /* 22190 * conn_outgoing_ill variable is used only in the broadcast loop. 22191 * for performance we don't grab the mutexs in the fastpath 22192 */ 22193 if ((connp != NULL) && 22194 (ire->ire_type == IRE_BROADCAST) && 22195 ((connp->conn_nofailover_ill != NULL) || 22196 (connp->conn_outgoing_ill != NULL))) { 22197 /* 22198 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 22199 * option. So, see if this endpoint is bound to a 22200 * IPIF_NOFAILOVER address. If so, honor it. This implies 22201 * that if the interface is failed, we will still send 22202 * the packet on the same ill which is what we want. 22203 */ 22204 conn_outgoing_ill = conn_get_held_ill(connp, 22205 &connp->conn_nofailover_ill, &err); 22206 if (err == ILL_LOOKUP_FAILED) { 22207 ire_refrele(ire); 22208 freemsg(mp); 22209 return; 22210 } 22211 if (conn_outgoing_ill == NULL) { 22212 /* 22213 * Choose a good ill in the group to send the 22214 * packets on. 
22215 */ 22216 ire = conn_set_outgoing_ill(connp, ire, 22217 &conn_outgoing_ill); 22218 if (ire == NULL) { 22219 freemsg(mp); 22220 return; 22221 } 22222 } 22223 } 22224 22225 if (mp->b_datap->db_type != M_CTL) { 22226 ipha = (ipha_t *)mp->b_rptr; 22227 } else { 22228 io = (ipsec_out_t *)mp->b_rptr; 22229 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22230 ASSERT(zoneid == io->ipsec_out_zoneid); 22231 ASSERT(zoneid != ALL_ZONES); 22232 ipha = (ipha_t *)mp->b_cont->b_rptr; 22233 dst = ipha->ipha_dst; 22234 /* 22235 * For the multicast case, ipsec_out carries conn_dontroute and 22236 * conn_multicast_loop as conn may not be available here. We 22237 * need this for multicast loopback and forwarding which is done 22238 * later in the code. 22239 */ 22240 if (CLASSD(dst)) { 22241 conn_dontroute = io->ipsec_out_dontroute; 22242 conn_multicast_loop = io->ipsec_out_multicast_loop; 22243 /* 22244 * If conn_dontroute is not set or conn_multicast_loop 22245 * is set, we need to do forwarding/loopback. For 22246 * datagrams from ip_wput_multicast, conn_dontroute is 22247 * set to B_TRUE and conn_multicast_loop is set to 22248 * B_FALSE so that we neither do forwarding nor 22249 * loopback. 22250 */ 22251 if (!conn_dontroute || conn_multicast_loop) 22252 multicast_forward = B_TRUE; 22253 } 22254 } 22255 22256 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 22257 ire->ire_zoneid != ALL_ZONES) { 22258 /* 22259 * When a zone sends a packet to another zone, we try to deliver 22260 * the packet under the same conditions as if the destination 22261 * was a real node on the network. To do so, we look for a 22262 * matching route in the forwarding table. 22263 * RTF_REJECT and RTF_BLACKHOLE are handled just like 22264 * ip_newroute() does. 22265 * Note that IRE_LOCAL are special, since they are used 22266 * when the zoneid doesn't match in some cases. 
This means that 22267 * we need to handle ipha_src differently since ire_src_addr 22268 * belongs to the receiving zone instead of the sending zone. 22269 * When ip_restrict_interzone_loopback is set, then 22270 * ire_cache_lookup() ensures that IRE_LOCAL are only used 22271 * for loopback between zones when the logical "Ethernet" would 22272 * have looped them back. 22273 */ 22274 ire_t *src_ire; 22275 22276 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 22277 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 22278 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); 22279 if (src_ire != NULL && 22280 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && 22281 (!ipst->ips_ip_restrict_interzone_loopback || 22282 ire_local_same_ill_group(ire, src_ire))) { 22283 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 22284 ipha->ipha_src = src_ire->ire_src_addr; 22285 ire_refrele(src_ire); 22286 } else { 22287 ire_refrele(ire); 22288 if (conn_outgoing_ill != NULL) 22289 ill_refrele(conn_outgoing_ill); 22290 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 22291 if (src_ire != NULL) { 22292 if (src_ire->ire_flags & RTF_BLACKHOLE) { 22293 ire_refrele(src_ire); 22294 freemsg(mp); 22295 return; 22296 } 22297 ire_refrele(src_ire); 22298 } 22299 if (ip_hdr_complete(ipha, zoneid, ipst)) { 22300 /* Failed */ 22301 freemsg(mp); 22302 return; 22303 } 22304 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, 22305 ipst); 22306 return; 22307 } 22308 } 22309 22310 if (mp->b_datap->db_type == M_CTL || 22311 ipss->ipsec_outbound_v4_policy_present) { 22312 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 22313 unspec_src, zoneid); 22314 if (mp == NULL) { 22315 ire_refrele(ire); 22316 if (conn_outgoing_ill != NULL) 22317 ill_refrele(conn_outgoing_ill); 22318 return; 22319 } 22320 /* 22321 * Trusted Extensions supports all-zones interfaces, so 22322 * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to 22323 * the global zone. 
22324 */ 22325 if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { 22326 io = (ipsec_out_t *)mp->b_rptr; 22327 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22328 zoneid = io->ipsec_out_zoneid; 22329 } 22330 } 22331 22332 first_mp = mp; 22333 ipsec_len = 0; 22334 22335 if (first_mp->b_datap->db_type == M_CTL) { 22336 io = (ipsec_out_t *)first_mp->b_rptr; 22337 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22338 mp = first_mp->b_cont; 22339 ipsec_len = ipsec_out_extra_length(first_mp); 22340 ASSERT(ipsec_len >= 0); 22341 /* We already picked up the zoneid from the M_CTL above */ 22342 ASSERT(zoneid == io->ipsec_out_zoneid); 22343 ASSERT(zoneid != ALL_ZONES); 22344 22345 /* 22346 * Drop M_CTL here if IPsec processing is not needed. 22347 * (Non-IPsec use of M_CTL extracted any information it 22348 * needed above). 22349 */ 22350 if (ipsec_len == 0) { 22351 freeb(first_mp); 22352 first_mp = mp; 22353 } 22354 } 22355 22356 /* 22357 * Fast path for ip_wput_ire 22358 */ 22359 22360 ipha = (ipha_t *)mp->b_rptr; 22361 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22362 dst = ipha->ipha_dst; 22363 22364 /* 22365 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 22366 * if the socket is a SOCK_RAW type. The transport checksum should 22367 * be provided in the pre-built packet, so we don't need to compute it. 22368 * Also, other application set flags, like DF, should not be altered. 22369 * Other transport MUST pass down zero. 
22370 */ 22371 ip_hdr_included = ipha->ipha_ident; 22372 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 22373 22374 if (CLASSD(dst)) { 22375 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 22376 ntohl(dst), 22377 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 22378 ntohl(ire->ire_addr))); 22379 } 22380 22381 /* Macros to extract header fields from data already in registers */ 22382 #ifdef _BIG_ENDIAN 22383 #define V_HLEN (v_hlen_tos_len >> 24) 22384 #define LENGTH (v_hlen_tos_len & 0xFFFF) 22385 #define PROTO (ttl_protocol & 0xFF) 22386 #else 22387 #define V_HLEN (v_hlen_tos_len & 0xFF) 22388 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 22389 #define PROTO (ttl_protocol >> 8) 22390 #endif 22391 22392 22393 orig_src = src = ipha->ipha_src; 22394 /* (The loop back to "another" is explained down below.) */ 22395 another:; 22396 /* 22397 * Assign an ident value for this packet. We assign idents on 22398 * a per destination basis out of the IRE. There could be 22399 * other threads targeting the same destination, so we have to 22400 * arrange for a atomic increment. Note that we use a 32-bit 22401 * atomic add because it has better performance than its 22402 * 16-bit sibling. 22403 * 22404 * If running in cluster mode and if the source address 22405 * belongs to a replicated service then vector through 22406 * cl_inet_ipident vector to allocate ip identifier 22407 * NOTE: This is a contract private interface with the 22408 * clustering group. 
22409 */ 22410 clusterwide = 0; 22411 if (cl_inet_ipident) { 22412 ASSERT(cl_inet_isclusterwide); 22413 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 22414 AF_INET, (uint8_t *)(uintptr_t)src)) { 22415 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 22416 AF_INET, (uint8_t *)(uintptr_t)src, 22417 (uint8_t *)(uintptr_t)dst); 22418 clusterwide = 1; 22419 } 22420 } 22421 if (!clusterwide) { 22422 ipha->ipha_ident = 22423 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 22424 } 22425 22426 #ifndef _BIG_ENDIAN 22427 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 22428 #endif 22429 22430 /* 22431 * Set source address unless sent on an ill or conn_unspec_src is set. 22432 * This is needed to obey conn_unspec_src when packets go through 22433 * ip_newroute + arp. 22434 * Assumes ip_newroute{,_multi} sets the source address as well. 22435 */ 22436 if (src == INADDR_ANY && !unspec_src) { 22437 /* 22438 * Assign the appropriate source address from the IRE if none 22439 * was specified. 22440 */ 22441 ASSERT(ire->ire_ipversion == IPV4_VERSION); 22442 22443 /* 22444 * With IP multipathing, broadcast packets are sent on the ire 22445 * that has been cleared of IRE_MARK_NORECV and that belongs to 22446 * the group. However, this ire might not be in the same zone so 22447 * we can't always use its source address. We look for a 22448 * broadcast ire in the same group and in the right zone. 
22449 */ 22450 if (ire->ire_type == IRE_BROADCAST && 22451 ire->ire_zoneid != zoneid) { 22452 ire_t *src_ire = ire_ctable_lookup(dst, 0, 22453 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 22454 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 22455 if (src_ire != NULL) { 22456 src = src_ire->ire_src_addr; 22457 ire_refrele(src_ire); 22458 } else { 22459 ire_refrele(ire); 22460 if (conn_outgoing_ill != NULL) 22461 ill_refrele(conn_outgoing_ill); 22462 freemsg(first_mp); 22463 if (ill != NULL) { 22464 BUMP_MIB(ill->ill_ip_mib, 22465 ipIfStatsOutDiscards); 22466 } else { 22467 BUMP_MIB(&ipst->ips_ip_mib, 22468 ipIfStatsOutDiscards); 22469 } 22470 return; 22471 } 22472 } else { 22473 src = ire->ire_src_addr; 22474 } 22475 22476 if (connp == NULL) { 22477 ip1dbg(("ip_wput_ire: no connp and no src " 22478 "address for dst 0x%x, using src 0x%x\n", 22479 ntohl(dst), 22480 ntohl(src))); 22481 } 22482 ipha->ipha_src = src; 22483 } 22484 stq = ire->ire_stq; 22485 22486 /* 22487 * We only allow ire chains for broadcasts since there will 22488 * be multiple IRE_CACHE entries for the same multicast 22489 * address (one per ipif). 22490 */ 22491 next_mp = NULL; 22492 22493 /* broadcast packet */ 22494 if (ire->ire_type == IRE_BROADCAST) 22495 goto broadcast; 22496 22497 /* loopback ? 
*/ 22498 if (stq == NULL) 22499 goto nullstq; 22500 22501 /* The ill_index for outbound ILL */ 22502 ill_index = Q_TO_INDEX(stq); 22503 22504 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 22505 ttl_protocol = ((uint16_t *)ipha)[4]; 22506 22507 /* pseudo checksum (do it in parts for IP header checksum) */ 22508 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 22509 22510 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 22511 queue_t *dev_q = stq->q_next; 22512 22513 /* flow controlled */ 22514 if ((dev_q->q_next || dev_q->q_first) && 22515 !canput(dev_q)) 22516 goto blocked; 22517 if ((PROTO == IPPROTO_UDP) && 22518 (ip_hdr_included != IP_HDR_INCLUDED)) { 22519 hlen = (V_HLEN & 0xF) << 2; 22520 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22521 if (*up != 0) { 22522 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 22523 hlen, LENGTH, max_frag, ipsec_len, cksum); 22524 /* Software checksum? */ 22525 if (DB_CKSUMFLAGS(mp) == 0) { 22526 IP_STAT(ipst, ip_out_sw_cksum); 22527 IP_STAT_UPDATE(ipst, 22528 ip_udp_out_sw_cksum_bytes, 22529 LENGTH - hlen); 22530 } 22531 } 22532 } 22533 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 22534 hlen = (V_HLEN & 0xF) << 2; 22535 if (PROTO == IPPROTO_TCP) { 22536 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22537 /* 22538 * The packet header is processed once and for all, even 22539 * in the multirouting case. We disable hardware 22540 * checksum if the packet is multirouted, as it will be 22541 * replicated via several interfaces, and not all of 22542 * them may have this capability. 22543 */ 22544 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 22545 LENGTH, max_frag, ipsec_len, cksum); 22546 /* Software checksum? 
*/ 22547 if (DB_CKSUMFLAGS(mp) == 0) { 22548 IP_STAT(ipst, ip_out_sw_cksum); 22549 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22550 LENGTH - hlen); 22551 } 22552 } else { 22553 sctp_hdr_t *sctph; 22554 22555 ASSERT(PROTO == IPPROTO_SCTP); 22556 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22557 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22558 /* 22559 * Zero out the checksum field to ensure proper 22560 * checksum calculation. 22561 */ 22562 sctph->sh_chksum = 0; 22563 #ifdef DEBUG 22564 if (!skip_sctp_cksum) 22565 #endif 22566 sctph->sh_chksum = sctp_cksum(mp, hlen); 22567 } 22568 } 22569 22570 /* 22571 * If this is a multicast packet and originated from ip_wput 22572 * we need to do loopback and forwarding checks. If it comes 22573 * from ip_wput_multicast, we SHOULD not do this. 22574 */ 22575 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 22576 22577 /* checksum */ 22578 cksum += ttl_protocol; 22579 22580 /* fragment the packet */ 22581 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 22582 goto fragmentit; 22583 /* 22584 * Don't use frag_flag if packet is pre-built or source 22585 * routed or if multicast (since multicast packets do 22586 * not solicit ICMP "packet too big" messages). 
22587 */ 22588 if ((ip_hdr_included != IP_HDR_INCLUDED) && 22589 (V_HLEN == IP_SIMPLE_HDR_VERSION || 22590 !ip_source_route_included(ipha)) && 22591 !CLASSD(ipha->ipha_dst)) 22592 ipha->ipha_fragment_offset_and_flags |= 22593 htons(ire->ire_frag_flag); 22594 22595 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 22596 /* calculate IP header checksum */ 22597 cksum += ipha->ipha_ident; 22598 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 22599 cksum += ipha->ipha_fragment_offset_and_flags; 22600 22601 /* IP options present */ 22602 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 22603 if (hlen) 22604 goto checksumoptions; 22605 22606 /* calculate hdr checksum */ 22607 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 22608 cksum = ~(cksum + (cksum >> 16)); 22609 ipha->ipha_hdr_checksum = (uint16_t)cksum; 22610 } 22611 if (ipsec_len != 0) { 22612 /* 22613 * We will do the rest of the processing after 22614 * we come back from IPsec in ip_wput_ipsec_out(). 22615 */ 22616 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 22617 22618 io = (ipsec_out_t *)first_mp->b_rptr; 22619 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 22620 ill_phyint->phyint_ifindex; 22621 22622 ipsec_out_process(q, first_mp, ire, ill_index); 22623 ire_refrele(ire); 22624 if (conn_outgoing_ill != NULL) 22625 ill_refrele(conn_outgoing_ill); 22626 return; 22627 } 22628 22629 /* 22630 * In most cases, the emission loop below is entered only 22631 * once. Only in the case where the ire holds the 22632 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 22633 * flagged ires in the bucket, and send the packet 22634 * through all crossed RTF_MULTIRT routes. 22635 */ 22636 if (ire->ire_flags & RTF_MULTIRT) { 22637 multirt_send = B_TRUE; 22638 } 22639 do { 22640 if (multirt_send) { 22641 irb_t *irb; 22642 /* 22643 * We are in a multiple send case, need to get 22644 * the next ire and make a duplicate of the packet. 22645 * ire1 holds here the next ire to process in the 22646 * bucket. 
If multirouting is expected, 22647 * any non-RTF_MULTIRT ire that has the 22648 * right destination address is ignored. 22649 */ 22650 irb = ire->ire_bucket; 22651 ASSERT(irb != NULL); 22652 22653 IRB_REFHOLD(irb); 22654 for (ire1 = ire->ire_next; 22655 ire1 != NULL; 22656 ire1 = ire1->ire_next) { 22657 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22658 continue; 22659 if (ire1->ire_addr != ire->ire_addr) 22660 continue; 22661 if (ire1->ire_marks & 22662 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22663 continue; 22664 22665 /* Got one */ 22666 IRE_REFHOLD(ire1); 22667 break; 22668 } 22669 IRB_REFRELE(irb); 22670 22671 if (ire1 != NULL) { 22672 next_mp = copyb(mp); 22673 if ((next_mp == NULL) || 22674 ((mp->b_cont != NULL) && 22675 ((next_mp->b_cont = 22676 dupmsg(mp->b_cont)) == NULL))) { 22677 freemsg(next_mp); 22678 next_mp = NULL; 22679 ire_refrele(ire1); 22680 ire1 = NULL; 22681 } 22682 } 22683 22684 /* Last multiroute ire; don't loop anymore. */ 22685 if (ire1 == NULL) { 22686 multirt_send = B_FALSE; 22687 } 22688 } 22689 22690 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 22691 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, 22692 mblk_t *, mp); 22693 FW_HOOKS(ipst->ips_ip4_physical_out_event, 22694 ipst->ips_ipv4firewall_physical_out, 22695 NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); 22696 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 22697 if (mp == NULL) 22698 goto release_ire_and_ill; 22699 22700 mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); 22701 DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); 22702 pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE); 22703 if ((pktxmit_state == SEND_FAILED) || 22704 (pktxmit_state == LLHDR_RESLV_FAILED)) { 22705 ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" 22706 "- packet dropped\n")); 22707 release_ire_and_ill: 22708 ire_refrele(ire); 22709 if (next_mp != NULL) { 22710 freemsg(next_mp); 22711 ire_refrele(ire1); 22712 } 22713 if (conn_outgoing_ill != NULL) 22714 ill_refrele(conn_outgoing_ill); 
22715 return; 22716 } 22717 22718 if (CLASSD(dst)) { 22719 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); 22720 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, 22721 LENGTH); 22722 } 22723 22724 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22725 "ip_wput_ire_end: q %p (%S)", 22726 q, "last copy out"); 22727 IRE_REFRELE(ire); 22728 22729 if (multirt_send) { 22730 ASSERT(ire1); 22731 /* 22732 * Proceed with the next RTF_MULTIRT ire, 22733 * Also set up the send-to queue accordingly. 22734 */ 22735 ire = ire1; 22736 ire1 = NULL; 22737 stq = ire->ire_stq; 22738 mp = next_mp; 22739 next_mp = NULL; 22740 ipha = (ipha_t *)mp->b_rptr; 22741 ill_index = Q_TO_INDEX(stq); 22742 ill = (ill_t *)stq->q_ptr; 22743 } 22744 } while (multirt_send); 22745 if (conn_outgoing_ill != NULL) 22746 ill_refrele(conn_outgoing_ill); 22747 return; 22748 22749 /* 22750 * ire->ire_type == IRE_BROADCAST (minimize diffs) 22751 */ 22752 broadcast: 22753 { 22754 /* 22755 * To avoid broadcast storms, we usually set the TTL to 1 for 22756 * broadcasts. However, if SO_DONTROUTE isn't set, this value 22757 * can be overridden stack-wide through the ip_broadcast_ttl 22758 * ndd tunable, or on a per-connection basis through the 22759 * IP_BROADCAST_TTL socket option. 22760 * 22761 * In the event that we are replying to incoming ICMP packets, 22762 * connp could be NULL. 22763 */ 22764 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 22765 if (connp != NULL) { 22766 if (connp->conn_dontroute) 22767 ipha->ipha_ttl = 1; 22768 else if (connp->conn_broadcast_ttl != 0) 22769 ipha->ipha_ttl = connp->conn_broadcast_ttl; 22770 } 22771 22772 /* 22773 * Note that we are not doing a IRB_REFHOLD here. 22774 * Actually we don't care if the list changes i.e 22775 * if somebody deletes an IRE from the list while 22776 * we drop the lock, the next time we come around 22777 * ire_next will be NULL and hence we won't send 22778 * out multiple copies which is fine. 
22779 */ 22780 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 22781 ire1 = ire->ire_next; 22782 if (conn_outgoing_ill != NULL) { 22783 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 22784 ASSERT(ire1 == ire->ire_next); 22785 if (ire1 != NULL && ire1->ire_addr == dst) { 22786 ire_refrele(ire); 22787 ire = ire1; 22788 IRE_REFHOLD(ire); 22789 ire1 = ire->ire_next; 22790 continue; 22791 } 22792 rw_exit(&ire->ire_bucket->irb_lock); 22793 /* Did not find a matching ill */ 22794 ip1dbg(("ip_wput_ire: broadcast with no " 22795 "matching IP_BOUND_IF ill %s dst %x\n", 22796 conn_outgoing_ill->ill_name, dst)); 22797 freemsg(first_mp); 22798 if (ire != NULL) 22799 ire_refrele(ire); 22800 ill_refrele(conn_outgoing_ill); 22801 return; 22802 } 22803 } else if (ire1 != NULL && ire1->ire_addr == dst) { 22804 /* 22805 * If the next IRE has the same address and is not one 22806 * of the two copies that we need to send, try to see 22807 * whether this copy should be sent at all. This 22808 * assumes that we insert loopbacks first and then 22809 * non-loopbacks. This is acheived by inserting the 22810 * loopback always before non-loopback. 22811 * This is used to send a single copy of a broadcast 22812 * packet out all physical interfaces that have an 22813 * matching IRE_BROADCAST while also looping 22814 * back one copy (to ip_wput_local) for each 22815 * matching physical interface. However, we avoid 22816 * sending packets out different logical that match by 22817 * having ipif_up/ipif_down supress duplicate 22818 * IRE_BROADCASTS. 22819 * 22820 * This feature is currently used to get broadcasts 22821 * sent to multiple interfaces, when the broadcast 22822 * address being used applies to multiple interfaces. 22823 * For example, a whole net broadcast will be 22824 * replicated on every connected subnet of 22825 * the target net. 
22826 * 22827 * Each zone has its own set of IRE_BROADCASTs, so that 22828 * we're able to distribute inbound packets to multiple 22829 * zones who share a broadcast address. We avoid looping 22830 * back outbound packets in different zones but on the 22831 * same ill, as the application would see duplicates. 22832 * 22833 * If the interfaces are part of the same group, 22834 * we would want to send only one copy out for 22835 * whole group. 22836 * 22837 * This logic assumes that ire_add_v4() groups the 22838 * IRE_BROADCAST entries so that those with the same 22839 * ire_addr and ill_group are kept together. 22840 */ 22841 ire_ill = ire->ire_ipif->ipif_ill; 22842 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 22843 if (ire_ill->ill_group != NULL && 22844 (ire->ire_marks & IRE_MARK_NORECV)) { 22845 /* 22846 * If the current zone only has an ire 22847 * broadcast for this address marked 22848 * NORECV, the ire we want is ahead in 22849 * the bucket, so we look it up 22850 * deliberately ignoring the zoneid. 
22851 */ 22852 for (ire1 = ire->ire_bucket->irb_ire; 22853 ire1 != NULL; 22854 ire1 = ire1->ire_next) { 22855 ire1_ill = 22856 ire1->ire_ipif->ipif_ill; 22857 if (ire1->ire_addr != dst) 22858 continue; 22859 /* skip over the current ire */ 22860 if (ire1 == ire) 22861 continue; 22862 /* skip over deleted ires */ 22863 if (ire1->ire_marks & 22864 IRE_MARK_CONDEMNED) 22865 continue; 22866 /* 22867 * non-loopback ire in our 22868 * group: use it for the next 22869 * pass in the loop 22870 */ 22871 if (ire1->ire_stq != NULL && 22872 ire1_ill->ill_group == 22873 ire_ill->ill_group) 22874 break; 22875 } 22876 } 22877 } else { 22878 while (ire1 != NULL && ire1->ire_addr == dst) { 22879 ire1_ill = ire1->ire_ipif->ipif_ill; 22880 /* 22881 * We can have two broadcast ires on the 22882 * same ill in different zones; here 22883 * we'll send a copy of the packet on 22884 * each ill and the fanout code will 22885 * call conn_wantpacket() to check that 22886 * the zone has the broadcast address 22887 * configured on the ill. If the two 22888 * ires are in the same group we only 22889 * send one copy up. 22890 */ 22891 if (ire1_ill != ire_ill && 22892 (ire1_ill->ill_group == NULL || 22893 ire_ill->ill_group == NULL || 22894 ire1_ill->ill_group != 22895 ire_ill->ill_group)) { 22896 break; 22897 } 22898 ire1 = ire1->ire_next; 22899 } 22900 } 22901 } 22902 ASSERT(multirt_send == B_FALSE); 22903 if (ire1 != NULL && ire1->ire_addr == dst) { 22904 if ((ire->ire_flags & RTF_MULTIRT) && 22905 (ire1->ire_flags & RTF_MULTIRT)) { 22906 /* 22907 * We are in the multirouting case. 22908 * The message must be sent at least 22909 * on both ires. These ires have been 22910 * inserted AFTER the standard ones 22911 * in ip_rt_add(). There are thus no 22912 * other ire entries for the destination 22913 * address in the rest of the bucket 22914 * that do not have the RTF_MULTIRT 22915 * flag. We don't process a copy 22916 * of the message here. This will be 22917 * done in the final sending loop. 
22918 */ 22919 multirt_send = B_TRUE; 22920 } else { 22921 next_mp = ip_copymsg(first_mp); 22922 if (next_mp != NULL) 22923 IRE_REFHOLD(ire1); 22924 } 22925 } 22926 rw_exit(&ire->ire_bucket->irb_lock); 22927 } 22928 22929 if (stq) { 22930 /* 22931 * A non-NULL send-to queue means this packet is going 22932 * out of this machine. 22933 */ 22934 out_ill = (ill_t *)stq->q_ptr; 22935 22936 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); 22937 ttl_protocol = ((uint16_t *)ipha)[4]; 22938 /* 22939 * We accumulate the pseudo header checksum in cksum. 22940 * This is pretty hairy code, so watch close. One 22941 * thing to keep in mind is that UDP and TCP have 22942 * stored their respective datagram lengths in their 22943 * checksum fields. This lines things up real nice. 22944 */ 22945 cksum = (dst >> 16) + (dst & 0xFFFF) + 22946 (src >> 16) + (src & 0xFFFF); 22947 /* 22948 * We assume the udp checksum field contains the 22949 * length, so to compute the pseudo header checksum, 22950 * all we need is the protocol number and src/dst. 22951 */ 22952 /* Provide the checksums for UDP and TCP. 
*/ 22953 if ((PROTO == IPPROTO_TCP) && 22954 (ip_hdr_included != IP_HDR_INCLUDED)) { 22955 /* hlen gets the number of uchar_ts in the IP header */ 22956 hlen = (V_HLEN & 0xF) << 2; 22957 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22958 IP_STAT(ipst, ip_out_sw_cksum); 22959 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22960 LENGTH - hlen); 22961 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 22962 } else if (PROTO == IPPROTO_SCTP && 22963 (ip_hdr_included != IP_HDR_INCLUDED)) { 22964 sctp_hdr_t *sctph; 22965 22966 hlen = (V_HLEN & 0xF) << 2; 22967 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22968 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22969 sctph->sh_chksum = 0; 22970 #ifdef DEBUG 22971 if (!skip_sctp_cksum) 22972 #endif 22973 sctph->sh_chksum = sctp_cksum(mp, hlen); 22974 } else { 22975 queue_t *dev_q = stq->q_next; 22976 22977 if ((dev_q->q_next || dev_q->q_first) && 22978 !canput(dev_q)) { 22979 blocked: 22980 ipha->ipha_ident = ip_hdr_included; 22981 /* 22982 * If we don't have a conn to apply 22983 * backpressure, free the message. 22984 * In the ire_send path, we don't know 22985 * the position to requeue the packet. Rather 22986 * than reorder packets, we just drop this 22987 * packet. 22988 */ 22989 if (ipst->ips_ip_output_queue && 22990 connp != NULL && 22991 caller != IRE_SEND) { 22992 if (caller == IP_WSRV) { 22993 connp->conn_did_putbq = 1; 22994 (void) putbq(connp->conn_wq, 22995 first_mp); 22996 conn_drain_insert(connp); 22997 /* 22998 * This is the service thread, 22999 * and the queue is already 23000 * noenabled. The check for 23001 * canput and the putbq is not 23002 * atomic. So we need to check 23003 * again. 23004 */ 23005 if (canput(stq->q_next)) 23006 connp->conn_did_putbq 23007 = 0; 23008 IP_STAT(ipst, ip_conn_flputbq); 23009 } else { 23010 /* 23011 * We are not the service proc. 23012 * ip_wsrv will be scheduled or 23013 * is already running. 
23014 */ 23015 (void) putq(connp->conn_wq, 23016 first_mp); 23017 } 23018 } else { 23019 out_ill = (ill_t *)stq->q_ptr; 23020 BUMP_MIB(out_ill->ill_ip_mib, 23021 ipIfStatsOutDiscards); 23022 freemsg(first_mp); 23023 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23024 "ip_wput_ire_end: q %p (%S)", 23025 q, "discard"); 23026 } 23027 ire_refrele(ire); 23028 if (next_mp) { 23029 ire_refrele(ire1); 23030 freemsg(next_mp); 23031 } 23032 if (conn_outgoing_ill != NULL) 23033 ill_refrele(conn_outgoing_ill); 23034 return; 23035 } 23036 if ((PROTO == IPPROTO_UDP) && 23037 (ip_hdr_included != IP_HDR_INCLUDED)) { 23038 /* 23039 * hlen gets the number of uchar_ts in the 23040 * IP header 23041 */ 23042 hlen = (V_HLEN & 0xF) << 2; 23043 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 23044 max_frag = ire->ire_max_frag; 23045 if (*up != 0) { 23046 IP_CKSUM_XMIT(out_ill, ire, mp, ipha, 23047 up, PROTO, hlen, LENGTH, max_frag, 23048 ipsec_len, cksum); 23049 /* Software checksum? */ 23050 if (DB_CKSUMFLAGS(mp) == 0) { 23051 IP_STAT(ipst, ip_out_sw_cksum); 23052 IP_STAT_UPDATE(ipst, 23053 ip_udp_out_sw_cksum_bytes, 23054 LENGTH - hlen); 23055 } 23056 } 23057 } 23058 } 23059 /* 23060 * Need to do this even when fragmenting. The local 23061 * loopback can be done without computing checksums 23062 * but forwarding out other interface must be done 23063 * after the IP checksum (and ULP checksums) have been 23064 * computed. 23065 * 23066 * NOTE : multicast_forward is set only if this packet 23067 * originated from ip_wput. For packets originating from 23068 * ip_wput_multicast, it is not set. 23069 */ 23070 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 23071 multi_loopback: 23072 ip2dbg(("ip_wput: multicast, loop %d\n", 23073 conn_multicast_loop)); 23074 23075 /* Forget header checksum offload */ 23076 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 23077 23078 /* 23079 * Local loopback of multicasts? Check the 23080 * ill. 
23081 * 23082 * Note that the loopback function will not come 23083 * in through ip_rput - it will only do the 23084 * client fanout thus we need to do an mforward 23085 * as well. The is different from the BSD 23086 * logic. 23087 */ 23088 if (ill != NULL) { 23089 ilm_t *ilm; 23090 23091 ILM_WALKER_HOLD(ill); 23092 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 23093 ALL_ZONES); 23094 ILM_WALKER_RELE(ill); 23095 if (ilm != NULL) { 23096 /* 23097 * Pass along the virtual output q. 23098 * ip_wput_local() will distribute the 23099 * packet to all the matching zones, 23100 * except the sending zone when 23101 * IP_MULTICAST_LOOP is false. 23102 */ 23103 ip_multicast_loopback(q, ill, first_mp, 23104 conn_multicast_loop ? 0 : 23105 IP_FF_NO_MCAST_LOOP, zoneid); 23106 } 23107 } 23108 if (ipha->ipha_ttl == 0) { 23109 /* 23110 * 0 => only to this host i.e. we are 23111 * done. We are also done if this was the 23112 * loopback interface since it is sufficient 23113 * to loopback one copy of a multicast packet. 23114 */ 23115 freemsg(first_mp); 23116 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23117 "ip_wput_ire_end: q %p (%S)", 23118 q, "loopback"); 23119 ire_refrele(ire); 23120 if (conn_outgoing_ill != NULL) 23121 ill_refrele(conn_outgoing_ill); 23122 return; 23123 } 23124 /* 23125 * ILLF_MULTICAST is checked in ip_newroute 23126 * i.e. we don't need to check it here since 23127 * all IRE_CACHEs come from ip_newroute. 23128 * For multicast traffic, SO_DONTROUTE is interpreted 23129 * to mean only send the packet out the interface 23130 * (optionally specified with IP_MULTICAST_IF) 23131 * and do not forward it out additional interfaces. 23132 * RSVP and the rsvp daemon is an example of a 23133 * protocol and user level process that 23134 * handles it's own routing. Hence, it uses the 23135 * SO_DONTROUTE option to accomplish this. 
23136 */ 23137 23138 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 23139 ill != NULL) { 23140 /* Unconditionally redo the checksum */ 23141 ipha->ipha_hdr_checksum = 0; 23142 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 23143 23144 /* 23145 * If this needs to go out secure, we need 23146 * to wait till we finish the IPsec 23147 * processing. 23148 */ 23149 if (ipsec_len == 0 && 23150 ip_mforward(ill, ipha, mp)) { 23151 freemsg(first_mp); 23152 ip1dbg(("ip_wput: mforward failed\n")); 23153 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23154 "ip_wput_ire_end: q %p (%S)", 23155 q, "mforward failed"); 23156 ire_refrele(ire); 23157 if (conn_outgoing_ill != NULL) 23158 ill_refrele(conn_outgoing_ill); 23159 return; 23160 } 23161 } 23162 } 23163 max_frag = ire->ire_max_frag; 23164 cksum += ttl_protocol; 23165 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 23166 /* No fragmentation required for this one. */ 23167 /* 23168 * Don't use frag_flag if packet is pre-built or source 23169 * routed or if multicast (since multicast packets do 23170 * not solicit ICMP "packet too big" messages). 23171 */ 23172 if ((ip_hdr_included != IP_HDR_INCLUDED) && 23173 (V_HLEN == IP_SIMPLE_HDR_VERSION || 23174 !ip_source_route_included(ipha)) && 23175 !CLASSD(ipha->ipha_dst)) 23176 ipha->ipha_fragment_offset_and_flags |= 23177 htons(ire->ire_frag_flag); 23178 23179 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 23180 /* Complete the IP header checksum. */ 23181 cksum += ipha->ipha_ident; 23182 cksum += (v_hlen_tos_len >> 16)+ 23183 (v_hlen_tos_len & 0xFFFF); 23184 cksum += ipha->ipha_fragment_offset_and_flags; 23185 hlen = (V_HLEN & 0xF) - 23186 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 23187 if (hlen) { 23188 checksumoptions: 23189 /* 23190 * Account for the IP Options in the IP 23191 * header checksum. 
23192 */ 23193 up = (uint16_t *)(rptr+ 23194 IP_SIMPLE_HDR_LENGTH); 23195 do { 23196 cksum += up[0]; 23197 cksum += up[1]; 23198 up += 2; 23199 } while (--hlen); 23200 } 23201 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 23202 cksum = ~(cksum + (cksum >> 16)); 23203 ipha->ipha_hdr_checksum = (uint16_t)cksum; 23204 } 23205 if (ipsec_len != 0) { 23206 ipsec_out_process(q, first_mp, ire, ill_index); 23207 if (!next_mp) { 23208 ire_refrele(ire); 23209 if (conn_outgoing_ill != NULL) 23210 ill_refrele(conn_outgoing_ill); 23211 return; 23212 } 23213 goto next; 23214 } 23215 23216 /* 23217 * multirt_send has already been handled 23218 * for broadcast, but not yet for multicast 23219 * or IP options. 23220 */ 23221 if (next_mp == NULL) { 23222 if (ire->ire_flags & RTF_MULTIRT) { 23223 multirt_send = B_TRUE; 23224 } 23225 } 23226 23227 /* 23228 * In most cases, the emission loop below is 23229 * entered only once. Only in the case where 23230 * the ire holds the RTF_MULTIRT flag, do we loop 23231 * to process all RTF_MULTIRT ires in the bucket, 23232 * and send the packet through all crossed 23233 * RTF_MULTIRT routes. 23234 */ 23235 do { 23236 if (multirt_send) { 23237 irb_t *irb; 23238 23239 irb = ire->ire_bucket; 23240 ASSERT(irb != NULL); 23241 /* 23242 * We are in a multiple send case, 23243 * need to get the next IRE and make 23244 * a duplicate of the packet. 
23245 */ 23246 IRB_REFHOLD(irb); 23247 for (ire1 = ire->ire_next; 23248 ire1 != NULL; 23249 ire1 = ire1->ire_next) { 23250 if (!(ire1->ire_flags & 23251 RTF_MULTIRT)) { 23252 continue; 23253 } 23254 if (ire1->ire_addr != 23255 ire->ire_addr) { 23256 continue; 23257 } 23258 if (ire1->ire_marks & 23259 (IRE_MARK_CONDEMNED| 23260 IRE_MARK_HIDDEN)) { 23261 continue; 23262 } 23263 23264 /* Got one */ 23265 IRE_REFHOLD(ire1); 23266 break; 23267 } 23268 IRB_REFRELE(irb); 23269 23270 if (ire1 != NULL) { 23271 next_mp = copyb(mp); 23272 if ((next_mp == NULL) || 23273 ((mp->b_cont != NULL) && 23274 ((next_mp->b_cont = 23275 dupmsg(mp->b_cont)) 23276 == NULL))) { 23277 freemsg(next_mp); 23278 next_mp = NULL; 23279 ire_refrele(ire1); 23280 ire1 = NULL; 23281 } 23282 } 23283 23284 /* 23285 * Last multiroute ire; don't loop 23286 * anymore. The emission is over 23287 * and next_mp is NULL. 23288 */ 23289 if (ire1 == NULL) { 23290 multirt_send = B_FALSE; 23291 } 23292 } 23293 23294 out_ill = ire_to_ill(ire); 23295 DTRACE_PROBE4(ip4__physical__out__start, 23296 ill_t *, NULL, 23297 ill_t *, out_ill, 23298 ipha_t *, ipha, mblk_t *, mp); 23299 FW_HOOKS(ipst->ips_ip4_physical_out_event, 23300 ipst->ips_ipv4firewall_physical_out, 23301 NULL, out_ill, ipha, mp, mp, 0, ipst); 23302 DTRACE_PROBE1(ip4__physical__out__end, 23303 mblk_t *, mp); 23304 if (mp == NULL) 23305 goto release_ire_and_ill_2; 23306 23307 ASSERT(ipsec_len == 0); 23308 mp->b_prev = 23309 SET_BPREV_FLAG(IPP_LOCAL_OUT); 23310 DTRACE_PROBE2(ip__xmit__2, 23311 mblk_t *, mp, ire_t *, ire); 23312 pktxmit_state = ip_xmit_v4(mp, ire, 23313 NULL, B_TRUE); 23314 if ((pktxmit_state == SEND_FAILED) || 23315 (pktxmit_state == LLHDR_RESLV_FAILED)) { 23316 release_ire_and_ill_2: 23317 if (next_mp) { 23318 freemsg(next_mp); 23319 ire_refrele(ire1); 23320 } 23321 ire_refrele(ire); 23322 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23323 "ip_wput_ire_end: q %p (%S)", 23324 q, "discard MDATA"); 23325 if (conn_outgoing_ill != NULL) 23326 
ill_refrele(conn_outgoing_ill); 23327 return; 23328 } 23329 23330 if (CLASSD(dst)) { 23331 BUMP_MIB(out_ill->ill_ip_mib, 23332 ipIfStatsHCOutMcastPkts); 23333 UPDATE_MIB(out_ill->ill_ip_mib, 23334 ipIfStatsHCOutMcastOctets, 23335 LENGTH); 23336 } else if (ire->ire_type == IRE_BROADCAST) { 23337 BUMP_MIB(out_ill->ill_ip_mib, 23338 ipIfStatsHCOutBcastPkts); 23339 } 23340 23341 if (multirt_send) { 23342 /* 23343 * We are in a multiple send case, 23344 * need to re-enter the sending loop 23345 * using the next ire. 23346 */ 23347 ire_refrele(ire); 23348 ire = ire1; 23349 stq = ire->ire_stq; 23350 mp = next_mp; 23351 next_mp = NULL; 23352 ipha = (ipha_t *)mp->b_rptr; 23353 ill_index = Q_TO_INDEX(stq); 23354 } 23355 } while (multirt_send); 23356 23357 if (!next_mp) { 23358 /* 23359 * Last copy going out (the ultra-common 23360 * case). Note that we intentionally replicate 23361 * the putnext rather than calling it before 23362 * the next_mp check in hopes of a little 23363 * tail-call action out of the compiler. 23364 */ 23365 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23366 "ip_wput_ire_end: q %p (%S)", 23367 q, "last copy out(1)"); 23368 ire_refrele(ire); 23369 if (conn_outgoing_ill != NULL) 23370 ill_refrele(conn_outgoing_ill); 23371 return; 23372 } 23373 /* More copies going out below. */ 23374 } else { 23375 int offset; 23376 fragmentit: 23377 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 23378 /* 23379 * If this would generate a icmp_frag_needed message, 23380 * we need to handle it before we do the IPsec 23381 * processing. Otherwise, we need to strip the IPsec 23382 * headers before we send up the message to the ULPs 23383 * which becomes messy and difficult. 
23384 */ 23385 if (ipsec_len != 0) { 23386 if ((max_frag < (unsigned int)(LENGTH + 23387 ipsec_len)) && (offset & IPH_DF)) { 23388 out_ill = (ill_t *)stq->q_ptr; 23389 BUMP_MIB(out_ill->ill_ip_mib, 23390 ipIfStatsOutFragFails); 23391 BUMP_MIB(out_ill->ill_ip_mib, 23392 ipIfStatsOutFragReqds); 23393 ipha->ipha_hdr_checksum = 0; 23394 ipha->ipha_hdr_checksum = 23395 (uint16_t)ip_csum_hdr(ipha); 23396 icmp_frag_needed(ire->ire_stq, first_mp, 23397 max_frag, zoneid, ipst); 23398 if (!next_mp) { 23399 ire_refrele(ire); 23400 if (conn_outgoing_ill != NULL) { 23401 ill_refrele( 23402 conn_outgoing_ill); 23403 } 23404 return; 23405 } 23406 } else { 23407 /* 23408 * This won't cause a icmp_frag_needed 23409 * message. to be generated. Send it on 23410 * the wire. Note that this could still 23411 * cause fragmentation and all we 23412 * do is the generation of the message 23413 * to the ULP if needed before IPsec. 23414 */ 23415 if (!next_mp) { 23416 ipsec_out_process(q, first_mp, 23417 ire, ill_index); 23418 TRACE_2(TR_FAC_IP, 23419 TR_IP_WPUT_IRE_END, 23420 "ip_wput_ire_end: q %p " 23421 "(%S)", q, 23422 "last ipsec_out_process"); 23423 ire_refrele(ire); 23424 if (conn_outgoing_ill != NULL) { 23425 ill_refrele( 23426 conn_outgoing_ill); 23427 } 23428 return; 23429 } 23430 ipsec_out_process(q, first_mp, 23431 ire, ill_index); 23432 } 23433 } else { 23434 /* 23435 * Initiate IPPF processing. 
For 23436 * fragmentable packets we finish 23437 * all QOS packet processing before 23438 * calling: 23439 * ip_wput_ire_fragmentit->ip_wput_frag 23440 */ 23441 23442 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23443 ip_process(IPP_LOCAL_OUT, &mp, 23444 ill_index); 23445 if (mp == NULL) { 23446 out_ill = (ill_t *)stq->q_ptr; 23447 BUMP_MIB(out_ill->ill_ip_mib, 23448 ipIfStatsOutDiscards); 23449 if (next_mp != NULL) { 23450 freemsg(next_mp); 23451 ire_refrele(ire1); 23452 } 23453 ire_refrele(ire); 23454 TRACE_2(TR_FAC_IP, 23455 TR_IP_WPUT_IRE_END, 23456 "ip_wput_ire: q %p (%S)", 23457 q, "discard MDATA"); 23458 if (conn_outgoing_ill != NULL) { 23459 ill_refrele( 23460 conn_outgoing_ill); 23461 } 23462 return; 23463 } 23464 } 23465 if (!next_mp) { 23466 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23467 "ip_wput_ire_end: q %p (%S)", 23468 q, "last fragmentation"); 23469 ip_wput_ire_fragmentit(mp, ire, 23470 zoneid, ipst); 23471 ire_refrele(ire); 23472 if (conn_outgoing_ill != NULL) 23473 ill_refrele(conn_outgoing_ill); 23474 return; 23475 } 23476 ip_wput_ire_fragmentit(mp, ire, zoneid, ipst); 23477 } 23478 } 23479 } else { 23480 nullstq: 23481 /* A NULL stq means the destination address is local. */ 23482 UPDATE_OB_PKT_COUNT(ire); 23483 ire->ire_last_used_time = lbolt; 23484 ASSERT(ire->ire_ipif != NULL); 23485 if (!next_mp) { 23486 /* 23487 * Is there an "in" and "out" for traffic local 23488 * to a host (loopback)? The code in Solaris doesn't 23489 * explicitly draw a line in its code for in vs out, 23490 * so we've had to draw a line in the sand: ip_wput_ire 23491 * is considered to be the "output" side and 23492 * ip_wput_local to be the "input" side. 23493 */ 23494 out_ill = ire_to_ill(ire); 23495 23496 /* 23497 * DTrace this as ip:::send. A blocked packet will 23498 * fire the send probe, but not the receive probe. 
23499 */ 23500 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23501 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23502 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23503 23504 DTRACE_PROBE4(ip4__loopback__out__start, 23505 ill_t *, NULL, ill_t *, out_ill, 23506 ipha_t *, ipha, mblk_t *, first_mp); 23507 23508 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23509 ipst->ips_ipv4firewall_loopback_out, 23510 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23511 23512 DTRACE_PROBE1(ip4__loopback__out_end, 23513 mblk_t *, first_mp); 23514 23515 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23516 "ip_wput_ire_end: q %p (%S)", 23517 q, "local address"); 23518 23519 if (first_mp != NULL) 23520 ip_wput_local(q, out_ill, ipha, 23521 first_mp, ire, 0, ire->ire_zoneid); 23522 ire_refrele(ire); 23523 if (conn_outgoing_ill != NULL) 23524 ill_refrele(conn_outgoing_ill); 23525 return; 23526 } 23527 23528 out_ill = ire_to_ill(ire); 23529 23530 /* 23531 * DTrace this as ip:::send. A blocked packet will fire the 23532 * send probe, but not the receive probe. 23533 */ 23534 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23535 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23536 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23537 23538 DTRACE_PROBE4(ip4__loopback__out__start, 23539 ill_t *, NULL, ill_t *, out_ill, 23540 ipha_t *, ipha, mblk_t *, first_mp); 23541 23542 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23543 ipst->ips_ipv4firewall_loopback_out, 23544 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23545 23546 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); 23547 23548 if (first_mp != NULL) 23549 ip_wput_local(q, out_ill, ipha, 23550 first_mp, ire, 0, ire->ire_zoneid); 23551 } 23552 next: 23553 /* 23554 * More copies going out to additional interfaces. 23555 * ire1 has already been held. We don't need the 23556 * "ire" anymore. 
23557 */ 23558 ire_refrele(ire); 23559 ire = ire1; 23560 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 23561 mp = next_mp; 23562 ASSERT(ire->ire_ipversion == IPV4_VERSION); 23563 ill = ire_to_ill(ire); 23564 first_mp = mp; 23565 if (ipsec_len != 0) { 23566 ASSERT(first_mp->b_datap->db_type == M_CTL); 23567 mp = mp->b_cont; 23568 } 23569 dst = ire->ire_addr; 23570 ipha = (ipha_t *)mp->b_rptr; 23571 /* 23572 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 23573 * Restore ipha_ident "no checksum" flag. 23574 */ 23575 src = orig_src; 23576 ipha->ipha_ident = ip_hdr_included; 23577 goto another; 23578 23579 #undef rptr 23580 #undef Q_TO_INDEX 23581 } 23582 23583 /* 23584 * Routine to allocate a message that is used to notify the ULP about MDT. 23585 * The caller may provide a pointer to the link-layer MDT capabilities, 23586 * or NULL if MDT is to be disabled on the stream. 23587 */ 23588 mblk_t * 23589 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 23590 { 23591 mblk_t *mp; 23592 ip_mdt_info_t *mdti; 23593 ill_mdt_capab_t *idst; 23594 23595 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 23596 DB_TYPE(mp) = M_CTL; 23597 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 23598 mdti = (ip_mdt_info_t *)mp->b_rptr; 23599 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 23600 idst = &(mdti->mdt_capab); 23601 23602 /* 23603 * If the caller provides us with the capability, copy 23604 * it over into our notification message; otherwise 23605 * we zero out the capability portion. 23606 */ 23607 if (isrc != NULL) 23608 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23609 else 23610 bzero((caddr_t)idst, sizeof (*idst)); 23611 } 23612 return (mp); 23613 } 23614 23615 /* 23616 * Routine which determines whether MDT can be enabled on the destination 23617 * IRE and IPC combination, and if so, allocates and returns the MDT 23618 * notification mblk that may be used by ULP. 
We also check if we need to 23619 * turn MDT back to 'on' when certain restrictions prohibiting us to allow 23620 * MDT usage in the past have been lifted. This gets called during IP 23621 * and ULP binding. 23622 */ 23623 mblk_t * 23624 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23625 ill_mdt_capab_t *mdt_cap) 23626 { 23627 mblk_t *mp; 23628 boolean_t rc = B_FALSE; 23629 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23630 23631 ASSERT(dst_ire != NULL); 23632 ASSERT(connp != NULL); 23633 ASSERT(mdt_cap != NULL); 23634 23635 /* 23636 * Currently, we only support simple TCP/{IPv4,IPv6} with 23637 * Multidata, which is handled in tcp_multisend(). This 23638 * is the reason why we do all these checks here, to ensure 23639 * that we don't enable Multidata for the cases which we 23640 * can't handle at the moment. 23641 */ 23642 do { 23643 /* Only do TCP at the moment */ 23644 if (connp->conn_ulp != IPPROTO_TCP) 23645 break; 23646 23647 /* 23648 * IPsec outbound policy present? Note that we get here 23649 * after calling ipsec_conn_cache_policy() where the global 23650 * policy checking is performed. conn_latch will be 23651 * non-NULL as long as there's a policy defined, 23652 * i.e. conn_out_enforce_policy may be NULL in such case 23653 * when the connection is non-secure, and hence we check 23654 * further if the latch refers to an outbound policy. 23655 */ 23656 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 23657 break; 23658 23659 /* CGTP (multiroute) is enabled? */ 23660 if (dst_ire->ire_flags & RTF_MULTIRT) 23661 break; 23662 23663 /* Outbound IPQoS enabled? */ 23664 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23665 /* 23666 * In this case, we disable MDT for this and all 23667 * future connections going over the interface. 23668 */ 23669 mdt_cap->ill_mdt_on = 0; 23670 break; 23671 } 23672 23673 /* socket option(s) present? 
*/ 23674 if (!CONN_IS_LSO_MD_FASTPATH(connp)) 23675 break; 23676 23677 rc = B_TRUE; 23678 /* CONSTCOND */ 23679 } while (0); 23680 23681 /* Remember the result */ 23682 connp->conn_mdt_ok = rc; 23683 23684 if (!rc) 23685 return (NULL); 23686 else if (!mdt_cap->ill_mdt_on) { 23687 /* 23688 * If MDT has been previously turned off in the past, and we 23689 * currently can do MDT (due to IPQoS policy removal, etc.) 23690 * then enable it for this interface. 23691 */ 23692 mdt_cap->ill_mdt_on = 1; 23693 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 23694 "interface %s\n", ill_name)); 23695 } 23696 23697 /* Allocate the MDT info mblk */ 23698 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 23699 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 23700 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23701 return (NULL); 23702 } 23703 return (mp); 23704 } 23705 23706 /* 23707 * Routine to allocate a message that is used to notify the ULP about LSO. 23708 * The caller may provide a pointer to the link-layer LSO capabilities, 23709 * or NULL if LSO is to be disabled on the stream. 23710 */ 23711 mblk_t * 23712 ip_lsoinfo_alloc(ill_lso_capab_t *isrc) 23713 { 23714 mblk_t *mp; 23715 ip_lso_info_t *lsoi; 23716 ill_lso_capab_t *idst; 23717 23718 if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { 23719 DB_TYPE(mp) = M_CTL; 23720 mp->b_wptr = mp->b_rptr + sizeof (*lsoi); 23721 lsoi = (ip_lso_info_t *)mp->b_rptr; 23722 lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; 23723 idst = &(lsoi->lso_capab); 23724 23725 /* 23726 * If the caller provides us with the capability, copy 23727 * it over into our notification message; otherwise 23728 * we zero out the capability portion. 
23729 */ 23730 if (isrc != NULL) 23731 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23732 else 23733 bzero((caddr_t)idst, sizeof (*idst)); 23734 } 23735 return (mp); 23736 } 23737 23738 /* 23739 * Routine which determines whether LSO can be enabled on the destination 23740 * IRE and IPC combination, and if so, allocates and returns the LSO 23741 * notification mblk that may be used by ULP. We also check if we need to 23742 * turn LSO back to 'on' when certain restrictions prohibiting us to allow 23743 * LSO usage in the past have been lifted. This gets called during IP 23744 * and ULP binding. 23745 */ 23746 mblk_t * 23747 ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23748 ill_lso_capab_t *lso_cap) 23749 { 23750 mblk_t *mp; 23751 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23752 23753 ASSERT(dst_ire != NULL); 23754 ASSERT(connp != NULL); 23755 ASSERT(lso_cap != NULL); 23756 23757 connp->conn_lso_ok = B_TRUE; 23758 23759 if ((connp->conn_ulp != IPPROTO_TCP) || 23760 CONN_IPSEC_OUT_ENCAPSULATED(connp) || 23761 (dst_ire->ire_flags & RTF_MULTIRT) || 23762 !CONN_IS_LSO_MD_FASTPATH(connp) || 23763 (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { 23764 connp->conn_lso_ok = B_FALSE; 23765 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23766 /* 23767 * Disable LSO for this and all future connections going 23768 * over the interface. 23769 */ 23770 lso_cap->ill_lso_on = 0; 23771 } 23772 } 23773 23774 if (!connp->conn_lso_ok) 23775 return (NULL); 23776 else if (!lso_cap->ill_lso_on) { 23777 /* 23778 * If LSO has been previously turned off in the past, and we 23779 * currently can do LSO (due to IPQoS policy removal, etc.) 23780 * then enable it for this interface. 
23781 */ 23782 lso_cap->ill_lso_on = 1; 23783 ip1dbg(("ip_mdinfo_return: reenabling LSO for interface %s\n", 23784 ill_name)); 23785 } 23786 23787 /* Allocate the LSO info mblk */ 23788 if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) 23789 ip0dbg(("ip_lsoinfo_return: can't enable LSO for " 23790 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23791 23792 return (mp); 23793 } 23794 23795 /* 23796 * Create destination address attribute, and fill it with the physical 23797 * destination address and SAP taken from the template DL_UNITDATA_REQ 23798 * message block. 23799 */ 23800 boolean_t 23801 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 23802 { 23803 dl_unitdata_req_t *dlurp; 23804 pattr_t *pa; 23805 pattrinfo_t pa_info; 23806 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 23807 uint_t das_len, das_off; 23808 23809 ASSERT(dlmp != NULL); 23810 23811 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 23812 das_len = dlurp->dl_dest_addr_length; 23813 das_off = dlurp->dl_dest_addr_offset; 23814 23815 pa_info.type = PATTR_DSTADDRSAP; 23816 pa_info.len = sizeof (**das) + das_len - 1; 23817 23818 /* create and associate the attribute */ 23819 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23820 if (pa != NULL) { 23821 ASSERT(*das != NULL); 23822 (*das)->addr_is_group = 0; 23823 (*das)->addr_len = (uint8_t)das_len; 23824 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 23825 } 23826 23827 return (pa != NULL); 23828 } 23829 23830 /* 23831 * Create hardware checksum attribute and fill it with the values passed. 
23832 */ 23833 boolean_t 23834 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 23835 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 23836 { 23837 pattr_t *pa; 23838 pattrinfo_t pa_info; 23839 23840 ASSERT(mmd != NULL); 23841 23842 pa_info.type = PATTR_HCKSUM; 23843 pa_info.len = sizeof (pattr_hcksum_t); 23844 23845 /* create and associate the attribute */ 23846 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23847 if (pa != NULL) { 23848 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 23849 23850 hck->hcksum_start_offset = start_offset; 23851 hck->hcksum_stuff_offset = stuff_offset; 23852 hck->hcksum_end_offset = end_offset; 23853 hck->hcksum_flags = flags; 23854 } 23855 return (pa != NULL); 23856 } 23857 23858 /* 23859 * Create zerocopy attribute and fill it with the specified flags 23860 */ 23861 boolean_t 23862 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 23863 { 23864 pattr_t *pa; 23865 pattrinfo_t pa_info; 23866 23867 ASSERT(mmd != NULL); 23868 pa_info.type = PATTR_ZCOPY; 23869 pa_info.len = sizeof (pattr_zcopy_t); 23870 23871 /* create and associate the attribute */ 23872 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23873 if (pa != NULL) { 23874 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 23875 23876 zcopy->zcopy_flags = flags; 23877 } 23878 return (pa != NULL); 23879 } 23880 23881 /* 23882 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 23883 * block chain. We could rewrite to handle arbitrary message block chains but 23884 * that would make the code complicated and slow. Right now there three 23885 * restrictions: 23886 * 23887 * 1. The first message block must contain the complete IP header and 23888 * at least 1 byte of payload data. 23889 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 23890 * so that we can use a single Multidata message. 23891 * 3. 
No frag must be distributed over two or more message blocks so 23892 * that we don't need more than two packet descriptors per frag. 23893 * 23894 * The above restrictions allow us to support userland applications (which 23895 * will send down a single message block) and NFS over UDP (which will 23896 * send down a chain of at most three message blocks). 23897 * 23898 * We also don't use MDT for payloads with less than or equal to 23899 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 23900 */ 23901 boolean_t 23902 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 23903 { 23904 int blocks; 23905 ssize_t total, missing, size; 23906 23907 ASSERT(mp != NULL); 23908 ASSERT(hdr_len > 0); 23909 23910 size = MBLKL(mp) - hdr_len; 23911 if (size <= 0) 23912 return (B_FALSE); 23913 23914 /* The first mblk contains the header and some payload. */ 23915 blocks = 1; 23916 total = size; 23917 size %= len; 23918 missing = (size == 0) ? 0 : (len - size); 23919 mp = mp->b_cont; 23920 23921 while (mp != NULL) { 23922 /* 23923 * Give up if we encounter a zero length message block. 23924 * In practice, this should rarely happen and therefore 23925 * not worth the trouble of freeing and re-linking the 23926 * mblk from the chain to handle such case. 23927 */ 23928 if ((size = MBLKL(mp)) == 0) 23929 return (B_FALSE); 23930 23931 /* Too many payload buffers for a single Multidata message? */ 23932 if (++blocks > MULTIDATA_MAX_PBUFS) 23933 return (B_FALSE); 23934 23935 total += size; 23936 /* Is a frag distributed over two or more message blocks? */ 23937 if (missing > size) 23938 return (B_FALSE); 23939 size -= missing; 23940 23941 size %= len; 23942 missing = (size == 0) ? 0 : (len - size); 23943 23944 mp = mp->b_cont; 23945 } 23946 23947 return (total > ip_wput_frag_mdt_min); 23948 } 23949 23950 /* 23951 * Outbound IPv4 fragmentation routine using MDT. 
23952 */ 23953 static void 23954 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 23955 uint32_t frag_flag, int offset) 23956 { 23957 ipha_t *ipha_orig; 23958 int i1, ip_data_end; 23959 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 23960 mblk_t *hdr_mp, *md_mp = NULL; 23961 unsigned char *hdr_ptr, *pld_ptr; 23962 multidata_t *mmd; 23963 ip_pdescinfo_t pdi; 23964 ill_t *ill; 23965 ip_stack_t *ipst = ire->ire_ipst; 23966 23967 ASSERT(DB_TYPE(mp) == M_DATA); 23968 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 23969 23970 ill = ire_to_ill(ire); 23971 ASSERT(ill != NULL); 23972 23973 ipha_orig = (ipha_t *)mp->b_rptr; 23974 mp->b_rptr += sizeof (ipha_t); 23975 23976 /* Calculate how many packets we will send out */ 23977 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 23978 pkts = (i1 + len - 1) / len; 23979 ASSERT(pkts > 1); 23980 23981 /* Allocate a message block which will hold all the IP Headers. */ 23982 wroff = ipst->ips_ip_wroff_extra; 23983 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 23984 23985 i1 = pkts * hdr_chunk_len; 23986 /* 23987 * Create the header buffer, Multidata and destination address 23988 * and SAP attribute that should be associated with it. 23989 */ 23990 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 23991 ((hdr_mp->b_wptr += i1), 23992 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 23993 !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { 23994 freemsg(mp); 23995 if (md_mp == NULL) { 23996 freemsg(hdr_mp); 23997 } else { 23998 free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded); 23999 freemsg(md_mp); 24000 } 24001 IP_STAT(ipst, ip_frag_mdt_allocfail); 24002 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); 24003 return; 24004 } 24005 IP_STAT(ipst, ip_frag_mdt_allocd); 24006 24007 /* 24008 * Add a payload buffer to the Multidata; this operation must not 24009 * fail, or otherwise our logic in this routine is broken. 
There 24010 * is no memory allocation done by the routine, so any returned 24011 * failure simply tells us that we've done something wrong. 24012 * 24013 * A failure tells us that either we're adding the same payload 24014 * buffer more than once, or we're trying to add more buffers than 24015 * allowed. None of the above cases should happen, and we panic 24016 * because either there's horrible heap corruption, and/or 24017 * programming mistake. 24018 */ 24019 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24020 goto pbuf_panic; 24021 24022 hdr_ptr = hdr_mp->b_rptr; 24023 pld_ptr = mp->b_rptr; 24024 24025 /* Establish the ending byte offset, based on the starting offset. */ 24026 offset <<= 3; 24027 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 24028 IP_SIMPLE_HDR_LENGTH; 24029 24030 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 24031 24032 while (pld_ptr < mp->b_wptr) { 24033 ipha_t *ipha; 24034 uint16_t offset_and_flags; 24035 uint16_t ip_len; 24036 int error; 24037 24038 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 24039 ipha = (ipha_t *)(hdr_ptr + wroff); 24040 ASSERT(OK_32PTR(ipha)); 24041 *ipha = *ipha_orig; 24042 24043 if (ip_data_end - offset > len) { 24044 offset_and_flags = IPH_MF; 24045 } else { 24046 /* 24047 * Last frag. Set len to the length of this last piece. 24048 */ 24049 len = ip_data_end - offset; 24050 /* A frag of a frag might have IPH_MF non-zero */ 24051 offset_and_flags = 24052 ntohs(ipha->ipha_fragment_offset_and_flags) & 24053 IPH_MF; 24054 } 24055 offset_and_flags |= (uint16_t)(offset >> 3); 24056 offset_and_flags |= (uint16_t)frag_flag; 24057 /* Store the offset and flags in the IP header. */ 24058 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24059 24060 /* Store the length in the IP header. */ 24061 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 24062 ipha->ipha_length = htons(ip_len); 24063 24064 /* 24065 * Set the IP header checksum. 
Note that mp is just 24066 * the header, so this is easy to pass to ip_csum. 24067 */ 24068 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24069 24070 DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *, 24071 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 24072 NULL, int, 0); 24073 24074 /* 24075 * Record offset and size of header and data of the next packet 24076 * in the multidata message. 24077 */ 24078 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 24079 PDESC_PLD_INIT(&pdi); 24080 i1 = MIN(mp->b_wptr - pld_ptr, len); 24081 ASSERT(i1 > 0); 24082 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 24083 if (i1 == len) { 24084 pld_ptr += len; 24085 } else { 24086 i1 = len - i1; 24087 mp = mp->b_cont; 24088 ASSERT(mp != NULL); 24089 ASSERT(MBLKL(mp) >= i1); 24090 /* 24091 * Attach the next payload message block to the 24092 * multidata message. 24093 */ 24094 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24095 goto pbuf_panic; 24096 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 24097 pld_ptr = mp->b_rptr + i1; 24098 } 24099 24100 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 24101 KM_NOSLEEP)) == NULL) { 24102 /* 24103 * Any failure other than ENOMEM indicates that we 24104 * have passed in invalid pdesc info or parameters 24105 * to mmd_addpdesc, which must not happen. 24106 * 24107 * EINVAL is a result of failure on boundary checks 24108 * against the pdesc info contents. It should not 24109 * happen, and we panic because either there's 24110 * horrible heap corruption, and/or programming 24111 * mistake. 
24112 */ 24113 if (error != ENOMEM) { 24114 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 24115 "pdesc logic error detected for " 24116 "mmd %p pinfo %p (%d)\n", 24117 (void *)mmd, (void *)&pdi, error); 24118 /* NOTREACHED */ 24119 } 24120 IP_STAT(ipst, ip_frag_mdt_addpdescfail); 24121 /* Free unattached payload message blocks as well */ 24122 md_mp->b_cont = mp->b_cont; 24123 goto free_mmd; 24124 } 24125 24126 /* Advance fragment offset. */ 24127 offset += len; 24128 24129 /* Advance to location for next header in the buffer. */ 24130 hdr_ptr += hdr_chunk_len; 24131 24132 /* Did we reach the next payload message block? */ 24133 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 24134 mp = mp->b_cont; 24135 /* 24136 * Attach the next message block with payload 24137 * data to the multidata message. 24138 */ 24139 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24140 goto pbuf_panic; 24141 pld_ptr = mp->b_rptr; 24142 } 24143 } 24144 24145 ASSERT(hdr_mp->b_wptr == hdr_ptr); 24146 ASSERT(mp->b_wptr == pld_ptr); 24147 24148 /* Update IP statistics */ 24149 IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts); 24150 24151 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); 24152 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); 24153 24154 len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH; 24155 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); 24156 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len); 24157 24158 if (pkt_type == OB_PKT) { 24159 ire->ire_ob_pkt_count += pkts; 24160 if (ire->ire_ipif != NULL) 24161 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 24162 } else { 24163 /* The type is IB_PKT in the forwarding path. 
*/ 24164 ire->ire_ib_pkt_count += pkts; 24165 ASSERT(!IRE_IS_LOCAL(ire)); 24166 if (ire->ire_type & IRE_BROADCAST) { 24167 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 24168 } else { 24169 UPDATE_MIB(ill->ill_ip_mib, 24170 ipIfStatsHCOutForwDatagrams, pkts); 24171 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 24172 } 24173 } 24174 ire->ire_last_used_time = lbolt; 24175 /* Send it down */ 24176 putnext(ire->ire_stq, md_mp); 24177 return; 24178 24179 pbuf_panic: 24180 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 24181 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 24182 pbuf_idx); 24183 /* NOTREACHED */ 24184 } 24185 24186 /* 24187 * Outbound IP fragmentation routine. Carves the packet in mp_orig (an optional leading M_CTL mblk is stripped) into fragments carrying len = (max_frag - header length) & ~7 payload bytes and sends each one down ire->ire_stq with putnext(), or hands the whole packet to ip_wput_frag_mdt() when MDT can be used. If the ire has RTF_MULTIRT set, every fragment is replicated on the other matching RTF_MULTIRT ires of its bucket. pkt_type selects the counters that get updated (OB_PKT: outbound, anything else: forwarded), frag_flag is OR-ed into the offset/flags field of every fragment, and zoneid is only used for the ICMP error generated when IPH_DF is set. 24188 * 24189 * NOTE : This routine does not ire_refrele the ire that is passed in 24190 * as the argument. 24191 */ 24192 static void 24193 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 24194 uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst) 24195 { 24196 int i1; 24197 mblk_t *ll_hdr_mp; 24198 int ll_hdr_len; 24199 int hdr_len; 24200 mblk_t *hdr_mp; 24201 ipha_t *ipha; 24202 int ip_data_end; 24203 int len; 24204 mblk_t *mp = mp_orig, *mp1; 24205 int offset; 24206 queue_t *q; 24207 uint32_t v_hlen_tos_len; 24208 mblk_t *first_mp; 24209 boolean_t mctl_present; 24210 ill_t *ill; 24211 ill_t *out_ill; 24212 mblk_t *xmit_mp; 24213 mblk_t *carve_mp; 24214 ire_t *ire1 = NULL; 24215 ire_t *save_ire = NULL; 24216 mblk_t *next_mp = NULL; 24217 boolean_t last_frag = B_FALSE; 24218 boolean_t multirt_send = B_FALSE; 24219 ire_t *first_ire = NULL; 24220 irb_t *irb = NULL; 24221 mib2_ipIfStatsEntry_t *mibptr = NULL; 24222 24223 ill = ire_to_ill(ire); 24224 mibptr = (ill != NULL) ? 
ill->ill_ip_mib : &ipst->ips_ip_mib; 24225 24226 BUMP_MIB(mibptr, ipIfStatsOutFragReqds); 24227 24228 if (max_frag == 0) { 24229 ip1dbg(("ip_wput_frag: ire frag size is 0" 24230 " - dropping packet\n")); 24231 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24232 freemsg(mp); 24233 return; 24234 } 24235 24236 /* 24237 * IPsec does not allow hw accelerated packets to be fragmented 24238 * This check is made in ip_wput_ipsec_out prior to coming here 24239 * via ip_wput_ire_fragmentit. 24240 * 24241 * If at this point we have an ire whose ARP request has not 24242 * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger 24243 * sending of ARP query and change ire's state to ND_INCOMPLETE. 24244 * This packet and all fragmentable packets for this ire will 24245 * continue to get dropped while ire_nce->nce_state remains in 24246 * ND_INCOMPLETE. Post-ARP resolution, after ire's nce_state changes to 24247 * ND_REACHABLE, all subsequent large packets for this ire will 24248 * get fragmented and sent out by this function. 24249 */ 24250 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 24251 /* If nce_state is ND_INITIAL, trigger ARP query */ 24252 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 24253 ip1dbg(("ip_wput_frag: mac address for ire is unresolved" 24254 " - dropping packet\n")); 24255 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24256 freemsg(mp); 24257 return; 24258 } 24259 24260 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 24261 "ip_wput_frag_start:"); 24262 24263 if (mp->b_datap->db_type == M_CTL) { 24264 first_mp = mp; 24265 mp_orig = mp = mp->b_cont; 24266 mctl_present = B_TRUE; 24267 } else { 24268 first_mp = mp; 24269 mctl_present = B_FALSE; 24270 } 24271 24272 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 24273 ipha = (ipha_t *)mp->b_rptr; 24274 24275 /* 24276 * If the Don't Fragment flag is on, generate an ICMP destination 24277 * unreachable, fragmentation needed. 
24278 */ 24279 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 24280 if (offset & IPH_DF) { 24281 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24282 if (is_system_labeled()) { 24283 max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag, 24284 ire->ire_max_frag - max_frag, AF_INET); 24285 } 24286 /* 24287 * Need to compute hdr checksum if called from ip_wput_ire. 24288 * Note that ip_rput_forward verifies the checksum before 24289 * calling this routine so in that case this is a noop. 24290 */ 24291 ipha->ipha_hdr_checksum = 0; 24292 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24293 icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid, 24294 ipst); 24295 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24296 "ip_wput_frag_end:(%S)", 24297 "don't fragment"); 24298 return; 24299 } 24300 /* 24301 * Labeled systems adjust max_frag if they add a label 24302 * to send the correct path mtu. We need the real mtu since we 24303 * are fragmenting the packet after label adjustment. 24304 */ 24305 if (is_system_labeled()) 24306 max_frag = ire->ire_max_frag; 24307 if (mctl_present) 24308 freeb(first_mp); 24309 /* 24310 * Establish the starting offset. May not be zero if we are fragging 24311 * a fragment that is being forwarded. 24312 */ 24313 offset = offset & IPH_OFFSET; 24314 24315 /* TODO why is this test needed? */ 24316 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 24317 if (((max_frag - LENGTH) & ~7) < 8) { 24318 /* TODO: notify ulp somehow */ 24319 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24320 freemsg(mp); 24321 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24322 "ip_wput_frag_end:(%S)", 24323 "len < 8"); 24324 return; 24325 } 24326 24327 hdr_len = (V_HLEN & 0xF) << 2; 24328 24329 ipha->ipha_hdr_checksum = 0; 24330 24331 /* 24332 * Establish the number of bytes maximum per frag, after putting 24333 * in the header. 24334 */ 24335 len = (max_frag - hdr_len) & ~7; 24336 24337 /* Check if we can use MDT to send out the frags. 
*/ 24338 ASSERT(!IRE_IS_LOCAL(ire)); 24339 if (hdr_len == IP_SIMPLE_HDR_LENGTH && 24340 ipst->ips_ip_multidata_outbound && 24341 !(ire->ire_flags & RTF_MULTIRT) && 24342 !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && 24343 ill != NULL && ILL_MDT_CAPABLE(ill) && 24344 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 24345 ASSERT(ill->ill_mdt_capab != NULL); 24346 if (!ill->ill_mdt_capab->ill_mdt_on) { 24347 /* 24348 * If MDT has been turned off in the past, 24349 * and we currently can do MDT (due to IPQoS policy 24350 * removal, etc.) then enable it for this interface. 24351 */ 24352 ill->ill_mdt_capab->ill_mdt_on = 1; 24353 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 24354 ill->ill_name)); 24355 } 24356 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 24357 offset); 24358 return; 24359 } 24360 24361 /* Get a copy of the header for the trailing frags */ 24362 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst); 24363 if (!hdr_mp) { 24364 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24365 freemsg(mp); 24366 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24367 "ip_wput_frag_end:(%S)", 24368 "couldn't copy hdr"); 24369 return; 24370 } 24371 if (DB_CRED(mp) != NULL) 24372 mblk_setcred(hdr_mp, DB_CRED(mp)); 24373 24374 /* Store the starting offset, with the MoreFrags flag. */ 24375 i1 = offset | IPH_MF | frag_flag; 24376 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 24377 24378 /* Establish the ending byte offset, based on the starting offset. */ 24379 offset <<= 3; 24380 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 24381 24382 /* Store the length of the first fragment in the IP header. */ 24383 i1 = len + hdr_len; 24384 ASSERT(i1 <= IP_MAXPACKET); 24385 ipha->ipha_length = htons((uint16_t)i1); 24386 24387 /* 24388 * Compute the IP header checksum for the first frag. We have to 24389 * watch out that we stop at the end of the header. 
24390 */ 24391 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24392 24393 /* 24394 * Now carve off the first frag. Note that this will include the 24395 * original IP header. 24396 */ 24397 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 24398 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24399 freeb(hdr_mp); 24400 freemsg(mp_orig); 24401 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24402 "ip_wput_frag_end:(%S)", 24403 "couldn't carve first"); 24404 return; 24405 } 24406 24407 /* 24408 * Multirouting case. Each fragment is replicated 24409 * via all non-condemned RTF_MULTIRT routes 24410 * currently resolved. 24411 * We ensure that first_ire is the first RTF_MULTIRT 24412 * ire in the bucket. 24413 */ 24414 if (ire->ire_flags & RTF_MULTIRT) { 24415 irb = ire->ire_bucket; 24416 ASSERT(irb != NULL); 24417 24418 multirt_send = B_TRUE; 24419 24420 /* Make sure we do not omit any multiroute ire. */ 24421 IRB_REFHOLD(irb); 24422 for (first_ire = irb->irb_ire; 24423 first_ire != NULL; 24424 first_ire = first_ire->ire_next) { 24425 if ((first_ire->ire_flags & RTF_MULTIRT) && 24426 (first_ire->ire_addr == ire->ire_addr) && 24427 !(first_ire->ire_marks & 24428 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 24429 break; 24430 } 24431 } 24432 24433 if (first_ire != NULL) { 24434 if (first_ire != ire) { 24435 IRE_REFHOLD(first_ire); 24436 /* 24437 * Do not release the ire passed in 24438 * as the argument. 24439 */ 24440 ire = first_ire; 24441 } else { 24442 first_ire = NULL; 24443 } 24444 } 24445 IRB_REFRELE(irb); 24446 24447 /* 24448 * Save the first ire; we will need to restore it 24449 * for the trailing frags. 24450 * We REFHOLD save_ire, as each iterated ire will be 24451 * REFRELEd. 24452 */ 24453 save_ire = ire; 24454 IRE_REFHOLD(save_ire); 24455 } 24456 24457 /* 24458 * First fragment emission loop. 24459 * In most cases, the emission loop below is entered only 24460 * once. 
Only in the case where the ire holds the RTF_MULTIRT 24461 * flag, do we loop to process all RTF_MULTIRT ires in the 24462 * bucket, and send the fragment through all crossed 24463 * RTF_MULTIRT routes. 24464 */ 24465 do { 24466 if (ire->ire_flags & RTF_MULTIRT) { 24467 /* 24468 * We are in a multiple send case, need to get 24469 * the next ire and make a copy of the packet. 24470 * ire1 holds here the next ire to process in the 24471 * bucket. If multirouting is expected, 24472 * any non-RTF_MULTIRT ire that has the 24473 * right destination address is ignored. 24474 * 24475 * We have to take into account the MTU of 24476 * each walked ire. max_frag is set by 24477 * the caller and generally refers to 24478 * the primary ire entry. Here we ensure that 24479 * no route with a lower MTU will be used, as 24480 * fragments are carved once for all ires, 24481 * then replicated. 24482 */ 24483 ASSERT(irb != NULL); 24484 IRB_REFHOLD(irb); 24485 for (ire1 = ire->ire_next; 24486 ire1 != NULL; 24487 ire1 = ire1->ire_next) { 24488 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24489 continue; 24490 if (ire1->ire_addr != ire->ire_addr) 24491 continue; 24492 if (ire1->ire_marks & 24493 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24494 continue; 24495 /* 24496 * Ensure we do not exceed the MTU 24497 * of the next route. 24498 */ 24499 if (ire1->ire_max_frag < max_frag) { 24500 ip_multirt_bad_mtu(ire1, max_frag); 24501 continue; 24502 } 24503 24504 /* Got one. */ 24505 IRE_REFHOLD(ire1); 24506 break; 24507 } 24508 IRB_REFRELE(irb); 24509 24510 if (ire1 != NULL) { 24511 next_mp = copyb(mp); 24512 if ((next_mp == NULL) || 24513 ((mp->b_cont != NULL) && 24514 ((next_mp->b_cont = 24515 dupmsg(mp->b_cont)) == NULL))) { 24516 freemsg(next_mp); 24517 next_mp = NULL; 24518 ire_refrele(ire1); 24519 ire1 = NULL; 24520 } 24521 } 24522 24523 /* Last multiroute ire; don't loop anymore. 
*/ 24524 if (ire1 == NULL) { 24525 multirt_send = B_FALSE; 24526 } 24527 } 24528 24529 ll_hdr_len = 0; 24530 LOCK_IRE_FP_MP(ire); 24531 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24532 if (ll_hdr_mp != NULL) { 24533 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24534 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 24535 } else { 24536 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24537 } 24538 24539 /* If there is a transmit header, get a copy for this frag. */ 24540 /* 24541 * TODO: should check db_ref before calling ip_carve_mp since 24542 * it might give us a dup. 24543 */ 24544 if (!ll_hdr_mp) { 24545 /* No xmit header. */ 24546 xmit_mp = mp; 24547 24548 /* We have a link-layer header that can fit in our mblk. */ 24549 } else if (mp->b_datap->db_ref == 1 && 24550 ll_hdr_len != 0 && 24551 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24552 /* M_DATA fastpath */ 24553 mp->b_rptr -= ll_hdr_len; 24554 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 24555 xmit_mp = mp; 24556 24557 /* Corner case if copyb has failed */ 24558 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 24559 UNLOCK_IRE_FP_MP(ire); 24560 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24561 freeb(hdr_mp); 24562 freemsg(mp); 24563 freemsg(mp_orig); 24564 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24565 "ip_wput_frag_end:(%S)", 24566 "discard"); 24567 24568 if (multirt_send) { 24569 ASSERT(ire1); 24570 ASSERT(next_mp); 24571 24572 freemsg(next_mp); 24573 ire_refrele(ire1); 24574 } 24575 if (save_ire != NULL) 24576 IRE_REFRELE(save_ire); 24577 24578 if (first_ire != NULL) 24579 ire_refrele(first_ire); 24580 return; 24581 24582 /* 24583 * Case of res_mp OR the fastpath mp can't fit 24584 * in the mblk 24585 */ 24586 } else { 24587 xmit_mp->b_cont = mp; 24588 if (DB_CRED(mp) != NULL) 24589 mblk_setcred(xmit_mp, DB_CRED(mp)); 24590 /* 24591 * Get priority marking, if any. 24592 * We propagate the CoS marking from the 24593 * original packet that went to QoS processing 24594 * in ip_wput_ire to the newly carved mp. 
24595 */ 24596 if (DB_TYPE(xmit_mp) == M_DATA) 24597 xmit_mp->b_band = mp->b_band; 24598 } 24599 UNLOCK_IRE_FP_MP(ire); 24600 24601 q = ire->ire_stq; 24602 out_ill = (ill_t *)q->q_ptr; 24603 24604 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24605 24606 DTRACE_PROBE4(ip4__physical__out__start, 24607 ill_t *, NULL, ill_t *, out_ill, 24608 ipha_t *, ipha, mblk_t *, xmit_mp); 24609 24610 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24611 ipst->ips_ipv4firewall_physical_out, 24612 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24613 24614 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp); 24615 24616 if (xmit_mp != NULL) { 24617 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL, 24618 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 24619 ipha_t *, ipha, ip6_t *, NULL, int, 0); 24620 24621 putnext(q, xmit_mp); 24622 24623 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 24624 UPDATE_MIB(out_ill->ill_ip_mib, 24625 ipIfStatsHCOutOctets, i1); 24626 24627 if (pkt_type != OB_PKT) { 24628 /* 24629 * Update the packet count and MIB stats 24630 * of trailing RTF_MULTIRT ires. 24631 */ 24632 UPDATE_OB_PKT_COUNT(ire); 24633 BUMP_MIB(out_ill->ill_ip_mib, 24634 ipIfStatsOutFragReqds); 24635 } 24636 } 24637 24638 if (multirt_send) { 24639 /* 24640 * We are in a multiple send case; look for 24641 * the next ire and re-enter the loop. 
24642 */ 24643 ASSERT(ire1); 24644 ASSERT(next_mp); 24645 /* REFRELE the current ire before looping */ 24646 ire_refrele(ire); 24647 ire = ire1; 24648 ire1 = NULL; 24649 mp = next_mp; 24650 next_mp = NULL; 24651 } 24652 } while (multirt_send); 24653 24654 ASSERT(ire1 == NULL); 24655 24656 /* Restore the original ire; we need it for the trailing frags */ 24657 if (save_ire != NULL) { 24658 /* REFRELE the last iterated ire */ 24659 ire_refrele(ire); 24660 /* save_ire has been REFHOLDed */ 24661 ire = save_ire; 24662 save_ire = NULL; 24663 q = ire->ire_stq; 24664 } 24665 24666 if (pkt_type == OB_PKT) { 24667 UPDATE_OB_PKT_COUNT(ire); 24668 } else { 24669 out_ill = (ill_t *)q->q_ptr; 24670 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 24671 UPDATE_IB_PKT_COUNT(ire); 24672 } 24673 24674 /* Advance the offset to the second frag starting point. */ 24675 offset += len; 24676 /* 24677 * Update hdr_len from the copied header - there might be fewer options 24678 * in the later fragments. 24679 */ 24680 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 24681 /* Loop until done. */ 24682 for (;;) { 24683 uint16_t offset_and_flags; 24684 uint16_t ip_len; 24685 24686 if (ip_data_end - offset > len) { 24687 /* 24688 * Carve off the appropriate amount from the original 24689 * datagram. 24690 */ 24691 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24692 mp = NULL; 24693 break; 24694 } 24695 /* 24696 * More frags after this one. Get another copy 24697 * of the header. 24698 */ 24699 if (carve_mp->b_datap->db_ref == 1 && 24700 hdr_mp->b_wptr - hdr_mp->b_rptr < 24701 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24702 /* Inline IP header */ 24703 carve_mp->b_rptr -= hdr_mp->b_wptr - 24704 hdr_mp->b_rptr; 24705 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24706 hdr_mp->b_wptr - hdr_mp->b_rptr); 24707 mp = carve_mp; 24708 } else { 24709 if (!(mp = copyb(hdr_mp))) { 24710 freemsg(carve_mp); 24711 break; 24712 } 24713 /* Get priority marking, if any. 
*/ 24714 mp->b_band = carve_mp->b_band; 24715 mp->b_cont = carve_mp; 24716 } 24717 ipha = (ipha_t *)mp->b_rptr; 24718 offset_and_flags = IPH_MF; 24719 } else { 24720 /* 24721 * Last frag. Consume the header. Set len to 24722 * the length of this last piece. 24723 */ 24724 len = ip_data_end - offset; 24725 24726 /* 24727 * Carve off the appropriate amount from the original 24728 * datagram. 24729 */ 24730 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24731 mp = NULL; 24732 break; 24733 } 24734 if (carve_mp->b_datap->db_ref == 1 && 24735 hdr_mp->b_wptr - hdr_mp->b_rptr < 24736 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24737 /* Inline IP header */ 24738 carve_mp->b_rptr -= hdr_mp->b_wptr - 24739 hdr_mp->b_rptr; 24740 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24741 hdr_mp->b_wptr - hdr_mp->b_rptr); 24742 mp = carve_mp; 24743 freeb(hdr_mp); 24744 hdr_mp = mp; 24745 } else { 24746 mp = hdr_mp; 24747 /* Get priority marking, if any. */ 24748 mp->b_band = carve_mp->b_band; 24749 mp->b_cont = carve_mp; 24750 } 24751 ipha = (ipha_t *)mp->b_rptr; 24752 /* A frag of a frag might have IPH_MF non-zero */ 24753 offset_and_flags = 24754 ntohs(ipha->ipha_fragment_offset_and_flags) & 24755 IPH_MF; 24756 } 24757 offset_and_flags |= (uint16_t)(offset >> 3); 24758 offset_and_flags |= (uint16_t)frag_flag; 24759 /* Store the offset and flags in the IP header. */ 24760 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24761 24762 /* Store the length in the IP header. */ 24763 ip_len = (uint16_t)(len + hdr_len); 24764 ipha->ipha_length = htons(ip_len); 24765 24766 /* 24767 * Set the IP header checksum. Note that mp is just 24768 * the header, so this is easy to pass to ip_csum. 24769 */ 24770 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24771 24772 /* Attach a transmit header, if any, and ship it. 
*/ 24773 if (pkt_type == OB_PKT) { 24774 UPDATE_OB_PKT_COUNT(ire); 24775 } else { 24776 out_ill = (ill_t *)q->q_ptr; 24777 BUMP_MIB(out_ill->ill_ip_mib, 24778 ipIfStatsHCOutForwDatagrams); 24779 UPDATE_IB_PKT_COUNT(ire); 24780 } 24781 24782 if (ire->ire_flags & RTF_MULTIRT) { 24783 irb = ire->ire_bucket; 24784 ASSERT(irb != NULL); 24785 24786 multirt_send = B_TRUE; 24787 24788 /* 24789 * Save the original ire; we will need to restore it 24790 * for the trailing frags. 24791 */ 24792 save_ire = ire; 24793 IRE_REFHOLD(save_ire); 24794 } 24795 /* 24796 * Emission loop for this fragment, similar 24797 * to what is done for the first fragment. 24798 */ 24799 do { 24800 if (multirt_send) { 24801 /* 24802 * We are in a multiple send case, need to get 24803 * the next ire and make a copy of the packet. 24804 */ 24805 ASSERT(irb != NULL); 24806 IRB_REFHOLD(irb); 24807 for (ire1 = ire->ire_next; 24808 ire1 != NULL; 24809 ire1 = ire1->ire_next) { 24810 if (!(ire1->ire_flags & RTF_MULTIRT)) 24811 continue; 24812 if (ire1->ire_addr != ire->ire_addr) 24813 continue; 24814 if (ire1->ire_marks & 24815 (IRE_MARK_CONDEMNED| 24816 IRE_MARK_HIDDEN)) { 24817 continue; 24818 } 24819 /* 24820 * Ensure we do not exceed the MTU 24821 * of the next route. 24822 */ 24823 if (ire1->ire_max_frag < max_frag) { 24824 ip_multirt_bad_mtu(ire1, 24825 max_frag); 24826 continue; 24827 } 24828 24829 /* Got one. */ 24830 IRE_REFHOLD(ire1); 24831 break; 24832 } 24833 IRB_REFRELE(irb); 24834 24835 if (ire1 != NULL) { 24836 next_mp = copyb(mp); 24837 if ((next_mp == NULL) || 24838 ((mp->b_cont != NULL) && 24839 ((next_mp->b_cont = 24840 dupmsg(mp->b_cont)) == NULL))) { 24841 freemsg(next_mp); 24842 next_mp = NULL; 24843 ire_refrele(ire1); 24844 ire1 = NULL; 24845 } 24846 } 24847 24848 /* Last multiroute ire; don't loop anymore. 
*/ 24849 if (ire1 == NULL) { 24850 multirt_send = B_FALSE; 24851 } 24852 } 24853 24854 /* Update transmit header */ 24855 ll_hdr_len = 0; 24856 LOCK_IRE_FP_MP(ire); 24857 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24858 if (ll_hdr_mp != NULL) { 24859 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24860 ll_hdr_len = MBLKL(ll_hdr_mp); 24861 } else { 24862 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24863 } 24864 24865 if (!ll_hdr_mp) { 24866 xmit_mp = mp; 24867 24868 /* 24869 * We have link-layer header that can fit in 24870 * our mblk. 24871 */ 24872 } else if (mp->b_datap->db_ref == 1 && 24873 ll_hdr_len != 0 && 24874 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24875 /* M_DATA fastpath */ 24876 mp->b_rptr -= ll_hdr_len; 24877 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 24878 ll_hdr_len); 24879 xmit_mp = mp; 24880 24881 /* 24882 * Case of res_mp OR the fastpath mp can't fit 24883 * in the mblk 24884 */ 24885 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 24886 xmit_mp->b_cont = mp; 24887 if (DB_CRED(mp) != NULL) 24888 mblk_setcred(xmit_mp, DB_CRED(mp)); 24889 /* Get priority marking, if any. */ 24890 if (DB_TYPE(xmit_mp) == M_DATA) 24891 xmit_mp->b_band = mp->b_band; 24892 24893 /* Corner case if copyb failed */ 24894 } else { 24895 /* 24896 * Exit both the replication and 24897 * fragmentation loops. NOTE(review): when multirt_send is still set here, next_mp and the hold on ire1 are not released by drop_pkt (the first-fragment failure path does release them) - confirm whether this leaks. 
24898 */ 24899 UNLOCK_IRE_FP_MP(ire); 24900 goto drop_pkt; 24901 } 24902 UNLOCK_IRE_FP_MP(ire); 24903 24904 mp1 = mp; 24905 out_ill = (ill_t *)q->q_ptr; 24906 24907 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24908 24909 DTRACE_PROBE4(ip4__physical__out__start, 24910 ill_t *, NULL, ill_t *, out_ill, 24911 ipha_t *, ipha, mblk_t *, xmit_mp); 24912 24913 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24914 ipst->ips_ipv4firewall_physical_out, 24915 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24916 24917 DTRACE_PROBE1(ip4__physical__out__end, 24918 mblk_t *, xmit_mp); 24919 24920 if (mp != mp1 && hdr_mp == mp1) 24921 hdr_mp = mp; 24922 if (mp != mp1 && mp_orig == mp1) 24923 mp_orig = mp; 24924 24925 if (xmit_mp != NULL) { 24926 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, 24927 NULL, void_ip_t *, ipha, 24928 __dtrace_ipsr_ill_t *, out_ill, ipha_t *, 24929 ipha, ip6_t *, NULL, int, 0); 24930 24931 putnext(q, xmit_mp); 24932 24933 BUMP_MIB(out_ill->ill_ip_mib, 24934 ipIfStatsHCOutTransmits); 24935 UPDATE_MIB(out_ill->ill_ip_mib, 24936 ipIfStatsHCOutOctets, ip_len); 24937 24938 if (pkt_type != OB_PKT) { 24939 /* 24940 * Update the packet count of trailing 24941 * RTF_MULTIRT ires. 24942 */ 24943 UPDATE_OB_PKT_COUNT(ire); 24944 } 24945 } 24946 24947 /* All done if we just consumed the hdr_mp. */ 24948 if (mp == hdr_mp) { 24949 last_frag = B_TRUE; 24950 BUMP_MIB(out_ill->ill_ip_mib, 24951 ipIfStatsOutFragOKs); 24952 } 24953 24954 if (multirt_send) { 24955 /* 24956 * We are in a multiple send case; look for 24957 * the next ire and re-enter the loop. 
24958 */ 24959 ASSERT(ire1); 24960 ASSERT(next_mp); 24961 /* REFRELE the current ire before looping */ 24962 ire_refrele(ire); 24963 ire = ire1; 24964 ire1 = NULL; 24965 q = ire->ire_stq; 24966 mp = next_mp; 24967 next_mp = NULL; 24968 } 24969 } while (multirt_send); 24970 /* 24971 * Restore the original ire; we need it for the 24972 * trailing frags 24973 */ 24974 if (save_ire != NULL) { 24975 ASSERT(ire1 == NULL); 24976 /* REFRELE the last iterated ire */ 24977 ire_refrele(ire); 24978 /* save_ire has been REFHOLDed */ 24979 ire = save_ire; 24980 q = ire->ire_stq; 24981 save_ire = NULL; 24982 } 24983 24984 if (last_frag) { 24985 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24986 "ip_wput_frag_end:(%S)", 24987 "consumed hdr_mp"); 24988 24989 if (first_ire != NULL) 24990 ire_refrele(first_ire); 24991 return; 24992 } 24993 /* Otherwise, advance and loop. */ 24994 offset += len; 24995 } 24996 24997 drop_pkt: 24998 /* Clean up following allocation failure. */ 24999 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 25000 freemsg(mp); 25001 if (mp != hdr_mp) 25002 freeb(hdr_mp); 25003 if (mp != mp_orig) 25004 freemsg(mp_orig); 25005 25006 if (save_ire != NULL) 25007 IRE_REFRELE(save_ire); 25008 if (first_ire != NULL) 25009 ire_refrele(first_ire); 25010 25011 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 25012 "ip_wput_frag_end:(%S)", 25013 "end--alloc failure"); 25014 } 25015 25016 /* 25017 * Copy the header plus those options which have the copy bit set. A new mblk with ips_ip_wroff_extra bytes of leading space is allocated and returned; NULL is returned if allocb() fails. A header without options, or the header of a packet that is already a fragment (offset != 0), is copied unchanged. Otherwise only the options with IPOPT_COPY set are kept, the option area is padded with IPOPT_EOL to a 4-byte boundary and the header length field is rewritten to match. 25018 */ 25019 static mblk_t * 25020 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst) 25021 { 25022 mblk_t *mp; 25023 uchar_t *up; 25024 25025 /* 25026 * Quick check if we need to look for options without the copy bit 25027 * set 25028 */ 25029 mp = allocb(ipst->ips_ip_wroff_extra + hdr_len, BPRI_HI); 25030 if (!mp) 25031 return (mp); 25032 mp->b_rptr += ipst->ips_ip_wroff_extra; 25033 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 25034 bcopy(rptr, mp->b_rptr, hdr_len); 25035 mp->b_wptr += hdr_len + 
ipst->ips_ip_wroff_extra; 25036 return (mp); 25037 } 25038 up = mp->b_rptr; 25039 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 25040 up += IP_SIMPLE_HDR_LENGTH; 25041 rptr += IP_SIMPLE_HDR_LENGTH; 25042 hdr_len -= IP_SIMPLE_HDR_LENGTH; 25043 while (hdr_len > 0) { 25044 uint32_t optval; 25045 uint32_t optlen; 25046 25047 optval = *rptr; 25048 if (optval == IPOPT_EOL) 25049 break; 25050 if (optval == IPOPT_NOP) 25051 optlen = 1; 25052 else 25053 optlen = rptr[1]; /* NOTE(review): optlen is taken from the packet without a range check; this loop presumably relies on the options having been validated earlier - confirm. */ 25054 if (optval & IPOPT_COPY) { 25055 bcopy(rptr, up, optlen); 25056 up += optlen; 25057 } 25058 rptr += optlen; 25059 hdr_len -= optlen; 25060 } 25061 /* 25062 * Make sure that we drop an even number of words by filling 25063 * with EOL to the next word boundary. 25064 */ 25065 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH); 25066 hdr_len & 0x3; hdr_len++) 25067 *up++ = IPOPT_EOL; 25068 mp->b_wptr = up; 25069 /* Update header length */ 25070 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2)); 25071 return (mp); 25072 } 25073 25074 /* 25075 * Delivery to local recipients including fanout to multiple recipients. 25076 * Does not do checksumming of UDP/TCP. 25077 * Note: q should be the read side queue for either the ill or conn. 25078 * Note: rq should be the read side q for the lower (ill) stream. 25079 * We don't send packets to IPPF processing, thus the last argument 25080 * to all the fanout calls are B_FALSE. 25081 */ 25082 void 25083 ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, 25084 int fanout_flags, zoneid_t zoneid) 25085 { 25086 uint32_t protocol; 25087 mblk_t *first_mp; 25088 boolean_t mctl_present; 25089 int ire_type; 25090 #define rptr ((uchar_t *)ipha) 25091 ip_stack_t *ipst = ill->ill_ipst; 25092 25093 TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, 25094 "ip_wput_local_start: q %p", q); 25095 25096 if (ire != NULL) { 25097 ire_type = ire->ire_type; 25098 } else { 25099 /* 25100 * Only ip_multicast_loopback() calls us with a NULL ire. 
If the 25101 * packet is not multicast, we can't tell the ire type. 25102 */ 25103 ASSERT(CLASSD(ipha->ipha_dst)); 25104 ire_type = IRE_BROADCAST; 25105 } 25106 25107 first_mp = mp; 25108 if (first_mp->b_datap->db_type == M_CTL) { 25109 ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; 25110 if (!io->ipsec_out_secure) { 25111 /* 25112 * This ipsec_out_t was allocated in ip_wput 25113 * for multicast packets to store the ill_index. 25114 * As this is being delivered locally, we don't 25115 * need this anymore. 25116 */ 25117 mp = first_mp->b_cont; 25118 freeb(first_mp); 25119 first_mp = mp; 25120 mctl_present = B_FALSE; 25121 } else { 25122 /* 25123 * Convert IPSEC_OUT to IPSEC_IN, preserving all 25124 * security properties for the looped-back packet. 25125 */ 25126 mctl_present = B_TRUE; 25127 mp = first_mp->b_cont; 25128 ASSERT(mp != NULL); 25129 ipsec_out_to_in(first_mp); 25130 } 25131 } else { 25132 mctl_present = B_FALSE; 25133 } 25134 25135 DTRACE_PROBE4(ip4__loopback__in__start, 25136 ill_t *, ill, ill_t *, NULL, 25137 ipha_t *, ipha, mblk_t *, first_mp); 25138 25139 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 25140 ipst->ips_ipv4firewall_loopback_in, 25141 ill, NULL, ipha, first_mp, mp, 0, ipst); 25142 25143 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp); 25144 25145 if (first_mp == NULL) 25146 return; 25147 25148 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, 25149 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 25150 int, 1); 25151 25152 ipst->ips_loopback_packets++; 25153 25154 ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", 25155 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); 25156 if (!IS_SIMPLE_IPH(ipha)) { 25157 ip_wput_local_options(ipha, ipst); 25158 } 25159 25160 protocol = ipha->ipha_protocol; 25161 switch (protocol) { 25162 case IPPROTO_ICMP: { 25163 ire_t *ire_zone; 25164 ilm_t *ilm; 25165 mblk_t *mp1; 25166 zoneid_t last_zoneid; 25167 25168 if (CLASSD(ipha->ipha_dst) && 
!IS_LOOPBACK(ill)) { 25169 ASSERT(ire_type == IRE_BROADCAST); 25170 /* 25171 * In the multicast case, applications may have joined 25172 * the group from different zones, so we need to deliver 25173 * the packet to each of them. Loop through the 25174 * multicast memberships structures (ilm) on the receive 25175 * ill and send a copy of the packet up each matching 25176 * one. However, we don't do this for multicasts sent on 25177 * the loopback interface (PHYI_LOOPBACK flag set) as 25178 * they must stay in the sender's zone. 25179 * 25180 * ilm_add_v6() ensures that ilms in the same zone are 25181 * contiguous in the ill_ilm list. We use this property 25182 * to avoid sending duplicates when two 25183 * applications in the same zone join the same group on 25184 * different logical interfaces: we ignore the ilm if 25185 * its zoneid is the same as the last matching one. 25186 * In addition, the sending of the packet for 25187 * ire_zoneid is delayed until all of the other ilms 25188 * have been exhausted. 25189 */ 25190 last_zoneid = -1; 25191 ILM_WALKER_HOLD(ill); 25192 for (ilm = ill->ill_ilm; ilm != NULL; 25193 ilm = ilm->ilm_next) { 25194 if ((ilm->ilm_flags & ILM_DELETED) || 25195 ipha->ipha_dst != ilm->ilm_addr || 25196 ilm->ilm_zoneid == last_zoneid || 25197 ilm->ilm_zoneid == zoneid || 25198 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 25199 continue; 25200 mp1 = ip_copymsg(first_mp); 25201 if (mp1 == NULL) 25202 continue; 25203 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25204 mctl_present, B_FALSE, ill, 25205 ilm->ilm_zoneid); 25206 last_zoneid = ilm->ilm_zoneid; 25207 } 25208 ILM_WALKER_RELE(ill); 25209 /* 25210 * Loopback case: the sending endpoint has 25211 * IP_MULTICAST_LOOP disabled, therefore we don't 25212 * dispatch the multicast packet to the sending zone. 
25213 */ 25214 if (fanout_flags & IP_FF_NO_MCAST_LOOP) { 25215 freemsg(first_mp); 25216 return; 25217 } 25218 } else if (ire_type == IRE_BROADCAST) { 25219 /* 25220 * In the broadcast case, there may be many zones 25221 * which need a copy of the packet delivered to them. 25222 * There is one IRE_BROADCAST per broadcast address 25223 * and per zone; we walk those using a helper function. 25224 * In addition, the sending of the packet for zoneid is 25225 * delayed until all of the other ires have been 25226 * processed. NOTE(review): ire is dereferenced below although a NULL ire also yields ire_type == IRE_BROADCAST; presumably that case always takes the multicast branch above - confirm. 25227 */ 25228 IRB_REFHOLD(ire->ire_bucket); 25229 ire_zone = NULL; 25230 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 25231 ire)) != NULL) { 25232 mp1 = ip_copymsg(first_mp); 25233 if (mp1 == NULL) 25234 continue; 25235 25236 UPDATE_IB_PKT_COUNT(ire_zone); 25237 ire_zone->ire_last_used_time = lbolt; 25238 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25239 mctl_present, B_FALSE, ill, 25240 ire_zone->ire_zoneid); 25241 } 25242 IRB_REFRELE(ire->ire_bucket); 25243 } 25244 icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, 25245 0, mctl_present, B_FALSE, ill, zoneid); 25246 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25247 "ip_wput_local_end: q %p (%S)", 25248 q, "icmp"); 25249 return; 25250 } 25251 case IPPROTO_IGMP: 25252 if ((mp = igmp_input(q, mp, ill)) == NULL) { 25253 /* Bad packet - discarded by igmp_input */ 25254 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25255 "ip_wput_local_end: q %p (%S)", 25256 q, "igmp_input--bad packet"); 25257 if (mctl_present) 25258 freeb(first_mp); 25259 return; 25260 } 25261 /* 25262 * igmp_input() may have returned the pulled up message. 25263 * So first_mp and ipha need to be reinitialized. 
25264 */ 25265 ipha = (ipha_t *)mp->b_rptr; 25266 if (mctl_present) 25267 first_mp->b_cont = mp; 25268 else 25269 first_mp = mp; 25270 /* deliver to local raw users */ 25271 break; 25272 case IPPROTO_ENCAP: 25273 /* 25274 * This case is covered by either ip_fanout_proto, or by 25275 * the above security processing for self-tunneled packets. 25276 */ 25277 break; 25278 case IPPROTO_UDP: { 25279 uint16_t *up; 25280 uint32_t ports; 25281 25282 up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + 25283 UDP_PORTS_OFFSET); 25284 /* Force a 'valid' checksum. */ 25285 up[3] = 0; 25286 25287 ports = *(uint32_t *)up; 25288 ip_fanout_udp(q, first_mp, ill, ipha, ports, 25289 (ire_type == IRE_BROADCAST), 25290 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25291 IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE, 25292 ill, zoneid); 25293 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25294 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); 25295 return; 25296 } 25297 case IPPROTO_TCP: { 25298 25299 /* 25300 * For TCP, discard broadcast packets. 25301 */ 25302 if ((ushort_t)ire_type == IRE_BROADCAST) { 25303 freemsg(first_mp); 25304 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 25305 ip2dbg(("ip_wput_local: discard broadcast\n")); 25306 return; 25307 } 25308 25309 if (mp->b_datap->db_type == M_DATA) { 25310 /* 25311 * M_DATA mblk, so init mblk (chain) for no struio(). 
25312 */ 25313 mblk_t *mp1 = mp; 25314 25315 do { 25316 mp1->b_datap->db_struioflag = 0; 25317 } while ((mp1 = mp1->b_cont) != NULL); 25318 } 25319 ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) 25320 <= mp->b_wptr); 25321 ip_fanout_tcp(q, first_mp, ill, ipha, 25322 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25323 IP_FF_SYN_ADDIRE | IP_FF_IPINFO, 25324 mctl_present, B_FALSE, zoneid); 25325 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25326 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); 25327 return; 25328 } 25329 case IPPROTO_SCTP: 25330 { 25331 uint32_t ports; 25332 25333 bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); 25334 ip_fanout_sctp(first_mp, ill, ipha, ports, 25335 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25336 IP_FF_IPINFO, mctl_present, B_FALSE, zoneid); 25337 return; 25338 } 25339 25340 default: 25341 break; 25342 } 25343 /* 25344 * Find a client for some other protocol. We give 25345 * copies to multiple clients, if more than one is 25346 * bound. 25347 */ 25348 ip_fanout_proto(q, first_mp, ill, ipha, 25349 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, 25350 mctl_present, B_FALSE, ill, zoneid); 25351 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25352 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); 25353 #undef rptr 25354 } 25355 25356 /* 25357 * Update any source route, record route, or timestamp options. 25358 * Check that we are at end of strict source route. 25359 * The options have been sanity checked by ip_wput_options(). 
25360 */ 25361 static void 25362 ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) 25363 { 25364 ipoptp_t opts; 25365 uchar_t *opt; 25366 uint8_t optval; 25367 uint8_t optlen; 25368 ipaddr_t dst; 25369 uint32_t ts; 25370 ire_t *ire; 25371 timestruc_t now; 25372 25373 ip2dbg(("ip_wput_local_options\n")); /* Walk the options with ipoptp_first()/ipoptp_next(); the option bytes are updated in place through opts.ipoptp_cur. */ 25374 for (optval = ipoptp_first(&opts, ipha); 25375 optval != IPOPT_EOL; 25376 optval = ipoptp_next(&opts)) { 25377 opt = opts.ipoptp_cur; 25378 optlen = opts.ipoptp_len; 25379 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 25380 switch (optval) { 25381 uint32_t off; 25382 case IPOPT_SSRR: 25383 case IPOPT_LSRR: 25384 off = opt[IPOPT_OFFSET]; 25385 off--; 25386 if (optlen < IP_ADDR_LEN || 25387 off > optlen - IP_ADDR_LEN) { 25388 /* End of source route */ 25389 break; 25390 } 25391 /* 25392 * This will only happen if two consecutive entries 25393 * in the source route contains our address or if 25394 * it is a packet with a loose source route which 25395 * reaches us before consuming the whole source route 25396 */ 25397 ip1dbg(("ip_wput_local_options: not end of SR\n")); 25398 if (optval == IPOPT_SSRR) { 25399 return; 25400 } 25401 /* 25402 * Hack: instead of dropping the packet truncate the 25403 * source route to what has been used by filling the 25404 * rest with IPOPT_NOP. 
25405 */ 25406 opt[IPOPT_OLEN] = (uint8_t)off; 25407 while (off < optlen) { 25408 opt[off++] = IPOPT_NOP; 25409 } 25410 break; 25411 case IPOPT_RR: 25412 off = opt[IPOPT_OFFSET]; 25413 off--; 25414 if (optlen < IP_ADDR_LEN || 25415 off > optlen - IP_ADDR_LEN) { 25416 /* No more room - ignore */ 25417 ip1dbg(( 25418 "ip_wput_local_options: end of RR\n")); 25419 break; 25420 } 25421 dst = htonl(INADDR_LOOPBACK); 25422 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25423 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25424 break; 25425 case IPOPT_TS: 25426 /* Insert timestamp if there is room */ 25427 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25428 case IPOPT_TS_TSONLY: 25429 off = IPOPT_TS_TIMELEN; 25430 break; 25431 case IPOPT_TS_PRESPEC: 25432 case IPOPT_TS_PRESPEC_RFC791: 25433 /* Verify that the address matched */ 25434 off = opt[IPOPT_OFFSET] - 1; 25435 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 25436 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 25437 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 25438 ipst); 25439 if (ire == NULL) { 25440 /* Not for us: leave this option untouched and move on to the next one. A plain break would only leave the inner switch and the timestamp would still be written below. */ 25441 continue; 25442 } 25443 ire_refrele(ire); 25444 /* FALLTHRU */ 25445 case IPOPT_TS_TSANDADDR: 25446 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 25447 break; 25448 default: 25449 /* 25450 * ip_*put_options should have already 25451 * dropped this packet. 
25452 */ 25453 cmn_err(CE_PANIC, "ip_wput_local_options: " 25454 "unknown IT - bug in ip_wput_options?\n"); 25455 return; /* Keep "lint" happy */ 25456 } 25457 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 25458 /* Increase overflow counter */ 25459 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 25460 opt[IPOPT_POS_OV_FLG] = (uint8_t) 25461 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 25462 (off << 4); 25463 break; 25464 } 25465 off = opt[IPOPT_OFFSET] - 1; 25466 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25467 case IPOPT_TS_PRESPEC: 25468 case IPOPT_TS_PRESPEC_RFC791: 25469 case IPOPT_TS_TSANDADDR: 25470 dst = htonl(INADDR_LOOPBACK); 25471 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25472 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25473 /* FALLTHRU */ 25474 case IPOPT_TS_TSONLY: 25475 off = opt[IPOPT_OFFSET] - 1; 25476 /* Compute # of milliseconds since midnight */ 25477 gethrestime(&now); 25478 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 25479 now.tv_nsec / (NANOSEC / MILLISEC); 25480 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 25481 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 25482 break; 25483 } 25484 break; 25485 } 25486 } 25487 } 25488 25489 /* 25490 * Send out a multicast packet on interface ipif. 25491 * The sender does not have a conn. 25492 * Caller verifies that this isn't a PHYI_LOOPBACK. 25493 */ 25494 void 25495 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) 25496 { 25497 ipha_t *ipha; 25498 ire_t *ire; 25499 ipaddr_t dst; 25500 mblk_t *first_mp; 25501 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 25502 25503 /* igmp_sendpkt always allocates a ipsec_out_t */ 25504 ASSERT(mp->b_datap->db_type == M_CTL); 25505 ASSERT(!ipif->ipif_isv6); 25506 ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); 25507 25508 first_mp = mp; 25509 mp = first_mp->b_cont; 25510 ASSERT(mp->b_datap->db_type == M_DATA); 25511 ipha = (ipha_t *)mp->b_rptr; 25512 25513 /* 25514 * Find an IRE which matches the destination and the outgoing 25515 * queue (i.e. the outgoing interface.) 
25516 */ 25517 if (ipif->ipif_flags & IPIF_POINTOPOINT) 25518 dst = ipif->ipif_pp_dst_addr; 25519 else 25520 dst = ipha->ipha_dst; 25521 /* 25522 * The source address has already been initialized by the 25523 * caller and hence matching on ILL (MATCH_IRE_ILL) would 25524 * be sufficient rather than MATCH_IRE_IPIF. 25525 * 25526 * This function is used for sending IGMP packets. We need 25527 * to make sure that we send the packet out of the interface 25528 * (ipif->ipif_ill) where we joined the group. This is to 25529 * prevent from switches doing IGMP snooping to send us multicast 25530 * packets for a given group on the interface we have joined. 25531 * If we can't find an ire, igmp_sendpkt has already initialized 25532 * ipsec_out_attach_if so that this will not be load spread in 25533 * ip_newroute_ipif. 25534 */ 25535 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, 25536 MATCH_IRE_ILL, ipst); 25537 if (!ire) { 25538 /* 25539 * Mark this packet to make it be delivered to 25540 * ip_wput_ire after the new ire has been 25541 * created. 25542 */ 25543 mp->b_prev = NULL; 25544 mp->b_next = NULL; 25545 ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, 25546 zoneid, &zero_info); 25547 return; 25548 } 25549 25550 /* 25551 * Honor the RTF_SETSRC flag; this is the only case 25552 * where we force this addr whatever the current src addr is, 25553 * because this address is set by igmp_sendpkt(), and 25554 * cannot be specified by any user. 25555 */ 25556 if (ire->ire_flags & RTF_SETSRC) { 25557 ipha->ipha_src = ire->ire_src_addr; 25558 } 25559 25560 ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); 25561 } 25562 25563 /* 25564 * NOTE : This function does not ire_refrele the ire argument passed in. 25565 * 25566 * Copy the link layer header and do IPQoS if needed. Frees the mblk on 25567 * failure. The nce_fp_mp can vanish any time in the case of 25568 * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. 
Hence we have to hold 25569 * the ire_lock to access the nce_fp_mp in this case. 25570 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are 25571 * prepending a fastpath message IPQoS processing must precede it, we also set 25572 * the b_band of the fastpath message to that of the mblk returned by IPQoS 25573 * (IPQoS might have set the b_band for CoS marking). 25574 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing 25575 * must follow it so that IPQoS can mark the dl_priority field for CoS 25576 * marking, if needed. 25577 */ 25578 static mblk_t * 25579 ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, 25580 uint32_t ill_index, ipha_t **iphap) 25581 { 25582 uint_t hlen; 25583 ipha_t *ipha; 25584 mblk_t *mp1; 25585 boolean_t qos_done = B_FALSE; 25586 uchar_t *ll_hdr; 25587 ip_stack_t *ipst = ire->ire_ipst; 25588 25589 #define rptr ((uchar_t *)ipha) 25590 25591 ipha = (ipha_t *)mp->b_rptr; 25592 hlen = 0; 25593 LOCK_IRE_FP_MP(ire); 25594 if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { 25595 ASSERT(DB_TYPE(mp1) == M_DATA); 25596 /* Initiate IPPF processing */ 25597 if ((proc != 0) && IPP_ENABLED(proc, ipst)) { 25598 UNLOCK_IRE_FP_MP(ire); 25599 ip_process(proc, &mp, ill_index); 25600 if (mp == NULL) 25601 return (NULL); 25602 25603 ipha = (ipha_t *)mp->b_rptr; 25604 LOCK_IRE_FP_MP(ire); 25605 if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { 25606 qos_done = B_TRUE; 25607 goto no_fp_mp; 25608 } 25609 ASSERT(DB_TYPE(mp1) == M_DATA); 25610 } 25611 hlen = MBLKL(mp1); 25612 /* 25613 * Check if we have enough room to prepend fastpath 25614 * header 25615 */ 25616 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { 25617 ll_hdr = rptr - hlen; 25618 bcopy(mp1->b_rptr, ll_hdr, hlen); 25619 /* 25620 * Set the b_rptr to the start of the link layer 25621 * header 25622 */ 25623 mp->b_rptr = ll_hdr; 25624 mp1 = mp; 25625 } else { 25626 mp1 = copyb(mp1); 25627 if (mp1 == NULL) 25628 goto unlock_err; 25629 mp1->b_band = 
mp->b_band; 25630 mp1->b_cont = mp; 25631 /* 25632 * certain system generated traffic may not 25633 * have cred/label in ip header block. This 25634 * is true even for a labeled system. But for 25635 * labeled traffic, inherit the label in the 25636 * new header. 25637 */ 25638 if (DB_CRED(mp) != NULL) 25639 mblk_setcred(mp1, DB_CRED(mp)); 25640 /* 25641 * XXX disable ICK_VALID and compute checksum 25642 * here; can happen if nce_fp_mp changes and 25643 * it can't be copied now due to insufficient 25644 * space. (unlikely, fp mp can change, but it 25645 * does not increase in length) 25646 */ 25647 } 25648 UNLOCK_IRE_FP_MP(ire); 25649 } else { 25650 no_fp_mp: 25651 mp1 = copyb(ire->ire_nce->nce_res_mp); 25652 if (mp1 == NULL) { 25653 unlock_err: 25654 UNLOCK_IRE_FP_MP(ire); 25655 freemsg(mp); 25656 return (NULL); 25657 } 25658 UNLOCK_IRE_FP_MP(ire); 25659 mp1->b_cont = mp; 25660 /* 25661 * certain system generated traffic may not 25662 * have cred/label in ip header block. This 25663 * is true even for a labeled system. But for 25664 * labeled traffic, inherit the label in the 25665 * new header. 25666 */ 25667 if (DB_CRED(mp) != NULL) 25668 mblk_setcred(mp1, DB_CRED(mp)); 25669 if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { 25670 ip_process(proc, &mp1, ill_index); 25671 if (mp1 == NULL) 25672 return (NULL); 25673 25674 if (mp1->b_cont == NULL) 25675 ipha = NULL; 25676 else 25677 ipha = (ipha_t *)mp1->b_cont->b_rptr; 25678 } 25679 } 25680 25681 *iphap = ipha; 25682 return (mp1); 25683 #undef rptr 25684 } 25685 25686 /* 25687 * Finish the outbound IPsec processing for an IPv6 packet. This function 25688 * is called from ipsec_out_process() if the IPsec packet was processed 25689 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25690 * asynchronously. 
25691 */ 25692 void 25693 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 25694 ire_t *ire_arg) 25695 { 25696 in6_addr_t *v6dstp; 25697 ire_t *ire; 25698 mblk_t *mp; 25699 ip6_t *ip6h1; 25700 uint_t ill_index; 25701 ipsec_out_t *io; 25702 boolean_t attach_if, hwaccel; 25703 uint32_t flags = IP6_NO_IPPOLICY; 25704 int match_flags; 25705 zoneid_t zoneid; 25706 boolean_t ill_need_rele = B_FALSE; 25707 boolean_t ire_need_rele = B_FALSE; 25708 ip_stack_t *ipst; 25709 25710 mp = ipsec_mp->b_cont; 25711 ip6h1 = (ip6_t *)mp->b_rptr; 25712 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25713 ASSERT(io->ipsec_out_ns != NULL); 25714 ipst = io->ipsec_out_ns->netstack_ip; 25715 ill_index = io->ipsec_out_ill_index; 25716 if (io->ipsec_out_reachable) { 25717 flags |= IPV6_REACHABILITY_CONFIRMATION; 25718 } 25719 attach_if = io->ipsec_out_attach_if; 25720 hwaccel = io->ipsec_out_accelerated; 25721 zoneid = io->ipsec_out_zoneid; 25722 ASSERT(zoneid != ALL_ZONES); 25723 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 25724 /* Multicast addresses should have non-zero ill_index. */ 25725 v6dstp = &ip6h->ip6_dst; 25726 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 25727 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 25728 ASSERT(!attach_if || ill_index != 0); 25729 if (ill_index != 0) { 25730 if (ill == NULL) { 25731 ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, 25732 B_TRUE, ipst); 25733 25734 /* Failure case frees things for us. */ 25735 if (ill == NULL) 25736 return; 25737 25738 ill_need_rele = B_TRUE; 25739 } 25740 /* 25741 * If this packet needs to go out on a particular interface 25742 * honor it. 25743 */ 25744 if (attach_if) { 25745 match_flags = MATCH_IRE_ILL; 25746 25747 /* 25748 * Check if we need an ire that will not be 25749 * looked up by anybody else i.e. HIDDEN. 
25750 */ 25751 if (ill_is_probeonly(ill)) { 25752 match_flags |= MATCH_IRE_MARK_HIDDEN; 25753 } 25754 } 25755 } 25756 ASSERT(mp != NULL); 25757 25758 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 25759 boolean_t unspec_src; 25760 ipif_t *ipif; 25761 25762 /* 25763 * Use the ill_index to get the right ill. 25764 */ 25765 unspec_src = io->ipsec_out_unspec_src; 25766 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 25767 if (ipif == NULL) { 25768 if (ill_need_rele) 25769 ill_refrele(ill); 25770 freemsg(ipsec_mp); 25771 return; 25772 } 25773 25774 if (ire_arg != NULL) { 25775 ire = ire_arg; 25776 } else { 25777 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25778 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25779 ire_need_rele = B_TRUE; 25780 } 25781 if (ire != NULL) { 25782 ipif_refrele(ipif); 25783 /* 25784 * XXX Do the multicast forwarding now, as the IPsec 25785 * processing has been done. 25786 */ 25787 goto send; 25788 } 25789 25790 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 25791 mp->b_prev = NULL; 25792 mp->b_next = NULL; 25793 25794 /* 25795 * If the IPsec packet was processed asynchronously, 25796 * drop it now. 
25797 */ 25798 if (q == NULL) { 25799 if (ill_need_rele) 25800 ill_refrele(ill); 25801 freemsg(ipsec_mp); 25802 return; 25803 } 25804 25805 ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, 25806 unspec_src, zoneid); 25807 ipif_refrele(ipif); 25808 } else { 25809 if (attach_if) { 25810 ipif_t *ipif; 25811 25812 ipif = ipif_get_next_ipif(NULL, ill); 25813 if (ipif == NULL) { 25814 if (ill_need_rele) 25815 ill_refrele(ill); 25816 freemsg(ipsec_mp); 25817 return; 25818 } 25819 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25820 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25821 ire_need_rele = B_TRUE; 25822 ipif_refrele(ipif); 25823 } else { 25824 if (ire_arg != NULL) { 25825 ire = ire_arg; 25826 } else { 25827 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, 25828 ipst); 25829 ire_need_rele = B_TRUE; 25830 } 25831 } 25832 if (ire != NULL) 25833 goto send; 25834 /* 25835 * ire disappeared underneath. 25836 * 25837 * What we need to do here is the ip_newroute 25838 * logic to get the ire without doing the IPsec 25839 * processing. Follow the same old path. But this 25840 * time, ip_wput or ire_add_then_send will call us 25841 * directly as all the IPsec operations are done. 25842 */ 25843 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 25844 mp->b_prev = NULL; 25845 mp->b_next = NULL; 25846 25847 /* 25848 * If the IPsec packet was processed asynchronously, 25849 * drop it now. 
25850 */ 25851 if (q == NULL) { 25852 if (ill_need_rele) 25853 ill_refrele(ill); 25854 freemsg(ipsec_mp); 25855 return; 25856 } 25857 25858 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 25859 zoneid, ipst); 25860 } 25861 if (ill != NULL && ill_need_rele) 25862 ill_refrele(ill); 25863 return; 25864 send: 25865 if (ill != NULL && ill_need_rele) 25866 ill_refrele(ill); 25867 25868 /* Local delivery */ 25869 if (ire->ire_stq == NULL) { 25870 ill_t *out_ill; 25871 ASSERT(q != NULL); 25872 25873 /* PFHooks: LOOPBACK_OUT */ 25874 out_ill = ire_to_ill(ire); 25875 25876 /* 25877 * DTrace this as ip:::send. A blocked packet will fire the 25878 * send probe, but not the receive probe. 25879 */ 25880 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 25881 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, 25882 ipha_t *, NULL, ip6_t *, ip6h, int, 1); 25883 25884 DTRACE_PROBE4(ip6__loopback__out__start, 25885 ill_t *, NULL, ill_t *, out_ill, 25886 ip6_t *, ip6h1, mblk_t *, ipsec_mp); 25887 25888 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 25889 ipst->ips_ipv6firewall_loopback_out, 25890 NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); 25891 25892 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); 25893 25894 if (ipsec_mp != NULL) 25895 ip_wput_local_v6(RD(q), out_ill, 25896 ip6h, ipsec_mp, ire, 0); 25897 if (ire_need_rele) 25898 ire_refrele(ire); 25899 return; 25900 } 25901 /* 25902 * Everything is done. Send it out on the wire. 25903 * We force the insertion of a fragment header using the 25904 * IPH_FRAG_HDR flag in two cases: 25905 * - after reception of an ICMPv6 "packet too big" message 25906 * with a MTU < 1280 (cf. RFC 2460 section 5) 25907 * - for multirouted IPv6 packets, so that the receiver can 25908 * discard duplicates according to their fragment identifier 25909 */ 25910 /* XXX fix flow control problems. 
*/ 25911 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 25912 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 25913 if (hwaccel) { 25914 /* 25915 * hardware acceleration does not handle these 25916 * "slow path" cases. 25917 */ 25918 /* IPsec KSTATS: should bump bean counter here. */ 25919 if (ire_need_rele) 25920 ire_refrele(ire); 25921 freemsg(ipsec_mp); 25922 return; 25923 } 25924 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 25925 (mp->b_cont ? msgdsize(mp) : 25926 mp->b_wptr - (uchar_t *)ip6h)) { 25927 /* IPsec KSTATS: should bump bean counter here. */ 25928 ip0dbg(("Packet length mismatch: %d, %ld\n", 25929 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 25930 msgdsize(mp))); 25931 if (ire_need_rele) 25932 ire_refrele(ire); 25933 freemsg(ipsec_mp); 25934 return; 25935 } 25936 ASSERT(mp->b_prev == NULL); 25937 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 25938 ntohs(ip6h->ip6_plen) + 25939 IPV6_HDR_LEN, ire->ire_max_frag)); 25940 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 25941 ire->ire_max_frag); 25942 } else { 25943 UPDATE_OB_PKT_COUNT(ire); 25944 ire->ire_last_used_time = lbolt; 25945 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); 25946 } 25947 if (ire_need_rele) 25948 ire_refrele(ire); 25949 freeb(ipsec_mp); 25950 } 25951 25952 void 25953 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 25954 { 25955 mblk_t *hada_mp; /* attributes M_CTL mblk */ 25956 da_ipsec_t *hada; /* data attributes */ 25957 ill_t *ill = (ill_t *)q->q_ptr; 25958 25959 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 25960 25961 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 25962 /* IPsec KSTATS: Bump lose counter here! */ 25963 freemsg(mp); 25964 return; 25965 } 25966 25967 /* 25968 * It's an IPsec packet that must be 25969 * accelerated by the Provider, and the 25970 * outbound ill is IPsec acceleration capable. 25971 * Prepends the mblk with an IPHADA_M_CTL, and ship it 25972 * to the ill. 
25973 * IPsec KSTATS: should bump packet counter here. 25974 */ 25975 25976 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 25977 if (hada_mp == NULL) { 25978 /* IPsec KSTATS: should bump packet counter here. */ 25979 freemsg(mp); 25980 return; 25981 } 25982 25983 hada_mp->b_datap->db_type = M_CTL; 25984 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 25985 hada_mp->b_cont = mp; 25986 25987 hada = (da_ipsec_t *)hada_mp->b_rptr; 25988 bzero(hada, sizeof (da_ipsec_t)); 25989 hada->da_type = IPHADA_M_CTL; 25990 25991 putnext(q, hada_mp); 25992 } 25993 25994 /* 25995 * Finish the outbound IPsec processing. This function is called from 25996 * ipsec_out_process() if the IPsec packet was processed 25997 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25998 * asynchronously. 25999 */ 26000 void 26001 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 26002 ire_t *ire_arg) 26003 { 26004 uint32_t v_hlen_tos_len; 26005 ipaddr_t dst; 26006 ipif_t *ipif = NULL; 26007 ire_t *ire; 26008 ire_t *ire1 = NULL; 26009 mblk_t *next_mp = NULL; 26010 uint32_t max_frag; 26011 boolean_t multirt_send = B_FALSE; 26012 mblk_t *mp; 26013 ipha_t *ipha1; 26014 uint_t ill_index; 26015 ipsec_out_t *io; 26016 boolean_t attach_if; 26017 int match_flags; 26018 irb_t *irb = NULL; 26019 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 26020 zoneid_t zoneid; 26021 ipxmit_state_t pktxmit_state; 26022 ip_stack_t *ipst; 26023 26024 #ifdef _BIG_ENDIAN 26025 #define LENGTH (v_hlen_tos_len & 0xFFFF) 26026 #else 26027 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 26028 #endif 26029 26030 mp = ipsec_mp->b_cont; 26031 ipha1 = (ipha_t *)mp->b_rptr; 26032 ASSERT(mp != NULL); 26033 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 26034 dst = ipha->ipha_dst; 26035 26036 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26037 ill_index = io->ipsec_out_ill_index; 26038 attach_if = io->ipsec_out_attach_if; 26039 zoneid = io->ipsec_out_zoneid; 26040 
ASSERT(zoneid != ALL_ZONES); 26041 ipst = io->ipsec_out_ns->netstack_ip; 26042 ASSERT(io->ipsec_out_ns != NULL); 26043 26044 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 26045 if (ill_index != 0) { 26046 if (ill == NULL) { 26047 ill = ip_grab_attach_ill(NULL, ipsec_mp, 26048 ill_index, B_FALSE, ipst); 26049 26050 /* Failure case frees things for us. */ 26051 if (ill == NULL) 26052 return; 26053 26054 ill_need_rele = B_TRUE; 26055 } 26056 /* 26057 * If this packet needs to go out on a particular interface 26058 * honor it. 26059 */ 26060 if (attach_if) { 26061 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 26062 26063 /* 26064 * Check if we need an ire that will not be 26065 * looked up by anybody else i.e. HIDDEN. 26066 */ 26067 if (ill_is_probeonly(ill)) { 26068 match_flags |= MATCH_IRE_MARK_HIDDEN; 26069 } 26070 } 26071 } 26072 26073 if (CLASSD(dst)) { 26074 boolean_t conn_dontroute; 26075 /* 26076 * Use the ill_index to get the right ipif. 26077 */ 26078 conn_dontroute = io->ipsec_out_dontroute; 26079 if (ill_index == 0) 26080 ipif = ipif_lookup_group(dst, zoneid, ipst); 26081 else 26082 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 26083 if (ipif == NULL) { 26084 ip1dbg(("ip_wput_ipsec_out: No ipif for" 26085 " multicast\n")); 26086 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 26087 freemsg(ipsec_mp); 26088 goto done; 26089 } 26090 /* 26091 * ipha_src has already been intialized with the 26092 * value of the ipif in ip_wput. All we need now is 26093 * an ire to send this downstream. 26094 */ 26095 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 26096 MBLK_GETLABEL(mp), match_flags, ipst); 26097 if (ire != NULL) { 26098 ill_t *ill1; 26099 /* 26100 * Do the multicast forwarding now, as the IPsec 26101 * processing has been done. 
26102 */ 26103 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 26104 (ill1 = ire_to_ill(ire))) { 26105 if (ip_mforward(ill1, ipha, mp)) { 26106 freemsg(ipsec_mp); 26107 ip1dbg(("ip_wput_ipsec_out: mforward " 26108 "failed\n")); 26109 ire_refrele(ire); 26110 goto done; 26111 } 26112 } 26113 goto send; 26114 } 26115 26116 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 26117 mp->b_prev = NULL; 26118 mp->b_next = NULL; 26119 26120 /* 26121 * If the IPsec packet was processed asynchronously, 26122 * drop it now. 26123 */ 26124 if (q == NULL) { 26125 freemsg(ipsec_mp); 26126 goto done; 26127 } 26128 26129 /* 26130 * We may be using a wrong ipif to create the ire. 26131 * But it is okay as the source address is assigned 26132 * for the packet already. Next outbound packet would 26133 * create the IRE with the right IPIF in ip_wput. 26134 * 26135 * Also handle RTF_MULTIRT routes. 26136 */ 26137 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, 26138 zoneid, &zero_info); 26139 } else { 26140 if (attach_if) { 26141 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 26142 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 26143 } else { 26144 if (ire_arg != NULL) { 26145 ire = ire_arg; 26146 ire_need_rele = B_FALSE; 26147 } else { 26148 ire = ire_cache_lookup(dst, zoneid, 26149 MBLK_GETLABEL(mp), ipst); 26150 } 26151 } 26152 if (ire != NULL) { 26153 goto send; 26154 } 26155 26156 /* 26157 * ire disappeared underneath. 26158 * 26159 * What we need to do here is the ip_newroute 26160 * logic to get the ire without doing the IPsec 26161 * processing. Follow the same old path. But this 26162 * time, ip_wput or ire_add_then_put will call us 26163 * directly as all the IPsec operations are done. 26164 */ 26165 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 26166 mp->b_prev = NULL; 26167 mp->b_next = NULL; 26168 26169 /* 26170 * If the IPsec packet was processed asynchronously, 26171 * drop it now. 
26172 */ 26173 if (q == NULL) { 26174 freemsg(ipsec_mp); 26175 goto done; 26176 } 26177 26178 /* 26179 * Since we're going through ip_newroute() again, we 26180 * need to make sure we don't: 26181 * 26182 * 1.) Trigger the ASSERT() with the ipha_ident 26183 * overloading. 26184 * 2.) Redo transport-layer checksumming, since we've 26185 * already done all that to get this far. 26186 * 26187 * The easiest way not do either of the above is to set 26188 * the ipha_ident field to IP_HDR_INCLUDED. 26189 */ 26190 ipha->ipha_ident = IP_HDR_INCLUDED; 26191 ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL), 26192 zoneid, ipst); 26193 } 26194 goto done; 26195 send: 26196 if (ire->ire_stq == NULL) { 26197 ill_t *out_ill; 26198 /* 26199 * Loopbacks go through ip_wput_local except for one case. 26200 * We come here if we generate a icmp_frag_needed message 26201 * after IPsec processing is over. When this function calls 26202 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 26203 * icmp_frag_needed. The message generated comes back here 26204 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 26205 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 26206 * source address as it is usually set in ip_wput_ire. As 26207 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 26208 * and we end up here. We can't enter ip_wput_ire once the 26209 * IPsec processing is over and hence we need to do it here. 26210 */ 26211 ASSERT(q != NULL); 26212 UPDATE_OB_PKT_COUNT(ire); 26213 ire->ire_last_used_time = lbolt; 26214 if (ipha->ipha_src == 0) 26215 ipha->ipha_src = ire->ire_src_addr; 26216 26217 /* PFHooks: LOOPBACK_OUT */ 26218 out_ill = ire_to_ill(ire); 26219 26220 /* 26221 * DTrace this as ip:::send. A blocked packet will fire the 26222 * send probe, but not the receive probe. 
26223 */ 26224 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 26225 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 26226 ipha_t *, ipha, ip6_t *, NULL, int, 1); 26227 26228 DTRACE_PROBE4(ip4__loopback__out__start, 26229 ill_t *, NULL, ill_t *, out_ill, 26230 ipha_t *, ipha1, mblk_t *, ipsec_mp); 26231 26232 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 26233 ipst->ips_ipv4firewall_loopback_out, 26234 NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); 26235 26236 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); 26237 26238 if (ipsec_mp != NULL) 26239 ip_wput_local(RD(q), out_ill, 26240 ipha, ipsec_mp, ire, 0, zoneid); 26241 if (ire_need_rele) 26242 ire_refrele(ire); 26243 goto done; 26244 } 26245 26246 if (ire->ire_max_frag < (unsigned int)LENGTH) { 26247 /* 26248 * We are through with IPsec processing. 26249 * Fragment this and send it on the wire. 26250 */ 26251 if (io->ipsec_out_accelerated) { 26252 /* 26253 * The packet has been accelerated but must 26254 * be fragmented. This should not happen 26255 * since AH and ESP must not accelerate 26256 * packets that need fragmentation, however 26257 * the configuration could have changed 26258 * since the AH or ESP processing. 26259 * Drop packet. 26260 * IPsec KSTATS: bump bean counter here. 26261 */ 26262 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 26263 "fragmented accelerated packet!\n")); 26264 freemsg(ipsec_mp); 26265 } else { 26266 ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst); 26267 } 26268 if (ire_need_rele) 26269 ire_refrele(ire); 26270 goto done; 26271 } 26272 26273 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 26274 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 26275 (void *)ire->ire_ipif, (void *)ipif)); 26276 26277 /* 26278 * Multiroute the secured packet, unless IPsec really 26279 * requires the packet to go out only through a particular 26280 * interface. 
26281 */ 26282 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 26283 ire_t *first_ire; 26284 irb = ire->ire_bucket; 26285 ASSERT(irb != NULL); 26286 /* 26287 * This ire has been looked up as the one that 26288 * goes through the given ipif; 26289 * make sure we do not omit any other multiroute ire 26290 * that may be present in the bucket before this one. 26291 */ 26292 IRB_REFHOLD(irb); 26293 for (first_ire = irb->irb_ire; 26294 first_ire != NULL; 26295 first_ire = first_ire->ire_next) { 26296 if ((first_ire->ire_flags & RTF_MULTIRT) && 26297 (first_ire->ire_addr == ire->ire_addr) && 26298 !(first_ire->ire_marks & 26299 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 26300 break; 26301 } 26302 } 26303 26304 if ((first_ire != NULL) && (first_ire != ire)) { 26305 /* 26306 * Don't change the ire if the packet must 26307 * be fragmented if sent via this new one. 26308 */ 26309 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 26310 IRE_REFHOLD(first_ire); 26311 if (ire_need_rele) 26312 ire_refrele(ire); 26313 else 26314 ire_need_rele = B_TRUE; 26315 ire = first_ire; 26316 } 26317 } 26318 IRB_REFRELE(irb); 26319 26320 multirt_send = B_TRUE; 26321 max_frag = ire->ire_max_frag; 26322 } else { 26323 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 26324 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 26325 "flag, attach_if %d\n", attach_if)); 26326 } 26327 } 26328 26329 /* 26330 * In most cases, the emission loop below is entered only once. 26331 * Only in the case where the ire holds the RTF_MULTIRT 26332 * flag, we loop to process all RTF_MULTIRT ires in the 26333 * bucket, and send the packet through all crossed 26334 * RTF_MULTIRT routes. 26335 */ 26336 do { 26337 if (multirt_send) { 26338 /* 26339 * ire1 holds here the next ire to process in the 26340 * bucket. If multirouting is expected, 26341 * any non-RTF_MULTIRT ire that has the 26342 * right destination address is ignored. 
26343 */ 26344 ASSERT(irb != NULL); 26345 IRB_REFHOLD(irb); 26346 for (ire1 = ire->ire_next; 26347 ire1 != NULL; 26348 ire1 = ire1->ire_next) { 26349 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 26350 continue; 26351 if (ire1->ire_addr != ire->ire_addr) 26352 continue; 26353 if (ire1->ire_marks & 26354 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 26355 continue; 26356 /* No loopback here */ 26357 if (ire1->ire_stq == NULL) 26358 continue; 26359 /* 26360 * Ensure we do not exceed the MTU 26361 * of the next route. 26362 */ 26363 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 26364 ip_multirt_bad_mtu(ire1, max_frag); 26365 continue; 26366 } 26367 26368 IRE_REFHOLD(ire1); 26369 break; 26370 } 26371 IRB_REFRELE(irb); 26372 if (ire1 != NULL) { 26373 /* 26374 * We are in a multiple send case, need to 26375 * make a copy of the packet. 26376 */ 26377 next_mp = copymsg(ipsec_mp); 26378 if (next_mp == NULL) { 26379 ire_refrele(ire1); 26380 ire1 = NULL; 26381 } 26382 } 26383 } 26384 /* 26385 * Everything is done. Send it out on the wire 26386 * 26387 * ip_xmit_v4 will call ip_wput_attach_llhdr and then 26388 * either send it on the wire or, in the case of 26389 * HW acceleration, call ipsec_hw_putnext. 26390 */ 26391 if (ire->ire_nce && 26392 ire->ire_nce->nce_state != ND_REACHABLE) { 26393 DTRACE_PROBE2(ip__wput__ipsec__bail, 26394 (ire_t *), ire, (mblk_t *), ipsec_mp); 26395 /* 26396 * If ire's link-layer is unresolved (this 26397 * would only happen if the incomplete ire 26398 * was added to cachetable via forwarding path) 26399 * don't bother going to ip_xmit_v4. Just drop the 26400 * packet. 26401 * There is a slight risk here, in that, if we 26402 * have the forwarding path create an incomplete 26403 * IRE, then until the IRE is completed, any 26404 * transmitted IPsec packets will be dropped 26405 * instead of being queued waiting for resolution. 
26406 * 26407 * But the likelihood of a forwarding packet and a wput 26408 * packet sending to the same dst at the same time 26409 * and there not yet be an ARP entry for it is small. 26410 * Furthermore, if this actually happens, it might 26411 * be likely that wput would generate multiple 26412 * packets (and forwarding would also have a train 26413 * of packets) for that destination. If this is 26414 * the case, some of them would have been dropped 26415 * anyway, since ARP only queues a few packets while 26416 * waiting for resolution 26417 * 26418 * NOTE: We should really call ip_xmit_v4, 26419 * and let it queue the packet and send the 26420 * ARP query and have ARP come back thus: 26421 * <ARP> ip_wput->ip_output->ip-wput_nondata-> 26422 * ip_xmit_v4->ip_wput_attach_llhdr + ipsec 26423 * hw accel work. But it's too complex to get 26424 * the IPsec hw acceleration approach to fit 26425 * well with ip_xmit_v4 doing ARP without 26426 * doing IPsec simplification. For now, we just 26427 * poke ip_xmit_v4 to trigger the arp resolve, so 26428 * that we can continue with the send on the next 26429 * attempt. 
26430 * 26431 * XXX THis should be revisited, when 26432 * the IPsec/IP interaction is cleaned up 26433 */ 26434 ip1dbg(("ip_wput_ipsec_out: ire is incomplete" 26435 " - dropping packet\n")); 26436 freemsg(ipsec_mp); 26437 /* 26438 * Call ip_xmit_v4() to trigger ARP query 26439 * in case the nce_state is ND_INITIAL 26440 */ 26441 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 26442 goto drop_pkt; 26443 } 26444 26445 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 26446 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1, 26447 mblk_t *, ipsec_mp); 26448 FW_HOOKS(ipst->ips_ip4_physical_out_event, 26449 ipst->ips_ipv4firewall_physical_out, NULL, 26450 ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst); 26451 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp); 26452 if (ipsec_mp == NULL) 26453 goto drop_pkt; 26454 26455 ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); 26456 pktxmit_state = ip_xmit_v4(mp, ire, 26457 (io->ipsec_out_accelerated ? io : NULL), B_FALSE); 26458 26459 if ((pktxmit_state == SEND_FAILED) || 26460 (pktxmit_state == LLHDR_RESLV_FAILED)) { 26461 26462 freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */ 26463 drop_pkt: 26464 BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib, 26465 ipIfStatsOutDiscards); 26466 if (ire_need_rele) 26467 ire_refrele(ire); 26468 if (ire1 != NULL) { 26469 ire_refrele(ire1); 26470 freemsg(next_mp); 26471 } 26472 goto done; 26473 } 26474 26475 freeb(ipsec_mp); 26476 if (ire_need_rele) 26477 ire_refrele(ire); 26478 26479 if (ire1 != NULL) { 26480 ire = ire1; 26481 ire_need_rele = B_TRUE; 26482 ASSERT(next_mp); 26483 ipsec_mp = next_mp; 26484 mp = ipsec_mp->b_cont; 26485 ire1 = NULL; 26486 next_mp = NULL; 26487 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26488 } else { 26489 multirt_send = B_FALSE; 26490 } 26491 } while (multirt_send); 26492 done: 26493 if (ill != NULL && ill_need_rele) 26494 ill_refrele(ill); 26495 if (ipif != NULL) 26496 ipif_refrele(ipif); 26497 } 26498 26499 /* 26500 * Get the ill 
corresponding to the specified ire, and compare its 26501 * capabilities with the protocol and algorithms specified by the 26502 * the SA obtained from ipsec_out. If they match, annotate the 26503 * ipsec_out structure to indicate that the packet needs acceleration. 26504 * 26505 * 26506 * A packet is eligible for outbound hardware acceleration if the 26507 * following conditions are satisfied: 26508 * 26509 * 1. the packet will not be fragmented 26510 * 2. the provider supports the algorithm 26511 * 3. there is no pending control message being exchanged 26512 * 4. snoop is not attached 26513 * 5. the destination address is not a broadcast or multicast address. 26514 * 26515 * Rationale: 26516 * - Hardware drivers do not support fragmentation with 26517 * the current interface. 26518 * - snoop, multicast, and broadcast may result in exposure of 26519 * a cleartext datagram. 26520 * We check all five of these conditions here. 26521 * 26522 * XXX would like to nuke "ire_t *" parameter here; problem is that 26523 * IRE is only way to figure out if a v4 address is a broadcast and 26524 * thus ineligible for acceleration... 26525 */ 26526 static void 26527 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 26528 { 26529 ipsec_out_t *io; 26530 mblk_t *data_mp; 26531 uint_t plen, overhead; 26532 ip_stack_t *ipst; 26533 26534 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 26535 return; 26536 26537 if (ill == NULL) 26538 return; 26539 ipst = ill->ill_ipst; 26540 /* 26541 * Destination address is a broadcast or multicast. Punt. 
26542 */ 26543 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 26544 IRE_LOCAL))) 26545 return; 26546 26547 data_mp = ipsec_mp->b_cont; 26548 26549 if (ill->ill_isv6) { 26550 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 26551 26552 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 26553 return; 26554 26555 plen = ip6h->ip6_plen; 26556 } else { 26557 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 26558 26559 if (CLASSD(ipha->ipha_dst)) 26560 return; 26561 26562 plen = ipha->ipha_length; 26563 } 26564 /* 26565 * Is there a pending DLPI control message being exchanged 26566 * between IP/IPsec and the DLS Provider? If there is, it 26567 * could be a SADB update, and the state of the DLS Provider 26568 * SADB might not be in sync with the SADB maintained by 26569 * IPsec. To avoid dropping packets or using the wrong keying 26570 * material, we do not accelerate this packet. 26571 */ 26572 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 26573 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26574 "ill_dlpi_pending! don't accelerate packet\n")); 26575 return; 26576 } 26577 26578 /* 26579 * Is the Provider in promiscous mode? If it does, we don't 26580 * accelerate the packet since it will bounce back up to the 26581 * listeners in the clear. 26582 */ 26583 if (ill->ill_promisc_on_phys) { 26584 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26585 "ill in promiscous mode, don't accelerate packet\n")); 26586 return; 26587 } 26588 26589 /* 26590 * Will the packet require fragmentation? 26591 */ 26592 26593 /* 26594 * IPsec ESP note: this is a pessimistic estimate, but the same 26595 * as is used elsewhere. 26596 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 26597 * + 2-byte trailer 26598 */ 26599 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? 
IPSEC_MAX_AH_HDR_SIZE : 26600 IPSEC_BASE_ESP_HDR_SIZE(sa); 26601 26602 if ((plen + overhead) > ill->ill_max_mtu) 26603 return; 26604 26605 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26606 26607 /* 26608 * Can the ill accelerate this IPsec protocol and algorithm 26609 * specified by the SA? 26610 */ 26611 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 26612 ill->ill_isv6, sa, ipst->ips_netstack)) { 26613 return; 26614 } 26615 26616 /* 26617 * Tell AH or ESP that the outbound ill is capable of 26618 * accelerating this packet. 26619 */ 26620 io->ipsec_out_is_capab_ill = B_TRUE; 26621 } 26622 26623 /* 26624 * Select which AH & ESP SA's to use (if any) for the outbound packet. 26625 * 26626 * If this function returns B_TRUE, the requested SA's have been filled 26627 * into the ipsec_out_*_sa pointers. 26628 * 26629 * If the function returns B_FALSE, the packet has been "consumed", most 26630 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 26631 * 26632 * The SA references created by the protocol-specific "select" 26633 * function will be released when the ipsec_mp is freed, thanks to the 26634 * ipsec_out_free destructor -- see spd.c. 26635 */ 26636 static boolean_t 26637 ipsec_out_select_sa(mblk_t *ipsec_mp) 26638 { 26639 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 26640 ipsec_out_t *io; 26641 ipsec_policy_t *pp; 26642 ipsec_action_t *ap; 26643 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26644 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26645 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26646 26647 if (!io->ipsec_out_secure) { 26648 /* 26649 * We came here by mistake. 26650 * Don't bother with ipsec processing 26651 * We should "discourage" this path in the future. 
26652 */ 26653 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26654 return (B_FALSE); 26655 } 26656 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26657 ASSERT((io->ipsec_out_policy != NULL) || 26658 (io->ipsec_out_act != NULL)); 26659 26660 ASSERT(io->ipsec_out_failed == B_FALSE); 26661 26662 /* 26663 * IPsec processing has started. 26664 */ 26665 io->ipsec_out_proc_begin = B_TRUE; 26666 ap = io->ipsec_out_act; 26667 if (ap == NULL) { 26668 pp = io->ipsec_out_policy; 26669 ASSERT(pp != NULL); 26670 ap = pp->ipsp_act; 26671 ASSERT(ap != NULL); 26672 } 26673 26674 /* 26675 * We have an action. now, let's select SA's. 26676 * (In the future, we can cache this in the conn_t..) 26677 */ 26678 if (ap->ipa_want_esp) { 26679 if (io->ipsec_out_esp_sa == NULL) { 26680 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 26681 IPPROTO_ESP); 26682 } 26683 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 26684 } 26685 26686 if (ap->ipa_want_ah) { 26687 if (io->ipsec_out_ah_sa == NULL) { 26688 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 26689 IPPROTO_AH); 26690 } 26691 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 26692 /* 26693 * The ESP and AH processing order needs to be preserved 26694 * when both protocols are required (ESP should be applied 26695 * before AH for an outbound packet). Force an ESP ACQUIRE 26696 * when both ESP and AH are required, and an AH ACQUIRE 26697 * is needed. 26698 */ 26699 if (ap->ipa_want_esp && need_ah_acquire) 26700 need_esp_acquire = B_TRUE; 26701 } 26702 26703 /* 26704 * Send an ACQUIRE (extended, regular, or both) if we need one. 26705 * Release SAs that got referenced, but will not be used until we 26706 * acquire _all_ of the SAs we need. 
26707 */ 26708 if (need_ah_acquire || need_esp_acquire) { 26709 if (io->ipsec_out_ah_sa != NULL) { 26710 IPSA_REFRELE(io->ipsec_out_ah_sa); 26711 io->ipsec_out_ah_sa = NULL; 26712 } 26713 if (io->ipsec_out_esp_sa != NULL) { 26714 IPSA_REFRELE(io->ipsec_out_esp_sa); 26715 io->ipsec_out_esp_sa = NULL; 26716 } 26717 26718 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 26719 return (B_FALSE); 26720 } 26721 26722 return (B_TRUE); 26723 } 26724 26725 /* 26726 * Process an IPSEC_OUT message and see what you can 26727 * do with it. 26728 * IPQoS Notes: 26729 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 26730 * IPsec. 26731 * XXX would like to nuke ire_t. 26732 * XXX ill_index better be "real" 26733 */ 26734 void 26735 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 26736 { 26737 ipsec_out_t *io; 26738 ipsec_policy_t *pp; 26739 ipsec_action_t *ap; 26740 ipha_t *ipha; 26741 ip6_t *ip6h; 26742 mblk_t *mp; 26743 ill_t *ill; 26744 zoneid_t zoneid; 26745 ipsec_status_t ipsec_rc; 26746 boolean_t ill_need_rele = B_FALSE; 26747 ip_stack_t *ipst; 26748 ipsec_stack_t *ipss; 26749 26750 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26751 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26752 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26753 ipst = io->ipsec_out_ns->netstack_ip; 26754 mp = ipsec_mp->b_cont; 26755 26756 /* 26757 * Initiate IPPF processing. We do it here to account for packets 26758 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 26759 * We can check for ipsec_out_proc_begin even for such packets, as 26760 * they will always be false (asserted below). 26761 */ 26762 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { 26763 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? 
26764 io->ipsec_out_ill_index : ill_index); 26765 if (mp == NULL) { 26766 ip2dbg(("ipsec_out_process: packet dropped "\ 26767 "during IPPF processing\n")); 26768 freeb(ipsec_mp); 26769 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26770 return; 26771 } 26772 } 26773 26774 if (!io->ipsec_out_secure) { 26775 /* 26776 * We came here by mistake. 26777 * Don't bother with ipsec processing 26778 * Should "discourage" this path in the future. 26779 */ 26780 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26781 goto done; 26782 } 26783 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26784 ASSERT((io->ipsec_out_policy != NULL) || 26785 (io->ipsec_out_act != NULL)); 26786 ASSERT(io->ipsec_out_failed == B_FALSE); 26787 26788 ipss = ipst->ips_netstack->netstack_ipsec; 26789 if (!ipsec_loaded(ipss)) { 26790 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 26791 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26792 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26793 } else { 26794 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 26795 } 26796 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 26797 DROPPER(ipss, ipds_ip_ipsec_not_loaded), 26798 &ipss->ipsec_dropper); 26799 return; 26800 } 26801 26802 /* 26803 * IPsec processing has started. 26804 */ 26805 io->ipsec_out_proc_begin = B_TRUE; 26806 ap = io->ipsec_out_act; 26807 if (ap == NULL) { 26808 pp = io->ipsec_out_policy; 26809 ASSERT(pp != NULL); 26810 ap = pp->ipsp_act; 26811 ASSERT(ap != NULL); 26812 } 26813 26814 /* 26815 * Save the outbound ill index. When the packet comes back 26816 * from IPsec, we make sure the ill hasn't changed or disappeared 26817 * before sending it the accelerated packet. 26818 */ 26819 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 26820 int ifindex; 26821 ill = ire_to_ill(ire); 26822 ifindex = ill->ill_phyint->phyint_ifindex; 26823 io->ipsec_out_capab_ill_index = ifindex; 26824 } 26825 26826 /* 26827 * The order of processing is first insert a IP header if needed. 
26828 * Then insert the ESP header and then the AH header. 26829 */ 26830 if ((io->ipsec_out_se_done == B_FALSE) && 26831 (ap->ipa_want_se)) { 26832 /* 26833 * First get the outer IP header before sending 26834 * it to ESP. 26835 */ 26836 ipha_t *oipha, *iipha; 26837 mblk_t *outer_mp, *inner_mp; 26838 26839 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { 26840 (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, 26841 "ipsec_out_process: " 26842 "Self-Encapsulation failed: Out of memory\n"); 26843 freemsg(ipsec_mp); 26844 if (ill != NULL) { 26845 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26846 } else { 26847 BUMP_MIB(&ipst->ips_ip_mib, 26848 ipIfStatsOutDiscards); 26849 } 26850 return; 26851 } 26852 inner_mp = ipsec_mp->b_cont; 26853 ASSERT(inner_mp->b_datap->db_type == M_DATA); 26854 oipha = (ipha_t *)outer_mp->b_rptr; 26855 iipha = (ipha_t *)inner_mp->b_rptr; 26856 *oipha = *iipha; 26857 outer_mp->b_wptr += sizeof (ipha_t); 26858 oipha->ipha_length = htons(ntohs(iipha->ipha_length) + 26859 sizeof (ipha_t)); 26860 oipha->ipha_protocol = IPPROTO_ENCAP; 26861 oipha->ipha_version_and_hdr_length = 26862 IP_SIMPLE_HDR_VERSION; 26863 oipha->ipha_hdr_checksum = 0; 26864 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); 26865 outer_mp->b_cont = inner_mp; 26866 ipsec_mp->b_cont = outer_mp; 26867 26868 io->ipsec_out_se_done = B_TRUE; 26869 io->ipsec_out_tunnel = B_TRUE; 26870 } 26871 26872 if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || 26873 (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && 26874 !ipsec_out_select_sa(ipsec_mp)) 26875 return; 26876 26877 /* 26878 * By now, we know what SA's to use. Toss over to ESP & AH 26879 * to do the heavy lifting. 
26880 */ 26881 zoneid = io->ipsec_out_zoneid; 26882 ASSERT(zoneid != ALL_ZONES); 26883 if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { 26884 ASSERT(io->ipsec_out_esp_sa != NULL); 26885 io->ipsec_out_esp_done = B_TRUE; 26886 /* 26887 * Note that since hw accel can only apply one transform, 26888 * not two, we skip hw accel for ESP if we also have AH 26889 * This is an design limitation of the interface 26890 * which should be revisited. 26891 */ 26892 ASSERT(ire != NULL); 26893 if (io->ipsec_out_ah_sa == NULL) { 26894 ill = (ill_t *)ire->ire_stq->q_ptr; 26895 ipsec_out_is_accelerated(ipsec_mp, 26896 io->ipsec_out_esp_sa, ill, ire); 26897 } 26898 26899 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 26900 switch (ipsec_rc) { 26901 case IPSEC_STATUS_SUCCESS: 26902 break; 26903 case IPSEC_STATUS_FAILED: 26904 if (ill != NULL) { 26905 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26906 } else { 26907 BUMP_MIB(&ipst->ips_ip_mib, 26908 ipIfStatsOutDiscards); 26909 } 26910 /* FALLTHRU */ 26911 case IPSEC_STATUS_PENDING: 26912 return; 26913 } 26914 } 26915 26916 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 26917 ASSERT(io->ipsec_out_ah_sa != NULL); 26918 io->ipsec_out_ah_done = B_TRUE; 26919 if (ire == NULL) { 26920 int idx = io->ipsec_out_capab_ill_index; 26921 ill = ill_lookup_on_ifindex(idx, B_FALSE, 26922 NULL, NULL, NULL, NULL, ipst); 26923 ill_need_rele = B_TRUE; 26924 } else { 26925 ill = (ill_t *)ire->ire_stq->q_ptr; 26926 } 26927 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 26928 ire); 26929 26930 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 26931 switch (ipsec_rc) { 26932 case IPSEC_STATUS_SUCCESS: 26933 break; 26934 case IPSEC_STATUS_FAILED: 26935 if (ill != NULL) { 26936 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26937 } else { 26938 BUMP_MIB(&ipst->ips_ip_mib, 26939 ipIfStatsOutDiscards); 26940 } 26941 /* FALLTHRU */ 26942 case IPSEC_STATUS_PENDING: 26943 if (ill != NULL && 
ill_need_rele) 26944 ill_refrele(ill); 26945 return; 26946 } 26947 } 26948 /* 26949 * We are done with IPsec processing. Send it over 26950 * the wire. 26951 */ 26952 done: 26953 mp = ipsec_mp->b_cont; 26954 ipha = (ipha_t *)mp->b_rptr; 26955 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26956 ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); 26957 } else { 26958 ip6h = (ip6_t *)ipha; 26959 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); 26960 } 26961 if (ill != NULL && ill_need_rele) 26962 ill_refrele(ill); 26963 } 26964 26965 /* ARGSUSED */ 26966 void 26967 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 26968 { 26969 opt_restart_t *or; 26970 int err; 26971 conn_t *connp; 26972 26973 ASSERT(CONN_Q(q)); 26974 connp = Q_TO_CONN(q); 26975 26976 ASSERT(first_mp->b_datap->db_type == M_CTL); 26977 or = (opt_restart_t *)first_mp->b_rptr; 26978 /* 26979 * We don't need to pass any credentials here since this is just 26980 * a restart. The credentials are passed in when svr4_optcom_req 26981 * is called the first time (from ip_wput_nondata). 26982 */ 26983 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 26984 err = svr4_optcom_req(q, first_mp, NULL, 26985 &ip_opt_obj, B_FALSE); 26986 } else { 26987 ASSERT(or->or_type == T_OPTMGMT_REQ); 26988 err = tpi_optcom_req(q, first_mp, NULL, 26989 &ip_opt_obj, B_FALSE); 26990 } 26991 if (err != EINPROGRESS) { 26992 /* operation is done */ 26993 CONN_OPER_PENDING_DONE(connp); 26994 } 26995 } 26996 26997 /* 26998 * ioctls that go through a down/up sequence may need to wait for the down 26999 * to complete. This involves waiting for the ire and ipif refcnts to go down 27000 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
27001 */ 27002 /* ARGSUSED */ 27003 void 27004 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 27005 { 27006 struct iocblk *iocp; 27007 mblk_t *mp1; 27008 ip_ioctl_cmd_t *ipip; 27009 int err; 27010 sin_t *sin; 27011 struct lifreq *lifr; 27012 struct ifreq *ifr; 27013 27014 iocp = (struct iocblk *)mp->b_rptr; 27015 ASSERT(ipsq != NULL); 27016 /* Existence of mp1 verified in ip_wput_nondata */ 27017 mp1 = mp->b_cont->b_cont; 27018 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27019 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { 27020 /* 27021 * Special case where ipsq_current_ipif is not set: 27022 * ill_phyint_reinit merged the v4 and v6 into a single ipsq. 27023 * ill could also have become part of a ipmp group in the 27024 * process, we are here as were not able to complete the 27025 * operation in ipif_set_values because we could not become 27026 * exclusive on the new ipsq, In such a case ipsq_current_ipif 27027 * will not be set so we need to set it. 
27028 */ 27029 ill_t *ill = q->q_ptr; 27030 ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); 27031 } 27032 ASSERT(ipsq->ipsq_current_ipif != NULL); 27033 27034 if (ipip->ipi_cmd_type == IF_CMD) { 27035 /* This a old style SIOC[GS]IF* command */ 27036 ifr = (struct ifreq *)mp1->b_rptr; 27037 sin = (sin_t *)&ifr->ifr_addr; 27038 } else if (ipip->ipi_cmd_type == LIF_CMD) { 27039 /* This a new style SIOC[GS]LIF* command */ 27040 lifr = (struct lifreq *)mp1->b_rptr; 27041 sin = (sin_t *)&lifr->lifr_addr; 27042 } else { 27043 sin = NULL; 27044 } 27045 27046 err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, 27047 ipip, mp1->b_rptr); 27048 27049 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 27050 } 27051 27052 /* 27053 * ioctl processing 27054 * 27055 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up 27056 * the ioctl command in the ioctl tables, determines the copyin data size 27057 * from the ipi_copyin_size field, and does an mi_copyin() of that size. 27058 * 27059 * ioctl processing then continues when the M_IOCDATA makes its way down to 27060 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its 27061 * associated 'conn' is refheld till the end of the ioctl and the general 27062 * ioctl processing function ip_process_ioctl() is called to extract the 27063 * arguments and process the ioctl. To simplify extraction, ioctl commands 27064 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a 27065 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq()) 27066 * is used to extract the ioctl's arguments. 27067 * 27068 * ip_process_ioctl determines if the ioctl needs to be serialized, and if 27069 * so goes thru the serialization primitive ipsq_try_enter. Then the 27070 * appropriate function to handle the ioctl is called based on the entry in 27071 * the ioctl table. 
ioctl completion is encapsulated in ip_ioctl_finish 27072 * which also refreleases the 'conn' that was refheld at the start of the 27073 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq. 27074 * 27075 * Many exclusive ioctls go thru an internal down up sequence as part of 27076 * the operation. For example an attempt to change the IP address of an 27077 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface 27078 * does all the cleanup such as deleting all ires that use this address. 27079 * Then we need to wait till all references to the interface go away. 27080 */ 27081 void 27082 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 27083 { 27084 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 27085 ip_ioctl_cmd_t *ipip = arg; 27086 ip_extract_func_t *extract_funcp; 27087 cmd_info_t ci; 27088 int err; 27089 27090 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); 27091 27092 if (ipip == NULL) 27093 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27094 27095 /* 27096 * SIOCLIFADDIF needs to go thru a special path since the 27097 * ill may not exist yet. This happens in the case of lo0 27098 * which is created using this ioctl. 27099 */ 27100 if (ipip->ipi_cmd == SIOCLIFADDIF) { 27101 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); 27102 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27103 return; 27104 } 27105 27106 ci.ci_ipif = NULL; 27107 if (ipip->ipi_cmd_type == MISC_CMD) { 27108 /* 27109 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 
27110 */ 27111 if (ipip->ipi_cmd == IF_UNITSEL) { 27112 /* ioctl comes down the ill */ 27113 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif; 27114 ipif_refhold(ci.ci_ipif); 27115 } 27116 err = 0; 27117 ci.ci_sin = NULL; 27118 ci.ci_sin6 = NULL; 27119 ci.ci_lifr = NULL; 27120 } else { 27121 switch (ipip->ipi_cmd_type) { 27122 case IF_CMD: 27123 case LIF_CMD: 27124 extract_funcp = ip_extract_lifreq; 27125 break; 27126 27127 case ARP_CMD: 27128 case XARP_CMD: 27129 extract_funcp = ip_extract_arpreq; 27130 break; 27131 27132 case TUN_CMD: 27133 extract_funcp = ip_extract_tunreq; 27134 break; 27135 27136 case MSFILT_CMD: 27137 extract_funcp = ip_extract_msfilter; 27138 break; 27139 27140 default: 27141 ASSERT(0); 27142 } 27143 27144 err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); 27145 if (err != 0) { 27146 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27147 return; 27148 } 27149 27150 /* 27151 * All of the extraction functions return a refheld ipif. 27152 */ 27153 ASSERT(ci.ci_ipif != NULL); 27154 } 27155 27156 if (!(ipip->ipi_flags & IPI_WR)) { 27157 /* 27158 * A return value of EINPROGRESS means the ioctl is 27159 * either queued and waiting for some reason or has 27160 * already completed. 27161 */ 27162 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 27163 ci.ci_lifr); 27164 if (ci.ci_ipif != NULL) 27165 ipif_refrele(ci.ci_ipif); 27166 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27167 return; 27168 } 27169 27170 /* 27171 * If ipsq is non-null, we are already being called exclusively on an 27172 * ill but in the case of a failover in progress it is the "from" ill, 27173 * rather than the "to" ill (which is the ill ptr passed in). 27174 * In order to ensure we are exclusive on both ILLs we rerun 27175 * ipsq_try_enter() here, ipsq's support recursive entry. 
27176 */ 27177 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 27178 ASSERT(ci.ci_ipif != NULL); 27179 27180 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, 27181 NEW_OP, B_TRUE); 27182 27183 /* 27184 * Release the ipif so that ipif_down and friends that wait for 27185 * references to go away are not misled about the current ipif_refcnt 27186 * values. We are writer so we can access the ipif even after releasing 27187 * the ipif. 27188 */ 27189 ipif_refrele(ci.ci_ipif); 27190 if (ipsq == NULL) 27191 return; 27192 27193 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); 27194 27195 /* 27196 * For most set ioctls that come here, this serves as a single point 27197 * where we set the IPIF_CHANGING flag. This ensures that there won't 27198 * be any new references to the ipif. This helps functions that go 27199 * through this path and end up trying to wait for the refcnts 27200 * associated with the ipif to go down to zero. Some exceptions are 27201 * Failover, Failback, and Groupname commands that operate on more than 27202 * just the ci.ci_ipif. These commands internally determine the 27203 * set of ipif's they operate on and set and clear the IPIF_CHANGING 27204 * flags on that set. Another exception is the Removeif command that 27205 * sets the IPIF_CONDEMNED flag internally after identifying the right 27206 * ipif to operate on. 27207 */ 27208 mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); 27209 if (ipip->ipi_cmd != SIOCLIFREMOVEIF && 27210 ipip->ipi_cmd != SIOCLIFFAILOVER && 27211 ipip->ipi_cmd != SIOCLIFFAILBACK && 27212 ipip->ipi_cmd != SIOCSLIFGROUPNAME) 27213 (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; 27214 mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); 27215 27216 /* 27217 * A return value of EINPROGRESS means the ioctl is 27218 * either queued and waiting for some reason or has 27219 * already completed. 
27220 */ 27221 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); 27222 27223 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 27224 27225 ipsq_exit(ipsq); 27226 } 27227 27228 /* 27229 * Complete the ioctl. Typically ioctls use the mi package and need to 27230 * do mi_copyout/mi_copy_done. 27231 */ 27232 void 27233 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) 27234 { 27235 conn_t *connp = NULL; 27236 27237 if (err == EINPROGRESS) 27238 return; 27239 27240 if (CONN_Q(q)) { 27241 connp = Q_TO_CONN(q); 27242 ASSERT(connp->conn_ref >= 2); 27243 } 27244 27245 switch (mode) { 27246 case COPYOUT: 27247 if (err == 0) 27248 mi_copyout(q, mp); 27249 else 27250 mi_copy_done(q, mp, err); 27251 break; 27252 27253 case NO_COPYOUT: 27254 mi_copy_done(q, mp, err); 27255 break; 27256 27257 default: 27258 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */ 27259 break; 27260 } 27261 27262 /* 27263 * The refhold placed at the start of the ioctl is released here. 27264 */ 27265 if (connp != NULL) 27266 CONN_OPER_PENDING_DONE(connp); 27267 27268 if (ipsq != NULL) 27269 ipsq_current_finish(ipsq); 27270 } 27271 27272 /* 27273 * This is called from ip_wput_nondata to resume a deferred TCP bind. 
27274 */ 27275 /* ARGSUSED */ 27276 void 27277 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) 27278 { 27279 conn_t *connp = arg; 27280 tcp_t *tcp; 27281 27282 ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); 27283 tcp = connp->conn_tcp; 27284 27285 if (connp->conn_tcp->tcp_state == TCPS_CLOSED) 27286 freemsg(mp); 27287 else 27288 tcp_rput_other(tcp, mp); 27289 CONN_OPER_PENDING_DONE(connp); 27290 } 27291 27292 /* Called from ip_wput for all non data messages */ 27293 /* ARGSUSED */ 27294 void 27295 ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 27296 { 27297 mblk_t *mp1; 27298 ire_t *ire, *fake_ire; 27299 ill_t *ill; 27300 struct iocblk *iocp; 27301 ip_ioctl_cmd_t *ipip; 27302 cred_t *cr; 27303 conn_t *connp; 27304 int err; 27305 nce_t *nce; 27306 ipif_t *ipif; 27307 ip_stack_t *ipst; 27308 char *proto_str; 27309 27310 if (CONN_Q(q)) { 27311 connp = Q_TO_CONN(q); 27312 ipst = connp->conn_netstack->netstack_ip; 27313 } else { 27314 connp = NULL; 27315 ipst = ILLQ_TO_IPST(q); 27316 } 27317 27318 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q)); 27319 27320 switch (DB_TYPE(mp)) { 27321 case M_IOCTL: 27322 /* 27323 * IOCTL processing begins in ip_sioctl_copyin_setup which 27324 * will arrange to copy in associated control structures. 27325 */ 27326 ip_sioctl_copyin_setup(q, mp); 27327 return; 27328 case M_IOCDATA: 27329 /* 27330 * Ensure that this is associated with one of our trans- 27331 * parent ioctls. If it's not ours, discard it if we're 27332 * running as a driver, or pass it on if we're a module. 
27333 */ 27334 iocp = (struct iocblk *)mp->b_rptr; 27335 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27336 if (ipip == NULL) { 27337 if (q->q_next == NULL) { 27338 goto nak; 27339 } else { 27340 putnext(q, mp); 27341 } 27342 return; 27343 } 27344 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 27345 /* 27346 * the ioctl is one we recognise, but is not 27347 * consumed by IP as a module, pass M_IOCDATA 27348 * for processing downstream, but only for 27349 * common Streams ioctls. 27350 */ 27351 if (ipip->ipi_flags & IPI_PASS_DOWN) { 27352 putnext(q, mp); 27353 return; 27354 } else { 27355 goto nak; 27356 } 27357 } 27358 27359 /* IOCTL continuation following copyin or copyout. */ 27360 if (mi_copy_state(q, mp, NULL) == -1) { 27361 /* 27362 * The copy operation failed. mi_copy_state already 27363 * cleaned up, so we're out of here. 27364 */ 27365 return; 27366 } 27367 /* 27368 * If we just completed a copy in, we become writer and 27369 * continue processing in ip_sioctl_copyin_done. If it 27370 * was a copy out, we call mi_copyout again. If there is 27371 * nothing more to copy out, it will complete the IOCTL. 27372 */ 27373 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) { 27374 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { 27375 mi_copy_done(q, mp, EPROTO); 27376 return; 27377 } 27378 /* 27379 * Check for cases that need more copying. A return 27380 * value of 0 means a second copyin has been started, 27381 * so we return; a return value of 1 means no more 27382 * copying is needed, so we continue. 27383 */ 27384 if (ipip->ipi_cmd_type == MSFILT_CMD && 27385 MI_COPY_COUNT(mp) == 1) { 27386 if (ip_copyin_msfilter(q, mp) == 0) 27387 return; 27388 } 27389 /* 27390 * Refhold the conn, till the ioctl completes. This is 27391 * needed in case the ioctl ends up in the pending mp 27392 * list. Every mp in the ill_pending_mp list and 27393 * the ipsq_pending_mp must have a refhold on the conn 27394 * to resume processing. 
The refhold is released when 27395 * the ioctl completes. (normally or abnormally) 27396 * In all cases ip_ioctl_finish is called to finish 27397 * the ioctl. 27398 */ 27399 if (connp != NULL) { 27400 /* This is not a reentry */ 27401 ASSERT(ipsq == NULL); 27402 CONN_INC_REF(connp); 27403 } else { 27404 if (!(ipip->ipi_flags & IPI_MODOK)) { 27405 mi_copy_done(q, mp, EINVAL); 27406 return; 27407 } 27408 } 27409 27410 ip_process_ioctl(ipsq, q, mp, ipip); 27411 27412 } else { 27413 mi_copyout(q, mp); 27414 } 27415 return; 27416 nak: 27417 iocp->ioc_error = EINVAL; 27418 mp->b_datap->db_type = M_IOCNAK; 27419 iocp->ioc_count = 0; 27420 qreply(q, mp); 27421 return; 27422 27423 case M_IOCNAK: 27424 /* 27425 * The only way we could get here is if a resolver didn't like 27426 * an IOCTL we sent it. This shouldn't happen. 27427 */ 27428 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 27429 "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", 27430 ((struct iocblk *)mp->b_rptr)->ioc_cmd); 27431 freemsg(mp); 27432 return; 27433 case M_IOCACK: 27434 /* /dev/ip shouldn't see this */ 27435 if (CONN_Q(q)) 27436 goto nak; 27437 27438 /* Finish socket ioctls passed through to ARP. */ 27439 ip_sioctl_iocack(q, mp); 27440 return; 27441 case M_FLUSH: 27442 if (*mp->b_rptr & FLUSHW) 27443 flushq(q, FLUSHALL); 27444 if (q->q_next) { 27445 putnext(q, mp); 27446 return; 27447 } 27448 if (*mp->b_rptr & FLUSHR) { 27449 *mp->b_rptr &= ~FLUSHW; 27450 qreply(q, mp); 27451 return; 27452 } 27453 freemsg(mp); 27454 return; 27455 case IRE_DB_REQ_TYPE: 27456 if (connp == NULL) { 27457 proto_str = "IRE_DB_REQ_TYPE"; 27458 goto protonak; 27459 } 27460 /* An Upper Level Protocol wants a copy of an IRE. 
*/ 27461 ip_ire_req(q, mp); 27462 return; 27463 case M_CTL: 27464 if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) 27465 break; 27466 27467 if (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == 27468 TUN_HELLO) { 27469 ASSERT(connp != NULL); 27470 connp->conn_flags |= IPCL_IPTUN; 27471 freeb(mp); 27472 return; 27473 } 27474 27475 /* M_CTL messages are used by ARP to tell us things. */ 27476 if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) 27477 break; 27478 switch (((arc_t *)mp->b_rptr)->arc_cmd) { 27479 case AR_ENTRY_SQUERY: 27480 ip_wput_ctl(q, mp); 27481 return; 27482 case AR_CLIENT_NOTIFY: 27483 ip_arp_news(q, mp); 27484 return; 27485 case AR_DLPIOP_DONE: 27486 ASSERT(q->q_next != NULL); 27487 ill = (ill_t *)q->q_ptr; 27488 /* qwriter_ip releases the refhold */ 27489 /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ 27490 ill_refhold(ill); 27491 qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); 27492 return; 27493 case AR_ARP_CLOSING: 27494 /* 27495 * ARP (above us) is closing. If no ARP bringup is 27496 * currently pending, ack the message so that ARP 27497 * can complete its close. Also mark ill_arp_closing 27498 * so that new ARP bringups will fail. If any 27499 * ARP bringup is currently in progress, we will 27500 * ack this when the current ARP bringup completes. 27501 */ 27502 ASSERT(q->q_next != NULL); 27503 ill = (ill_t *)q->q_ptr; 27504 mutex_enter(&ill->ill_lock); 27505 ill->ill_arp_closing = 1; 27506 if (!ill->ill_arp_bringup_pending) { 27507 mutex_exit(&ill->ill_lock); 27508 qreply(q, mp); 27509 } else { 27510 mutex_exit(&ill->ill_lock); 27511 freemsg(mp); 27512 } 27513 return; 27514 case AR_ARP_EXTEND: 27515 /* 27516 * The ARP module above us is capable of duplicate 27517 * address detection. Old ATM drivers will not send 27518 * this message. 
27519 */ 27520 ASSERT(q->q_next != NULL); 27521 ill = (ill_t *)q->q_ptr; 27522 ill->ill_arp_extend = B_TRUE; 27523 freemsg(mp); 27524 return; 27525 default: 27526 break; 27527 } 27528 break; 27529 case M_PROTO: 27530 case M_PCPROTO: 27531 /* 27532 * The only PROTO messages we expect are ULP binds and 27533 * copies of option negotiation acknowledgements. 27534 */ 27535 switch (((union T_primitives *)mp->b_rptr)->type) { 27536 case O_T_BIND_REQ: 27537 case T_BIND_REQ: { 27538 /* Request can get queued in bind */ 27539 if (connp == NULL) { 27540 proto_str = "O_T_BIND_REQ/T_BIND_REQ"; 27541 goto protonak; 27542 } 27543 /* 27544 * The transports except SCTP call ip_bind_{v4,v6}() 27545 * directly instead of a a putnext. SCTP doesn't 27546 * generate any T_BIND_REQ since it has its own 27547 * fanout data structures. However, ESP and AH 27548 * come in for regular binds; all other cases are 27549 * bind retries. 27550 */ 27551 ASSERT(!IPCL_IS_SCTP(connp)); 27552 27553 /* Don't increment refcnt if this is a re-entry */ 27554 if (ipsq == NULL) 27555 CONN_INC_REF(connp); 27556 27557 mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, 27558 connp, NULL) : ip_bind_v4(q, mp, connp); 27559 if (mp == NULL) 27560 return; 27561 if (IPCL_IS_TCP(connp)) { 27562 /* 27563 * In the case of TCP endpoint we 27564 * come here only for bind retries 27565 */ 27566 ASSERT(ipsq != NULL); 27567 CONN_INC_REF(connp); 27568 squeue_fill(connp->conn_sqp, mp, 27569 ip_resume_tcp_bind, connp, 27570 SQTAG_BIND_RETRY); 27571 } else if (IPCL_IS_UDP(connp)) { 27572 /* 27573 * In the case of UDP endpoint we 27574 * come here only for bind retries 27575 */ 27576 ASSERT(ipsq != NULL); 27577 udp_resume_bind(connp, mp); 27578 } else if (IPCL_IS_RAWIP(connp)) { 27579 /* 27580 * In the case of RAWIP endpoint we 27581 * come here only for bind retries 27582 */ 27583 ASSERT(ipsq != NULL); 27584 rawip_resume_bind(connp, mp); 27585 } else { 27586 /* The case of AH and ESP */ 27587 qreply(q, mp); 27588 CONN_OPER_PENDING_DONE(connp); 27589 } 27590 return; 27591 } 27592 case T_SVR4_OPTMGMT_REQ: 27593 ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", 27594 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); 27595 27596 if (connp == NULL) { 27597 proto_str = "T_SVR4_OPTMGMT_REQ"; 27598 goto protonak; 27599 } 27600 27601 if (!snmpcom_req(q, mp, ip_snmp_set, 27602 ip_snmp_get, cr)) { 27603 /* 27604 * Call svr4_optcom_req so that it can 27605 * generate the ack. We don't come here 27606 * if this operation is being restarted. 27607 * ip_restart_optmgmt will drop the conn ref. 27608 * In the case of ipsec option after the ipsec 27609 * load is complete conn_restart_ipsec_waiter 27610 * drops the conn ref. 
27611 */ 27612 ASSERT(ipsq == NULL); 27613 CONN_INC_REF(connp); 27614 if (ip_check_for_ipsec_opt(q, mp)) 27615 return; 27616 err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, 27617 B_FALSE); 27618 if (err != EINPROGRESS) { 27619 /* Operation is done */ 27620 CONN_OPER_PENDING_DONE(connp); 27621 } 27622 } 27623 return; 27624 case T_OPTMGMT_REQ: 27625 ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); 27626 /* 27627 * Note: No snmpcom_req support through new 27628 * T_OPTMGMT_REQ. 27629 * Call tpi_optcom_req so that it can 27630 * generate the ack. 27631 */ 27632 if (connp == NULL) { 27633 proto_str = "T_OPTMGMT_REQ"; 27634 goto protonak; 27635 } 27636 27637 ASSERT(ipsq == NULL); 27638 /* 27639 * We don't come here for restart. ip_restart_optmgmt 27640 * will drop the conn ref. In the case of ipsec option 27641 * after the ipsec load is complete 27642 * conn_restart_ipsec_waiter drops the conn ref. 27643 */ 27644 CONN_INC_REF(connp); 27645 if (ip_check_for_ipsec_opt(q, mp)) 27646 return; 27647 err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); 27648 if (err != EINPROGRESS) { 27649 /* Operation is done */ 27650 CONN_OPER_PENDING_DONE(connp); 27651 } 27652 return; 27653 case T_UNBIND_REQ: 27654 if (connp == NULL) { 27655 proto_str = "T_UNBIND_REQ"; 27656 goto protonak; 27657 } 27658 mp = ip_unbind(q, mp); 27659 qreply(q, mp); 27660 return; 27661 default: 27662 /* 27663 * Have to drop any DLPI messages coming down from 27664 * arp (such as an info_req which would cause ip 27665 * to receive an extra info_ack if it was passed 27666 * through. 27667 */ 27668 ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", 27669 (int)*(uint_t *)mp->b_rptr)); 27670 freemsg(mp); 27671 return; 27672 } 27673 /* NOTREACHED */ 27674 case IRE_DB_TYPE: { 27675 nce_t *nce; 27676 ill_t *ill; 27677 in6_addr_t gw_addr_v6; 27678 27679 27680 /* 27681 * This is a response back from a resolver. 
It 27682 * consists of a message chain containing: 27683 * IRE_MBLK-->LL_HDR_MBLK->pkt 27684 * The IRE_MBLK is the one we allocated in ip_newroute. 27685 * The LL_HDR_MBLK is the DLPI header to use to get 27686 * the attached packet, and subsequent ones for the 27687 * same destination, transmitted. 27688 */ 27689 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ 27690 break; 27691 /* 27692 * First, check to make sure the resolution succeeded. 27693 * If it failed, the second mblk will be empty. 27694 * If it is, free the chain, dropping the packet. 27695 * (We must ire_delete the ire; that frees the ire mblk) 27696 * We're doing this now to support PVCs for ATM; it's 27697 * a partial xresolv implementation. When we fully implement 27698 * xresolv interfaces, instead of freeing everything here 27699 * we'll initiate neighbor discovery. 27700 * 27701 * For v4 (ARP and other external resolvers) the resolver 27702 * frees the message, so no check is needed. This check 27703 * is required, though, for a full xresolve implementation. 27704 * Including this code here now both shows how external 27705 * resolvers can NACK a resolution request using an 27706 * existing design that has no specific provisions for NACKs, 27707 * and also takes into account that the current non-ARP 27708 * external resolver has been coded to use this method of 27709 * NACKing for all IPv6 (xresolv) cases, 27710 * whether our xresolv implementation is complete or not. 27711 * 27712 */ 27713 ire = (ire_t *)mp->b_rptr; 27714 ill = ire_to_ill(ire); 27715 mp1 = mp->b_cont; /* dl_unitdata_req */ 27716 if (mp1->b_rptr == mp1->b_wptr) { 27717 if (ire->ire_ipversion == IPV6_VERSION) { 27718 /* 27719 * XRESOLV interface. 
27720 */ 27721 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27722 mutex_enter(&ire->ire_lock); 27723 gw_addr_v6 = ire->ire_gateway_addr_v6; 27724 mutex_exit(&ire->ire_lock); 27725 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27726 nce = ndp_lookup_v6(ill, 27727 &ire->ire_addr_v6, B_FALSE); 27728 } else { 27729 nce = ndp_lookup_v6(ill, &gw_addr_v6, 27730 B_FALSE); 27731 } 27732 if (nce != NULL) { 27733 nce_resolv_failed(nce); 27734 ndp_delete(nce); 27735 NCE_REFRELE(nce); 27736 } 27737 } 27738 mp->b_cont = NULL; 27739 freemsg(mp1); /* frees the pkt as well */ 27740 ASSERT(ire->ire_nce == NULL); 27741 ire_delete((ire_t *)mp->b_rptr); 27742 return; 27743 } 27744 27745 /* 27746 * Split them into IRE_MBLK and pkt and feed it into 27747 * ire_add_then_send. Then in ire_add_then_send 27748 * the IRE will be added, and then the packet will be 27749 * run back through ip_wput. This time it will make 27750 * it to the wire. 27751 */ 27752 mp->b_cont = NULL; 27753 mp = mp1->b_cont; /* now, mp points to pkt */ 27754 mp1->b_cont = NULL; 27755 ip1dbg(("ip_wput_nondata: reply from external resolver \n")); 27756 if (ire->ire_ipversion == IPV6_VERSION) { 27757 /* 27758 * XRESOLV interface. Find the nce and put a copy 27759 * of the dl_unitdata_req in nce_res_mp 27760 */ 27761 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27762 mutex_enter(&ire->ire_lock); 27763 gw_addr_v6 = ire->ire_gateway_addr_v6; 27764 mutex_exit(&ire->ire_lock); 27765 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27766 nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, 27767 B_FALSE); 27768 } else { 27769 nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); 27770 } 27771 if (nce != NULL) { 27772 /* 27773 * We have to protect nce_res_mp here 27774 * from being accessed by other threads 27775 * while we change the mblk pointer. 27776 * Other functions will also lock the nce when 27777 * accessing nce_res_mp. 
27778 * 27779 * The reason we change the mblk pointer 27780 * here rather than copying the resolved address 27781 * into the template is that, unlike with 27782 * ethernet, we have no guarantee that the 27783 * resolved address length will be 27784 * smaller than or equal to the lla length 27785 * with which the template was allocated, 27786 * (for ethernet, they're equal) 27787 * so we have to use the actual resolved 27788 * address mblk - which holds the real 27789 * dl_unitdata_req with the resolved address. 27790 * 27791 * Doing this is the same behavior as was 27792 * previously used in the v4 ARP case. 27793 */ 27794 mutex_enter(&nce->nce_lock); 27795 if (nce->nce_res_mp != NULL) 27796 freemsg(nce->nce_res_mp); 27797 nce->nce_res_mp = mp1; 27798 mutex_exit(&nce->nce_lock); 27799 /* 27800 * We do a fastpath probe here because 27801 * we have resolved the address without 27802 * using Neighbor Discovery. 27803 * In the non-XRESOLV v6 case, the fastpath 27804 * probe is done right after neighbor 27805 * discovery completes. 27806 */ 27807 if (nce->nce_res_mp != NULL) { 27808 int res; 27809 nce_fastpath_list_add(nce); 27810 res = ill_fastpath_probe(ill, 27811 nce->nce_res_mp); 27812 if (res != 0 && res != EAGAIN) 27813 nce_fastpath_list_delete(nce); 27814 } 27815 27816 ire_add_then_send(q, ire, mp); 27817 /* 27818 * Now we have to clean out any packets 27819 * that may have been queued on the nce 27820 * while it was waiting for address resolution 27821 * to complete. 
27822 */ 27823 mutex_enter(&nce->nce_lock); 27824 mp1 = nce->nce_qd_mp; 27825 nce->nce_qd_mp = NULL; 27826 mutex_exit(&nce->nce_lock); 27827 while (mp1 != NULL) { 27828 mblk_t *nxt_mp; 27829 queue_t *fwdq = NULL; 27830 ill_t *inbound_ill; 27831 uint_t ifindex; 27832 27833 nxt_mp = mp1->b_next; 27834 mp1->b_next = NULL; 27835 /* 27836 * Retrieve ifindex stored in 27837 * ip_rput_data_v6() 27838 */ 27839 ifindex = 27840 (uint_t)(uintptr_t)mp1->b_prev; 27841 inbound_ill = 27842 ill_lookup_on_ifindex(ifindex, 27843 B_TRUE, NULL, NULL, NULL, 27844 NULL, ipst); 27845 mp1->b_prev = NULL; 27846 if (inbound_ill != NULL) 27847 fwdq = inbound_ill->ill_rq; 27848 27849 if (fwdq != NULL) { 27850 put(fwdq, mp1); 27851 ill_refrele(inbound_ill); 27852 } else 27853 put(WR(ill->ill_rq), mp1); 27854 mp1 = nxt_mp; 27855 } 27856 NCE_REFRELE(nce); 27857 } else { /* nce is NULL; clean up */ 27858 ire_delete(ire); 27859 freemsg(mp); 27860 freemsg(mp1); 27861 return; 27862 } 27863 } else { 27864 nce_t *arpce; 27865 /* 27866 * Link layer resolution succeeded. Recompute the 27867 * ire_nce. 27868 */ 27869 ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); 27870 if ((arpce = ndp_lookup_v4(ill, 27871 (ire->ire_gateway_addr != INADDR_ANY ? 27872 &ire->ire_gateway_addr : &ire->ire_addr), 27873 B_FALSE)) == NULL) { 27874 freeb(ire->ire_mp); 27875 freeb(mp1); 27876 freemsg(mp); 27877 return; 27878 } 27879 mutex_enter(&arpce->nce_lock); 27880 arpce->nce_last = TICK_TO_MSEC(lbolt64); 27881 if (arpce->nce_state == ND_REACHABLE) { 27882 /* 27883 * Someone resolved this before us; 27884 * cleanup the res_mp. Since ire has 27885 * not been added yet, the call to ire_add_v4 27886 * from ire_add_then_send (when a dup is 27887 * detected) will clean up the ire. 
27888 */ 27889 freeb(mp1); 27890 } else { 27891 ASSERT(arpce->nce_res_mp == NULL); 27892 arpce->nce_res_mp = mp1; 27893 arpce->nce_state = ND_REACHABLE; 27894 } 27895 mutex_exit(&arpce->nce_lock); 27896 if (ire->ire_marks & IRE_MARK_NOADD) { 27897 /* 27898 * this ire will not be added to the ire 27899 * cache table, so we can set the ire_nce 27900 * here, as there are no atomicity constraints. 27901 */ 27902 ire->ire_nce = arpce; 27903 /* 27904 * We are associating this nce with the ire 27905 * so change the nce ref taken in 27906 * ndp_lookup_v4() from 27907 * NCE_REFHOLD to NCE_REFHOLD_NOTR 27908 */ 27909 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 27910 } else { 27911 NCE_REFRELE(arpce); 27912 } 27913 ire_add_then_send(q, ire, mp); 27914 } 27915 return; /* All is well, the packet has been sent. */ 27916 } 27917 case IRE_ARPRESOLVE_TYPE: { 27918 27919 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ 27920 break; 27921 mp1 = mp->b_cont; /* dl_unitdata_req */ 27922 mp->b_cont = NULL; 27923 /* 27924 * First, check to make sure the resolution succeeded. 27925 * If it failed, the second mblk will be empty. 27926 */ 27927 if (mp1->b_rptr == mp1->b_wptr) { 27928 /* cleanup the incomplete ire, free queued packets */ 27929 freemsg(mp); /* fake ire */ 27930 freeb(mp1); /* dl_unitdata response */ 27931 return; 27932 } 27933 27934 /* 27935 * update any incomplete nce_t found. we lookup the ctable 27936 * and find the nce from the ire->ire_nce because we need 27937 * to pass the ire to ip_xmit_v4 later, and can find both 27938 * ire and nce in one lookup from the ctable. 27939 */ 27940 fake_ire = (ire_t *)mp->b_rptr; 27941 /* 27942 * By the time we come back here from ARP 27943 * the logical outgoing interface of the incomplete ire 27944 * we added in ire_forward could have disappeared, 27945 * causing the incomplete ire to also have 27946 * dissapeared. 
So we need to retreive the 27947 * proper ipif for the ire before looking 27948 * in ctable; do the ctablelookup based on ire_ipif_seqid 27949 */ 27950 ill = q->q_ptr; 27951 27952 /* Get the outgoing ipif */ 27953 mutex_enter(&ill->ill_lock); 27954 if (ill->ill_state_flags & ILL_CONDEMNED) { 27955 mutex_exit(&ill->ill_lock); 27956 freemsg(mp); /* fake ire */ 27957 freeb(mp1); /* dl_unitdata response */ 27958 return; 27959 } 27960 ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); 27961 27962 if (ipif == NULL) { 27963 mutex_exit(&ill->ill_lock); 27964 ip1dbg(("logical intrf to incomplete ire vanished\n")); 27965 freemsg(mp); 27966 freeb(mp1); 27967 return; 27968 } 27969 ipif_refhold_locked(ipif); 27970 mutex_exit(&ill->ill_lock); 27971 ire = ire_ctable_lookup(fake_ire->ire_addr, 27972 fake_ire->ire_gateway_addr, IRE_CACHE, 27973 ipif, fake_ire->ire_zoneid, NULL, 27974 (MATCH_IRE_GW|MATCH_IRE_IPIF|MATCH_IRE_ZONEONLY| 27975 MATCH_IRE_TYPE), ipst); 27976 ipif_refrele(ipif); 27977 if (ire == NULL) { 27978 /* 27979 * no ire was found; check if there is an nce 27980 * for this lookup; if it has no ire's pointing at it 27981 * cleanup. 27982 */ 27983 if ((nce = ndp_lookup_v4(ill, 27984 (fake_ire->ire_gateway_addr != INADDR_ANY ? 27985 &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), 27986 B_FALSE)) != NULL) { 27987 /* 27988 * cleanup: 27989 * We check for refcnt 2 (one for the nce 27990 * hash list + 1 for the ref taken by 27991 * ndp_lookup_v4) to check that there are 27992 * no ire's pointing at the nce. 
27993 */ 27994 if (nce->nce_refcnt == 2) 27995 ndp_delete(nce); 27996 NCE_REFRELE(nce); 27997 } 27998 freeb(mp1); /* dl_unitdata response */ 27999 freemsg(mp); /* fake ire */ 28000 return; 28001 } 28002 nce = ire->ire_nce; 28003 DTRACE_PROBE2(ire__arpresolve__type, 28004 ire_t *, ire, nce_t *, nce); 28005 ASSERT(nce->nce_state != ND_INITIAL); 28006 mutex_enter(&nce->nce_lock); 28007 nce->nce_last = TICK_TO_MSEC(lbolt64); 28008 if (nce->nce_state == ND_REACHABLE) { 28009 /* 28010 * Someone resolved this before us; 28011 * our response is not needed any more. 28012 */ 28013 mutex_exit(&nce->nce_lock); 28014 freeb(mp1); /* dl_unitdata response */ 28015 } else { 28016 ASSERT(nce->nce_res_mp == NULL); 28017 nce->nce_res_mp = mp1; 28018 nce->nce_state = ND_REACHABLE; 28019 mutex_exit(&nce->nce_lock); 28020 nce_fastpath(nce); 28021 } 28022 /* 28023 * The cached nce_t has been updated to be reachable; 28024 * Clear the IRE_MARK_UNCACHED flag and free the fake_ire. 28025 */ 28026 fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; 28027 freemsg(mp); 28028 /* 28029 * send out queued packets. 28030 */ 28031 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 28032 28033 IRE_REFRELE(ire); 28034 return; 28035 } 28036 default: 28037 break; 28038 } 28039 if (q->q_next) { 28040 putnext(q, mp); 28041 } else 28042 freemsg(mp); 28043 return; 28044 28045 protonak: 28046 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); 28047 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) 28048 qreply(q, mp); 28049 } 28050 28051 /* 28052 * Process IP options in an outbound packet. Modify the destination if there 28053 * is a source route option. 28054 * Returns non-zero if something fails in which case an ICMP error has been 28055 * sent and mp freed. 
28056 */ 28057 static int 28058 ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, 28059 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 28060 { 28061 ipoptp_t opts; 28062 uchar_t *opt; 28063 uint8_t optval; 28064 uint8_t optlen; 28065 ipaddr_t dst; 28066 intptr_t code = 0; 28067 mblk_t *mp; 28068 ire_t *ire = NULL; 28069 28070 ip2dbg(("ip_wput_options\n")); 28071 mp = ipsec_mp; 28072 if (mctl_present) { 28073 mp = ipsec_mp->b_cont; 28074 } 28075 28076 dst = ipha->ipha_dst; 28077 for (optval = ipoptp_first(&opts, ipha); 28078 optval != IPOPT_EOL; 28079 optval = ipoptp_next(&opts)) { 28080 opt = opts.ipoptp_cur; 28081 optlen = opts.ipoptp_len; 28082 ip2dbg(("ip_wput_options: opt %d, len %d\n", 28083 optval, optlen)); 28084 switch (optval) { 28085 uint32_t off; 28086 case IPOPT_SSRR: 28087 case IPOPT_LSRR: 28088 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28089 ip1dbg(( 28090 "ip_wput_options: bad option offset\n")); 28091 code = (char *)&opt[IPOPT_OLEN] - 28092 (char *)ipha; 28093 goto param_prob; 28094 } 28095 off = opt[IPOPT_OFFSET]; 28096 ip1dbg(("ip_wput_options: next hop 0x%x\n", 28097 ntohl(dst))); 28098 /* 28099 * For strict: verify that dst is directly 28100 * reachable. 
28101 */ 28102 if (optval == IPOPT_SSRR) { 28103 ire = ire_ftable_lookup(dst, 0, 0, 28104 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 28105 MBLK_GETLABEL(mp), 28106 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 28107 if (ire == NULL) { 28108 ip1dbg(("ip_wput_options: SSRR not" 28109 " directly reachable: 0x%x\n", 28110 ntohl(dst))); 28111 goto bad_src_route; 28112 } 28113 ire_refrele(ire); 28114 } 28115 break; 28116 case IPOPT_RR: 28117 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28118 ip1dbg(( 28119 "ip_wput_options: bad option offset\n")); 28120 code = (char *)&opt[IPOPT_OLEN] - 28121 (char *)ipha; 28122 goto param_prob; 28123 } 28124 break; 28125 case IPOPT_TS: 28126 /* 28127 * Verify that length >=5 and that there is either 28128 * room for another timestamp or that the overflow 28129 * counter is not maxed out. 28130 */ 28131 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 28132 if (optlen < IPOPT_MINLEN_IT) { 28133 goto param_prob; 28134 } 28135 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28136 ip1dbg(( 28137 "ip_wput_options: bad option offset\n")); 28138 code = (char *)&opt[IPOPT_OFFSET] - 28139 (char *)ipha; 28140 goto param_prob; 28141 } 28142 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 28143 case IPOPT_TS_TSONLY: 28144 off = IPOPT_TS_TIMELEN; 28145 break; 28146 case IPOPT_TS_TSANDADDR: 28147 case IPOPT_TS_PRESPEC: 28148 case IPOPT_TS_PRESPEC_RFC791: 28149 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 28150 break; 28151 default: 28152 code = (char *)&opt[IPOPT_POS_OV_FLG] - 28153 (char *)ipha; 28154 goto param_prob; 28155 } 28156 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 28157 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 28158 /* 28159 * No room and the overflow counter is 15 28160 * already. 
28161 */ 28162 goto param_prob; 28163 } 28164 break; 28165 } 28166 } 28167 28168 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) 28169 return (0); 28170 28171 ip1dbg(("ip_wput_options: error processing IP options.")); 28172 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 28173 28174 param_prob: 28175 /* 28176 * Since ip_wput() isn't close to finished, we fill 28177 * in enough of the header for credible error reporting. 28178 */ 28179 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28180 /* Failed */ 28181 freemsg(ipsec_mp); 28182 return (-1); 28183 } 28184 icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); 28185 return (-1); 28186 28187 bad_src_route: 28188 /* 28189 * Since ip_wput() isn't close to finished, we fill 28190 * in enough of the header for credible error reporting. 28191 */ 28192 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28193 /* Failed */ 28194 freemsg(ipsec_mp); 28195 return (-1); 28196 } 28197 icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 28198 return (-1); 28199 } 28200 28201 /* 28202 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT. 28203 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads 28204 * thru /etc/system. 28205 */ 28206 #define CONN_MAXDRAINCNT 64 28207 28208 static void 28209 conn_drain_init(ip_stack_t *ipst) 28210 { 28211 int i; 28212 28213 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; 28214 28215 if ((ipst->ips_conn_drain_list_cnt == 0) || 28216 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) { 28217 /* 28218 * Default value of the number of drainers is the 28219 * number of cpus, subject to maximum of 8 drainers. 
28220 */ 28221 if (boot_max_ncpus != -1) 28222 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8); 28223 else 28224 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); 28225 } 28226 28227 ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt * 28228 sizeof (idl_t), KM_SLEEP); 28229 28230 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28231 mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL, 28232 MUTEX_DEFAULT, NULL); 28233 } 28234 } 28235 28236 static void 28237 conn_drain_fini(ip_stack_t *ipst) 28238 { 28239 int i; 28240 28241 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) 28242 mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock); 28243 kmem_free(ipst->ips_conn_drain_list, 28244 ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); 28245 ipst->ips_conn_drain_list = NULL; 28246 } 28247 28248 /* 28249 * Note: For an overview of how flowcontrol is handled in IP please see the 28250 * IP Flowcontrol notes at the top of this file. 28251 * 28252 * Flow control has blocked us from proceeding. Insert the given conn in one 28253 * of the conn drain lists. These conn wq's will be qenabled later on when 28254 * STREAMS flow control does a backenable. conn_walk_drain will enable 28255 * the first conn in each of these drain lists. Each of these qenabled conns 28256 * in turn enables the next in the list, after it runs, or when it closes, 28257 * thus sustaining the drain process. 28258 * 28259 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> 28260 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert 28261 * running at any time, on a given conn, since there can be only 1 service proc 28262 * running on a queue at any time. 
28263 */ 28264 void 28265 conn_drain_insert(conn_t *connp) 28266 { 28267 idl_t *idl; 28268 uint_t index; 28269 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28270 28271 mutex_enter(&connp->conn_lock); 28272 if (connp->conn_state_flags & CONN_CLOSING) { 28273 /* 28274 * The conn is closing as a result of which CONN_CLOSING 28275 * is set. Return. 28276 */ 28277 mutex_exit(&connp->conn_lock); 28278 return; 28279 } else if (connp->conn_idl == NULL) { 28280 /* 28281 * Assign the next drain list round robin. We dont' use 28282 * a lock, and thus it may not be strictly round robin. 28283 * Atomicity of load/stores is enough to make sure that 28284 * conn_drain_list_index is always within bounds. 28285 */ 28286 index = ipst->ips_conn_drain_list_index; 28287 ASSERT(index < ipst->ips_conn_drain_list_cnt); 28288 connp->conn_idl = &ipst->ips_conn_drain_list[index]; 28289 index++; 28290 if (index == ipst->ips_conn_drain_list_cnt) 28291 index = 0; 28292 ipst->ips_conn_drain_list_index = index; 28293 } 28294 mutex_exit(&connp->conn_lock); 28295 28296 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28297 if ((connp->conn_drain_prev != NULL) || 28298 (connp->conn_state_flags & CONN_CLOSING)) { 28299 /* 28300 * The conn is already in the drain list, OR 28301 * the conn is closing. We need to check again for 28302 * the closing case again since close can happen 28303 * after we drop the conn_lock, and before we 28304 * acquire the CONN_DRAIN_LIST_LOCK. 28305 */ 28306 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28307 return; 28308 } else { 28309 idl = connp->conn_idl; 28310 } 28311 28312 /* 28313 * The conn is not in the drain list. Insert it at the 28314 * tail of the drain list. The drain list is circular 28315 * and doubly linked. idl_conn points to the 1st element 28316 * in the list. 
28317 */ 28318 if (idl->idl_conn == NULL) { 28319 idl->idl_conn = connp; 28320 connp->conn_drain_next = connp; 28321 connp->conn_drain_prev = connp; 28322 } else { 28323 conn_t *head = idl->idl_conn; 28324 28325 connp->conn_drain_next = head; 28326 connp->conn_drain_prev = head->conn_drain_prev; 28327 head->conn_drain_prev->conn_drain_next = connp; 28328 head->conn_drain_prev = connp; 28329 } 28330 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28331 } 28332 28333 /* 28334 * This conn is closing, and we are called from ip_close. OR 28335 * This conn has been serviced by ip_wsrv, and we need to do the tail 28336 * processing. 28337 * If this conn is part of the drain list, we may need to sustain the drain 28338 * process by qenabling the next conn in the drain list. We may also need to 28339 * remove this conn from the list, if it is done. 28340 */ 28341 static void 28342 conn_drain_tail(conn_t *connp, boolean_t closing) 28343 { 28344 idl_t *idl; 28345 28346 /* 28347 * connp->conn_idl is stable at this point, and no lock is needed 28348 * to check it. If we are called from ip_close, close has already 28349 * set CONN_CLOSING, thus freezing the value of conn_idl, and 28350 * called us only because conn_idl is non-null. If we are called thru 28351 * service, conn_idl could be null, but it cannot change because 28352 * service is single-threaded per queue, and there cannot be another 28353 * instance of service trying to call conn_drain_insert on this conn 28354 * now. 28355 */ 28356 ASSERT(!closing || (connp->conn_idl != NULL)); 28357 28358 /* 28359 * If connp->conn_idl is null, the conn has not been inserted into any 28360 * drain list even once since creation of the conn. Just return. 28361 */ 28362 if (connp->conn_idl == NULL) 28363 return; 28364 28365 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28366 28367 if (connp->conn_drain_prev == NULL) { 28368 /* This conn is currently not in the drain list. 
*/ 28369 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28370 return; 28371 } 28372 idl = connp->conn_idl; 28373 if (idl->idl_conn_draining == connp) { 28374 /* 28375 * This conn is the current drainer. If this is the last conn 28376 * in the drain list, we need to do more checks, in the 'if' 28377 * below. Otherwwise we need to just qenable the next conn, 28378 * to sustain the draining, and is handled in the 'else' 28379 * below. 28380 */ 28381 if (connp->conn_drain_next == idl->idl_conn) { 28382 /* 28383 * This conn is the last in this list. This round 28384 * of draining is complete. If idl_repeat is set, 28385 * it means another flow enabling has happened from 28386 * the driver/streams and we need to another round 28387 * of draining. 28388 * If there are more than 2 conns in the drain list, 28389 * do a left rotate by 1, so that all conns except the 28390 * conn at the head move towards the head by 1, and the 28391 * the conn at the head goes to the tail. This attempts 28392 * a more even share for all queues that are being 28393 * drained. 28394 */ 28395 if ((connp->conn_drain_next != connp) && 28396 (idl->idl_conn->conn_drain_next != connp)) { 28397 idl->idl_conn = idl->idl_conn->conn_drain_next; 28398 } 28399 if (idl->idl_repeat) { 28400 qenable(idl->idl_conn->conn_wq); 28401 idl->idl_conn_draining = idl->idl_conn; 28402 idl->idl_repeat = 0; 28403 } else { 28404 idl->idl_conn_draining = NULL; 28405 } 28406 } else { 28407 /* 28408 * If the next queue that we are now qenable'ing, 28409 * is closing, it will remove itself from this list 28410 * and qenable the subsequent queue in ip_close(). 28411 * Serialization is acheived thru idl_lock. 28412 */ 28413 qenable(connp->conn_drain_next->conn_wq); 28414 idl->idl_conn_draining = connp->conn_drain_next; 28415 } 28416 } 28417 if (!connp->conn_did_putbq || closing) { 28418 /* 28419 * Remove ourself from the drain list, if we did not do 28420 * a putbq, or if the conn is closing. 
		 * Note: It is possible that q->q_first is non-null. It means
		 * that these messages landed after we did a enableok() in
		 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
		 * service them.
		 */
		if (connp->conn_drain_next == connp) {
			/* Singleton in the list */
			ASSERT(connp->conn_drain_prev == connp);
			idl->idl_conn = NULL;
			idl->idl_conn_draining = NULL;
		} else {
			connp->conn_drain_prev->conn_drain_next =
			    connp->conn_drain_next;
			connp->conn_drain_next->conn_drain_prev =
			    connp->conn_drain_prev;
			if (idl->idl_conn == connp)
				idl->idl_conn = connp->conn_drain_next;
			ASSERT(idl->idl_conn_draining != connp);

		}
		connp->conn_drain_next = NULL;
		connp->conn_drain_prev = NULL;
	}
	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}

/*
 * Write service routine. Shared perimeter entry point.
 * ip_wsrv can be called in any of the following ways.
 * 1. The device queue's message count has fallen below the low water mark
 *    and STREAMS has backenabled the ill_wq. We walk through all the
 *    drain lists and backenable the first conn in each list.
 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the
 *    qenabled non-tcp upper layers. We start dequeuing messages and call
 *    ip_wput for each message.
 *
 * The two cases are told apart by q->q_next: when it is non-NULL, q_ptr
 * is interpreted as an ill_t (case 1); otherwise the queue is mapped to
 * its conn_t with Q_TO_CONN() (case 2).
 */

void
ip_wsrv(queue_t *q)
{
	conn_t	*connp;
	ill_t	*ill;
	mblk_t	*mp;

	if (q->q_next) {
		ill = (ill_t *)q->q_ptr;
		if (ill->ill_state_flags == 0) {
			/*
			 * The device flow control has opened up.
			 * Walk through conn drain lists and qenable the
			 * first conn in each list. This makes sense only
			 * if the stream is fully plumbed and setup.
			 * Hence the if check above.
			 */
			ip1dbg(("ip_wsrv: walking\n"));
			conn_walk_drain(ill->ill_ipst);
		}
		return;
	}

	connp = Q_TO_CONN(q);
	ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp));

	/*
	 * 1. Set conn_draining flag to signal that service is active.
	 *
	 * 2. ip_output determines whether it has been called from service,
	 *    based on the last parameter. If it is IP_WSRV it concludes it
	 *    has been called from service.
	 *
	 * 3. Message ordering is preserved by the following logic.
	 *    i. A directly called ip_output (i.e. not through service) will
	 *    queue the message at the tail, if conn_draining is set (i.e.
	 *    service is running) or if q->q_first is non-null.
	 *
	 *    ii. If ip_output is called from service, and if ip_output cannot
	 *    putnext due to flow control, it does a putbq.
	 *
	 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable
	 *    (causing an infinite loop).
	 */
	ASSERT(!connp->conn_did_putbq);
	while ((q->q_first != NULL) && !connp->conn_did_putbq) {
		connp->conn_draining = 1;
		noenable(q);
		while ((mp = getq(q)) != NULL) {
			ASSERT(CONN_Q(q));

			ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
			if (connp->conn_did_putbq) {
				/* ip_wput did a putbq */
				break;
			}
		}
		/*
		 * At this point, a thread coming down from top, calling
		 * ip_wput, may end up queueing the message. We have not yet
		 * enabled the queue, so ip_wsrv won't be called again.
		 * To avoid this race, check q->q_first again (in the loop).
		 * If the other thread queued the message before we call
		 * enableok(), we will catch it in the q->q_first check.
		 * If the other thread queues the message after we call
		 * enableok(), ip_wsrv will be called again by STREAMS.
		 */
		connp->conn_draining = 0;
		enableok(q);
	}

	/* Enable the next conn for draining */
	conn_drain_tail(connp, B_FALSE);

	/* Clear the putbq marker only after the next conn has been enabled. */
	connp->conn_did_putbq = 0;
}

/*
 * Walk the list of all conn's calling the function provided with the
 * specified argument for each. Note that this only walks conn's that
 * have been bound.
 * Applies to both IPv4 and IPv6.
 *
 * The walk covers, in order, the UDP, connected, bind, IPv4 protocol and
 * IPv6 protocol fanout tables of the given IP stack instance. func, arg
 * and zoneid are handed unchanged to conn_walk_fanout_table() for each.
 */
static void
conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
{
	conn_walk_fanout_table(ipst->ips_ipcl_udp_fanout,
	    ipst->ips_ipcl_udp_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipst->ips_ipcl_conn_fanout,
	    ipst->ips_ipcl_conn_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipst->ips_ipcl_bind_fanout,
	    ipst->ips_ipcl_bind_fanout_size,
	    func, arg, zoneid);
	conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout,
	    IPPROTO_MAX, func, arg, zoneid);
	conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX, func, arg, zoneid);
}

/*
 * Flow control has been relieved, and STREAMS has backenabled us. For each
 * list of conns that need to be drained, check if drain is already in
 * progress. If so set the idl_repeat bit, indicating that the last conn in
 * the list needs to reinitiate the drain once again, for the list. If drain
 * is not in progress for the list, initiate the draining, by qenabling the
 * 1st conn in the list. The drain is self-sustaining, each qenabled conn will
 * in turn qenable the next conn, when it is done/blocked/closing.
28567 */ 28568 static void 28569 conn_walk_drain(ip_stack_t *ipst) 28570 { 28571 int i; 28572 idl_t *idl; 28573 28574 IP_STAT(ipst, ip_conn_walk_drain); 28575 28576 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28577 idl = &ipst->ips_conn_drain_list[i]; 28578 mutex_enter(&idl->idl_lock); 28579 if (idl->idl_conn == NULL) { 28580 mutex_exit(&idl->idl_lock); 28581 continue; 28582 } 28583 /* 28584 * If this list is not being drained currently by 28585 * an ip_wsrv thread, start the process. 28586 */ 28587 if (idl->idl_conn_draining == NULL) { 28588 ASSERT(idl->idl_repeat == 0); 28589 qenable(idl->idl_conn->conn_wq); 28590 idl->idl_conn_draining = idl->idl_conn; 28591 } else { 28592 idl->idl_repeat = 1; 28593 } 28594 mutex_exit(&idl->idl_lock); 28595 } 28596 } 28597 28598 /* 28599 * Walk an conn hash table of `count' buckets, calling func for each entry. 28600 */ 28601 static void 28602 conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg, 28603 zoneid_t zoneid) 28604 { 28605 conn_t *connp; 28606 28607 while (count-- > 0) { 28608 mutex_enter(&connfp->connf_lock); 28609 for (connp = connfp->connf_head; connp != NULL; 28610 connp = connp->conn_next) { 28611 if (zoneid == GLOBAL_ZONEID || 28612 zoneid == connp->conn_zoneid) { 28613 CONN_INC_REF(connp); 28614 mutex_exit(&connfp->connf_lock); 28615 (*func)(connp, arg); 28616 mutex_enter(&connfp->connf_lock); 28617 CONN_DEC_REF(connp); 28618 } 28619 } 28620 mutex_exit(&connfp->connf_lock); 28621 connfp++; 28622 } 28623 } 28624 28625 /* conn_walk_fanout routine invoked for ip_conn_report for each conn. 
*/ 28626 static void 28627 conn_report1(conn_t *connp, void *mp) 28628 { 28629 char buf1[INET6_ADDRSTRLEN]; 28630 char buf2[INET6_ADDRSTRLEN]; 28631 uint_t print_len, buf_len; 28632 28633 ASSERT(connp != NULL); 28634 28635 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 28636 if (buf_len <= 0) 28637 return; 28638 (void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)); 28639 (void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)); 28640 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 28641 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 28642 "%5d %s/%05d %s/%05d\n", 28643 (void *)connp, (void *)CONNP_TO_RQ(connp), 28644 (void *)CONNP_TO_WQ(connp), connp->conn_zoneid, 28645 buf1, connp->conn_lport, 28646 buf2, connp->conn_fport); 28647 if (print_len < buf_len) { 28648 ((mblk_t *)mp)->b_wptr += print_len; 28649 } else { 28650 ((mblk_t *)mp)->b_wptr += buf_len; 28651 } 28652 } 28653 28654 /* 28655 * Named Dispatch routine to produce a formatted report on all conns 28656 * that are listed in one of the fanout tables. 28657 * This report is accessed by using the ndd utility to "get" ND variable 28658 * "ip_conn_status". 28659 */ 28660 /* ARGSUSED */ 28661 static int 28662 ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 28663 { 28664 conn_t *connp = Q_TO_CONN(q); 28665 28666 (void) mi_mpprintf(mp, 28667 "CONN " MI_COL_HDRPAD_STR 28668 "rfq " MI_COL_HDRPAD_STR 28669 "stq " MI_COL_HDRPAD_STR 28670 " zone local remote"); 28671 28672 /* 28673 * Because of the ndd constraint, at most we can have 64K buffer 28674 * to put in all conn info. So to be more efficient, just 28675 * allocate a 64K buffer here, assuming we need that large buffer. 28676 * This should be OK as only privileged processes can do ndd /dev/ip. 28677 */ 28678 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 28679 /* The following may work even if we cannot get a large buf. 
*/ 28680 (void) mi_mpprintf(mp, "<< Out of buffer >>\n"); 28681 return (0); 28682 } 28683 28684 conn_walk_fanout(conn_report1, mp->b_cont, connp->conn_zoneid, 28685 connp->conn_netstack->netstack_ip); 28686 return (0); 28687 } 28688 28689 /* 28690 * Determine if the ill and multicast aspects of that packets 28691 * "matches" the conn. 28692 */ 28693 boolean_t 28694 conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, 28695 zoneid_t zoneid) 28696 { 28697 ill_t *in_ill; 28698 boolean_t found; 28699 ipif_t *ipif; 28700 ire_t *ire; 28701 ipaddr_t dst, src; 28702 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28703 28704 dst = ipha->ipha_dst; 28705 src = ipha->ipha_src; 28706 28707 /* 28708 * conn_incoming_ill is set by IP_BOUND_IF which limits 28709 * unicast, broadcast and multicast reception to 28710 * conn_incoming_ill. conn_wantpacket itself is called 28711 * only for BROADCAST and multicast. 28712 * 28713 * 1) ip_rput supresses duplicate broadcasts if the ill 28714 * is part of a group. Hence, we should be receiving 28715 * just one copy of broadcast for the whole group. 28716 * Thus, if it is part of the group the packet could 28717 * come on any ill of the group and hence we need a 28718 * match on the group. Otherwise, match on ill should 28719 * be sufficient. 28720 * 28721 * 2) ip_rput does not suppress duplicate multicast packets. 28722 * If there are two interfaces in a ill group and we have 28723 * 2 applications (conns) joined a multicast group G on 28724 * both the interfaces, ilm_lookup_ill filter in ip_rput 28725 * will give us two packets because we join G on both the 28726 * interfaces rather than nominating just one interface 28727 * for receiving multicast like broadcast above. So, 28728 * we have to call ilg_lookup_ill to filter out duplicate 28729 * copies, if ill is part of a group. 
28730 */ 28731 in_ill = connp->conn_incoming_ill; 28732 if (in_ill != NULL) { 28733 if (in_ill->ill_group == NULL) { 28734 if (in_ill != ill) 28735 return (B_FALSE); 28736 } else if (in_ill->ill_group != ill->ill_group) { 28737 return (B_FALSE); 28738 } 28739 } 28740 28741 if (!CLASSD(dst)) { 28742 if (IPCL_ZONE_MATCH(connp, zoneid)) 28743 return (B_TRUE); 28744 /* 28745 * The conn is in a different zone; we need to check that this 28746 * broadcast address is configured in the application's zone and 28747 * on one ill in the group. 28748 */ 28749 ipif = ipif_get_next_ipif(NULL, ill); 28750 if (ipif == NULL) 28751 return (B_FALSE); 28752 ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, 28753 connp->conn_zoneid, NULL, 28754 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 28755 ipif_refrele(ipif); 28756 if (ire != NULL) { 28757 ire_refrele(ire); 28758 return (B_TRUE); 28759 } else { 28760 return (B_FALSE); 28761 } 28762 } 28763 28764 if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && 28765 connp->conn_zoneid == zoneid) { 28766 /* 28767 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP 28768 * disabled, therefore we don't dispatch the multicast packet to 28769 * the sending zone. 28770 */ 28771 return (B_FALSE); 28772 } 28773 28774 if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { 28775 /* 28776 * Multicast packet on the loopback interface: we only match 28777 * conns who joined the group in the specified zone. 28778 */ 28779 return (B_FALSE); 28780 } 28781 28782 if (connp->conn_multi_router) { 28783 /* multicast packet and multicast router socket: send up */ 28784 return (B_TRUE); 28785 } 28786 28787 mutex_enter(&connp->conn_lock); 28788 found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); 28789 mutex_exit(&connp->conn_lock); 28790 return (found); 28791 } 28792 28793 /* 28794 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 
28795 */ 28796 /* ARGSUSED */ 28797 static void 28798 ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) 28799 { 28800 ill_t *ill = (ill_t *)q->q_ptr; 28801 mblk_t *mp1, *mp2; 28802 ipif_t *ipif; 28803 int err = 0; 28804 conn_t *connp = NULL; 28805 ipsq_t *ipsq; 28806 arc_t *arc; 28807 28808 ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); 28809 28810 ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); 28811 ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); 28812 28813 ASSERT(IAM_WRITER_ILL(ill)); 28814 mp2 = mp->b_cont; 28815 mp->b_cont = NULL; 28816 28817 /* 28818 * We have now received the arp bringup completion message 28819 * from ARP. Mark the arp bringup as done. Also if the arp 28820 * stream has already started closing, send up the AR_ARP_CLOSING 28821 * ack now since ARP is waiting in close for this ack. 28822 */ 28823 mutex_enter(&ill->ill_lock); 28824 ill->ill_arp_bringup_pending = 0; 28825 if (ill->ill_arp_closing) { 28826 mutex_exit(&ill->ill_lock); 28827 /* Let's reuse the mp for sending the ack */ 28828 arc = (arc_t *)mp->b_rptr; 28829 mp->b_wptr = mp->b_rptr + sizeof (arc_t); 28830 arc->arc_cmd = AR_ARP_CLOSING; 28831 qreply(q, mp); 28832 } else { 28833 mutex_exit(&ill->ill_lock); 28834 freeb(mp); 28835 } 28836 28837 ipsq = ill->ill_phyint->phyint_ipsq; 28838 ipif = ipsq->ipsq_pending_ipif; 28839 mp1 = ipsq_pending_mp_get(ipsq, &connp); 28840 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 28841 if (mp1 == NULL) { 28842 /* bringup was aborted by the user */ 28843 freemsg(mp2); 28844 return; 28845 } 28846 28847 /* 28848 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 28849 * must have an associated conn_t. Otherwise, we're bringing this 28850 * interface back up as part of handling an asynchronous event (e.g., 28851 * physical address change). 
28852 */ 28853 if (ipsq->ipsq_current_ioctl != 0) { 28854 ASSERT(connp != NULL); 28855 q = CONNP_TO_WQ(connp); 28856 } else { 28857 ASSERT(connp == NULL); 28858 q = ill->ill_rq; 28859 } 28860 28861 /* 28862 * If the DL_BIND_REQ fails, it is noted 28863 * in arc_name_offset. 28864 */ 28865 err = *((int *)mp2->b_rptr); 28866 if (err == 0) { 28867 if (ipif->ipif_isv6) { 28868 if ((err = ipif_up_done_v6(ipif)) != 0) 28869 ip0dbg(("ip_arp_done: init failed\n")); 28870 } else { 28871 if ((err = ipif_up_done(ipif)) != 0) 28872 ip0dbg(("ip_arp_done: init failed\n")); 28873 } 28874 } else { 28875 ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); 28876 } 28877 28878 freemsg(mp2); 28879 28880 if ((err == 0) && (ill->ill_up_ipifs)) { 28881 err = ill_up_ipifs(ill, q, mp1); 28882 if (err == EINPROGRESS) 28883 return; 28884 } 28885 28886 if (ill->ill_up_ipifs) 28887 ill_group_cleanup(ill); 28888 28889 /* 28890 * The operation must complete without EINPROGRESS since 28891 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 28892 * Otherwise, the operation will be stuck forever in the ipsq. 28893 */ 28894 ASSERT(err != EINPROGRESS); 28895 if (ipsq->ipsq_current_ioctl != 0) 28896 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 28897 else 28898 ipsq_current_finish(ipsq); 28899 } 28900 28901 /* Allocate the private structure */ 28902 static int 28903 ip_priv_alloc(void **bufp) 28904 { 28905 void *buf; 28906 28907 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL) 28908 return (ENOMEM); 28909 28910 *bufp = buf; 28911 return (0); 28912 } 28913 28914 /* Function to delete the private structure */ 28915 void 28916 ip_priv_free(void *buf) 28917 { 28918 ASSERT(buf != NULL); 28919 kmem_free(buf, sizeof (ip_priv_t)); 28920 } 28921 28922 /* 28923 * The entry point for IPPF processing. 28924 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the 28925 * routine just returns. 
28926 * 28927 * When called, ip_process generates an ipp_packet_t structure 28928 * which holds the state information for this packet and invokes the 28929 * the classifier (via ipp_packet_process). The classification, depending on 28930 * configured filters, results in a list of actions for this packet. Invoking 28931 * an action may cause the packet to be dropped, in which case the resulting 28932 * mblk (*mpp) is NULL. proc indicates the callout position for 28933 * this packet and ill_index is the interface this packet on or will leave 28934 * on (inbound and outbound resp.). 28935 */ 28936 void 28937 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 28938 { 28939 mblk_t *mp; 28940 ip_priv_t *priv; 28941 ipp_action_id_t aid; 28942 int rc = 0; 28943 ipp_packet_t *pp; 28944 #define IP_CLASS "ip" 28945 28946 /* If the classifier is not loaded, return */ 28947 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 28948 return; 28949 } 28950 28951 mp = *mpp; 28952 ASSERT(mp != NULL); 28953 28954 /* Allocate the packet structure */ 28955 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 28956 if (rc != 0) { 28957 *mpp = NULL; 28958 freemsg(mp); 28959 return; 28960 } 28961 28962 /* Allocate the private structure */ 28963 rc = ip_priv_alloc((void **)&priv); 28964 if (rc != 0) { 28965 *mpp = NULL; 28966 freemsg(mp); 28967 ipp_packet_free(pp); 28968 return; 28969 } 28970 priv->proc = proc; 28971 priv->ill_index = ill_index; 28972 ipp_packet_set_private(pp, priv, ip_priv_free); 28973 ipp_packet_set_data(pp, mp); 28974 28975 /* Invoke the classifier */ 28976 rc = ipp_packet_process(&pp); 28977 if (pp != NULL) { 28978 mp = ipp_packet_get_data(pp); 28979 ipp_packet_free(pp); 28980 if (rc != 0) { 28981 freemsg(mp); 28982 *mpp = NULL; 28983 } 28984 } else { 28985 *mpp = NULL; 28986 } 28987 #undef IP_CLASS 28988 } 28989 28990 /* 28991 * Propagate a multicast group membership operation (add/drop) on 28992 * all the interfaces crossed by the related multirt 
routes. 28993 * The call is considered successful if the operation succeeds 28994 * on at least one interface. 28995 */ 28996 static int 28997 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 28998 uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, 28999 boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, 29000 mblk_t *first_mp) 29001 { 29002 ire_t *ire_gw; 29003 irb_t *irb; 29004 int error = 0; 29005 opt_restart_t *or; 29006 ip_stack_t *ipst = ire->ire_ipst; 29007 29008 irb = ire->ire_bucket; 29009 ASSERT(irb != NULL); 29010 29011 ASSERT(DB_TYPE(first_mp) == M_CTL); 29012 29013 or = (opt_restart_t *)first_mp->b_rptr; 29014 IRB_REFHOLD(irb); 29015 for (; ire != NULL; ire = ire->ire_next) { 29016 if ((ire->ire_flags & RTF_MULTIRT) == 0) 29017 continue; 29018 if (ire->ire_addr != group) 29019 continue; 29020 29021 ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, 29022 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, 29023 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); 29024 /* No resolver exists for the gateway; skip this ire. */ 29025 if (ire_gw == NULL) 29026 continue; 29027 29028 /* 29029 * This function can return EINPROGRESS. If so the operation 29030 * will be restarted from ip_restart_optmgmt which will 29031 * call ip_opt_set and option processing will restart for 29032 * this option. So we may end up calling 'fn' more than once. 29033 * This requires that 'fn' is idempotent except for the 29034 * return value. The operation is considered a success if 29035 * it succeeds at least once on any one interface. 
29036 */ 29037 error = fn(connp, checkonly, group, ire_gw->ire_src_addr, 29038 NULL, fmode, src, first_mp); 29039 if (error == 0) 29040 or->or_private = CGTP_MCAST_SUCCESS; 29041 29042 if (ip_debug > 0) { 29043 ulong_t off; 29044 char *ksym; 29045 ksym = kobj_getsymname((uintptr_t)fn, &off); 29046 ip2dbg(("ip_multirt_apply_membership: " 29047 "called %s, multirt group 0x%08x via itf 0x%08x, " 29048 "error %d [success %u]\n", 29049 ksym ? ksym : "?", 29050 ntohl(group), ntohl(ire_gw->ire_src_addr), 29051 error, or->or_private)); 29052 } 29053 29054 ire_refrele(ire_gw); 29055 if (error == EINPROGRESS) { 29056 IRB_REFRELE(irb); 29057 return (error); 29058 } 29059 } 29060 IRB_REFRELE(irb); 29061 /* 29062 * Consider the call as successful if we succeeded on at least 29063 * one interface. Otherwise, return the last encountered error. 29064 */ 29065 return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); 29066 } 29067 29068 29069 /* 29070 * Issue a warning regarding a route crossing an interface with an 29071 * incorrect MTU. Only one message every 'ip_multirt_log_interval' 29072 * amount of time is logged. 29073 */ 29074 static void 29075 ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) 29076 { 29077 hrtime_t current = gethrtime(); 29078 char buf[INET_ADDRSTRLEN]; 29079 ip_stack_t *ipst = ire->ire_ipst; 29080 29081 /* Convert interval in ms to hrtime in ns */ 29082 if (ipst->ips_multirt_bad_mtu_last_time + 29083 ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= 29084 current) { 29085 cmn_err(CE_WARN, "ip: ignoring multiroute " 29086 "to %s, incorrect MTU %u (expected %u)\n", 29087 ip_dot_addr(ire->ire_addr, buf), 29088 ire->ire_max_frag, max_frag); 29089 29090 ipst->ips_multirt_bad_mtu_last_time = current; 29091 } 29092 } 29093 29094 29095 /* 29096 * Get the CGTP (multirouting) filtering status. 29097 * If 0, the CGTP hooks are transparent. 
29098 */ 29099 /* ARGSUSED */ 29100 static int 29101 ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 29102 { 29103 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29104 29105 (void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value); 29106 return (0); 29107 } 29108 29109 29110 /* 29111 * Set the CGTP (multirouting) filtering status. 29112 * If the status is changed from active to transparent 29113 * or from transparent to active, forward the new status 29114 * to the filtering module (if loaded). 29115 */ 29116 /* ARGSUSED */ 29117 static int 29118 ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 29119 cred_t *ioc_cr) 29120 { 29121 long new_value; 29122 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29123 ip_stack_t *ipst = CONNQ_TO_IPST(q); 29124 29125 if (secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 29126 return (EPERM); 29127 29128 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 29129 new_value < 0 || new_value > 1) { 29130 return (EINVAL); 29131 } 29132 29133 if ((!*ip_cgtp_filter_value) && new_value) { 29134 cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s", 29135 ipst->ips_ip_cgtp_filter_ops == NULL ? 29136 " (module not loaded)" : ""); 29137 } 29138 if (*ip_cgtp_filter_value && (!new_value)) { 29139 cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s", 29140 ipst->ips_ip_cgtp_filter_ops == NULL ? 29141 " (module not loaded)" : ""); 29142 } 29143 29144 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29145 int res; 29146 netstackid_t stackid; 29147 29148 stackid = ipst->ips_netstack->netstack_stackid; 29149 res = ipst->ips_ip_cgtp_filter_ops->cfo_change_state(stackid, 29150 new_value); 29151 if (res) 29152 return (res); 29153 } 29154 29155 *ip_cgtp_filter_value = (boolean_t)new_value; 29156 29157 return (0); 29158 } 29159 29160 29161 /* 29162 * Return the expected CGTP hooks version number. 
29163 */ 29164 int 29165 ip_cgtp_filter_supported(void) 29166 { 29167 return (ip_cgtp_filter_rev); 29168 } 29169 29170 29171 /* 29172 * CGTP hooks can be registered by invoking this function. 29173 * Checks that the version number matches. 29174 */ 29175 int 29176 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) 29177 { 29178 netstack_t *ns; 29179 ip_stack_t *ipst; 29180 29181 if (ops->cfo_filter_rev != CGTP_FILTER_REV) 29182 return (ENOTSUP); 29183 29184 ns = netstack_find_by_stackid(stackid); 29185 if (ns == NULL) 29186 return (EINVAL); 29187 ipst = ns->netstack_ip; 29188 ASSERT(ipst != NULL); 29189 29190 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29191 netstack_rele(ns); 29192 return (EALREADY); 29193 } 29194 29195 ipst->ips_ip_cgtp_filter_ops = ops; 29196 netstack_rele(ns); 29197 return (0); 29198 } 29199 29200 /* 29201 * CGTP hooks can be unregistered by invoking this function. 29202 * Returns ENXIO if there was no registration. 29203 * Returns EBUSY if the ndd variable has not been turned off. 29204 */ 29205 int 29206 ip_cgtp_filter_unregister(netstackid_t stackid) 29207 { 29208 netstack_t *ns; 29209 ip_stack_t *ipst; 29210 29211 ns = netstack_find_by_stackid(stackid); 29212 if (ns == NULL) 29213 return (EINVAL); 29214 ipst = ns->netstack_ip; 29215 ASSERT(ipst != NULL); 29216 29217 if (ipst->ips_ip_cgtp_filter) { 29218 netstack_rele(ns); 29219 return (EBUSY); 29220 } 29221 29222 if (ipst->ips_ip_cgtp_filter_ops == NULL) { 29223 netstack_rele(ns); 29224 return (ENXIO); 29225 } 29226 ipst->ips_ip_cgtp_filter_ops = NULL; 29227 netstack_rele(ns); 29228 return (0); 29229 } 29230 29231 /* 29232 * Check whether there is a CGTP filter registration. 29233 * Returns non-zero if there is a registration, otherwise returns zero. 29234 * Note: returns zero if bad stackid. 
29235 */ 29236 int 29237 ip_cgtp_filter_is_registered(netstackid_t stackid) 29238 { 29239 netstack_t *ns; 29240 ip_stack_t *ipst; 29241 int ret; 29242 29243 ns = netstack_find_by_stackid(stackid); 29244 if (ns == NULL) 29245 return (0); 29246 ipst = ns->netstack_ip; 29247 ASSERT(ipst != NULL); 29248 29249 if (ipst->ips_ip_cgtp_filter_ops != NULL) 29250 ret = 1; 29251 else 29252 ret = 0; 29253 29254 netstack_rele(ns); 29255 return (ret); 29256 } 29257 29258 static squeue_func_t 29259 ip_squeue_switch(int val) 29260 { 29261 squeue_func_t rval = squeue_fill; 29262 29263 switch (val) { 29264 case IP_SQUEUE_ENTER_NODRAIN: 29265 rval = squeue_enter_nodrain; 29266 break; 29267 case IP_SQUEUE_ENTER: 29268 rval = squeue_enter; 29269 break; 29270 default: 29271 break; 29272 } 29273 return (rval); 29274 } 29275 29276 /* ARGSUSED */ 29277 static int 29278 ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 29279 caddr_t addr, cred_t *cr) 29280 { 29281 int *v = (int *)addr; 29282 long new_value; 29283 29284 if (secpolicy_net_config(cr, B_FALSE) != 0) 29285 return (EPERM); 29286 29287 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29288 return (EINVAL); 29289 29290 ip_input_proc = ip_squeue_switch(new_value); 29291 *v = new_value; 29292 return (0); 29293 } 29294 29295 /* 29296 * Handle ndd set of variables which require PRIV_SYS_NET_CONFIG such as 29297 * ip_debug. 29298 */ 29299 /* ARGSUSED */ 29300 static int 29301 ip_int_set(queue_t *q, mblk_t *mp, char *value, 29302 caddr_t addr, cred_t *cr) 29303 { 29304 int *v = (int *)addr; 29305 long new_value; 29306 29307 if (secpolicy_net_config(cr, B_FALSE) != 0) 29308 return (EPERM); 29309 29310 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29311 return (EINVAL); 29312 29313 *v = new_value; 29314 return (0); 29315 } 29316 29317 /* 29318 * Handle changes to ipmp_hook_emulation ndd variable. 29319 * Need to update phyint_hook_ifindex. 29320 * Also generate a nic plumb event should a new ifidex be assigned to a group. 
 */
static void
ipmp_hook_emulation_changed(ip_stack_t *ipst)
{
	phyint_t	*phyi;
	phyint_t	*phyi_tmp;
	char		*groupname;
	int		namelen;
	ill_t		*ill;
	boolean_t	new_group;

	/* The whole walk runs with ips_ill_g_lock held as reader. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	/*
	 * Group indices are stored in the phyint - a common structure
	 * to both IPv4 and IPv6.
	 */
	phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
	for (; phyi != NULL;
	    phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    phyi, AVL_AFTER)) {
		/* Ignore the ones that do not have a group */
		if (phyi->phyint_groupname_len == 0)
			continue;

		/*
		 * Look for other phyint in group.
		 * Clear name/namelen so the lookup doesn't find ourselves.
		 */
		namelen = phyi->phyint_groupname_len;
		groupname = phyi->phyint_groupname;
		phyi->phyint_groupname_len = 0;
		phyi->phyint_groupname = NULL;

		phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
		/* Restore */
		phyi->phyint_groupname_len = namelen;
		phyi->phyint_groupname = groupname;

		new_group = B_FALSE;
		if (ipst->ips_ipmp_hook_emulation) {
			/*
			 * If the group already exists and has already
			 * been assigned a group ifindex, we use the existing
			 * group_ifindex, otherwise we pick a new group_ifindex
			 * here.
			 */
			if (phyi_tmp != NULL &&
			    phyi_tmp->phyint_group_ifindex != 0) {
				phyi->phyint_group_ifindex =
				    phyi_tmp->phyint_group_ifindex;
			} else {
				/* XXX We need a recovery strategy here. */
				if (!ip_assign_ifindex(
				    &phyi->phyint_group_ifindex, ipst))
					cmn_err(CE_PANIC,
					    "ip_assign_ifindex() failed");
				new_group = B_TRUE;
			}
		} else {
			/* Emulation is off: no group ifindex is kept. */
			phyi->phyint_group_ifindex = 0;
		}
		/*
		 * The hook ifindex follows the group ifindex while emulation
		 * is on, and the phyint's own ifindex otherwise.
		 */
		if (ipst->ips_ipmp_hook_emulation)
			phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
		else
			phyi->phyint_hook_ifindex = phyi->phyint_ifindex;

		/*
		 * For IP Filter to find out the relationship between
		 * names and interface indices, we need to generate
		 * a NE_PLUMB event when a new group can appear.
		 * We always generate events when a new interface appears
		 * (even when ipmp_hook_emulation is set) so there
		 * is no need to generate NE_PLUMB events when
		 * ipmp_hook_emulation is turned off.
		 * And since it isn't critical for IP Filter to get
		 * the NE_UNPLUMB events we skip those here.
		 */
		if (new_group) {
			/*
			 * First phyint in group - generate group PLUMB event.
			 * Since we are not running inside the ipsq we do
			 * the dispatch immediately.
			 */
			/* Prefer the IPv4 ill; fall back to the IPv6 one. */
			if (phyi->phyint_illv4 != NULL)
				ill = phyi->phyint_illv4;
			else
				ill = phyi->phyint_illv6;

			if (ill != NULL) {
				mutex_enter(&ill->ill_lock);
				ill_nic_info_plumb(ill, B_TRUE);
				ill_nic_info_dispatch(ill);
				mutex_exit(&ill->ill_lock);
			}
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
}

/*
 * ndd set handler for the ipmp_hook_emulation variable.
 * Parses `value' as a decimal number; if it differs from the int that
 * `addr' points to, stores it there and calls
 * ipmp_hook_emulation_changed() for the queue's IP stack instance.
 * Returns EINVAL if `value' cannot be parsed, otherwise 0.
 */
/* ARGSUSED */
static int
ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *v = (int *)addr;
	long new_value;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (*v != new_value) {
		*v = new_value;
		ipmp_hook_emulation_changed(ipst);
	}
	return (0);
}

/*
 * Create and install the virtual named kstat "ipstat" (module "ip",
 * class "net") for the given netstack. The counter names come from the
 * local template, which is copied into the caller-supplied
 * `ip_statisticsp'; that storage then serves as the kstat's data.
 * The stack id is remembered in ks_private.
 * Returns the kstat, or NULL if it could not be created.
 */
static void *
ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
{
	kstat_t *ksp;

	ip_stat_t template = {
		{ "ipsec_fanout_proto",		KSTAT_DATA_UINT64 },
		{ "ip_udp_fannorm",		KSTAT_DATA_UINT64 },
		{ "ip_udp_fanmb",		KSTAT_DATA_UINT64 },
		{ "ip_udp_fanothers",		KSTAT_DATA_UINT64 },
		{ "ip_udp_fast_path",		KSTAT_DATA_UINT64 },
		{ "ip_udp_slow_path",		KSTAT_DATA_UINT64 },
		{ "ip_udp_input_err",		KSTAT_DATA_UINT64 },
		{ "ip_tcppullup",		KSTAT_DATA_UINT64 },
		{ "ip_tcpoptions",		KSTAT_DATA_UINT64 },
		{ "ip_multipkttcp",		KSTAT_DATA_UINT64 },
		{ "ip_tcp_fast_path",		KSTAT_DATA_UINT64 },
		{ "ip_tcp_slow_path",		KSTAT_DATA_UINT64 },
		{ "ip_tcp_input_error",		KSTAT_DATA_UINT64 },
		{ "ip_db_ref",			KSTAT_DATA_UINT64 },
		{ "ip_notaligned1",		KSTAT_DATA_UINT64 },
		{ "ip_notaligned2",		KSTAT_DATA_UINT64 },
		{ "ip_multimblk3",		KSTAT_DATA_UINT64 },
		{ "ip_multimblk4",		KSTAT_DATA_UINT64 },
		{ "ip_ipoptions",		KSTAT_DATA_UINT64 },
		{ "ip_classify_fail",		KSTAT_DATA_UINT64 },
		{ "ip_opt",			KSTAT_DATA_UINT64 },
		{ "ip_udp_rput_local",		KSTAT_DATA_UINT64 },
		{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
		{ "ip_conn_flputbq",		KSTAT_DATA_UINT64 },
		{ "ip_conn_walk_drain",		KSTAT_DATA_UINT64 },
		{ "ip_out_sw_cksum",		KSTAT_DATA_UINT64 },
		{ "ip_in_sw_cksum",		KSTAT_DATA_UINT64 },
		{ "ip_trash_ire_reclaim_calls",	KSTAT_DATA_UINT64 },
		{ "ip_trash_ire_reclaim_success",	KSTAT_DATA_UINT64 },
		{ "ip_ire_arp_timer_expired",	KSTAT_DATA_UINT64 },
		{ "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 },
		{ "ip_ire_pmtu_timer_expired",	KSTAT_DATA_UINT64 },
		{ "ip_input_multi_squeue",	KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
		{ "ip_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
		{ "ip_tcp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
		{ "ip_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
		{ "ip_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
		{ "ip_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
		{ "ip_udp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_pkt_out",		KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_discarded",		KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_allocfail",		KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_addpdescfail",		KSTAT_DATA_UINT64 },
		{ "ip_frag_mdt_allocd",			KSTAT_DATA_UINT64 },
	};

	/* One kstat_named_t per template entry. */
	ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, stackid);

	if (ksp == NULL)
		return (NULL);

	bcopy(&template, ip_statisticsp, sizeof (template));
	ksp->ks_data = (void *)ip_statisticsp;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

/*
 * Delete the kstat created by ip_kstat2_init(). A NULL ksp is ignored.
 */
static void
ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}

/*
 * Create and install the named kstat "ip" (module "ip", class "mib2")
 * for the given netstack. The template supplies the counter names; a few
 * fields (forwarding, defaultTTL, reasmTimeout and the *EntrySize values)
 * are filled in here before the template is copied into the kstat's data.
 * ip_kstat_update is installed as the update callback and the stack id is
 * remembered in ks_private.
 * Returns the kstat, or NULL if it could not be created.
 */
static void *
ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
{
	kstat_t	*ksp;

	ip_named_kstat_t template = {
		{ "forwarding",		KSTAT_DATA_UINT32, 0 },
		{ "defaultTTL",		KSTAT_DATA_UINT32, 0 },
		{ "inReceives",		KSTAT_DATA_UINT64, 0 },
		{ "inHdrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "inAddrErrors",	KSTAT_DATA_UINT32, 0 },
		{ "forwDatagrams",	KSTAT_DATA_UINT64, 0 },
		{ "inUnknownProtos",	KSTAT_DATA_UINT32, 0 },
		{ "inDiscards",		KSTAT_DATA_UINT32, 0 },
		{ "inDelivers",		KSTAT_DATA_UINT64, 0 },
		{ "outRequests",	KSTAT_DATA_UINT64, 0 },
		{ "outDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "outNoRoutes",	KSTAT_DATA_UINT32, 0 },
		{ "reasmTimeout",	KSTAT_DATA_UINT32, 0 },
		{ "reasmReqds",		KSTAT_DATA_UINT32, 0 },
		{ "reasmOKs",		KSTAT_DATA_UINT32, 0 },
		{ "reasmFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragOKs",		KSTAT_DATA_UINT32, 0 },
		{ "fragFails",		KSTAT_DATA_UINT32, 0 },
		{ "fragCreates",	KSTAT_DATA_UINT32, 0 },
		{ "addrEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routeEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "netToMediaEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "routingDiscards",	KSTAT_DATA_UINT32, 0 },
		{ "inErrs",		KSTAT_DATA_UINT32, 0 },
		{ "noPorts",		KSTAT_DATA_UINT32, 0 },
		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "reasmDuplicates",	KSTAT_DATA_UINT32, 0 },
		{ "reasmPartDups",	KSTAT_DATA_UINT32, 0 },
		{ "forwProhibits",	KSTAT_DATA_UINT32, 0 },
		{ "udpInCksumErrs",	KSTAT_DATA_UINT32, 0 },
		{ "udpInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "rawipInOverflows",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInSucceeded",	KSTAT_DATA_UINT32, 0 },
		{ "ipsecInFailed",	KSTAT_DATA_INT32, 0 },
		{ "memberEntrySize",	KSTAT_DATA_INT32, 0 },
		{ "inIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outIPv6",		KSTAT_DATA_UINT32, 0 },
		{ "outSwitchIPv6",	KSTAT_DATA_UINT32, 0 },
	};

	ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
	if (ksp == NULL || ksp->ks_data == NULL)
		return (NULL);

	/* forwarding is reported as 1 when forwarding, 2 otherwise. */
	template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2;
	template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
	template.reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout;
	template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
	template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);

	template.netToMediaEntrySize.value.i32 =
	    sizeof (mib2_ipNetToMediaEntry_t);

	template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);

	bcopy(&template, ksp->ks_data, sizeof (template));
	ksp->ks_update = ip_kstat_update;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

/*
 * Delete the kstat created by ip_kstat_init(). A NULL ksp is ignored.
 */
static void
ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
{
	if (ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
		kstat_delete_netstack(ksp, stackid);
	}
}

static int
ip_kstat_update(kstat_t *kp, int rw)
{
	ip_named_kstat_t *ipkp;
	mib2_ipIfStatsEntry_t ipmib;
	ill_walk_context_t ctx;
	ill_t *ill;
	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
	netstack_t	*ns;
	ip_stack_t	*ipst;

	if (kp == NULL || kp->ks_data == NULL)
		return (EIO);

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ns = netstack_find_by_stackid(stackid);
	if (ns == NULL)
		return (-1);
	ipst = ns->netstack_ip;
	if (ipst == NULL) {
		netstack_rele(ns);
		return (-1);
	}
	ipkp = (ip_named_kstat_t *)kp->ks_data;

	bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
29624 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 29625 ill = ILL_START_WALK_V4(&ctx, ipst); 29626 for (; ill != NULL; ill = ill_next(&ctx, ill)) 29627 ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib); 29628 rw_exit(&ipst->ips_ill_g_lock); 29629 29630 ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding; 29631 ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL; 29632 ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives; 29633 ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors; 29634 ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors; 29635 ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams; 29636 ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos; 29637 ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards; 29638 ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers; 29639 ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests; 29640 ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards; 29641 ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes; 29642 ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout; 29643 ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds; 29644 ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs; 29645 ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails; 29646 ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs; 29647 ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails; 29648 ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates; 29649 29650 ipkp->routingDiscards.value.ui32 = 0; 29651 ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs; 29652 ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts; 29653 ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs; 29654 ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates; 29655 ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups; 29656 ipkp->forwProhibits.value.ui32 = ipmib.ipIfStatsForwProhibits; 29657 ipkp->udpInCksumErrs.value.ui32 = ipmib.udpIfStatsInCksumErrs; 29658 
ipkp->udpInOverflows.value.ui32 = ipmib.udpIfStatsInOverflows; 29659 ipkp->rawipInOverflows.value.ui32 = ipmib.rawipIfStatsInOverflows; 29660 ipkp->ipsecInSucceeded.value.ui32 = ipmib.ipsecIfStatsInSucceeded; 29661 ipkp->ipsecInFailed.value.i32 = ipmib.ipsecIfStatsInFailed; 29662 29663 ipkp->inIPv6.value.ui32 = ipmib.ipIfStatsInWrongIPVersion; 29664 ipkp->outIPv6.value.ui32 = ipmib.ipIfStatsOutWrongIPVersion; 29665 ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion; 29666 29667 netstack_rele(ns); 29668 29669 return (0); 29670 } 29671 29672 static void * 29673 icmp_kstat_init(netstackid_t stackid) 29674 { 29675 kstat_t *ksp; 29676 29677 icmp_named_kstat_t template = { 29678 { "inMsgs", KSTAT_DATA_UINT32 }, 29679 { "inErrors", KSTAT_DATA_UINT32 }, 29680 { "inDestUnreachs", KSTAT_DATA_UINT32 }, 29681 { "inTimeExcds", KSTAT_DATA_UINT32 }, 29682 { "inParmProbs", KSTAT_DATA_UINT32 }, 29683 { "inSrcQuenchs", KSTAT_DATA_UINT32 }, 29684 { "inRedirects", KSTAT_DATA_UINT32 }, 29685 { "inEchos", KSTAT_DATA_UINT32 }, 29686 { "inEchoReps", KSTAT_DATA_UINT32 }, 29687 { "inTimestamps", KSTAT_DATA_UINT32 }, 29688 { "inTimestampReps", KSTAT_DATA_UINT32 }, 29689 { "inAddrMasks", KSTAT_DATA_UINT32 }, 29690 { "inAddrMaskReps", KSTAT_DATA_UINT32 }, 29691 { "outMsgs", KSTAT_DATA_UINT32 }, 29692 { "outErrors", KSTAT_DATA_UINT32 }, 29693 { "outDestUnreachs", KSTAT_DATA_UINT32 }, 29694 { "outTimeExcds", KSTAT_DATA_UINT32 }, 29695 { "outParmProbs", KSTAT_DATA_UINT32 }, 29696 { "outSrcQuenchs", KSTAT_DATA_UINT32 }, 29697 { "outRedirects", KSTAT_DATA_UINT32 }, 29698 { "outEchos", KSTAT_DATA_UINT32 }, 29699 { "outEchoReps", KSTAT_DATA_UINT32 }, 29700 { "outTimestamps", KSTAT_DATA_UINT32 }, 29701 { "outTimestampReps", KSTAT_DATA_UINT32 }, 29702 { "outAddrMasks", KSTAT_DATA_UINT32 }, 29703 { "outAddrMaskReps", KSTAT_DATA_UINT32 }, 29704 { "inChksumErrs", KSTAT_DATA_UINT32 }, 29705 { "inUnknowns", KSTAT_DATA_UINT32 }, 29706 { "inFragNeeded", KSTAT_DATA_UINT32 }, 29707 { 
"outFragNeeded", KSTAT_DATA_UINT32 }, 29708 { "outDrops", KSTAT_DATA_UINT32 }, 29709 { "inOverFlows", KSTAT_DATA_UINT32 }, 29710 { "inBadRedirects", KSTAT_DATA_UINT32 }, 29711 }; 29712 29713 ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED, 29714 NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid); 29715 if (ksp == NULL || ksp->ks_data == NULL) 29716 return (NULL); 29717 29718 bcopy(&template, ksp->ks_data, sizeof (template)); 29719 29720 ksp->ks_update = icmp_kstat_update; 29721 ksp->ks_private = (void *)(uintptr_t)stackid; 29722 29723 kstat_install(ksp); 29724 return (ksp); 29725 } 29726 29727 static void 29728 icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp) 29729 { 29730 if (ksp != NULL) { 29731 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 29732 kstat_delete_netstack(ksp, stackid); 29733 } 29734 } 29735 29736 static int 29737 icmp_kstat_update(kstat_t *kp, int rw) 29738 { 29739 icmp_named_kstat_t *icmpkp; 29740 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private; 29741 netstack_t *ns; 29742 ip_stack_t *ipst; 29743 29744 if ((kp == NULL) || (kp->ks_data == NULL)) 29745 return (EIO); 29746 29747 if (rw == KSTAT_WRITE) 29748 return (EACCES); 29749 29750 ns = netstack_find_by_stackid(stackid); 29751 if (ns == NULL) 29752 return (-1); 29753 ipst = ns->netstack_ip; 29754 if (ipst == NULL) { 29755 netstack_rele(ns); 29756 return (-1); 29757 } 29758 icmpkp = (icmp_named_kstat_t *)kp->ks_data; 29759 29760 icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs; 29761 icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors; 29762 icmpkp->inDestUnreachs.value.ui32 = 29763 ipst->ips_icmp_mib.icmpInDestUnreachs; 29764 icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds; 29765 icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs; 29766 icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs; 29767 icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects; 29768 
icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos; 29769 icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps; 29770 icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps; 29771 icmpkp->inTimestampReps.value.ui32 = 29772 ipst->ips_icmp_mib.icmpInTimestampReps; 29773 icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks; 29774 icmpkp->inAddrMaskReps.value.ui32 = 29775 ipst->ips_icmp_mib.icmpInAddrMaskReps; 29776 icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs; 29777 icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors; 29778 icmpkp->outDestUnreachs.value.ui32 = 29779 ipst->ips_icmp_mib.icmpOutDestUnreachs; 29780 icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds; 29781 icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs; 29782 icmpkp->outSrcQuenchs.value.ui32 = 29783 ipst->ips_icmp_mib.icmpOutSrcQuenchs; 29784 icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects; 29785 icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos; 29786 icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps; 29787 icmpkp->outTimestamps.value.ui32 = 29788 ipst->ips_icmp_mib.icmpOutTimestamps; 29789 icmpkp->outTimestampReps.value.ui32 = 29790 ipst->ips_icmp_mib.icmpOutTimestampReps; 29791 icmpkp->outAddrMasks.value.ui32 = 29792 ipst->ips_icmp_mib.icmpOutAddrMasks; 29793 icmpkp->outAddrMaskReps.value.ui32 = 29794 ipst->ips_icmp_mib.icmpOutAddrMaskReps; 29795 icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs; 29796 icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns; 29797 icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded; 29798 icmpkp->outFragNeeded.value.ui32 = 29799 ipst->ips_icmp_mib.icmpOutFragNeeded; 29800 icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops; 29801 icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows; 29802 icmpkp->inBadRedirects.value.ui32 = 29803 
ipst->ips_icmp_mib.icmpInBadRedirects; 29804 29805 netstack_rele(ns); 29806 return (0); 29807 } 29808 29809 /* 29810 * This is the fanout function for raw socket opened for SCTP. Note 29811 * that it is called after SCTP checks that there is no socket which 29812 * wants a packet. Then before SCTP handles this out of the blue packet, 29813 * this function is called to see if there is any raw socket for SCTP. 29814 * If there is and it is bound to the correct address, the packet will 29815 * be sent to that socket. Note that only one raw socket can be bound to 29816 * a port. This is assured in ipcl_sctp_hash_insert(); 29817 */ 29818 void 29819 ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, 29820 uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, 29821 zoneid_t zoneid) 29822 { 29823 conn_t *connp; 29824 queue_t *rq; 29825 mblk_t *first_mp; 29826 boolean_t secure; 29827 ip6_t *ip6h; 29828 ip_stack_t *ipst = recv_ill->ill_ipst; 29829 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 29830 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; 29831 boolean_t sctp_csum_err = B_FALSE; 29832 29833 if (flags & IP_FF_SCTP_CSUM_ERR) { 29834 sctp_csum_err = B_TRUE; 29835 flags &= ~IP_FF_SCTP_CSUM_ERR; 29836 } 29837 29838 first_mp = mp; 29839 if (mctl_present) { 29840 mp = first_mp->b_cont; 29841 secure = ipsec_in_is_secure(first_mp); 29842 ASSERT(mp != NULL); 29843 } else { 29844 secure = B_FALSE; 29845 } 29846 ip6h = (isv4) ? NULL : (ip6_t *)ipha; 29847 29848 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst); 29849 if (connp == NULL) { 29850 /* 29851 * Although raw sctp is not summed, OOB chunks must be. 29852 * Drop the packet here if the sctp checksum failed. 
29853 */ 29854 if (sctp_csum_err) { 29855 BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); 29856 freemsg(first_mp); 29857 return; 29858 } 29859 sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present); 29860 return; 29861 } 29862 rq = connp->conn_rq; 29863 if (!canputnext(rq)) { 29864 CONN_DEC_REF(connp); 29865 BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows); 29866 freemsg(first_mp); 29867 return; 29868 } 29869 if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) : 29870 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) { 29871 first_mp = ipsec_check_inbound_policy(first_mp, connp, 29872 (isv4 ? ipha : NULL), ip6h, mctl_present); 29873 if (first_mp == NULL) { 29874 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 29875 CONN_DEC_REF(connp); 29876 return; 29877 } 29878 } 29879 /* 29880 * We probably should not send M_CTL message up to 29881 * raw socket. 29882 */ 29883 if (mctl_present) 29884 freeb(first_mp); 29885 29886 /* Initiate IPPF processing here if needed. */ 29887 if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) || 29888 (!isv4 && IP6_IN_IPP(flags, ipst))) { 29889 ip_process(IPP_LOCAL_IN, &mp, 29890 recv_ill->ill_phyint->phyint_ifindex); 29891 if (mp == NULL) { 29892 CONN_DEC_REF(connp); 29893 return; 29894 } 29895 } 29896 29897 if (connp->conn_recvif || connp->conn_recvslla || 29898 ((connp->conn_ip_recvpktinfo || 29899 (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && 29900 (flags & IP_FF_IPINFO))) { 29901 int in_flags = 0; 29902 29903 /* 29904 * Since sctp does not support IP_RECVPKTINFO for v4, only pass 29905 * IPF_RECVIF. 
29906 */ 29907 if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { 29908 in_flags = IPF_RECVIF; 29909 } 29910 if (connp->conn_recvslla) { 29911 in_flags |= IPF_RECVSLLA; 29912 } 29913 if (isv4) { 29914 mp = ip_add_info(mp, recv_ill, in_flags, 29915 IPCL_ZONEID(connp), ipst); 29916 } else { 29917 mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); 29918 if (mp == NULL) { 29919 BUMP_MIB(recv_ill->ill_ip_mib, 29920 ipIfStatsInDiscards); 29921 CONN_DEC_REF(connp); 29922 return; 29923 } 29924 } 29925 } 29926 29927 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 29928 /* 29929 * We are sending the IPSEC_IN message also up. Refer 29930 * to comments above this function. 29931 * This is the SOCK_RAW, IPPROTO_SCTP case. 29932 */ 29933 (connp->conn_recv)(connp, mp, NULL); 29934 CONN_DEC_REF(connp); 29935 } 29936 29937 #define UPDATE_IP_MIB_OB_COUNTERS(ill, len) \ 29938 { \ 29939 BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits); \ 29940 UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len)); \ 29941 } 29942 /* 29943 * This function should be called only if all packet processing 29944 * including fragmentation is complete. Callers of this function 29945 * must set mp->b_prev to one of these values: 29946 * {0, IPP_FWD_OUT, IPP_LOCAL_OUT} 29947 * prior to handing over the mp as first argument to this function. 29948 * 29949 * If the ire passed by caller is incomplete, this function 29950 * queues the packet and if necessary, sends ARP request and bails. 29951 * If the ire passed is fully resolved, we simply prepend 29952 * the link-layer header to the packet, do ipsec hw acceleration 29953 * work if necessary, and send the packet out on the wire. 29954 * 29955 * NOTE: IPsec will only call this function with fully resolved 29956 * ires if hw acceleration is involved. 
29957 * TODO list : 29958 * a Handle M_MULTIDATA so that 29959 * tcp_multisend->tcp_multisend_data can 29960 * call ip_xmit_v4 directly 29961 * b Handle post-ARP work for fragments so that 29962 * ip_wput_frag can call this function. 29963 */ 29964 ipxmit_state_t 29965 ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) 29966 { 29967 nce_t *arpce; 29968 ipha_t *ipha; 29969 queue_t *q; 29970 int ill_index; 29971 mblk_t *nxt_mp, *first_mp; 29972 boolean_t xmit_drop = B_FALSE; 29973 ip_proc_t proc; 29974 ill_t *out_ill; 29975 int pkt_len; 29976 29977 arpce = ire->ire_nce; 29978 ASSERT(arpce != NULL); 29979 29980 DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce); 29981 29982 mutex_enter(&arpce->nce_lock); 29983 switch (arpce->nce_state) { 29984 case ND_REACHABLE: 29985 /* If there are other queued packets, queue this packet */ 29986 if (arpce->nce_qd_mp != NULL) { 29987 if (mp != NULL) 29988 nce_queue_mp_common(arpce, mp, B_FALSE); 29989 mp = arpce->nce_qd_mp; 29990 } 29991 arpce->nce_qd_mp = NULL; 29992 mutex_exit(&arpce->nce_lock); 29993 29994 /* 29995 * Flush the queue. In the common case, where the 29996 * ARP is already resolved, it will go through the 29997 * while loop only once. 29998 */ 29999 while (mp != NULL) { 30000 30001 nxt_mp = mp->b_next; 30002 mp->b_next = NULL; 30003 ASSERT(mp->b_datap->db_type != M_CTL); 30004 pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length); 30005 /* 30006 * This info is needed for IPQOS to do COS marking 30007 * in ip_wput_attach_llhdr->ip_process. 
30008 */ 30009 proc = (ip_proc_t)(uintptr_t)mp->b_prev; 30010 mp->b_prev = NULL; 30011 30012 /* set up ill index for outbound qos processing */ 30013 out_ill = ire_to_ill(ire); 30014 ill_index = out_ill->ill_phyint->phyint_ifindex; 30015 first_mp = ip_wput_attach_llhdr(mp, ire, proc, 30016 ill_index, &ipha); 30017 if (first_mp == NULL) { 30018 xmit_drop = B_TRUE; 30019 BUMP_MIB(out_ill->ill_ip_mib, 30020 ipIfStatsOutDiscards); 30021 goto next_mp; 30022 } 30023 30024 /* non-ipsec hw accel case */ 30025 if (io == NULL || !io->ipsec_out_accelerated) { 30026 /* send it */ 30027 q = ire->ire_stq; 30028 if (proc == IPP_FWD_OUT) { 30029 UPDATE_IB_PKT_COUNT(ire); 30030 } else { 30031 UPDATE_OB_PKT_COUNT(ire); 30032 } 30033 ire->ire_last_used_time = lbolt; 30034 30035 if (flow_ctl_enabled || canputnext(q)) { 30036 if (proc == IPP_FWD_OUT) { 30037 30038 BUMP_MIB(out_ill->ill_ip_mib, 30039 ipIfStatsHCOutForwDatagrams); 30040 30041 } 30042 UPDATE_IP_MIB_OB_COUNTERS(out_ill, 30043 pkt_len); 30044 30045 DTRACE_IP7(send, mblk_t *, first_mp, 30046 conn_t *, NULL, void_ip_t *, ipha, 30047 __dtrace_ipsr_ill_t *, out_ill, 30048 ipha_t *, ipha, ip6_t *, NULL, int, 30049 0); 30050 30051 putnext(q, first_mp); 30052 } else { 30053 BUMP_MIB(out_ill->ill_ip_mib, 30054 ipIfStatsOutDiscards); 30055 xmit_drop = B_TRUE; 30056 freemsg(first_mp); 30057 } 30058 } else { 30059 /* 30060 * Safety Pup says: make sure this 30061 * is going to the right interface! 
30062 */ 30063 ill_t *ill1 = 30064 (ill_t *)ire->ire_stq->q_ptr; 30065 int ifindex = 30066 ill1->ill_phyint->phyint_ifindex; 30067 if (ifindex != 30068 io->ipsec_out_capab_ill_index) { 30069 xmit_drop = B_TRUE; 30070 freemsg(mp); 30071 } else { 30072 UPDATE_IP_MIB_OB_COUNTERS(ill1, 30073 pkt_len); 30074 30075 DTRACE_IP7(send, mblk_t *, first_mp, 30076 conn_t *, NULL, void_ip_t *, ipha, 30077 __dtrace_ipsr_ill_t *, ill1, 30078 ipha_t *, ipha, ip6_t *, NULL, 30079 int, 0); 30080 30081 ipsec_hw_putnext(ire->ire_stq, mp); 30082 } 30083 } 30084 next_mp: 30085 mp = nxt_mp; 30086 } /* while (mp != NULL) */ 30087 if (xmit_drop) 30088 return (SEND_FAILED); 30089 else 30090 return (SEND_PASSED); 30091 30092 case ND_INITIAL: 30093 case ND_INCOMPLETE: 30094 30095 /* 30096 * While we do send off packets to dests that 30097 * use fully-resolved CGTP routes, we do not 30098 * handle unresolved CGTP routes. 30099 */ 30100 ASSERT(!(ire->ire_flags & RTF_MULTIRT)); 30101 ASSERT(io == NULL || !io->ipsec_out_accelerated); 30102 30103 if (mp != NULL) { 30104 /* queue the packet */ 30105 nce_queue_mp_common(arpce, mp, B_FALSE); 30106 } 30107 30108 if (arpce->nce_state == ND_INCOMPLETE) { 30109 mutex_exit(&arpce->nce_lock); 30110 DTRACE_PROBE3(ip__xmit__incomplete, 30111 (ire_t *), ire, (mblk_t *), mp, 30112 (ipsec_out_t *), io); 30113 return (LOOKUP_IN_PROGRESS); 30114 } 30115 30116 arpce->nce_state = ND_INCOMPLETE; 30117 mutex_exit(&arpce->nce_lock); 30118 /* 30119 * Note that ire_add() (called from ire_forward()) 30120 * holds a ref on the ire until ARP is completed. 30121 */ 30122 30123 ire_arpresolve(ire, ire_to_ill(ire)); 30124 return (LOOKUP_IN_PROGRESS); 30125 default: 30126 ASSERT(0); 30127 mutex_exit(&arpce->nce_lock); 30128 return (LLHDR_RESLV_FAILED); 30129 } 30130 } 30131 30132 #undef UPDATE_IP_MIB_OB_COUNTERS 30133 30134 /* 30135 * Return B_TRUE if the buffers differ in length or content. 30136 * This is used for comparing extension header buffers. 
30137 * Note that an extension header would be declared different 30138 * even if all that changed was the next header value in that header i.e. 30139 * what really changed is the next extension header. 30140 */ 30141 boolean_t 30142 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf, 30143 uint_t blen) 30144 { 30145 if (!b_valid) 30146 blen = 0; 30147 30148 if (alen != blen) 30149 return (B_TRUE); 30150 if (alen == 0) 30151 return (B_FALSE); /* Both zero length */ 30152 return (bcmp(abuf, bbuf, alen)); 30153 } 30154 30155 /* 30156 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok. 30157 * Return B_FALSE if memory allocation fails - don't change any state! 30158 */ 30159 boolean_t 30160 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 30161 const void *src, uint_t srclen) 30162 { 30163 void *dst; 30164 30165 if (!src_valid) 30166 srclen = 0; 30167 30168 ASSERT(*dstlenp == 0); 30169 if (src != NULL && srclen != 0) { 30170 dst = mi_alloc(srclen, BPRI_MED); 30171 if (dst == NULL) 30172 return (B_FALSE); 30173 } else { 30174 dst = NULL; 30175 } 30176 if (*dstp != NULL) 30177 mi_free(*dstp); 30178 *dstp = dst; 30179 *dstlenp = dst == NULL ? 0 : srclen; 30180 return (B_TRUE); 30181 } 30182 30183 /* 30184 * Replace what is in *dst, *dstlen with the source. 30185 * Assumes ip_allocbuf has already been called. 30186 */ 30187 void 30188 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 30189 const void *src, uint_t srclen) 30190 { 30191 if (!src_valid) 30192 srclen = 0; 30193 30194 ASSERT(*dstlenp == srclen); 30195 if (src != NULL && srclen != 0) 30196 bcopy(src, *dstp, srclen); 30197 } 30198 30199 /* 30200 * Free the storage pointed to by the members of an ip6_pkt_t. 
30201 */ 30202 void 30203 ip6_pkt_free(ip6_pkt_t *ipp) 30204 { 30205 ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); 30206 30207 if (ipp->ipp_fields & IPPF_HOPOPTS) { 30208 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); 30209 ipp->ipp_hopopts = NULL; 30210 ipp->ipp_hopoptslen = 0; 30211 } 30212 if (ipp->ipp_fields & IPPF_RTDSTOPTS) { 30213 kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 30214 ipp->ipp_rtdstopts = NULL; 30215 ipp->ipp_rtdstoptslen = 0; 30216 } 30217 if (ipp->ipp_fields & IPPF_DSTOPTS) { 30218 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); 30219 ipp->ipp_dstopts = NULL; 30220 ipp->ipp_dstoptslen = 0; 30221 } 30222 if (ipp->ipp_fields & IPPF_RTHDR) { 30223 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); 30224 ipp->ipp_rthdr = NULL; 30225 ipp->ipp_rthdrlen = 0; 30226 } 30227 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 30228 IPPF_RTHDR); 30229 } 30230