1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/dlpi.h> 31 #include <sys/stropts.h> 32 #include <sys/sysmacros.h> 33 #include <sys/strsubr.h> 34 #include <sys/strlog.h> 35 #include <sys/strsun.h> 36 #include <sys/zone.h> 37 #define _SUN_TPI_VERSION 2 38 #include <sys/tihdr.h> 39 #include <sys/xti_inet.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/debug.h> 44 #include <sys/kobj.h> 45 #include <sys/modctl.h> 46 #include <sys/atomic.h> 47 #include <sys/policy.h> 48 #include <sys/priv.h> 49 50 #include <sys/systm.h> 51 #include <sys/param.h> 52 #include <sys/kmem.h> 53 #include <sys/sdt.h> 54 #include <sys/socket.h> 55 #include <sys/vtrace.h> 56 #include <sys/isa_defs.h> 57 #include <sys/mac.h> 58 #include <net/if.h> 59 #include <net/if_arp.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <net/if_dl.h> 64 65 #include <inet/common.h> 66 #include <inet/mi.h> 67 #include <inet/mib2.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/snmpcom.h> 71 #include <inet/optcom.h> 72 #include <inet/kstatcom.h> 73 74 #include <netinet/igmp_var.h> 75 #include <netinet/ip6.h> 76 #include <netinet/icmp6.h> 77 #include <netinet/sctp.h> 78 79 #include <inet/ip.h> 80 #include <inet/ip_impl.h> 81 #include <inet/ip6.h> 82 #include <inet/ip6_asp.h> 83 #include <inet/tcp.h> 84 #include <inet/tcp_impl.h> 85 #include <inet/ip_multi.h> 86 #include <inet/ip_if.h> 87 #include <inet/ip_ire.h> 88 #include <inet/ip_ftable.h> 89 #include <inet/ip_rts.h> 90 #include <inet/ip_ndp.h> 91 #include <inet/ip_listutils.h> 92 #include <netinet/igmp.h> 93 #include <netinet/ip_mroute.h> 94 #include <inet/ipp_common.h> 95 96 #include <net/pfkeyv2.h> 97 #include <inet/ipsec_info.h> 98 #include <inet/sadb.h> 99 #include <inet/ipsec_impl.h> 100 #include <sys/iphada.h> 101 #include <inet/tun.h> 102 #include <inet/ipdrop.h> 103 #include <inet/ip_netinfo.h> 104 105 #include 
<sys/ethernet.h> 106 #include <net/if_types.h> 107 #include <sys/cpuvar.h> 108 109 #include <ipp/ipp.h> 110 #include <ipp/ipp_impl.h> 111 #include <ipp/ipgpc/ipgpc.h> 112 113 #include <sys/multidata.h> 114 #include <sys/pattr.h> 115 116 #include <inet/ipclassifier.h> 117 #include <inet/sctp_ip.h> 118 #include <inet/sctp/sctp_impl.h> 119 #include <inet/udp_impl.h> 120 #include <inet/rawip_impl.h> 121 #include <inet/rts_impl.h> 122 #include <sys/sunddi.h> 123 124 #include <sys/tsol/label.h> 125 #include <sys/tsol/tnet.h> 126 127 #include <rpc/pmap_prot.h> 128 129 /* 130 * Values for squeue switch: 131 * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain 132 * IP_SQUEUE_ENTER: squeue_enter 133 * IP_SQUEUE_FILL: squeue_fill 134 */ 135 int ip_squeue_enter = 2; /* Settable in /etc/system; defaults to 2 */ 136 137 squeue_func_t ip_input_proc; 138 #define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) /* casts x to an mblk_t pointer by way of uintptr_t */ 139 140 /* 141 * Settable in /etc/system 142 */ 143 int ip_poll_normal_ms = 100; 144 int ip_poll_normal_ticks = 0; 145 int ip_modclose_ackwait_ms = 3000; 146 147 /* 148 * It would be nice to have these present only in DEBUG systems, but the 149 * current design of the global symbol checking logic requires them to be 150 * unconditionally present. 151 */ 152 uint_t ip_thread_data; /* TSD key for debug support */ 153 krwlock_t ip_thread_rwlock; 154 list_t ip_thread_list; 155 156 /* 157 * Structure to represent a linked list of msgblks, tracked by its head and 158 * tail pointers. Used by the ip_snmp_* functions. */ 159 160 struct listptr_s { 161 mblk_t *lp_head; /* pointer to the head of the list */ 162 mblk_t *lp_tail; /* pointer to the tail of the list */ 163 }; 164 165 typedef struct listptr_s listptr_t; 166 167 /* 168 * This is used by ip_snmp_get_mib2_ip_route_media and 169 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. 
170 */ 171 typedef struct iproutedata_s { 172 uint_t ird_idx; 173 listptr_t ird_route; /* ipRouteEntryTable */ 174 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 175 listptr_t ird_attrs; /* ipRouteAttributeTable */ 176 } iproutedata_t; 177 178 /* 179 * Cluster specific hooks. These should be NULL when booted as a non-cluster system. 180 */ 181 182 /* 183 * Hook functions to enable cluster networking. 184 * On non-clustered systems these vectors must always be NULL. 185 * 186 * Hook function to check whether the specified IP address is a shared IP address 187 * in the cluster. 188 * 189 */ 190 int (*cl_inet_isclusterwide)(uint8_t protocol, 191 sa_family_t addr_family, uint8_t *laddrp) = NULL; 192 193 /* 194 * Hook function to generate a cluster-wide IP fragment identifier. 195 */ 196 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 197 uint8_t *laddrp, uint8_t *faddrp) = NULL; 198 199 /* 200 * Hook function to generate a cluster-wide SPI. 201 */ 202 void (*cl_inet_getspi)(uint8_t, uint8_t *, size_t) = NULL; 203 204 /* 205 * Hook function to verify whether the SPI is already utilized. 206 */ 207 208 int (*cl_inet_checkspi)(uint8_t, uint32_t) = NULL; 209 210 /* 211 * Hook function to delete the SPI from the cluster-wide repository. 212 */ 213 214 void (*cl_inet_deletespi)(uint8_t, uint32_t) = NULL; 215 216 /* 217 * Hook function to inform the cluster when a packet is received on an IDLE SA. 218 */ 219 220 void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t, 221 in6_addr_t) = NULL; 222 223 /* 224 * Synchronization notes: 225 * 226 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 227 * MT level protection given by STREAMS. IP uses a combination of its own 228 * internal serialization mechanism and standard Solaris locking techniques. 229 * The internal serialization is per phyint (no IPMP) or per IPMP group. 
230 * This is used to serialize plumbing operations, IPMP operations, certain 231 * multicast operations, most set ioctls, igmp/mld timers etc. 232 * 233 * Plumbing is a long sequence of operations involving message 234 * exchanges between IP, ARP and device drivers. Many set ioctls are typically 235 * involved in plumbing operations. A natural model is to serialize these 236 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 237 * parallel without any interference. But various set ioctls on hme0 are best 238 * serialized. However if the system uses IPMP, the operations are easier if 239 * they are serialized on a per IPMP group basis since IPMP operations 240 * happen across ill's of a group. Thus the lowest common denominator is to 241 * serialize most set ioctls, multicast join/leave operations, IPMP operations, 242 * igmp/mld timer operations, and processing of DLPI control messages received 243 * from drivers on a per IPMP group basis. If the system does not employ 244 * IPMP the serialization is on a per phyint basis. This serialization is 245 * provided by the ipsq_t and primitives operating on this. Details can 246 * be found in ip_if.c above the core primitives operating on ipsq_t. 247 * 248 * Lookups of an ipif or ill by a thread return a refheld ipif / ill. 249 * Similarly, lookup of an ire by a thread also returns a refheld ire. 250 * In addition ipif's and ill's referenced by the ire are also indirectly 251 * refheld. Thus no ipif or ill can vanish nor can critical parameters like 252 * the ipif's address or netmask change as long as an ipif is refheld 253 * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the 254 * address of an ipif has to go through the ipsq_t. This ensures that only 255 * 1 such exclusive operation proceeds at any time on the ipif. It then 256 * deletes all ires associated with this ipif, and waits for all refcnts 257 * associated with this ipif to come down to zero. 
The address is changed 258 * only after the ipif has been quiesced. Then the ipif is brought up again. 259 * More details are described above the comment in ip_sioctl_flags. 260 * 261 * Packet processing is based mostly on IREs and are fully multi-threaded 262 * using standard Solaris MT techniques. 263 * 264 * There are explicit locks in IP to handle: 265 * - The ip_g_head list maintained by mi_open_link() and friends. 266 * 267 * - The reassembly data structures (one lock per hash bucket) 268 * 269 * - conn_lock is meant to protect conn_t fields. The fields actually 270 * protected by conn_lock are documented in the conn_t definition. 271 * 272 * - ire_lock to protect some of the fields of the ire, IRE tables 273 * (one lock per hash bucket). Refer to ip_ire.c for details. 274 * 275 * - ndp_g_lock and nce_lock for protecting NCEs. 276 * 277 * - ill_lock protects fields of the ill and ipif. Details in ip.h 278 * 279 * - ill_g_lock: This is a global reader/writer lock. Protects the following 280 * * The AVL tree based global multi list of all ills. 281 * * The linked list of all ipifs of an ill 282 * * The <ill-ipsq> mapping 283 * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next 284 * * The illgroup list threaded by ill_group_next. 285 * * <ill-phyint> association 286 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 287 * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion 288 * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill 289 * will all have to hold the ill_g_lock as writer for the actual duration 290 * of the insertion/deletion/change. More details about the <ill-ipsq> mapping 291 * may be found in the IPMP section. 292 * 293 * - ill_lock: This is a per ill mutex. 294 * It protects some members of the ill and is documented below. 295 * It also protects the <ill-ipsq> mapping 296 * It also protects the illgroup list threaded by ill_group_next. 
297 * It also protects the <ill-phyint> assoc. 298 * It also protects the list of ipifs hanging off the ill. 299 * 300 * - ipsq_lock: This is a per ipsq_t mutex lock. 301 * This protects all the other members of the ipsq struct except 302 * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock 303 * 304 * - illgrp_lock: This is a per ill_group mutex lock. 305 * The only thing it protects is the illgrp_ill_schednext member of ill_group 306 * which dictates which is the next ill in an ill_group that is to be chosen 307 * for sending outgoing packets, through creation of an IRE_CACHE that 308 * references this ill. 309 * 310 * - phyint_lock: This is a per phyint mutex lock. Protects just the 311 * phyint_flags 312 * 313 * - ip_g_nd_lock: This is a global reader/writer lock. 314 * Any call to nd_load to load a new parameter to the ND table must hold the 315 * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 316 * as reader. 317 * 318 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. 319 * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the 320 * uniqueness check also done atomically. 321 * 322 * - ipsec_capab_ills_lock: This readers/writer lock protects the global 323 * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken 324 * as a writer when adding or deleting elements from these lists, and 325 * as a reader when walking these lists to send a SADB update to the 326 * IPsec capable ills. 327 * 328 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc 329 * group list linked by ill_usesrc_grp_next. It also protects the 330 * ill_usesrc_ifindex field. It is taken as a writer when a member of the 331 * group is being added or deleted. This lock is taken as a reader when 332 * walking the list/group(eg: to get the number of members in a usesrc group). 
333 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next 334 * field is changing state i.e. from NULL to non-NULL or vice-versa. For 335 * example, it is not necessary to take this lock in the initial portion 336 * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and 337 * ip_sioctl_flags since these operations are executed exclusively and 338 * that ensures that the "usesrc group state" cannot change. The "usesrc 339 * group state" change can happen only in the latter part of 340 * ip_sioctl_slifusesrc and in ill_delete. 341 * 342 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations. 343 * 344 * To change the <ill-phyint> association, the ill_g_lock must be held 345 * as writer, and the ill_locks of both the v4 and v6 instances of the ill 346 * must be held. 347 * 348 * To change the <ill-ipsq> association the ill_g_lock must be held as writer 349 * and the ill_lock of the ill in question must be held. 350 * 351 * To change the <ill-illgroup> association the ill_g_lock must be held as 352 * writer and the ill_lock of the ill in question must be held. 353 * 354 * To add or delete an ipif from the list of ipifs hanging off the ill, 355 * ill_g_lock (writer) and ill_lock must be held and the thread must be 356 * a writer on the associated ipsq. 357 * 358 * To add or delete an ill to the system, the ill_g_lock must be held as 359 * writer and the thread must be a writer on the associated ipsq. 360 * 361 * To add or delete an ilm to an ill, the ill_lock must be held and the thread 362 * must be a writer on the associated ipsq. 363 * 364 * Lock hierarchy 365 * 366 * Some lock hierarchy scenarios are listed below. 
367 * 368 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 369 * ill_g_lock -> illgrp_lock -> ill_lock 370 * ill_g_lock -> ill_lock(s) -> phyint_lock 371 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock 372 * ill_g_lock -> ip_addr_avail_lock 373 * conn_lock -> irb_lock -> ill_lock -> ire_lock 374 * ill_g_lock -> ip_g_nd_lock 375 * 376 * When more than 1 ill lock is needed to be held, all ill lock addresses 377 * are sorted on address and locked starting from highest addressed lock 378 * downward. 379 * 380 * IPsec scenarios 381 * 382 * ipsa_lock -> ill_g_lock -> ill_lock 383 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock 384 * ipsec_capab_ills_lock -> ipsa_lock 385 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 386 * 387 * Trusted Solaris scenarios 388 * 389 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 390 * igsa_lock -> gcdb_lock 391 * gcgrp_rwlock -> ire_lock 392 * gcgrp_rwlock -> gcdb_lock 393 * 394 * 395 * Routing/forwarding table locking notes: 396 * 397 * Lock acquisition order: Radix tree lock, irb_lock. 398 * Requirements: 399 * i. Walker must not hold any locks during the walker callback. 400 * ii Walker must not see a truncated tree during the walk because of any node 401 * deletion. 402 * iii Existing code assumes ire_bucket is valid if it is non-null and is used 403 * in many places in the code to walk the irb list. Thus even if all the 404 * ires in a bucket have been deleted, we still can't free the radix node 405 * until the ires have actually been inactive'd (freed). 406 * 407 * Tree traversal - Need to hold the global tree lock in read mode. 408 * Before dropping the global tree lock, need to either increment the ire_refcnt 409 * to ensure that the radix node can't be deleted. 410 * 411 * Tree add - Need to hold the global tree lock in write mode to add a 412 * radix node. To prevent the node from being deleted, increment the 413 * irb_refcnt, after the node is added to the tree. 
The ire itself is 414 * added later while holding the irb_lock, but not the tree lock. 415 * 416 * Tree delete - Need to hold the global tree lock and irb_lock in write mode. 417 * All associated ires must be inactive (i.e. freed), and irb_refcnt 418 * must be zero. 419 * 420 * Walker - Increment irb_refcnt before calling the walker callback. Hold the 421 * global tree lock (read mode) for traversal. 422 * 423 * IPsec notes: 424 * 425 * IP interacts with the IPsec code (AH/ESP) by tagging an M_CTL message 426 * in front of the actual packet. For outbound datagrams, the M_CTL 427 * contains an ipsec_out_t (defined in ipsec_info.h), which has the 428 * information used by the IPsec code for applying the right level of 429 * protection. The information initialized by IP in the ipsec_out_t 430 * is determined by the per-socket policy or global policy in the system. 431 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in 432 * ipsec_info.h) which starts out with nothing in it. It gets filled 433 * with the right information if it goes through the AH/ESP code, which 434 * happens if the incoming packet is secure. The information initialized 435 * by AH/ESP is later used by IP (during fanouts to ULP) to see whether 436 * the policy requirements needed by per-socket policy or global policy 437 * are met or not. 438 * 439 * If there is both per-socket policy (set using setsockopt) and there 440 * is also global policy match for the 5 tuples of the socket, 441 * ipsec_override_policy() makes the decision of which one to use. 442 * 443 * For fully connected sockets i.e. dst, src [addr, port] is known, 444 * conn_policy_cached is set indicating that policy has been cached. 445 * conn_in_enforce_policy may or may not be set depending on whether 446 * there is a global policy match or per-socket policy match. 447 * Policy inheriting happens in ip_bind during the ipa_conn_t bind. 
448 * Once the right policy is set on the conn_t, policy cannot change for 449 * this socket. This makes life simpler for TCP (UDP ?) where 450 * re-transmissions go out with the same policy. For symmetry, policy 451 * is cached for fully connected UDP sockets also. Thus if policy is cached, 452 * it also implies that policy is latched i.e policy cannot change 453 * on these sockets. As we have the right policy on the conn, we don't 454 * have to lookup global policy for every outbound and inbound datagram 455 * and thus serving as an optimization. Note that a global policy change 456 * does not affect fully connected sockets if they have policy. If fully 457 * connected sockets did not have any policy associated with it, global 458 * policy change may affect them. 459 * 460 * IP Flow control notes: 461 * 462 * Non-TCP streams are flow controlled by IP. On the send side, if the packet 463 * cannot be sent down to the driver by IP, because of a canput failure, IP 464 * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. 465 * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained 466 * when the flowcontrol condition subsides. Ultimately STREAMS backenables the 467 * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the 468 * first conn in the list of conn's to be drained. ip_wsrv on this conn drains 469 * the queued messages, and removes the conn from the drain list, if all 470 * messages were drained. It also qenables the next conn in the drain list to 471 * continue the drain process. 472 * 473 * In reality the drain list is not a single list, but a configurable number 474 * of lists. The ip_wsrv on the IP module, qenables the first conn in each 475 * list. If the ip_wsrv of the next qenabled conn does not run, because the 476 * stream closes, ip_close takes responsibility to qenable the next conn in 477 * the drain list. 
The directly called ip_wput path always does a putq, if 478 * it cannot putnext. Thus synchronization problems are handled between 479 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only 480 * functions that manipulate this drain list. Furthermore conn_drain_insert 481 * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv 482 * running on a queue at any time. conn_drain_tail can be simultaneously called 483 * from both ip_wsrv and ip_close. 484 * 485 * IPQoS notes: 486 * 487 * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 488 * and IPQoS modules. IPPF includes hooks in IP at different control points 489 * (callout positions) which direct packets to IPQoS modules for policy 490 * processing. Policies, if present, are global. 491 * 492 * The callout positions are located in the following paths: 493 * o local_in (packets destined for this host) 494 * o local_out (packets originating from this host) 495 * o fwd_in (packets forwarded by this m/c - inbound) 496 * o fwd_out (packets forwarded by this m/c - outbound) 497 * Hooks at these callout points can be enabled/disabled using the ndd variable 498 * ip_policy_mask (a bit mask with the 4 LSBs indicating the callout positions). 499 * By default all the callout positions are enabled. 500 * 501 * Outbound (local_out) 502 * Hooks are placed in ip_wput_ire and ipsec_out_process. 503 * 504 * Inbound (local_in) 505 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and 506 * TCP and UDP fanout routines. 507 * 508 * Forwarding (in and out) 509 * Hooks are placed in ip_rput_forward. 510 * 511 * IP Policy Framework processing (IPPF processing) 512 * Policy processing for a packet is initiated by ip_process, which ascertains 513 * that the classifier (ipgpc) is loaded and configured, failing which the 514 * packet resumes normal processing in IP. 
If the classifier is present, the 515 * packet is acted upon by one or more IPQoS modules (action instances), per 516 * filters configured in ipgpc and resumes normal IP processing thereafter. 517 * An action instance can drop a packet in the course of its processing. 518 * 519 * A boolean variable, ip_policy, is used in all the fanout routines that can 520 * invoke ip_process for a packet. This variable indicates if the packet should 521 * be sent for policy processing. The variable is set to B_TRUE by default, 522 * i.e. when the routines are invoked in the normal IP processing path for a 523 * packet. The two exceptions are ip_wput_local and icmp_inbound_error_fanout; 524 * ip_policy is set to B_FALSE for all the routines called in these two 525 * functions because, in the former case, we don't process loopback traffic 526 * currently while in the latter, the packets have already been processed in 527 * icmp_inbound. 528 * 529 * Zones notes: 530 * 531 * The partitioning rules for networking are as follows: 532 * 1) Packets coming from a zone must have a source address belonging to that 533 * zone. 534 * 2) Packets coming from a zone can only be sent on a physical interface on 535 * which the zone has an IP address. 536 * 3) Between two zones on the same machine, packet delivery is only allowed if 537 * there's a matching route for the destination and zone in the forwarding 538 * table. 539 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 540 * different zones can bind to the same port with the wildcard address 541 * (INADDR_ANY). 542 * 543 * The granularity of interface partitioning is at the logical interface level. 544 * Therefore, every zone has its own IP addresses, and incoming packets can be 545 * attributed to a zone unambiguously. A logical interface is placed into a zone 546 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 547 * structure. 
Rule (1) is implemented by modifying the source address selection 548 * algorithm so that the list of eligible addresses is filtered based on the 549 * sending process's zone. 550 * 551 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 552 * across all zones, depending on their type. Here is the break-up: 553 * 554 * IRE type Shared/exclusive 555 * -------- ---------------- 556 * IRE_BROADCAST Exclusive 557 * IRE_DEFAULT (default routes) Shared (*) 558 * IRE_LOCAL Exclusive (x) 559 * IRE_LOOPBACK Exclusive 560 * IRE_PREFIX (net routes) Shared (*) 561 * IRE_CACHE Exclusive 562 * IRE_IF_NORESOLVER (interface routes) Exclusive 563 * IRE_IF_RESOLVER (interface routes) Exclusive 564 * IRE_HOST (host routes) Shared (*) 565 * 566 * (*) A zone can only use a default or off-subnet route if the gateway is 567 * directly reachable from the zone, that is, if the gateway's address matches 568 * one of the zone's logical interfaces. 569 * 570 * (x) IRE_LOCAL entries are handled a bit differently, since for all other entries 571 * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source 572 * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP 573 * address of the zone itself (the destination). Since IRE_LOCAL is used 574 * for communication between zones, ip_wput_ire has special logic to set 575 * the right source address when sending using an IRE_LOCAL. 576 * 577 * Furthermore, when ip_restrict_interzone_loopback is set (the default), 578 * ire_cache_lookup restricts loopback using an IRE_LOCAL 579 * between zones to the case when L2 would have conceptually looped the packet 580 * back, i.e. the loopback which is required since neither Ethernet drivers 581 * nor Ethernet hardware loops them back. This is the case when the normal 582 * routes (ignoring IREs with different zoneids) would send out the packet on 583 * the same ill (or ill group) as the ill with which the IRE_LOCAL is 584 * associated. 
585 * 586 * Multiple zones can share a common broadcast address; typically all zones 587 * share the 255.255.255.255 address. Incoming as well as locally originated 588 * broadcast packets must be dispatched to all the zones on the broadcast 589 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial 590 * since some zones may not be on the 10.16.72/24 network. To handle this, each 591 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 592 * sent to every zone that has an IRE_BROADCAST entry for the destination 593 * address on the input ill, see conn_wantpacket(). 594 * 595 * Applications in different zones can join the same multicast group address. 596 * For IPv4, group memberships are per-logical interface, so they're already 597 * inherently part of a zone. For IPv6, group memberships are per-physical 598 * interface, so we distinguish IPv6 group memberships based on group address, 599 * interface and zoneid. In both cases, received multicast packets are sent to 600 * every zone for which a group membership entry exists. On IPv6 we need to 601 * check that the target zone still has an address on the receiving physical 602 * interface; it could have been removed since the application issued the 603 * IPV6_JOIN_GROUP. 604 */ 605 606 /* 607 * Squeue Fanout flags: 608 * 0: No fanout (the initial value). 609 * 1: Fanout across all squeues. 610 */ 611 boolean_t ip_squeue_fanout = 0; 612 613 /* 614 * Maximum dups allowed per packet; initial value is 10. 
615 */ 616 uint_t ip_max_frag_dups = 10; 617 /* True when the header's ipha_version_and_hdr_length equals IP_SIMPLE_HDR_VERSION. */ 618 #define IS_SIMPLE_IPH(ipha) \ 619 ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) 620 621 /* RFC1122 Conformance: the forwarding default is IP_FORWARD_NEVER. */ 622 #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER 623 /* Same value as LIFNAMSIZ. */ 624 #define ILL_MAX_NAMELEN LIFNAMSIZ 625 /* Forward declarations for functions defined elsewhere. */ 626 static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); 627 628 static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, 629 cred_t *credp, boolean_t isv6); 630 static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t, 631 ipha_t **); 632 633 static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t, 634 ip_stack_t *); 635 static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, 636 uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); 637 static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp); 638 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t, 639 mblk_t *, int, ip_stack_t *); 640 static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, 641 icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, 642 ill_t *, zoneid_t); 643 static void icmp_options_update(ipha_t *); 644 static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t, 645 ip_stack_t *); 646 static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t, 647 zoneid_t zoneid, ip_stack_t *); 648 static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *); 649 static void icmp_redirect(ill_t *, mblk_t *); 650 static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, 651 ip_stack_t *); 652 653 static void ip_arp_news(queue_t *, mblk_t *); 654 static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *, 655 ip_stack_t *); 656 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); 657 char *ip_dot_addr(ipaddr_t, char *); 658 mblk_t *ip_carve_mp(mblk_t **, ssize_t); 659 int ip_close(queue_t *, int); 660 static char *ip_dot_saddr(uchar_t *, char *); 661 static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, 
uint_t, 662 boolean_t, boolean_t, ill_t *, zoneid_t); 663 static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 664 boolean_t, boolean_t, zoneid_t); 665 static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, 666 boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); 667 static void ip_lrput(queue_t *, mblk_t *); 668 ipaddr_t ip_net_mask(ipaddr_t); 669 void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, 670 ip_stack_t *); 671 static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, 672 conn_t *, uint32_t, zoneid_t, ip_opt_info_t *); 673 char *ip_nv_lookup(nv_t *, int); 674 static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); 675 static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); 676 static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); 677 static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, 678 ipndp_t *, size_t); 679 static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 680 void ip_rput(queue_t *, mblk_t *); 681 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 682 void *dummy_arg); 683 void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); 684 static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *, 685 ip_stack_t *); 686 static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, 687 ire_t *, ip_stack_t *); 688 static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, 689 mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); 690 static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, 691 ip_stack_t *); 692 static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, 693 uint16_t *); 694 int ip_snmp_get(queue_t *, mblk_t *, int); 695 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, 696 mib2_ipIfStatsEntry_t *, ip_stack_t *); 697 static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *, 698 ip_stack_t *); 699 static 
mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *); 700 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst); 701 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst); 702 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst); 703 static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst); 704 static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *, 705 ip_stack_t *ipst); 706 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *, 707 ip_stack_t *ipst); 708 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *, 709 ip_stack_t *ipst); 710 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *, 711 ip_stack_t *ipst); 712 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *, 713 ip_stack_t *ipst); 714 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *, 715 ip_stack_t *ipst); 716 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, 717 ip_stack_t *ipst); 718 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, 719 ip_stack_t *ipst); 720 static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, 721 ip_stack_t *ipst); 722 static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, 723 ip_stack_t *ipst); 724 static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); 725 static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); 726 static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); 727 int ip_snmp_set(queue_t *, int, int, uchar_t *, int); 728 static boolean_t ip_source_routed(ipha_t *, ip_stack_t *); 729 static boolean_t ip_source_route_included(ipha_t *); 730 static void ip_trash_ire_reclaim_stack(ip_stack_t *); 731 732 static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, 733 zoneid_t, ip_stack_t *); 734 static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *); 735 static void ip_wput_local_options(ipha_t *, ip_stack_t *); 736 
static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, 737 zoneid_t, ip_stack_t *); 738 739 static void conn_drain_init(ip_stack_t *); 740 static void conn_drain_fini(ip_stack_t *); 741 static void conn_drain_tail(conn_t *connp, boolean_t closing); 742 743 static void conn_walk_drain(ip_stack_t *); 744 static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, 745 zoneid_t); 746 747 static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); 748 static void ip_stack_shutdown(netstackid_t stackid, void *arg); 749 static void ip_stack_fini(netstackid_t stackid, void *arg); 750 751 static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, 752 zoneid_t); 753 static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 754 void *dummy_arg); 755 756 static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 757 758 static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, 759 ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, 760 conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); 761 static void ip_multirt_bad_mtu(ire_t *, uint32_t); 762 763 static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); 764 static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, 765 caddr_t, cred_t *); 766 extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, 767 caddr_t cp, cred_t *cr); 768 extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, 769 cred_t *); 770 static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 771 caddr_t cp, cred_t *cr); 772 static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, 773 cred_t *); 774 static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, 775 cred_t *); 776 static squeue_func_t ip_squeue_switch(int); 777 778 static void *ip_kstat_init(netstackid_t, ip_stack_t *); 779 static void ip_kstat_fini(netstackid_t, kstat_t *); 780 static int ip_kstat_update(kstat_t *kp, int 
rw); 781 static void *icmp_kstat_init(netstackid_t); 782 static void icmp_kstat_fini(netstackid_t, kstat_t *); 783 static int icmp_kstat_update(kstat_t *kp, int rw); 784 static void *ip_kstat2_init(netstackid_t, ip_stat_t *); 785 static void ip_kstat2_fini(netstackid_t, kstat_t *); 786 787 static int ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *); 788 789 static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, 790 ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); 791 792 static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, 793 ipha_t *, ill_t *, boolean_t); 794 ipaddr_t ip_g_all_ones = IP_HOST_MASK; 795 796 /* How long, in seconds, we allow frags to hang around. */ 797 #define IP_FRAG_TIMEOUT 60 798 799 /* 800 * Threshold which determines whether MDT should be used when 801 * generating IP fragments; payload size must be greater than 802 * this threshold for MDT to take place. 803 */ 804 #define IP_WPUT_FRAG_MDT_MIN 32768 805 806 /* Setable in /etc/system only */ 807 int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; 808 809 static long ip_rput_pullups; 810 int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ 811 812 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */ 813 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ 814 815 int ip_debug; 816 817 #ifdef DEBUG 818 uint32_t ipsechw_debug = 0; 819 #endif 820 821 /* 822 * Multirouting/CGTP stuff 823 */ 824 int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ 825 826 /* 827 * XXX following really should only be in a header. Would need more 828 * header and .c clean up first. 829 */ 830 extern optdb_obj_t ip_opt_obj; 831 832 ulong_t ip_squeue_enter_unbound = 0; 833 834 /* 835 * Named Dispatch Parameter Table. 836 * All of these are alterable, within the min/max values given, at run time. 
837 */ 838 static ipparam_t lcl_param_arr[] = { 839 /* min max value name */ 840 { 0, 1, 0, "ip_respond_to_address_mask_broadcast"}, 841 { 0, 1, 1, "ip_respond_to_echo_broadcast"}, 842 { 0, 1, 1, "ip_respond_to_echo_multicast"}, 843 { 0, 1, 0, "ip_respond_to_timestamp"}, 844 { 0, 1, 0, "ip_respond_to_timestamp_broadcast"}, 845 { 0, 1, 1, "ip_send_redirects"}, 846 { 0, 1, 0, "ip_forward_directed_broadcasts"}, 847 { 0, 10, 0, "ip_mrtdebug"}, 848 { 5000, 999999999, 60000, "ip_ire_timer_interval" }, 849 { 60000, 999999999, 1200000, "ip_ire_arp_interval" }, 850 { 60000, 999999999, 60000, "ip_ire_redirect_interval" }, 851 { 1, 255, 255, "ip_def_ttl" }, 852 { 0, 1, 0, "ip_forward_src_routed"}, 853 { 0, 256, 32, "ip_wroff_extra" }, 854 { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" }, 855 { 8, 65536, 64, "ip_icmp_return_data_bytes" }, 856 { 0, 1, 1, "ip_path_mtu_discovery" }, 857 { 0, 240, 30, "ip_ignore_delete_time" }, 858 { 0, 1, 0, "ip_ignore_redirect" }, 859 { 0, 1, 1, "ip_output_queue" }, 860 { 1, 254, 1, "ip_broadcast_ttl" }, 861 { 0, 99999, 100, "ip_icmp_err_interval" }, 862 { 1, 99999, 10, "ip_icmp_err_burst" }, 863 { 0, 999999999, 1000000, "ip_reass_queue_bytes" }, 864 { 0, 1, 0, "ip_strict_dst_multihoming" }, 865 { 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"}, 866 { 0, 1, 0, "ipsec_override_persocket_policy" }, 867 { 0, 1, 1, "icmp_accept_clear_messages" }, 868 { 0, 1, 1, "igmp_accept_clear_messages" }, 869 { 2, 999999999, ND_DELAY_FIRST_PROBE_TIME, 870 "ip_ndp_delay_first_probe_time"}, 871 { 1, 999999999, ND_MAX_UNICAST_SOLICIT, 872 "ip_ndp_max_unicast_solicit"}, 873 { 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" }, 874 { 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" }, 875 { 0, 1, 0, "ip6_forward_src_routed"}, 876 { 0, 1, 1, "ip6_respond_to_echo_multicast"}, 877 { 0, 1, 1, "ip6_send_redirects"}, 878 { 0, 1, 0, "ip6_ignore_redirect" }, 879 { 0, 1, 0, "ip6_strict_dst_multihoming" }, 880 881 { 1, 8, 3, "ip_ire_reclaim_fraction" }, 882 883 { 0, 999999, 
1000, "ipsec_policy_log_interval" }, 884 885 { 0, 1, 1, "pim_accept_clear_messages" }, 886 { 1000, 20000, 2000, "ip_ndp_unsolicit_interval" }, 887 { 1, 20, 3, "ip_ndp_unsolicit_count" }, 888 { 0, 1, 1, "ip6_ignore_home_address_opt" }, 889 { 0, 15, 0, "ip_policy_mask" }, 890 { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, 891 { 0, 255, 1, "ip_multirt_ttl" }, 892 { 0, 1, 1, "ip_multidata_outbound" }, 893 { 0, 3600000, 300000, "ip_ndp_defense_interval" }, 894 { 0, 999999, 60*60*24, "ip_max_temp_idle" }, 895 { 0, 1000, 1, "ip_max_temp_defend" }, 896 { 0, 1000, 3, "ip_max_defend" }, 897 { 0, 999999, 30, "ip_defend_interval" }, 898 { 0, 3600000, 300000, "ip_dup_recovery" }, 899 { 0, 1, 1, "ip_restrict_interzone_loopback" }, 900 { 0, 1, 1, "ip_lso_outbound" }, 901 { IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" }, 902 { MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" }, 903 #ifdef DEBUG 904 { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, 905 #else 906 { 0, 0, 0, "" }, 907 #endif 908 }; 909 910 /* 911 * Extended NDP table 912 * The addresses for the first two are filled in to be ips_ip_g_forward 913 * and ips_ipv6_forward at init time. 
914 */ 915 static ipndp_t lcl_ndp_arr[] = { 916 /* getf setf data name */ 917 #define IPNDP_IP_FORWARDING_OFFSET 0 918 { ip_param_generic_get, ip_forward_set, NULL, 919 "ip_forwarding" }, 920 #define IPNDP_IP6_FORWARDING_OFFSET 1 921 { ip_param_generic_get, ip_forward_set, NULL, 922 "ip6_forwarding" }, 923 { ip_ill_report, NULL, NULL, 924 "ip_ill_status" }, 925 { ip_ipif_report, NULL, NULL, 926 "ip_ipif_status" }, 927 { ip_conn_report, NULL, NULL, 928 "ip_conn_status" }, 929 { nd_get_long, nd_set_long, (caddr_t)&ip_rput_pullups, 930 "ip_rput_pullups" }, 931 { ip_srcid_report, NULL, NULL, 932 "ip_srcid_status" }, 933 { ip_param_generic_get, ip_squeue_profile_set, 934 (caddr_t)&ip_squeue_profile, "ip_squeue_profile" }, 935 { ip_param_generic_get, ip_squeue_bind_set, 936 (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, 937 { ip_param_generic_get, ip_input_proc_set, 938 (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, 939 { ip_param_generic_get, ip_int_set, 940 (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, 941 #define IPNDP_CGTP_FILTER_OFFSET 11 942 { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, 943 "ip_cgtp_filter" }, 944 { ip_param_generic_get, ip_int_set, 945 (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }, 946 #define IPNDP_IPMP_HOOK_OFFSET 13 947 { ip_param_generic_get, ipmp_hook_emulation_set, NULL, 948 "ipmp_hook_emulation" }, 949 { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, 950 "ip_debug" }, 951 }; 952 953 /* 954 * Table of IP ioctls encoding the various properties of the ioctl and 955 * indexed based on the last byte of the ioctl command. Occasionally there 956 * is a clash, and there is more than 1 ioctl with the same last byte. 957 * In such a case 1 ioctl is encoded in the ndx table and the remaining 958 * ioctls are encoded in the misc table. An entry in the ndx table is 959 * retrieved by indexing on the last byte of the ioctl command and comparing 960 * the ioctl command with the value in the ndx table. 
In the event of a 961 * mismatch the misc table is then searched sequentially for the desired 962 * ioctl command. 963 * 964 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func> 965 */ 966 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { 967 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 968 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 969 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 970 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 971 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 972 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 973 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 974 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 975 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 976 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 977 978 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV, 979 MISC_CMD, ip_siocaddrt, NULL }, 980 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV, 981 MISC_CMD, ip_siocdelrt, NULL }, 982 983 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 984 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 985 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 986 IF_CMD, ip_sioctl_get_addr, NULL }, 987 988 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 989 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 990 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), 991 IPI_GET_CMD | IPI_REPL, 992 IF_CMD, ip_sioctl_get_dstaddr, NULL }, 993 994 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), 995 IPI_PRIV | IPI_WR | IPI_REPL, 996 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 997 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), 998 IPI_MODOK | IPI_GET_CMD | IPI_REPL, 999 IF_CMD, ip_sioctl_get_flags, NULL }, 1000 1001 /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1002 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1003 1004 /* copyin size cannot be coded for SIOCGIFCONF */ 1005 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD, 1006 
MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1007 1008 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1009 IF_CMD, ip_sioctl_mtu, NULL }, 1010 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1011 IF_CMD, ip_sioctl_get_mtu, NULL }, 1012 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), 1013 IPI_GET_CMD | IPI_REPL, 1014 IF_CMD, ip_sioctl_get_brdaddr, NULL }, 1015 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1016 IF_CMD, ip_sioctl_brdaddr, NULL }, 1017 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), 1018 IPI_GET_CMD | IPI_REPL, 1019 IF_CMD, ip_sioctl_get_netmask, NULL }, 1020 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1021 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1022 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), 1023 IPI_GET_CMD | IPI_REPL, 1024 IF_CMD, ip_sioctl_get_metric, NULL }, 1025 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, 1026 IF_CMD, ip_sioctl_metric, NULL }, 1027 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1028 1029 /* See 166-168 below for extended SIOC*XARP ioctls */ 1030 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, 1031 ARP_CMD, ip_sioctl_arp, NULL }, 1032 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, 1033 ARP_CMD, ip_sioctl_arp, NULL }, 1034 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, 1035 ARP_CMD, ip_sioctl_arp, NULL }, 1036 1037 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1038 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1039 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1040 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1041 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1042 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1043 /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1044 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1045 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1046 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1047 /* 043 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1048 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1049 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1050 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1051 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1052 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1053 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1054 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1055 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1056 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1057 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1058 1059 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK, 1060 MISC_CMD, if_unitsel, if_unitsel_restart }, 1061 1062 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1063 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1064 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1065 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1066 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1067 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1068 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1069 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1070 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1071 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1072 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1073 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1074 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1075 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1076 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1077 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1078 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1079 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1080 1081 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq), 1082 IPI_PRIV | IPI_WR | IPI_MODOK, 1083 IF_CMD, ip_sioctl_sifname, NULL }, 1084 1085 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1086 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1087 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1088 /* 077 */ { 
IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1089 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1090 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1091 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1092 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1093 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1094 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1095 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1096 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1097 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1098 1099 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, 1100 MISC_CMD, ip_sioctl_get_ifnum, NULL }, 1101 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1102 IF_CMD, ip_sioctl_get_muxid, NULL }, 1103 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), 1104 IPI_PRIV | IPI_WR | IPI_REPL, 1105 IF_CMD, ip_sioctl_muxid, NULL }, 1106 1107 /* Both if and lif variants share same func */ 1108 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1109 IF_CMD, ip_sioctl_get_lifindex, NULL }, 1110 /* Both if and lif variants share same func */ 1111 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), 1112 IPI_PRIV | IPI_WR | IPI_REPL, 1113 IF_CMD, ip_sioctl_slifindex, NULL }, 1114 1115 /* copyin size cannot be coded for SIOCGIFCONF */ 1116 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, 1117 MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1118 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1119 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1120 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1121 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1122 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1123 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1124 /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1125 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1126 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1127 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1128 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1129 
/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1130 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1131 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1132 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1133 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1134 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1135 1136 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), 1137 IPI_PRIV | IPI_WR | IPI_REPL, 1138 LIF_CMD, ip_sioctl_removeif, 1139 ip_sioctl_removeif_restart }, 1140 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), 1141 IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, 1142 LIF_CMD, ip_sioctl_addif, NULL }, 1143 #define SIOCLIFADDR_NDX 112 1144 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1145 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 1146 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), 1147 IPI_GET_CMD | IPI_REPL, 1148 LIF_CMD, ip_sioctl_get_addr, NULL }, 1149 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1150 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 1151 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), 1152 IPI_GET_CMD | IPI_REPL, 1153 LIF_CMD, ip_sioctl_get_dstaddr, NULL }, 1154 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), 1155 IPI_PRIV | IPI_WR | IPI_REPL, 1156 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 1157 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), 1158 IPI_GET_CMD | IPI_MODOK | IPI_REPL, 1159 LIF_CMD, ip_sioctl_get_flags, NULL }, 1160 1161 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1162 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1163 1164 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1165 ip_sioctl_get_lifconf, NULL }, 1166 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1167 LIF_CMD, ip_sioctl_mtu, NULL }, 1168 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, 1169 LIF_CMD, ip_sioctl_get_mtu, NULL }, 1170 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), 1171 IPI_GET_CMD | 
IPI_REPL, 1172 LIF_CMD, ip_sioctl_get_brdaddr, NULL }, 1173 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1174 LIF_CMD, ip_sioctl_brdaddr, NULL }, 1175 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), 1176 IPI_GET_CMD | IPI_REPL, 1177 LIF_CMD, ip_sioctl_get_netmask, NULL }, 1178 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1179 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1180 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), 1181 IPI_GET_CMD | IPI_REPL, 1182 LIF_CMD, ip_sioctl_get_metric, NULL }, 1183 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1184 LIF_CMD, ip_sioctl_metric, NULL }, 1185 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), 1186 IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, 1187 LIF_CMD, ip_sioctl_slifname, 1188 ip_sioctl_slifname_restart }, 1189 1190 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, 1191 MISC_CMD, ip_sioctl_get_lifnum, NULL }, 1192 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), 1193 IPI_GET_CMD | IPI_REPL, 1194 LIF_CMD, ip_sioctl_get_muxid, NULL }, 1195 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), 1196 IPI_PRIV | IPI_WR | IPI_REPL, 1197 LIF_CMD, ip_sioctl_muxid, NULL }, 1198 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), 1199 IPI_GET_CMD | IPI_REPL, 1200 LIF_CMD, ip_sioctl_get_lifindex, 0 }, 1201 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), 1202 IPI_PRIV | IPI_WR | IPI_REPL, 1203 LIF_CMD, ip_sioctl_slifindex, 0 }, 1204 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1205 LIF_CMD, ip_sioctl_token, NULL }, 1206 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), 1207 IPI_GET_CMD | IPI_REPL, 1208 LIF_CMD, ip_sioctl_get_token, NULL }, 1209 /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1210 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, 1211 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), 1212 IPI_GET_CMD | IPI_REPL, 1213 LIF_CMD, 
ip_sioctl_get_subnet, NULL }, 1214 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1215 LIF_CMD, ip_sioctl_lnkinfo, NULL }, 1216 1217 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), 1218 IPI_GET_CMD | IPI_REPL, 1219 LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, 1220 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, 1221 LIF_CMD, ip_siocdelndp_v6, NULL }, 1222 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, 1223 LIF_CMD, ip_siocqueryndp_v6, NULL }, 1224 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV, 1225 LIF_CMD, ip_siocsetndp_v6, NULL }, 1226 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1227 MISC_CMD, ip_sioctl_tmyaddr, NULL }, 1228 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1229 MISC_CMD, ip_sioctl_tonlink, NULL }, 1230 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, 1231 MISC_CMD, ip_sioctl_tmysite, NULL }, 1232 /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, 1233 TUN_CMD, ip_sioctl_tunparam, NULL }, 1234 /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), 1235 IPI_PRIV | IPI_WR, 1236 TUN_CMD, ip_sioctl_tunparam, NULL }, 1237 1238 /* IPSECioctls handled in ip_sioctl_copyin_setup itself */ 1239 /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1240 /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1241 /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1242 /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1243 1244 /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), 1245 IPI_PRIV | IPI_WR | IPI_REPL, 1246 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1247 /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), 1248 IPI_PRIV | IPI_WR | IPI_REPL, 1249 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1250 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), 1251 IPI_PRIV | IPI_WR | IPI_REPL, 1252 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 1253 /* 156 */ { 
SIOCGLIFGROUPNAME, sizeof (struct lifreq), 1254 IPI_GET_CMD | IPI_REPL, 1255 LIF_CMD, ip_sioctl_get_groupname, NULL }, 1256 /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), 1257 IPI_GET_CMD | IPI_REPL, 1258 LIF_CMD, ip_sioctl_get_oindex, NULL }, 1259 1260 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ 1261 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1262 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1263 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1264 1265 /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1266 LIF_CMD, ip_sioctl_slifoindex, NULL }, 1267 1268 /* These are handled in ip_sioctl_copyin_setup itself */ 1269 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, 1270 MISC_CMD, NULL, NULL }, 1271 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, 1272 MISC_CMD, NULL, NULL }, 1273 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, 1274 1275 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1276 ip_sioctl_get_lifconf, NULL }, 1277 1278 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, 1279 XARP_CMD, ip_sioctl_arp, NULL }, 1280 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, 1281 XARP_CMD, ip_sioctl_arp, NULL }, 1282 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, 1283 XARP_CMD, ip_sioctl_arp, NULL }, 1284 1285 /* SIOCPOPSOCKFS is not handled by IP */ 1286 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, 1287 1288 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), 1289 IPI_GET_CMD | IPI_REPL, 1290 LIF_CMD, ip_sioctl_get_lifzone, NULL }, 1291 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), 1292 IPI_PRIV | IPI_WR | IPI_REPL, 1293 LIF_CMD, ip_sioctl_slifzone, 1294 ip_sioctl_slifzone_restart }, 1295 /* 172-174 are SCTP ioctls and not handled by IP */ 1296 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1297 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1298 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1299 /* 175 */ { 
SIOCGLIFUSESRC, sizeof (struct lifreq), 1300 IPI_GET_CMD, LIF_CMD, 1301 ip_sioctl_get_lifusesrc, 0 }, 1302 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), 1303 IPI_PRIV | IPI_WR, 1304 LIF_CMD, ip_sioctl_slifusesrc, 1305 NULL }, 1306 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, 1307 ip_sioctl_get_lifsrcof, NULL }, 1308 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, 1309 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1310 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, 1311 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1312 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, 1313 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1314 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, 1315 MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1316 /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, 1317 ip_sioctl_set_ipmpfailback, NULL }, 1318 /* SIOCSENABLESDP is handled by SDP */ 1319 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, 1320 }; 1321 1322 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1323 1324 ip_ioctl_cmd_t ip_misc_ioctl_table[] = { 1325 { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), 1326 IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, 1327 { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, 1328 TUN_CMD, ip_sioctl_tunparam, NULL }, 1329 { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1330 { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1331 { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1332 { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1333 { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, 1334 { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1335 { IP_IOCTL, 0, 0, 0, NULL, NULL }, 1336 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD, 1337 MISC_CMD, mrt_ioctl}, 1338 { SIOCGETSGCNT, sizeof (struct 
sioc_sg_req), IPI_REPL | IPI_GET_CMD, 1339 MISC_CMD, mrt_ioctl}, 1340 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, 1341 MISC_CMD, mrt_ioctl} 1342 }; 1343 1344 int ip_misc_ioctl_count = 1345 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1346 1347 int conn_drain_nthreads; /* Number of drainers reqd. */ 1348 /* Settable in /etc/system */ 1349 /* Defined in ip_ire.c */ 1350 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1351 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1352 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1353 1354 static nv_t ire_nv_arr[] = { 1355 { IRE_BROADCAST, "BROADCAST" }, 1356 { IRE_LOCAL, "LOCAL" }, 1357 { IRE_LOOPBACK, "LOOPBACK" }, 1358 { IRE_CACHE, "CACHE" }, 1359 { IRE_DEFAULT, "DEFAULT" }, 1360 { IRE_PREFIX, "PREFIX" }, 1361 { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1362 { IRE_IF_RESOLVER, "IF_RESOLV" }, 1363 { IRE_HOST, "HOST" }, 1364 { 0 } 1365 }; 1366 1367 nv_t *ire_nv_tbl = ire_nv_arr; 1368 1369 /* Simple ICMP IP Header Template */ 1370 static ipha_t icmp_ipha = { 1371 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1372 }; 1373 1374 struct module_info ip_mod_info = { 1375 IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 1376 }; 1377 1378 /* 1379 * Duplicate static symbols within a module confuses mdb; so we avoid the 1380 * problem by making the symbols here distinct from those in udp.c. 1381 */ 1382 1383 /* 1384 * Entry points for IP as a device and as a module. 1385 * FIXME: down the road we might want a separate module and driver qinit. 1386 * We have separate open functions for the /dev/ip and /dev/ip6 devices. 
1387 */ 1388 static struct qinit iprinitv4 = { 1389 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL, 1390 &ip_mod_info 1391 }; 1392 1393 struct qinit iprinitv6 = { 1394 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL, 1395 &ip_mod_info 1396 }; 1397 1398 static struct qinit ipwinitv4 = { 1399 (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1400 &ip_mod_info 1401 }; 1402 1403 struct qinit ipwinitv6 = { 1404 (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1405 &ip_mod_info 1406 }; 1407 1408 static struct qinit iplrinit = { 1409 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL, 1410 &ip_mod_info 1411 }; 1412 1413 static struct qinit iplwinit = { 1414 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL, 1415 &ip_mod_info 1416 }; 1417 1418 /* For AF_INET aka /dev/ip */ 1419 struct streamtab ipinfov4 = { 1420 &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit 1421 }; 1422 1423 /* For AF_INET6 aka /dev/ip6 */ 1424 struct streamtab ipinfov6 = { 1425 &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit 1426 }; 1427 1428 #ifdef DEBUG 1429 static boolean_t skip_sctp_cksum = B_FALSE; 1430 #endif 1431 1432 /* 1433 * Prepend the zoneid using an ipsec_out_t for later use by functions like 1434 * ip_rput_v6(), ip_output(), etc. If the message 1435 * block already has a M_CTL at the front of it, then simply set the zoneid 1436 * appropriately. 
1437 */ 1438 mblk_t * 1439 ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1440 { 1441 mblk_t *first_mp; 1442 ipsec_out_t *io; 1443 1444 ASSERT(zoneid != ALL_ZONES); 1445 if (mp->b_datap->db_type == M_CTL) { 1446 io = (ipsec_out_t *)mp->b_rptr; 1447 ASSERT(io->ipsec_out_type == IPSEC_OUT); 1448 io->ipsec_out_zoneid = zoneid; 1449 return (mp); 1450 } 1451 1452 first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack); 1453 if (first_mp == NULL) 1454 return (NULL); 1455 io = (ipsec_out_t *)first_mp->b_rptr; 1456 /* This is not a secure packet */ 1457 io->ipsec_out_secure = B_FALSE; 1458 io->ipsec_out_zoneid = zoneid; 1459 first_mp->b_cont = mp; 1460 return (first_mp); 1461 } 1462 1463 /* 1464 * Copy an M_CTL-tagged message, preserving reference counts appropriately. 1465 */ 1466 mblk_t * 1467 ip_copymsg(mblk_t *mp) 1468 { 1469 mblk_t *nmp; 1470 ipsec_info_t *in; 1471 1472 if (mp->b_datap->db_type != M_CTL) 1473 return (copymsg(mp)); 1474 1475 in = (ipsec_info_t *)mp->b_rptr; 1476 1477 /* 1478 * Note that M_CTL is also used for delivering ICMP error messages 1479 * upstream to transport layers. 1480 */ 1481 if (in->ipsec_info_type != IPSEC_OUT && 1482 in->ipsec_info_type != IPSEC_IN) 1483 return (copymsg(mp)); 1484 1485 nmp = copymsg(mp->b_cont); 1486 1487 if (in->ipsec_info_type == IPSEC_OUT) { 1488 return (ipsec_out_tag(mp, nmp, 1489 ((ipsec_out_t *)in)->ipsec_out_ns)); 1490 } else { 1491 return (ipsec_in_tag(mp, nmp, 1492 ((ipsec_in_t *)in)->ipsec_in_ns)); 1493 } 1494 } 1495 1496 /* Generate an ICMP fragmentation needed message. 
*/
/*
 * Fills in a destination-unreachable header with the
 * fragmentation-needed code and the supplied mtu (truncated to 16 bits,
 * network byte order), bumps the matching ICMP MIB counters and hands the
 * whole message, including any leading M_CTL, to icmp_pkt().
 */
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	mblk_t		*head_mp;
	boolean_t	has_mctl;
	icmph_t		du_hdr;

	/* Presumably separates a leading M_CTL from the packet proper. */
	EXTRACT_PKT_MP(mp, head_mp, has_mctl);

	mp = icmp_pkt_err_ok(mp, ipst);
	if (mp == NULL) {
		/*
		 * No error is to be sent.  Only the M_CTL, when there was
		 * one, is released here; NOTE(review): this relies on
		 * icmp_pkt_err_ok() having dealt with the data - confirm.
		 */
		if (has_mctl)
			freeb(head_mp);
		return;
	}

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);

	bzero(&du_hdr, sizeof (du_hdr));
	du_hdr.icmph_type = ICMP_DEST_UNREACHABLE;
	du_hdr.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	du_hdr.icmph_du_mtu = htons((uint16_t)mtu);

	icmp_pkt(q, head_mp, &du_hdr, sizeof (du_hdr), has_mctl, zoneid,
	    ipst);
}

/*
 * icmp_inbound deals with ICMP messages in the following ways.
 *
 * 1) It needs to send a reply back and possibly delivering it
 *    to the "interested" upper clients.
 * 2) It needs to send it to the upper clients only.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers e.g. TCP.
 *
 * We need to accommodate icmp messages coming in clear until we get
 * everything secure from the wire. If icmp_accept_clear_messages
 * is zero we check with the global policy and act accordingly. If
 * it is non-zero, we accept the message without any checks. But
 * *this does not mean* that this will be delivered to the upper
 * clients. By accepting we might send replies back, change our MTU
 * value etc. but delivery to the ULP/clients depends on their policy
 * dispositions.
 *
 * We handle the above 4 cases in the context of IPsec in the
 * following way :
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent chosen
 *    plain text attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_clear_messages is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *    There are two cases.
 *
 *    a) If we reject data at the IP layer (ipsec_check_global_policy()
 *       failed), we will not deliver it to the ULP, even though they
 *       are *willing* to accept in *clear*. This is fine as our global
 *       disposition to icmp messages asks us to reject the datagram.
 *
 *    b) If we accept data at the IP layer (ipsec_check_global_policy()
 *       succeeded or icmp_accept_clear_messages is non-zero), and are
 *       not able to deliver it to the ULP (policy failed), it can lead
 *       to consistency problems. The cases known at this time are
 *       ICMP_DEST_UNREACHABLE messages with the following code
 *       values :
 *
 *       - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *         and Upper layer rejects. Then the communication will
 *         come to a stop. This is solved by making similar decisions
 *         at both levels. Currently, when we are unable to deliver
 *         to the Upper Layer (due to policy failures) while IP has
 *         adjusted ire_max_frag, the next outbound datagram would
 *         generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *         will be with the right level of protection. Thus the right
 *         value will be communicated even if we are not able to
 *         communicate when we get from the wire initially. But this
 *         assumes there would be at least one outbound datagram after
 *         IP has adjusted its ire_max_frag value. To make things
 *         simpler, we accept in clear after the validation of
 *         AH/ESP headers.
 *
 *       - Other ICMP ERRORS : We may not be able to deliver it to the
 *         upper layer depending on the level of protection the upper
 *         layer expects and the disposition in
 *         ipsec_inbound_accept_clear(). ipsec_inbound_accept_clear()
 *         decides whether a given ICMP error should be accepted in
 *         clear when the Upper layer expects secure. Thus the
 *         communication may get aborted by some bad ICMP packets.
 *
 * IPQoS Notes:
 * The only instance when a packet is sent for processing is when there
 * isn't an ICMP client and if we are interested in it.
 * If there is a client, IPPF processing will take place in the
 * ip_fanout_proto routine.
 *
 * Zones notes:
 * The packet is only processed in the context of the specified zone: typically
 * only this zone will reply to an echo request, and only interested clients in
 * this zone will receive a copy of the packet. This means that the caller must
 * call icmp_inbound() for each relevant zone.
1612 */ 1613 static void 1614 icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, 1615 int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, 1616 ill_t *recv_ill, zoneid_t zoneid) 1617 { 1618 icmph_t *icmph; 1619 ipha_t *ipha; 1620 int iph_hdr_length; 1621 int hdr_length; 1622 boolean_t interested; 1623 uint32_t ts; 1624 uchar_t *wptr; 1625 ipif_t *ipif; 1626 mblk_t *first_mp; 1627 ipsec_in_t *ii; 1628 ire_t *src_ire; 1629 boolean_t onlink; 1630 timestruc_t now; 1631 uint32_t ill_index; 1632 ip_stack_t *ipst; 1633 1634 ASSERT(ill != NULL); 1635 ipst = ill->ill_ipst; 1636 1637 first_mp = mp; 1638 if (mctl_present) { 1639 mp = first_mp->b_cont; 1640 ASSERT(mp != NULL); 1641 } 1642 1643 ipha = (ipha_t *)mp->b_rptr; 1644 if (ipst->ips_icmp_accept_clear_messages == 0) { 1645 first_mp = ipsec_check_global_policy(first_mp, NULL, 1646 ipha, NULL, mctl_present, ipst->ips_netstack); 1647 if (first_mp == NULL) 1648 return; 1649 } 1650 1651 /* 1652 * On a labeled system, we have to check whether the zone itself is 1653 * permitted to receive raw traffic. 1654 */ 1655 if (is_system_labeled()) { 1656 if (zoneid == ALL_ZONES) 1657 zoneid = tsol_packet_to_zoneid(mp); 1658 if (!tsol_can_accept_raw(mp, B_FALSE)) { 1659 ip1dbg(("icmp_inbound: zone %d can't receive raw", 1660 zoneid)); 1661 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1662 freemsg(first_mp); 1663 return; 1664 } 1665 } 1666 1667 /* 1668 * We have accepted the ICMP message. It means that we will 1669 * respond to the packet if needed. It may not be delivered 1670 * to the upper client depending on the policy constraints 1671 * and the disposition in ipsec_inbound_accept_clear. 1672 */ 1673 1674 ASSERT(ill != NULL); 1675 1676 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); 1677 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1678 if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { 1679 /* Last chance to get real. 
*/ 1680 if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { 1681 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1682 freemsg(first_mp); 1683 return; 1684 } 1685 /* Refresh iph following the pullup. */ 1686 ipha = (ipha_t *)mp->b_rptr; 1687 } 1688 /* ICMP header checksum, including checksum field, should be zero. */ 1689 if (sum_valid ? (sum != 0 && sum != 0xFFFF) : 1690 IP_CSUM(mp, iph_hdr_length, 0)) { 1691 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 1692 freemsg(first_mp); 1693 return; 1694 } 1695 /* The IP header will always be a multiple of four bytes */ 1696 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1697 ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, 1698 icmph->icmph_code)); 1699 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1700 /* We will set "interested" to "true" if we want a copy */ 1701 interested = B_FALSE; 1702 switch (icmph->icmph_type) { 1703 case ICMP_ECHO_REPLY: 1704 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps); 1705 break; 1706 case ICMP_DEST_UNREACHABLE: 1707 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) 1708 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded); 1709 interested = B_TRUE; /* Pass up to transport */ 1710 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs); 1711 break; 1712 case ICMP_SOURCE_QUENCH: 1713 interested = B_TRUE; /* Pass up to transport */ 1714 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs); 1715 break; 1716 case ICMP_REDIRECT: 1717 if (!ipst->ips_ip_ignore_redirect) 1718 interested = B_TRUE; 1719 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects); 1720 break; 1721 case ICMP_ECHO_REQUEST: 1722 /* 1723 * Whether to respond to echo requests that come in as IP 1724 * broadcasts or as IP multicast is subject to debate 1725 * (what isn't?). We aim to please, you pick it. 1726 * Default is do it. 
1727 */ 1728 if (!broadcast && !CLASSD(ipha->ipha_dst)) { 1729 /* unicast: always respond */ 1730 interested = B_TRUE; 1731 } else if (CLASSD(ipha->ipha_dst)) { 1732 /* multicast: respond based on tunable */ 1733 interested = ipst->ips_ip_g_resp_to_echo_mcast; 1734 } else if (broadcast) { 1735 /* broadcast: respond based on tunable */ 1736 interested = ipst->ips_ip_g_resp_to_echo_bcast; 1737 } 1738 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); 1739 break; 1740 case ICMP_ROUTER_ADVERTISEMENT: 1741 case ICMP_ROUTER_SOLICITATION: 1742 break; 1743 case ICMP_TIME_EXCEEDED: 1744 interested = B_TRUE; /* Pass up to transport */ 1745 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds); 1746 break; 1747 case ICMP_PARAM_PROBLEM: 1748 interested = B_TRUE; /* Pass up to transport */ 1749 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs); 1750 break; 1751 case ICMP_TIME_STAMP_REQUEST: 1752 /* Response to Time Stamp Requests is local policy. */ 1753 if (ipst->ips_ip_g_resp_to_timestamp && 1754 /* So is whether to respond if it was an IP broadcast. */ 1755 (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { 1756 int tstamp_len = 3 * sizeof (uint32_t); 1757 1758 if (wptr + tstamp_len > mp->b_wptr) { 1759 if (!pullupmsg(mp, wptr + tstamp_len - 1760 mp->b_rptr)) { 1761 BUMP_MIB(ill->ill_ip_mib, 1762 ipIfStatsInDiscards); 1763 freemsg(first_mp); 1764 return; 1765 } 1766 /* Refresh ipha following the pullup. */ 1767 ipha = (ipha_t *)mp->b_rptr; 1768 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1769 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1770 } 1771 interested = B_TRUE; 1772 } 1773 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); 1774 break; 1775 case ICMP_TIME_STAMP_REPLY: 1776 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); 1777 break; 1778 case ICMP_INFO_REQUEST: 1779 /* Per RFC 1122 3.2.2.7, ignore this. 
*/ 1780 case ICMP_INFO_REPLY: 1781 break; 1782 case ICMP_ADDRESS_MASK_REQUEST: 1783 if ((ipst->ips_ip_respond_to_address_mask_broadcast || 1784 !broadcast) && 1785 /* TODO m_pullup of complete header? */ 1786 (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) { 1787 interested = B_TRUE; 1788 } 1789 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); 1790 break; 1791 case ICMP_ADDRESS_MASK_REPLY: 1792 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); 1793 break; 1794 default: 1795 interested = B_TRUE; /* Pass up to transport */ 1796 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); 1797 break; 1798 } 1799 /* See if there is an ICMP client. */ 1800 if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) { 1801 /* If there is an ICMP client and we want one too, copy it. */ 1802 mblk_t *first_mp1; 1803 1804 if (!interested) { 1805 ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, 1806 ip_policy, recv_ill, zoneid); 1807 return; 1808 } 1809 first_mp1 = ip_copymsg(first_mp); 1810 if (first_mp1 != NULL) { 1811 ip_fanout_proto(q, first_mp1, ill, ipha, 1812 0, mctl_present, ip_policy, recv_ill, zoneid); 1813 } 1814 } else if (!interested) { 1815 freemsg(first_mp); 1816 return; 1817 } else { 1818 /* 1819 * Initiate policy processing for this packet if ip_policy 1820 * is true. 1821 */ 1822 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 1823 ill_index = ill->ill_phyint->phyint_ifindex; 1824 ip_process(IPP_LOCAL_IN, &mp, ill_index); 1825 if (mp == NULL) { 1826 if (mctl_present) { 1827 freeb(first_mp); 1828 } 1829 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1830 return; 1831 } 1832 } 1833 } 1834 /* We want to do something with it. */ 1835 /* Check db_ref to make sure we can modify the packet. 
*/ 1836 if (mp->b_datap->db_ref > 1) { 1837 mblk_t *first_mp1; 1838 1839 first_mp1 = ip_copymsg(first_mp); 1840 freemsg(first_mp); 1841 if (!first_mp1) { 1842 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 1843 return; 1844 } 1845 first_mp = first_mp1; 1846 if (mctl_present) { 1847 mp = first_mp->b_cont; 1848 ASSERT(mp != NULL); 1849 } else { 1850 mp = first_mp; 1851 } 1852 ipha = (ipha_t *)mp->b_rptr; 1853 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1854 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1855 } 1856 switch (icmph->icmph_type) { 1857 case ICMP_ADDRESS_MASK_REQUEST: 1858 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1859 if (ipif == NULL) { 1860 freemsg(first_mp); 1861 return; 1862 } 1863 /* 1864 * outging interface must be IPv4 1865 */ 1866 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1867 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 1868 bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); 1869 ipif_refrele(ipif); 1870 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); 1871 break; 1872 case ICMP_ECHO_REQUEST: 1873 icmph->icmph_type = ICMP_ECHO_REPLY; 1874 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); 1875 break; 1876 case ICMP_TIME_STAMP_REQUEST: { 1877 uint32_t *tsp; 1878 1879 icmph->icmph_type = ICMP_TIME_STAMP_REPLY; 1880 tsp = (uint32_t *)wptr; 1881 tsp++; /* Skip past 'originate time' */ 1882 /* Compute # of milliseconds since midnight */ 1883 gethrestime(&now); 1884 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 1885 now.tv_nsec / (NANOSEC / MILLISEC); 1886 *tsp++ = htonl(ts); /* Lay in 'receive time' */ 1887 *tsp++ = htonl(ts); /* Lay in 'send time' */ 1888 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); 1889 break; 1890 } 1891 default: 1892 ipha = (ipha_t *)&icmph[1]; 1893 if ((uchar_t *)&ipha[1] > mp->b_wptr) { 1894 if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { 1895 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1896 freemsg(first_mp); 1897 return; 1898 } 1899 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1900 ipha = (ipha_t 
*)&icmph[1]; 1901 } 1902 if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { 1903 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1904 freemsg(first_mp); 1905 return; 1906 } 1907 hdr_length = IPH_HDR_LENGTH(ipha); 1908 if (hdr_length < sizeof (ipha_t)) { 1909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1910 freemsg(first_mp); 1911 return; 1912 } 1913 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 1914 if (!pullupmsg(mp, 1915 (uchar_t *)ipha + hdr_length - mp->b_rptr)) { 1916 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1917 freemsg(first_mp); 1918 return; 1919 } 1920 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1921 ipha = (ipha_t *)&icmph[1]; 1922 } 1923 switch (icmph->icmph_type) { 1924 case ICMP_REDIRECT: 1925 /* 1926 * As there is no upper client to deliver, we don't 1927 * need the first_mp any more. 1928 */ 1929 if (mctl_present) { 1930 freeb(first_mp); 1931 } 1932 icmp_redirect(ill, mp); 1933 return; 1934 case ICMP_DEST_UNREACHABLE: 1935 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { 1936 if (!icmp_inbound_too_big(icmph, ipha, ill, 1937 zoneid, mp, iph_hdr_length, ipst)) { 1938 freemsg(first_mp); 1939 return; 1940 } 1941 /* 1942 * icmp_inbound_too_big() may alter mp. 1943 * Resynch ipha and icmph accordingly. 1944 */ 1945 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1946 ipha = (ipha_t *)&icmph[1]; 1947 } 1948 /* FALLTHRU */ 1949 default : 1950 /* 1951 * IPQoS notes: Since we have already done IPQoS 1952 * processing we don't want to do it again in 1953 * the fanout routines called by 1954 * icmp_inbound_error_fanout, hence the last 1955 * argument, ip_policy, is B_FALSE. 
1956 */ 1957 icmp_inbound_error_fanout(q, ill, first_mp, icmph, 1958 ipha, iph_hdr_length, hdr_length, mctl_present, 1959 B_FALSE, recv_ill, zoneid); 1960 } 1961 return; 1962 } 1963 /* Send out an ICMP packet */ 1964 icmph->icmph_checksum = 0; 1965 icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); 1966 if (broadcast || CLASSD(ipha->ipha_dst)) { 1967 ipif_t *ipif_chosen; 1968 /* 1969 * Make it look like it was directed to us, so we don't look 1970 * like a fool with a broadcast or multicast source address. 1971 */ 1972 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1973 /* 1974 * Make sure that we haven't grabbed an interface that's DOWN. 1975 */ 1976 if (ipif != NULL) { 1977 ipif_chosen = ipif_select_source(ipif->ipif_ill, 1978 ipha->ipha_src, zoneid); 1979 if (ipif_chosen != NULL) { 1980 ipif_refrele(ipif); 1981 ipif = ipif_chosen; 1982 } 1983 } 1984 if (ipif == NULL) { 1985 ip0dbg(("icmp_inbound: " 1986 "No source for broadcast/multicast:\n" 1987 "\tsrc 0x%x dst 0x%x ill %p " 1988 "ipif_lcl_addr 0x%x\n", 1989 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 1990 (void *)ill, 1991 ill->ill_ipif->ipif_lcl_addr)); 1992 freemsg(first_mp); 1993 return; 1994 } 1995 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1996 ipha->ipha_dst = ipif->ipif_src_addr; 1997 ipif_refrele(ipif); 1998 } 1999 /* Reset time to live. */ 2000 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 2001 { 2002 /* Swap source and destination addresses */ 2003 ipaddr_t tmp; 2004 2005 tmp = ipha->ipha_src; 2006 ipha->ipha_src = ipha->ipha_dst; 2007 ipha->ipha_dst = tmp; 2008 } 2009 ipha->ipha_ident = 0; 2010 if (!IS_SIMPLE_IPH(ipha)) 2011 icmp_options_update(ipha); 2012 2013 /* 2014 * ICMP echo replies should go out on the same interface 2015 * the request came on as probes used by in.mpathd for detecting 2016 * NIC failures are ECHO packets. 
We turn-off load spreading 2017 * by setting ipsec_in_attach_if to B_TRUE, which is copied 2018 * to ipsec_out_attach_if by ipsec_in_to_out called later in this 2019 * function. This is in turn handled by ip_wput and ip_newroute 2020 * to make sure that the packet goes out on the interface it came 2021 * in on. If we don't turnoff load spreading, the packets might get 2022 * dropped if there are no non-FAILED/INACTIVE interfaces for it 2023 * to go out and in.mpathd would wrongly detect a failure or 2024 * mis-detect a NIC failure for link failure. As load spreading 2025 * can happen only if ill_group is not NULL, we do only for 2026 * that case and this does not affect the normal case. 2027 * 2028 * We turn off load spreading only on echo packets that came from 2029 * on-link hosts. If the interface route has been deleted, this will 2030 * not be enforced as we can't do much. For off-link hosts, as the 2031 * default routes in IPv4 does not typically have an ire_ipif 2032 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. 2033 * Moreover, expecting a default route through this interface may 2034 * not be correct. We use ipha_dst because of the swap above. 2035 */ 2036 onlink = B_FALSE; 2037 if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { 2038 /* 2039 * First, we need to make sure that it is not one of our 2040 * local addresses. If we set onlink when it is one of 2041 * our local addresses, we will end up creating IRE_CACHES 2042 * for one of our local addresses. Then, we will never 2043 * accept packets for them afterwards. 
2044 */ 2045 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, 2046 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 2047 if (src_ire == NULL) { 2048 ipif = ipif_get_next_ipif(NULL, ill); 2049 if (ipif == NULL) { 2050 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2051 freemsg(mp); 2052 return; 2053 } 2054 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 2055 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 2056 NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); 2057 ipif_refrele(ipif); 2058 if (src_ire != NULL) { 2059 onlink = B_TRUE; 2060 ire_refrele(src_ire); 2061 } 2062 } else { 2063 ire_refrele(src_ire); 2064 } 2065 } 2066 if (!mctl_present) { 2067 /* 2068 * This packet should go out the same way as it 2069 * came in i.e in clear. To make sure that global 2070 * policy will not be applied to this in ip_wput_ire, 2071 * we attach a IPSEC_IN mp and clear ipsec_in_secure. 2072 */ 2073 ASSERT(first_mp == mp); 2074 first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 2075 if (first_mp == NULL) { 2076 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2077 freemsg(mp); 2078 return; 2079 } 2080 ii = (ipsec_in_t *)first_mp->b_rptr; 2081 2082 /* This is not a secure packet */ 2083 ii->ipsec_in_secure = B_FALSE; 2084 if (onlink) { 2085 ii->ipsec_in_attach_if = B_TRUE; 2086 ii->ipsec_in_ill_index = 2087 ill->ill_phyint->phyint_ifindex; 2088 ii->ipsec_in_rill_index = 2089 recv_ill->ill_phyint->phyint_ifindex; 2090 } 2091 first_mp->b_cont = mp; 2092 } else if (onlink) { 2093 ii = (ipsec_in_t *)first_mp->b_rptr; 2094 ii->ipsec_in_attach_if = B_TRUE; 2095 ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; 2096 ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; 2097 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2098 } else { 2099 ii = (ipsec_in_t *)first_mp->b_rptr; 2100 ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ 2101 } 2102 ii->ipsec_in_zoneid = zoneid; 2103 ASSERT(zoneid != ALL_ZONES); 2104 if (!ipsec_in_to_out(first_mp, ipha, 
NULL)) { 2105 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2106 return; 2107 } 2108 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 2109 put(WR(q), first_mp); 2110 } 2111 2112 static ipaddr_t 2113 icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) 2114 { 2115 conn_t *connp; 2116 connf_t *connfp; 2117 ipaddr_t nexthop_addr = INADDR_ANY; 2118 int hdr_length = IPH_HDR_LENGTH(ipha); 2119 uint16_t *up; 2120 uint32_t ports; 2121 ip_stack_t *ipst = ill->ill_ipst; 2122 2123 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2124 switch (ipha->ipha_protocol) { 2125 case IPPROTO_TCP: 2126 { 2127 tcph_t *tcph; 2128 2129 /* do a reverse lookup */ 2130 tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); 2131 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, 2132 TCPS_LISTEN, ipst); 2133 break; 2134 } 2135 case IPPROTO_UDP: 2136 { 2137 uint32_t dstport, srcport; 2138 2139 ((uint16_t *)&ports)[0] = up[1]; 2140 ((uint16_t *)&ports)[1] = up[0]; 2141 2142 /* Extract ports in net byte order */ 2143 dstport = htons(ntohl(ports) & 0xFFFF); 2144 srcport = htons(ntohl(ports) >> 16); 2145 2146 connfp = &ipst->ips_ipcl_udp_fanout[ 2147 IPCL_UDP_HASH(dstport, ipst)]; 2148 mutex_enter(&connfp->connf_lock); 2149 connp = connfp->connf_head; 2150 2151 /* do a reverse lookup */ 2152 while ((connp != NULL) && 2153 (!IPCL_UDP_MATCH(connp, dstport, 2154 ipha->ipha_src, srcport, ipha->ipha_dst) || 2155 !IPCL_ZONE_MATCH(connp, zoneid))) { 2156 connp = connp->conn_next; 2157 } 2158 if (connp != NULL) 2159 CONN_INC_REF(connp); 2160 mutex_exit(&connfp->connf_lock); 2161 break; 2162 } 2163 case IPPROTO_SCTP: 2164 { 2165 in6_addr_t map_src, map_dst; 2166 2167 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); 2168 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); 2169 ((uint16_t *)&ports)[0] = up[1]; 2170 ((uint16_t *)&ports)[1] = up[0]; 2171 2172 connp = sctp_find_conn(&map_src, &map_dst, ports, 2173 zoneid, ipst->ips_netstack->netstack_sctp); 2174 if (connp == NULL) { 2175 connp = 
ipcl_classify_raw(mp, IPPROTO_SCTP, 2176 zoneid, ports, ipha, ipst); 2177 } else { 2178 CONN_INC_REF(connp); 2179 SCTP_REFRELE(CONN2SCTP(connp)); 2180 } 2181 break; 2182 } 2183 default: 2184 { 2185 ipha_t ripha; 2186 2187 ripha.ipha_src = ipha->ipha_dst; 2188 ripha.ipha_dst = ipha->ipha_src; 2189 ripha.ipha_protocol = ipha->ipha_protocol; 2190 2191 connfp = &ipst->ips_ipcl_proto_fanout[ 2192 ipha->ipha_protocol]; 2193 mutex_enter(&connfp->connf_lock); 2194 connp = connfp->connf_head; 2195 for (connp = connfp->connf_head; connp != NULL; 2196 connp = connp->conn_next) { 2197 if (IPCL_PROTO_MATCH(connp, 2198 ipha->ipha_protocol, &ripha, ill, 2199 0, zoneid)) { 2200 CONN_INC_REF(connp); 2201 break; 2202 } 2203 } 2204 mutex_exit(&connfp->connf_lock); 2205 } 2206 } 2207 if (connp != NULL) { 2208 if (connp->conn_nexthop_set) 2209 nexthop_addr = connp->conn_nexthop_v4; 2210 CONN_DEC_REF(connp); 2211 } 2212 return (nexthop_addr); 2213 } 2214 2215 /* Table from RFC 1191 */ 2216 static int icmp_frag_size_table[] = 2217 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; 2218 2219 /* 2220 * Process received ICMP Packet too big. 2221 * After updating any IRE it does the fanout to any matching transport streams. 2222 * Assumes the message has been pulled up till the IP header that caused 2223 * the error. 2224 * 2225 * Returns B_FALSE on failure and B_TRUE on success. 
2226 */ 2227 static boolean_t 2228 icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, 2229 zoneid_t zoneid, mblk_t *mp, int iph_hdr_length, 2230 ip_stack_t *ipst) 2231 { 2232 ire_t *ire, *first_ire; 2233 int mtu; 2234 int hdr_length; 2235 ipaddr_t nexthop_addr; 2236 2237 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && 2238 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); 2239 ASSERT(ill != NULL); 2240 2241 hdr_length = IPH_HDR_LENGTH(ipha); 2242 2243 /* Drop if the original packet contained a source route */ 2244 if (ip_source_route_included(ipha)) { 2245 return (B_FALSE); 2246 } 2247 /* 2248 * Verify we have atleast ICMP_MIN_TP_HDR_LENGTH bytes of transport 2249 * header. 2250 */ 2251 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2252 mp->b_wptr) { 2253 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2254 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2255 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2256 ip1dbg(("icmp_inbound_too_big: insufficient hdr\n")); 2257 return (B_FALSE); 2258 } 2259 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2260 ipha = (ipha_t *)&icmph[1]; 2261 } 2262 nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp); 2263 if (nexthop_addr != INADDR_ANY) { 2264 /* nexthop set */ 2265 first_ire = ire_ctable_lookup(ipha->ipha_dst, 2266 nexthop_addr, 0, NULL, ALL_ZONES, MBLK_GETLABEL(mp), 2267 MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst); 2268 } else { 2269 /* nexthop not set */ 2270 first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, 2271 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 2272 } 2273 2274 if (!first_ire) { 2275 ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", 2276 ntohl(ipha->ipha_dst))); 2277 return (B_FALSE); 2278 } 2279 /* Check for MTU discovery advice as described in RFC 1191 */ 2280 mtu = ntohs(icmph->icmph_du_mtu); 2281 rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); 2282 for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; 2283 ire = ire->ire_next) { 2284 /* 
2285 * Look for the connection to which this ICMP message is 2286 * directed. If it has the IP_NEXTHOP option set, then the 2287 * search is limited to IREs with the MATCH_IRE_PRIVATE 2288 * option. Else the search is limited to regular IREs. 2289 */ 2290 if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2291 (nexthop_addr != ire->ire_gateway_addr)) || 2292 (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && 2293 (nexthop_addr != INADDR_ANY))) 2294 continue; 2295 2296 mutex_enter(&ire->ire_lock); 2297 if (icmph->icmph_du_zero == 0 && mtu > 68) { 2298 /* Reduce the IRE max frag value as advised. */ 2299 ip1dbg(("Received mtu from router: %d (was %d)\n", 2300 mtu, ire->ire_max_frag)); 2301 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2302 } else { 2303 uint32_t length; 2304 int i; 2305 2306 /* 2307 * Use the table from RFC 1191 to figure out 2308 * the next "plateau" based on the length in 2309 * the original IP packet. 2310 */ 2311 length = ntohs(ipha->ipha_length); 2312 if (ire->ire_max_frag <= length && 2313 ire->ire_max_frag >= length - hdr_length) { 2314 /* 2315 * Handle broken BSD 4.2 systems that 2316 * return the wrong iph_length in ICMP 2317 * errors. 2318 */ 2319 ip1dbg(("Wrong mtu: sent %d, ire %d\n", 2320 length, ire->ire_max_frag)); 2321 length -= hdr_length; 2322 } 2323 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { 2324 if (length > icmp_frag_size_table[i]) 2325 break; 2326 } 2327 if (i == A_CNT(icmp_frag_size_table)) { 2328 /* Smaller than 68! */ 2329 ip1dbg(("Too big for packet size %d\n", 2330 length)); 2331 ire->ire_max_frag = MIN(ire->ire_max_frag, 576); 2332 ire->ire_frag_flag = 0; 2333 } else { 2334 mtu = icmp_frag_size_table[i]; 2335 ip1dbg(("Calculated mtu %d, packet size %d, " 2336 "before %d", mtu, length, 2337 ire->ire_max_frag)); 2338 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2339 ip1dbg((", after %d\n", ire->ire_max_frag)); 2340 } 2341 /* Record the new max frag size for the ULP. 
*/ 2342 icmph->icmph_du_zero = 0; 2343 icmph->icmph_du_mtu = 2344 htons((uint16_t)ire->ire_max_frag); 2345 } 2346 mutex_exit(&ire->ire_lock); 2347 } 2348 rw_exit(&first_ire->ire_bucket->irb_lock); 2349 ire_refrele(first_ire); 2350 return (B_TRUE); 2351 } 2352 2353 /* 2354 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout 2355 * calls this function. 2356 */ 2357 static mblk_t * 2358 icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) 2359 { 2360 ipha_t *ipha; 2361 icmph_t *icmph; 2362 ipha_t *in_ipha; 2363 int length; 2364 2365 ASSERT(mp->b_datap->db_type == M_DATA); 2366 2367 /* 2368 * For Self-encapsulated packets, we added an extra IP header 2369 * without the options. Inner IP header is the one from which 2370 * the outer IP header was formed. Thus, we need to remove the 2371 * outer IP header. To do this, we pullup the whole message 2372 * and overlay whatever follows the outer IP header over the 2373 * outer IP header. 2374 */ 2375 2376 if (!pullupmsg(mp, -1)) 2377 return (NULL); 2378 2379 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2380 ipha = (ipha_t *)&icmph[1]; 2381 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2382 2383 /* 2384 * The length that we want to overlay is following the inner 2385 * IP header. Subtracting the IP header + icmp header + outer 2386 * IP header's length should give us the length that we want to 2387 * overlay. 2388 */ 2389 length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - 2390 hdr_length; 2391 /* 2392 * Overlay whatever follows the inner header over the 2393 * outer header. 2394 */ 2395 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); 2396 2397 /* Set the wptr to account for the outer header */ 2398 mp->b_wptr -= hdr_length; 2399 return (mp); 2400 } 2401 2402 /* 2403 * Try to pass the ICMP message upstream in case the ULP cares. 
2404 * 2405 * If the packet that caused the ICMP error is secure, we send 2406 * it to AH/ESP to make sure that the attached packet has a 2407 * valid association. ipha in the code below points to the 2408 * IP header of the packet that caused the error. 2409 * 2410 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently 2411 * in the context of IPsec. Normally we tell the upper layer 2412 * whenever we send the ire (including ip_bind), the IPsec header 2413 * length in ire_ipsec_overhead. TCP can deduce the MSS as it 2414 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead. 2415 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the 2416 * same thing. As TCP has the IPsec options size that needs to be 2417 * adjusted, we just pass the MTU unchanged. 2418 * 2419 * IFN could have been generated locally or by some router. 2420 * 2421 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. 2422 * This happens because IP adjusted its value of MTU on an 2423 * earlier IFN message and could not tell the upper layer, 2424 * the new adjusted value of MTU e.g. Packet was encrypted 2425 * or there was not enough information to fanout to upper 2426 * layers. Thus on the next outbound datagram, ip_wput_ire 2427 * generates the IFN, where IPsec processing has *not* been 2428 * done. 2429 * 2430 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2431 * could have generated this. This happens because ire_max_frag 2432 * value in IP was set to a new value, while the IPsec processing 2433 * was being done and after we made the fragmentation check in 2434 * ip_wput_ire. Thus on return from IPsec processing, 2435 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2436 * and generates the IFN. As IPsec processing is over, we fanout 2437 * to AH/ESP to remove the header. 2438 * 2439 * In both these cases, ipsec_in_loopback will be set indicating 2440 * that IFN was generated locally. 
2441 * 2442 * ROUTER : IFN could be secure or non-secure. 2443 * 2444 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2445 * packet in error has AH/ESP headers to validate the AH/ESP 2446 * headers. AH/ESP will verify whether there is a valid SA or 2447 * not and send it back. We will fanout again if we have more 2448 * data in the packet. 2449 * 2450 * If the packet in error does not have AH/ESP, we handle it 2451 * like any other case. 2452 * 2453 * * NON_SECURE : If the packet in error has AH/ESP headers, 2454 * we attach a dummy ipsec_in and send it up to AH/ESP 2455 * for validation. AH/ESP will verify whether there is a 2456 * valid SA or not and send it back. We will fanout again if 2457 * we have more data in the packet. 2458 * 2459 * If the packet in error does not have AH/ESP, we handle it 2460 * like any other case. 2461 */ 2462 static void 2463 icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, 2464 icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length, 2465 boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, 2466 zoneid_t zoneid) 2467 { 2468 uint16_t *up; /* Pointer to ports in ULP header */ 2469 uint32_t ports; /* reversed ports for fanout */ 2470 ipha_t ripha; /* With reversed addresses */ 2471 mblk_t *first_mp; 2472 ipsec_in_t *ii; 2473 tcph_t *tcph; 2474 conn_t *connp; 2475 ip_stack_t *ipst; 2476 2477 ASSERT(ill != NULL); 2478 2479 ASSERT(recv_ill != NULL); 2480 ipst = recv_ill->ill_ipst; 2481 2482 first_mp = mp; 2483 if (mctl_present) { 2484 mp = first_mp->b_cont; 2485 ASSERT(mp != NULL); 2486 2487 ii = (ipsec_in_t *)first_mp->b_rptr; 2488 ASSERT(ii->ipsec_in_type == IPSEC_IN); 2489 } else { 2490 ii = NULL; 2491 } 2492 2493 switch (ipha->ipha_protocol) { 2494 case IPPROTO_UDP: 2495 /* 2496 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2497 * transport header. 
2498 */ 2499 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2500 mp->b_wptr) { 2501 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2502 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2503 goto discard_pkt; 2504 } 2505 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2506 ipha = (ipha_t *)&icmph[1]; 2507 } 2508 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2509 2510 /* 2511 * Attempt to find a client stream based on port. 2512 * Note that we do a reverse lookup since the header is 2513 * in the form we sent it out. 2514 * The ripha header is only used for the IP_UDP_MATCH and we 2515 * only set the src and dst addresses and protocol. 2516 */ 2517 ripha.ipha_src = ipha->ipha_dst; 2518 ripha.ipha_dst = ipha->ipha_src; 2519 ripha.ipha_protocol = ipha->ipha_protocol; 2520 ((uint16_t *)&ports)[0] = up[1]; 2521 ((uint16_t *)&ports)[1] = up[0]; 2522 ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n", 2523 ntohl(ipha->ipha_src), ntohs(up[0]), 2524 ntohl(ipha->ipha_dst), ntohs(up[1]), 2525 icmph->icmph_type, icmph->icmph_code)); 2526 2527 /* Have to change db_type after any pullupmsg */ 2528 DB_TYPE(mp) = M_CTL; 2529 2530 ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0, 2531 mctl_present, ip_policy, recv_ill, zoneid); 2532 return; 2533 2534 case IPPROTO_TCP: 2535 /* 2536 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2537 * transport header. 2538 */ 2539 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2540 mp->b_wptr) { 2541 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2542 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2543 goto discard_pkt; 2544 } 2545 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2546 ipha = (ipha_t *)&icmph[1]; 2547 } 2548 /* 2549 * Find a TCP client stream for this packet. 2550 * Note that we do a reverse lookup since the header is 2551 * in the form we sent it out. 
2552 */ 2553 tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); 2554 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN, 2555 ipst); 2556 if (connp == NULL) 2557 goto discard_pkt; 2558 2559 /* Have to change db_type after any pullupmsg */ 2560 DB_TYPE(mp) = M_CTL; 2561 squeue_fill(connp->conn_sqp, first_mp, tcp_input, 2562 connp, SQTAG_TCP_INPUT_ICMP_ERR); 2563 return; 2564 2565 case IPPROTO_SCTP: 2566 /* 2567 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 2568 * transport header. 2569 */ 2570 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 2571 mp->b_wptr) { 2572 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 2573 ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { 2574 goto discard_pkt; 2575 } 2576 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2577 ipha = (ipha_t *)&icmph[1]; 2578 } 2579 up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2580 /* 2581 * Find a SCTP client stream for this packet. 2582 * Note that we do a reverse lookup since the header is 2583 * in the form we sent it out. 2584 * The ripha header is only used for the matching and we 2585 * only set the src and dst addresses, protocol, and version. 2586 */ 2587 ripha.ipha_src = ipha->ipha_dst; 2588 ripha.ipha_dst = ipha->ipha_src; 2589 ripha.ipha_protocol = ipha->ipha_protocol; 2590 ripha.ipha_version_and_hdr_length = 2591 ipha->ipha_version_and_hdr_length; 2592 ((uint16_t *)&ports)[0] = up[1]; 2593 ((uint16_t *)&ports)[1] = up[0]; 2594 2595 /* Have to change db_type after any pullupmsg */ 2596 DB_TYPE(mp) = M_CTL; 2597 ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0, 2598 mctl_present, ip_policy, zoneid); 2599 return; 2600 2601 case IPPROTO_ESP: 2602 case IPPROTO_AH: { 2603 int ipsec_rc; 2604 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 2605 2606 /* 2607 * We need a IPSEC_IN in the front to fanout to AH/ESP. 2608 * We will re-use the IPSEC_IN if it is already present as 2609 * AH/ESP will not affect any fields in the IPSEC_IN for 2610 * ICMP errors. 
If there is no IPSEC_IN, allocate a new 2611 * one and attach it in the front. 2612 */ 2613 if (ii != NULL) { 2614 /* 2615 * ip_fanout_proto_again converts the ICMP errors 2616 * that come back from AH/ESP to M_DATA so that 2617 * if it is non-AH/ESP and we do a pullupmsg in 2618 * this function, it would work. Convert it back 2619 * to M_CTL before we send up as this is a ICMP 2620 * error. This could have been generated locally or 2621 * by some router. Validate the inner IPsec 2622 * headers. 2623 * 2624 * NOTE : ill_index is used by ip_fanout_proto_again 2625 * to locate the ill. 2626 */ 2627 ASSERT(ill != NULL); 2628 ii->ipsec_in_ill_index = 2629 ill->ill_phyint->phyint_ifindex; 2630 ii->ipsec_in_rill_index = 2631 recv_ill->ill_phyint->phyint_ifindex; 2632 DB_TYPE(first_mp->b_cont) = M_CTL; 2633 } else { 2634 /* 2635 * IPSEC_IN is not present. We attach a ipsec_in 2636 * message and send up to IPsec for validating 2637 * and removing the IPsec headers. Clear 2638 * ipsec_in_secure so that when we return 2639 * from IPsec, we don't mistakenly think that this 2640 * is a secure packet came from the network. 2641 * 2642 * NOTE : ill_index is used by ip_fanout_proto_again 2643 * to locate the ill. 
2644 */ 2645 ASSERT(first_mp == mp); 2646 first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 2647 if (first_mp == NULL) { 2648 freemsg(mp); 2649 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2650 return; 2651 } 2652 ii = (ipsec_in_t *)first_mp->b_rptr; 2653 2654 /* This is not a secure packet */ 2655 ii->ipsec_in_secure = B_FALSE; 2656 first_mp->b_cont = mp; 2657 DB_TYPE(mp) = M_CTL; 2658 ASSERT(ill != NULL); 2659 ii->ipsec_in_ill_index = 2660 ill->ill_phyint->phyint_ifindex; 2661 ii->ipsec_in_rill_index = 2662 recv_ill->ill_phyint->phyint_ifindex; 2663 } 2664 ip2dbg(("icmp_inbound_error: ipsec\n")); 2665 2666 if (!ipsec_loaded(ipss)) { 2667 ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); 2668 return; 2669 } 2670 2671 if (ipha->ipha_protocol == IPPROTO_ESP) 2672 ipsec_rc = ipsecesp_icmp_error(first_mp); 2673 else 2674 ipsec_rc = ipsecah_icmp_error(first_mp); 2675 if (ipsec_rc == IPSEC_STATUS_FAILED) 2676 return; 2677 2678 ip_fanout_proto_again(first_mp, ill, recv_ill, NULL); 2679 return; 2680 } 2681 default: 2682 /* 2683 * The ripha header is only used for the lookup and we 2684 * only set the src and dst addresses and protocol. 2685 */ 2686 ripha.ipha_src = ipha->ipha_dst; 2687 ripha.ipha_dst = ipha->ipha_src; 2688 ripha.ipha_protocol = ipha->ipha_protocol; 2689 ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n", 2690 ripha.ipha_protocol, ntohl(ipha->ipha_src), 2691 ntohl(ipha->ipha_dst), 2692 icmph->icmph_type, icmph->icmph_code)); 2693 if (ipha->ipha_protocol == IPPROTO_ENCAP) { 2694 ipha_t *in_ipha; 2695 2696 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 2697 mp->b_wptr) { 2698 if (!pullupmsg(mp, (uchar_t *)ipha + 2699 hdr_length + sizeof (ipha_t) - 2700 mp->b_rptr)) { 2701 goto discard_pkt; 2702 } 2703 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2704 ipha = (ipha_t *)&icmph[1]; 2705 } 2706 /* 2707 * Caller has verified that length has to be 2708 * at least the size of IP header. 
2709 */ 2710 ASSERT(hdr_length >= sizeof (ipha_t)); 2711 /* 2712 * Check the sanity of the inner IP header like 2713 * we did for the outer header. 2714 */ 2715 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2716 if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { 2717 goto discard_pkt; 2718 } 2719 if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { 2720 goto discard_pkt; 2721 } 2722 /* Check for Self-encapsulated tunnels */ 2723 if (in_ipha->ipha_src == ipha->ipha_src && 2724 in_ipha->ipha_dst == ipha->ipha_dst) { 2725 2726 mp = icmp_inbound_self_encap_error(mp, 2727 iph_hdr_length, hdr_length); 2728 if (mp == NULL) 2729 goto discard_pkt; 2730 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2731 ipha = (ipha_t *)&icmph[1]; 2732 hdr_length = IPH_HDR_LENGTH(ipha); 2733 /* 2734 * The packet in error is self-encapsualted. 2735 * And we are finding it further encapsulated 2736 * which we could not have possibly generated. 2737 */ 2738 if (ipha->ipha_protocol == IPPROTO_ENCAP) { 2739 goto discard_pkt; 2740 } 2741 icmp_inbound_error_fanout(q, ill, first_mp, 2742 icmph, ipha, iph_hdr_length, hdr_length, 2743 mctl_present, ip_policy, recv_ill, zoneid); 2744 return; 2745 } 2746 } 2747 if ((ipha->ipha_protocol == IPPROTO_ENCAP || 2748 ipha->ipha_protocol == IPPROTO_IPV6) && 2749 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && 2750 ii != NULL && 2751 ii->ipsec_in_loopback && 2752 ii->ipsec_in_secure) { 2753 /* 2754 * For IP tunnels that get a looped-back 2755 * ICMP_FRAGMENTATION_NEEDED message, adjust the 2756 * reported new MTU to take into account the IPsec 2757 * headers protecting this configured tunnel. 2758 * 2759 * This allows the tunnel module (tun.c) to blindly 2760 * accept the MTU reported in an ICMP "too big" 2761 * message. 2762 * 2763 * Non-looped back ICMP messages will just be 2764 * handled by the security protocols (if needed), 2765 * and the first subsequent packet will hit this 2766 * path. 
2767 */ 2768 icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) - 2769 ipsec_in_extra_length(first_mp)); 2770 } 2771 /* Have to change db_type after any pullupmsg */ 2772 DB_TYPE(mp) = M_CTL; 2773 2774 ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present, 2775 ip_policy, recv_ill, zoneid); 2776 return; 2777 } 2778 /* NOTREACHED */ 2779 discard_pkt: 2780 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2781 drop_pkt:; 2782 ip1dbg(("icmp_inbound_error_fanout: drop pkt\n")); 2783 freemsg(first_mp); 2784 } 2785 2786 /* 2787 * Common IP options parser. 2788 * 2789 * Setup routine: fill in *optp with options-parsing state, then 2790 * tail-call ipoptp_next to return the first option. 2791 */ 2792 uint8_t 2793 ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2794 { 2795 uint32_t totallen; /* total length of all options */ 2796 2797 totallen = ipha->ipha_version_and_hdr_length - 2798 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2799 totallen <<= 2; 2800 optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2801 optp->ipoptp_end = optp->ipoptp_next + totallen; 2802 optp->ipoptp_flags = 0; 2803 return (ipoptp_next(optp)); 2804 } 2805 2806 /* 2807 * Common IP options parser: extract next option. 2808 */ 2809 uint8_t 2810 ipoptp_next(ipoptp_t *optp) 2811 { 2812 uint8_t *end = optp->ipoptp_end; 2813 uint8_t *cur = optp->ipoptp_next; 2814 uint8_t opt, len, pointer; 2815 2816 /* 2817 * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2818 * has been corrupted. 2819 */ 2820 ASSERT(cur <= end); 2821 2822 if (cur == end) 2823 return (IPOPT_EOL); 2824 2825 opt = cur[IPOPT_OPTVAL]; 2826 2827 /* 2828 * Skip any NOP options. 2829 */ 2830 while (opt == IPOPT_NOP) { 2831 cur++; 2832 if (cur == end) 2833 return (IPOPT_EOL); 2834 opt = cur[IPOPT_OPTVAL]; 2835 } 2836 2837 if (opt == IPOPT_EOL) 2838 return (IPOPT_EOL); 2839 2840 /* 2841 * Option requiring a length. 
2842 */ 2843 if ((cur + 1) >= end) { 2844 optp->ipoptp_flags |= IPOPTP_ERROR; 2845 return (IPOPT_EOL); 2846 } 2847 len = cur[IPOPT_OLEN]; 2848 if (len < 2) { 2849 optp->ipoptp_flags |= IPOPTP_ERROR; 2850 return (IPOPT_EOL); 2851 } 2852 optp->ipoptp_cur = cur; 2853 optp->ipoptp_len = len; 2854 optp->ipoptp_next = cur + len; 2855 if (cur + len > end) { 2856 optp->ipoptp_flags |= IPOPTP_ERROR; 2857 return (IPOPT_EOL); 2858 } 2859 2860 /* 2861 * For the options which require a pointer field, make sure 2862 * its there, and make sure it points to either something 2863 * inside this option, or the end of the option. 2864 */ 2865 switch (opt) { 2866 case IPOPT_RR: 2867 case IPOPT_TS: 2868 case IPOPT_LSRR: 2869 case IPOPT_SSRR: 2870 if (len <= IPOPT_OFFSET) { 2871 optp->ipoptp_flags |= IPOPTP_ERROR; 2872 return (opt); 2873 } 2874 pointer = cur[IPOPT_OFFSET]; 2875 if (pointer - 1 > len) { 2876 optp->ipoptp_flags |= IPOPTP_ERROR; 2877 return (opt); 2878 } 2879 break; 2880 } 2881 2882 /* 2883 * Sanity check the pointer field based on the type of the 2884 * option. 2885 */ 2886 switch (opt) { 2887 case IPOPT_RR: 2888 case IPOPT_SSRR: 2889 case IPOPT_LSRR: 2890 if (pointer < IPOPT_MINOFF_SR) 2891 optp->ipoptp_flags |= IPOPTP_ERROR; 2892 break; 2893 case IPOPT_TS: 2894 if (pointer < IPOPT_MINOFF_IT) 2895 optp->ipoptp_flags |= IPOPTP_ERROR; 2896 /* 2897 * Note that the Internet Timestamp option also 2898 * contains two four bit fields (the Overflow field, 2899 * and the Flag field), which follow the pointer 2900 * field. We don't need to check that these fields 2901 * fall within the length of the option because this 2902 * was implicitely done above. We've checked that the 2903 * pointer value is at least IPOPT_MINOFF_IT, and that 2904 * it falls within the option. Since IPOPT_MINOFF_IT > 2905 * IPOPT_POS_OV_FLG, we don't need the explicit check. 
2906 */ 2907 ASSERT(len > IPOPT_POS_OV_FLG); 2908 break; 2909 } 2910 2911 return (opt); 2912 } 2913 2914 /* 2915 * Use the outgoing IP header to create an IP_OPTIONS option the way 2916 * it was passed down from the application. 2917 */ 2918 int 2919 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) 2920 { 2921 ipoptp_t opts; 2922 const uchar_t *opt; 2923 uint8_t optval; 2924 uint8_t optlen; 2925 uint32_t len = 0; 2926 uchar_t *buf1 = buf; 2927 2928 buf += IP_ADDR_LEN; /* Leave room for final destination */ 2929 len += IP_ADDR_LEN; 2930 bzero(buf1, IP_ADDR_LEN); 2931 2932 /* 2933 * OK to cast away const here, as we don't store through the returned 2934 * opts.ipoptp_cur pointer. 2935 */ 2936 for (optval = ipoptp_first(&opts, (ipha_t *)ipha); 2937 optval != IPOPT_EOL; 2938 optval = ipoptp_next(&opts)) { 2939 int off; 2940 2941 opt = opts.ipoptp_cur; 2942 optlen = opts.ipoptp_len; 2943 switch (optval) { 2944 case IPOPT_SSRR: 2945 case IPOPT_LSRR: 2946 2947 /* 2948 * Insert ipha_dst as the first entry in the source 2949 * route and move down the entries on step. 2950 * The last entry gets placed at buf1. 
2951 */ 2952 buf[IPOPT_OPTVAL] = optval; 2953 buf[IPOPT_OLEN] = optlen; 2954 buf[IPOPT_OFFSET] = optlen; 2955 2956 off = optlen - IP_ADDR_LEN; 2957 if (off < 0) { 2958 /* No entries in source route */ 2959 break; 2960 } 2961 /* Last entry in source route */ 2962 bcopy(opt + off, buf1, IP_ADDR_LEN); 2963 off -= IP_ADDR_LEN; 2964 2965 while (off > 0) { 2966 bcopy(opt + off, 2967 buf + off + IP_ADDR_LEN, 2968 IP_ADDR_LEN); 2969 off -= IP_ADDR_LEN; 2970 } 2971 /* ipha_dst into first slot */ 2972 bcopy(&ipha->ipha_dst, 2973 buf + off + IP_ADDR_LEN, 2974 IP_ADDR_LEN); 2975 buf += optlen; 2976 len += optlen; 2977 break; 2978 2979 case IPOPT_COMSEC: 2980 case IPOPT_SECURITY: 2981 /* if passing up a label is not ok, then remove */ 2982 if (is_system_labeled()) 2983 break; 2984 /* FALLTHROUGH */ 2985 default: 2986 bcopy(opt, buf, optlen); 2987 buf += optlen; 2988 len += optlen; 2989 break; 2990 } 2991 } 2992 done: 2993 /* Pad the resulting options */ 2994 while (len & 0x3) { 2995 *buf++ = IPOPT_EOL; 2996 len++; 2997 } 2998 return (len); 2999 } 3000 3001 /* 3002 * Update any record route or timestamp options to include this host. 3003 * Reverse any source route option. 3004 * This routine assumes that the options are well formed i.e. that they 3005 * have already been checked. 3006 */ 3007 static void 3008 icmp_options_update(ipha_t *ipha) 3009 { 3010 ipoptp_t opts; 3011 uchar_t *opt; 3012 uint8_t optval; 3013 ipaddr_t src; /* Our local address */ 3014 ipaddr_t dst; 3015 3016 ip2dbg(("icmp_options_update\n")); 3017 src = ipha->ipha_src; 3018 dst = ipha->ipha_dst; 3019 3020 for (optval = ipoptp_first(&opts, ipha); 3021 optval != IPOPT_EOL; 3022 optval = ipoptp_next(&opts)) { 3023 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 3024 opt = opts.ipoptp_cur; 3025 ip2dbg(("icmp_options_update: opt %d, len %d\n", 3026 optval, opts.ipoptp_len)); 3027 switch (optval) { 3028 int off1, off2; 3029 case IPOPT_SSRR: 3030 case IPOPT_LSRR: 3031 /* 3032 * Reverse the source route. 
The first entry 3033 * should be the next to last one in the current 3034 * source route (the last entry is our address). 3035 * The last entry should be the final destination. 3036 */ 3037 off1 = IPOPT_MINOFF_SR - 1; 3038 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 3039 if (off2 < 0) { 3040 /* No entries in source route */ 3041 ip1dbg(( 3042 "icmp_options_update: bad src route\n")); 3043 break; 3044 } 3045 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN); 3046 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN); 3047 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN); 3048 off2 -= IP_ADDR_LEN; 3049 3050 while (off1 < off2) { 3051 bcopy((char *)opt + off1, &src, IP_ADDR_LEN); 3052 bcopy((char *)opt + off2, (char *)opt + off1, 3053 IP_ADDR_LEN); 3054 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN); 3055 off1 += IP_ADDR_LEN; 3056 off2 -= IP_ADDR_LEN; 3057 } 3058 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 3059 break; 3060 } 3061 } 3062 } 3063 3064 /* 3065 * Process received ICMP Redirect messages. 3066 */ 3067 static void 3068 icmp_redirect(ill_t *ill, mblk_t *mp) 3069 { 3070 ipha_t *ipha; 3071 int iph_hdr_length; 3072 icmph_t *icmph; 3073 ipha_t *ipha_err; 3074 ire_t *ire; 3075 ire_t *prev_ire; 3076 ire_t *save_ire; 3077 ipaddr_t src, dst, gateway; 3078 iulp_t ulp_info = { 0 }; 3079 int error; 3080 ip_stack_t *ipst; 3081 3082 ASSERT(ill != NULL); 3083 ipst = ill->ill_ipst; 3084 3085 ipha = (ipha_t *)mp->b_rptr; 3086 iph_hdr_length = IPH_HDR_LENGTH(ipha); 3087 if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < 3088 sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { 3089 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3090 freemsg(mp); 3091 return; 3092 } 3093 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 3094 ipha_err = (ipha_t *)&icmph[1]; 3095 src = ipha->ipha_src; 3096 dst = ipha_err->ipha_dst; 3097 gateway = icmph->icmph_rd_gateway; 3098 /* Make sure the new gateway is reachable somehow. 
*/ 3099 ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, 3100 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3101 /* 3102 * Make sure we had a route for the dest in question and that 3103 * that route was pointing to the old gateway (the source of the 3104 * redirect packet.) 3105 */ 3106 prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES, 3107 NULL, MATCH_IRE_GW, ipst); 3108 /* 3109 * Check that 3110 * the redirect was not from ourselves 3111 * the new gateway and the old gateway are directly reachable 3112 */ 3113 if (!prev_ire || 3114 !ire || 3115 ire->ire_type == IRE_LOCAL) { 3116 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); 3117 freemsg(mp); 3118 if (ire != NULL) 3119 ire_refrele(ire); 3120 if (prev_ire != NULL) 3121 ire_refrele(prev_ire); 3122 return; 3123 } 3124 3125 /* 3126 * Should we use the old ULP info to create the new gateway? From 3127 * a user's perspective, we should inherit the info so that it 3128 * is a "smooth" transition. If we do not do that, then new 3129 * connections going thru the new gateway will have no route metrics, 3130 * which is counter-intuitive to user. From a network point of 3131 * view, this may or may not make sense even though the new gateway 3132 * is still directly connected to us so the route metrics should not 3133 * change much. 3134 * 3135 * But if the old ire_uinfo is not initialized, we do another 3136 * recursive lookup on the dest using the new gateway. There may 3137 * be a route to that. If so, use it to initialize the redirect 3138 * route. 
3139 */ 3140 if (prev_ire->ire_uinfo.iulp_set) { 3141 bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); 3142 } else { 3143 ire_t *tmp_ire; 3144 ire_t *sire; 3145 3146 tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire, 3147 ALL_ZONES, 0, NULL, 3148 (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT), 3149 ipst); 3150 if (sire != NULL) { 3151 bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); 3152 /* 3153 * If sire != NULL, ire_ftable_lookup() should not 3154 * return a NULL value. 3155 */ 3156 ASSERT(tmp_ire != NULL); 3157 ire_refrele(tmp_ire); 3158 ire_refrele(sire); 3159 } else if (tmp_ire != NULL) { 3160 bcopy(&tmp_ire->ire_uinfo, &ulp_info, 3161 sizeof (iulp_t)); 3162 ire_refrele(tmp_ire); 3163 } 3164 } 3165 if (prev_ire->ire_type == IRE_CACHE) 3166 ire_delete(prev_ire); 3167 ire_refrele(prev_ire); 3168 /* 3169 * TODO: more precise handling for cases 0, 2, 3, the latter two 3170 * require TOS routing 3171 */ 3172 switch (icmph->icmph_code) { 3173 case 0: 3174 case 1: 3175 /* TODO: TOS specificity for cases 2 and 3 */ 3176 case 2: 3177 case 3: 3178 break; 3179 default: 3180 freemsg(mp); 3181 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); 3182 ire_refrele(ire); 3183 return; 3184 } 3185 /* 3186 * Create a Route Association. This will allow us to remember that 3187 * someone we believe told us to use the particular gateway. 
3188 */ 3189 save_ire = ire; 3190 ire = ire_create( 3191 (uchar_t *)&dst, /* dest addr */ 3192 (uchar_t *)&ip_g_all_ones, /* mask */ 3193 (uchar_t *)&save_ire->ire_src_addr, /* source addr */ 3194 (uchar_t *)&gateway, /* gateway addr */ 3195 &save_ire->ire_max_frag, /* max frag */ 3196 NULL, /* no src nce */ 3197 NULL, /* no rfq */ 3198 NULL, /* no stq */ 3199 IRE_HOST, 3200 NULL, /* ipif */ 3201 0, /* cmask */ 3202 0, /* phandle */ 3203 0, /* ihandle */ 3204 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 3205 &ulp_info, 3206 NULL, /* tsol_gc_t */ 3207 NULL, /* gcgrp */ 3208 ipst); 3209 3210 if (ire == NULL) { 3211 freemsg(mp); 3212 ire_refrele(save_ire); 3213 return; 3214 } 3215 error = ire_add(&ire, NULL, NULL, NULL, B_FALSE); 3216 ire_refrele(save_ire); 3217 atomic_inc_32(&ipst->ips_ip_redirect_cnt); 3218 3219 if (error == 0) { 3220 ire_refrele(ire); /* Held in ire_add_v4 */ 3221 /* tell routing sockets that we received a redirect */ 3222 ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src, 3223 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, 3224 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst); 3225 } 3226 3227 /* 3228 * Delete any existing IRE_HOST type redirect ires for this destination. 3229 * This together with the added IRE has the effect of 3230 * modifying an existing redirect. 3231 */ 3232 prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL, 3233 ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst); 3234 if (prev_ire != NULL) { 3235 if (prev_ire ->ire_flags & RTF_DYNAMIC) 3236 ire_delete(prev_ire); 3237 ire_refrele(prev_ire); 3238 } 3239 3240 freemsg(mp); 3241 } 3242 3243 /* 3244 * Generate an ICMP parameter problem message. 
3245 */ 3246 static void 3247 icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, 3248 ip_stack_t *ipst) 3249 { 3250 icmph_t icmph; 3251 boolean_t mctl_present; 3252 mblk_t *first_mp; 3253 3254 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3255 3256 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3257 if (mctl_present) 3258 freeb(first_mp); 3259 return; 3260 } 3261 3262 bzero(&icmph, sizeof (icmph_t)); 3263 icmph.icmph_type = ICMP_PARAM_PROBLEM; 3264 icmph.icmph_pp_ptr = ptr; 3265 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs); 3266 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, 3267 ipst); 3268 } 3269 3270 /* 3271 * Build and ship an IPv4 ICMP message using the packet data in mp, and 3272 * the ICMP header pointed to by "stuff". (May be called as writer.) 3273 * Note: assumes that icmp_pkt_err_ok has been called to verify that 3274 * an icmp error packet can be sent. 3275 * Assigns an appropriate source address to the packet. If ipha_dst is 3276 * one of our addresses use it for source. Otherwise pick a source based 3277 * on a route lookup back to ipha_src. 3278 * Note that ipha_src must be set here since the 3279 * packet is likely to arrive on an ill queue in ip_wput() which will 3280 * not set a source address. 3281 */ 3282 static void 3283 icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, 3284 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 3285 { 3286 ipaddr_t dst; 3287 icmph_t *icmph; 3288 ipha_t *ipha; 3289 uint_t len_needed; 3290 size_t msg_len; 3291 mblk_t *mp1; 3292 ipaddr_t src; 3293 ire_t *ire; 3294 mblk_t *ipsec_mp; 3295 ipsec_out_t *io = NULL; 3296 3297 if (mctl_present) { 3298 /* 3299 * If it is : 3300 * 3301 * 1) a IPSEC_OUT, then this is caused by outbound 3302 * datagram originating on this host. IPsec processing 3303 * may or may not have been done. Refer to comments above 3304 * icmp_inbound_error_fanout for details. 
3305 * 3306 * 2) a IPSEC_IN if we are generating a icmp_message 3307 * for an incoming datagram destined for us i.e called 3308 * from ip_fanout_send_icmp. 3309 */ 3310 ipsec_info_t *in; 3311 ipsec_mp = mp; 3312 mp = ipsec_mp->b_cont; 3313 3314 in = (ipsec_info_t *)ipsec_mp->b_rptr; 3315 ipha = (ipha_t *)mp->b_rptr; 3316 3317 ASSERT(in->ipsec_info_type == IPSEC_OUT || 3318 in->ipsec_info_type == IPSEC_IN); 3319 3320 if (in->ipsec_info_type == IPSEC_IN) { 3321 /* 3322 * Convert the IPSEC_IN to IPSEC_OUT. 3323 */ 3324 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3325 BUMP_MIB(&ipst->ips_ip_mib, 3326 ipIfStatsOutDiscards); 3327 return; 3328 } 3329 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3330 } else { 3331 ASSERT(in->ipsec_info_type == IPSEC_OUT); 3332 io = (ipsec_out_t *)in; 3333 /* 3334 * Clear out ipsec_out_proc_begin, so we do a fresh 3335 * ire lookup. 3336 */ 3337 io->ipsec_out_proc_begin = B_FALSE; 3338 } 3339 ASSERT(zoneid == io->ipsec_out_zoneid); 3340 ASSERT(zoneid != ALL_ZONES); 3341 } else { 3342 /* 3343 * This is in clear. The icmp message we are building 3344 * here should go out in clear. 3345 * 3346 * Pardon the convolution of it all, but it's easier to 3347 * allocate a "use cleartext" IPSEC_IN message and convert 3348 * it than it is to allocate a new one. 3349 */ 3350 ipsec_in_t *ii; 3351 ASSERT(DB_TYPE(mp) == M_DATA); 3352 ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); 3353 if (ipsec_mp == NULL) { 3354 freemsg(mp); 3355 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 3356 return; 3357 } 3358 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 3359 3360 /* This is not a secure packet */ 3361 ii->ipsec_in_secure = B_FALSE; 3362 /* 3363 * For trusted extensions using a shared IP address we can 3364 * send using any zoneid. 3365 */ 3366 if (zoneid == ALL_ZONES) 3367 ii->ipsec_in_zoneid = GLOBAL_ZONEID; 3368 else 3369 ii->ipsec_in_zoneid = zoneid; 3370 ipsec_mp->b_cont = mp; 3371 ipha = (ipha_t *)mp->b_rptr; 3372 /* 3373 * Convert the IPSEC_IN to IPSEC_OUT. 
3374 */ 3375 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3376 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 3377 return; 3378 } 3379 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3380 } 3381 3382 /* Remember our eventual destination */ 3383 dst = ipha->ipha_src; 3384 3385 ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), 3386 NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst); 3387 if (ire != NULL && 3388 (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) { 3389 src = ipha->ipha_dst; 3390 } else { 3391 if (ire != NULL) 3392 ire_refrele(ire); 3393 ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL, 3394 (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY), 3395 ipst); 3396 if (ire == NULL) { 3397 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 3398 freemsg(ipsec_mp); 3399 return; 3400 } 3401 src = ire->ire_src_addr; 3402 } 3403 3404 if (ire != NULL) 3405 ire_refrele(ire); 3406 3407 /* 3408 * Check if we can send back more then 8 bytes in addition to 3409 * the IP header. We try to send 64 bytes of data and the internal 3410 * header in the special cases of ipv4 encapsulated ipv4 or ipv6. 
3411 */ 3412 len_needed = IPH_HDR_LENGTH(ipha); 3413 if (ipha->ipha_protocol == IPPROTO_ENCAP || 3414 ipha->ipha_protocol == IPPROTO_IPV6) { 3415 3416 if (!pullupmsg(mp, -1)) { 3417 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 3418 freemsg(ipsec_mp); 3419 return; 3420 } 3421 ipha = (ipha_t *)mp->b_rptr; 3422 3423 if (ipha->ipha_protocol == IPPROTO_ENCAP) { 3424 len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha + 3425 len_needed)); 3426 } else { 3427 ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed); 3428 3429 ASSERT(ipha->ipha_protocol == IPPROTO_IPV6); 3430 len_needed += ip_hdr_length_v6(mp, ip6h); 3431 } 3432 } 3433 len_needed += ipst->ips_ip_icmp_return; 3434 msg_len = msgdsize(mp); 3435 if (msg_len > len_needed) { 3436 (void) adjmsg(mp, len_needed - msg_len); 3437 msg_len = len_needed; 3438 } 3439 mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp); 3440 if (mp1 == NULL) { 3441 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors); 3442 freemsg(ipsec_mp); 3443 return; 3444 } 3445 mp1->b_cont = mp; 3446 mp = mp1; 3447 ASSERT(ipsec_mp->b_datap->db_type == M_CTL && 3448 ipsec_mp->b_rptr == (uint8_t *)io && 3449 io->ipsec_out_type == IPSEC_OUT); 3450 ipsec_mp->b_cont = mp; 3451 3452 /* 3453 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this 3454 * node generates be accepted in peace by all on-host destinations. 3455 * If we do NOT assume that all on-host destinations trust 3456 * self-generated ICMP messages, then rework here, ip6.c, and spd.c. 3457 * (Look for ipsec_out_icmp_loopback). 
3458 */ 3459 io->ipsec_out_icmp_loopback = B_TRUE; 3460 3461 ipha = (ipha_t *)mp->b_rptr; 3462 mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len); 3463 *ipha = icmp_ipha; 3464 ipha->ipha_src = src; 3465 ipha->ipha_dst = dst; 3466 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 3467 msg_len += sizeof (icmp_ipha) + len; 3468 if (msg_len > IP_MAXPACKET) { 3469 (void) adjmsg(mp, IP_MAXPACKET - msg_len); 3470 msg_len = IP_MAXPACKET; 3471 } 3472 ipha->ipha_length = htons((uint16_t)msg_len); 3473 icmph = (icmph_t *)&ipha[1]; 3474 bcopy(stuff, icmph, len); 3475 icmph->icmph_checksum = 0; 3476 icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); 3477 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 3478 put(q, ipsec_mp); 3479 } 3480 3481 /* 3482 * Determine if an ICMP error packet can be sent given the rate limit. 3483 * The limit consists of an average frequency (icmp_pkt_err_interval measured 3484 * in milliseconds) and a burst size. Burst size number of packets can 3485 * be sent arbitrarely closely spaced. 3486 * The state is tracked using two variables to implement an approximate 3487 * token bucket filter: 3488 * icmp_pkt_err_last - lbolt value when the last burst started 3489 * icmp_pkt_err_sent - number of packets sent in current burst 3490 */ 3491 boolean_t 3492 icmp_err_rate_limit(ip_stack_t *ipst) 3493 { 3494 clock_t now = TICK_TO_MSEC(lbolt); 3495 uint_t refilled; /* Number of packets refilled in tbf since last */ 3496 /* Guard against changes by loading into local variable */ 3497 uint_t err_interval = ipst->ips_ip_icmp_err_interval; 3498 3499 if (err_interval == 0) 3500 return (B_FALSE); 3501 3502 if (ipst->ips_icmp_pkt_err_last > now) { 3503 /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */ 3504 ipst->ips_icmp_pkt_err_last = 0; 3505 ipst->ips_icmp_pkt_err_sent = 0; 3506 } 3507 /* 3508 * If we are in a burst update the token bucket filter. 3509 * Update the "last" time to be close to "now" but make sure 3510 * we don't loose precision. 
3511 */ 3512 if (ipst->ips_icmp_pkt_err_sent != 0) { 3513 refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval; 3514 if (refilled > ipst->ips_icmp_pkt_err_sent) { 3515 ipst->ips_icmp_pkt_err_sent = 0; 3516 } else { 3517 ipst->ips_icmp_pkt_err_sent -= refilled; 3518 ipst->ips_icmp_pkt_err_last += refilled * err_interval; 3519 } 3520 } 3521 if (ipst->ips_icmp_pkt_err_sent == 0) { 3522 /* Start of new burst */ 3523 ipst->ips_icmp_pkt_err_last = now; 3524 } 3525 if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) { 3526 ipst->ips_icmp_pkt_err_sent++; 3527 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n", 3528 ipst->ips_icmp_pkt_err_sent)); 3529 return (B_FALSE); 3530 } 3531 ip1dbg(("icmp_err_rate_limit: dropped\n")); 3532 return (B_TRUE); 3533 } 3534 3535 /* 3536 * Check if it is ok to send an IPv4 ICMP error packet in 3537 * response to the IPv4 packet in mp. 3538 * Free the message and return null if no 3539 * ICMP error packet should be sent. 3540 */ 3541 static mblk_t * 3542 icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) 3543 { 3544 icmph_t *icmph; 3545 ipha_t *ipha; 3546 uint_t len_needed; 3547 ire_t *src_ire; 3548 ire_t *dst_ire; 3549 3550 if (!mp) 3551 return (NULL); 3552 ipha = (ipha_t *)mp->b_rptr; 3553 if (ip_csum_hdr(ipha)) { 3554 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs); 3555 freemsg(mp); 3556 return (NULL); 3557 } 3558 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, 3559 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3560 dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, 3561 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 3562 if (src_ire != NULL || dst_ire != NULL || 3563 CLASSD(ipha->ipha_dst) || 3564 CLASSD(ipha->ipha_src) || 3565 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { 3566 /* Note: only errors to the fragment with offset 0 */ 3567 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3568 freemsg(mp); 3569 if (src_ire != NULL) 3570 ire_refrele(src_ire); 3571 if (dst_ire != NULL) 3572 
ire_refrele(dst_ire); 3573 return (NULL); 3574 } 3575 if (ipha->ipha_protocol == IPPROTO_ICMP) { 3576 /* 3577 * Check the ICMP type. RFC 1122 sez: don't send ICMP 3578 * errors in response to any ICMP errors. 3579 */ 3580 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE; 3581 if (mp->b_wptr - mp->b_rptr < len_needed) { 3582 if (!pullupmsg(mp, len_needed)) { 3583 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 3584 freemsg(mp); 3585 return (NULL); 3586 } 3587 ipha = (ipha_t *)mp->b_rptr; 3588 } 3589 icmph = (icmph_t *) 3590 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]); 3591 switch (icmph->icmph_type) { 3592 case ICMP_DEST_UNREACHABLE: 3593 case ICMP_SOURCE_QUENCH: 3594 case ICMP_TIME_EXCEEDED: 3595 case ICMP_PARAM_PROBLEM: 3596 case ICMP_REDIRECT: 3597 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3598 freemsg(mp); 3599 return (NULL); 3600 default: 3601 break; 3602 } 3603 } 3604 /* 3605 * If this is a labeled system, then check to see if we're allowed to 3606 * send a response to this particular sender. If not, then just drop. 3607 */ 3608 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 3609 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); 3610 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 3611 freemsg(mp); 3612 return (NULL); 3613 } 3614 if (icmp_err_rate_limit(ipst)) { 3615 /* 3616 * Only send ICMP error packets every so often. 3617 * This should be done on a per port/source basis, 3618 * but for now this will suffice. 3619 */ 3620 freemsg(mp); 3621 return (NULL); 3622 } 3623 return (mp); 3624 } 3625 3626 /* 3627 * Generate an ICMP redirect message. 3628 */ 3629 static void 3630 icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst) 3631 { 3632 icmph_t icmph; 3633 3634 /* 3635 * We are called from ip_rput where we could 3636 * not have attached an IPSEC_IN. 
3637 */ 3638 ASSERT(mp->b_datap->db_type == M_DATA); 3639 3640 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3641 return; 3642 } 3643 3644 bzero(&icmph, sizeof (icmph_t)); 3645 icmph.icmph_type = ICMP_REDIRECT; 3646 icmph.icmph_code = 1; 3647 icmph.icmph_rd_gateway = gateway; 3648 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); 3649 /* Redirects sent by router, and router is global zone */ 3650 icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst); 3651 } 3652 3653 /* 3654 * Generate an ICMP time exceeded message. 3655 */ 3656 void 3657 icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, 3658 ip_stack_t *ipst) 3659 { 3660 icmph_t icmph; 3661 boolean_t mctl_present; 3662 mblk_t *first_mp; 3663 3664 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3665 3666 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3667 if (mctl_present) 3668 freeb(first_mp); 3669 return; 3670 } 3671 3672 bzero(&icmph, sizeof (icmph_t)); 3673 icmph.icmph_type = ICMP_TIME_EXCEEDED; 3674 icmph.icmph_code = code; 3675 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); 3676 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, 3677 ipst); 3678 } 3679 3680 /* 3681 * Generate an ICMP unreachable message. 
3682 */ 3683 void 3684 icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, 3685 ip_stack_t *ipst) 3686 { 3687 icmph_t icmph; 3688 mblk_t *first_mp; 3689 boolean_t mctl_present; 3690 3691 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3692 3693 if (!(mp = icmp_pkt_err_ok(mp, ipst))) { 3694 if (mctl_present) 3695 freeb(first_mp); 3696 return; 3697 } 3698 3699 bzero(&icmph, sizeof (icmph_t)); 3700 icmph.icmph_type = ICMP_DEST_UNREACHABLE; 3701 icmph.icmph_code = code; 3702 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); 3703 ip2dbg(("send icmp destination unreachable code %d\n", code)); 3704 icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present, 3705 zoneid, ipst); 3706 } 3707 3708 /* 3709 * Attempt to start recovery of an IPv4 interface that's been shut down as a 3710 * duplicate. As long as someone else holds the address, the interface will 3711 * stay down. When that conflict goes away, the interface is brought back up. 3712 * This is done so that accidental shutdowns of addresses aren't made 3713 * permanent. Your server will recover from a failure. 3714 * 3715 * For DHCP, recovery is not done in the kernel. Instead, it's handled by a 3716 * user space process (dhcpagent). 3717 * 3718 * Recovery completes if ARP reports that the address is now ours (via 3719 * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. 3720 * 3721 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 3722 */ 3723 static void 3724 ipif_dup_recovery(void *arg) 3725 { 3726 ipif_t *ipif = arg; 3727 ill_t *ill = ipif->ipif_ill; 3728 mblk_t *arp_add_mp; 3729 mblk_t *arp_del_mp; 3730 area_t *area; 3731 ip_stack_t *ipst = ill->ill_ipst; 3732 3733 ipif->ipif_recovery_id = 0; 3734 3735 /* 3736 * No lock needed for moving or condemned check, as this is just an 3737 * optimization. 
3738 */ 3739 if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || 3740 (ipif->ipif_flags & IPIF_POINTOPOINT) || 3741 (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { 3742 /* No reason to try to bring this address back. */ 3743 return; 3744 } 3745 3746 if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) 3747 goto alloc_fail; 3748 3749 if (ipif->ipif_arp_del_mp == NULL) { 3750 if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) 3751 goto alloc_fail; 3752 ipif->ipif_arp_del_mp = arp_del_mp; 3753 } 3754 3755 /* Setting the 'unverified' flag restarts DAD */ 3756 area = (area_t *)arp_add_mp->b_rptr; 3757 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 3758 ACE_F_UNVERIFIED; 3759 putnext(ill->ill_rq, arp_add_mp); 3760 return; 3761 3762 alloc_fail: 3763 /* 3764 * On allocation failure, just restart the timer. Note that the ipif 3765 * is down here, so no other thread could be trying to start a recovery 3766 * timer. The ill_lock protects the condemned flag and the recovery 3767 * timer ID. 3768 */ 3769 freemsg(arp_add_mp); 3770 mutex_enter(&ill->ill_lock); 3771 if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && 3772 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 3773 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, 3774 MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3775 } 3776 mutex_exit(&ill->ill_lock); 3777 } 3778 3779 /* 3780 * This is for exclusive changes due to ARP. Either tear down an interface due 3781 * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 
3782 */ 3783 /* ARGSUSED */ 3784 static void 3785 ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 3786 { 3787 ill_t *ill = rq->q_ptr; 3788 arh_t *arh; 3789 ipaddr_t src; 3790 ipif_t *ipif; 3791 char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ 3792 char hbuf[MAC_STR_LEN]; 3793 char sbuf[INET_ADDRSTRLEN]; 3794 const char *failtype; 3795 boolean_t bring_up; 3796 ip_stack_t *ipst = ill->ill_ipst; 3797 3798 switch (((arcn_t *)mp->b_rptr)->arcn_code) { 3799 case AR_CN_READY: 3800 failtype = NULL; 3801 bring_up = B_TRUE; 3802 break; 3803 case AR_CN_FAILED: 3804 failtype = "in use"; 3805 bring_up = B_FALSE; 3806 break; 3807 default: 3808 failtype = "claimed"; 3809 bring_up = B_FALSE; 3810 break; 3811 } 3812 3813 arh = (arh_t *)mp->b_cont->b_rptr; 3814 bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); 3815 3816 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, 3817 sizeof (hbuf)); 3818 (void) ip_dot_addr(src, sbuf); 3819 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3820 3821 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || 3822 ipif->ipif_lcl_addr != src) { 3823 continue; 3824 } 3825 3826 /* 3827 * If we failed on a recovery probe, then restart the timer to 3828 * try again later. 3829 */ 3830 if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && 3831 !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 3832 ill->ill_net_type == IRE_IF_RESOLVER && 3833 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 3834 ipst->ips_ip_dup_recovery > 0 && 3835 ipif->ipif_recovery_id == 0) { 3836 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 3837 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3838 continue; 3839 } 3840 3841 /* 3842 * If what we're trying to do has already been done, then do 3843 * nothing. 
3844 */ 3845 if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) 3846 continue; 3847 3848 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 3849 3850 if (failtype == NULL) { 3851 cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, 3852 ibuf); 3853 } else { 3854 cmn_err(CE_WARN, "%s has duplicate address %s (%s " 3855 "by %s); disabled", ibuf, sbuf, failtype, hbuf); 3856 } 3857 3858 if (bring_up) { 3859 ASSERT(ill->ill_dl_up); 3860 /* 3861 * Free up the ARP delete message so we can allocate 3862 * a fresh one through the normal path. 3863 */ 3864 freemsg(ipif->ipif_arp_del_mp); 3865 ipif->ipif_arp_del_mp = NULL; 3866 if (ipif_resolver_up(ipif, Res_act_initial) != 3867 EINPROGRESS) { 3868 ipif->ipif_addr_ready = 1; 3869 (void) ipif_up_done(ipif); 3870 } 3871 continue; 3872 } 3873 3874 mutex_enter(&ill->ill_lock); 3875 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 3876 ipif->ipif_flags |= IPIF_DUPLICATE; 3877 ill->ill_ipif_dup_count++; 3878 mutex_exit(&ill->ill_lock); 3879 /* 3880 * Already exclusive on the ill; no need to handle deferred 3881 * processing here. 
3882 */ 3883 (void) ipif_down(ipif, NULL, NULL); 3884 ipif_down_tail(ipif); 3885 mutex_enter(&ill->ill_lock); 3886 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 3887 ill->ill_net_type == IRE_IF_RESOLVER && 3888 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 3889 ipst->ips_ip_dup_recovery > 0) { 3890 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 3891 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 3892 } 3893 mutex_exit(&ill->ill_lock); 3894 } 3895 freemsg(mp); 3896 } 3897 3898 /* ARGSUSED */ 3899 static void 3900 ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 3901 { 3902 ill_t *ill = rq->q_ptr; 3903 arh_t *arh; 3904 ipaddr_t src; 3905 ipif_t *ipif; 3906 3907 arh = (arh_t *)mp->b_cont->b_rptr; 3908 bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); 3909 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3910 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src) 3911 (void) ipif_resolver_up(ipif, Res_act_defend); 3912 } 3913 freemsg(mp); 3914 } 3915 3916 /* 3917 * News from ARP. ARP sends notification of interesting events down 3918 * to its clients using M_CTL messages with the interesting ARP packet 3919 * attached via b_cont. 3920 * The interesting event from a device comes up the corresponding ARP-IP-DEV 3921 * queue as opposed to ARP sending the message to all the clients, i.e. all 3922 * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache 3923 * table if a cache IRE is found to delete all the entries for the address in 3924 * the packet. 
3925 */ 3926 static void 3927 ip_arp_news(queue_t *q, mblk_t *mp) 3928 { 3929 arcn_t *arcn; 3930 arh_t *arh; 3931 ire_t *ire = NULL; 3932 char hbuf[MAC_STR_LEN]; 3933 char sbuf[INET_ADDRSTRLEN]; 3934 ipaddr_t src; 3935 in6_addr_t v6src; 3936 boolean_t isv6 = B_FALSE; 3937 ipif_t *ipif; 3938 ill_t *ill; 3939 ip_stack_t *ipst; 3940 3941 if (CONN_Q(q)) { 3942 conn_t *connp = Q_TO_CONN(q); 3943 3944 ipst = connp->conn_netstack->netstack_ip; 3945 } else { 3946 ill_t *ill = (ill_t *)q->q_ptr; 3947 3948 ipst = ill->ill_ipst; 3949 } 3950 3951 if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { 3952 if (q->q_next) { 3953 putnext(q, mp); 3954 } else 3955 freemsg(mp); 3956 return; 3957 } 3958 arh = (arh_t *)mp->b_cont->b_rptr; 3959 /* Is it one we are interested in? */ 3960 if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) { 3961 isv6 = B_TRUE; 3962 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, 3963 IPV6_ADDR_LEN); 3964 } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { 3965 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, 3966 IP_ADDR_LEN); 3967 } else { 3968 freemsg(mp); 3969 return; 3970 } 3971 3972 ill = q->q_ptr; 3973 3974 arcn = (arcn_t *)mp->b_rptr; 3975 switch (arcn->arcn_code) { 3976 case AR_CN_BOGON: 3977 /* 3978 * Someone is sending ARP packets with a source protocol 3979 * address that we have published and for which we believe our 3980 * entry is authoritative and (when ill_arp_extend is set) 3981 * verified to be unique on the network. 3982 * 3983 * The ARP module internally handles the cases where the sender 3984 * is just probing (for DAD) and where the hardware address of 3985 * a non-authoritative entry has changed. Thus, these are the 3986 * real conflicts, and we have to do resolution. 3987 * 3988 * We back away quickly from the address if it's from DHCP or 3989 * otherwise temporary and hasn't been used recently (or at 3990 * all). 
We'd like to include "deprecated" addresses here as 3991 * well (as there's no real reason to defend something we're 3992 * discarding), but IPMP "reuses" this flag to mean something 3993 * other than the standard meaning. 3994 * 3995 * If the ARP module above is not extended (meaning that it 3996 * doesn't know how to defend the address), then we just log 3997 * the problem as we always did and continue on. It's not 3998 * right, but there's little else we can do, and those old ATM 3999 * users are going away anyway. 4000 */ 4001 (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, 4002 hbuf, sizeof (hbuf)); 4003 (void) ip_dot_addr(src, sbuf); 4004 if (isv6) { 4005 ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL, 4006 ipst); 4007 } else { 4008 ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst); 4009 } 4010 if (ire != NULL && IRE_IS_LOCAL(ire)) { 4011 uint32_t now; 4012 uint32_t maxage; 4013 clock_t lused; 4014 uint_t maxdefense; 4015 uint_t defs; 4016 4017 /* 4018 * First, figure out if this address hasn't been used 4019 * in a while. If it hasn't, then it's a better 4020 * candidate for abandoning. 4021 */ 4022 ipif = ire->ire_ipif; 4023 ASSERT(ipif != NULL); 4024 now = gethrestime_sec(); 4025 maxage = now - ire->ire_create_time; 4026 if (maxage > ipst->ips_ip_max_temp_idle) 4027 maxage = ipst->ips_ip_max_temp_idle; 4028 lused = drv_hztousec(ddi_get_lbolt() - 4029 ire->ire_last_used_time) / MICROSEC + 1; 4030 if (lused >= maxage && (ipif->ipif_flags & 4031 (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) 4032 maxdefense = ipst->ips_ip_max_temp_defend; 4033 else 4034 maxdefense = ipst->ips_ip_max_defend; 4035 4036 /* 4037 * Now figure out how many times we've defended 4038 * ourselves. Ignore defenses that happened long in 4039 * the past. 
4040 */ 4041 mutex_enter(&ire->ire_lock); 4042 if ((defs = ire->ire_defense_count) > 0 && 4043 now - ire->ire_defense_time > 4044 ipst->ips_ip_defend_interval) { 4045 ire->ire_defense_count = defs = 0; 4046 } 4047 ire->ire_defense_count++; 4048 ire->ire_defense_time = now; 4049 mutex_exit(&ire->ire_lock); 4050 ill_refhold(ill); 4051 ire_refrele(ire); 4052 4053 /* 4054 * If we've defended ourselves too many times already, 4055 * then give up and tear down the interface(s) using 4056 * this address. Otherwise, defend by sending out a 4057 * gratuitous ARP. 4058 */ 4059 if (defs >= maxdefense && ill->ill_arp_extend) { 4060 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, 4061 B_FALSE); 4062 } else { 4063 cmn_err(CE_WARN, 4064 "node %s is using our IP address %s on %s", 4065 hbuf, sbuf, ill->ill_name); 4066 /* 4067 * If this is an old (ATM) ARP module, then 4068 * don't try to defend the address. Remain 4069 * compatible with the old behavior. Defend 4070 * only with new ARP. 4071 */ 4072 if (ill->ill_arp_extend) { 4073 qwriter_ip(ill, q, mp, ip_arp_defend, 4074 NEW_OP, B_FALSE); 4075 } else { 4076 ill_refrele(ill); 4077 } 4078 } 4079 return; 4080 } 4081 cmn_err(CE_WARN, 4082 "proxy ARP problem? Node '%s' is using %s on %s", 4083 hbuf, sbuf, ill->ill_name); 4084 if (ire != NULL) 4085 ire_refrele(ire); 4086 break; 4087 case AR_CN_ANNOUNCE: 4088 if (isv6) { 4089 /* 4090 * For XRESOLV interfaces. 4091 * Delete the IRE cache entry and NCE for this 4092 * v6 address 4093 */ 4094 ip_ire_clookup_and_delete_v6(&v6src, ipst); 4095 /* 4096 * If v6src is a non-zero, it's a router address 4097 * as below. Do the same sort of thing to clean 4098 * out off-net IRE_CACHE entries that go through 4099 * the router. 
4100 */ 4101 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4102 ire_walk_v6(ire_delete_cache_gw_v6, 4103 (char *)&v6src, ALL_ZONES, ipst); 4104 } 4105 } else { 4106 nce_hw_map_t hwm; 4107 4108 /* 4109 * ARP gives us a copy of any packet where it thinks 4110 * the address has changed, so that we can update our 4111 * caches. We're responsible for caching known answers 4112 * in the current design. We check whether the 4113 * hardware address really has changed in all of our 4114 * entries that have cached this mapping, and if so, we 4115 * blow them away. This way we will immediately pick 4116 * up the rare case of a host changing hardware 4117 * address. 4118 */ 4119 if (src == 0) 4120 break; 4121 hwm.hwm_addr = src; 4122 hwm.hwm_hwlen = arh->arh_hlen; 4123 hwm.hwm_hwaddr = (uchar_t *)(arh + 1); 4124 NDP_HW_CHANGE_INCR(ipst->ips_ndp4); 4125 ndp_walk_common(ipst->ips_ndp4, NULL, 4126 (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES); 4127 NDP_HW_CHANGE_DECR(ipst->ips_ndp4); 4128 } 4129 break; 4130 case AR_CN_READY: 4131 /* No external v6 resolver has a contract to use this */ 4132 if (isv6) 4133 break; 4134 /* If the link is down, we'll retry this later */ 4135 if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 4136 break; 4137 ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL, 4138 NULL, NULL, ipst); 4139 if (ipif != NULL) { 4140 /* 4141 * If this is a duplicate recovery, then we now need to 4142 * go exclusive to bring this thing back up. 4143 */ 4144 if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) == 4145 IPIF_DUPLICATE) { 4146 ipif_refrele(ipif); 4147 ill_refhold(ill); 4148 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, 4149 B_FALSE); 4150 return; 4151 } 4152 /* 4153 * If this is the first notice that this address is 4154 * ready, then let the user know now. 
4155 */ 4156 if ((ipif->ipif_flags & IPIF_UP) && 4157 !ipif->ipif_addr_ready) { 4158 ipif_mask_reply(ipif); 4159 ip_rts_ifmsg(ipif); 4160 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 4161 sctp_update_ipif(ipif, SCTP_IPIF_UP); 4162 } 4163 ipif->ipif_addr_ready = 1; 4164 ipif_refrele(ipif); 4165 } 4166 ire = ire_cache_lookup(src, ALL_ZONES, MBLK_GETLABEL(mp), ipst); 4167 if (ire != NULL) { 4168 ire->ire_defense_count = 0; 4169 ire_refrele(ire); 4170 } 4171 break; 4172 case AR_CN_FAILED: 4173 /* No external v6 resolver has a contract to use this */ 4174 if (isv6) 4175 break; 4176 ill_refhold(ill); 4177 qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE); 4178 return; 4179 } 4180 freemsg(mp); 4181 } 4182 4183 /* 4184 * Create a mblk suitable for carrying the interface index and/or source link 4185 * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used 4186 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user 4187 * application. 4188 */ 4189 mblk_t * 4190 ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, 4191 ip_stack_t *ipst) 4192 { 4193 mblk_t *mp; 4194 ip_pktinfo_t *pinfo; 4195 ipha_t *ipha; 4196 struct ether_header *pether; 4197 4198 mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); 4199 if (mp == NULL) { 4200 ip1dbg(("ip_add_info: allocation failure.\n")); 4201 return (data_mp); 4202 } 4203 4204 ipha = (ipha_t *)data_mp->b_rptr; 4205 pinfo = (ip_pktinfo_t *)mp->b_rptr; 4206 bzero(pinfo, sizeof (ip_pktinfo_t)); 4207 pinfo->ip_pkt_flags = (uchar_t)flags; 4208 pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ 4209 4210 if (flags & (IPF_RECVIF | IPF_RECVADDR)) 4211 pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; 4212 if (flags & IPF_RECVADDR) { 4213 ipif_t *ipif; 4214 ire_t *ire; 4215 4216 /* 4217 * Only valid for V4 4218 */ 4219 ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) == 4220 (IPV4_VERSION << 4)); 4221 4222 ipif = ipif_get_next_ipif(NULL, ill); 4223 if (ipif != NULL) { 4224 /* 
4225 * Since a decision has already been made to deliver the 4226 * packet, there is no need to test for SECATTR and 4227 * ZONEONLY. 4228 * When a multicast packet is transmitted 4229 * a cache entry is created for the multicast address. 4230 * When delivering a copy of the packet or when new 4231 * packets are received we do not want to match on the 4232 * cached entry so explicitly match on 4233 * IRE_LOCAL and IRE_LOOPBACK 4234 */ 4235 ire = ire_ctable_lookup(ipha->ipha_dst, 0, 4236 IRE_LOCAL | IRE_LOOPBACK, 4237 ipif, zoneid, NULL, 4238 MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP, ipst); 4239 if (ire == NULL) { 4240 /* 4241 * packet must have come on a different 4242 * interface. 4243 * Since a decision has already been made to 4244 * deliver the packet, there is no need to test 4245 * for SECATTR and ZONEONLY. 4246 * Only match on local and broadcast ire's. 4247 * See detailed comment above. 4248 */ 4249 ire = ire_ctable_lookup(ipha->ipha_dst, 0, 4250 IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, 4251 NULL, MATCH_IRE_TYPE, ipst); 4252 } 4253 4254 if (ire == NULL) { 4255 /* 4256 * This is either a multicast packet or 4257 * the address has been removed since 4258 * the packet was received. 4259 * Return INADDR_ANY so that normal source 4260 * selection occurs for the response. 4261 */ 4262 4263 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4264 } else { 4265 pinfo->ip_pkt_match_addr.s_addr = 4266 ire->ire_src_addr; 4267 ire_refrele(ire); 4268 } 4269 ipif_refrele(ipif); 4270 } else { 4271 pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; 4272 } 4273 } 4274 4275 pether = (struct ether_header *)((char *)ipha 4276 - sizeof (struct ether_header)); 4277 /* 4278 * Make sure the interface is an ethernet type, since this option 4279 * is currently supported only on this type of interface. Also make 4280 * sure we are pointing correctly above db_base. 
4281 */ 4282 4283 if ((flags & IPF_RECVSLLA) && 4284 ((uchar_t *)pether >= data_mp->b_datap->db_base) && 4285 (ill->ill_type == IFT_ETHER) && 4286 (ill->ill_net_type == IRE_IF_RESOLVER)) { 4287 4288 pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; 4289 bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, 4290 (uchar_t *)pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); 4291 } else { 4292 /* 4293 * Clear the bit. Indicate to upper layer that IP is not 4294 * sending this ancillary info. 4295 */ 4296 pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; 4297 } 4298 4299 mp->b_datap->db_type = M_CTL; 4300 mp->b_wptr += sizeof (ip_pktinfo_t); 4301 mp->b_cont = data_mp; 4302 4303 return (mp); 4304 } 4305 4306 /* 4307 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as 4308 * part of the bind request. 4309 */ 4310 4311 boolean_t 4312 ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) 4313 { 4314 ipsec_in_t *ii; 4315 4316 ASSERT(policy_mp != NULL); 4317 ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); 4318 4319 ii = (ipsec_in_t *)policy_mp->b_rptr; 4320 ASSERT(ii->ipsec_in_type == IPSEC_IN); 4321 4322 connp->conn_policy = ii->ipsec_in_policy; 4323 ii->ipsec_in_policy = NULL; 4324 4325 if (ii->ipsec_in_action != NULL) { 4326 if (connp->conn_latch == NULL) { 4327 connp->conn_latch = iplatch_create(); 4328 if (connp->conn_latch == NULL) 4329 return (B_FALSE); 4330 } 4331 ipsec_latch_inbound(connp->conn_latch, ii); 4332 } 4333 return (B_TRUE); 4334 } 4335 4336 /* 4337 * Upper level protocols (ULP) pass through bind requests to IP for inspection 4338 * and to arrange for power-fanout assist. The ULP is identified by 4339 * adding a single byte at the end of the original bind message. 4340 * A ULP other than UDP or TCP that wishes to be recognized passes 4341 * down a bind with a zero length address. 4342 * 4343 * The binding works as follows: 4344 * - A zero byte address means just bind to the protocol. 
4345 * - A four byte address is treated as a request to validate 4346 * that the address is a valid local address, appropriate for 4347 * an application to bind to. This does not affect any fanout 4348 * information in IP. 4349 * - A sizeof sin_t byte address is used to bind to only the local address 4350 * and port. 4351 * - A sizeof ipa_conn_t byte address contains complete fanout information 4352 * consisting of local and remote addresses and ports. In 4353 * this case, the addresses are both validated as appropriate 4354 * for this operation, and, if so, the information is retained 4355 * for use in the inbound fanout. 4356 * 4357 * The ULP (except in the zero-length bind) can append an 4358 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the 4359 * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants 4360 * a copy of the source or destination IRE (source for local bind; 4361 * destination for complete bind). IPSEC_POLICY_SET indicates that the 4362 * policy information contained should be copied on to the conn. 4363 * 4364 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. 4365 */ 4366 mblk_t * 4367 ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) 4368 { 4369 ssize_t len; 4370 struct T_bind_req *tbr; 4371 sin_t *sin; 4372 ipa_conn_t *ac; 4373 uchar_t *ucp; 4374 mblk_t *mp1; 4375 boolean_t ire_requested; 4376 boolean_t ipsec_policy_set = B_FALSE; 4377 int error = 0; 4378 int protocol; 4379 ipa_conn_x_t *acx; 4380 4381 ASSERT(!connp->conn_af_isv6); 4382 connp->conn_pkt_isv6 = B_FALSE; 4383 4384 len = MBLKL(mp); 4385 if (len < (sizeof (*tbr) + 1)) { 4386 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 4387 "ip_bind: bogus msg, len %ld", len); 4388 /* XXX: Need to return something better */ 4389 goto bad_addr; 4390 } 4391 /* Back up and extract the protocol identifier. 
*/ 4392 mp->b_wptr--; 4393 protocol = *mp->b_wptr & 0xFF; 4394 tbr = (struct T_bind_req *)mp->b_rptr; 4395 /* Reset the message type in preparation for shipping it back. */ 4396 DB_TYPE(mp) = M_PCPROTO; 4397 4398 connp->conn_ulp = (uint8_t)protocol; 4399 4400 /* 4401 * Check for a zero length address. This is from a protocol that 4402 * wants to register to receive all packets of its type. 4403 */ 4404 if (tbr->ADDR_length == 0) { 4405 /* 4406 * These protocols are now intercepted in ip_bind_v6(). 4407 * Reject protocol-level binds here for now. 4408 * 4409 * For SCTP raw socket, ICMP sends down a bind with sin_t 4410 * so that the protocol type cannot be SCTP. 4411 */ 4412 if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || 4413 protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { 4414 goto bad_addr; 4415 } 4416 4417 /* 4418 * 4419 * The udp module never sends down a zero-length address, 4420 * and allowing this on a labeled system will break MLP 4421 * functionality. 4422 */ 4423 if (is_system_labeled() && protocol == IPPROTO_UDP) 4424 goto bad_addr; 4425 4426 if (connp->conn_mac_exempt) 4427 goto bad_addr; 4428 4429 /* No hash here really. The table is big enough. */ 4430 connp->conn_srcv6 = ipv6_all_zeros; 4431 4432 ipcl_proto_insert(connp, protocol); 4433 4434 tbr->PRIM_type = T_BIND_ACK; 4435 return (mp); 4436 } 4437 4438 /* Extract the address pointer from the message. */ 4439 ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, 4440 tbr->ADDR_length); 4441 if (ucp == NULL) { 4442 ip1dbg(("ip_bind: no address\n")); 4443 goto bad_addr; 4444 } 4445 if (!OK_32PTR(ucp)) { 4446 ip1dbg(("ip_bind: unaligned address\n")); 4447 goto bad_addr; 4448 } 4449 /* 4450 * Check for trailing mps. 
4451 */ 4452 4453 mp1 = mp->b_cont; 4454 ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); 4455 ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); 4456 4457 switch (tbr->ADDR_length) { 4458 default: 4459 ip1dbg(("ip_bind: bad address length %d\n", 4460 (int)tbr->ADDR_length)); 4461 goto bad_addr; 4462 4463 case IP_ADDR_LEN: 4464 /* Verification of local address only */ 4465 error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0, 4466 ire_requested, ipsec_policy_set, B_FALSE); 4467 break; 4468 4469 case sizeof (sin_t): 4470 sin = (sin_t *)ucp; 4471 error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, 4472 sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); 4473 break; 4474 4475 case sizeof (ipa_conn_t): 4476 ac = (ipa_conn_t *)ucp; 4477 /* For raw socket, the local port is not set. */ 4478 if (ac->ac_lport == 0) 4479 ac->ac_lport = connp->conn_lport; 4480 /* Always verify destination reachability. */ 4481 error = ip_bind_connected(connp, mp, &ac->ac_laddr, 4482 ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested, 4483 ipsec_policy_set, B_TRUE, B_TRUE); 4484 break; 4485 4486 case sizeof (ipa_conn_x_t): 4487 acx = (ipa_conn_x_t *)ucp; 4488 /* 4489 * Whether or not to verify destination reachability depends 4490 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. 4491 */ 4492 error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr, 4493 acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr, 4494 acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set, 4495 B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0); 4496 break; 4497 } 4498 if (error == EINPROGRESS) 4499 return (NULL); 4500 else if (error != 0) 4501 goto bad_addr; 4502 /* 4503 * Pass the IPsec headers size in ire_ipsec_overhead. 4504 * We can't do this in ip_bind_insert_ire because the policy 4505 * may not have been inherited at that point in time and hence 4506 * conn_out_enforce_policy may not be set. 
4507 */ 4508 mp1 = mp->b_cont; 4509 if (ire_requested && connp->conn_out_enforce_policy && 4510 mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) { 4511 ire_t *ire = (ire_t *)mp1->b_rptr; 4512 ASSERT(MBLKL(mp1) >= sizeof (ire_t)); 4513 ire->ire_ipsec_overhead = conn_ipsec_length(connp); 4514 } 4515 4516 /* Send it home. */ 4517 mp->b_datap->db_type = M_PCPROTO; 4518 tbr->PRIM_type = T_BIND_ACK; 4519 return (mp); 4520 4521 bad_addr: 4522 /* 4523 * If error = -1 then we generate a TBADADDR - otherwise error is 4524 * a unix errno. 4525 */ 4526 if (error > 0) 4527 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 4528 else 4529 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 4530 return (mp); 4531 } 4532 4533 /* 4534 * Here address is verified to be a valid local address. 4535 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast 4536 * address is also considered a valid local address. 4537 * In the case of a broadcast/multicast address, however, the 4538 * upper protocol is expected to reset the src address 4539 * to 0 if it sees a IRE_BROADCAST type returned so that 4540 * no packets are emitted with broadcast/multicast address as 4541 * source address (that violates hosts requirements RFC1122) 4542 * The addresses valid for bind are: 4543 * (1) - INADDR_ANY (0) 4544 * (2) - IP address of an UP interface 4545 * (3) - IP address of a DOWN interface 4546 * (4) - valid local IP broadcast addresses. In this case 4547 * the conn will only receive packets destined to 4548 * the specified broadcast address. 4549 * (5) - a multicast address. In this case 4550 * the conn will only receive packets destined to 4551 * the specified multicast address. Note: the 4552 * application still has to issue an 4553 * IP_ADD_MEMBERSHIP socket option. 4554 * 4555 * On error, return -1 for TBADADDR otherwise pass the 4556 * errno with TSYSERR reply. 4557 * 4558 * In all the above cases, the bound address must be valid in the current zone. 
4559 * When the address is loopback, multicast or broadcast, there might be many 4560 * matching IREs so bind has to look up based on the zone. 4561 * 4562 * Note: lport is in network byte order. 4563 */ 4564 int 4565 ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, 4566 boolean_t ire_requested, boolean_t ipsec_policy_set, 4567 boolean_t fanout_insert) 4568 { 4569 int error = 0; 4570 ire_t *src_ire; 4571 mblk_t *policy_mp; 4572 ipif_t *ipif; 4573 zoneid_t zoneid; 4574 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 4575 4576 if (ipsec_policy_set) { 4577 policy_mp = mp->b_cont; 4578 } 4579 4580 /* 4581 * If it was previously connected, conn_fully_bound would have 4582 * been set. 4583 */ 4584 connp->conn_fully_bound = B_FALSE; 4585 4586 src_ire = NULL; 4587 ipif = NULL; 4588 4589 zoneid = IPCL_ZONEID(connp); 4590 4591 if (src_addr) { 4592 src_ire = ire_route_lookup(src_addr, 0, 0, 0, 4593 NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); 4594 /* 4595 * If an address other than 0.0.0.0 is requested, 4596 * we verify that it is a valid address for bind 4597 * Note: Following code is in if-else-if form for 4598 * readability compared to a condition check. 4599 */ 4600 /* LINTED - statement has no consequent */ 4601 if (IRE_IS_LOCAL(src_ire)) { 4602 /* 4603 * (2) Bind to address of local UP interface 4604 */ 4605 } else if (src_ire && src_ire->ire_type == IRE_BROADCAST) { 4606 /* 4607 * (4) Bind to broadcast address 4608 * Note: permitted only from transports that 4609 * request IRE 4610 */ 4611 if (!ire_requested) 4612 error = EADDRNOTAVAIL; 4613 } else { 4614 /* 4615 * (3) Bind to address of local DOWN interface 4616 * (ipif_lookup_addr() looks up all interfaces 4617 * but we do not get here for UP interfaces 4618 * - case (2) above) 4619 * We put the protocol byte back into the mblk 4620 * since we may come back via ip_wput_nondata() 4621 * later with this mblk if ipif_lookup_addr chooses 4622 * to defer processing. 
4623 */ 4624 *mp->b_wptr++ = (char)connp->conn_ulp; 4625 if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid, 4626 CONNP_TO_WQ(connp), mp, ip_wput_nondata, 4627 &error, ipst)) != NULL) { 4628 ipif_refrele(ipif); 4629 } else if (error == EINPROGRESS) { 4630 if (src_ire != NULL) 4631 ire_refrele(src_ire); 4632 return (EINPROGRESS); 4633 } else if (CLASSD(src_addr)) { 4634 error = 0; 4635 if (src_ire != NULL) 4636 ire_refrele(src_ire); 4637 /* 4638 * (5) bind to multicast address. 4639 * Fake out the IRE returned to upper 4640 * layer to be a broadcast IRE. 4641 */ 4642 src_ire = ire_ctable_lookup( 4643 INADDR_BROADCAST, INADDR_ANY, 4644 IRE_BROADCAST, NULL, zoneid, NULL, 4645 (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY), 4646 ipst); 4647 if (src_ire == NULL || !ire_requested) 4648 error = EADDRNOTAVAIL; 4649 } else { 4650 /* 4651 * Not a valid address for bind 4652 */ 4653 error = EADDRNOTAVAIL; 4654 } 4655 /* 4656 * Just to keep it consistent with the processing in 4657 * ip_bind_v4() 4658 */ 4659 mp->b_wptr--; 4660 } 4661 if (error) { 4662 /* Red Alert! Attempting to be a bogon! */ 4663 ip1dbg(("ip_bind: bad src address 0x%x\n", 4664 ntohl(src_addr))); 4665 goto bad_addr; 4666 } 4667 } 4668 4669 /* 4670 * Allow setting new policies. For example, disconnects come 4671 * down as ipa_t bind. As we would have set conn_policy_cached 4672 * to B_TRUE before, we should set it to B_FALSE, so that policy 4673 * can change after the disconnect. 4674 */ 4675 connp->conn_policy_cached = B_FALSE; 4676 4677 /* 4678 * If not fanout_insert this was just an address verification 4679 */ 4680 if (fanout_insert) { 4681 /* 4682 * The addresses have been verified. Time to insert in 4683 * the correct fanout list. 
4684 */ 4685 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4686 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6); 4687 connp->conn_lport = lport; 4688 connp->conn_fport = 0; 4689 /* 4690 * Do we need to add a check to reject Multicast packets 4691 */ 4692 error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport); 4693 } 4694 4695 if (error == 0) { 4696 if (ire_requested) { 4697 if (!ip_bind_insert_ire(mp, src_ire, NULL, ipst)) { 4698 error = -1; 4699 /* Falls through to bad_addr */ 4700 } 4701 } else if (ipsec_policy_set) { 4702 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4703 error = -1; 4704 /* Falls through to bad_addr */ 4705 } 4706 } 4707 } 4708 bad_addr: 4709 if (error != 0) { 4710 if (connp->conn_anon_port) { 4711 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 4712 connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), 4713 B_FALSE); 4714 } 4715 connp->conn_mlp_type = mlptSingle; 4716 } 4717 if (src_ire != NULL) 4718 IRE_REFRELE(src_ire); 4719 if (ipsec_policy_set) { 4720 ASSERT(policy_mp == mp->b_cont); 4721 ASSERT(policy_mp != NULL); 4722 freeb(policy_mp); 4723 /* 4724 * As of now assume that nothing else accompanies 4725 * IPSEC_POLICY_SET. 4726 */ 4727 mp->b_cont = NULL; 4728 } 4729 return (error); 4730 } 4731 4732 /* 4733 * Verify that both the source and destination addresses 4734 * are valid. If verify_dst is false, then the destination address may be 4735 * unreachable, i.e. have no route to it. Protocols like TCP want to verify 4736 * destination reachability, while tunnels do not. 4737 * Note that we allow connect to broadcast and multicast 4738 * addresses when ire_requested is set. Thus the ULP 4739 * has to check for IRE_BROADCAST and multicast. 4740 * 4741 * Returns zero if ok. 4742 * On error: returns -1 to mean TBADADDR otherwise returns an errno 4743 * (for use with TSYSERR reply). 4744 * 4745 * Note: lport and fport are in network byte order. 
4746 */ 4747 int 4748 ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, 4749 uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 4750 boolean_t ire_requested, boolean_t ipsec_policy_set, 4751 boolean_t fanout_insert, boolean_t verify_dst) 4752 { 4753 ire_t *src_ire; 4754 ire_t *dst_ire; 4755 int error = 0; 4756 int protocol; 4757 mblk_t *policy_mp; 4758 ire_t *sire = NULL; 4759 ire_t *md_dst_ire = NULL; 4760 ire_t *lso_dst_ire = NULL; 4761 ill_t *ill = NULL; 4762 zoneid_t zoneid; 4763 ipaddr_t src_addr = *src_addrp; 4764 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 4765 4766 src_ire = dst_ire = NULL; 4767 protocol = *mp->b_wptr & 0xFF; 4768 4769 /* 4770 * If we never got a disconnect before, clear it now. 4771 */ 4772 connp->conn_fully_bound = B_FALSE; 4773 4774 if (ipsec_policy_set) { 4775 policy_mp = mp->b_cont; 4776 } 4777 4778 zoneid = IPCL_ZONEID(connp); 4779 4780 if (CLASSD(dst_addr)) { 4781 /* Pick up an IRE_BROADCAST */ 4782 dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, 4783 NULL, zoneid, MBLK_GETLABEL(mp), 4784 (MATCH_IRE_RECURSIVE | 4785 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | 4786 MATCH_IRE_SECATTR), ipst); 4787 } else { 4788 /* 4789 * If conn_dontroute is set or if conn_nexthop_set is set, 4790 * and onlink ipif is not found set ENETUNREACH error. 4791 */ 4792 if (connp->conn_dontroute || connp->conn_nexthop_set) { 4793 ipif_t *ipif; 4794 4795 ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? 
4796 dst_addr : connp->conn_nexthop_v4, zoneid, ipst); 4797 if (ipif == NULL) { 4798 error = ENETUNREACH; 4799 goto bad_addr; 4800 } 4801 ipif_refrele(ipif); 4802 } 4803 4804 if (connp->conn_nexthop_set) { 4805 dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, 4806 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp), 4807 MATCH_IRE_SECATTR, ipst); 4808 } else { 4809 dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, 4810 &sire, zoneid, MBLK_GETLABEL(mp), 4811 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4812 MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | 4813 MATCH_IRE_SECATTR), ipst); 4814 } 4815 } 4816 /* 4817 * dst_ire can't be a broadcast when not ire_requested. 4818 * We also prevent ire's with src address INADDR_ANY to 4819 * be used, which are created temporarily for 4820 * sending out packets from endpoints that have 4821 * conn_unspec_src set. If verify_dst is true, the destination must be 4822 * reachable. If verify_dst is false, the destination needn't be 4823 * reachable. 4824 * 4825 * If we match on a reject or black hole, then we've got a 4826 * local failure. May as well fail out the connect() attempt, 4827 * since it's never going to succeed. 4828 */ 4829 if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || 4830 (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 4831 ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { 4832 /* 4833 * If we're verifying destination reachability, we always want 4834 * to complain here. 4835 * 4836 * If we're not verifying destination reachability but the 4837 * destination has a route, we still want to fail on the 4838 * temporary address and broadcast address tests. 
4839 */ 4840 if (verify_dst || (dst_ire != NULL)) { 4841 if (ip_debug > 2) { 4842 pr_addr_dbg("ip_bind_connected: bad connected " 4843 "dst %s\n", AF_INET, &dst_addr); 4844 } 4845 if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) 4846 error = ENETUNREACH; 4847 else 4848 error = EHOSTUNREACH; 4849 goto bad_addr; 4850 } 4851 } 4852 4853 /* 4854 * We now know that routing will allow us to reach the destination. 4855 * Check whether Trusted Solaris policy allows communication with this 4856 * host, and pretend that the destination is unreachable if not. 4857 * 4858 * This is never a problem for TCP, since that transport is known to 4859 * compute the label properly as part of the tcp_rput_other T_BIND_ACK 4860 * handling. If the remote is unreachable, it will be detected at that 4861 * point, so there's no reason to check it here. 4862 * 4863 * Note that for sendto (and other datagram-oriented friends), this 4864 * check is done as part of the data path label computation instead. 4865 * The check here is just to make non-TCP connect() report the right 4866 * error. 4867 */ 4868 if (dst_ire != NULL && is_system_labeled() && 4869 !IPCL_IS_TCP(connp) && 4870 tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL, 4871 connp->conn_mac_exempt, ipst) != 0) { 4872 error = EHOSTUNREACH; 4873 if (ip_debug > 2) { 4874 pr_addr_dbg("ip_bind_connected: no label for dst %s\n", 4875 AF_INET, &dst_addr); 4876 } 4877 goto bad_addr; 4878 } 4879 4880 /* 4881 * If the app does a connect(), it means that it will most likely 4882 * send more than 1 packet to the destination. It makes sense 4883 * to clear the temporary flag. 4884 */ 4885 if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && 4886 (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { 4887 irb_t *irb = dst_ire->ire_bucket; 4888 4889 rw_enter(&irb->irb_lock, RW_WRITER); 4890 /* 4891 * We need to recheck for IRE_MARK_TEMPORARY after acquiring 4892 * the lock to guarantee irb_tmp_ire_cnt. 
4893 */ 4894 if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { 4895 dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; 4896 irb->irb_tmp_ire_cnt--; 4897 } 4898 rw_exit(&irb->irb_lock); 4899 } 4900 4901 /* 4902 * See if we should notify ULP about LSO/MDT; we do this whether or not 4903 * ire_requested is TRUE, in order to handle active connects; LSO/MDT 4904 * eligibility tests for passive connects are handled separately 4905 * through tcp_adapt_ire(). We do this before the source address 4906 * selection, because dst_ire may change after a call to 4907 * ipif_select_source(). This is a best-effort check, as the 4908 * packet for this connection may not actually go through 4909 * dst_ire->ire_stq, and the exact IRE can only be known after 4910 * calling ip_newroute(). This is why we further check on the 4911 * IRE during LSO/Multidata packet transmission in 4912 * tcp_lsosend()/tcp_multisend(). 4913 */ 4914 if (!ipsec_policy_set && dst_ire != NULL && 4915 !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && 4916 (ill = ire_to_ill(dst_ire), ill != NULL)) { 4917 if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { 4918 lso_dst_ire = dst_ire; 4919 IRE_REFHOLD(lso_dst_ire); 4920 } else if (ipst->ips_ip_multidata_outbound && 4921 ILL_MDT_CAPABLE(ill)) { 4922 md_dst_ire = dst_ire; 4923 IRE_REFHOLD(md_dst_ire); 4924 } 4925 } 4926 4927 if (dst_ire != NULL && 4928 dst_ire->ire_type == IRE_LOCAL && 4929 dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { 4930 /* 4931 * If the IRE belongs to a different zone, look for a matching 4932 * route in the forwarding table and use the source address from 4933 * that route. 
4934 */ 4935 src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, 4936 zoneid, 0, NULL, 4937 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4938 MATCH_IRE_RJ_BHOLE, ipst); 4939 if (src_ire == NULL) { 4940 error = EHOSTUNREACH; 4941 goto bad_addr; 4942 } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4943 if (!(src_ire->ire_type & IRE_HOST)) 4944 error = ENETUNREACH; 4945 else 4946 error = EHOSTUNREACH; 4947 goto bad_addr; 4948 } 4949 if (src_addr == INADDR_ANY) 4950 src_addr = src_ire->ire_src_addr; 4951 ire_refrele(src_ire); 4952 src_ire = NULL; 4953 } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { 4954 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 4955 src_addr = sire->ire_src_addr; 4956 ire_refrele(dst_ire); 4957 dst_ire = sire; 4958 sire = NULL; 4959 } else { 4960 /* 4961 * Pick a source address so that a proper inbound 4962 * load spreading would happen. 4963 */ 4964 ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; 4965 ipif_t *src_ipif = NULL; 4966 ire_t *ipif_ire; 4967 4968 /* 4969 * Supply a local source address such that inbound 4970 * load spreading happens. 4971 * 4972 * Determine the best source address on this ill for 4973 * the destination. 4974 * 4975 * 1) For broadcast, we should return a broadcast ire 4976 * found above so that upper layers know that the 4977 * destination address is a broadcast address. 4978 * 4979 * 2) If this is part of a group, select a better 4980 * source address so that better inbound load 4981 * balancing happens. Do the same if the ipif 4982 * is DEPRECATED. 4983 * 4984 * 3) If the outgoing interface is part of a usesrc 4985 * group, then try selecting a source address from 4986 * the usesrc ILL. 
4987 */ 4988 if ((dst_ire->ire_zoneid != zoneid && 4989 dst_ire->ire_zoneid != ALL_ZONES) || 4990 (!(dst_ire->ire_flags & RTF_SETSRC)) && 4991 (!(dst_ire->ire_type & IRE_BROADCAST) && 4992 ((dst_ill->ill_group != NULL) || 4993 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 4994 (dst_ill->ill_usesrc_ifindex != 0)))) { 4995 /* 4996 * If the destination is reachable via a 4997 * given gateway, the selected source address 4998 * should be in the same subnet as the gateway. 4999 * Otherwise, the destination is not reachable. 5000 * 5001 * If there are no interfaces on the same subnet 5002 * as the destination, ipif_select_source gives 5003 * first non-deprecated interface which might be 5004 * on a different subnet than the gateway. 5005 * This is not desirable. Hence pass the dst_ire 5006 * source address to ipif_select_source. 5007 * It is sure that the destination is reachable 5008 * with the dst_ire source address subnet. 5009 * So passing dst_ire source address to 5010 * ipif_select_source will make sure that the 5011 * selected source will be on the same subnet 5012 * as dst_ire source address. 5013 */ 5014 ipaddr_t saddr = 5015 dst_ire->ire_ipif->ipif_src_addr; 5016 src_ipif = ipif_select_source(dst_ill, 5017 saddr, zoneid); 5018 if (src_ipif != NULL) { 5019 if (IS_VNI(src_ipif->ipif_ill)) { 5020 /* 5021 * For VNI there is no 5022 * interface route 5023 */ 5024 src_addr = 5025 src_ipif->ipif_src_addr; 5026 } else { 5027 ipif_ire = 5028 ipif_to_ire(src_ipif); 5029 if (ipif_ire != NULL) { 5030 IRE_REFRELE(dst_ire); 5031 dst_ire = ipif_ire; 5032 } 5033 src_addr = 5034 dst_ire->ire_src_addr; 5035 } 5036 ipif_refrele(src_ipif); 5037 } else { 5038 src_addr = dst_ire->ire_src_addr; 5039 } 5040 } else { 5041 src_addr = dst_ire->ire_src_addr; 5042 } 5043 } 5044 } 5045 5046 /* 5047 * We do ire_route_lookup() here (and not 5048 * interface lookup as we assert that 5049 * src_addr should only come from an 5050 * UP interface for hard binding. 
5051 */ 5052 ASSERT(src_ire == NULL); 5053 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, 5054 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); 5055 /* src_ire must be a local|loopback */ 5056 if (!IRE_IS_LOCAL(src_ire)) { 5057 if (ip_debug > 2) { 5058 pr_addr_dbg("ip_bind_connected: bad connected " 5059 "src %s\n", AF_INET, &src_addr); 5060 } 5061 error = EADDRNOTAVAIL; 5062 goto bad_addr; 5063 } 5064 5065 /* 5066 * If the source address is a loopback address, the 5067 * destination had best be local or multicast. 5068 * The transports that can't handle multicast will reject 5069 * those addresses. 5070 */ 5071 if (src_ire->ire_type == IRE_LOOPBACK && 5072 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { 5073 ip1dbg(("ip_bind_connected: bad connected loopback\n")); 5074 error = -1; 5075 goto bad_addr; 5076 } 5077 5078 /* 5079 * Allow setting new policies. For example, disconnects come 5080 * down as ipa_t bind. As we would have set conn_policy_cached 5081 * to B_TRUE before, we should set it to B_FALSE, so that policy 5082 * can change after the disconnect. 5083 */ 5084 connp->conn_policy_cached = B_FALSE; 5085 5086 /* 5087 * Set the conn addresses/ports immediately, so the IPsec policy calls 5088 * can handle their passed-in conn's. 5089 */ 5090 5091 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 5092 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); 5093 connp->conn_lport = lport; 5094 connp->conn_fport = fport; 5095 *src_addrp = src_addr; 5096 5097 ASSERT(!(ipsec_policy_set && ire_requested)); 5098 if (ire_requested) { 5099 iulp_t *ulp_info = NULL; 5100 5101 /* 5102 * Note that sire will not be NULL if this is an off-link 5103 * connection and there is not cache for that dest yet. 5104 * 5105 * XXX Because of an existing bug, if there are multiple 5106 * default routes, the IRE returned now may not be the actual 5107 * default route used (default routes are chosen in a 5108 * round robin fashion). 
So if the metrics for different 5109 * default routes are different, we may return the wrong 5110 * metrics. This will not be a problem if the existing 5111 * bug is fixed. 5112 */ 5113 if (sire != NULL) { 5114 ulp_info = &(sire->ire_uinfo); 5115 } 5116 if (!ip_bind_insert_ire(mp, dst_ire, ulp_info, ipst)) { 5117 error = -1; 5118 goto bad_addr; 5119 } 5120 } else if (ipsec_policy_set) { 5121 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 5122 error = -1; 5123 goto bad_addr; 5124 } 5125 } 5126 5127 /* 5128 * Cache IPsec policy in this conn. If we have per-socket policy, 5129 * we'll cache that. If we don't, we'll inherit global policy. 5130 * 5131 * We can't insert until the conn reflects the policy. Note that 5132 * conn_policy_cached is set by ipsec_conn_cache_policy() even for 5133 * connections where we don't have a policy. This is to prevent 5134 * global policy lookups in the inbound path. 5135 * 5136 * If we insert before we set conn_policy_cached, 5137 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true 5138 * because global policy cound be non-empty. We normally call 5139 * ipsec_check_policy() for conn_policy_cached connections only if 5140 * ipc_in_enforce_policy is set. But in this case, 5141 * conn_policy_cached can get set anytime since we made the 5142 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is 5143 * called, which will make the above assumption false. Thus, we 5144 * need to insert after we set conn_policy_cached. 5145 */ 5146 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) 5147 goto bad_addr; 5148 5149 if (fanout_insert) { 5150 /* 5151 * The addresses have been verified. Time to insert in 5152 * the correct fanout list. 
5153 */ 5154 error = ipcl_conn_insert(connp, protocol, src_addr, 5155 dst_addr, connp->conn_ports); 5156 } 5157 5158 if (error == 0) { 5159 connp->conn_fully_bound = B_TRUE; 5160 /* 5161 * Our initial checks for LSO/MDT have passed; the IRE is not 5162 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to 5163 * be supporting LSO/MDT. Pass the IRE, IPC and ILL into 5164 * ip_xxinfo_return(), which performs further checks 5165 * against them and upon success, returns the LSO/MDT info 5166 * mblk which we will attach to the bind acknowledgment. 5167 */ 5168 if (lso_dst_ire != NULL) { 5169 mblk_t *lsoinfo_mp; 5170 5171 ASSERT(ill->ill_lso_capab != NULL); 5172 if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp, 5173 ill->ill_name, ill->ill_lso_capab)) != NULL) 5174 linkb(mp, lsoinfo_mp); 5175 } else if (md_dst_ire != NULL) { 5176 mblk_t *mdinfo_mp; 5177 5178 ASSERT(ill->ill_mdt_capab != NULL); 5179 if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, 5180 ill->ill_name, ill->ill_mdt_capab)) != NULL) 5181 linkb(mp, mdinfo_mp); 5182 } 5183 } 5184 bad_addr: 5185 if (ipsec_policy_set) { 5186 ASSERT(policy_mp == mp->b_cont); 5187 ASSERT(policy_mp != NULL); 5188 freeb(policy_mp); 5189 /* 5190 * As of now assume that nothing else accompanies 5191 * IPSEC_POLICY_SET. 5192 */ 5193 mp->b_cont = NULL; 5194 } 5195 if (src_ire != NULL) 5196 IRE_REFRELE(src_ire); 5197 if (dst_ire != NULL) 5198 IRE_REFRELE(dst_ire); 5199 if (sire != NULL) 5200 IRE_REFRELE(sire); 5201 if (md_dst_ire != NULL) 5202 IRE_REFRELE(md_dst_ire); 5203 if (lso_dst_ire != NULL) 5204 IRE_REFRELE(lso_dst_ire); 5205 return (error); 5206 } 5207 5208 /* 5209 * Insert the ire in b_cont. Returns false if it fails (due to lack of space). 5210 * Prefers dst_ire over src_ire. 
5211 */ 5212 static boolean_t 5213 ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) 5214 { 5215 mblk_t *mp1; 5216 ire_t *ret_ire = NULL; 5217 5218 mp1 = mp->b_cont; 5219 ASSERT(mp1 != NULL); 5220 5221 if (ire != NULL) { 5222 /* 5223 * mp1 initialized above to IRE_DB_REQ_TYPE 5224 * appended mblk. Its <upper protocol>'s 5225 * job to make sure there is room. 5226 */ 5227 if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t)) 5228 return (0); 5229 5230 mp1->b_datap->db_type = IRE_DB_TYPE; 5231 mp1->b_wptr = mp1->b_rptr + sizeof (ire_t); 5232 bcopy(ire, mp1->b_rptr, sizeof (ire_t)); 5233 ret_ire = (ire_t *)mp1->b_rptr; 5234 /* 5235 * Pass the latest setting of the ip_path_mtu_discovery and 5236 * copy the ulp info if any. 5237 */ 5238 ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? 5239 IPH_DF : 0; 5240 if (ulp_info != NULL) { 5241 bcopy(ulp_info, &(ret_ire->ire_uinfo), 5242 sizeof (iulp_t)); 5243 } 5244 ret_ire->ire_mp = mp1; 5245 } else { 5246 /* 5247 * No IRE was found. Remove IRE mblk. 5248 */ 5249 mp->b_cont = mp1->b_cont; 5250 freeb(mp1); 5251 } 5252 5253 return (1); 5254 } 5255 5256 /* 5257 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping 5258 * the final piece where we don't. Return a pointer to the first mblk in the 5259 * result, and update the pointer to the next mblk to chew on. If anything 5260 * goes wrong (i.e., dupb fails), we waste everything in sight and return a 5261 * NULL pointer. 5262 */ 5263 mblk_t * 5264 ip_carve_mp(mblk_t **mpp, ssize_t len) 5265 { 5266 mblk_t *mp0; 5267 mblk_t *mp1; 5268 mblk_t *mp2; 5269 5270 if (!len || !mpp || !(mp0 = *mpp)) 5271 return (NULL); 5272 /* If we aren't going to consume the first mblk, we need a dup. */ 5273 if (mp0->b_wptr - mp0->b_rptr > len) { 5274 mp1 = dupb(mp0); 5275 if (mp1) { 5276 /* Partition the data between the two mblks. 
*/ 5277 mp1->b_wptr = mp1->b_rptr + len; 5278 mp0->b_rptr = mp1->b_wptr; 5279 /* 5280 * after adjustments if mblk not consumed is now 5281 * unaligned, try to align it. If this fails free 5282 * all messages and let upper layer recover. 5283 */ 5284 if (!OK_32PTR(mp0->b_rptr)) { 5285 if (!pullupmsg(mp0, -1)) { 5286 freemsg(mp0); 5287 freemsg(mp1); 5288 *mpp = NULL; 5289 return (NULL); 5290 } 5291 } 5292 } 5293 return (mp1); 5294 } 5295 /* Eat through as many mblks as we need to get len bytes. */ 5296 len -= mp0->b_wptr - mp0->b_rptr; 5297 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) { 5298 if (mp2->b_wptr - mp2->b_rptr > len) { 5299 /* 5300 * We won't consume the entire last mblk. Like 5301 * above, dup and partition it. 5302 */ 5303 mp1->b_cont = dupb(mp2); 5304 mp1 = mp1->b_cont; 5305 if (!mp1) { 5306 /* 5307 * Trouble. Rather than go to a lot of 5308 * trouble to clean up, we free the messages. 5309 * This won't be any worse than losing it on 5310 * the wire. 5311 */ 5312 freemsg(mp0); 5313 freemsg(mp2); 5314 *mpp = NULL; 5315 return (NULL); 5316 } 5317 mp1->b_wptr = mp1->b_rptr + len; 5318 mp2->b_rptr = mp1->b_wptr; 5319 /* 5320 * after adjustments if mblk not consumed is now 5321 * unaligned, try to align it. If this fails free 5322 * all messages and let upper layer recover. 5323 */ 5324 if (!OK_32PTR(mp2->b_rptr)) { 5325 if (!pullupmsg(mp2, -1)) { 5326 freemsg(mp0); 5327 freemsg(mp2); 5328 *mpp = NULL; 5329 return (NULL); 5330 } 5331 } 5332 *mpp = mp2; 5333 return (mp0); 5334 } 5335 /* Decrement len by the amount we just got. */ 5336 len -= mp2->b_wptr - mp2->b_rptr; 5337 } 5338 /* 5339 * len should be reduced to zero now. If not our caller has 5340 * screwed up. 5341 */ 5342 if (len) { 5343 /* Shouldn't happen! */ 5344 freemsg(mp0); 5345 *mpp = NULL; 5346 return (NULL); 5347 } 5348 /* 5349 * We consumed up to exactly the end of an mblk. Detach the part 5350 * we are returning from the rest of the chain. 
5351 */ 5352 mp1->b_cont = NULL; 5353 *mpp = mp2; 5354 return (mp0); 5355 } 5356 5357 /* The ill stream is being unplumbed. Called from ip_close */ 5358 int 5359 ip_modclose(ill_t *ill) 5360 { 5361 boolean_t success; 5362 ipsq_t *ipsq; 5363 ipif_t *ipif; 5364 queue_t *q = ill->ill_rq; 5365 ip_stack_t *ipst = ill->ill_ipst; 5366 clock_t timeout; 5367 5368 /* 5369 * Wait for the ACKs of all deferred control messages to be processed. 5370 * In particular, we wait for a potential capability reset initiated 5371 * in ip_sioctl_plink() to complete before proceeding. 5372 * 5373 * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms) 5374 * in case the driver never replies. 5375 */ 5376 timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms); 5377 mutex_enter(&ill->ill_lock); 5378 while (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 5379 if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) { 5380 /* Timeout */ 5381 break; 5382 } 5383 } 5384 mutex_exit(&ill->ill_lock); 5385 5386 /* 5387 * Forcibly enter the ipsq after some delay. This is to take 5388 * care of the case when some ioctl does not complete because 5389 * we sent a control message to the driver and it did not 5390 * send us a reply. We want to be able to at least unplumb 5391 * and replumb rather than force the user to reboot the system. 5392 */ 5393 success = ipsq_enter(ill, B_FALSE); 5394 5395 /* 5396 * Open/close/push/pop is guaranteed to be single threaded 5397 * per stream by STREAMS. FS guarantees that all references 5398 * from top are gone before close is called. So there can't 5399 * be another close thread that has set CONDEMNED on this ill. 5400 * and cause ipsq_enter to return failure. 5401 */ 5402 ASSERT(success); 5403 ipsq = ill->ill_phyint->phyint_ipsq; 5404 5405 /* 5406 * Mark it condemned. No new reference will be made to this ill. 5407 * Lookup functions will return an error. Threads that try to 5408 * increment the refcnt must check for ILL_CAN_LOOKUP. 
This ensures 5409 * that the refcnt will drop down to zero. 5410 */ 5411 mutex_enter(&ill->ill_lock); 5412 ill->ill_state_flags |= ILL_CONDEMNED; 5413 for (ipif = ill->ill_ipif; ipif != NULL; 5414 ipif = ipif->ipif_next) { 5415 ipif->ipif_state_flags |= IPIF_CONDEMNED; 5416 } 5417 /* 5418 * Wake up anybody waiting to enter the ipsq. ipsq_enter 5419 * returns error if ILL_CONDEMNED is set 5420 */ 5421 cv_broadcast(&ill->ill_cv); 5422 mutex_exit(&ill->ill_lock); 5423 5424 /* 5425 * Send all the deferred DLPI messages downstream which came in 5426 * during the small window right before ipsq_enter(). We do this 5427 * without waiting for the ACKs because all the ACKs for M_PROTO 5428 * messages are ignored in ip_rput() when ILL_CONDEMNED is set. 5429 */ 5430 ill_dlpi_send_deferred(ill); 5431 5432 /* 5433 * Shut down fragmentation reassembly. 5434 * ill_frag_timer won't start a timer again. 5435 * Now cancel any existing timer 5436 */ 5437 (void) untimeout(ill->ill_frag_timer_id); 5438 (void) ill_frag_timeout(ill, 0); 5439 5440 /* 5441 * If MOVE was in progress, clear the 5442 * move_in_progress fields also. 5443 */ 5444 if (ill->ill_move_in_progress) { 5445 ILL_CLEAR_MOVE(ill); 5446 } 5447 5448 /* 5449 * Call ill_delete to bring down the ipifs, ilms and ill on 5450 * this ill. Then wait for the refcnts to drop to zero. 5451 * ill_is_freeable checks whether the ill is really quiescent. 5452 * Then make sure that threads that are waiting to enter the 5453 * ipsq have seen the error returned by ipsq_enter and have 5454 * gone away. Then we call ill_delete_tail which does the 5455 * DL_UNBIND_REQ with the driver and then qprocsoff. 
5456 */ 5457 ill_delete(ill); 5458 mutex_enter(&ill->ill_lock); 5459 while (!ill_is_freeable(ill)) 5460 cv_wait(&ill->ill_cv, &ill->ill_lock); 5461 while (ill->ill_waiters) 5462 cv_wait(&ill->ill_cv, &ill->ill_lock); 5463 5464 mutex_exit(&ill->ill_lock); 5465 5466 /* 5467 * ill_delete_tail drops reference on ill_ipst, but we need to keep 5468 * it held until the end of the function since the cleanup 5469 * below needs to be able to use the ip_stack_t. 5470 */ 5471 netstack_hold(ipst->ips_netstack); 5472 5473 /* qprocsoff is called in ill_delete_tail */ 5474 ill_delete_tail(ill); 5475 ASSERT(ill->ill_ipst == NULL); 5476 5477 /* 5478 * Walk through all upper (conn) streams and qenable 5479 * those that have queued data. 5480 * close synchronization needs this to 5481 * be done to ensure that all upper layers blocked 5482 * due to flow control to the closing device 5483 * get unblocked. 5484 */ 5485 ip1dbg(("ip_wsrv: walking\n")); 5486 conn_walk_drain(ipst); 5487 5488 mutex_enter(&ipst->ips_ip_mi_lock); 5489 mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); 5490 mutex_exit(&ipst->ips_ip_mi_lock); 5491 5492 /* 5493 * credp could be null if the open didn't succeed and ip_modopen 5494 * itself calls ip_close. 5495 */ 5496 if (ill->ill_credp != NULL) 5497 crfree(ill->ill_credp); 5498 5499 mutex_enter(&ill->ill_lock); 5500 ill_nic_info_dispatch(ill); 5501 mutex_exit(&ill->ill_lock); 5502 5503 /* 5504 * Now we are done with the module close pieces that 5505 * need the netstack_t. 5506 */ 5507 netstack_rele(ipst->ips_netstack); 5508 5509 mi_close_free((IDP)ill); 5510 q->q_ptr = WR(q)->q_ptr = NULL; 5511 5512 ipsq_exit(ipsq); 5513 5514 return (0); 5515 } 5516 5517 /* 5518 * This is called as part of close() for IP, UDP, ICMP, and RTS 5519 * in order to quiesce the conn. 
 */
void
ip_quiesce_conn(conn_t *connp)
{
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;
	ip_stack_t	*ipst;

	/* This path is asserted to be for non-TCP conns only. */
	ASSERT(!IPCL_IS_TCP(connp));
	ipst = connp->conn_netstack->netstack_ip;

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 * Similarly ill_pending_mp_add() will not add any mp to
	 * the pending mp list, after this conn has started closing.
	 *
	 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
	 * cannot get set henceforth.
	 *
	 * While holding conn_lock we only record which cleanups are needed;
	 * the cleanup routines themselves are called after the lock is
	 * dropped.
	 */
	mutex_enter(&connp->conn_lock);
	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
	connp->conn_state_flags |= CONN_CLOSING;
	if (connp->conn_idl != NULL)
		drain_cleanup_reqd = B_TRUE;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	if (connp->conn_dhcpinit_ill != NULL) {
		/* Drop this conn's count on the ill and forget the ill. */
		ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
		atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
		connp->conn_dhcpinit_ill = NULL;
	}
	if (connp->conn_ilg_inuse != 0)
		ilg_cleanup_reqd = B_TRUE;
	mutex_exit(&connp->conn_lock);

	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	/* On a labeled system, release the anonymous MLP port binding. */
	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_ulp,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/*
	 * Remove this conn from any fanout list it is on,
	 * and then wait for any threads currently operating
	 * on this endpoint to finish.
	 */
	ipcl_hash_remove(connp);

	/*
	 * Remove this conn from the drain list, and do
	 * any other cleanup that may be required.
	 * (Only non-tcp streams may have a non-null conn_idl.
	 * TCP streams are never flow controlled, and
	 * conn_idl will be null)
	 */
	if (drain_cleanup_reqd)
		conn_drain_tail(connp, B_TRUE);

	/* If this conn is the stack's multicast router, shut that down. */
	if (connp == ipst->ips_ip_g_mrouter)
		(void) ip_mrouter_done(NULL, ipst);

	if (ilg_cleanup_reqd)
		ilg_delete_all(connp);

	conn_delete_ire(connp, NULL);

	/*
	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
	 * callers from write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk
	 * which checks for the condemned flag.
	 *
	 * Wait until exactly one reference remains before declaring the
	 * conn quiesced.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}

/*
 * STREAMS close routine for IP. A module close (there is a queue below
 * the write side) is handed to ip_modclose(); a device close quiesces
 * the conn, turns off queue processing, releases the minor number and
 * destroys the conn. Returns the result of ip_modclose() for a module
 * close and 0 for a device close.
 */
/* ARGSUSED */
int
ip_close(queue_t *q, int flags)
{
	conn_t		*connp;

	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);

	/*
	 * Call the appropriate delete routine depending on whether this is
	 * a module or device.
	 */
	if (WR(q)->q_next != NULL) {
		/* This is a module close */
		return (ip_modclose((ill_t *)q->q_ptr));
	}

	connp = q->q_ptr;
	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 1 (the last reference, which is released
	 * below). We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);

	/* Release the final reference and tear the conn down. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Wrapper around putnext() so that ip_rts_request can merely use
 * conn_recv. arg1 is the conn_t; the message is passed up the conn's
 * read queue. arg2 is unused.
 */
/*ARGSUSED2*/
static void
ip_conn_input(void *arg1, mblk_t *mp, void *arg2)
{
	conn_t	*connp = (conn_t *)arg1;

	putnext(connp->conn_rq, mp);
}

/*
 * Return the IP checksum for the IP header at "ipha".
 *
 * The header is summed as 16-bit quantities: the ten that make up the
 * simple (option-less) header, plus two more for every additional
 * header word counted in the low four bits of
 * ipha_version_and_hdr_length. The sum is folded to 16 bits and
 * complemented; a result of 0xffff is returned as 0.
 */
uint16_t
ip_csum_hdr(ipha_t *ipha)
{
	uint16_t	*uph;
	uint32_t	sum;
	int		opt_len;

	/* Number of header words beyond the simple header (the options). */
	opt_len = (ipha->ipha_version_and_hdr_length & 0xF) -
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
	uph = (uint16_t *)ipha;
	sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
	    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
	if (opt_len > 0) {
		/* Each pass consumes one option word (two 16-bit values). */
		do {
			sum += uph[10];
			sum += uph[11];
			uph += 2;
		} while (--opt_len);
	}
	/* Fold the carries back into the low 16 bits, then complement. */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = ~(sum + (sum >> 16)) & 0xFFFF;
	if (sum == 0xffff)
		sum = 0;
	return ((uint16_t)sum);
}

/*
 * Called when the module is about to be unloaded
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();

	icmp_ddi_destroy();
	rts_ddi_destroy();
	udp_ddi_destroy();
	sctp_ddi_g_destroy();
	tcp_ddi_g_destroy();
	ipsec_policy_g_destroy();
	ipcl_g_destroy();
	ip_net_g_destroy();
	ip_ire_g_fini();
	inet_minor_destroy(ip_minor_arena_sa);
#if defined(_LP64)
	/* The second (large) minor arena only exists on 64-bit kernels. */
	inet_minor_destroy(ip_minor_arena_la);
#endif

#ifdef DEBUG
	list_destroy(&ip_thread_list);
	rw_destroy(&ip_thread_rwlock);
	tsd_destroy(&ip_thread_data);
#endif

	netstack_unregister(NS_IP);
}

/*
 * First step in cleanup.
 */
/* ARGSUSED */
static void
ip_stack_shutdown(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;

#ifdef NS_DEBUG
	printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
#endif

	/* Get rid of loopback interfaces and their IREs */
	ip_loopback_cleanup(ipst);

	/*
	 * The destroy functions here will end up causing notify callbacks
	 * in the hook framework and these need to be run before the shutdown
	 * of the hook framework is begun - that happens from netstack after
	 * IP shutdown has completed.  If we leave doing these actions until
	 * ip_stack_fini then the notify callbacks for the net_*_unregister
	 * are happening against a backdrop of shattered terrain.
	 */
	ipv4_hook_destroy(ipst);
	ipv6_hook_destroy(ipst);
	ip_net_destroy(ipst);
}

/*
 * Free the IP stack instance.
 */
static void
ip_stack_fini(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;
	int ret;

#ifdef NS_DEBUG
	printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
#endif
	rw_destroy(&ipst->ips_srcid_lock);

	/* Remove the per-stack kstats and clear the pointers to them. */
	ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
	ipst->ips_ip_mibkp = NULL;
	icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
	ipst->ips_icmp_mibkp = NULL;
	ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
	ipst->ips_ip_kstat = NULL;
	bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
	ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
	ipst->ips_ip6_kstat = NULL;
	bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));

	/*
	 * Free the ndd table and the per-stack parameter arrays; the sizes
	 * are those of the lcl_param_arr and lcl_ndp_arr templates.
	 */
	nd_free(&ipst->ips_ip_g_nd);
	kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr));
	ipst->ips_param_arr = NULL;
	kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
	ipst->ips_ndp_arr = NULL;

	ip_mrouter_stack_destroy(ipst);

	mutex_destroy(&ipst->ips_ip_mi_lock);
	rw_destroy(&ipst->ips_ipsec_capab_ills_lock);
	rw_destroy(&ipst->ips_ill_g_usesrc_lock);
	rw_destroy(&ipst->ips_ip_g_nd_lock);

	/*
	 * Cancel each of the per-stack timers.  For every timer the code
	 * expects untimeout() to return -1 exactly when the saved id is 0;
	 * otherwise the id is expected to be non-zero and is cleared here.
	 */
	ret = untimeout(ipst->ips_igmp_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_igmp_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
		ipst->ips_igmp_slowtimeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_slowtimeout_id != 0);
		ipst->ips_mld_slowtimeout_id = 0;
	}
	ret = untimeout(ipst->ips_ip_ire_expire_id);
	if (ret == -1) {
		ASSERT(ipst->ips_ip_ire_expire_id == 0);
	} else {
		ASSERT(ipst->ips_ip_ire_expire_id != 0);
		ipst->ips_ip_ire_expire_id = 0;
	}

	/* The timer locks are destroyed only after the timers are gone. */
	mutex_destroy(&ipst->ips_igmp_timer_lock);
	mutex_destroy(&ipst->ips_mld_timer_lock);
	mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
	mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
	mutex_destroy(&ipst->ips_ip_addr_avail_lock);
	rw_destroy(&ipst->ips_ill_g_lock);

	ip_ire_fini(ipst);
	ip6_asp_free(ipst);
	conn_drain_fini(ipst);
	ipcl_destroy(ipst);

	/* Destroy the locks inside the NDP globals before freeing them. */
	mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
	mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
	kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
	ipst->ips_ndp4 = NULL;
	kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
	ipst->ips_ndp6 = NULL;

	if (ipst->ips_loopback_ksp != NULL) {
		kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
		ipst->ips_loopback_ksp = NULL;
	}

	kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
	ipst->ips_phyint_g_list = NULL;
	kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
	ipst->ips_ill_g_heads = NULL;

	/* Finally release the stack instance itself. */
	kmem_free(ipst, sizeof (*ipst));
}

/*
 * This function is called from the TSD destructor, and is used to debug
 * reference count issues in IP. See block comment in <inet/ip_if.h> for
 * details.
 */
static void
ip_thread_exit(void *phash)
{
	th_hash_t *thh = phash;

	/* Unlink the entry under the writer lock, then free it unlocked. */
	rw_enter(&ip_thread_rwlock, RW_WRITER);
	list_remove(&ip_thread_list, thh);
	rw_exit(&ip_thread_rwlock);
	mod_hash_destroy_hash(thh->thh_hash);
	kmem_free(thh, sizeof (*thh));
}

/*
 * Called when the IP kernel module is loaded into the kernel
 */
void
ip_ddi_init(void)
{
	ip_input_proc = ip_squeue_switch(ip_squeue_enter);

	/*
	 * For IP and TCP the minor numbers should start from 2 since we have 4
	 * initial devices: ip, ip6, tcp, tcp6.
	 */
	/*
	 * If this is a 64-bit kernel, then create two separate arenas -
	 * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the
	 * other for socket apps in the range 2^^18 through 2^^32-1.
	 *
	 * In the code below the small arena runs from INET_MIN_DEV + 2 to
	 * MAXMIN32 and the large arena from MAXMIN32 + 1 to MAXMIN64; a
	 * 32-bit kernel creates only the small arena, up to MAXMIN.
	 */
	ip_minor_arena_la = NULL;
	ip_minor_arena_sa = NULL;
#if defined(_LP64)
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
	if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
	    MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_la creation failed\n");
	}
#else
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
#endif
	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);

	ipcl_g_init();
	ip_ire_g_init();
	ip_net_g_init();

#ifdef DEBUG
	/* Per-thread debug state; ip_thread_exit() is the TSD destructor. */
	tsd_create(&ip_thread_data, ip_thread_exit);
	rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
	list_create(&ip_thread_list, sizeof (th_hash_t),
	    offsetof(th_hash_t, thh_link));
#endif

	/*
	 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of ip_stack_t's.
	 */
	netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
	    ip_stack_fini);

	ipsec_policy_g_init();
	tcp_ddi_g_init();
	sctp_ddi_g_init();

	tnet_init();

	udp_ddi_init();
	rts_ddi_init();
	icmp_ddi_init();
}

/*
 * Initialize the IP stack instance.
 *
 * Allocates a zeroed ip_stack_t for the given netstack, initializes its
 * locks, tables, tunables and kstats, and returns it.  All allocations
 * use KM_SLEEP.
 */
static void *
ip_stack_init(netstackid_t stackid, netstack_t *ns)
{
	ip_stack_t	*ipst;
	ipparam_t	*pa;
	ipndp_t		*na;

#ifdef NS_DEBUG
	printf("ip_stack_init(stack %d)\n", stackid);
#endif

	ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
	ipst->ips_netstack = ns;

	ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
	    KM_SLEEP);
	ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
	    KM_SLEEP);
	ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
	ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
	mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);

	rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	ipst->ips_igmp_deferred_next = INFINITY;
	mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
	ipst->ips_mld_deferred_next = INFINITY;
	mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);

	ipcl_init(ipst);
	ip_ire_init(ipst);
	ip6_asp_init(ipst);
	ipif_init(ipst);
	conn_drain_init(ipst);
	ip_mrouter_stack_init(ipst);

	ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT;
	ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;

	ipst->ips_ip_multirt_log_interval = 1000;

	ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT;
	ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT;
	ipst->ips_ill_index = 1;

	ipst->ips_saved_ip_g_forward = -1;
	ipst->ips_reg_vif_num = ALL_VIFS; 	/* Index to Register vif */

	/* Give this stack its own copy of the parameter template. */
	pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
	ipst->ips_param_arr = pa;
	bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr));

	/*
	 * Likewise for the ndp table, then point the entries at fixed
	 * offsets at this stack's own variables.  The ASSERTs check that
	 * the offsets still name the expected entries.
	 */
	na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP);
	ipst->ips_ndp_arr = na;
	bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
	ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data =
	    (caddr_t)&ipst->ips_ip_g_forward;
	ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data =
	    (caddr_t)&ipst->ips_ipv6_forward;
	ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name,
	    "ip_cgtp_filter") == 0);
	ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data =
	    (caddr_t)&ipst->ips_ip_cgtp_filter;
	ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_name,
	    "ipmp_hook_emulation") == 0);
	ipst->ips_ndp_arr[IPNDP_IPMP_HOOK_OFFSET].ip_ndp_data =
	    (caddr_t)&ipst->ips_ipmp_hook_emulation;

	/* The return value of the registration is deliberately ignored. */
	(void) ip_param_register(&ipst->ips_ip_g_nd,
	    ipst->ips_param_arr, A_CNT(lcl_param_arr),
	    ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr));

	ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
	ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
	ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
	ipst->ips_ip6_kstat =
	    ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);

	ipst->ips_ipmp_enable_failback = B_TRUE;

	ipst->ips_ip_src_id = 1;
	rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);

	ip_net_init(ipst, ns);
	ipv4_hook_init(ipst);
	ipv6_hook_init(ipst);

	return (ipst);
}

/*
 * Allocate and initialize a DLPI template of the specified length.  (May be
 * called as writer.)
 *
 * Returns a zero-filled message of "len" bytes whose dl_primitive field is
 * set to "prim", or NULL if allocb() fails.
 */
mblk_t *
ip_dlpi_alloc(size_t len, t_uscalar_t prim)
{
	mblk_t	*mp;

	mp = allocb(len, BPRI_MED);
	if (!mp)
		return (NULL);

	/*
	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
	 * of which we don't seem to use) are sent with M_PCPROTO, and
	 * that other DLPI are M_PROTO.
	 */
	if (prim == DL_INFO_REQ) {
		mp->b_datap->db_type = M_PCPROTO;
	} else {
		mp->b_datap->db_type = M_PROTO;
	}

	mp->b_wptr = mp->b_rptr + len;
	bzero(mp->b_rptr, len);
	/*
	 * The primitive is stored through a dl_unitdata_req_t overlay
	 * regardless of which primitive is being built.
	 * NOTE(review): "len" is not checked against the size of the
	 * primitive field - presumably callers always pass the size of a
	 * full DLPI structure; confirm.
	 */
	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
	return (mp);
}

/*
 * Debug formatting routine.  Returns a character string representation of the
 * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
 * in the form of a ipaddr_t and formats its four bytes, in the order they
 * are stored in memory, as zero-padded three-digit decimal fields.  "buf"
 * must be large enough for the result; it is also the return value.
 *
 * Once the ndd table-printing interfaces are removed, this can be changed to
 * standard dotted-decimal form.
 */
char *
ip_dot_addr(ipaddr_t addr, char *buf)
{
	uint8_t	*ap = (uint8_t *)&addr;

	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
	    ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
	return (buf);
}

/*
 * Write the given MAC address as a printable string in the usual colon-
 * separated format.
6094 */ 6095 const char * 6096 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) 6097 { 6098 char *bp; 6099 6100 if (alen == 0 || buflen < 4) 6101 return ("?"); 6102 bp = buf; 6103 for (;;) { 6104 /* 6105 * If there are more MAC address bytes available, but we won't 6106 * have any room to print them, then add "..." to the string 6107 * instead. See below for the 'magic number' explanation. 6108 */ 6109 if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) { 6110 (void) strcpy(bp, "..."); 6111 break; 6112 } 6113 (void) sprintf(bp, "%02x", *addr++); 6114 bp += 2; 6115 if (--alen == 0) 6116 break; 6117 *bp++ = ':'; 6118 buflen -= 3; 6119 /* 6120 * At this point, based on the first 'if' statement above, 6121 * either alen == 1 and buflen >= 3, or alen > 1 and 6122 * buflen >= 4. The first case leaves room for the final "xx" 6123 * number and trailing NUL byte. The second leaves room for at 6124 * least "...". Thus the apparently 'magic' numbers chosen for 6125 * that statement. 6126 */ 6127 } 6128 return (buf); 6129 } 6130 6131 /* 6132 * Send an ICMP error after patching up the packet appropriately. Returns 6133 * non-zero if the appropriate MIB should be bumped; zero otherwise. 
 */
static boolean_t
ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
    uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	ipha_t		*ipha;
	mblk_t		*first_mp;
	boolean_t	secure;
	unsigned char	db_type;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;

	first_mp = mp;
	if (mctl_present) {
		/* The data follows the leading control block. */
		mp = mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		/*
		 * If this is an ICMP error being reported - which goes
		 * up as M_CTLs, we need to convert them to M_DATA till
		 * we finish checking with global policy because
		 * ipsec_check_global_policy() assumes M_DATA as clear
		 * and M_CTL as secure.
		 *
		 * The original type is saved in db_type and restored
		 * after the policy check below.
		 */
		db_type = DB_TYPE(mp);
		DB_TYPE(mp) = M_DATA;
		secure = B_FALSE;
	}
	/*
	 * We are generating an icmp error for some inbound packet.
	 * Called from all ip_fanout_(udp, tcp, proto) functions.
	 * Before we generate an error, check with global policy
	 * to see whether this is allowed to enter the system. As
	 * there is no "conn", we are checking with global policy.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	if (secure || ipss->ipsec_inbound_v4_policy_present) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present, ipst->ips_netstack);
		/* NULL means the check already disposed of the message. */
		if (first_mp == NULL)
			return (B_FALSE);
	}

	if (!mctl_present)
		DB_TYPE(mp) = db_type;

	if (flags & IP_FF_SEND_ICMP) {
		if (flags & IP_FF_HDR_COMPLETE) {
			/* Non-zero from ip_hdr_complete(): drop the packet. */
			if (ip_hdr_complete(ipha, zoneid, ipst)) {
				freemsg(first_mp);
				return (B_TRUE);
			}
		}
		if (flags & IP_FF_CKSUM) {
			/*
			 * Have to correct checksum since
			 * the packet might have been
			 * fragmented and the reassembly code in ip_rput
			 * does not restore the IP checksum.
			 */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		switch (icmp_type) {
		case ICMP_DEST_UNREACHABLE:
			icmp_unreachable(WR(q), first_mp, icmp_code, zoneid,
			    ipst);
			break;
		default:
			/* No other ICMP type is generated here. */
			freemsg(first_mp);
			break;
		}
	} else {
		/* The caller did not ask for an ICMP error: just drop. */
		freemsg(first_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Used to send an ICMP error message when a packet is received for
 * a protocol that is not supported. The mblk passed as argument
 * is consumed by this function.
 *
 * "ipsec_mp" must be an IPSEC_IN block with the packet in b_cont.  The
 * ill is looked up from the index stored in that block; if it cannot be
 * found the whole message is freed without sending anything.
 */
void
ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	mblk_t	*mp;
	ipha_t	*ipha;
	ill_t	*ill;
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	/* Detach the packet from the IPSEC_IN block. */
	mp = ipsec_mp->b_cont;
	ipsec_mp->b_cont = NULL;
	ipha = (ipha_t *)mp->b_rptr;
	/* Get ill from index in ipsec_in_t. */
	ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
	    (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL,
	    ipst);
	if (ill != NULL) {
		/* Pick the v4 or v6 error path from the header version. */
		if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
			if (ip_fanout_send_icmp(q, mp, flags,
			    ICMP_DEST_UNREACHABLE,
			    ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInUnknownProtos);
			}
		} else {
			if (ip_fanout_send_icmp_v6(q, mp, flags,
			    ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER,
			    0, B_FALSE, zoneid, ipst)) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInUnknownProtos);
			}
		}
		ill_refrele(ill);
	} else { /* re-link for the freemsg() below. */
		ipsec_mp->b_cont = mp;
	}

	/* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */
	freemsg(ipsec_mp);
}

/*
 * See if the inbound datagram has had IPsec processing applied to it.
 */
boolean_t
ipsec_in_is_secure(mblk_t *ipsec_mp)
{
	ipsec_in_t *ii;

	ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	ASSERT(ii->ipsec_in_type == IPSEC_IN);

	if (ii->ipsec_in_loopback) {
		/* For loopback, the recorded "secure" flag is the answer. */
		return (ii->ipsec_in_secure);
	} else {
		/* Otherwise: an AH or ESP SA is attached, or decapsulated. */
		return (ii->ipsec_in_ah_sa != NULL ||
		    ii->ipsec_in_esp_sa != NULL ||
		    ii->ipsec_in_decaps);
	}
}

/*
 * Handle protocols with which IP is less intimate.  There
 * can be more than one stream bound to a particular
 * protocol.  When this is the case, normally each one gets a copy
 * of any incoming packets.
 *
 * IPsec NOTE :
 *
 * Don't allow a secure packet going up a non-secure connection.
 * We don't allow this because
 *
 * 1) Reply might go out in clear which will be dropped at
 *    the sending side.
 * 2) If the reply goes out in clear it will give the
 *    adversary enough information for getting the key in
 *    most of the cases.
 *
 * Moreover getting a secure packet when we expect clear
 * implies that SA's were added without checking for
 * policy on both ends. This should not happen once ISAKMP
 * is used to negotiate SAs as SAs will be added only after
 * verifying the policy.
 *
 * NOTE : If the packet was tunneled and not multicast we only send
 * it to the first match. Unlike TCP and UDP fanouts this doesn't fall
 * back to delivering packets to AF_INET6 raw sockets.
 *
 * IPQoS Notes:
 * Once we have determined the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy will be false.
 *
 * Zones notes:
 * Currently only applications in the global zone can create raw sockets for
 * protocols other than ICMP. So unlike the broadcast / multicast case of
 * ip_fanout_udp(), we only send a copy of the packet to streams in the
 * specified zone. For ICMP, this is handled by the callers of icmp_inbound().
 */
static void
ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	queue_t	*rq;
	mblk_t	*mp1, *first_mp1;
	uint_t	protocol = ipha->ipha_protocol;
	ipaddr_t dst;
	boolean_t one_only;
	mblk_t *first_mp = mp;
	boolean_t secure;
	uint32_t ill_index;
	conn_t	*connp, *first_connp, *next_connp;
	connf_t	*connfp;
	boolean_t shared_addr;
	mib2_ipIfStatsEntry_t *mibptr;
	/*
	 * NOTE(review): recv_ill is dereferenced here, before the
	 * ASSERT(recv_ill != NULL) checks further down.
	 */
	ip_stack_t *ipst = recv_ill->ill_ipst;
	ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

	/* Fall back to the stack-wide MIB when no ill was supplied. */
	mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib;
	if (mctl_present) {
		mp = first_mp->b_cont;
		secure = ipsec_in_is_secure(first_mp);
		ASSERT(mp != NULL);
	} else {
		secure = B_FALSE;
	}
	dst = ipha->ipha_dst;
	/*
	 * If the packet was tunneled and not multicast we only send it to
	 * the first match.
	 */
	one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) &&
	    !CLASSD(dst));

	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * We don't allow multilevel ports for raw IP, so no need to
		 * check for that here.
		 */
		zoneid = tsol_packet_to_zoneid(mp);
	}

	/* Find the first matching conn in this protocol's fanout bucket. */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	mutex_enter(&connfp->connf_lock);
	/* NOTE(review): redundant - the for loop below sets connp again. */
	connp = connfp->connf_head;
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags,
		    zoneid) &&
		    (!is_system_labeled() ||
		    tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
		    connp))) {
			break;
		}
	}

	if (connp == NULL || connp->conn_upq == NULL) {
		/*
		 * No one bound to these addresses.  Is
		 * there a client that wants all
		 * unclaimed datagrams?
		 */
		mutex_exit(&connfp->connf_lock);
		/*
		 * Check for IPPROTO_ENCAP...
		 */
		if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
			/*
			 * If an IPsec mblk is here on a multicast
			 * tunnel (using ip_mroute stuff), check policy here,
			 * THEN ship off to ip_mroute_decap().
			 *
			 * BTW,  If I match a configured IP-in-IP
			 * tunnel, this path will not be reached, and
			 * ip_mroute_decap will never be called.
			 */
			first_mp = ipsec_check_global_policy(first_mp, connp,
			    ipha, NULL, mctl_present, ipst->ips_netstack);
			if (first_mp != NULL) {
				if (mctl_present)
					freeb(first_mp);
				ip_mroute_decap(q, mp, ill);
			} /* Else we already freed everything! */
		} else {
			/*
			 * Otherwise send an ICMP protocol unreachable.
			 */
			if (ip_fanout_send_icmp(q, first_mp, flags,
			    ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE,
			    mctl_present, zoneid, ipst)) {
				BUMP_MIB(mibptr, ipIfStatsInUnknownProtos);
			}
		}
		return;
	}
	/*
	 * Hold the first match; it receives the original message after
	 * any further matches have been given copies.
	 */
	CONN_INC_REF(connp);
	first_connp = connp;

	/*
	 * Only send message to one tunnel driver by immediately
	 * terminating the loop.
	 */
	connp = one_only ? NULL : connp->conn_next;

	for (;;) {
		/* Advance to the next matching conn, if any. */
		while (connp != NULL) {
			if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill,
			    flags, zoneid) &&
			    (!is_system_labeled() ||
			    tsol_receive_local(mp, &dst, IPV4_VERSION,
			    shared_addr, connp)))
				break;
			connp = connp->conn_next;
		}

		/*
		 * Copy the packet.
		 */
		if (connp == NULL || connp->conn_upq == NULL ||
		    (((first_mp1 = dupmsg(first_mp)) == NULL) &&
		    ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
			/*
			 * No more interested clients or memory
			 * allocation failed
			 */
			connp = first_connp;
			break;
		}
		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
		/* Hold the conn so the fanout lock can be dropped. */
		CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		rq = connp->conn_rq;
		if (!canputnext(rq)) {
			/* Receiver is flow controlled: count and drop. */
			if (flags & IP_FF_RAWIP) {
				BUMP_MIB(mibptr, rawipIfStatsInOverflows);
			} else {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
			}

			freemsg(first_mp1);
		} else {
			/*
			 * Don't enforce here if we're an actual tunnel -
			 * let "tun" do it instead.
			 */
			if (!IPCL_IS_IPTUN(connp) &&
			    (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
			    secure)) {
				first_mp1 = ipsec_check_inbound_policy
				    (first_mp1, connp, ipha, NULL,
				    mctl_present);
			}
			if (first_mp1 != NULL) {
				int in_flags = 0;
				/*
				 * ip_fanout_proto also gets called from
				 * icmp_inbound_error_fanout, in which case
				 * the msg type is M_CTL.  Don't add info
				 * in this case for the time being. In future
				 * when there is a need for knowing the
				 * inbound iface index for ICMP error msgs,
				 * then this can be changed.
				 */
				if (connp->conn_recvif)
					in_flags = IPF_RECVIF;
				/*
				 * The ULP may support IP_RECVPKTINFO for both
				 * IP v4 and v6 so pass the appropriate argument
				 * based on conn IP version.
				 */
				if (connp->conn_ip_recvpktinfo) {
					if (connp->conn_af_isv6) {
						/*
						 * V6 only needs index
						 */
						in_flags |= IPF_RECVIF;
					} else {
						/*
						 * V4 needs index +
						 * matching address.
						 */
						in_flags |= IPF_RECVADDR;
					}
				}
				if ((in_flags != 0) &&
				    (mp->b_datap->db_type != M_CTL)) {
					/*
					 * the actual data will be
					 * contained in b_cont upon
					 * successful return of the
					 * following call else
					 * original mblk is returned
					 */
					ASSERT(recv_ill != NULL);
					mp1 = ip_add_info(mp1, recv_ill,
					    in_flags, IPCL_ZONEID(connp), ipst);
				}
				BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
				/* Only the data is passed up, not the M_CTL. */
				if (mctl_present)
					freeb(first_mp1);
				(connp->conn_recv)(connp, mp1, NULL);
			}
		}
		mutex_enter(&connfp->connf_lock);
		/* Follow the next pointer before releasing the conn. */
		next_connp = connp->conn_next;
		CONN_DEC_REF(connp);
		connp = next_connp;
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);

	/*
	 * If this packet is coming from icmp_inbound_error_fanout ip_policy
	 * will be set to false.
	 *
	 * NOTE(review): "ill" is dereferenced here without a NULL check,
	 * although mibptr above allows for ill == NULL - confirm callers
	 * never pass a NULL ill with ip_policy set.
	 */
	if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
		ill_index = ill->ill_phyint->phyint_ifindex;
		ip_process(IPP_LOCAL_IN, &mp, ill_index);
		if (mp == NULL) {
			/* IPP processing disposed of the data. */
			CONN_DEC_REF(connp);
			if (mctl_present) {
				freeb(first_mp);
			}
			return;
		}
	}

	rq = connp->conn_rq;
	if (!canputnext(rq)) {
		/* Receiver is flow controlled: count and drop. */
		if (flags & IP_FF_RAWIP) {
			BUMP_MIB(mibptr, rawipIfStatsInOverflows);
		} else {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
		}

		freemsg(first_mp);
	} else {
		if (IPCL_IS_IPTUN(connp)) {
			/*
			 * Tunneled packet.  We enforce policy in the tunnel
			 * module itself.
			 *
			 * Send the WHOLE packet up (incl. IPSEC_IN) without
			 * a policy check.
			 * FIXME to use conn_recv for tun later.
			 */
			putnext(rq, first_mp);
			CONN_DEC_REF(connp);
			return;
		}

		if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) {
			first_mp = ipsec_check_inbound_policy(first_mp, connp,
			    ipha, NULL, mctl_present);
		}

		if (first_mp != NULL) {
			int in_flags = 0;

			/*
			 * ip_fanout_proto also gets called
			 * from icmp_inbound_error_fanout, in
			 * which case the msg type is M_CTL.
			 * Don't add info in this case for time
			 * being. In future when there is a
			 * need for knowing the inbound iface
			 * index for ICMP error msgs, then this
			 * can be changed
			 */
			if (connp->conn_recvif)
				in_flags = IPF_RECVIF;
			if (connp->conn_ip_recvpktinfo) {
				if (connp->conn_af_isv6) {
					/*
					 * V6 only needs index
					 */
					in_flags |= IPF_RECVIF;
				} else {
					/*
					 * V4 needs index +
					 * matching address.
					 */
					in_flags |= IPF_RECVADDR;
				}
			}
			if ((in_flags != 0) &&
			    (mp->b_datap->db_type != M_CTL)) {

				/*
				 * the actual data will be contained in
				 * b_cont upon successful return
				 * of the following call else original
				 * mblk is returned
				 */
				ASSERT(recv_ill != NULL);
				mp = ip_add_info(mp, recv_ill,
				    in_flags, IPCL_ZONEID(connp), ipst);
			}
			BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
			(connp->conn_recv)(connp, mp, NULL);
			if (mctl_present)
				freeb(first_mp);
		}
	}
	/* Release the hold taken on the first matching conn. */
	CONN_DEC_REF(connp);
}

/*
 * Fanout for TCP packets
 * The caller puts <fport, lport> in the ports parameter.
 *
 * IPQoS Notes
 * Before sending it to the client, invoke IPPF processing.
 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
 * ip_policy is false.
6644 */ 6645 static void 6646 ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, 6647 uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) 6648 { 6649 mblk_t *first_mp; 6650 boolean_t secure; 6651 uint32_t ill_index; 6652 int ip_hdr_len; 6653 tcph_t *tcph; 6654 boolean_t syn_present = B_FALSE; 6655 conn_t *connp; 6656 ip_stack_t *ipst = recv_ill->ill_ipst; 6657 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6658 6659 ASSERT(recv_ill != NULL); 6660 6661 first_mp = mp; 6662 if (mctl_present) { 6663 ASSERT(first_mp->b_datap->db_type == M_CTL); 6664 mp = first_mp->b_cont; 6665 secure = ipsec_in_is_secure(first_mp); 6666 ASSERT(mp != NULL); 6667 } else { 6668 secure = B_FALSE; 6669 } 6670 6671 ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); 6672 6673 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 6674 zoneid, ipst)) == NULL) { 6675 /* 6676 * No connected connection or listener. Send a 6677 * TH_RST via tcp_xmit_listeners_reset. 6678 */ 6679 6680 /* Initiate IPPf processing, if needed. */ 6681 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 6682 uint32_t ill_index; 6683 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6684 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 6685 if (first_mp == NULL) 6686 return; 6687 } 6688 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6689 ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", 6690 zoneid)); 6691 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6692 ipst->ips_netstack->netstack_tcp, NULL); 6693 return; 6694 } 6695 6696 /* 6697 * Allocate the SYN for the TCP connection here itself 6698 */ 6699 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6700 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 6701 if (IPCL_IS_TCP(connp)) { 6702 squeue_t *sqp; 6703 6704 /* 6705 * For fused tcp loopback, assign the eager's 6706 * squeue to be that of the active connect's. 
6707 * Note that we don't check for IP_FF_LOOPBACK 6708 * here since this routine gets called only 6709 * for loopback (unlike the IPv6 counterpart). 6710 */ 6711 ASSERT(Q_TO_CONN(q) != NULL); 6712 if (do_tcp_fusion && 6713 !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && 6714 !secure && 6715 !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy && 6716 IPCL_IS_TCP(Q_TO_CONN(q))) { 6717 ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); 6718 sqp = Q_TO_CONN(q)->conn_sqp; 6719 } else { 6720 sqp = IP_SQUEUE_GET(lbolt); 6721 } 6722 6723 mp->b_datap->db_struioflag |= STRUIO_EAGER; 6724 DB_CKSUMSTART(mp) = (intptr_t)sqp; 6725 syn_present = B_TRUE; 6726 } 6727 } 6728 6729 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 6730 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 6731 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6732 if ((flags & TH_RST) || (flags & TH_URG)) { 6733 CONN_DEC_REF(connp); 6734 freemsg(first_mp); 6735 return; 6736 } 6737 if (flags & TH_ACK) { 6738 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 6739 ipst->ips_netstack->netstack_tcp, connp); 6740 CONN_DEC_REF(connp); 6741 return; 6742 } 6743 6744 CONN_DEC_REF(connp); 6745 freemsg(first_mp); 6746 return; 6747 } 6748 6749 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 6750 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6751 NULL, mctl_present); 6752 if (first_mp == NULL) { 6753 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6754 CONN_DEC_REF(connp); 6755 return; 6756 } 6757 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 6758 ASSERT(syn_present); 6759 if (mctl_present) { 6760 ASSERT(first_mp != mp); 6761 first_mp->b_datap->db_struioflag |= 6762 STRUIO_POLICY; 6763 } else { 6764 ASSERT(first_mp == mp); 6765 mp->b_datap->db_struioflag &= 6766 ~STRUIO_EAGER; 6767 mp->b_datap->db_struioflag |= 6768 STRUIO_POLICY; 6769 } 6770 } else { 6771 /* 6772 * Discard first_mp early since we're dealing with a 6773 * fully-connected conn_t and tcp doesn't do policy in 
6774 * this case. 6775 */ 6776 if (mctl_present) { 6777 freeb(first_mp); 6778 mctl_present = B_FALSE; 6779 } 6780 first_mp = mp; 6781 } 6782 } 6783 6784 /* 6785 * Initiate policy processing here if needed. If we get here from 6786 * icmp_inbound_error_fanout, ip_policy is false. 6787 */ 6788 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 6789 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6790 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6791 if (mp == NULL) { 6792 CONN_DEC_REF(connp); 6793 if (mctl_present) 6794 freeb(first_mp); 6795 return; 6796 } else if (mctl_present) { 6797 ASSERT(first_mp != mp); 6798 first_mp->b_cont = mp; 6799 } else { 6800 first_mp = mp; 6801 } 6802 } 6803 6804 6805 6806 /* Handle socket options. */ 6807 if (!syn_present && 6808 connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 6809 /* Add header */ 6810 ASSERT(recv_ill != NULL); 6811 /* 6812 * Since tcp does not support IP_RECVPKTINFO for V4, only pass 6813 * IPF_RECVIF. 6814 */ 6815 mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), 6816 ipst); 6817 if (mp == NULL) { 6818 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 6819 CONN_DEC_REF(connp); 6820 if (mctl_present) 6821 freeb(first_mp); 6822 return; 6823 } else if (mctl_present) { 6824 /* 6825 * ip_add_info might return a new mp. 6826 */ 6827 ASSERT(first_mp != mp); 6828 first_mp->b_cont = mp; 6829 } else { 6830 first_mp = mp; 6831 } 6832 } 6833 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 6834 if (IPCL_IS_TCP(connp)) { 6835 /* do not drain, certain use cases can blow the stack */ 6836 squeue_enter_nodrain(connp->conn_sqp, first_mp, 6837 connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); 6838 } else { 6839 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 6840 (connp->conn_recv)(connp, first_mp, NULL); 6841 CONN_DEC_REF(connp); 6842 } 6843 } 6844 6845 /* 6846 * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or 6847 * pass it along to ESP if the SPI is non-zero. 
Returns TRUE if the mblk 6848 * is not consumed. 6849 * 6850 * One of four things can happen, all of which affect the passed-in mblk: 6851 * 6852 * 1.) ICMP messages that go through here just get returned TRUE. 6853 * 6854 * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. 6855 * 6856 * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent 6857 * ESP packet, and is passed along to ESP for consumption. Return FALSE. 6858 * 6859 * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. 6860 */ 6861 static boolean_t 6862 zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, 6863 ipsec_stack_t *ipss) 6864 { 6865 int shift, plen, iph_len; 6866 ipha_t *ipha; 6867 udpha_t *udpha; 6868 uint32_t *spi; 6869 uint32_t esp_ports; 6870 uint8_t *orptr; 6871 boolean_t free_ire; 6872 6873 if (DB_TYPE(mp) == M_CTL) { 6874 /* 6875 * ICMP message with UDP inside. Don't bother stripping, just 6876 * send it up. 6877 * 6878 * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going 6879 * to ignore errors set by ICMP anyway ('cause they might be 6880 * forged), but that's the app's decision, not ours. 6881 */ 6882 6883 /* Bunch of reality checks for DEBUG kernels... */ 6884 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 6885 ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); 6886 6887 return (B_TRUE); 6888 } 6889 6890 ipha = (ipha_t *)mp->b_rptr; 6891 iph_len = IPH_HDR_LENGTH(ipha); 6892 plen = ntohs(ipha->ipha_length); 6893 6894 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { 6895 /* 6896 * Most likely a keepalive for the benefit of an intervening 6897 * NAT. These aren't for us, per se, so drop it. 6898 * 6899 * RFC 3947/8 doesn't say for sure what to do for 2-3 6900 * byte packets (keepalives are 1-byte), but we'll drop them 6901 * also. 
6902 */ 6903 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6904 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); 6905 return (B_FALSE); 6906 } 6907 6908 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { 6909 /* might as well pull it all up - it might be ESP. */ 6910 if (!pullupmsg(mp, -1)) { 6911 ip_drop_packet(mp, B_TRUE, recv_ill, NULL, 6912 DROPPER(ipss, ipds_esp_nomem), 6913 &ipss->ipsec_dropper); 6914 return (B_FALSE); 6915 } 6916 6917 ipha = (ipha_t *)mp->b_rptr; 6918 } 6919 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t)); 6920 if (*spi == 0) { 6921 /* UDP packet - remove 0-spi. */ 6922 shift = sizeof (uint32_t); 6923 } else { 6924 /* ESP-in-UDP packet - reduce to ESP. */ 6925 ipha->ipha_protocol = IPPROTO_ESP; 6926 shift = sizeof (udpha_t); 6927 } 6928 6929 /* Fix IP header */ 6930 ipha->ipha_length = htons(plen - shift); 6931 ipha->ipha_hdr_checksum = 0; 6932 6933 orptr = mp->b_rptr; 6934 mp->b_rptr += shift; 6935 6936 udpha = (udpha_t *)(orptr + iph_len); 6937 if (*spi == 0) { 6938 ASSERT((uint8_t *)ipha == orptr); 6939 udpha->uha_length = htons(plen - shift - iph_len); 6940 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */ 6941 esp_ports = 0; 6942 } else { 6943 esp_ports = *((uint32_t *)udpha); 6944 ASSERT(esp_ports != 0); 6945 } 6946 ovbcopy(orptr, orptr + shift, iph_len); 6947 if (esp_ports != 0) /* Punt up for ESP processing. */ { 6948 ipha = (ipha_t *)(orptr + shift); 6949 6950 free_ire = (ire == NULL); 6951 if (free_ire) { 6952 /* Re-acquire ire. */ 6953 ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, 6954 ipss->ipsec_netstack->netstack_ip); 6955 if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { 6956 if (ire != NULL) 6957 ire_refrele(ire); 6958 /* 6959 * Do a regular freemsg(), as this is an IP 6960 * error (no local route) not an IPsec one. 
6961 */ 6962 freemsg(mp); 6963 } 6964 } 6965 6966 ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports); 6967 if (free_ire) 6968 ire_refrele(ire); 6969 } 6970 6971 return (esp_ports == 0); 6972 } 6973 6974 /* 6975 * Deliver a udp packet to the given conn, possibly applying ipsec policy. 6976 * We are responsible for disposing of mp, such as by freemsg() or putnext() 6977 * Caller is responsible for dropping references to the conn, and freeing 6978 * first_mp. 6979 * 6980 * IPQoS Notes 6981 * Before sending it to the client, invoke IPPF processing. Policy processing 6982 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and 6983 * ip_policy is true. If we get here from icmp_inbound_error_fanout or 6984 * ip_wput_local, ip_policy is false. 6985 */ 6986 static void 6987 ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, 6988 boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, 6989 boolean_t ip_policy) 6990 { 6991 boolean_t mctl_present = (first_mp != NULL); 6992 uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ 6993 uint32_t ill_index; 6994 ip_stack_t *ipst = recv_ill->ill_ipst; 6995 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 6996 6997 ASSERT(ill != NULL); 6998 6999 if (mctl_present) 7000 first_mp->b_cont = mp; 7001 else 7002 first_mp = mp; 7003 7004 if (CONN_UDP_FLOWCTLD(connp)) { 7005 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 7006 freemsg(first_mp); 7007 return; 7008 } 7009 7010 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { 7011 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 7012 NULL, mctl_present); 7013 if (first_mp == NULL) { 7014 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 7015 return; /* Freed by ipsec_check_inbound_policy(). */ 7016 } 7017 } 7018 if (mctl_present) 7019 freeb(first_mp); 7020 7021 /* Let's hope the compilers utter "branch, predict-not-taken..." 
;) */ 7022 if (connp->conn_udp->udp_nat_t_endpoint) { 7023 if (mctl_present) { 7024 /* mctl_present *shouldn't* happen. */ 7025 ip_drop_packet(mp, B_TRUE, NULL, NULL, 7026 DROPPER(ipss, ipds_esp_nat_t_ipsec), 7027 &ipss->ipsec_dropper); 7028 return; 7029 } 7030 7031 if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) 7032 return; 7033 } 7034 7035 /* Handle options. */ 7036 if (connp->conn_recvif) 7037 in_flags = IPF_RECVIF; 7038 /* 7039 * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag 7040 * passed to ip_add_info is based on IP version of connp. 7041 */ 7042 if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { 7043 if (connp->conn_af_isv6) { 7044 /* 7045 * V6 only needs index 7046 */ 7047 in_flags |= IPF_RECVIF; 7048 } else { 7049 /* 7050 * V4 needs index + matching address. 7051 */ 7052 in_flags |= IPF_RECVADDR; 7053 } 7054 } 7055 7056 if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) 7057 in_flags |= IPF_RECVSLLA; 7058 7059 /* 7060 * Initiate IPPF processing here, if needed. Note first_mp won't be 7061 * freed if the packet is dropped. The caller will do so. 7062 */ 7063 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { 7064 ill_index = recv_ill->ill_phyint->phyint_ifindex; 7065 ip_process(IPP_LOCAL_IN, &mp, ill_index); 7066 if (mp == NULL) { 7067 return; 7068 } 7069 } 7070 if ((in_flags != 0) && 7071 (mp->b_datap->db_type != M_CTL)) { 7072 /* 7073 * The actual data will be contained in b_cont 7074 * upon successful return of the following call 7075 * else original mblk is returned 7076 */ 7077 ASSERT(recv_ill != NULL); 7078 mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), 7079 ipst); 7080 } 7081 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 7082 /* Send it upstream */ 7083 (connp->conn_recv)(connp, mp, NULL); 7084 } 7085 7086 /* 7087 * Fanout for UDP packets. 7088 * The caller puts <fport, lport> in the ports parameter. 
7089 * 7090 * If SO_REUSEADDR is set all multicast and broadcast packets 7091 * will be delivered to all streams bound to the same port. 7092 * 7093 * Zones notes: 7094 * Multicast and broadcast packets will be distributed to streams in all zones. 7095 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an 7096 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 7097 * packets. To maintain this behavior with multiple zones, the conns are grouped 7098 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in 7099 * each zone. If unset, all the following conns in the same zone are skipped. 7100 */ 7101 static void 7102 ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 7103 uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, 7104 boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) 7105 { 7106 uint32_t dstport, srcport; 7107 ipaddr_t dst; 7108 mblk_t *first_mp; 7109 boolean_t secure; 7110 in6_addr_t v6src; 7111 conn_t *connp; 7112 connf_t *connfp; 7113 conn_t *first_connp; 7114 conn_t *next_connp; 7115 mblk_t *mp1, *first_mp1; 7116 ipaddr_t src; 7117 zoneid_t last_zoneid; 7118 boolean_t reuseaddr; 7119 boolean_t shared_addr; 7120 boolean_t unlabeled; 7121 ip_stack_t *ipst; 7122 7123 ASSERT(recv_ill != NULL); 7124 ipst = recv_ill->ill_ipst; 7125 7126 first_mp = mp; 7127 if (mctl_present) { 7128 mp = first_mp->b_cont; 7129 first_mp->b_cont = NULL; 7130 secure = ipsec_in_is_secure(first_mp); 7131 ASSERT(mp != NULL); 7132 } else { 7133 first_mp = NULL; 7134 secure = B_FALSE; 7135 } 7136 7137 /* Extract ports in net byte order */ 7138 dstport = htons(ntohl(ports) & 0xFFFF); 7139 srcport = htons(ntohl(ports) >> 16); 7140 dst = ipha->ipha_dst; 7141 src = ipha->ipha_src; 7142 7143 unlabeled = B_FALSE; 7144 if (is_system_labeled()) 7145 /* Cred cannot be null on IPv4 */ 7146 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 7147 TSLF_UNLABELED) != 0; 7148 shared_addr = 
(zoneid == ALL_ZONES); 7149 if (shared_addr) { 7150 /* 7151 * No need to handle exclusive-stack zones since ALL_ZONES 7152 * only applies to the shared stack. 7153 */ 7154 zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); 7155 /* 7156 * If no shared MLP is found, tsol_mlp_findzone returns 7157 * ALL_ZONES. In that case, we assume it's SLP, and 7158 * search for the zone based on the packet label. 7159 * 7160 * If there is such a zone, we prefer to find a 7161 * connection in it. Otherwise, we look for a 7162 * MAC-exempt connection in any zone whose label 7163 * dominates the default label on the packet. 7164 */ 7165 if (zoneid == ALL_ZONES) 7166 zoneid = tsol_packet_to_zoneid(mp); 7167 else 7168 unlabeled = B_FALSE; 7169 } 7170 7171 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7172 mutex_enter(&connfp->connf_lock); 7173 connp = connfp->connf_head; 7174 if (!broadcast && !CLASSD(dst)) { 7175 /* 7176 * Not broadcast or multicast. Send to the one (first) 7177 * client we find. No need to check conn_wantpacket() 7178 * since IP_BOUND_IF/conn_incoming_ill does not apply to 7179 * IPv4 unicast packets. 7180 */ 7181 while ((connp != NULL) && 7182 (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) || 7183 (!IPCL_ZONE_MATCH(connp, zoneid) && 7184 !(unlabeled && connp->conn_mac_exempt)))) { 7185 /* 7186 * We keep searching since the conn did not match, 7187 * or its zone did not match and it is not either 7188 * an allzones conn or a mac exempt conn (if the 7189 * sender is unlabeled.) 
7190 */ 7191 connp = connp->conn_next; 7192 } 7193 7194 if (connp == NULL || connp->conn_upq == NULL) 7195 goto notfound; 7196 7197 if (is_system_labeled() && 7198 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7199 connp)) 7200 goto notfound; 7201 7202 CONN_INC_REF(connp); 7203 mutex_exit(&connfp->connf_lock); 7204 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7205 flags, recv_ill, ip_policy); 7206 IP_STAT(ipst, ip_udp_fannorm); 7207 CONN_DEC_REF(connp); 7208 return; 7209 } 7210 7211 /* 7212 * Broadcast and multicast case 7213 * 7214 * Need to check conn_wantpacket(). 7215 * If SO_REUSEADDR has been set on the first we send the 7216 * packet to all clients that have joined the group and 7217 * match the port. 7218 */ 7219 7220 while (connp != NULL) { 7221 if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && 7222 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7223 (!is_system_labeled() || 7224 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7225 connp))) 7226 break; 7227 connp = connp->conn_next; 7228 } 7229 7230 if (connp == NULL || connp->conn_upq == NULL) 7231 goto notfound; 7232 7233 first_connp = connp; 7234 /* 7235 * When SO_REUSEADDR is not set, send the packet only to the first 7236 * matching connection in its zone by keeping track of the zoneid. 7237 */ 7238 reuseaddr = first_connp->conn_reuseaddr; 7239 last_zoneid = first_connp->conn_zoneid; 7240 7241 CONN_INC_REF(connp); 7242 connp = connp->conn_next; 7243 for (;;) { 7244 while (connp != NULL) { 7245 if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && 7246 (reuseaddr || connp->conn_zoneid != last_zoneid) && 7247 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7248 (!is_system_labeled() || 7249 tsol_receive_local(mp, &dst, IPV4_VERSION, 7250 shared_addr, connp))) 7251 break; 7252 connp = connp->conn_next; 7253 } 7254 /* 7255 * Just copy the data part alone. The mctl part is 7256 * needed just for verifying policy and it is never 7257 * sent up. 
7258 */ 7259 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7260 ((mp1 = copymsg(mp)) == NULL))) { 7261 /* 7262 * No more interested clients or memory 7263 * allocation failed 7264 */ 7265 connp = first_connp; 7266 break; 7267 } 7268 if (connp->conn_zoneid != last_zoneid) { 7269 /* 7270 * Update the zoneid so that the packet isn't sent to 7271 * any more conns in the same zone unless SO_REUSEADDR 7272 * is set. 7273 */ 7274 reuseaddr = connp->conn_reuseaddr; 7275 last_zoneid = connp->conn_zoneid; 7276 } 7277 if (first_mp != NULL) { 7278 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7279 ipsec_info_type == IPSEC_IN); 7280 first_mp1 = ipsec_in_tag(first_mp, NULL, 7281 ipst->ips_netstack); 7282 if (first_mp1 == NULL) { 7283 freemsg(mp1); 7284 connp = first_connp; 7285 break; 7286 } 7287 } else { 7288 first_mp1 = NULL; 7289 } 7290 CONN_INC_REF(connp); 7291 mutex_exit(&connfp->connf_lock); 7292 /* 7293 * IPQoS notes: We don't send the packet for policy 7294 * processing here, will do it for the last one (below). 7295 * i.e. we do it per-packet now, but if we do policy 7296 * processing per-conn, then we would need to do it 7297 * here too. 7298 */ 7299 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7300 ipha, flags, recv_ill, B_FALSE); 7301 mutex_enter(&connfp->connf_lock); 7302 /* Follow the next pointer before releasing the conn. */ 7303 next_connp = connp->conn_next; 7304 IP_STAT(ipst, ip_udp_fanmb); 7305 CONN_DEC_REF(connp); 7306 connp = next_connp; 7307 } 7308 7309 /* Last one. Send it upstream. 
*/ 7310 mutex_exit(&connfp->connf_lock); 7311 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7312 recv_ill, ip_policy); 7313 IP_STAT(ipst, ip_udp_fanmb); 7314 CONN_DEC_REF(connp); 7315 return; 7316 7317 notfound: 7318 7319 mutex_exit(&connfp->connf_lock); 7320 IP_STAT(ipst, ip_udp_fanothers); 7321 /* 7322 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses 7323 * have already been matched above, since they live in the IPv4 7324 * fanout tables. This implies we only need to 7325 * check for IPv6 in6addr_any endpoints here. 7326 * Thus we compare using ipv6_all_zeros instead of the destination 7327 * address, except for the multicast group membership lookup which 7328 * uses the IPv4 destination. 7329 */ 7330 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 7331 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; 7332 mutex_enter(&connfp->connf_lock); 7333 connp = connfp->connf_head; 7334 if (!broadcast && !CLASSD(dst)) { 7335 while (connp != NULL) { 7336 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7337 srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) && 7338 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7339 !connp->conn_ipv6_v6only) 7340 break; 7341 connp = connp->conn_next; 7342 } 7343 7344 if (connp != NULL && is_system_labeled() && 7345 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7346 connp)) 7347 connp = NULL; 7348 7349 if (connp == NULL || connp->conn_upq == NULL) { 7350 /* 7351 * No one bound to this port. Is 7352 * there a client that wants all 7353 * unclaimed datagrams? 7354 */ 7355 mutex_exit(&connfp->connf_lock); 7356 7357 if (mctl_present) 7358 first_mp->b_cont = mp; 7359 else 7360 first_mp = mp; 7361 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP]. 
7362 connf_head != NULL) { 7363 ip_fanout_proto(q, first_mp, ill, ipha, 7364 flags | IP_FF_RAWIP, mctl_present, 7365 ip_policy, recv_ill, zoneid); 7366 } else { 7367 if (ip_fanout_send_icmp(q, first_mp, flags, 7368 ICMP_DEST_UNREACHABLE, 7369 ICMP_PORT_UNREACHABLE, 7370 mctl_present, zoneid, ipst)) { 7371 BUMP_MIB(ill->ill_ip_mib, 7372 udpIfStatsNoPorts); 7373 } 7374 } 7375 return; 7376 } 7377 7378 CONN_INC_REF(connp); 7379 mutex_exit(&connfp->connf_lock); 7380 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, 7381 flags, recv_ill, ip_policy); 7382 CONN_DEC_REF(connp); 7383 return; 7384 } 7385 /* 7386 * IPv4 multicast packet being delivered to an AF_INET6 7387 * in6addr_any endpoint. 7388 * Need to check conn_wantpacket(). Note that we use conn_wantpacket() 7389 * and not conn_wantpacket_v6() since any multicast membership is 7390 * for an IPv4-mapped multicast address. 7391 * The packet is sent to all clients in all zones that have joined the 7392 * group and match the port. 7393 */ 7394 while (connp != NULL) { 7395 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 7396 srcport, v6src) && 7397 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7398 (!is_system_labeled() || 7399 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 7400 connp))) 7401 break; 7402 connp = connp->conn_next; 7403 } 7404 7405 if (connp == NULL || connp->conn_upq == NULL) { 7406 /* 7407 * No one bound to this port. Is 7408 * there a client that wants all 7409 * unclaimed datagrams? 
7410 */ 7411 mutex_exit(&connfp->connf_lock); 7412 7413 if (mctl_present) 7414 first_mp->b_cont = mp; 7415 else 7416 first_mp = mp; 7417 if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head != 7418 NULL) { 7419 ip_fanout_proto(q, first_mp, ill, ipha, 7420 flags | IP_FF_RAWIP, mctl_present, ip_policy, 7421 recv_ill, zoneid); 7422 } else { 7423 /* 7424 * We used to attempt to send an icmp error here, but 7425 * since this is known to be a multicast packet 7426 * and we don't send icmp errors in response to 7427 * multicast, just drop the packet and give up sooner. 7428 */ 7429 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); 7430 freemsg(first_mp); 7431 } 7432 return; 7433 } 7434 7435 first_connp = connp; 7436 7437 CONN_INC_REF(connp); 7438 connp = connp->conn_next; 7439 for (;;) { 7440 while (connp != NULL) { 7441 if (IPCL_UDP_MATCH_V6(connp, dstport, 7442 ipv6_all_zeros, srcport, v6src) && 7443 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 7444 (!is_system_labeled() || 7445 tsol_receive_local(mp, &dst, IPV4_VERSION, 7446 shared_addr, connp))) 7447 break; 7448 connp = connp->conn_next; 7449 } 7450 /* 7451 * Just copy the data part alone. The mctl part is 7452 * needed just for verifying policy and it is never 7453 * sent up. 7454 */ 7455 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 7456 ((mp1 = copymsg(mp)) == NULL))) { 7457 /* 7458 * No more intested clients or memory 7459 * allocation failed 7460 */ 7461 connp = first_connp; 7462 break; 7463 } 7464 if (first_mp != NULL) { 7465 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 7466 ipsec_info_type == IPSEC_IN); 7467 first_mp1 = ipsec_in_tag(first_mp, NULL, 7468 ipst->ips_netstack); 7469 if (first_mp1 == NULL) { 7470 freemsg(mp1); 7471 connp = first_connp; 7472 break; 7473 } 7474 } else { 7475 first_mp1 = NULL; 7476 } 7477 CONN_INC_REF(connp); 7478 mutex_exit(&connfp->connf_lock); 7479 /* 7480 * IPQoS notes: We don't send the packet for policy 7481 * processing here, will do it for the last one (below). 
7482 * i.e. we do it per-packet now, but if we do policy 7483 * processing per-conn, then we would need to do it 7484 * here too. 7485 */ 7486 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, 7487 ipha, flags, recv_ill, B_FALSE); 7488 mutex_enter(&connfp->connf_lock); 7489 /* Follow the next pointer before releasing the conn. */ 7490 next_connp = connp->conn_next; 7491 CONN_DEC_REF(connp); 7492 connp = next_connp; 7493 } 7494 7495 /* Last one. Send it upstream. */ 7496 mutex_exit(&connfp->connf_lock); 7497 ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, 7498 recv_ill, ip_policy); 7499 CONN_DEC_REF(connp); 7500 } 7501 7502 /* 7503 * Complete the ip_wput header so that it 7504 * is possible to generate ICMP 7505 * errors. 7506 */ 7507 int 7508 ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) 7509 { 7510 ire_t *ire; 7511 7512 if (ipha->ipha_src == INADDR_ANY) { 7513 ire = ire_lookup_local(zoneid, ipst); 7514 if (ire == NULL) { 7515 ip1dbg(("ip_hdr_complete: no source IRE\n")); 7516 return (1); 7517 } 7518 ipha->ipha_src = ire->ire_addr; 7519 ire_refrele(ire); 7520 } 7521 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 7522 ipha->ipha_hdr_checksum = 0; 7523 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 7524 return (0); 7525 } 7526 7527 /* 7528 * Nobody should be sending 7529 * packets up this stream 7530 */ 7531 static void 7532 ip_lrput(queue_t *q, mblk_t *mp) 7533 { 7534 mblk_t *mp1; 7535 7536 switch (mp->b_datap->db_type) { 7537 case M_FLUSH: 7538 /* Turn around */ 7539 if (*mp->b_rptr & FLUSHW) { 7540 *mp->b_rptr &= ~FLUSHR; 7541 qreply(q, mp); 7542 return; 7543 } 7544 break; 7545 } 7546 /* Could receive messages that passed through ar_rput */ 7547 for (mp1 = mp; mp1; mp1 = mp1->b_cont) 7548 mp1->b_prev = mp1->b_next = NULL; 7549 freemsg(mp); 7550 } 7551 7552 /* Nobody should be sending packets down this stream */ 7553 /* ARGSUSED */ 7554 void 7555 ip_lwput(queue_t *q, mblk_t *mp) 7556 { 7557 freemsg(mp); 7558 } 7559 7560 /* 7561 * 
Move the first hop in any source route to ipha_dst and remove that part of 7562 * the source route. Called by other protocols. Errors in option formatting 7563 * are ignored - will be handled by ip_wput_options Return the final 7564 * destination (either ipha_dst or the last entry in a source route.) 7565 */ 7566 ipaddr_t 7567 ip_massage_options(ipha_t *ipha, netstack_t *ns) 7568 { 7569 ipoptp_t opts; 7570 uchar_t *opt; 7571 uint8_t optval; 7572 uint8_t optlen; 7573 ipaddr_t dst; 7574 int i; 7575 ire_t *ire; 7576 ip_stack_t *ipst = ns->netstack_ip; 7577 7578 ip2dbg(("ip_massage_options\n")); 7579 dst = ipha->ipha_dst; 7580 for (optval = ipoptp_first(&opts, ipha); 7581 optval != IPOPT_EOL; 7582 optval = ipoptp_next(&opts)) { 7583 opt = opts.ipoptp_cur; 7584 switch (optval) { 7585 uint8_t off; 7586 case IPOPT_SSRR: 7587 case IPOPT_LSRR: 7588 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 7589 ip1dbg(("ip_massage_options: bad src route\n")); 7590 break; 7591 } 7592 optlen = opts.ipoptp_len; 7593 off = opt[IPOPT_OFFSET]; 7594 off--; 7595 redo_srr: 7596 if (optlen < IP_ADDR_LEN || 7597 off > optlen - IP_ADDR_LEN) { 7598 /* End of source route */ 7599 ip1dbg(("ip_massage_options: end of SR\n")); 7600 break; 7601 } 7602 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 7603 ip1dbg(("ip_massage_options: next hop 0x%x\n", 7604 ntohl(dst))); 7605 /* 7606 * Check if our address is present more than 7607 * once as consecutive hops in source route. 7608 * XXX verify per-interface ip_forwarding 7609 * for source route? 
7610 */ 7611 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 7612 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7613 if (ire != NULL) { 7614 ire_refrele(ire); 7615 off += IP_ADDR_LEN; 7616 goto redo_srr; 7617 } 7618 if (dst == htonl(INADDR_LOOPBACK)) { 7619 ip1dbg(("ip_massage_options: loopback addr in " 7620 "source route!\n")); 7621 break; 7622 } 7623 /* 7624 * Update ipha_dst to be the first hop and remove the 7625 * first hop from the source route (by overwriting 7626 * part of the option with NOP options). 7627 */ 7628 ipha->ipha_dst = dst; 7629 /* Put the last entry in dst */ 7630 off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) + 7631 3; 7632 bcopy(&opt[off], &dst, IP_ADDR_LEN); 7633 7634 ip1dbg(("ip_massage_options: last hop 0x%x\n", 7635 ntohl(dst))); 7636 /* Move down and overwrite */ 7637 opt[IP_ADDR_LEN] = opt[0]; 7638 opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN; 7639 opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET]; 7640 for (i = 0; i < IP_ADDR_LEN; i++) 7641 opt[i] = IPOPT_NOP; 7642 break; 7643 } 7644 } 7645 return (dst); 7646 } 7647 7648 /* 7649 * Return the network mask 7650 * associated with the specified address. 
7651 */ 7652 ipaddr_t 7653 ip_net_mask(ipaddr_t addr) 7654 { 7655 uchar_t *up = (uchar_t *)&addr; 7656 ipaddr_t mask = 0; 7657 uchar_t *maskp = (uchar_t *)&mask; 7658 7659 #if defined(__i386) || defined(__amd64) 7660 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER 7661 #endif 7662 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER 7663 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0; 7664 #endif 7665 if (CLASSD(addr)) { 7666 maskp[0] = 0xF0; 7667 return (mask); 7668 } 7669 7670 /* We assume Class E default netmask to be 32 */ 7671 if (CLASSE(addr)) 7672 return (0xffffffffU); 7673 7674 if (addr == 0) 7675 return (0); 7676 maskp[0] = 0xFF; 7677 if ((up[0] & 0x80) == 0) 7678 return (mask); 7679 7680 maskp[1] = 0xFF; 7681 if ((up[0] & 0xC0) == 0x80) 7682 return (mask); 7683 7684 maskp[2] = 0xFF; 7685 if ((up[0] & 0xE0) == 0xC0) 7686 return (mask); 7687 7688 /* Otherwise return no mask */ 7689 return ((ipaddr_t)0); 7690 } 7691 7692 /* 7693 * Select an ill for the packet by considering load spreading across 7694 * a different ill in the group if dst_ill is part of some group. 7695 */ 7696 ill_t * 7697 ip_newroute_get_dst_ill(ill_t *dst_ill) 7698 { 7699 ill_t *ill; 7700 7701 /* 7702 * We schedule irrespective of whether the source address is 7703 * INADDR_ANY or not. illgrp_scheduler returns a held ill. 7704 */ 7705 ill = illgrp_scheduler(dst_ill); 7706 if (ill == NULL) 7707 return (NULL); 7708 7709 /* 7710 * For groups with names ip_sioctl_groupname ensures that all 7711 * ills are of same type. For groups without names, ifgrp_insert 7712 * ensures this. 7713 */ 7714 ASSERT(dst_ill->ill_type == ill->ill_type); 7715 7716 return (ill); 7717 } 7718 7719 /* 7720 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. 
7721 */ 7722 ill_t * 7723 ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6, 7724 ip_stack_t *ipst) 7725 { 7726 ill_t *ret_ill; 7727 7728 ASSERT(ifindex != 0); 7729 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 7730 ipst); 7731 if (ret_ill == NULL || 7732 (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { 7733 if (isv6) { 7734 if (ill != NULL) { 7735 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7736 } else { 7737 BUMP_MIB(&ipst->ips_ip6_mib, 7738 ipIfStatsOutDiscards); 7739 } 7740 ip1dbg(("ip_grab_attach_ill (IPv6): " 7741 "bad ifindex %d.\n", ifindex)); 7742 } else { 7743 if (ill != NULL) { 7744 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 7745 } else { 7746 BUMP_MIB(&ipst->ips_ip_mib, 7747 ipIfStatsOutDiscards); 7748 } 7749 ip1dbg(("ip_grab_attach_ill (IPv4): " 7750 "bad ifindex %d.\n", ifindex)); 7751 } 7752 if (ret_ill != NULL) 7753 ill_refrele(ret_ill); 7754 freemsg(first_mp); 7755 return (NULL); 7756 } 7757 7758 return (ret_ill); 7759 } 7760 7761 /* 7762 * IPv4 - 7763 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 7764 * out a packet to a destination address for which we do not have specific 7765 * (or sufficient) routing information. 7766 * 7767 * NOTE : These are the scopes of some of the variables that point at IRE, 7768 * which needs to be followed while making any future modifications 7769 * to avoid memory leaks. 7770 * 7771 * - ire and sire are the entries looked up initially by 7772 * ire_ftable_lookup. 7773 * - ipif_ire is used to hold the interface ire associated with 7774 * the new cache ire. But it's scope is limited, so we always REFRELE 7775 * it before branching out to error paths. 7776 * - save_ire is initialized before ire_create, so that ire returned 7777 * by ire_create will not over-write the ire. We REFRELE save_ire 7778 * before breaking out of the switch. 
7779 * 7780 * Thus on failures, we have to REFRELE only ire and sire, if they 7781 * are not NULL. 7782 */ 7783 void 7784 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, 7785 zoneid_t zoneid, ip_stack_t *ipst) 7786 { 7787 areq_t *areq; 7788 ipaddr_t gw = 0; 7789 ire_t *ire = NULL; 7790 mblk_t *res_mp; 7791 ipaddr_t *addrp; 7792 ipaddr_t nexthop_addr; 7793 ipif_t *src_ipif = NULL; 7794 ill_t *dst_ill = NULL; 7795 ipha_t *ipha; 7796 ire_t *sire = NULL; 7797 mblk_t *first_mp; 7798 ire_t *save_ire; 7799 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 7800 ushort_t ire_marks = 0; 7801 boolean_t mctl_present; 7802 ipsec_out_t *io; 7803 mblk_t *saved_mp; 7804 ire_t *first_sire = NULL; 7805 mblk_t *copy_mp = NULL; 7806 mblk_t *xmit_mp = NULL; 7807 ipaddr_t save_dst; 7808 uint32_t multirt_flags = 7809 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 7810 boolean_t multirt_is_resolvable; 7811 boolean_t multirt_resolve_next; 7812 boolean_t unspec_src; 7813 boolean_t do_attach_ill = B_FALSE; 7814 boolean_t ip_nexthop = B_FALSE; 7815 tsol_ire_gw_secattr_t *attrp = NULL; 7816 tsol_gcgrp_t *gcgrp = NULL; 7817 tsol_gcgrp_addr_t ga; 7818 7819 if (ip_debug > 2) { 7820 /* ip1dbg */ 7821 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7822 } 7823 7824 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7825 if (mctl_present) { 7826 io = (ipsec_out_t *)first_mp->b_rptr; 7827 ASSERT(io->ipsec_out_type == IPSEC_OUT); 7828 ASSERT(zoneid == io->ipsec_out_zoneid); 7829 ASSERT(zoneid != ALL_ZONES); 7830 } 7831 7832 ipha = (ipha_t *)mp->b_rptr; 7833 7834 /* All multicast lookups come through ip_newroute_ipif() */ 7835 if (CLASSD(dst)) { 7836 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7837 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7838 freemsg(first_mp); 7839 return; 7840 } 7841 7842 if (mctl_present && io->ipsec_out_attach_if) { 7843 /* ip_grab_attach_ill returns a held ill */ 7844 attach_ill = ip_grab_attach_ill(NULL, 
first_mp, 7845 io->ipsec_out_ill_index, B_FALSE, ipst); 7846 7847 /* Failure case frees things for us. */ 7848 if (attach_ill == NULL) 7849 return; 7850 7851 /* 7852 * Check if we need an ire that will not be 7853 * looked up by anybody else i.e. HIDDEN. 7854 */ 7855 if (ill_is_probeonly(attach_ill)) 7856 ire_marks = IRE_MARK_HIDDEN; 7857 } 7858 if (mctl_present && io->ipsec_out_ip_nexthop) { 7859 ip_nexthop = B_TRUE; 7860 nexthop_addr = io->ipsec_out_nexthop_addr; 7861 } 7862 /* 7863 * If this IRE is created for forwarding or it is not for 7864 * traffic for congestion controlled protocols, mark it as temporary. 7865 */ 7866 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7867 ire_marks |= IRE_MARK_TEMPORARY; 7868 7869 /* 7870 * Get what we can from ire_ftable_lookup which will follow an IRE 7871 * chain until it gets the most specific information available. 7872 * For example, we know that there is no IRE_CACHE for this dest, 7873 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7874 * ire_ftable_lookup will look up the gateway, etc. 7875 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7876 * to the destination, of equal netmask length in the forward table, 7877 * will be recursively explored. If no information is available 7878 * for the final gateway of that route, we force the returned ire 7879 * to be equal to sire using MATCH_IRE_PARENT. 7880 * At least, in this case we have a starting point (in the buckets) 7881 * to look for other routes to the destination in the forward table. 7882 * This is actually used only for multirouting, where a list 7883 * of routes has to be processed in sequence. 7884 * 7885 * In the process of coming up with the most specific information, 7886 * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry 7887 * for the gateway (i.e., one for which the ire_nce->nce_state is 7888 * not yet ND_REACHABLE, and is in the middle of arp resolution). 
7889 * Two caveats when handling incomplete ire's in ip_newroute: 7890 * - we should be careful when accessing its ire_nce (specifically 7891 * the nce_res_mp) as it might change underneath our feet, and, 7892 * - not all legacy code path callers are prepared to handle 7893 * incomplete ire's, so we should not create/add incomplete 7894 * ire_cache entries here. (See discussion about temporary solution 7895 * further below). 7896 * 7897 * In order to minimize packet dropping, and to preserve existing 7898 * behavior, we treat this case as if there were no IRE_CACHE for the 7899 * gateway, and instead use the IF_RESOLVER ire to send out 7900 * another request to ARP (this is achieved by passing the 7901 * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the 7902 * arp response comes back in ip_wput_nondata, we will create 7903 * a per-dst ire_cache that has an ND_COMPLETE ire. 7904 * 7905 * Note that this is a temporary solution; the correct solution is 7906 * to create an incomplete per-dst ire_cache entry, and send the 7907 * packet out when the gw's nce is resolved. In order to achieve this, 7908 * all packet processing must have been completed prior to calling 7909 * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need 7910 * to be modified to accommodate this solution. 7911 */ 7912 if (ip_nexthop) { 7913 /* 7914 * The first time we come here, we look for an IRE_INTERFACE 7915 * entry for the specified nexthop, set the dst to be the 7916 * nexthop address and create an IRE_CACHE entry for the 7917 * nexthop. The next time around, we are able to find an 7918 * IRE_CACHE entry for the nexthop, set the gateway to be the 7919 * nexthop address and create an IRE_CACHE entry for the 7920 * destination address via the specified nexthop.
7921 */ 7922 ire = ire_cache_lookup(nexthop_addr, zoneid, 7923 MBLK_GETLABEL(mp), ipst); 7924 if (ire != NULL) { 7925 gw = nexthop_addr; 7926 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7927 } else { 7928 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7929 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7930 MBLK_GETLABEL(mp), 7931 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 7932 ipst); 7933 if (ire != NULL) { 7934 dst = nexthop_addr; 7935 } 7936 } 7937 } else if (attach_ill == NULL) { 7938 ire = ire_ftable_lookup(dst, 0, 0, 0, 7939 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7940 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7941 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7942 MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, 7943 ipst); 7944 } else { 7945 /* 7946 * attach_ill is set only for communicating with 7947 * on-link hosts. So, don't look for DEFAULT. 7948 */ 7949 ipif_t *attach_ipif; 7950 7951 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7952 if (attach_ipif == NULL) { 7953 ill_refrele(attach_ill); 7954 goto icmp_err_ret; 7955 } 7956 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7957 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7958 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7959 MATCH_IRE_SECATTR, ipst); 7960 ipif_refrele(attach_ipif); 7961 } 7962 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7963 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7964 7965 /* 7966 * This loop is run only once in most cases. 7967 * We loop to resolve further routes only when the destination 7968 * can be reached through multiple RTF_MULTIRT-flagged ires. 7969 */ 7970 do { 7971 /* Clear the previous iteration's values */ 7972 if (src_ipif != NULL) { 7973 ipif_refrele(src_ipif); 7974 src_ipif = NULL; 7975 } 7976 if (dst_ill != NULL) { 7977 ill_refrele(dst_ill); 7978 dst_ill = NULL; 7979 } 7980 7981 multirt_resolve_next = B_FALSE; 7982 /* 7983 * We check if packets have to be multirouted. 7984 * In this case, given the current <ire, sire> couple, 7985 * we look for the next suitable <ire, sire>. 
7986 * This check is done in ire_multirt_lookup(), 7987 * which applies various criteria to find the next route 7988 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7989 * unchanged if it detects it has not been tried yet. 7990 */ 7991 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7992 ip3dbg(("ip_newroute: starting next_resolution " 7993 "with first_mp %p, tag %d\n", 7994 (void *)first_mp, 7995 MULTIRT_DEBUG_TAGGED(first_mp))); 7996 7997 ASSERT(sire != NULL); 7998 multirt_is_resolvable = 7999 ire_multirt_lookup(&ire, &sire, multirt_flags, 8000 MBLK_GETLABEL(mp), ipst); 8001 8002 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 8003 "ire %p, sire %p\n", 8004 multirt_is_resolvable, 8005 (void *)ire, (void *)sire)); 8006 8007 if (!multirt_is_resolvable) { 8008 /* 8009 * No more multirt route to resolve; give up 8010 * (all routes resolved or no more 8011 * resolvable routes). 8012 */ 8013 if (ire != NULL) { 8014 ire_refrele(ire); 8015 ire = NULL; 8016 } 8017 } else { 8018 ASSERT(sire != NULL); 8019 ASSERT(ire != NULL); 8020 /* 8021 * We simply use first_sire as a flag that 8022 * indicates if a resolvable multirt route 8023 * has already been found. 8024 * If it is not the case, we may have to send 8025 * an ICMP error to report that the 8026 * destination is unreachable. 8027 * We do not IRE_REFHOLD first_sire. 8028 */ 8029 if (first_sire == NULL) { 8030 first_sire = sire; 8031 } 8032 } 8033 } 8034 if (ire == NULL) { 8035 if (ip_debug > 3) { 8036 /* ip2dbg */ 8037 pr_addr_dbg("ip_newroute: " 8038 "can't resolve %s\n", AF_INET, &dst); 8039 } 8040 ip3dbg(("ip_newroute: " 8041 "ire %p, sire %p, first_sire %p\n", 8042 (void *)ire, (void *)sire, (void *)first_sire)); 8043 8044 if (sire != NULL) { 8045 ire_refrele(sire); 8046 sire = NULL; 8047 } 8048 8049 if (first_sire != NULL) { 8050 /* 8051 * At least one multirt route has been found 8052 * in the same call to ip_newroute(); 8053 * there is no need to report an ICMP error. 
8054 * first_sire was not IRE_REFHOLDed. 8055 */ 8056 MULTIRT_DEBUG_UNTAG(first_mp); 8057 freemsg(first_mp); 8058 return; 8059 } 8060 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 8061 RTA_DST, ipst); 8062 if (attach_ill != NULL) 8063 ill_refrele(attach_ill); 8064 goto icmp_err_ret; 8065 } 8066 8067 /* 8068 * Verify that the returned IRE does not have either 8069 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 8070 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 8071 */ 8072 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 8073 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 8074 if (attach_ill != NULL) 8075 ill_refrele(attach_ill); 8076 goto icmp_err_ret; 8077 } 8078 /* 8079 * Increment the ire_ob_pkt_count field for ire if it is an 8080 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 8081 * increment the same for the parent IRE, sire, if it is some 8082 * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) 8083 */ 8084 if ((ire->ire_type & IRE_INTERFACE) != 0) { 8085 UPDATE_OB_PKT_COUNT(ire); 8086 ire->ire_last_used_time = lbolt; 8087 } 8088 8089 if (sire != NULL) { 8090 gw = sire->ire_gateway_addr; 8091 ASSERT((sire->ire_type & (IRE_CACHETABLE | 8092 IRE_INTERFACE)) == 0); 8093 UPDATE_OB_PKT_COUNT(sire); 8094 sire->ire_last_used_time = lbolt; 8095 } 8096 /* 8097 * We have a route to reach the destination. 8098 * 8099 * 1) If the interface is part of ill group, try to get a new 8100 * ill taking load spreading into account. 8101 * 8102 * 2) After selecting the ill, get a source address that 8103 * might create good inbound load spreading. 8104 * ipif_select_source does this for us. 8105 * 8106 * If the application specified the ill (ifindex), we still 8107 * load spread. Only if the packets needs to go out 8108 * specifically on a given ill e.g. binding to 8109 * IPIF_NOFAILOVER address, then we don't try to use a 8110 * different ill for load spreading. 
8111 */ 8112 if (attach_ill == NULL) { 8113 /* 8114 * Don't perform outbound load spreading in the 8115 * case of an RTF_MULTIRT route, as we actually 8116 * typically want to replicate outgoing packets 8117 * through particular interfaces. 8118 */ 8119 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8120 dst_ill = ire->ire_ipif->ipif_ill; 8121 /* for uniformity */ 8122 ill_refhold(dst_ill); 8123 } else { 8124 /* 8125 * If we are here trying to create an IRE_CACHE 8126 * for an offlink destination and have the 8127 * IRE_CACHE for the next hop and the latter is 8128 * using virtual IP source address selection i.e 8129 * it's ire->ire_ipif is pointing to a virtual 8130 * network interface (vni) then 8131 * ip_newroute_get_dst_ll() will return the vni 8132 * interface as the dst_ill. Since the vni is 8133 * virtual i.e not associated with any physical 8134 * interface, it cannot be the dst_ill, hence 8135 * in such a case call ip_newroute_get_dst_ll() 8136 * with the stq_ill instead of the ire_ipif ILL. 8137 * The function returns a refheld ill. 8138 */ 8139 if ((ire->ire_type == IRE_CACHE) && 8140 IS_VNI(ire->ire_ipif->ipif_ill)) 8141 dst_ill = ip_newroute_get_dst_ill( 8142 ire->ire_stq->q_ptr); 8143 else 8144 dst_ill = ip_newroute_get_dst_ill( 8145 ire->ire_ipif->ipif_ill); 8146 } 8147 if (dst_ill == NULL) { 8148 if (ip_debug > 2) { 8149 pr_addr_dbg("ip_newroute: " 8150 "no dst ill for dst" 8151 " %s\n", AF_INET, &dst); 8152 } 8153 goto icmp_err_ret; 8154 } 8155 } else { 8156 dst_ill = ire->ire_ipif->ipif_ill; 8157 /* for uniformity */ 8158 ill_refhold(dst_ill); 8159 /* 8160 * We should have found a route matching ill as we 8161 * called ire_ftable_lookup with MATCH_IRE_ILL. 8162 * Rather than asserting, when there is a mismatch, 8163 * we just drop the packet. 
8164 */ 8165 if (dst_ill != attach_ill) { 8166 ip0dbg(("ip_newroute: Packet dropped as " 8167 "IPIF_NOFAILOVER ill is %s, " 8168 "ire->ire_ipif->ipif_ill is %s\n", 8169 attach_ill->ill_name, 8170 dst_ill->ill_name)); 8171 ill_refrele(attach_ill); 8172 goto icmp_err_ret; 8173 } 8174 } 8175 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 8176 if (attach_ill != NULL) { 8177 ill_refrele(attach_ill); 8178 attach_ill = NULL; 8179 do_attach_ill = B_TRUE; 8180 } 8181 ASSERT(dst_ill != NULL); 8182 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 8183 8184 /* 8185 * Pick the best source address from dst_ill. 8186 * 8187 * 1) If it is part of a multipathing group, we would 8188 * like to spread the inbound packets across different 8189 * interfaces. ipif_select_source picks a random source 8190 * across the different ills in the group. 8191 * 8192 * 2) If it is not part of a multipathing group, we try 8193 * to pick the source address from the destination 8194 * route. Clustering assumes that when we have multiple 8195 * prefixes hosted on an interface, the prefix of the 8196 * source address matches the prefix of the destination 8197 * route. We do this only if the address is not 8198 * DEPRECATED. 8199 * 8200 * 3) If the conn is in a different zone than the ire, we 8201 * need to pick a source address from the right zone. 8202 * 8203 * NOTE : If we hit case (1) above, the prefix of the source 8204 * address picked may not match the prefix of the 8205 * destination routes prefix as ipif_select_source 8206 * does not look at "dst" while picking a source 8207 * address. 8208 * If we want the same behavior as (2), we will need 8209 * to change the behavior of ipif_select_source. 8210 */ 8211 ASSERT(src_ipif == NULL); 8212 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 8213 /* 8214 * The RTF_SETSRC flag is set in the parent ire (sire). 8215 * Check that the ipif matching the requested source 8216 * address still exists. 
8217 */ 8218 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 8219 zoneid, NULL, NULL, NULL, NULL, ipst); 8220 } 8221 8222 unspec_src = (connp != NULL && connp->conn_unspec_src); 8223 8224 if (src_ipif == NULL && 8225 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 8226 ire_marks |= IRE_MARK_USESRC_CHECK; 8227 if ((dst_ill->ill_group != NULL) || 8228 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 8229 (connp != NULL && ire->ire_zoneid != zoneid && 8230 ire->ire_zoneid != ALL_ZONES) || 8231 (dst_ill->ill_usesrc_ifindex != 0)) { 8232 /* 8233 * If the destination is reachable via a 8234 * given gateway, the selected source address 8235 * should be in the same subnet as the gateway. 8236 * Otherwise, the destination is not reachable. 8237 * 8238 * If there are no interfaces on the same subnet 8239 * as the destination, ipif_select_source gives 8240 * first non-deprecated interface which might be 8241 * on a different subnet than the gateway. 8242 * This is not desirable. Hence pass the dst_ire 8243 * source address to ipif_select_source. 8244 * It is sure that the destination is reachable 8245 * with the dst_ire source address subnet. 8246 * So passing dst_ire source address to 8247 * ipif_select_source will make sure that the 8248 * selected source will be on the same subnet 8249 * as dst_ire source address. 8250 */ 8251 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 8252 src_ipif = ipif_select_source(dst_ill, saddr, 8253 zoneid); 8254 if (src_ipif == NULL) { 8255 if (ip_debug > 2) { 8256 pr_addr_dbg("ip_newroute: " 8257 "no src for dst %s ", 8258 AF_INET, &dst); 8259 printf("through interface %s\n", 8260 dst_ill->ill_name); 8261 } 8262 goto icmp_err_ret; 8263 } 8264 } else { 8265 src_ipif = ire->ire_ipif; 8266 ASSERT(src_ipif != NULL); 8267 /* hold src_ipif for uniformity */ 8268 ipif_refhold(src_ipif); 8269 } 8270 } 8271 8272 /* 8273 * Assign a source address while we have the conn. 
8274 * We can't have ip_wput_ire pick a source address when the 8275 * packet returns from arp since we need to look at 8276 * conn_unspec_src and conn_zoneid, and we lose the conn when 8277 * going through arp. 8278 * 8279 * NOTE : ip_newroute_v6 does not have this piece of code as 8280 * it uses ip6i to store this information. 8281 */ 8282 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 8283 ipha->ipha_src = src_ipif->ipif_src_addr; 8284 8285 if (ip_debug > 3) { 8286 /* ip2dbg */ 8287 pr_addr_dbg("ip_newroute: first hop %s\n", 8288 AF_INET, &gw); 8289 } 8290 ip2dbg(("\tire type %s (%d)\n", 8291 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 8292 8293 /* 8294 * The TTL of multirouted packets is bounded by the 8295 * ip_multirt_ttl ndd variable. 8296 */ 8297 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 8298 /* Force TTL of multirouted packets */ 8299 if ((ipst->ips_ip_multirt_ttl > 0) && 8300 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 8301 ip2dbg(("ip_newroute: forcing multirt TTL " 8302 "to %d (was %d), dst 0x%08x\n", 8303 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 8304 ntohl(sire->ire_addr))); 8305 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 8306 } 8307 } 8308 /* 8309 * At this point in ip_newroute(), ire is either the 8310 * IRE_CACHE of the next-hop gateway for an off-subnet 8311 * destination or an IRE_INTERFACE type that should be used 8312 * to resolve an on-subnet destination or an on-subnet 8313 * next-hop gateway. 8314 * 8315 * In the IRE_CACHE case, we have the following : 8316 * 8317 * 1) src_ipif - used for getting a source address. 8318 * 8319 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8320 * means packets using this IRE_CACHE will go out on 8321 * dst_ill. 8322 * 8323 * 3) The IRE sire will point to the prefix that is the 8324 * longest matching route for the destination. These 8325 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. 
8326 * 8327 * The newly created IRE_CACHE entry for the off-subnet 8328 * destination is tied to both the prefix route and the 8329 * interface route used to resolve the next-hop gateway 8330 * via the ire_phandle and ire_ihandle fields, 8331 * respectively. 8332 * 8333 * In the IRE_INTERFACE case, we have the following : 8334 * 8335 * 1) src_ipif - used for getting a source address. 8336 * 8337 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 8338 * means packets using the IRE_CACHE that we will build 8339 * here will go out on dst_ill. 8340 * 8341 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 8342 * to be created will only be tied to the IRE_INTERFACE 8343 * that was derived from the ire_ihandle field. 8344 * 8345 * If sire is non-NULL, it means the destination is 8346 * off-link and we will first create the IRE_CACHE for the 8347 * gateway. Next time through ip_newroute, we will create 8348 * the IRE_CACHE for the final destination as described 8349 * above. 8350 * 8351 * In both cases, after the current resolution has been 8352 * completed (or possibly initialised, in the IRE_INTERFACE 8353 * case), the loop may be re-entered to attempt the resolution 8354 * of another RTF_MULTIRT route. 8355 * 8356 * When an IRE_CACHE entry for the off-subnet destination is 8357 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 8358 * for further processing in emission loops. 8359 */ 8360 save_ire = ire; 8361 switch (ire->ire_type) { 8362 case IRE_CACHE: { 8363 ire_t *ipif_ire; 8364 8365 ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); 8366 if (gw == 0) 8367 gw = ire->ire_gateway_addr; 8368 /* 8369 * We need 3 ire's to create a new cache ire for an 8370 * off-link destination from the cache ire of the 8371 * gateway. 8372 * 8373 * 1. The prefix ire 'sire' (Note that this does 8374 * not apply to the conn_nexthop_set case) 8375 * 2. The cache ire of the gateway 'ire' 8376 * 3. 
The interface ire 'ipif_ire' 8377 * 8378 * We have (1) and (2). We lookup (3) below. 8379 * 8380 * If there is no interface route to the gateway, 8381 * it is a race condition, where we found the cache 8382 * but the interface route has been deleted. 8383 */ 8384 if (ip_nexthop) { 8385 ipif_ire = ire_ihandle_lookup_onlink(ire); 8386 } else { 8387 ipif_ire = 8388 ire_ihandle_lookup_offlink(ire, sire); 8389 } 8390 if (ipif_ire == NULL) { 8391 ip1dbg(("ip_newroute: " 8392 "ire_ihandle_lookup_offlink failed\n")); 8393 goto icmp_err_ret; 8394 } 8395 8396 /* 8397 * Check cached gateway IRE for any security 8398 * attributes; if found, associate the gateway 8399 * credentials group to the destination IRE. 8400 */ 8401 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 8402 mutex_enter(&attrp->igsa_lock); 8403 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 8404 GCGRP_REFHOLD(gcgrp); 8405 mutex_exit(&attrp->igsa_lock); 8406 } 8407 8408 /* 8409 * XXX For the source of the resolver mp, 8410 * we are using the same DL_UNITDATA_REQ 8411 * (from save_ire->ire_nce->nce_res_mp) 8412 * though the save_ire is not pointing at the same ill. 8413 * This is incorrect. We need to send it up to the 8414 * resolver to get the right res_mp. For ethernets 8415 * this may be okay (ill_type == DL_ETHER). 8416 */ 8417 8418 ire = ire_create( 8419 (uchar_t *)&dst, /* dest address */ 8420 (uchar_t *)&ip_g_all_ones, /* mask */ 8421 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8422 (uchar_t *)&gw, /* gateway address */ 8423 &save_ire->ire_max_frag, 8424 save_ire->ire_nce, /* src nce */ 8425 dst_ill->ill_rq, /* recv-from queue */ 8426 dst_ill->ill_wq, /* send-to queue */ 8427 IRE_CACHE, /* IRE type */ 8428 src_ipif, 8429 (sire != NULL) ? 8430 sire->ire_mask : 0, /* Parent mask */ 8431 (sire != NULL) ? 8432 sire->ire_phandle : 0, /* Parent handle */ 8433 ipif_ire->ire_ihandle, /* Interface handle */ 8434 (sire != NULL) ? 
(sire->ire_flags & 8435 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 8436 (sire != NULL) ? 8437 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 8438 NULL, 8439 gcgrp, 8440 ipst); 8441 8442 if (ire == NULL) { 8443 if (gcgrp != NULL) { 8444 GCGRP_REFRELE(gcgrp); 8445 gcgrp = NULL; 8446 } 8447 ire_refrele(ipif_ire); 8448 ire_refrele(save_ire); 8449 break; 8450 } 8451 8452 /* reference now held by IRE */ 8453 gcgrp = NULL; 8454 8455 ire->ire_marks |= ire_marks; 8456 8457 /* 8458 * Prevent sire and ipif_ire from getting deleted. 8459 * The newly created ire is tied to both of them via 8460 * the phandle and ihandle respectively. 8461 */ 8462 if (sire != NULL) { 8463 IRB_REFHOLD(sire->ire_bucket); 8464 /* Has it been removed already ? */ 8465 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 8466 IRB_REFRELE(sire->ire_bucket); 8467 ire_refrele(ipif_ire); 8468 ire_refrele(save_ire); 8469 break; 8470 } 8471 } 8472 8473 IRB_REFHOLD(ipif_ire->ire_bucket); 8474 /* Has it been removed already ? */ 8475 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 8476 IRB_REFRELE(ipif_ire->ire_bucket); 8477 if (sire != NULL) 8478 IRB_REFRELE(sire->ire_bucket); 8479 ire_refrele(ipif_ire); 8480 ire_refrele(save_ire); 8481 break; 8482 } 8483 8484 xmit_mp = first_mp; 8485 /* 8486 * In the case of multirouting, a copy 8487 * of the packet is done before its sending. 8488 * The copy is used to attempt another 8489 * route resolution, in a next loop. 8490 */ 8491 if (ire->ire_flags & RTF_MULTIRT) { 8492 copy_mp = copymsg(first_mp); 8493 if (copy_mp != NULL) { 8494 xmit_mp = copy_mp; 8495 MULTIRT_DEBUG_TAG(first_mp); 8496 } 8497 } 8498 ire_add_then_send(q, ire, xmit_mp); 8499 ire_refrele(save_ire); 8500 8501 /* Assert that sire is not deleted yet. */ 8502 if (sire != NULL) { 8503 ASSERT(sire->ire_ptpn != NULL); 8504 IRB_REFRELE(sire->ire_bucket); 8505 } 8506 8507 /* Assert that ipif_ire is not deleted yet. 
*/ 8508 ASSERT(ipif_ire->ire_ptpn != NULL); 8509 IRB_REFRELE(ipif_ire->ire_bucket); 8510 ire_refrele(ipif_ire); 8511 8512 /* 8513 * If copy_mp is not NULL, multirouting was 8514 * requested. We loop to initiate a next 8515 * route resolution attempt, starting from sire. 8516 */ 8517 if (copy_mp != NULL) { 8518 /* 8519 * Search for the next unresolved 8520 * multirt route. 8521 */ 8522 copy_mp = NULL; 8523 ipif_ire = NULL; 8524 ire = NULL; 8525 multirt_resolve_next = B_TRUE; 8526 continue; 8527 } 8528 if (sire != NULL) 8529 ire_refrele(sire); 8530 ipif_refrele(src_ipif); 8531 ill_refrele(dst_ill); 8532 return; 8533 } 8534 case IRE_IF_NORESOLVER: { 8535 8536 if (dst_ill->ill_phys_addr_length != IP_ADDR_LEN && 8537 dst_ill->ill_resolver_mp == NULL) { 8538 ip1dbg(("ip_newroute: dst_ill %p " 8539 "for IRE_IF_NORESOLVER ire %p has " 8540 "no ill_resolver_mp\n", 8541 (void *)dst_ill, (void *)ire)); 8542 break; 8543 } 8544 8545 /* 8546 * TSol note: We are creating the ire cache for the 8547 * destination 'dst'. If 'dst' is offlink, going 8548 * through the first hop 'gw', the security attributes 8549 * of 'dst' must be set to point to the gateway 8550 * credentials of gateway 'gw'. If 'dst' is onlink, it 8551 * is possible that 'dst' is a potential gateway that is 8552 * referenced by some route that has some security 8553 * attributes. Thus in the former case, we need to do a 8554 * gcgrp_lookup of 'gw' while in the latter case we 8555 * need to do gcgrp_lookup of 'dst' itself. 8556 */ 8557 ga.ga_af = AF_INET; 8558 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 8559 &ga.ga_addr); 8560 gcgrp = gcgrp_lookup(&ga, B_FALSE); 8561 8562 ire = ire_create( 8563 (uchar_t *)&dst, /* dest address */ 8564 (uchar_t *)&ip_g_all_ones, /* mask */ 8565 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8566 (uchar_t *)&gw, /* gateway address */ 8567 &save_ire->ire_max_frag, 8568 NULL, /* no src nce */ 8569 dst_ill->ill_rq, /* recv-from queue */ 8570 dst_ill->ill_wq, /* send-to queue */ 8571 IRE_CACHE, 8572 src_ipif, 8573 save_ire->ire_mask, /* Parent mask */ 8574 (sire != NULL) ? /* Parent handle */ 8575 sire->ire_phandle : 0, 8576 save_ire->ire_ihandle, /* Interface handle */ 8577 (sire != NULL) ? sire->ire_flags & 8578 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 8579 &(save_ire->ire_uinfo), 8580 NULL, 8581 gcgrp, 8582 ipst); 8583 8584 if (ire == NULL) { 8585 if (gcgrp != NULL) { 8586 GCGRP_REFRELE(gcgrp); 8587 gcgrp = NULL; 8588 } 8589 ire_refrele(save_ire); 8590 break; 8591 } 8592 8593 /* reference now held by IRE */ 8594 gcgrp = NULL; 8595 8596 ire->ire_marks |= ire_marks; 8597 8598 /* Prevent save_ire from getting deleted */ 8599 IRB_REFHOLD(save_ire->ire_bucket); 8600 /* Has it been removed already ? */ 8601 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8602 IRB_REFRELE(save_ire->ire_bucket); 8603 ire_refrele(save_ire); 8604 break; 8605 } 8606 8607 /* 8608 * In the case of multirouting, a copy 8609 * of the packet is made before it is sent. 8610 * The copy is used in the next 8611 * loop to attempt another resolution. 8612 */ 8613 xmit_mp = first_mp; 8614 if ((sire != NULL) && 8615 (sire->ire_flags & RTF_MULTIRT)) { 8616 copy_mp = copymsg(first_mp); 8617 if (copy_mp != NULL) { 8618 xmit_mp = copy_mp; 8619 MULTIRT_DEBUG_TAG(first_mp); 8620 } 8621 } 8622 ire_add_then_send(q, ire, xmit_mp); 8623 8624 /* Assert that it is not deleted yet. 
*/ 8625 ASSERT(save_ire->ire_ptpn != NULL); 8626 IRB_REFRELE(save_ire->ire_bucket); 8627 ire_refrele(save_ire); 8628 8629 if (copy_mp != NULL) { 8630 /* 8631 * If we found a (no)resolver, we ignore any 8632 * trailing top priority IRE_CACHE in further 8633 * loops. This ensures that we do not omit any 8634 * (no)resolver. 8635 * This IRE_CACHE, if any, will be processed 8636 * by another thread entering ip_newroute(). 8637 * IRE_CACHE entries, if any, will be processed 8638 * by another thread entering ip_newroute(), 8639 * (upon resolver response, for instance). 8640 * This aims to force parallel multirt 8641 * resolutions as soon as a packet must be sent. 8642 * In the best case, after the tx of only one 8643 * packet, all reachable routes are resolved. 8644 * Otherwise, the resolution of all RTF_MULTIRT 8645 * routes would require several emissions. 8646 */ 8647 multirt_flags &= ~MULTIRT_CACHEGW; 8648 8649 /* 8650 * Search for the next unresolved multirt 8651 * route. 8652 */ 8653 copy_mp = NULL; 8654 save_ire = NULL; 8655 ire = NULL; 8656 multirt_resolve_next = B_TRUE; 8657 continue; 8658 } 8659 8660 /* 8661 * Don't need sire anymore 8662 */ 8663 if (sire != NULL) 8664 ire_refrele(sire); 8665 8666 ipif_refrele(src_ipif); 8667 ill_refrele(dst_ill); 8668 return; 8669 } 8670 case IRE_IF_RESOLVER: 8671 /* 8672 * We can't build an IRE_CACHE yet, but at least we 8673 * found a resolver that can help. 8674 */ 8675 res_mp = dst_ill->ill_resolver_mp; 8676 if (!OK_RESOLVER_MP(res_mp)) 8677 break; 8678 8679 /* 8680 * To be at this point in the code with a non-zero gw 8681 * means that dst is reachable through a gateway that 8682 * we have never resolved. By changing dst to the gw 8683 * addr we resolve the gateway first. 8684 * When ire_add_then_send() tries to put the IP dg 8685 * to dst, it will reenter ip_newroute() at which 8686 * time we will find the IRE_CACHE for the gw and 8687 * create another IRE_CACHE in case IRE_CACHE above. 
8688 */ 8689 if (gw != INADDR_ANY) { 8690 /* 8691 * The source ipif that was determined above was 8692 * relative to the destination address, not the 8693 * gateway's. If src_ipif was not taken out of 8694 * the IRE_IF_RESOLVER entry, we'll need to call 8695 * ipif_select_source() again. 8696 */ 8697 if (src_ipif != ire->ire_ipif) { 8698 ipif_refrele(src_ipif); 8699 src_ipif = ipif_select_source(dst_ill, 8700 gw, zoneid); 8701 if (src_ipif == NULL) { 8702 if (ip_debug > 2) { 8703 pr_addr_dbg( 8704 "ip_newroute: no " 8705 "src for gw %s ", 8706 AF_INET, &gw); 8707 printf("through " 8708 "interface %s\n", 8709 dst_ill->ill_name); 8710 } 8711 goto icmp_err_ret; 8712 } 8713 } 8714 save_dst = dst; 8715 dst = gw; 8716 gw = INADDR_ANY; 8717 } 8718 8719 /* 8720 * We obtain a partial IRE_CACHE which we will pass 8721 * along with the resolver query. When the response 8722 * comes back it will be there ready for us to add. 8723 * The ire_max_frag is atomically set under the 8724 * irebucket lock in ire_add_v[46]. 
8725 */ 8726 8727 ire = ire_create_mp( 8728 (uchar_t *)&dst, /* dest address */ 8729 (uchar_t *)&ip_g_all_ones, /* mask */ 8730 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8731 (uchar_t *)&gw, /* gateway address */ 8732 NULL, /* ire_max_frag */ 8733 NULL, /* no src nce */ 8734 dst_ill->ill_rq, /* recv-from queue */ 8735 dst_ill->ill_wq, /* send-to queue */ 8736 IRE_CACHE, 8737 src_ipif, /* Interface ipif */ 8738 save_ire->ire_mask, /* Parent mask */ 8739 0, 8740 save_ire->ire_ihandle, /* Interface handle */ 8741 0, /* flags if any */ 8742 &(save_ire->ire_uinfo), 8743 NULL, 8744 NULL, 8745 ipst); 8746 8747 if (ire == NULL) { 8748 ire_refrele(save_ire); 8749 break; 8750 } 8751 8752 if ((sire != NULL) && 8753 (sire->ire_flags & RTF_MULTIRT)) { 8754 copy_mp = copymsg(first_mp); 8755 if (copy_mp != NULL) 8756 MULTIRT_DEBUG_TAG(copy_mp); 8757 } 8758 8759 ire->ire_marks |= ire_marks; 8760 8761 /* 8762 * Construct message chain for the resolver 8763 * of the form: 8764 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8765 * Packet could contain a IPSEC_OUT mp. 8766 * 8767 * NOTE : ire will be added later when the response 8768 * comes back from ARP. If the response does not 8769 * come back, ARP frees the packet. For this reason, 8770 * we can't REFHOLD the bucket of save_ire to prevent 8771 * deletions. We may not be able to REFRELE the bucket 8772 * if the response never comes back. Thus, before 8773 * adding the ire, ire_add_v4 will make sure that the 8774 * interface route does not get deleted. This is the 8775 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8776 * where we can always prevent deletions because of 8777 * the synchronous nature of adding IRES i.e 8778 * ire_add_then_send is called after creating the IRE. 
8779 */ 8780 ASSERT(ire->ire_mp != NULL); 8781 ire->ire_mp->b_cont = first_mp; 8782 /* Have saved_mp handy, for cleanup if canput fails */ 8783 saved_mp = mp; 8784 mp = copyb(res_mp); 8785 if (mp == NULL) { 8786 /* Prepare for cleanup */ 8787 mp = saved_mp; /* pkt */ 8788 ire_delete(ire); /* ire_mp */ 8789 ire = NULL; 8790 ire_refrele(save_ire); 8791 if (copy_mp != NULL) { 8792 MULTIRT_DEBUG_UNTAG(copy_mp); 8793 freemsg(copy_mp); 8794 copy_mp = NULL; 8795 } 8796 break; 8797 } 8798 linkb(mp, ire->ire_mp); 8799 8800 /* 8801 * Fill in the source and dest addrs for the resolver. 8802 * NOTE: this depends on memory layouts imposed by 8803 * ill_init(). 8804 */ 8805 areq = (areq_t *)mp->b_rptr; 8806 addrp = (ipaddr_t *)((char *)areq + 8807 areq->areq_sender_addr_offset); 8808 if (do_attach_ill) { 8809 /* 8810 * This is bind to no failover case. 8811 * arp packet also must go out on attach_ill. 8812 */ 8813 ASSERT(ipha->ipha_src != NULL); 8814 *addrp = ipha->ipha_src; 8815 } else { 8816 *addrp = save_ire->ire_src_addr; 8817 } 8818 8819 ire_refrele(save_ire); 8820 addrp = (ipaddr_t *)((char *)areq + 8821 areq->areq_target_addr_offset); 8822 *addrp = dst; 8823 /* Up to the resolver. */ 8824 if (canputnext(dst_ill->ill_rq) && 8825 !(dst_ill->ill_arp_closing)) { 8826 putnext(dst_ill->ill_rq, mp); 8827 ire = NULL; 8828 if (copy_mp != NULL) { 8829 /* 8830 * If we found a resolver, we ignore 8831 * any trailing top priority IRE_CACHE 8832 * in the further loops. This ensures 8833 * that we do not omit any resolver. 8834 * IRE_CACHE entries, if any, will be 8835 * processed next time we enter 8836 * ip_newroute(). 8837 */ 8838 multirt_flags &= ~MULTIRT_CACHEGW; 8839 /* 8840 * Search for the next unresolved 8841 * multirt route. 8842 */ 8843 first_mp = copy_mp; 8844 copy_mp = NULL; 8845 /* Prepare the next resolution loop. 
*/ 8846 mp = first_mp; 8847 EXTRACT_PKT_MP(mp, first_mp, 8848 mctl_present); 8849 if (mctl_present) 8850 io = (ipsec_out_t *) 8851 first_mp->b_rptr; 8852 ipha = (ipha_t *)mp->b_rptr; 8853 8854 ASSERT(sire != NULL); 8855 8856 dst = save_dst; 8857 multirt_resolve_next = B_TRUE; 8858 continue; 8859 } 8860 8861 if (sire != NULL) 8862 ire_refrele(sire); 8863 8864 /* 8865 * The response will come back in ip_wput 8866 * with db_type IRE_DB_TYPE. 8867 */ 8868 ipif_refrele(src_ipif); 8869 ill_refrele(dst_ill); 8870 return; 8871 } else { 8872 /* Prepare for cleanup */ 8873 DTRACE_PROBE1(ip__newroute__drop, mblk_t *, 8874 mp); 8875 mp->b_cont = NULL; 8876 freeb(mp); /* areq */ 8877 /* 8878 * this is an ire that is not added to the 8879 * cache. ire_freemblk will handle the release 8880 * of any resources associated with the ire. 8881 */ 8882 ire_delete(ire); /* ire_mp */ 8883 mp = saved_mp; /* pkt */ 8884 ire = NULL; 8885 if (copy_mp != NULL) { 8886 MULTIRT_DEBUG_UNTAG(copy_mp); 8887 freemsg(copy_mp); 8888 copy_mp = NULL; 8889 } 8890 break; 8891 } 8892 default: 8893 break; 8894 } 8895 } while (multirt_resolve_next); 8896 8897 ip1dbg(("ip_newroute: dropped\n")); 8898 /* Did this packet originate externally? 
*/ 8899 if (mp->b_prev) { 8900 mp->b_next = NULL; 8901 mp->b_prev = NULL; 8902 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 8903 } else { 8904 if (dst_ill != NULL) { 8905 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); 8906 } else { 8907 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 8908 } 8909 } 8910 ASSERT(copy_mp == NULL); 8911 MULTIRT_DEBUG_UNTAG(first_mp); 8912 freemsg(first_mp); 8913 if (ire != NULL) 8914 ire_refrele(ire); 8915 if (sire != NULL) 8916 ire_refrele(sire); 8917 if (src_ipif != NULL) 8918 ipif_refrele(src_ipif); 8919 if (dst_ill != NULL) 8920 ill_refrele(dst_ill); 8921 return; 8922 8923 icmp_err_ret: 8924 ip1dbg(("ip_newroute: no route\n")); 8925 if (src_ipif != NULL) 8926 ipif_refrele(src_ipif); 8927 if (dst_ill != NULL) 8928 ill_refrele(dst_ill); 8929 if (sire != NULL) 8930 ire_refrele(sire); 8931 /* Did this packet originate externally? */ 8932 if (mp->b_prev) { 8933 mp->b_next = NULL; 8934 mp->b_prev = NULL; 8935 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes); 8936 q = WR(q); 8937 } else { 8938 /* 8939 * There is no outgoing ill, so just increment the 8940 * system MIB. 8941 */ 8942 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 8943 /* 8944 * Since ip_wput() isn't close to finished, we fill 8945 * in enough of the header for credible error reporting. 8946 */ 8947 if (ip_hdr_complete(ipha, zoneid, ipst)) { 8948 /* Failed */ 8949 MULTIRT_DEBUG_UNTAG(first_mp); 8950 freemsg(first_mp); 8951 if (ire != NULL) 8952 ire_refrele(ire); 8953 return; 8954 } 8955 } 8956 8957 /* 8958 * At this point we will have ire only if RTF_BLACKHOLE 8959 * or RTF_REJECT flags are set on the IRE. It will not 8960 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 
8961 */ 8962 if (ire != NULL) { 8963 if (ire->ire_flags & RTF_BLACKHOLE) { 8964 ire_refrele(ire); 8965 MULTIRT_DEBUG_UNTAG(first_mp); 8966 freemsg(first_mp); 8967 return; 8968 } 8969 ire_refrele(ire); 8970 } 8971 if (ip_source_routed(ipha, ipst)) { 8972 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED, 8973 zoneid, ipst); 8974 return; 8975 } 8976 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 8977 } 8978 8979 ip_opt_info_t zero_info; 8980 8981 /* 8982 * IPv4 - 8983 * ip_newroute_ipif is called by ip_wput_multicast and 8984 * ip_rput_forward_multicast whenever we need to send 8985 * out a packet to a destination address for which we do not have specific 8986 * routing information. It is used when the packet will be sent out 8987 * on a specific interface. It is also called by ip_wput() when IP_BOUND_IF 8988 * socket option is set or icmp error message wants to go out on a particular 8989 * interface for a unicast packet. 8990 * 8991 * In most cases, the destination address is resolved thanks to the ipif 8992 * intrinsic resolver. However, there are some cases where the call to 8993 * ip_newroute_ipif must take into account the potential presence of 8994 * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire 8995 * that uses the interface. This is specified through flags, 8996 * which can be a combination of: 8997 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8998 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8999 * and flags. Additionally, the packet source address has to be set to 9000 * the specified address. The caller is thus expected to set this flag 9001 * if the packet has no specific source address yet. 9002 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT 9003 * flag, the resulting ire will inherit the flag. All unresolved routes 9004 * to the destination must be explored in the same call to 9005 * ip_newroute_ipif(). 
9006 */ 9007 static void 9008 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, 9009 conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop) 9010 { 9011 areq_t *areq; 9012 ire_t *ire = NULL; 9013 mblk_t *res_mp; 9014 ipaddr_t *addrp; 9015 mblk_t *first_mp; 9016 ire_t *save_ire = NULL; 9017 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ 9018 ipif_t *src_ipif = NULL; 9019 ushort_t ire_marks = 0; 9020 ill_t *dst_ill = NULL; 9021 boolean_t mctl_present; 9022 ipsec_out_t *io; 9023 ipha_t *ipha; 9024 int ihandle = 0; 9025 mblk_t *saved_mp; 9026 ire_t *fire = NULL; 9027 mblk_t *copy_mp = NULL; 9028 boolean_t multirt_resolve_next; 9029 boolean_t unspec_src; 9030 ipaddr_t ipha_dst; 9031 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 9032 9033 /* 9034 * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold 9035 * here for uniformity 9036 */ 9037 ipif_refhold(ipif); 9038 9039 /* 9040 * This loop is run only once in most cases. 9041 * We loop to resolve further routes only when the destination 9042 * can be reached through multiple RTF_MULTIRT-flagged ires. 9043 */ 9044 do { 9045 if (dst_ill != NULL) { 9046 ill_refrele(dst_ill); 9047 dst_ill = NULL; 9048 } 9049 if (src_ipif != NULL) { 9050 ipif_refrele(src_ipif); 9051 src_ipif = NULL; 9052 } 9053 multirt_resolve_next = B_FALSE; 9054 9055 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), 9056 ipif->ipif_ill->ill_name)); 9057 9058 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 9059 if (mctl_present) 9060 io = (ipsec_out_t *)first_mp->b_rptr; 9061 9062 ipha = (ipha_t *)mp->b_rptr; 9063 9064 /* 9065 * Save the packet destination address, we may need it after 9066 * the packet has been consumed. 9067 */ 9068 ipha_dst = ipha->ipha_dst; 9069 9070 /* 9071 * If the interface is a pt-pt interface we look for an 9072 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the 9073 * local_address and the pt-pt destination address. Otherwise 9074 * we just match the local address. 
9075 * NOTE: dst could be different than ipha->ipha_dst in case 9076 * of sending igmp multicast packets over a point-to-point 9077 * connection. 9078 * Thus we must be careful enough to check ipha_dst to be a 9079 * multicast address, otherwise it will take xmit_if path for 9080 * multicast packets resulting into kernel stack overflow by 9081 * repeated calls to ip_newroute_ipif from ire_send(). 9082 */ 9083 if (CLASSD(ipha_dst) && 9084 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { 9085 goto err_ret; 9086 } 9087 9088 /* 9089 * We check if an IRE_OFFSUBNET for the addr that goes through 9090 * ipif exists. We need it to determine if the RTF_SETSRC and/or 9091 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may 9092 * propagate its flags to the new ire. 9093 */ 9094 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { 9095 fire = ipif_lookup_multi_ire(ipif, ipha_dst); 9096 ip2dbg(("ip_newroute_ipif: " 9097 "ipif_lookup_multi_ire(" 9098 "ipif %p, dst %08x) = fire %p\n", 9099 (void *)ipif, ntohl(dst), (void *)fire)); 9100 } 9101 9102 if (mctl_present && io->ipsec_out_attach_if) { 9103 attach_ill = ip_grab_attach_ill(NULL, first_mp, 9104 io->ipsec_out_ill_index, B_FALSE, ipst); 9105 9106 /* Failure case frees things for us. */ 9107 if (attach_ill == NULL) { 9108 ipif_refrele(ipif); 9109 if (fire != NULL) 9110 ire_refrele(fire); 9111 return; 9112 } 9113 9114 /* 9115 * Check if we need an ire that will not be 9116 * looked up by anybody else i.e. HIDDEN. 9117 */ 9118 if (ill_is_probeonly(attach_ill)) { 9119 ire_marks = IRE_MARK_HIDDEN; 9120 } 9121 /* 9122 * ip_wput passes the right ipif for IPIF_NOFAILOVER 9123 * case. 9124 */ 9125 dst_ill = ipif->ipif_ill; 9126 /* attach_ill has been refheld by ip_grab_attach_ill */ 9127 ASSERT(dst_ill == attach_ill); 9128 } else { 9129 /* 9130 * If the interface belongs to an interface group, 9131 * make sure the next possible interface in the group 9132 * is used. 
This encourages load spreading among 9133 * peers in an interface group. 9134 * Note: load spreading is disabled for RTF_MULTIRT 9135 * routes. 9136 */ 9137 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9138 (fire->ire_flags & RTF_MULTIRT)) { 9139 /* 9140 * Don't perform outbound load spreading 9141 * in the case of an RTF_MULTIRT issued route, 9142 * we actually typically want to replicate 9143 * outgoing packets through particular 9144 * interfaces. 9145 */ 9146 dst_ill = ipif->ipif_ill; 9147 ill_refhold(dst_ill); 9148 } else { 9149 dst_ill = ip_newroute_get_dst_ill( 9150 ipif->ipif_ill); 9151 } 9152 if (dst_ill == NULL) { 9153 if (ip_debug > 2) { 9154 pr_addr_dbg("ip_newroute_ipif: " 9155 "no dst ill for dst %s\n", 9156 AF_INET, &dst); 9157 } 9158 goto err_ret; 9159 } 9160 } 9161 9162 /* 9163 * Pick a source address preferring non-deprecated ones. 9164 * Unlike ip_newroute, we don't do any source address 9165 * selection here since for multicast it really does not help 9166 * in inbound load spreading as in the unicast case. 9167 */ 9168 if ((flags & RTF_SETSRC) && (fire != NULL) && 9169 (fire->ire_flags & RTF_SETSRC)) { 9170 /* 9171 * As requested by flags, an IRE_OFFSUBNET was looked up 9172 * on that interface. This ire has RTF_SETSRC flag, so 9173 * the source address of the packet must be changed. 9174 * Check that the ipif matching the requested source 9175 * address still exists. 
9176 */ 9177 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, 9178 zoneid, NULL, NULL, NULL, NULL, ipst); 9179 } 9180 9181 unspec_src = (connp != NULL && connp->conn_unspec_src); 9182 9183 if (((!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || 9184 (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || 9185 (connp != NULL && ipif->ipif_zoneid != zoneid && 9186 ipif->ipif_zoneid != ALL_ZONES)) && 9187 (src_ipif == NULL) && 9188 (!unspec_src || ipha->ipha_src != INADDR_ANY)) { 9189 src_ipif = ipif_select_source(dst_ill, dst, zoneid); 9190 if (src_ipif == NULL) { 9191 if (ip_debug > 2) { 9192 /* ip1dbg */ 9193 pr_addr_dbg("ip_newroute_ipif: " 9194 "no src for dst %s", 9195 AF_INET, &dst); 9196 } 9197 ip1dbg((" through interface %s\n", 9198 dst_ill->ill_name)); 9199 goto err_ret; 9200 } 9201 ipif_refrele(ipif); 9202 ipif = src_ipif; 9203 ipif_refhold(ipif); 9204 } 9205 if (src_ipif == NULL) { 9206 src_ipif = ipif; 9207 ipif_refhold(src_ipif); 9208 } 9209 9210 /* 9211 * Assign a source address while we have the conn. 9212 * We can't have ip_wput_ire pick a source address when the 9213 * packet returns from arp since conn_unspec_src might be set 9214 * and we lose the conn when going through arp. 9215 */ 9216 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 9217 ipha->ipha_src = src_ipif->ipif_src_addr; 9218 9219 /* 9220 * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible 9221 * that the outgoing interface does not have an interface ire. 9222 */ 9223 if (CLASSD(ipha_dst) && (connp == NULL || 9224 connp->conn_outgoing_ill == NULL) && 9225 infop->ip_opt_ill_index == 0) { 9226 /* ipif_to_ire returns an held ire */ 9227 ire = ipif_to_ire(ipif); 9228 if (ire == NULL) 9229 goto err_ret; 9230 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 9231 goto err_ret; 9232 /* 9233 * ihandle is needed when the ire is added to 9234 * cache table. 
9235 */ 9236 save_ire = ire; 9237 ihandle = save_ire->ire_ihandle; 9238 9239 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " 9240 "flags %04x\n", 9241 (void *)ire, (void *)ipif, flags)); 9242 if ((flags & RTF_MULTIRT) && (fire != NULL) && 9243 (fire->ire_flags & RTF_MULTIRT)) { 9244 /* 9245 * As requested by flags, an IRE_OFFSUBNET was 9246 * looked up on that interface. This ire has 9247 * RTF_MULTIRT flag, so the resolution loop will 9248 * be re-entered to resolve additional routes on 9249 * other interfaces. For that purpose, a copy of 9250 * the packet is performed at this point. 9251 */ 9252 fire->ire_last_used_time = lbolt; 9253 copy_mp = copymsg(first_mp); 9254 if (copy_mp) { 9255 MULTIRT_DEBUG_TAG(copy_mp); 9256 } 9257 } 9258 if ((flags & RTF_SETSRC) && (fire != NULL) && 9259 (fire->ire_flags & RTF_SETSRC)) { 9260 /* 9261 * As requested by flags, an IRE_OFFSUBET was 9262 * looked up on that interface. This ire has 9263 * RTF_SETSRC flag, so the source address of the 9264 * packet must be changed. 9265 */ 9266 ipha->ipha_src = fire->ire_src_addr; 9267 } 9268 } else { 9269 ASSERT((connp == NULL) || 9270 (connp->conn_outgoing_ill != NULL) || 9271 (connp->conn_dontroute) || 9272 infop->ip_opt_ill_index != 0); 9273 /* 9274 * The only ways we can come here are: 9275 * 1) IP_BOUND_IF socket option is set 9276 * 2) SO_DONTROUTE socket option is set 9277 * 3) IP_PKTINFO option is passed in as ancillary data. 9278 * In all cases, the new ire will not be added 9279 * into cache table. 9280 */ 9281 ire_marks |= IRE_MARK_NOADD; 9282 } 9283 9284 switch (ipif->ipif_net_type) { 9285 case IRE_IF_NORESOLVER: { 9286 /* We have what we need to build an IRE_CACHE. 
*/ 9287 9288 if ((dst_ill->ill_phys_addr_length != IP_ADDR_LEN) && 9289 (dst_ill->ill_resolver_mp == NULL)) { 9290 ip1dbg(("ip_newroute_ipif: dst_ill %p " 9291 "for IRE_IF_NORESOLVER ire %p has " 9292 "no ill_resolver_mp\n", 9293 (void *)dst_ill, (void *)ire)); 9294 break; 9295 } 9296 9297 /* 9298 * The new ire inherits the IRE_OFFSUBNET flags 9299 * and source address, if this was requested. 9300 */ 9301 ire = ire_create( 9302 (uchar_t *)&dst, /* dest address */ 9303 (uchar_t *)&ip_g_all_ones, /* mask */ 9304 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9305 NULL, /* gateway address */ 9306 &ipif->ipif_mtu, 9307 NULL, /* no src nce */ 9308 dst_ill->ill_rq, /* recv-from queue */ 9309 dst_ill->ill_wq, /* send-to queue */ 9310 IRE_CACHE, 9311 src_ipif, 9312 (save_ire != NULL ? save_ire->ire_mask : 0), 9313 (fire != NULL) ? /* Parent handle */ 9314 fire->ire_phandle : 0, 9315 ihandle, /* Interface handle */ 9316 (fire != NULL) ? 9317 (fire->ire_flags & 9318 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9319 (save_ire == NULL ? &ire_uinfo_null : 9320 &save_ire->ire_uinfo), 9321 NULL, 9322 NULL, 9323 ipst); 9324 9325 if (ire == NULL) { 9326 if (save_ire != NULL) 9327 ire_refrele(save_ire); 9328 break; 9329 } 9330 9331 ire->ire_marks |= ire_marks; 9332 9333 /* 9334 * If IRE_MARK_NOADD is set then we need to convert 9335 * the max_fragp to a useable value now. This is 9336 * normally done in ire_add_v[46]. We also need to 9337 * associate the ire with an nce (normally would be 9338 * done in ip_wput_nondata()). 9339 * 9340 * Note that IRE_MARK_NOADD packets created here 9341 * do not have a non-null ire_mp pointer. The null 9342 * value of ire_bucket indicates that they were 9343 * never added. 
9344 */ 9345 if (ire->ire_marks & IRE_MARK_NOADD) { 9346 uint_t max_frag; 9347 9348 max_frag = *ire->ire_max_fragp; 9349 ire->ire_max_fragp = NULL; 9350 ire->ire_max_frag = max_frag; 9351 9352 if ((ire->ire_nce = ndp_lookup_v4( 9353 ire_to_ill(ire), 9354 (ire->ire_gateway_addr != INADDR_ANY ? 9355 &ire->ire_gateway_addr : &ire->ire_addr), 9356 B_FALSE)) == NULL) { 9357 if (save_ire != NULL) 9358 ire_refrele(save_ire); 9359 break; 9360 } 9361 ASSERT(ire->ire_nce->nce_state == 9362 ND_REACHABLE); 9363 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 9364 } 9365 9366 /* Prevent save_ire from getting deleted */ 9367 if (save_ire != NULL) { 9368 IRB_REFHOLD(save_ire->ire_bucket); 9369 /* Has it been removed already ? */ 9370 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 9371 IRB_REFRELE(save_ire->ire_bucket); 9372 ire_refrele(save_ire); 9373 break; 9374 } 9375 } 9376 9377 ire_add_then_send(q, ire, first_mp); 9378 9379 /* Assert that save_ire is not deleted yet. */ 9380 if (save_ire != NULL) { 9381 ASSERT(save_ire->ire_ptpn != NULL); 9382 IRB_REFRELE(save_ire->ire_bucket); 9383 ire_refrele(save_ire); 9384 save_ire = NULL; 9385 } 9386 if (fire != NULL) { 9387 ire_refrele(fire); 9388 fire = NULL; 9389 } 9390 9391 /* 9392 * the resolution loop is re-entered if this 9393 * was requested through flags and if we 9394 * actually are in a multirouting case. 9395 */ 9396 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9397 boolean_t need_resolve = 9398 ire_multirt_need_resolve(ipha_dst, 9399 MBLK_GETLABEL(copy_mp), ipst); 9400 if (!need_resolve) { 9401 MULTIRT_DEBUG_UNTAG(copy_mp); 9402 freemsg(copy_mp); 9403 copy_mp = NULL; 9404 } else { 9405 /* 9406 * ipif_lookup_group() calls 9407 * ire_lookup_multi() that uses 9408 * ire_ftable_lookup() to find 9409 * an IRE_INTERFACE for the group. 9410 * In the multirt case, 9411 * ire_lookup_multi() then invokes 9412 * ire_multirt_lookup() to find 9413 * the next resolvable ire. 
9414 * As a result, we obtain an new 9415 * interface, derived from the 9416 * next ire. 9417 */ 9418 ipif_refrele(ipif); 9419 ipif = ipif_lookup_group(ipha_dst, 9420 zoneid, ipst); 9421 ip2dbg(("ip_newroute_ipif: " 9422 "multirt dst %08x, ipif %p\n", 9423 htonl(dst), (void *)ipif)); 9424 if (ipif != NULL) { 9425 mp = copy_mp; 9426 copy_mp = NULL; 9427 multirt_resolve_next = B_TRUE; 9428 continue; 9429 } else { 9430 freemsg(copy_mp); 9431 } 9432 } 9433 } 9434 if (ipif != NULL) 9435 ipif_refrele(ipif); 9436 ill_refrele(dst_ill); 9437 ipif_refrele(src_ipif); 9438 return; 9439 } 9440 case IRE_IF_RESOLVER: 9441 /* 9442 * We can't build an IRE_CACHE yet, but at least 9443 * we found a resolver that can help. 9444 */ 9445 res_mp = dst_ill->ill_resolver_mp; 9446 if (!OK_RESOLVER_MP(res_mp)) 9447 break; 9448 9449 /* 9450 * We obtain a partial IRE_CACHE which we will pass 9451 * along with the resolver query. When the response 9452 * comes back it will be there ready for us to add. 9453 * The new ire inherits the IRE_OFFSUBNET flags 9454 * and source address, if this was requested. 9455 * The ire_max_frag is atomically set under the 9456 * irebucket lock in ire_add_v[46]. Only in the 9457 * case of IRE_MARK_NOADD, we set it here itself. 9458 */ 9459 ire = ire_create_mp( 9460 (uchar_t *)&dst, /* dest address */ 9461 (uchar_t *)&ip_g_all_ones, /* mask */ 9462 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 9463 NULL, /* gateway address */ 9464 (ire_marks & IRE_MARK_NOADD) ? 9465 ipif->ipif_mtu : 0, /* max_frag */ 9466 NULL, /* no src nce */ 9467 dst_ill->ill_rq, /* recv-from queue */ 9468 dst_ill->ill_wq, /* send-to queue */ 9469 IRE_CACHE, 9470 src_ipif, 9471 (save_ire != NULL ? save_ire->ire_mask : 0), 9472 (fire != NULL) ? /* Parent handle */ 9473 fire->ire_phandle : 0, 9474 ihandle, /* Interface handle */ 9475 (fire != NULL) ? /* flags if any */ 9476 (fire->ire_flags & 9477 (RTF_SETSRC | RTF_MULTIRT)) : 0, 9478 (save_ire == NULL ? 
&ire_uinfo_null : 9479 &save_ire->ire_uinfo), 9480 NULL, 9481 NULL, 9482 ipst); 9483 9484 if (save_ire != NULL) { 9485 ire_refrele(save_ire); 9486 save_ire = NULL; 9487 } 9488 if (ire == NULL) 9489 break; 9490 9491 ire->ire_marks |= ire_marks; 9492 /* 9493 * Construct message chain for the resolver of the 9494 * form: 9495 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 9496 * 9497 * NOTE : ire will be added later when the response 9498 * comes back from ARP. If the response does not 9499 * come back, ARP frees the packet. For this reason, 9500 * we can't REFHOLD the bucket of save_ire to prevent 9501 * deletions. We may not be able to REFRELE the 9502 * bucket if the response never comes back. 9503 * Thus, before adding the ire, ire_add_v4 will make 9504 * sure that the interface route does not get deleted. 9505 * This is the only case unlike ip_newroute_v6, 9506 * ip_newroute_ipif_v6 where we can always prevent 9507 * deletions because ire_add_then_send is called after 9508 * creating the IRE. 9509 * If IRE_MARK_NOADD is set, then ire_add_then_send 9510 * does not add this IRE into the IRE CACHE. 9511 */ 9512 ASSERT(ire->ire_mp != NULL); 9513 ire->ire_mp->b_cont = first_mp; 9514 /* Have saved_mp handy, for cleanup if canput fails */ 9515 saved_mp = mp; 9516 mp = copyb(res_mp); 9517 if (mp == NULL) { 9518 /* Prepare for cleanup */ 9519 mp = saved_mp; /* pkt */ 9520 ire_delete(ire); /* ire_mp */ 9521 ire = NULL; 9522 if (copy_mp != NULL) { 9523 MULTIRT_DEBUG_UNTAG(copy_mp); 9524 freemsg(copy_mp); 9525 copy_mp = NULL; 9526 } 9527 break; 9528 } 9529 linkb(mp, ire->ire_mp); 9530 9531 /* 9532 * Fill in the source and dest addrs for the resolver. 9533 * NOTE: this depends on memory layouts imposed by 9534 * ill_init(). 
9535 */ 9536 areq = (areq_t *)mp->b_rptr; 9537 addrp = (ipaddr_t *)((char *)areq + 9538 areq->areq_sender_addr_offset); 9539 *addrp = ire->ire_src_addr; 9540 addrp = (ipaddr_t *)((char *)areq + 9541 areq->areq_target_addr_offset); 9542 *addrp = dst; 9543 /* Up to the resolver. */ 9544 if (canputnext(dst_ill->ill_rq) && 9545 !(dst_ill->ill_arp_closing)) { 9546 putnext(dst_ill->ill_rq, mp); 9547 /* 9548 * The response will come back in ip_wput 9549 * with db_type IRE_DB_TYPE. 9550 */ 9551 } else { 9552 mp->b_cont = NULL; 9553 freeb(mp); /* areq */ 9554 ire_delete(ire); /* ire_mp */ 9555 saved_mp->b_next = NULL; 9556 saved_mp->b_prev = NULL; 9557 freemsg(first_mp); /* pkt */ 9558 ip2dbg(("ip_newroute_ipif: dropped\n")); 9559 } 9560 9561 if (fire != NULL) { 9562 ire_refrele(fire); 9563 fire = NULL; 9564 } 9565 9566 9567 /* 9568 * The resolution loop is re-entered if this was 9569 * requested through flags and we actually are 9570 * in a multirouting case. 9571 */ 9572 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 9573 boolean_t need_resolve = 9574 ire_multirt_need_resolve(ipha_dst, 9575 MBLK_GETLABEL(copy_mp), ipst); 9576 if (!need_resolve) { 9577 MULTIRT_DEBUG_UNTAG(copy_mp); 9578 freemsg(copy_mp); 9579 copy_mp = NULL; 9580 } else { 9581 /* 9582 * ipif_lookup_group() calls 9583 * ire_lookup_multi() that uses 9584 * ire_ftable_lookup() to find 9585 * an IRE_INTERFACE for the group. 9586 * In the multirt case, 9587 * ire_lookup_multi() then invokes 9588 * ire_multirt_lookup() to find 9589 * the next resolvable ire. 9590 * As a result, we obtain an new 9591 * interface, derived from the 9592 * next ire. 
9593 */ 9594 ipif_refrele(ipif); 9595 ipif = ipif_lookup_group(ipha_dst, 9596 zoneid, ipst); 9597 if (ipif != NULL) { 9598 mp = copy_mp; 9599 copy_mp = NULL; 9600 multirt_resolve_next = B_TRUE; 9601 continue; 9602 } else { 9603 freemsg(copy_mp); 9604 } 9605 } 9606 } 9607 if (ipif != NULL) 9608 ipif_refrele(ipif); 9609 ill_refrele(dst_ill); 9610 ipif_refrele(src_ipif); 9611 return; 9612 default: 9613 break; 9614 } 9615 } while (multirt_resolve_next); 9616 9617 err_ret: 9618 ip2dbg(("ip_newroute_ipif: dropped\n")); 9619 if (fire != NULL) 9620 ire_refrele(fire); 9621 ipif_refrele(ipif); 9622 /* Did this packet originate externally? */ 9623 if (dst_ill != NULL) 9624 ill_refrele(dst_ill); 9625 if (src_ipif != NULL) 9626 ipif_refrele(src_ipif); 9627 if (mp->b_prev || mp->b_next) { 9628 mp->b_next = NULL; 9629 mp->b_prev = NULL; 9630 } else { 9631 /* 9632 * Since ip_wput() isn't close to finished, we fill 9633 * in enough of the header for credible error reporting. 9634 */ 9635 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 9636 /* Failed */ 9637 freemsg(first_mp); 9638 if (ire != NULL) 9639 ire_refrele(ire); 9640 return; 9641 } 9642 } 9643 /* 9644 * At this point we will have ire only if RTF_BLACKHOLE 9645 * or RTF_REJECT flags are set on the IRE. It will not 9646 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 9647 */ 9648 if (ire != NULL) { 9649 if (ire->ire_flags & RTF_BLACKHOLE) { 9650 ire_refrele(ire); 9651 freemsg(first_mp); 9652 return; 9653 } 9654 ire_refrele(ire); 9655 } 9656 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); 9657 } 9658 9659 /* Name/Value Table Lookup Routine */ 9660 char * 9661 ip_nv_lookup(nv_t *nv, int value) 9662 { 9663 if (!nv) 9664 return (NULL); 9665 for (; nv->nv_name; nv++) { 9666 if (nv->nv_value == value) 9667 return (nv->nv_name); 9668 } 9669 return ("unknown"); 9670 } 9671 9672 /* 9673 * This is a module open, i.e. this is a control stream for access 9674 * to a DLPI device. 
We allocate an ill_t as the instance data in 9675 * this case. 9676 */ 9677 int 9678 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9679 { 9680 ill_t *ill; 9681 int err; 9682 zoneid_t zoneid; 9683 netstack_t *ns; 9684 ip_stack_t *ipst; 9685 9686 /* 9687 * Prevent unprivileged processes from pushing IP so that 9688 * they can't send raw IP. 9689 */ 9690 if (secpolicy_net_rawaccess(credp) != 0) 9691 return (EPERM); 9692 9693 ns = netstack_find_by_cred(credp); 9694 ASSERT(ns != NULL); 9695 ipst = ns->netstack_ip; 9696 ASSERT(ipst != NULL); 9697 9698 /* 9699 * For exclusive stacks we set the zoneid to zero 9700 * to make IP operate as if in the global zone. 9701 */ 9702 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9703 zoneid = GLOBAL_ZONEID; 9704 else 9705 zoneid = crgetzoneid(credp); 9706 9707 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t)); 9708 q->q_ptr = WR(q)->q_ptr = ill; 9709 ill->ill_ipst = ipst; 9710 ill->ill_zoneid = zoneid; 9711 9712 /* 9713 * ill_init initializes the ill fields and then sends down 9714 * down a DL_INFO_REQ after calling qprocson. 9715 */ 9716 err = ill_init(q, ill); 9717 if (err != 0) { 9718 mi_free(ill); 9719 netstack_rele(ipst->ips_netstack); 9720 q->q_ptr = NULL; 9721 WR(q)->q_ptr = NULL; 9722 return (err); 9723 } 9724 9725 /* ill_init initializes the ipsq marking this thread as writer */ 9726 ipsq_exit(ill->ill_phyint->phyint_ipsq); 9727 /* Wait for the DL_INFO_ACK */ 9728 mutex_enter(&ill->ill_lock); 9729 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { 9730 /* 9731 * Return value of 0 indicates a pending signal. 9732 */ 9733 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); 9734 if (err == 0) { 9735 mutex_exit(&ill->ill_lock); 9736 (void) ip_close(q, 0); 9737 return (EINTR); 9738 } 9739 } 9740 mutex_exit(&ill->ill_lock); 9741 9742 /* 9743 * ip_rput_other could have set an error in ill_error on 9744 * receipt of M_ERROR. 
9745 */ 9746 9747 err = ill->ill_error; 9748 if (err != 0) { 9749 (void) ip_close(q, 0); 9750 return (err); 9751 } 9752 9753 ill->ill_credp = credp; 9754 crhold(credp); 9755 9756 mutex_enter(&ipst->ips_ip_mi_lock); 9757 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, 9758 credp); 9759 mutex_exit(&ipst->ips_ip_mi_lock); 9760 if (err) { 9761 (void) ip_close(q, 0); 9762 return (err); 9763 } 9764 return (0); 9765 } 9766 9767 /* For /dev/ip aka AF_INET open */ 9768 int 9769 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9770 { 9771 return (ip_open(q, devp, flag, sflag, credp, B_FALSE)); 9772 } 9773 9774 /* For /dev/ip6 aka AF_INET6 open */ 9775 int 9776 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9777 { 9778 return (ip_open(q, devp, flag, sflag, credp, B_TRUE)); 9779 } 9780 9781 /* IP open routine. */ 9782 int 9783 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 9784 boolean_t isv6) 9785 { 9786 conn_t *connp; 9787 major_t maj; 9788 zoneid_t zoneid; 9789 netstack_t *ns; 9790 ip_stack_t *ipst; 9791 9792 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 9793 9794 /* Allow reopen. */ 9795 if (q->q_ptr != NULL) 9796 return (0); 9797 9798 if (sflag & MODOPEN) { 9799 /* This is a module open */ 9800 return (ip_modopen(q, devp, flag, sflag, credp)); 9801 } 9802 9803 ns = netstack_find_by_cred(credp); 9804 ASSERT(ns != NULL); 9805 ipst = ns->netstack_ip; 9806 ASSERT(ipst != NULL); 9807 9808 /* 9809 * For exclusive stacks we set the zoneid to zero 9810 * to make IP operate as if in the global zone. 9811 */ 9812 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 9813 zoneid = GLOBAL_ZONEID; 9814 else 9815 zoneid = crgetzoneid(credp); 9816 9817 /* 9818 * We are opening as a device. This is an IP client stream, and we 9819 * allocate an conn_t as the instance data. 
9820 */ 9821 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack); 9822 9823 /* 9824 * ipcl_conn_create did a netstack_hold. Undo the hold that was 9825 * done by netstack_find_by_cred() 9826 */ 9827 netstack_rele(ipst->ips_netstack); 9828 9829 connp->conn_zoneid = zoneid; 9830 9831 connp->conn_upq = q; 9832 q->q_ptr = WR(q)->q_ptr = connp; 9833 9834 if (flag & SO_SOCKSTR) 9835 connp->conn_flags |= IPCL_SOCKET; 9836 9837 /* Minor tells us which /dev entry was opened */ 9838 if (isv6) { 9839 connp->conn_flags |= IPCL_ISV6; 9840 connp->conn_af_isv6 = B_TRUE; 9841 ip_setpktversion(connp, isv6, B_FALSE, ipst); 9842 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9843 } else { 9844 connp->conn_af_isv6 = B_FALSE; 9845 connp->conn_pkt_isv6 = B_FALSE; 9846 } 9847 9848 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 9849 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 9850 connp->conn_minor_arena = ip_minor_arena_la; 9851 } else { 9852 /* 9853 * Either minor numbers in the large arena were exhausted 9854 * or a non socket application is doing the open. 9855 * Try to allocate from the small arena. 9856 */ 9857 if ((connp->conn_dev = 9858 inet_minor_alloc(ip_minor_arena_sa)) == 0) { 9859 /* CONN_DEC_REF takes care of netstack_rele() */ 9860 q->q_ptr = WR(q)->q_ptr = NULL; 9861 CONN_DEC_REF(connp); 9862 return (EBUSY); 9863 } 9864 connp->conn_minor_arena = ip_minor_arena_sa; 9865 } 9866 9867 maj = getemajor(*devp); 9868 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9869 9870 /* 9871 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9872 */ 9873 connp->conn_cred = credp; 9874 9875 /* 9876 * Handle IP_RTS_REQUEST and other ioctls which use conn_recv 9877 */ 9878 connp->conn_recv = ip_conn_input; 9879 9880 crhold(connp->conn_cred); 9881 9882 /* 9883 * If the caller has the process-wide flag set, then default to MAC 9884 * exempt mode. This allows read-down to unlabeled hosts. 
9885 */ 9886 if (getpflags(NET_MAC_AWARE, credp) != 0) 9887 connp->conn_mac_exempt = B_TRUE; 9888 9889 connp->conn_rq = q; 9890 connp->conn_wq = WR(q); 9891 9892 /* Non-zero default values */ 9893 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9894 9895 /* 9896 * Make the conn globally visible to walkers 9897 */ 9898 ASSERT(connp->conn_ref == 1); 9899 mutex_enter(&connp->conn_lock); 9900 connp->conn_state_flags &= ~CONN_INCIPIENT; 9901 mutex_exit(&connp->conn_lock); 9902 9903 qprocson(q); 9904 9905 return (0); 9906 } 9907 9908 /* 9909 * Change the output format (IPv4 vs. IPv6) for a conn_t. 9910 * Note that there is no race since either ip_output function works - it 9911 * is just an optimization to enter the best ip_output routine directly. 9912 */ 9913 void 9914 ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, 9915 ip_stack_t *ipst) 9916 { 9917 if (isv6) { 9918 if (bump_mib) { 9919 BUMP_MIB(&ipst->ips_ip6_mib, 9920 ipIfStatsOutSwitchIPVersion); 9921 } 9922 connp->conn_send = ip_output_v6; 9923 connp->conn_pkt_isv6 = B_TRUE; 9924 } else { 9925 if (bump_mib) { 9926 BUMP_MIB(&ipst->ips_ip_mib, 9927 ipIfStatsOutSwitchIPVersion); 9928 } 9929 connp->conn_send = ip_output; 9930 connp->conn_pkt_isv6 = B_FALSE; 9931 } 9932 9933 } 9934 9935 /* 9936 * See if IPsec needs loading because of the options in mp. 9937 */ 9938 static boolean_t 9939 ipsec_opt_present(mblk_t *mp) 9940 { 9941 uint8_t *optcp, *next_optcp, *opt_endcp; 9942 struct opthdr *opt; 9943 struct T_opthdr *topt; 9944 int opthdr_len; 9945 t_uscalar_t optname, optlevel; 9946 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9947 ipsec_req_t *ipsr; 9948 9949 /* 9950 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9951 * return TRUE. 
9952 */ 9953 9954 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9955 opt_endcp = optcp + tor->OPT_length; 9956 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9957 opthdr_len = sizeof (struct T_opthdr); 9958 } else { /* O_OPTMGMT_REQ */ 9959 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9960 opthdr_len = sizeof (struct opthdr); 9961 } 9962 for (; optcp < opt_endcp; optcp = next_optcp) { 9963 if (optcp + opthdr_len > opt_endcp) 9964 return (B_FALSE); /* Not enough option header. */ 9965 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9966 topt = (struct T_opthdr *)optcp; 9967 optlevel = topt->level; 9968 optname = topt->name; 9969 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); 9970 } else { 9971 opt = (struct opthdr *)optcp; 9972 optlevel = opt->level; 9973 optname = opt->name; 9974 next_optcp = optcp + opthdr_len + 9975 _TPI_ALIGN_OPT(opt->len); 9976 } 9977 if ((next_optcp < optcp) || /* wraparound pointer space */ 9978 ((next_optcp >= opt_endcp) && /* last option bad len */ 9979 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) 9980 return (B_FALSE); /* bad option buffer */ 9981 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || 9982 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { 9983 /* 9984 * Check to see if it's an all-bypass or all-zeroes 9985 * IPsec request. Don't bother loading IPsec if 9986 * the socket doesn't want to use it. (A good example 9987 * is a bypass request.) 9988 * 9989 * Basically, if any of the non-NEVER bits are set, 9990 * load IPsec. 9991 */ 9992 ipsr = (ipsec_req_t *)(optcp + opthdr_len); 9993 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || 9994 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || 9995 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) 9996 != 0) 9997 return (B_TRUE); 9998 } 9999 } 10000 return (B_FALSE); 10001 } 10002 10003 /* 10004 * If conn is is waiting for ipsec to finish loading, kick it. 
10005 */ 10006 /* ARGSUSED */ 10007 static void 10008 conn_restart_ipsec_waiter(conn_t *connp, void *arg) 10009 { 10010 t_scalar_t optreq_prim; 10011 mblk_t *mp; 10012 cred_t *cr; 10013 int err = 0; 10014 10015 /* 10016 * This function is called, after ipsec loading is complete. 10017 * Since IP checks exclusively and atomically (i.e it prevents 10018 * ipsec load from completing until ip_optcom_req completes) 10019 * whether ipsec load is complete, there cannot be a race with IP 10020 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 10021 */ 10022 mutex_enter(&connp->conn_lock); 10023 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { 10024 ASSERT(connp->conn_ipsec_opt_mp != NULL); 10025 mp = connp->conn_ipsec_opt_mp; 10026 connp->conn_ipsec_opt_mp = NULL; 10027 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; 10028 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp))); 10029 mutex_exit(&connp->conn_lock); 10030 10031 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 10032 10033 optreq_prim = ((union T_primitives *)mp->b_rptr)->type; 10034 if (optreq_prim == T_OPTMGMT_REQ) { 10035 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10036 &ip_opt_obj, B_FALSE); 10037 } else { 10038 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); 10039 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, 10040 &ip_opt_obj, B_FALSE); 10041 } 10042 if (err != EINPROGRESS) 10043 CONN_OPER_PENDING_DONE(connp); 10044 return; 10045 } 10046 mutex_exit(&connp->conn_lock); 10047 } 10048 10049 /* 10050 * Called from the ipsec_loader thread, outside any perimeter, to tell 10051 * ip qenable any of the queues waiting for the ipsec loader to 10052 * complete. 10053 */ 10054 void 10055 ip_ipsec_load_complete(ipsec_stack_t *ipss) 10056 { 10057 netstack_t *ns = ipss->ipsec_netstack; 10058 10059 ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); 10060 } 10061 10062 /* 10063 * Can't be used. Need to call svr4* -> optset directly. 
the leaf routine 10064 * determines the grp on which it has to become exclusive, queues the mp 10065 * and sq draining restarts the optmgmt 10066 */ 10067 static boolean_t 10068 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 10069 { 10070 conn_t *connp = Q_TO_CONN(q); 10071 ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; 10072 10073 /* 10074 * Take IPsec requests and treat them special. 10075 */ 10076 if (ipsec_opt_present(mp)) { 10077 /* First check if IPsec is loaded. */ 10078 mutex_enter(&ipss->ipsec_loader_lock); 10079 if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { 10080 mutex_exit(&ipss->ipsec_loader_lock); 10081 return (B_FALSE); 10082 } 10083 mutex_enter(&connp->conn_lock); 10084 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 10085 10086 ASSERT(connp->conn_ipsec_opt_mp == NULL); 10087 connp->conn_ipsec_opt_mp = mp; 10088 mutex_exit(&connp->conn_lock); 10089 mutex_exit(&ipss->ipsec_loader_lock); 10090 10091 ipsec_loader_loadnow(ipss); 10092 return (B_TRUE); 10093 } 10094 return (B_FALSE); 10095 } 10096 10097 /* 10098 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 10099 * all of them are copied to the conn_t. If the req is "zero", the policy is 10100 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 10101 * fields. 10102 * We keep only the latest setting of the policy and thus policy setting 10103 * is not incremental/cumulative. 10104 * 10105 * Requests to set policies with multiple alternative actions will 10106 * go through a different API. 
10107 */ 10108 int 10109 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 10110 { 10111 uint_t ah_req = 0; 10112 uint_t esp_req = 0; 10113 uint_t se_req = 0; 10114 ipsec_selkey_t sel; 10115 ipsec_act_t *actp = NULL; 10116 uint_t nact; 10117 ipsec_policy_t *pin4 = NULL, *pout4 = NULL; 10118 ipsec_policy_t *pin6 = NULL, *pout6 = NULL; 10119 ipsec_policy_root_t *pr; 10120 ipsec_policy_head_t *ph; 10121 int fam; 10122 boolean_t is_pol_reset; 10123 int error = 0; 10124 netstack_t *ns = connp->conn_netstack; 10125 ip_stack_t *ipst = ns->netstack_ip; 10126 ipsec_stack_t *ipss = ns->netstack_ipsec; 10127 10128 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 10129 10130 /* 10131 * The IP_SEC_OPT option does not allow variable length parameters, 10132 * hence a request cannot be NULL. 10133 */ 10134 if (req == NULL) 10135 return (EINVAL); 10136 10137 ah_req = req->ipsr_ah_req; 10138 esp_req = req->ipsr_esp_req; 10139 se_req = req->ipsr_self_encap_req; 10140 10141 /* Don't allow setting self-encap without one or more of AH/ESP. */ 10142 if (se_req != 0 && esp_req == 0 && ah_req == 0) 10143 return (EINVAL); 10144 10145 /* 10146 * Are we dealing with a request to reset the policy (i.e. 10147 * zero requests). 10148 */ 10149 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 10150 (esp_req & REQ_MASK) == 0 && 10151 (se_req & REQ_MASK) == 0); 10152 10153 if (!is_pol_reset) { 10154 /* 10155 * If we couldn't load IPsec, fail with "protocol 10156 * not supported". 10157 * IPsec may not have been loaded for a request with zero 10158 * policies, so we don't fail in this case. 10159 */ 10160 mutex_enter(&ipss->ipsec_loader_lock); 10161 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 10162 mutex_exit(&ipss->ipsec_loader_lock); 10163 return (EPROTONOSUPPORT); 10164 } 10165 mutex_exit(&ipss->ipsec_loader_lock); 10166 10167 /* 10168 * Test for valid requests. 
Invalid algorithms 10169 * need to be tested by IPsec code because new 10170 * algorithms can be added dynamically. 10171 */ 10172 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10173 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 10174 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 10175 return (EINVAL); 10176 } 10177 10178 /* 10179 * Only privileged users can issue these 10180 * requests. 10181 */ 10182 if (((ah_req & IPSEC_PREF_NEVER) || 10183 (esp_req & IPSEC_PREF_NEVER) || 10184 (se_req & IPSEC_PREF_NEVER)) && 10185 secpolicy_ip_config(cr, B_FALSE) != 0) { 10186 return (EPERM); 10187 } 10188 10189 /* 10190 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 10191 * are mutually exclusive. 10192 */ 10193 if (((ah_req & REQ_MASK) == REQ_MASK) || 10194 ((esp_req & REQ_MASK) == REQ_MASK) || 10195 ((se_req & REQ_MASK) == REQ_MASK)) { 10196 /* Both of them are set */ 10197 return (EINVAL); 10198 } 10199 } 10200 10201 mutex_enter(&connp->conn_lock); 10202 10203 /* 10204 * If we have already cached policies in ip_bind_connected*(), don't 10205 * let them change now. We cache policies for connections 10206 * whose src,dst [addr, port] is known. 10207 */ 10208 if (connp->conn_policy_cached) { 10209 mutex_exit(&connp->conn_lock); 10210 return (EINVAL); 10211 } 10212 10213 /* 10214 * We have a zero policies, reset the connection policy if already 10215 * set. This will cause the connection to inherit the 10216 * global policy, if any. 
10217 */ 10218 if (is_pol_reset) { 10219 if (connp->conn_policy != NULL) { 10220 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); 10221 connp->conn_policy = NULL; 10222 } 10223 connp->conn_flags &= ~IPCL_CHECK_POLICY; 10224 connp->conn_in_enforce_policy = B_FALSE; 10225 connp->conn_out_enforce_policy = B_FALSE; 10226 mutex_exit(&connp->conn_lock); 10227 return (0); 10228 } 10229 10230 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy, 10231 ipst->ips_netstack); 10232 if (ph == NULL) 10233 goto enomem; 10234 10235 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack); 10236 if (actp == NULL) 10237 goto enomem; 10238 10239 /* 10240 * Always allocate IPv4 policy entries, since they can also 10241 * apply to ipv6 sockets being used in ipv4-compat mode. 10242 */ 10243 bzero(&sel, sizeof (sel)); 10244 sel.ipsl_valid = IPSL_IPV4; 10245 10246 pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10247 ipst->ips_netstack); 10248 if (pin4 == NULL) 10249 goto enomem; 10250 10251 pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET, NULL, 10252 ipst->ips_netstack); 10253 if (pout4 == NULL) 10254 goto enomem; 10255 10256 if (connp->conn_af_isv6) { 10257 /* 10258 * We're looking at a v6 socket, also allocate the 10259 * v6-specific entries... 10260 */ 10261 sel.ipsl_valid = IPSL_IPV6; 10262 pin6 = ipsec_policy_create(&sel, actp, nact, 10263 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10264 if (pin6 == NULL) 10265 goto enomem; 10266 10267 pout6 = ipsec_policy_create(&sel, actp, nact, 10268 IPSEC_PRIO_SOCKET, NULL, ipst->ips_netstack); 10269 if (pout6 == NULL) 10270 goto enomem; 10271 10272 /* 10273 * .. and file them away in the right place. 
10274 */ 10275 fam = IPSEC_AF_V6; 10276 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10277 HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]); 10278 ipsec_insert_always(&ph->iph_rulebyid, pin6); 10279 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10280 HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]); 10281 ipsec_insert_always(&ph->iph_rulebyid, pout6); 10282 } 10283 10284 ipsec_actvec_free(actp, nact); 10285 10286 /* 10287 * File the v4 policies. 10288 */ 10289 fam = IPSEC_AF_V4; 10290 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 10291 HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]); 10292 ipsec_insert_always(&ph->iph_rulebyid, pin4); 10293 10294 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 10295 HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]); 10296 ipsec_insert_always(&ph->iph_rulebyid, pout4); 10297 10298 /* 10299 * If the requests need security, set enforce_policy. 10300 * If the requests are IPSEC_PREF_NEVER, one should 10301 * still set conn_out_enforce_policy so that an ipsec_out 10302 * gets attached in ip_wput. This is needed so that 10303 * for connections that we don't cache policy in ip_bind, 10304 * if global policy matches in ip_wput_attach_policy, we 10305 * don't wrongly inherit global policy. Similarly, we need 10306 * to set conn_in_enforce_policy also so that we don't verify 10307 * policy wrongly. 10308 */ 10309 if ((ah_req & REQ_MASK) != 0 || 10310 (esp_req & REQ_MASK) != 0 || 10311 (se_req & REQ_MASK) != 0) { 10312 connp->conn_in_enforce_policy = B_TRUE; 10313 connp->conn_out_enforce_policy = B_TRUE; 10314 connp->conn_flags |= IPCL_CHECK_POLICY; 10315 } 10316 10317 mutex_exit(&connp->conn_lock); 10318 return (error); 10319 #undef REQ_MASK 10320 10321 /* 10322 * Common memory-allocation-failure exit path. 
10323 */ 10324 enomem: 10325 mutex_exit(&connp->conn_lock); 10326 if (actp != NULL) 10327 ipsec_actvec_free(actp, nact); 10328 if (pin4 != NULL) 10329 IPPOL_REFRELE(pin4, ipst->ips_netstack); 10330 if (pout4 != NULL) 10331 IPPOL_REFRELE(pout4, ipst->ips_netstack); 10332 if (pin6 != NULL) 10333 IPPOL_REFRELE(pin6, ipst->ips_netstack); 10334 if (pout6 != NULL) 10335 IPPOL_REFRELE(pout6, ipst->ips_netstack); 10336 return (ENOMEM); 10337 } 10338 10339 /* 10340 * Only for options that pass in an IP addr. Currently only V4 options 10341 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 10342 * So this function assumes level is IPPROTO_IP 10343 */ 10344 int 10345 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 10346 mblk_t *first_mp) 10347 { 10348 ipif_t *ipif = NULL; 10349 int error; 10350 ill_t *ill; 10351 int zoneid; 10352 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10353 10354 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 10355 10356 if (addr != INADDR_ANY || checkonly) { 10357 ASSERT(connp != NULL); 10358 zoneid = IPCL_ZONEID(connp); 10359 if (option == IP_NEXTHOP) { 10360 ipif = ipif_lookup_onlink_addr(addr, 10361 connp->conn_zoneid, ipst); 10362 } else { 10363 ipif = ipif_lookup_addr(addr, NULL, zoneid, 10364 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 10365 &error, ipst); 10366 } 10367 if (ipif == NULL) { 10368 if (error == EINPROGRESS) 10369 return (error); 10370 else if ((option == IP_MULTICAST_IF) || 10371 (option == IP_NEXTHOP)) 10372 return (EHOSTUNREACH); 10373 else 10374 return (EINVAL); 10375 } else if (checkonly) { 10376 if (option == IP_MULTICAST_IF) { 10377 ill = ipif->ipif_ill; 10378 /* not supported by the virtual network iface */ 10379 if (IS_VNI(ill)) { 10380 ipif_refrele(ipif); 10381 return (EINVAL); 10382 } 10383 } 10384 ipif_refrele(ipif); 10385 return (0); 10386 } 10387 ill = ipif->ipif_ill; 10388 mutex_enter(&connp->conn_lock); 10389 mutex_enter(&ill->ill_lock); 10390 if 
((ill->ill_state_flags & ILL_CONDEMNED) || 10391 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 10392 mutex_exit(&ill->ill_lock); 10393 mutex_exit(&connp->conn_lock); 10394 ipif_refrele(ipif); 10395 return (option == IP_MULTICAST_IF ? 10396 EHOSTUNREACH : EINVAL); 10397 } 10398 } else { 10399 mutex_enter(&connp->conn_lock); 10400 } 10401 10402 /* None of the options below are supported on the VNI */ 10403 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 10404 mutex_exit(&ill->ill_lock); 10405 mutex_exit(&connp->conn_lock); 10406 ipif_refrele(ipif); 10407 return (EINVAL); 10408 } 10409 10410 switch (option) { 10411 case IP_DONTFAILOVER_IF: 10412 /* 10413 * This option is used by in.mpathd to ensure 10414 * that IPMP probe packets only go out on the 10415 * test interfaces. in.mpathd sets this option 10416 * on the non-failover interfaces. 10417 * For backward compatibility, this option 10418 * implicitly sets IP_MULTICAST_IF, as used 10419 * be done in bind(), so that ip_wput gets 10420 * this ipif to send mcast packets. 10421 */ 10422 if (ipif != NULL) { 10423 ASSERT(addr != INADDR_ANY); 10424 connp->conn_nofailover_ill = ipif->ipif_ill; 10425 connp->conn_multicast_ipif = ipif; 10426 } else { 10427 ASSERT(addr == INADDR_ANY); 10428 connp->conn_nofailover_ill = NULL; 10429 connp->conn_multicast_ipif = NULL; 10430 } 10431 break; 10432 10433 case IP_MULTICAST_IF: 10434 connp->conn_multicast_ipif = ipif; 10435 break; 10436 case IP_NEXTHOP: 10437 connp->conn_nexthop_v4 = addr; 10438 connp->conn_nexthop_set = B_TRUE; 10439 break; 10440 } 10441 10442 if (ipif != NULL) { 10443 mutex_exit(&ill->ill_lock); 10444 mutex_exit(&connp->conn_lock); 10445 ipif_refrele(ipif); 10446 return (0); 10447 } 10448 mutex_exit(&connp->conn_lock); 10449 /* We succeded in cleared the option */ 10450 return (0); 10451 } 10452 10453 /* 10454 * For options that pass in an ifindex specifying the ill. V6 options always 10455 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
10456 */ 10457 int 10458 ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, 10459 int level, int option, mblk_t *first_mp) 10460 { 10461 ill_t *ill = NULL; 10462 int error = 0; 10463 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10464 10465 ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); 10466 if (ifindex != 0) { 10467 ASSERT(connp != NULL); 10468 ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), 10469 first_mp, ip_restart_optmgmt, &error, ipst); 10470 if (ill != NULL) { 10471 if (checkonly) { 10472 /* not supported by the virtual network iface */ 10473 if (IS_VNI(ill)) { 10474 ill_refrele(ill); 10475 return (EINVAL); 10476 } 10477 ill_refrele(ill); 10478 return (0); 10479 } 10480 if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, 10481 0, NULL)) { 10482 ill_refrele(ill); 10483 ill = NULL; 10484 mutex_enter(&connp->conn_lock); 10485 goto setit; 10486 } 10487 mutex_enter(&connp->conn_lock); 10488 mutex_enter(&ill->ill_lock); 10489 if (ill->ill_state_flags & ILL_CONDEMNED) { 10490 mutex_exit(&ill->ill_lock); 10491 mutex_exit(&connp->conn_lock); 10492 ill_refrele(ill); 10493 ill = NULL; 10494 mutex_enter(&connp->conn_lock); 10495 } 10496 goto setit; 10497 } else if (error == EINPROGRESS) { 10498 return (error); 10499 } else { 10500 error = 0; 10501 } 10502 } 10503 mutex_enter(&connp->conn_lock); 10504 setit: 10505 ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); 10506 10507 /* 10508 * The options below assume that the ILL (if any) transmits and/or 10509 * receives traffic. Neither of which is true for the virtual network 10510 * interface, so fail setting these on a VNI. 
10511 */ 10512 if (IS_VNI(ill)) { 10513 ASSERT(ill != NULL); 10514 mutex_exit(&ill->ill_lock); 10515 mutex_exit(&connp->conn_lock); 10516 ill_refrele(ill); 10517 return (EINVAL); 10518 } 10519 10520 if (level == IPPROTO_IP) { 10521 switch (option) { 10522 case IP_BOUND_IF: 10523 connp->conn_incoming_ill = ill; 10524 connp->conn_outgoing_ill = ill; 10525 connp->conn_orig_bound_ifindex = (ill == NULL) ? 10526 0 : ifindex; 10527 break; 10528 10529 case IP_MULTICAST_IF: 10530 /* 10531 * This option is an internal special. The socket 10532 * level IP_MULTICAST_IF specifies an 'ipaddr' and 10533 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF 10534 * specifies an ifindex and we try first on V6 ill's. 10535 * If we don't find one, we they try using on v4 ill's 10536 * intenally and we come here. 10537 */ 10538 if (!checkonly && ill != NULL) { 10539 ipif_t *ipif; 10540 ipif = ill->ill_ipif; 10541 10542 if (ipif->ipif_state_flags & IPIF_CONDEMNED) { 10543 mutex_exit(&ill->ill_lock); 10544 mutex_exit(&connp->conn_lock); 10545 ill_refrele(ill); 10546 ill = NULL; 10547 mutex_enter(&connp->conn_lock); 10548 } else { 10549 connp->conn_multicast_ipif = ipif; 10550 } 10551 } 10552 break; 10553 10554 case IP_DHCPINIT_IF: 10555 if (connp->conn_dhcpinit_ill != NULL) { 10556 /* 10557 * We've locked the conn so conn_cleanup_ill() 10558 * cannot clear conn_dhcpinit_ill -- so it's 10559 * safe to access the ill. 10560 */ 10561 ill_t *oill = connp->conn_dhcpinit_ill; 10562 10563 ASSERT(oill->ill_dhcpinit != 0); 10564 atomic_dec_32(&oill->ill_dhcpinit); 10565 connp->conn_dhcpinit_ill = NULL; 10566 } 10567 10568 if (ill != NULL) { 10569 connp->conn_dhcpinit_ill = ill; 10570 atomic_inc_32(&ill->ill_dhcpinit); 10571 } 10572 break; 10573 } 10574 } else { 10575 switch (option) { 10576 case IPV6_BOUND_IF: 10577 connp->conn_incoming_ill = ill; 10578 connp->conn_outgoing_ill = ill; 10579 connp->conn_orig_bound_ifindex = (ill == NULL) ? 
10580 0 : ifindex; 10581 break; 10582 10583 case IPV6_BOUND_PIF: 10584 /* 10585 * Limit all transmit to this ill. 10586 * Unlike IPV6_BOUND_IF, using this option 10587 * prevents load spreading and failover from 10588 * happening when the interface is part of the 10589 * group. That's why we don't need to remember 10590 * the ifindex in orig_bound_ifindex as in 10591 * IPV6_BOUND_IF. 10592 */ 10593 connp->conn_outgoing_pill = ill; 10594 break; 10595 10596 case IPV6_DONTFAILOVER_IF: 10597 /* 10598 * This option is used by in.mpathd to ensure 10599 * that IPMP probe packets only go out on the 10600 * test interfaces. in.mpathd sets this option 10601 * on the non-failover interfaces. 10602 */ 10603 connp->conn_nofailover_ill = ill; 10604 /* 10605 * For backward compatibility, this option 10606 * implicitly sets ip_multicast_ill as used in 10607 * IPV6_MULTICAST_IF so that ip_wput gets 10608 * this ill to send mcast packets. 10609 */ 10610 connp->conn_multicast_ill = ill; 10611 connp->conn_orig_multicast_ifindex = (ill == NULL) ? 10612 0 : ifindex; 10613 break; 10614 10615 case IPV6_MULTICAST_IF: 10616 /* 10617 * Set conn_multicast_ill to be the IPv6 ill. 10618 * Set conn_multicast_ipif to be an IPv4 ipif 10619 * for ifindex to make IPv4 mapped addresses 10620 * on PF_INET6 sockets honor IPV6_MULTICAST_IF. 10621 * Even if no IPv6 ill exists for the ifindex 10622 * we need to check for an IPv4 ifindex in order 10623 * for this to work with mapped addresses. In that 10624 * case only set conn_multicast_ipif. 
10625 */ 10626 if (!checkonly) { 10627 if (ifindex == 0) { 10628 connp->conn_multicast_ill = NULL; 10629 connp->conn_orig_multicast_ifindex = 0; 10630 connp->conn_multicast_ipif = NULL; 10631 } else if (ill != NULL) { 10632 connp->conn_multicast_ill = ill; 10633 connp->conn_orig_multicast_ifindex = 10634 ifindex; 10635 } 10636 } 10637 break; 10638 } 10639 } 10640 10641 if (ill != NULL) { 10642 mutex_exit(&ill->ill_lock); 10643 mutex_exit(&connp->conn_lock); 10644 ill_refrele(ill); 10645 return (0); 10646 } 10647 mutex_exit(&connp->conn_lock); 10648 /* 10649 * We succeeded in clearing the option (ifindex == 0) or failed to 10650 * locate the ill and could not set the option (ifindex != 0) 10651 */ 10652 return (ifindex == 0 ? 0 : EINVAL); 10653 } 10654 10655 /* This routine sets socket options. */ 10656 /* ARGSUSED */ 10657 int 10658 ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, 10659 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 10660 void *dummy, cred_t *cr, mblk_t *first_mp) 10661 { 10662 int *i1 = (int *)invalp; 10663 conn_t *connp = Q_TO_CONN(q); 10664 int error = 0; 10665 boolean_t checkonly; 10666 ire_t *ire; 10667 boolean_t found; 10668 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 10669 10670 switch (optset_context) { 10671 10672 case SETFN_OPTCOM_CHECKONLY: 10673 checkonly = B_TRUE; 10674 /* 10675 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 10676 * inlen != 0 implies value supplied and 10677 * we have to "pretend" to set it. 10678 * inlen == 0 implies that there is no 10679 * value part in T_CHECK request and just validation 10680 * done elsewhere should be enough, we just return here. 
10681 */ 10682 if (inlen == 0) { 10683 *outlenp = 0; 10684 return (0); 10685 } 10686 break; 10687 case SETFN_OPTCOM_NEGOTIATE: 10688 case SETFN_UD_NEGOTIATE: 10689 case SETFN_CONN_NEGOTIATE: 10690 checkonly = B_FALSE; 10691 break; 10692 default: 10693 /* 10694 * We should never get here 10695 */ 10696 *outlenp = 0; 10697 return (EINVAL); 10698 } 10699 10700 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 10701 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 10702 10703 /* 10704 * For fixed length options, no sanity check 10705 * of passed in length is done. It is assumed *_optcom_req() 10706 * routines do the right thing. 10707 */ 10708 10709 switch (level) { 10710 case SOL_SOCKET: 10711 /* 10712 * conn_lock protects the bitfields, and is used to 10713 * set the fields atomically. 10714 */ 10715 switch (name) { 10716 case SO_BROADCAST: 10717 if (!checkonly) { 10718 /* TODO: use value someplace? */ 10719 mutex_enter(&connp->conn_lock); 10720 connp->conn_broadcast = *i1 ? 1 : 0; 10721 mutex_exit(&connp->conn_lock); 10722 } 10723 break; /* goto sizeof (int) option return */ 10724 case SO_USELOOPBACK: 10725 if (!checkonly) { 10726 /* TODO: use value someplace? */ 10727 mutex_enter(&connp->conn_lock); 10728 connp->conn_loopback = *i1 ? 1 : 0; 10729 mutex_exit(&connp->conn_lock); 10730 } 10731 break; /* goto sizeof (int) option return */ 10732 case SO_DONTROUTE: 10733 if (!checkonly) { 10734 mutex_enter(&connp->conn_lock); 10735 connp->conn_dontroute = *i1 ? 1 : 0; 10736 mutex_exit(&connp->conn_lock); 10737 } 10738 break; /* goto sizeof (int) option return */ 10739 case SO_REUSEADDR: 10740 if (!checkonly) { 10741 mutex_enter(&connp->conn_lock); 10742 connp->conn_reuseaddr = *i1 ? 
1 : 0; 10743 mutex_exit(&connp->conn_lock); 10744 } 10745 break; /* goto sizeof (int) option return */ 10746 case SO_PROTOTYPE: 10747 if (!checkonly) { 10748 mutex_enter(&connp->conn_lock); 10749 connp->conn_proto = *i1; 10750 mutex_exit(&connp->conn_lock); 10751 } 10752 break; /* goto sizeof (int) option return */ 10753 case SO_ALLZONES: 10754 if (!checkonly) { 10755 mutex_enter(&connp->conn_lock); 10756 if (IPCL_IS_BOUND(connp)) { 10757 mutex_exit(&connp->conn_lock); 10758 return (EINVAL); 10759 } 10760 connp->conn_allzones = *i1 != 0 ? 1 : 0; 10761 mutex_exit(&connp->conn_lock); 10762 } 10763 break; /* goto sizeof (int) option return */ 10764 case SO_ANON_MLP: 10765 if (!checkonly) { 10766 mutex_enter(&connp->conn_lock); 10767 connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; 10768 mutex_exit(&connp->conn_lock); 10769 } 10770 break; /* goto sizeof (int) option return */ 10771 case SO_MAC_EXEMPT: 10772 if (secpolicy_net_mac_aware(cr) != 0 || 10773 IPCL_IS_BOUND(connp)) 10774 return (EACCES); 10775 if (!checkonly) { 10776 mutex_enter(&connp->conn_lock); 10777 connp->conn_mac_exempt = *i1 != 0 ? 
1 : 0; 10778 mutex_exit(&connp->conn_lock); 10779 } 10780 break; /* goto sizeof (int) option return */ 10781 default: 10782 /* 10783 * "soft" error (negative) 10784 * option not handled at this level 10785 * Note: Do not modify *outlenp 10786 */ 10787 return (-EINVAL); 10788 } 10789 break; 10790 case IPPROTO_IP: 10791 switch (name) { 10792 case IP_NEXTHOP: 10793 if (secpolicy_ip_config(cr, B_FALSE) != 0) 10794 return (EPERM); 10795 /* FALLTHRU */ 10796 case IP_MULTICAST_IF: 10797 case IP_DONTFAILOVER_IF: { 10798 ipaddr_t addr = *i1; 10799 10800 error = ip_opt_set_ipif(connp, addr, checkonly, name, 10801 first_mp); 10802 if (error != 0) 10803 return (error); 10804 break; /* goto sizeof (int) option return */ 10805 } 10806 10807 case IP_MULTICAST_TTL: 10808 /* Recorded in transport above IP */ 10809 *outvalp = *invalp; 10810 *outlenp = sizeof (uchar_t); 10811 return (0); 10812 case IP_MULTICAST_LOOP: 10813 if (!checkonly) { 10814 mutex_enter(&connp->conn_lock); 10815 connp->conn_multicast_loop = *invalp ? 
1 : 0; 10816 mutex_exit(&connp->conn_lock); 10817 } 10818 *outvalp = *invalp; 10819 *outlenp = sizeof (uchar_t); 10820 return (0); 10821 case IP_ADD_MEMBERSHIP: 10822 case MCAST_JOIN_GROUP: 10823 case IP_DROP_MEMBERSHIP: 10824 case MCAST_LEAVE_GROUP: { 10825 struct ip_mreq *mreqp; 10826 struct group_req *greqp; 10827 ire_t *ire; 10828 boolean_t done = B_FALSE; 10829 ipaddr_t group, ifaddr; 10830 struct sockaddr_in *sin; 10831 uint32_t *ifindexp; 10832 boolean_t mcast_opt = B_TRUE; 10833 mcast_record_t fmode; 10834 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10835 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10836 10837 switch (name) { 10838 case IP_ADD_MEMBERSHIP: 10839 mcast_opt = B_FALSE; 10840 /* FALLTHRU */ 10841 case MCAST_JOIN_GROUP: 10842 fmode = MODE_IS_EXCLUDE; 10843 optfn = ip_opt_add_group; 10844 break; 10845 10846 case IP_DROP_MEMBERSHIP: 10847 mcast_opt = B_FALSE; 10848 /* FALLTHRU */ 10849 case MCAST_LEAVE_GROUP: 10850 fmode = MODE_IS_INCLUDE; 10851 optfn = ip_opt_delete_group; 10852 break; 10853 } 10854 10855 if (mcast_opt) { 10856 greqp = (struct group_req *)i1; 10857 sin = (struct sockaddr_in *)&greqp->gr_group; 10858 if (sin->sin_family != AF_INET) { 10859 *outlenp = 0; 10860 return (ENOPROTOOPT); 10861 } 10862 group = (ipaddr_t)sin->sin_addr.s_addr; 10863 ifaddr = INADDR_ANY; 10864 ifindexp = &greqp->gr_interface; 10865 } else { 10866 mreqp = (struct ip_mreq *)i1; 10867 group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; 10868 ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; 10869 ifindexp = NULL; 10870 } 10871 10872 /* 10873 * In the multirouting case, we need to replicate 10874 * the request on all interfaces that will take part 10875 * in replication. We do so because multirouting is 10876 * reflective, thus we will probably receive multi- 10877 * casts on those interfaces. 10878 * The ip_multirt_apply_membership() succeeds if the 10879 * operation succeeds on at least one interface. 
10880 */ 10881 ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, 10882 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10883 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10884 if (ire != NULL) { 10885 if (ire->ire_flags & RTF_MULTIRT) { 10886 error = ip_multirt_apply_membership( 10887 optfn, ire, connp, checkonly, group, 10888 fmode, INADDR_ANY, first_mp); 10889 done = B_TRUE; 10890 } 10891 ire_refrele(ire); 10892 } 10893 if (!done) { 10894 error = optfn(connp, checkonly, group, ifaddr, 10895 ifindexp, fmode, INADDR_ANY, first_mp); 10896 } 10897 if (error) { 10898 /* 10899 * EINPROGRESS is a soft error, needs retry 10900 * so don't make *outlenp zero. 10901 */ 10902 if (error != EINPROGRESS) 10903 *outlenp = 0; 10904 return (error); 10905 } 10906 /* OK return - copy input buffer into output buffer */ 10907 if (invalp != outvalp) { 10908 /* don't trust bcopy for identical src/dst */ 10909 bcopy(invalp, outvalp, inlen); 10910 } 10911 *outlenp = inlen; 10912 return (0); 10913 } 10914 case IP_BLOCK_SOURCE: 10915 case IP_UNBLOCK_SOURCE: 10916 case IP_ADD_SOURCE_MEMBERSHIP: 10917 case IP_DROP_SOURCE_MEMBERSHIP: 10918 case MCAST_BLOCK_SOURCE: 10919 case MCAST_UNBLOCK_SOURCE: 10920 case MCAST_JOIN_SOURCE_GROUP: 10921 case MCAST_LEAVE_SOURCE_GROUP: { 10922 struct ip_mreq_source *imreqp; 10923 struct group_source_req *gsreqp; 10924 in_addr_t grp, src, ifaddr = INADDR_ANY; 10925 uint32_t ifindex = 0; 10926 mcast_record_t fmode; 10927 struct sockaddr_in *sin; 10928 ire_t *ire; 10929 boolean_t mcast_opt = B_TRUE, done = B_FALSE; 10930 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10931 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10932 10933 switch (name) { 10934 case IP_BLOCK_SOURCE: 10935 mcast_opt = B_FALSE; 10936 /* FALLTHRU */ 10937 case MCAST_BLOCK_SOURCE: 10938 fmode = MODE_IS_EXCLUDE; 10939 optfn = ip_opt_add_group; 10940 break; 10941 10942 case IP_UNBLOCK_SOURCE: 10943 mcast_opt = B_FALSE; 10944 /* FALLTHRU */ 10945 case MCAST_UNBLOCK_SOURCE: 10946 fmode = 
MODE_IS_EXCLUDE; 10947 optfn = ip_opt_delete_group; 10948 break; 10949 10950 case IP_ADD_SOURCE_MEMBERSHIP: 10951 mcast_opt = B_FALSE; 10952 /* FALLTHRU */ 10953 case MCAST_JOIN_SOURCE_GROUP: 10954 fmode = MODE_IS_INCLUDE; 10955 optfn = ip_opt_add_group; 10956 break; 10957 10958 case IP_DROP_SOURCE_MEMBERSHIP: 10959 mcast_opt = B_FALSE; 10960 /* FALLTHRU */ 10961 case MCAST_LEAVE_SOURCE_GROUP: 10962 fmode = MODE_IS_INCLUDE; 10963 optfn = ip_opt_delete_group; 10964 break; 10965 } 10966 10967 if (mcast_opt) { 10968 gsreqp = (struct group_source_req *)i1; 10969 if (gsreqp->gsr_group.ss_family != AF_INET) { 10970 *outlenp = 0; 10971 return (ENOPROTOOPT); 10972 } 10973 sin = (struct sockaddr_in *)&gsreqp->gsr_group; 10974 grp = (ipaddr_t)sin->sin_addr.s_addr; 10975 sin = (struct sockaddr_in *)&gsreqp->gsr_source; 10976 src = (ipaddr_t)sin->sin_addr.s_addr; 10977 ifindex = gsreqp->gsr_interface; 10978 } else { 10979 imreqp = (struct ip_mreq_source *)i1; 10980 grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; 10981 src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; 10982 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; 10983 } 10984 10985 /* 10986 * In the multirouting case, we need to replicate 10987 * the request as noted in the mcast cases above. 10988 */ 10989 ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, 10990 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10991 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 10992 if (ire != NULL) { 10993 if (ire->ire_flags & RTF_MULTIRT) { 10994 error = ip_multirt_apply_membership( 10995 optfn, ire, connp, checkonly, grp, 10996 fmode, src, first_mp); 10997 done = B_TRUE; 10998 } 10999 ire_refrele(ire); 11000 } 11001 if (!done) { 11002 error = optfn(connp, checkonly, grp, ifaddr, 11003 &ifindex, fmode, src, first_mp); 11004 } 11005 if (error != 0) { 11006 /* 11007 * EINPROGRESS is a soft error, needs retry 11008 * so don't make *outlenp zero. 
11009 */ 11010 if (error != EINPROGRESS) 11011 *outlenp = 0; 11012 return (error); 11013 } 11014 /* OK return - copy input buffer into output buffer */ 11015 if (invalp != outvalp) { 11016 bcopy(invalp, outvalp, inlen); 11017 } 11018 *outlenp = inlen; 11019 return (0); 11020 } 11021 case IP_SEC_OPT: 11022 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11023 if (error != 0) { 11024 *outlenp = 0; 11025 return (error); 11026 } 11027 break; 11028 case IP_HDRINCL: 11029 case IP_OPTIONS: 11030 case T_IP_OPTIONS: 11031 case IP_TOS: 11032 case T_IP_TOS: 11033 case IP_TTL: 11034 case IP_RECVDSTADDR: 11035 case IP_RECVOPTS: 11036 /* OK return - copy input buffer into output buffer */ 11037 if (invalp != outvalp) { 11038 /* don't trust bcopy for identical src/dst */ 11039 bcopy(invalp, outvalp, inlen); 11040 } 11041 *outlenp = inlen; 11042 return (0); 11043 case IP_RECVIF: 11044 /* Retrieve the inbound interface index */ 11045 if (!checkonly) { 11046 mutex_enter(&connp->conn_lock); 11047 connp->conn_recvif = *i1 ? 1 : 0; 11048 mutex_exit(&connp->conn_lock); 11049 } 11050 break; /* goto sizeof (int) option return */ 11051 case IP_RECVPKTINFO: 11052 if (!checkonly) { 11053 mutex_enter(&connp->conn_lock); 11054 connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; 11055 mutex_exit(&connp->conn_lock); 11056 } 11057 break; /* goto sizeof (int) option return */ 11058 case IP_RECVSLLA: 11059 /* Retrieve the source link layer address */ 11060 if (!checkonly) { 11061 mutex_enter(&connp->conn_lock); 11062 connp->conn_recvslla = *i1 ? 
1 : 0; 11063 mutex_exit(&connp->conn_lock); 11064 } 11065 break; /* goto sizeof (int) option return */ 11066 case MRT_INIT: 11067 case MRT_DONE: 11068 case MRT_ADD_VIF: 11069 case MRT_DEL_VIF: 11070 case MRT_ADD_MFC: 11071 case MRT_DEL_MFC: 11072 case MRT_ASSERT: 11073 if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { 11074 *outlenp = 0; 11075 return (error); 11076 } 11077 error = ip_mrouter_set((int)name, q, checkonly, 11078 (uchar_t *)invalp, inlen, first_mp); 11079 if (error) { 11080 *outlenp = 0; 11081 return (error); 11082 } 11083 /* OK return - copy input buffer into output buffer */ 11084 if (invalp != outvalp) { 11085 /* don't trust bcopy for identical src/dst */ 11086 bcopy(invalp, outvalp, inlen); 11087 } 11088 *outlenp = inlen; 11089 return (0); 11090 case IP_BOUND_IF: 11091 case IP_DHCPINIT_IF: 11092 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11093 level, name, first_mp); 11094 if (error != 0) 11095 return (error); 11096 break; /* goto sizeof (int) option return */ 11097 11098 case IP_UNSPEC_SRC: 11099 /* Allow sending with a zero source address */ 11100 if (!checkonly) { 11101 mutex_enter(&connp->conn_lock); 11102 connp->conn_unspec_src = *i1 ? 1 : 0; 11103 mutex_exit(&connp->conn_lock); 11104 } 11105 break; /* goto sizeof (int) option return */ 11106 default: 11107 /* 11108 * "soft" error (negative) 11109 * option not handled at this level 11110 * Note: Do not modify *outlenp 11111 */ 11112 return (-EINVAL); 11113 } 11114 break; 11115 case IPPROTO_IPV6: 11116 switch (name) { 11117 case IPV6_BOUND_IF: 11118 case IPV6_BOUND_PIF: 11119 case IPV6_DONTFAILOVER_IF: 11120 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11121 level, name, first_mp); 11122 if (error != 0) 11123 return (error); 11124 break; /* goto sizeof (int) option return */ 11125 11126 case IPV6_MULTICAST_IF: 11127 /* 11128 * The only possible errors are EINPROGRESS and 11129 * EINVAL. EINPROGRESS will be restarted and is not 11130 * a hard error. 
We call this option on both V4 and V6 11131 * If both return EINVAL, then this call returns 11132 * EINVAL. If at least one of them succeeds we 11133 * return success. 11134 */ 11135 found = B_FALSE; 11136 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 11137 level, name, first_mp); 11138 if (error == EINPROGRESS) 11139 return (error); 11140 if (error == 0) 11141 found = B_TRUE; 11142 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 11143 IPPROTO_IP, IP_MULTICAST_IF, first_mp); 11144 if (error == 0) 11145 found = B_TRUE; 11146 if (!found) 11147 return (error); 11148 break; /* goto sizeof (int) option return */ 11149 11150 case IPV6_MULTICAST_HOPS: 11151 /* Recorded in transport above IP */ 11152 break; /* goto sizeof (int) option return */ 11153 case IPV6_MULTICAST_LOOP: 11154 if (!checkonly) { 11155 mutex_enter(&connp->conn_lock); 11156 connp->conn_multicast_loop = *i1; 11157 mutex_exit(&connp->conn_lock); 11158 } 11159 break; /* goto sizeof (int) option return */ 11160 case IPV6_JOIN_GROUP: 11161 case MCAST_JOIN_GROUP: 11162 case IPV6_LEAVE_GROUP: 11163 case MCAST_LEAVE_GROUP: { 11164 struct ipv6_mreq *ip_mreqp; 11165 struct group_req *greqp; 11166 ire_t *ire; 11167 boolean_t done = B_FALSE; 11168 in6_addr_t groupv6; 11169 uint32_t ifindex; 11170 boolean_t mcast_opt = B_TRUE; 11171 mcast_record_t fmode; 11172 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11173 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11174 11175 switch (name) { 11176 case IPV6_JOIN_GROUP: 11177 mcast_opt = B_FALSE; 11178 /* FALLTHRU */ 11179 case MCAST_JOIN_GROUP: 11180 fmode = MODE_IS_EXCLUDE; 11181 optfn = ip_opt_add_group_v6; 11182 break; 11183 11184 case IPV6_LEAVE_GROUP: 11185 mcast_opt = B_FALSE; 11186 /* FALLTHRU */ 11187 case MCAST_LEAVE_GROUP: 11188 fmode = MODE_IS_INCLUDE; 11189 optfn = ip_opt_delete_group_v6; 11190 break; 11191 } 11192 11193 if (mcast_opt) { 11194 struct sockaddr_in *sin; 11195 struct sockaddr_in6 *sin6; 11196 greqp = (struct group_req 
*)i1; 11197 if (greqp->gr_group.ss_family == AF_INET) { 11198 sin = (struct sockaddr_in *) 11199 &(greqp->gr_group); 11200 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 11201 &groupv6); 11202 } else { 11203 sin6 = (struct sockaddr_in6 *) 11204 &(greqp->gr_group); 11205 groupv6 = sin6->sin6_addr; 11206 } 11207 ifindex = greqp->gr_interface; 11208 } else { 11209 ip_mreqp = (struct ipv6_mreq *)i1; 11210 groupv6 = ip_mreqp->ipv6mr_multiaddr; 11211 ifindex = ip_mreqp->ipv6mr_interface; 11212 } 11213 /* 11214 * In the multirouting case, we need to replicate 11215 * the request on all interfaces that will take part 11216 * in replication. We do so because multirouting is 11217 * reflective, thus we will probably receive multi- 11218 * casts on those interfaces. 11219 * The ip_multirt_apply_membership_v6() succeeds if 11220 * the operation succeeds on at least one interface. 11221 */ 11222 ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, 11223 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11224 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11225 if (ire != NULL) { 11226 if (ire->ire_flags & RTF_MULTIRT) { 11227 error = ip_multirt_apply_membership_v6( 11228 optfn, ire, connp, checkonly, 11229 &groupv6, fmode, &ipv6_all_zeros, 11230 first_mp); 11231 done = B_TRUE; 11232 } 11233 ire_refrele(ire); 11234 } 11235 if (!done) { 11236 error = optfn(connp, checkonly, &groupv6, 11237 ifindex, fmode, &ipv6_all_zeros, first_mp); 11238 } 11239 if (error) { 11240 /* 11241 * EINPROGRESS is a soft error, needs retry 11242 * so don't make *outlenp zero. 
11243 */ 11244 if (error != EINPROGRESS) 11245 *outlenp = 0; 11246 return (error); 11247 } 11248 /* OK return - copy input buffer into output buffer */ 11249 if (invalp != outvalp) { 11250 /* don't trust bcopy for identical src/dst */ 11251 bcopy(invalp, outvalp, inlen); 11252 } 11253 *outlenp = inlen; 11254 return (0); 11255 } 11256 case MCAST_BLOCK_SOURCE: 11257 case MCAST_UNBLOCK_SOURCE: 11258 case MCAST_JOIN_SOURCE_GROUP: 11259 case MCAST_LEAVE_SOURCE_GROUP: { 11260 struct group_source_req *gsreqp; 11261 in6_addr_t v6grp, v6src; 11262 uint32_t ifindex; 11263 mcast_record_t fmode; 11264 ire_t *ire; 11265 boolean_t done = B_FALSE; 11266 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 11267 int, mcast_record_t, const in6_addr_t *, mblk_t *); 11268 11269 switch (name) { 11270 case MCAST_BLOCK_SOURCE: 11271 fmode = MODE_IS_EXCLUDE; 11272 optfn = ip_opt_add_group_v6; 11273 break; 11274 case MCAST_UNBLOCK_SOURCE: 11275 fmode = MODE_IS_EXCLUDE; 11276 optfn = ip_opt_delete_group_v6; 11277 break; 11278 case MCAST_JOIN_SOURCE_GROUP: 11279 fmode = MODE_IS_INCLUDE; 11280 optfn = ip_opt_add_group_v6; 11281 break; 11282 case MCAST_LEAVE_SOURCE_GROUP: 11283 fmode = MODE_IS_INCLUDE; 11284 optfn = ip_opt_delete_group_v6; 11285 break; 11286 } 11287 11288 gsreqp = (struct group_source_req *)i1; 11289 ifindex = gsreqp->gsr_interface; 11290 if (gsreqp->gsr_group.ss_family == AF_INET) { 11291 struct sockaddr_in *s; 11292 s = (struct sockaddr_in *)&gsreqp->gsr_group; 11293 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); 11294 s = (struct sockaddr_in *)&gsreqp->gsr_source; 11295 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); 11296 } else { 11297 struct sockaddr_in6 *s6; 11298 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; 11299 v6grp = s6->sin6_addr; 11300 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; 11301 v6src = s6->sin6_addr; 11302 } 11303 11304 /* 11305 * In the multirouting case, we need to replicate 11306 * the request as noted in the mcast cases above. 
11307 */ 11308 ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, 11309 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 11310 MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); 11311 if (ire != NULL) { 11312 if (ire->ire_flags & RTF_MULTIRT) { 11313 error = ip_multirt_apply_membership_v6( 11314 optfn, ire, connp, checkonly, 11315 &v6grp, fmode, &v6src, first_mp); 11316 done = B_TRUE; 11317 } 11318 ire_refrele(ire); 11319 } 11320 if (!done) { 11321 error = optfn(connp, checkonly, &v6grp, 11322 ifindex, fmode, &v6src, first_mp); 11323 } 11324 if (error != 0) { 11325 /* 11326 * EINPROGRESS is a soft error, needs retry 11327 * so don't make *outlenp zero. 11328 */ 11329 if (error != EINPROGRESS) 11330 *outlenp = 0; 11331 return (error); 11332 } 11333 /* OK return - copy input buffer into output buffer */ 11334 if (invalp != outvalp) { 11335 bcopy(invalp, outvalp, inlen); 11336 } 11337 *outlenp = inlen; 11338 return (0); 11339 } 11340 case IPV6_UNICAST_HOPS: 11341 /* Recorded in transport above IP */ 11342 break; /* goto sizeof (int) option return */ 11343 case IPV6_UNSPEC_SRC: 11344 /* Allow sending with a zero source address */ 11345 if (!checkonly) { 11346 mutex_enter(&connp->conn_lock); 11347 connp->conn_unspec_src = *i1 ? 1 : 0; 11348 mutex_exit(&connp->conn_lock); 11349 } 11350 break; /* goto sizeof (int) option return */ 11351 case IPV6_RECVPKTINFO: 11352 if (!checkonly) { 11353 mutex_enter(&connp->conn_lock); 11354 connp->conn_ip_recvpktinfo = *i1 ? 
1 : 0; 11355 mutex_exit(&connp->conn_lock); 11356 } 11357 break; /* goto sizeof (int) option return */ 11358 case IPV6_RECVTCLASS: 11359 if (!checkonly) { 11360 if (*i1 < 0 || *i1 > 1) { 11361 return (EINVAL); 11362 } 11363 mutex_enter(&connp->conn_lock); 11364 connp->conn_ipv6_recvtclass = *i1; 11365 mutex_exit(&connp->conn_lock); 11366 } 11367 break; 11368 case IPV6_RECVPATHMTU: 11369 if (!checkonly) { 11370 if (*i1 < 0 || *i1 > 1) { 11371 return (EINVAL); 11372 } 11373 mutex_enter(&connp->conn_lock); 11374 connp->conn_ipv6_recvpathmtu = *i1; 11375 mutex_exit(&connp->conn_lock); 11376 } 11377 break; 11378 case IPV6_RECVHOPLIMIT: 11379 if (!checkonly) { 11380 mutex_enter(&connp->conn_lock); 11381 connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; 11382 mutex_exit(&connp->conn_lock); 11383 } 11384 break; /* goto sizeof (int) option return */ 11385 case IPV6_RECVHOPOPTS: 11386 if (!checkonly) { 11387 mutex_enter(&connp->conn_lock); 11388 connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; 11389 mutex_exit(&connp->conn_lock); 11390 } 11391 break; /* goto sizeof (int) option return */ 11392 case IPV6_RECVDSTOPTS: 11393 if (!checkonly) { 11394 mutex_enter(&connp->conn_lock); 11395 connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; 11396 mutex_exit(&connp->conn_lock); 11397 } 11398 break; /* goto sizeof (int) option return */ 11399 case IPV6_RECVRTHDR: 11400 if (!checkonly) { 11401 mutex_enter(&connp->conn_lock); 11402 connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; 11403 mutex_exit(&connp->conn_lock); 11404 } 11405 break; /* goto sizeof (int) option return */ 11406 case IPV6_RECVRTHDRDSTOPTS: 11407 if (!checkonly) { 11408 mutex_enter(&connp->conn_lock); 11409 connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; 11410 mutex_exit(&connp->conn_lock); 11411 } 11412 break; /* goto sizeof (int) option return */ 11413 case IPV6_PKTINFO: 11414 if (inlen == 0) 11415 return (-EINVAL); /* clearing option */ 11416 error = ip6_set_pktinfo(cr, connp, 11417 (struct in6_pktinfo *)invalp, first_mp); 11418 if (error != 0) 11419 *outlenp = 0; 11420 else 11421 *outlenp = inlen; 11422 return (error); 11423 case IPV6_NEXTHOP: { 11424 struct sockaddr_in6 *sin6; 11425 11426 /* Verify that the nexthop is reachable */ 11427 if (inlen == 0) 11428 return (-EINVAL); /* clearing option */ 11429 11430 sin6 = (struct sockaddr_in6 *)invalp; 11431 ire = ire_route_lookup_v6(&sin6->sin6_addr, 11432 0, 0, 0, NULL, NULL, connp->conn_zoneid, 11433 NULL, MATCH_IRE_DEFAULT, ipst); 11434 11435 if (ire == NULL) { 11436 *outlenp = 0; 11437 return (EHOSTUNREACH); 11438 } 11439 ire_refrele(ire); 11440 return (-EINVAL); 11441 } 11442 case IPV6_SEC_OPT: 11443 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 11444 if (error != 0) { 11445 *outlenp = 0; 11446 return (error); 11447 } 11448 break; 11449 case IPV6_SRC_PREFERENCES: { 11450 /* 11451 * This is implemented strictly in the ip module 11452 * (here and in tcp_opt_*() to accomodate tcp 11453 * sockets). Modules above ip pass this option 11454 * down here since ip is the only one that needs to 11455 * be aware of source address preferences. 11456 * 11457 * This socket option only affects connected 11458 * sockets that haven't already bound to a specific 11459 * IPv6 address. In other words, sockets that 11460 * don't call bind() with an address other than the 11461 * unspecified address and that call connect(). 11462 * ip_bind_connected_v6() passes these preferences 11463 * to the ipif_select_source_v6() function. 
11464 */ 11465 if (inlen != sizeof (uint32_t)) 11466 return (EINVAL); 11467 error = ip6_set_src_preferences(connp, 11468 *(uint32_t *)invalp); 11469 if (error != 0) { 11470 *outlenp = 0; 11471 return (error); 11472 } else { 11473 *outlenp = sizeof (uint32_t); 11474 } 11475 break; 11476 } 11477 case IPV6_V6ONLY: 11478 if (*i1 < 0 || *i1 > 1) { 11479 return (EINVAL); 11480 } 11481 mutex_enter(&connp->conn_lock); 11482 connp->conn_ipv6_v6only = *i1; 11483 mutex_exit(&connp->conn_lock); 11484 break; 11485 default: 11486 return (-EINVAL); 11487 } 11488 break; 11489 default: 11490 /* 11491 * "soft" error (negative) 11492 * option not handled at this level 11493 * Note: Do not modify *outlenp 11494 */ 11495 return (-EINVAL); 11496 } 11497 /* 11498 * Common case of return from an option that is sizeof (int) 11499 */ 11500 *(int *)outvalp = *i1; 11501 *outlenp = sizeof (int); 11502 return (0); 11503 } 11504 11505 /* 11506 * This routine gets default values of certain options whose default 11507 * values are maintained by protocol specific code 11508 */ 11509 /* ARGSUSED */ 11510 int 11511 ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 11512 { 11513 int *i1 = (int *)ptr; 11514 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11515 11516 switch (level) { 11517 case IPPROTO_IP: 11518 switch (name) { 11519 case IP_MULTICAST_TTL: 11520 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 11521 return (sizeof (uchar_t)); 11522 case IP_MULTICAST_LOOP: 11523 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 11524 return (sizeof (uchar_t)); 11525 default: 11526 return (-1); 11527 } 11528 case IPPROTO_IPV6: 11529 switch (name) { 11530 case IPV6_UNICAST_HOPS: 11531 *i1 = ipst->ips_ipv6_def_hops; 11532 return (sizeof (int)); 11533 case IPV6_MULTICAST_HOPS: 11534 *i1 = IP_DEFAULT_MULTICAST_TTL; 11535 return (sizeof (int)); 11536 case IPV6_MULTICAST_LOOP: 11537 *i1 = IP_DEFAULT_MULTICAST_LOOP; 11538 return (sizeof (int)); 11539 case IPV6_V6ONLY: 11540 *i1 = 1; 11541 return (sizeof (int)); 11542 
default: 11543 return (-1); 11544 } 11545 default: 11546 return (-1); 11547 } 11548 /* NOTREACHED */ 11549 } 11550 11551 /* 11552 * Given a destination address and a pointer to where to put the information 11553 * this routine fills in the mtuinfo. 11554 */ 11555 int 11556 ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, 11557 struct ip6_mtuinfo *mtuinfo, netstack_t *ns) 11558 { 11559 ire_t *ire; 11560 ip_stack_t *ipst = ns->netstack_ip; 11561 11562 if (IN6_IS_ADDR_UNSPECIFIED(in6)) 11563 return (-1); 11564 11565 bzero(mtuinfo, sizeof (*mtuinfo)); 11566 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 11567 mtuinfo->ip6m_addr.sin6_port = port; 11568 mtuinfo->ip6m_addr.sin6_addr = *in6; 11569 11570 ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); 11571 if (ire != NULL) { 11572 mtuinfo->ip6m_mtu = ire->ire_max_frag; 11573 ire_refrele(ire); 11574 } else { 11575 mtuinfo->ip6m_mtu = IPV6_MIN_MTU; 11576 } 11577 return (sizeof (struct ip6_mtuinfo)); 11578 } 11579 11580 /* 11581 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error 11582 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and 11583 * isn't. This doesn't matter as the error checking is done properly for the 11584 * other MRT options coming in through ip_opt_set. 11585 */ 11586 int 11587 ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 11588 { 11589 conn_t *connp = Q_TO_CONN(q); 11590 ipsec_req_t *req = (ipsec_req_t *)ptr; 11591 11592 switch (level) { 11593 case IPPROTO_IP: 11594 switch (name) { 11595 case MRT_VERSION: 11596 case MRT_ASSERT: 11597 (void) ip_mrouter_get(name, q, ptr); 11598 return (sizeof (int)); 11599 case IP_SEC_OPT: 11600 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); 11601 case IP_NEXTHOP: 11602 if (connp->conn_nexthop_set) { 11603 *(ipaddr_t *)ptr = connp->conn_nexthop_v4; 11604 return (sizeof (ipaddr_t)); 11605 } else 11606 return (0); 11607 case IP_RECVPKTINFO: 11608 *(int *)ptr = connp->conn_ip_recvpktinfo ? 
1: 0; 11609 return (sizeof (int)); 11610 default: 11611 break; 11612 } 11613 break; 11614 case IPPROTO_IPV6: 11615 switch (name) { 11616 case IPV6_SEC_OPT: 11617 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); 11618 case IPV6_SRC_PREFERENCES: { 11619 return (ip6_get_src_preferences(connp, 11620 (uint32_t *)ptr)); 11621 } 11622 case IPV6_V6ONLY: 11623 *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; 11624 return (sizeof (int)); 11625 case IPV6_PATHMTU: 11626 return (ip_fill_mtuinfo(&connp->conn_remv6, 0, 11627 (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); 11628 default: 11629 break; 11630 } 11631 break; 11632 default: 11633 break; 11634 } 11635 return (-1); 11636 } 11637 11638 /* Named Dispatch routine to get a current value out of our parameter table. */ 11639 /* ARGSUSED */ 11640 static int 11641 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11642 { 11643 ipparam_t *ippa = (ipparam_t *)cp; 11644 11645 (void) mi_mpprintf(mp, "%d", ippa->ip_param_value); 11646 return (0); 11647 } 11648 11649 /* ARGSUSED */ 11650 static int 11651 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 11652 { 11653 11654 (void) mi_mpprintf(mp, "%d", *(int *)cp); 11655 return (0); 11656 } 11657 11658 /* 11659 * Set ip{,6}_forwarding values. This means walking through all of the 11660 * ill's and toggling their forwarding values. 
11661 */ 11662 /* ARGSUSED */ 11663 static int 11664 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11665 { 11666 long new_value; 11667 int *forwarding_value = (int *)cp; 11668 ill_t *ill; 11669 boolean_t isv6; 11670 ill_walk_context_t ctx; 11671 ip_stack_t *ipst = CONNQ_TO_IPST(q); 11672 11673 isv6 = (forwarding_value == &ipst->ips_ipv6_forward); 11674 11675 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11676 new_value < 0 || new_value > 1) { 11677 return (EINVAL); 11678 } 11679 11680 *forwarding_value = new_value; 11681 11682 /* 11683 * Regardless of the current value of ip_forwarding, set all per-ill 11684 * values of ip_forwarding to the value being set. 11685 * 11686 * Bring all the ill's up to date with the new global value. 11687 */ 11688 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11689 11690 if (isv6) 11691 ill = ILL_START_WALK_V6(&ctx, ipst); 11692 else 11693 ill = ILL_START_WALK_V4(&ctx, ipst); 11694 11695 for (; ill != NULL; ill = ill_next(&ctx, ill)) 11696 (void) ill_forward_set(ill, new_value != 0); 11697 11698 rw_exit(&ipst->ips_ill_g_lock); 11699 return (0); 11700 } 11701 11702 /* 11703 * Walk through the param array specified registering each element with the 11704 * Named Dispatch handler. This is called only during init. 
So it is ok 11705 * not to acquire any locks 11706 */ 11707 static boolean_t 11708 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt, 11709 ipndp_t *ipnd, size_t ipnd_cnt) 11710 { 11711 for (; ippa_cnt-- > 0; ippa++) { 11712 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 11713 if (!nd_load(ndp, ippa->ip_param_name, 11714 ip_param_get, ip_param_set, (caddr_t)ippa)) { 11715 nd_free(ndp); 11716 return (B_FALSE); 11717 } 11718 } 11719 } 11720 11721 for (; ipnd_cnt-- > 0; ipnd++) { 11722 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 11723 if (!nd_load(ndp, ipnd->ip_ndp_name, 11724 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 11725 ipnd->ip_ndp_data)) { 11726 nd_free(ndp); 11727 return (B_FALSE); 11728 } 11729 } 11730 } 11731 11732 return (B_TRUE); 11733 } 11734 11735 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 11736 /* ARGSUSED */ 11737 static int 11738 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 11739 { 11740 long new_value; 11741 ipparam_t *ippa = (ipparam_t *)cp; 11742 11743 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11744 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 11745 return (EINVAL); 11746 } 11747 ippa->ip_param_value = new_value; 11748 return (0); 11749 } 11750 11751 /* 11752 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases, 11753 * When an ipf is passed here for the first time, if 11754 * we already have in-order fragments on the queue, we convert from the fast- 11755 * path reassembly scheme to the hard-case scheme. From then on, additional 11756 * fragments are reassembled here. We keep track of the start and end offsets 11757 * of each piece, and the number of holes in the chain. When the hole count 11758 * goes to zero, we are done! 
11759 * 11760 * The ipf_count will be updated to account for any mblk(s) added (pointed to 11761 * by mp) or subtracted (freeb()ed dups), upon return the caller must update 11762 * ipfb_count and ill_frag_count by the difference of ipf_count before and 11763 * after the call to ip_reassemble(). 11764 */ 11765 int 11766 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill, 11767 size_t msg_len) 11768 { 11769 uint_t end; 11770 mblk_t *next_mp; 11771 mblk_t *mp1; 11772 uint_t offset; 11773 boolean_t incr_dups = B_TRUE; 11774 boolean_t offset_zero_seen = B_FALSE; 11775 boolean_t pkt_boundary_checked = B_FALSE; 11776 11777 /* If start == 0 then ipf_nf_hdr_len has to be set. */ 11778 ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0); 11779 11780 /* Add in byte count */ 11781 ipf->ipf_count += msg_len; 11782 if (ipf->ipf_end) { 11783 /* 11784 * We were part way through in-order reassembly, but now there 11785 * is a hole. We walk through messages already queued, and 11786 * mark them for hard case reassembly. We know that up till 11787 * now they were in order starting from offset zero. 11788 */ 11789 offset = 0; 11790 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11791 IP_REASS_SET_START(mp1, offset); 11792 if (offset == 0) { 11793 ASSERT(ipf->ipf_nf_hdr_len != 0); 11794 offset = -ipf->ipf_nf_hdr_len; 11795 } 11796 offset += mp1->b_wptr - mp1->b_rptr; 11797 IP_REASS_SET_END(mp1, offset); 11798 } 11799 /* One hole at the end. */ 11800 ipf->ipf_hole_cnt = 1; 11801 /* Brand it as a hard case, forever. */ 11802 ipf->ipf_end = 0; 11803 } 11804 /* Walk through all the new pieces. */ 11805 do { 11806 end = start + (mp->b_wptr - mp->b_rptr); 11807 /* 11808 * If start is 0, decrease 'end' only for the first mblk of 11809 * the fragment. Otherwise 'end' can get wrong value in the 11810 * second pass of the loop if first mblk is exactly the 11811 * size of ipf_nf_hdr_len. 
11812 */ 11813 if (start == 0 && !offset_zero_seen) { 11814 /* First segment */ 11815 ASSERT(ipf->ipf_nf_hdr_len != 0); 11816 end -= ipf->ipf_nf_hdr_len; 11817 offset_zero_seen = B_TRUE; 11818 } 11819 next_mp = mp->b_cont; 11820 /* 11821 * We are checking to see if there is any interesing data 11822 * to process. If there isn't and the mblk isn't the 11823 * one which carries the unfragmentable header then we 11824 * drop it. It's possible to have just the unfragmentable 11825 * header come through without any data. That needs to be 11826 * saved. 11827 * 11828 * If the assert at the top of this function holds then the 11829 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code 11830 * is infrequently traveled enough that the test is left in 11831 * to protect against future code changes which break that 11832 * invariant. 11833 */ 11834 if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) { 11835 /* Empty. Blast it. */ 11836 IP_REASS_SET_START(mp, 0); 11837 IP_REASS_SET_END(mp, 0); 11838 /* 11839 * If the ipf points to the mblk we are about to free, 11840 * update ipf to point to the next mblk (or NULL 11841 * if none). 11842 */ 11843 if (ipf->ipf_mp->b_cont == mp) 11844 ipf->ipf_mp->b_cont = next_mp; 11845 freeb(mp); 11846 continue; 11847 } 11848 mp->b_cont = NULL; 11849 IP_REASS_SET_START(mp, start); 11850 IP_REASS_SET_END(mp, end); 11851 if (!ipf->ipf_tail_mp) { 11852 ipf->ipf_tail_mp = mp; 11853 ipf->ipf_mp->b_cont = mp; 11854 if (start == 0 || !more) { 11855 ipf->ipf_hole_cnt = 1; 11856 /* 11857 * if the first fragment comes in more than one 11858 * mblk, this loop will be executed for each 11859 * mblk. Need to adjust hole count so exiting 11860 * this routine will leave hole count at 1. 
11861 */ 11862 if (next_mp) 11863 ipf->ipf_hole_cnt++; 11864 } else 11865 ipf->ipf_hole_cnt = 2; 11866 continue; 11867 } else if (ipf->ipf_last_frag_seen && !more && 11868 !pkt_boundary_checked) { 11869 /* 11870 * We check datagram boundary only if this fragment 11871 * claims to be the last fragment and we have seen a 11872 * last fragment in the past too. We do this only 11873 * once for a given fragment. 11874 * 11875 * start cannot be 0 here as fragments with start=0 11876 * and MF=0 gets handled as a complete packet. These 11877 * fragments should not reach here. 11878 */ 11879 11880 if (start + msgdsize(mp) != 11881 IP_REASS_END(ipf->ipf_tail_mp)) { 11882 /* 11883 * We have two fragments both of which claim 11884 * to be the last fragment but gives conflicting 11885 * information about the whole datagram size. 11886 * Something fishy is going on. Drop the 11887 * fragment and free up the reassembly list. 11888 */ 11889 return (IP_REASS_FAILED); 11890 } 11891 11892 /* 11893 * We shouldn't come to this code block again for this 11894 * particular fragment. 11895 */ 11896 pkt_boundary_checked = B_TRUE; 11897 } 11898 11899 /* New stuff at or beyond tail? */ 11900 offset = IP_REASS_END(ipf->ipf_tail_mp); 11901 if (start >= offset) { 11902 if (ipf->ipf_last_frag_seen) { 11903 /* current fragment is beyond last fragment */ 11904 return (IP_REASS_FAILED); 11905 } 11906 /* Link it on end. */ 11907 ipf->ipf_tail_mp->b_cont = mp; 11908 ipf->ipf_tail_mp = mp; 11909 if (more) { 11910 if (start != offset) 11911 ipf->ipf_hole_cnt++; 11912 } else if (start == offset && next_mp == NULL) 11913 ipf->ipf_hole_cnt--; 11914 continue; 11915 } 11916 mp1 = ipf->ipf_mp->b_cont; 11917 offset = IP_REASS_START(mp1); 11918 /* New stuff at the front? */ 11919 if (start < offset) { 11920 if (start == 0) { 11921 if (end >= offset) { 11922 /* Nailed the hole at the begining. 
*/ 11923 ipf->ipf_hole_cnt--; 11924 } 11925 } else if (end < offset) { 11926 /* 11927 * A hole, stuff, and a hole where there used 11928 * to be just a hole. 11929 */ 11930 ipf->ipf_hole_cnt++; 11931 } 11932 mp->b_cont = mp1; 11933 /* Check for overlap. */ 11934 while (end > offset) { 11935 if (end < IP_REASS_END(mp1)) { 11936 mp->b_wptr -= end - offset; 11937 IP_REASS_SET_END(mp, offset); 11938 BUMP_MIB(ill->ill_ip_mib, 11939 ipIfStatsReasmPartDups); 11940 break; 11941 } 11942 /* Did we cover another hole? */ 11943 if ((mp1->b_cont && 11944 IP_REASS_END(mp1) != 11945 IP_REASS_START(mp1->b_cont) && 11946 end >= IP_REASS_START(mp1->b_cont)) || 11947 (!ipf->ipf_last_frag_seen && !more)) { 11948 ipf->ipf_hole_cnt--; 11949 } 11950 /* Clip out mp1. */ 11951 if ((mp->b_cont = mp1->b_cont) == NULL) { 11952 /* 11953 * After clipping out mp1, this guy 11954 * is now hanging off the end. 11955 */ 11956 ipf->ipf_tail_mp = mp; 11957 } 11958 IP_REASS_SET_START(mp1, 0); 11959 IP_REASS_SET_END(mp1, 0); 11960 /* Subtract byte count */ 11961 ipf->ipf_count -= mp1->b_datap->db_lim - 11962 mp1->b_datap->db_base; 11963 freeb(mp1); 11964 BUMP_MIB(ill->ill_ip_mib, 11965 ipIfStatsReasmPartDups); 11966 mp1 = mp->b_cont; 11967 if (!mp1) 11968 break; 11969 offset = IP_REASS_START(mp1); 11970 } 11971 ipf->ipf_mp->b_cont = mp; 11972 continue; 11973 } 11974 /* 11975 * The new piece starts somewhere between the start of the head 11976 * and before the end of the tail. 11977 */ 11978 for (; mp1; mp1 = mp1->b_cont) { 11979 offset = IP_REASS_END(mp1); 11980 if (start < offset) { 11981 if (end <= offset) { 11982 /* Nothing new. 
*/ 11983 IP_REASS_SET_START(mp, 0); 11984 IP_REASS_SET_END(mp, 0); 11985 /* Subtract byte count */ 11986 ipf->ipf_count -= mp->b_datap->db_lim - 11987 mp->b_datap->db_base; 11988 if (incr_dups) { 11989 ipf->ipf_num_dups++; 11990 incr_dups = B_FALSE; 11991 } 11992 freeb(mp); 11993 BUMP_MIB(ill->ill_ip_mib, 11994 ipIfStatsReasmDuplicates); 11995 break; 11996 } 11997 /* 11998 * Trim redundant stuff off beginning of new 11999 * piece. 12000 */ 12001 IP_REASS_SET_START(mp, offset); 12002 mp->b_rptr += offset - start; 12003 BUMP_MIB(ill->ill_ip_mib, 12004 ipIfStatsReasmPartDups); 12005 start = offset; 12006 if (!mp1->b_cont) { 12007 /* 12008 * After trimming, this guy is now 12009 * hanging off the end. 12010 */ 12011 mp1->b_cont = mp; 12012 ipf->ipf_tail_mp = mp; 12013 if (!more) { 12014 ipf->ipf_hole_cnt--; 12015 } 12016 break; 12017 } 12018 } 12019 if (start >= IP_REASS_START(mp1->b_cont)) 12020 continue; 12021 /* Fill a hole */ 12022 if (start > offset) 12023 ipf->ipf_hole_cnt++; 12024 mp->b_cont = mp1->b_cont; 12025 mp1->b_cont = mp; 12026 mp1 = mp->b_cont; 12027 offset = IP_REASS_START(mp1); 12028 if (end >= offset) { 12029 ipf->ipf_hole_cnt--; 12030 /* Check for overlap. */ 12031 while (end > offset) { 12032 if (end < IP_REASS_END(mp1)) { 12033 mp->b_wptr -= end - offset; 12034 IP_REASS_SET_END(mp, offset); 12035 /* 12036 * TODO we might bump 12037 * this up twice if there is 12038 * overlap at both ends. 12039 */ 12040 BUMP_MIB(ill->ill_ip_mib, 12041 ipIfStatsReasmPartDups); 12042 break; 12043 } 12044 /* Did we cover another hole? */ 12045 if ((mp1->b_cont && 12046 IP_REASS_END(mp1) 12047 != IP_REASS_START(mp1->b_cont) && 12048 end >= 12049 IP_REASS_START(mp1->b_cont)) || 12050 (!ipf->ipf_last_frag_seen && 12051 !more)) { 12052 ipf->ipf_hole_cnt--; 12053 } 12054 /* Clip out mp1. */ 12055 if ((mp->b_cont = mp1->b_cont) == 12056 NULL) { 12057 /* 12058 * After clipping out mp1, 12059 * this guy is now hanging 12060 * off the end. 
12061 */ 12062 ipf->ipf_tail_mp = mp; 12063 } 12064 IP_REASS_SET_START(mp1, 0); 12065 IP_REASS_SET_END(mp1, 0); 12066 /* Subtract byte count */ 12067 ipf->ipf_count -= 12068 mp1->b_datap->db_lim - 12069 mp1->b_datap->db_base; 12070 freeb(mp1); 12071 BUMP_MIB(ill->ill_ip_mib, 12072 ipIfStatsReasmPartDups); 12073 mp1 = mp->b_cont; 12074 if (!mp1) 12075 break; 12076 offset = IP_REASS_START(mp1); 12077 } 12078 } 12079 break; 12080 } 12081 } while (start = end, mp = next_mp); 12082 12083 /* Fragment just processed could be the last one. Remember this fact */ 12084 if (!more) 12085 ipf->ipf_last_frag_seen = B_TRUE; 12086 12087 /* Still got holes? */ 12088 if (ipf->ipf_hole_cnt) 12089 return (IP_REASS_PARTIAL); 12090 /* Clean up overloaded fields to avoid upstream disasters. */ 12091 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 12092 IP_REASS_SET_START(mp1, 0); 12093 IP_REASS_SET_END(mp1, 0); 12094 } 12095 return (IP_REASS_COMPLETE); 12096 } 12097 12098 /* 12099 * ipsec processing for the fast path, used for input UDP Packets 12100 * Returns true if ready for passup to UDP. 12101 * Return false if packet is not passable to UDP (e.g. it failed IPsec policy, 12102 * was an ESP-in-UDP packet, etc.). 
12103 */ 12104 static boolean_t 12105 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 12106 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) 12107 { 12108 uint32_t ill_index; 12109 uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ 12110 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 12111 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 12112 udp_t *udp = connp->conn_udp; 12113 12114 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12115 /* The ill_index of the incoming ILL */ 12116 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 12117 12118 /* pass packet up to the transport */ 12119 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 12120 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 12121 NULL, mctl_present); 12122 if (*first_mpp == NULL) { 12123 return (B_FALSE); 12124 } 12125 } 12126 12127 /* Initiate IPPF processing for fastpath UDP */ 12128 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 12129 ip_process(IPP_LOCAL_IN, mpp, ill_index); 12130 if (*mpp == NULL) { 12131 ip2dbg(("ip_input_ipsec_process: UDP pkt " 12132 "deferred/dropped during IPPF processing\n")); 12133 return (B_FALSE); 12134 } 12135 } 12136 /* 12137 * Remove 0-spi if it's 0, or move everything behind 12138 * the UDP header over it and forward to ESP via 12139 * ip_proto_input(). 12140 */ 12141 if (udp->udp_nat_t_endpoint) { 12142 if (mctl_present) { 12143 /* mctl_present *shouldn't* happen. */ 12144 ip_drop_packet(*first_mpp, B_TRUE, NULL, 12145 NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), 12146 &ipss->ipsec_dropper); 12147 *first_mpp = NULL; 12148 return (B_FALSE); 12149 } 12150 12151 /* "ill" is "recv_ill" in actuality. */ 12152 if (!zero_spi_check(q, *mpp, ire, ill, ipss)) 12153 return (B_FALSE); 12154 12155 /* Else continue like a normal UDP packet. 
*/ 12156 } 12157 12158 /* 12159 * We make the checks as below since we are in the fast path 12160 * and want to minimize the number of checks if the IP_RECVIF and/or 12161 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set 12162 */ 12163 if (connp->conn_recvif || connp->conn_recvslla || 12164 connp->conn_ip_recvpktinfo) { 12165 if (connp->conn_recvif) { 12166 in_flags = IPF_RECVIF; 12167 } 12168 /* 12169 * UDP supports IP_RECVPKTINFO option for both v4 and v6 12170 * so the flag passed to ip_add_info is based on IP version 12171 * of connp. 12172 */ 12173 if (connp->conn_ip_recvpktinfo) { 12174 if (connp->conn_af_isv6) { 12175 /* 12176 * V6 only needs index 12177 */ 12178 in_flags |= IPF_RECVIF; 12179 } else { 12180 /* 12181 * V4 needs index + matching address. 12182 */ 12183 in_flags |= IPF_RECVADDR; 12184 } 12185 } 12186 if (connp->conn_recvslla) { 12187 in_flags |= IPF_RECVSLLA; 12188 } 12189 /* 12190 * since in_flags are being set ill will be 12191 * referenced in ip_add_info, so it better not 12192 * be NULL. 12193 */ 12194 /* 12195 * the actual data will be contained in b_cont 12196 * upon successful return of the following call. 12197 * If the call fails then the original mblk is 12198 * returned. 12199 */ 12200 *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), 12201 ipst); 12202 } 12203 12204 return (B_TRUE); 12205 } 12206 12207 /* 12208 * Fragmentation reassembly. Each ILL has a hash table for 12209 * queuing packets undergoing reassembly for all IPIFs 12210 * associated with the ILL. The hash is based on the packet 12211 * IP ident field. The ILL frag hash table was allocated 12212 * as a timer block at the time the ILL was created. Whenever 12213 * there is anything on the reassembly queue, the timer will 12214 * be running. Returns B_TRUE if successful else B_FALSE; 12215 * frees mp on failure. 
12216 */ 12217 static boolean_t 12218 ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, 12219 uint32_t *cksum_val, uint16_t *cksum_flags) 12220 { 12221 uint32_t frag_offset_flags; 12222 ill_t *ill = (ill_t *)q->q_ptr; 12223 mblk_t *mp = *mpp; 12224 mblk_t *t_mp; 12225 ipaddr_t dst; 12226 uint8_t proto = ipha->ipha_protocol; 12227 uint32_t sum_val; 12228 uint16_t sum_flags; 12229 ipf_t *ipf; 12230 ipf_t **ipfp; 12231 ipfb_t *ipfb; 12232 uint16_t ident; 12233 uint32_t offset; 12234 ipaddr_t src; 12235 uint_t hdr_length; 12236 uint32_t end; 12237 mblk_t *mp1; 12238 mblk_t *tail_mp; 12239 size_t count; 12240 size_t msg_len; 12241 uint8_t ecn_info = 0; 12242 uint32_t packet_size; 12243 boolean_t pruned = B_FALSE; 12244 ip_stack_t *ipst = ill->ill_ipst; 12245 12246 if (cksum_val != NULL) 12247 *cksum_val = 0; 12248 if (cksum_flags != NULL) 12249 *cksum_flags = 0; 12250 12251 /* 12252 * Drop the fragmented as early as possible, if 12253 * we don't have resource(s) to re-assemble. 12254 */ 12255 if (ipst->ips_ip_reass_queue_bytes == 0) { 12256 freemsg(mp); 12257 return (B_FALSE); 12258 } 12259 12260 /* Check for fragmentation offset; return if there's none */ 12261 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & 12262 (IPH_MF | IPH_OFFSET)) == 0) 12263 return (B_TRUE); 12264 12265 /* 12266 * We utilize hardware computed checksum info only for UDP since 12267 * IP fragmentation is a normal occurence for the protocol. In 12268 * addition, checksum offload support for IP fragments carrying 12269 * UDP payload is commonly implemented across network adapters. 
12270 */ 12271 ASSERT(ill != NULL); 12272 if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && 12273 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { 12274 mblk_t *mp1 = mp->b_cont; 12275 int32_t len; 12276 12277 /* Record checksum information from the packet */ 12278 sum_val = (uint32_t)DB_CKSUM16(mp); 12279 sum_flags = DB_CKSUMFLAGS(mp); 12280 12281 /* IP payload offset from beginning of mblk */ 12282 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; 12283 12284 if ((sum_flags & HCK_PARTIALCKSUM) && 12285 (mp1 == NULL || mp1->b_cont == NULL) && 12286 offset >= DB_CKSUMSTART(mp) && 12287 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { 12288 uint32_t adj; 12289 /* 12290 * Partial checksum has been calculated by hardware 12291 * and attached to the packet; in addition, any 12292 * prepended extraneous data is even byte aligned. 12293 * If any such data exists, we adjust the checksum; 12294 * this would also handle any postpended data. 12295 */ 12296 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), 12297 mp, mp1, len, adj); 12298 12299 /* One's complement subtract extraneous checksum */ 12300 if (adj >= sum_val) 12301 sum_val = ~(adj - sum_val) & 0xFFFF; 12302 else 12303 sum_val -= adj; 12304 } 12305 } else { 12306 sum_val = 0; 12307 sum_flags = 0; 12308 } 12309 12310 /* Clear hardware checksumming flag */ 12311 DB_CKSUMFLAGS(mp) = 0; 12312 12313 ident = ipha->ipha_ident; 12314 offset = (frag_offset_flags << 3) & 0xFFFF; 12315 src = ipha->ipha_src; 12316 dst = ipha->ipha_dst; 12317 hdr_length = IPH_HDR_LENGTH(ipha); 12318 end = ntohs(ipha->ipha_length) - hdr_length; 12319 12320 /* If end == 0 then we have a packet with no data, so just free it */ 12321 if (end == 0) { 12322 freemsg(mp); 12323 return (B_FALSE); 12324 } 12325 12326 /* Record the ECN field info. 
*/ 12327 ecn_info = (ipha->ipha_type_of_service & 0x3); 12328 if (offset != 0) { 12329 /* 12330 * If this isn't the first piece, strip the header, and 12331 * add the offset to the end value. 12332 */ 12333 mp->b_rptr += hdr_length; 12334 end += offset; 12335 } 12336 12337 msg_len = MBLKSIZE(mp); 12338 tail_mp = mp; 12339 while (tail_mp->b_cont != NULL) { 12340 tail_mp = tail_mp->b_cont; 12341 msg_len += MBLKSIZE(tail_mp); 12342 } 12343 12344 /* If the reassembly list for this ILL will get too big, prune it */ 12345 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= 12346 ipst->ips_ip_reass_queue_bytes) { 12347 ill_frag_prune(ill, 12348 (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : 12349 (ipst->ips_ip_reass_queue_bytes - msg_len)); 12350 pruned = B_TRUE; 12351 } 12352 12353 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; 12354 mutex_enter(&ipfb->ipfb_lock); 12355 12356 ipfp = &ipfb->ipfb_ipf; 12357 /* Try to find an existing fragment queue for this packet. */ 12358 for (;;) { 12359 ipf = ipfp[0]; 12360 if (ipf != NULL) { 12361 /* 12362 * It has to match on ident and src/dst address. 12363 */ 12364 if (ipf->ipf_ident == ident && 12365 ipf->ipf_src == src && 12366 ipf->ipf_dst == dst && 12367 ipf->ipf_protocol == proto) { 12368 /* 12369 * If we have received too many 12370 * duplicate fragments for this packet 12371 * free it. 12372 */ 12373 if (ipf->ipf_num_dups > ip_max_frag_dups) { 12374 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12375 freemsg(mp); 12376 mutex_exit(&ipfb->ipfb_lock); 12377 return (B_FALSE); 12378 } 12379 /* Found it. */ 12380 break; 12381 } 12382 ipfp = &ipf->ipf_hash_next; 12383 continue; 12384 } 12385 12386 /* 12387 * If we pruned the list, do we want to store this new 12388 * fragment?. We apply an optimization here based on the 12389 * fact that most fragments will be received in order. 12390 * So if the offset of this incoming fragment is zero, 12391 * it is the first fragment of a new packet. We will 12392 * keep it. 
Otherwise drop the fragment, as we have 12393 * probably pruned the packet already (since the 12394 * packet cannot be found). 12395 */ 12396 if (pruned && offset != 0) { 12397 mutex_exit(&ipfb->ipfb_lock); 12398 freemsg(mp); 12399 return (B_FALSE); 12400 } 12401 12402 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { 12403 /* 12404 * Too many fragmented packets in this hash 12405 * bucket. Free the oldest. 12406 */ 12407 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); 12408 } 12409 12410 /* New guy. Allocate a frag message. */ 12411 mp1 = allocb(sizeof (*ipf), BPRI_MED); 12412 if (mp1 == NULL) { 12413 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12414 freemsg(mp); 12415 reass_done: 12416 mutex_exit(&ipfb->ipfb_lock); 12417 return (B_FALSE); 12418 } 12419 12420 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); 12421 mp1->b_cont = mp; 12422 12423 /* Initialize the fragment header. */ 12424 ipf = (ipf_t *)mp1->b_rptr; 12425 ipf->ipf_mp = mp1; 12426 ipf->ipf_ptphn = ipfp; 12427 ipfp[0] = ipf; 12428 ipf->ipf_hash_next = NULL; 12429 ipf->ipf_ident = ident; 12430 ipf->ipf_protocol = proto; 12431 ipf->ipf_src = src; 12432 ipf->ipf_dst = dst; 12433 ipf->ipf_nf_hdr_len = 0; 12434 /* Record reassembly start time. */ 12435 ipf->ipf_timestamp = gethrestime_sec(); 12436 /* Record ipf generation and account for frag header */ 12437 ipf->ipf_gen = ill->ill_ipf_gen++; 12438 ipf->ipf_count = MBLKSIZE(mp1); 12439 ipf->ipf_last_frag_seen = B_FALSE; 12440 ipf->ipf_ecn = ecn_info; 12441 ipf->ipf_num_dups = 0; 12442 ipfb->ipfb_frag_pkts++; 12443 ipf->ipf_checksum = 0; 12444 ipf->ipf_checksum_flags = 0; 12445 12446 /* Store checksum value in fragment header */ 12447 if (sum_flags != 0) { 12448 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12449 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12450 ipf->ipf_checksum = sum_val; 12451 ipf->ipf_checksum_flags = sum_flags; 12452 } 12453 12454 /* 12455 * We handle reassembly two ways. 
In the easy case, 12456 * where all the fragments show up in order, we do 12457 * minimal bookkeeping, and just clip new pieces on 12458 * the end. If we ever see a hole, then we go off 12459 * to ip_reassemble which has to mark the pieces and 12460 * keep track of the number of holes, etc. Obviously, 12461 * the point of having both mechanisms is so we can 12462 * handle the easy case as efficiently as possible. 12463 */ 12464 if (offset == 0) { 12465 /* Easy case, in-order reassembly so far. */ 12466 ipf->ipf_count += msg_len; 12467 ipf->ipf_tail_mp = tail_mp; 12468 /* 12469 * Keep track of next expected offset in 12470 * ipf_end. 12471 */ 12472 ipf->ipf_end = end; 12473 ipf->ipf_nf_hdr_len = hdr_length; 12474 } else { 12475 /* Hard case, hole at the beginning. */ 12476 ipf->ipf_tail_mp = NULL; 12477 /* 12478 * ipf_end == 0 means that we have given up 12479 * on easy reassembly. 12480 */ 12481 ipf->ipf_end = 0; 12482 12483 /* Forget checksum offload from now on */ 12484 ipf->ipf_checksum_flags = 0; 12485 12486 /* 12487 * ipf_hole_cnt is set by ip_reassemble. 12488 * ipf_count is updated by ip_reassemble. 12489 * No need to check for return value here 12490 * as we don't expect reassembly to complete 12491 * or fail for the first fragment itself. 12492 */ 12493 (void) ip_reassemble(mp, ipf, 12494 (frag_offset_flags & IPH_OFFSET) << 3, 12495 (frag_offset_flags & IPH_MF), ill, msg_len); 12496 } 12497 /* Update per ipfb and ill byte counts */ 12498 ipfb->ipfb_count += ipf->ipf_count; 12499 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12500 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count); 12501 /* If the frag timer wasn't already going, start it. 
*/ 12502 mutex_enter(&ill->ill_lock); 12503 ill_frag_timer_start(ill); 12504 mutex_exit(&ill->ill_lock); 12505 goto reass_done; 12506 } 12507 12508 /* 12509 * If the packet's flag has changed (it could be coming up 12510 * from an interface different than the previous, therefore 12511 * possibly different checksum capability), then forget about 12512 * any stored checksum states. Otherwise add the value to 12513 * the existing one stored in the fragment header. 12514 */ 12515 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { 12516 sum_val += ipf->ipf_checksum; 12517 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12518 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); 12519 ipf->ipf_checksum = sum_val; 12520 } else if (ipf->ipf_checksum_flags != 0) { 12521 /* Forget checksum offload from now on */ 12522 ipf->ipf_checksum_flags = 0; 12523 } 12524 12525 /* 12526 * We have a new piece of a datagram which is already being 12527 * reassembled. Update the ECN info if all IP fragments 12528 * are ECN capable. If there is one which is not, clear 12529 * all the info. If there is at least one which has CE 12530 * code point, IP needs to report that up to transport. 12531 */ 12532 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { 12533 if (ecn_info == IPH_ECN_CE) 12534 ipf->ipf_ecn = IPH_ECN_CE; 12535 } else { 12536 ipf->ipf_ecn = IPH_ECN_NECT; 12537 } 12538 if (offset && ipf->ipf_end == offset) { 12539 /* The new fragment fits at the end */ 12540 ipf->ipf_tail_mp->b_cont = mp; 12541 /* Update the byte count */ 12542 ipf->ipf_count += msg_len; 12543 /* Update per ipfb and ill byte counts */ 12544 ipfb->ipfb_count += msg_len; 12545 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12546 atomic_add_32(&ill->ill_frag_count, msg_len); 12547 if (frag_offset_flags & IPH_MF) { 12548 /* More to come. */ 12549 ipf->ipf_end = end; 12550 ipf->ipf_tail_mp = tail_mp; 12551 goto reass_done; 12552 } 12553 } else { 12554 /* Go do the hard cases. 
*/ 12555 int ret; 12556 12557 if (offset == 0) 12558 ipf->ipf_nf_hdr_len = hdr_length; 12559 12560 /* Save current byte count */ 12561 count = ipf->ipf_count; 12562 ret = ip_reassemble(mp, ipf, 12563 (frag_offset_flags & IPH_OFFSET) << 3, 12564 (frag_offset_flags & IPH_MF), ill, msg_len); 12565 /* Count of bytes added and subtracted (freeb()ed) */ 12566 count = ipf->ipf_count - count; 12567 if (count) { 12568 /* Update per ipfb and ill byte counts */ 12569 ipfb->ipfb_count += count; 12570 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ 12571 atomic_add_32(&ill->ill_frag_count, count); 12572 } 12573 if (ret == IP_REASS_PARTIAL) { 12574 goto reass_done; 12575 } else if (ret == IP_REASS_FAILED) { 12576 /* Reassembly failed. Free up all resources */ 12577 ill_frag_free_pkts(ill, ipfb, ipf, 1); 12578 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { 12579 IP_REASS_SET_START(t_mp, 0); 12580 IP_REASS_SET_END(t_mp, 0); 12581 } 12582 freemsg(mp); 12583 goto reass_done; 12584 } 12585 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ 12586 } 12587 /* 12588 * We have completed reassembly. Unhook the frag header from 12589 * the reassembly list. 12590 * 12591 * Before we free the frag header, record the ECN info 12592 * to report back to the transport. 12593 */ 12594 ecn_info = ipf->ipf_ecn; 12595 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); 12596 ipfp = ipf->ipf_ptphn; 12597 12598 /* We need to supply these to caller */ 12599 if ((sum_flags = ipf->ipf_checksum_flags) != 0) 12600 sum_val = ipf->ipf_checksum; 12601 else 12602 sum_val = 0; 12603 12604 mp1 = ipf->ipf_mp; 12605 count = ipf->ipf_count; 12606 ipf = ipf->ipf_hash_next; 12607 if (ipf != NULL) 12608 ipf->ipf_ptphn = ipfp; 12609 ipfp[0] = ipf; 12610 atomic_add_32(&ill->ill_frag_count, -count); 12611 ASSERT(ipfb->ipfb_count >= count); 12612 ipfb->ipfb_count -= count; 12613 ipfb->ipfb_frag_pkts--; 12614 mutex_exit(&ipfb->ipfb_lock); 12615 /* Ditch the frag header. 
*/ 12616 mp = mp1->b_cont; 12617 12618 freeb(mp1); 12619 12620 /* Restore original IP length in header. */ 12621 packet_size = (uint32_t)msgdsize(mp); 12622 if (packet_size > IP_MAXPACKET) { 12623 freemsg(mp); 12624 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 12625 return (B_FALSE); 12626 } 12627 12628 if (DB_REF(mp) > 1) { 12629 mblk_t *mp2 = copymsg(mp); 12630 12631 freemsg(mp); 12632 if (mp2 == NULL) { 12633 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 12634 return (B_FALSE); 12635 } 12636 mp = mp2; 12637 } 12638 ipha = (ipha_t *)mp->b_rptr; 12639 12640 ipha->ipha_length = htons((uint16_t)packet_size); 12641 /* We're now complete, zip the frag state */ 12642 ipha->ipha_fragment_offset_and_flags = 0; 12643 /* Record the ECN info. */ 12644 ipha->ipha_type_of_service &= 0xFC; 12645 ipha->ipha_type_of_service |= ecn_info; 12646 *mpp = mp; 12647 12648 /* Reassembly is successful; return checksum information if needed */ 12649 if (cksum_val != NULL) 12650 *cksum_val = sum_val; 12651 if (cksum_flags != NULL) 12652 *cksum_flags = sum_flags; 12653 12654 return (B_TRUE); 12655 } 12656 12657 /* 12658 * Perform ip header check sum update local options. 12659 * return B_TRUE if all is well, else return B_FALSE and release 12660 * the mp. caller is responsible for decrementing ire ref cnt. 12661 */ 12662 static boolean_t 12663 ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12664 ip_stack_t *ipst) 12665 { 12666 mblk_t *first_mp; 12667 boolean_t mctl_present; 12668 uint16_t sum; 12669 12670 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12671 /* 12672 * Don't do the checksum if it has gone through AH/ESP 12673 * processing. 
12674 */ 12675 if (!mctl_present) { 12676 sum = ip_csum_hdr(ipha); 12677 if (sum != 0) { 12678 if (ill != NULL) { 12679 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 12680 } else { 12681 BUMP_MIB(&ipst->ips_ip_mib, 12682 ipIfStatsInCksumErrs); 12683 } 12684 freemsg(first_mp); 12685 return (B_FALSE); 12686 } 12687 } 12688 12689 if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) { 12690 if (mctl_present) 12691 freeb(first_mp); 12692 return (B_FALSE); 12693 } 12694 12695 return (B_TRUE); 12696 } 12697 12698 /* 12699 * All udp packet are delivered to the local host via this routine. 12700 */ 12701 void 12702 ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 12703 ill_t *recv_ill) 12704 { 12705 uint32_t sum; 12706 uint32_t u1; 12707 boolean_t mctl_present; 12708 conn_t *connp; 12709 mblk_t *first_mp; 12710 uint16_t *up; 12711 ill_t *ill = (ill_t *)q->q_ptr; 12712 uint16_t reass_hck_flags = 0; 12713 ip_stack_t *ipst; 12714 12715 ASSERT(recv_ill != NULL); 12716 ipst = recv_ill->ill_ipst; 12717 12718 #define rptr ((uchar_t *)ipha) 12719 12720 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12721 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 12722 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 12723 ASSERT(ill != NULL); 12724 12725 /* 12726 * FAST PATH for udp packets 12727 */ 12728 12729 /* u1 is # words of IP options */ 12730 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 12731 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12732 12733 /* IP options present */ 12734 if (u1 != 0) 12735 goto ipoptions; 12736 12737 /* Check the IP header checksum. */ 12738 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12739 /* Clear the IP header h/w cksum flag */ 12740 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12741 } else if (!mctl_present) { 12742 /* 12743 * Don't verify header checksum if this packet is coming 12744 * back from AH/ESP as we already did it. 
/*
 * All udp packet are delivered to the local host via this routine.
 *
 * q		read queue of the interface; q->q_ptr is the ill used for
 *		MIB accounting and hardware checksum capability checks.
 * mp		the message; may start with a control mblk, which
 *		EXTRACT_PKT_MP() separates into first_mp.
 * ipha		IPv4 header of the datagram; protocol must be UDP.
 * ire		ire the packet matched; supplies zone id and broadcast type
 *		and has its inbound counters updated here.
 * recv_ill	ill passed to ip_udp_check() and ip_fanout_udp(); also the
 *		source of the ip_stack_t.
 *
 * The function has two halves.  The fast path handles unfragmented
 * datagrams without IP options whose headers are contiguous in the first
 * mblk: header and UDP checksums are verified, the conn_t is looked up
 * directly and the packet is handed to conn_recv.  Everything else
 * (options, fragments, short first mblk, broadcast, no matching conn)
 * goes through the slow path ending in ip_fanout_udp().
 *
 * The message is always consumed; nothing is returned.
 */
void
ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ill_t *recv_ill)
{
	/*
	 * NOTE(review): sum is assigned only by the software header
	 * checksum below or by ip_rput_fragment().  On the remaining paths
	 * it reaches IP_CKSUM_RECV_REASS unassigned, together with
	 * reass_hck_flags == 0 -- presumably the macro ignores it then;
	 * confirm against the macro definition.
	 */
	uint32_t	sum;
	uint32_t	u1;
	boolean_t	mctl_present;
	conn_t		*connp;
	mblk_t		*first_mp;
	uint16_t	*up;
	ill_t		*ill = (ill_t *)q->q_ptr;
	uint16_t	reass_hck_flags = 0;
	ip_stack_t	*ipst;

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

#define	rptr	((uchar_t *)ipha)

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
	ASSERT(ipha->ipha_protocol == IPPROTO_UDP);
	ASSERT(ill != NULL);

	/*
	 * FAST PATH for udp packets
	 */

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1 != 0)
		goto ipoptions;

	/* Check the IP header checksum. */
	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
		/* Clear the IP header h/w cksum flag */
		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
	} else if (!mctl_present) {
		/*
		 * Don't verify header checksum if this packet is coming
		 * back from AH/ESP as we already did it.
		 */
#define	uph	((uint16_t *)ipha)
		/* Sum the ten 16-bit words of the option-less header. */
		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
		    uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
		/* finish doing IP checksum */
		sum = (sum & 0xFFFF) + (sum >> 16);
		sum = ~(sum + (sum >> 16)) & 0xFFFF;
		if (sum != 0 && sum != 0xFFFF) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * Count for SNMP of inbound packets for ire.
	 * if mctl is present this might be a secure packet and
	 * has already been counted for in ip_proto_input().
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet? */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		goto fragmented;
	}

	/* u1 = IP header length (20 bytes) */
	u1 = IP_SIMPLE_HDR_LENGTH;

	/* packet does not contain complete IP & UDP headers */
	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
		goto udppullup;

	/* up points to UDP header */
	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
#define	iphs	((uint16_t *)ipha)

	/* if udp hdr cksum != 0, then need to checksum udp packet */
	if (up[3] != 0) {
		mblk_t *mp1 = mp->b_cont;
		boolean_t cksum_err;
		uint16_t hck_flags = 0;

		/*
		 * Pseudo-header checksum: source and destination address
		 * words (iphs[6..9]) plus the UDP length field (up[2]).
		 */
		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2];

		/*
		 * Revert to software checksum calculation if the interface
		 * isn't capable of checksum offload or if IPsec is present.
		 */
		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
			hck_flags = DB_CKSUMFLAGS(mp);

		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ipst, ip_in_sw_cksum);

		IP_CKSUM_RECV(hck_flags, u1,
		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
		    (int32_t)((uchar_t *)up - rptr),
		    mp, mp1, cksum_err);

		if (cksum_err) {
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
			if (hck_flags & HCK_FULLCKSUM)
				IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
			else if (hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ipst, ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			return;
		}
	}

	/* Non-fragmented broadcast or multicast packet? */
	if (ire->ire_type == IRE_BROADCAST)
		goto udpslowpath;

	if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
	    ire->ire_zoneid, ipst)) != NULL) {
		ASSERT(connp->conn_upq != NULL);
		IP_STAT(ipst, ip_udp_fast_path);

		if (CONN_UDP_FLOWCTLD(connp)) {
			/* Receiver is flow controlled: drop the data. */
			freemsg(mp);
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
		} else {
			if (!mctl_present) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsHCInDelivers);
			}
			/*
			 * mp and first_mp can change.
			 */
			if (ip_udp_check(q, connp, recv_ill,
			    ipha, &mp, &first_mp, mctl_present, ire)) {
				/* Send it upstream */
				(connp->conn_recv)(connp, mp, NULL);
			}
		}
		/*
		 * freeb() cannot deal with null mblk being passed
		 * in and first_mp can be set to null in the call
		 * ip_udp_check()->ipsec_check_inbound_policy().
		 */
		if (mctl_present && first_mp != NULL) {
			freeb(first_mp);
		}
		/* Release the conn reference taken by the lookup above. */
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * if we got here we know the packet is not fragmented and
	 * has no options.  The classifier could not find a conn_t and
	 * most likely it's an icmp packet so send it through slow path.
	 */

	goto udpslowpath;

	/*
	 * SLOW PATH.  Entered at "ipoptions" for headers with IP options;
	 * the "fragmented" and "udppullup" labels below sit inside if
	 * bodies and are jumped to directly from the fast path.
	 */
ipoptions:
	if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) {
		goto slow_done;
	}

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
		/*
		 * "sum" and "reass_hck_flags" are non-zero if the
		 * reassembled packet has a valid hardware computed
		 * checksum information associated with it.
		 */
		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
			goto slow_done;
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
		 */
		ASSERT(!mctl_present);
		ipha = (ipha_t *)mp->b_rptr;
		first_mp = mp;
	}

	/* Now we have a complete datagram, destined for this machine. */
	u1 = IPH_HDR_LENGTH(ipha);
	/* Pull up the UDP header, if necessary. */
	if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
		/* Fast-path entry: u1 is already the IP header length. */
udppullup:
		if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			goto slow_done;
		}
		/* Header location may have changed; reload it. */
		ipha = (ipha_t *)mp->b_rptr;
	}

	/*
	 * Validate the checksum for the reassembled packet; for the
	 * pullup case we calculate the payload checksum in software.
	 */
	up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
	if (up[3] != 0) {
		boolean_t cksum_err;

		if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ipst, ip_in_sw_cksum);

		IP_CKSUM_RECV_REASS(reass_hck_flags,
		    (int32_t)((uchar_t *)up - (uchar_t *)ipha),
		    IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2], sum, cksum_err);

		if (cksum_err) {
			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);

			if (reass_hck_flags & HCK_FULLCKSUM)
				IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
			else if (reass_hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ipst, ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			goto slow_done;
		}
	}
udpslowpath:

	/* Clear hardware checksum flag to be safe */
	DB_CKSUMFLAGS(mp) = 0;

	/* *(uint32_t *)up reads the source and destination ports together. */
	ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
	    (ire->ire_type == IRE_BROADCAST),
	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO,
	    mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);

	/* Counted for every slow-path exit, including drops. */
slow_done:
	IP_STAT(ipst, ip_udp_slow_path);
	return;

#undef	iphs
#undef	rptr
}
12983 #define rptr ((uchar_t *)ipha) 12984 12985 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12986 ASSERT(ill != NULL); 12987 12988 /* 12989 * FAST PATH for tcp packets 12990 */ 12991 12992 /* u1 is # words of IP options */ 12993 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12994 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12995 12996 /* IP options present */ 12997 if (u1) { 12998 goto ipoptions; 12999 } else if (!mctl_present) { 13000 /* Check the IP header checksum. */ 13001 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 13002 /* Clear the IP header h/w cksum flag */ 13003 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 13004 } else if (!mctl_present) { 13005 /* 13006 * Don't verify header checksum if this packet 13007 * is coming back from AH/ESP as we already did it. 13008 */ 13009 #define uph ((uint16_t *)ipha) 13010 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 13011 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 13012 #undef uph 13013 /* finish doing IP checksum */ 13014 sum = (sum & 0xFFFF) + (sum >> 16); 13015 sum = ~(sum + (sum >> 16)) & 0xFFFF; 13016 if (sum != 0 && sum != 0xFFFF) { 13017 BUMP_MIB(ill->ill_ip_mib, 13018 ipIfStatsInCksumErrs); 13019 goto error; 13020 } 13021 } 13022 } 13023 13024 if (!mctl_present) { 13025 UPDATE_IB_PKT_COUNT(ire); 13026 ire->ire_last_used_time = lbolt; 13027 } 13028 13029 /* packet part of fragmented IP packet? */ 13030 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13031 if (u1 & (IPH_MF | IPH_OFFSET)) { 13032 goto fragmented; 13033 } 13034 13035 /* u1 = IP header length (20 bytes) */ 13036 u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; 13037 13038 /* does packet contain IP+TCP headers? */ 13039 len = mp->b_wptr - rptr; 13040 if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { 13041 IP_STAT(ipst, ip_tcppullup); 13042 goto tcppullup; 13043 } 13044 13045 /* TCP options present? 
*/ 13046 offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; 13047 13048 /* 13049 * If options need to be pulled up, then goto tcpoptions. 13050 * otherwise we are still in the fast path 13051 */ 13052 if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { 13053 IP_STAT(ipst, ip_tcpoptions); 13054 goto tcpoptions; 13055 } 13056 13057 /* multiple mblks of tcp data? */ 13058 if ((mp1 = mp->b_cont) != NULL) { 13059 /* more then two? */ 13060 if (mp1->b_cont != NULL) { 13061 IP_STAT(ipst, ip_multipkttcp); 13062 goto multipkttcp; 13063 } 13064 len += mp1->b_wptr - mp1->b_rptr; 13065 } 13066 13067 up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); 13068 13069 /* part of pseudo checksum */ 13070 13071 /* TCP datagram length */ 13072 u1 = len - IP_SIMPLE_HDR_LENGTH; 13073 13074 #define iphs ((uint16_t *)ipha) 13075 13076 #ifdef _BIG_ENDIAN 13077 u1 += IPPROTO_TCP; 13078 #else 13079 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 13080 #endif 13081 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 13082 13083 /* 13084 * Revert to software checksum calculation if the interface 13085 * isn't capable of checksum offload or if IPsec is present. 
13086 */ 13087 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 13088 hck_flags = DB_CKSUMFLAGS(mp); 13089 13090 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 13091 IP_STAT(ipst, ip_in_sw_cksum); 13092 13093 IP_CKSUM_RECV(hck_flags, u1, 13094 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 13095 (int32_t)((uchar_t *)up - rptr), 13096 mp, mp1, cksum_err); 13097 13098 if (cksum_err) { 13099 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 13100 13101 if (hck_flags & HCK_FULLCKSUM) 13102 IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); 13103 else if (hck_flags & HCK_PARTIALCKSUM) 13104 IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); 13105 else 13106 IP_STAT(ipst, ip_tcp_in_sw_cksum_err); 13107 13108 goto error; 13109 } 13110 13111 try_again: 13112 13113 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, 13114 zoneid, ipst)) == NULL) { 13115 /* Send the TH_RST */ 13116 goto no_conn; 13117 } 13118 13119 /* 13120 * TCP FAST PATH for AF_INET socket. 13121 * 13122 * TCP fast path to avoid extra work. An AF_INET socket type 13123 * does not have facility to receive extra information via 13124 * ip_process or ip_add_info. Also, when the connection was 13125 * established, we made a check if this connection is impacted 13126 * by any global IPsec policy or per connection policy (a 13127 * policy that comes in effect later will not apply to this 13128 * connection). Since all this can be determined at the 13129 * connection establishment time, a quick check of flags 13130 * can avoid extra work. 
13131 */ 13132 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 13133 !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13134 ASSERT(first_mp == mp); 13135 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13136 SET_SQUEUE(mp, tcp_rput_data, connp); 13137 return (mp); 13138 } 13139 13140 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 13141 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 13142 if (IPCL_IS_TCP(connp)) { 13143 mp->b_datap->db_struioflag |= STRUIO_EAGER; 13144 DB_CKSUMSTART(mp) = 13145 (intptr_t)ip_squeue_get(ill_ring); 13146 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 13147 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13148 BUMP_MIB(ill->ill_ip_mib, 13149 ipIfStatsHCInDelivers); 13150 SET_SQUEUE(mp, connp->conn_recv, connp); 13151 return (mp); 13152 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 13153 !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { 13154 BUMP_MIB(ill->ill_ip_mib, 13155 ipIfStatsHCInDelivers); 13156 ip_squeue_enter_unbound++; 13157 SET_SQUEUE(mp, tcp_conn_request_unbound, 13158 connp); 13159 return (mp); 13160 } 13161 syn_present = B_TRUE; 13162 } 13163 13164 } 13165 13166 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 13167 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 13168 13169 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13170 /* No need to send this packet to TCP */ 13171 if ((flags & TH_RST) || (flags & TH_URG)) { 13172 CONN_DEC_REF(connp); 13173 freemsg(first_mp); 13174 return (NULL); 13175 } 13176 if (flags & TH_ACK) { 13177 tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, 13178 ipst->ips_netstack->netstack_tcp, connp); 13179 CONN_DEC_REF(connp); 13180 return (NULL); 13181 } 13182 13183 CONN_DEC_REF(connp); 13184 freemsg(first_mp); 13185 return (NULL); 13186 } 13187 13188 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { 13189 first_mp = ipsec_check_inbound_policy(first_mp, connp, 13190 ipha, NULL, mctl_present); 13191 if (first_mp == NULL) { 13192 
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13193 CONN_DEC_REF(connp); 13194 return (NULL); 13195 } 13196 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 13197 ASSERT(syn_present); 13198 if (mctl_present) { 13199 ASSERT(first_mp != mp); 13200 first_mp->b_datap->db_struioflag |= 13201 STRUIO_POLICY; 13202 } else { 13203 ASSERT(first_mp == mp); 13204 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 13205 mp->b_datap->db_struioflag |= STRUIO_POLICY; 13206 } 13207 } else { 13208 /* 13209 * Discard first_mp early since we're dealing with a 13210 * fully-connected conn_t and tcp doesn't do policy in 13211 * this case. 13212 */ 13213 if (mctl_present) { 13214 freeb(first_mp); 13215 mctl_present = B_FALSE; 13216 } 13217 first_mp = mp; 13218 } 13219 } 13220 13221 /* Initiate IPPF processing for fastpath */ 13222 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13223 uint32_t ill_index; 13224 13225 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13226 ip_process(IPP_LOCAL_IN, &mp, ill_index); 13227 if (mp == NULL) { 13228 ip2dbg(("ip_input_ipsec_process: TCP pkt " 13229 "deferred/dropped during IPPF processing\n")); 13230 CONN_DEC_REF(connp); 13231 if (mctl_present) 13232 freeb(first_mp); 13233 return (NULL); 13234 } else if (mctl_present) { 13235 /* 13236 * ip_process might return a new mp. 13237 */ 13238 ASSERT(first_mp != mp); 13239 first_mp->b_cont = mp; 13240 } else { 13241 first_mp = mp; 13242 } 13243 13244 } 13245 13246 if (!syn_present && connp->conn_ip_recvpktinfo) { 13247 /* 13248 * TCP does not support IP_RECVPKTINFO for v4 so lets 13249 * make sure IPF_RECVIF is passed to ip_add_info. 13250 */ 13251 mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, 13252 IPCL_ZONEID(connp), ipst); 13253 if (mp == NULL) { 13254 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13255 CONN_DEC_REF(connp); 13256 if (mctl_present) 13257 freeb(first_mp); 13258 return (NULL); 13259 } else if (mctl_present) { 13260 /* 13261 * ip_add_info might return a new mp. 
13262 */ 13263 ASSERT(first_mp != mp); 13264 first_mp->b_cont = mp; 13265 } else { 13266 first_mp = mp; 13267 } 13268 } 13269 13270 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13271 if (IPCL_IS_TCP(connp)) { 13272 SET_SQUEUE(first_mp, connp->conn_recv, connp); 13273 return (first_mp); 13274 } else { 13275 /* SOCK_RAW, IPPROTO_TCP case */ 13276 (connp->conn_recv)(connp, first_mp, NULL); 13277 CONN_DEC_REF(connp); 13278 return (NULL); 13279 } 13280 13281 no_conn: 13282 /* Initiate IPPf processing, if needed. */ 13283 if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { 13284 uint32_t ill_index; 13285 ill_index = recv_ill->ill_phyint->phyint_ifindex; 13286 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 13287 if (first_mp == NULL) { 13288 return (NULL); 13289 } 13290 } 13291 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13292 13293 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, 13294 ipst->ips_netstack->netstack_tcp, NULL); 13295 return (NULL); 13296 ipoptions: 13297 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { 13298 goto slow_done; 13299 } 13300 13301 UPDATE_IB_PKT_COUNT(ire); 13302 ire->ire_last_used_time = lbolt; 13303 13304 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13305 if (u1 & (IPH_MF | IPH_OFFSET)) { 13306 fragmented: 13307 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 13308 if (mctl_present) 13309 freeb(first_mp); 13310 goto slow_done; 13311 } 13312 /* 13313 * Make sure that first_mp points back to mp as 13314 * the mp we came in with could have changed in 13315 * ip_rput_fragment(). 13316 */ 13317 ASSERT(!mctl_present); 13318 ipha = (ipha_t *)mp->b_rptr; 13319 first_mp = mp; 13320 } 13321 13322 /* Now we have a complete datagram, destined for this machine. */ 13323 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 13324 13325 len = mp->b_wptr - mp->b_rptr; 13326 /* Pull up a minimal TCP header, if necessary. 
*/ 13327 if (len < (u1 + 20)) { 13328 tcppullup: 13329 if (!pullupmsg(mp, u1 + 20)) { 13330 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13331 goto error; 13332 } 13333 ipha = (ipha_t *)mp->b_rptr; 13334 len = mp->b_wptr - mp->b_rptr; 13335 } 13336 13337 /* 13338 * Extract the offset field from the TCP header. As usual, we 13339 * try to help the compiler more than the reader. 13340 */ 13341 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 13342 if (offset != 5) { 13343 tcpoptions: 13344 if (offset < 5) { 13345 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13346 goto error; 13347 } 13348 /* 13349 * There must be TCP options. 13350 * Make sure we can grab them. 13351 */ 13352 offset <<= 2; 13353 offset += u1; 13354 if (len < offset) { 13355 if (!pullupmsg(mp, offset)) { 13356 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13357 goto error; 13358 } 13359 ipha = (ipha_t *)mp->b_rptr; 13360 len = mp->b_wptr - rptr; 13361 } 13362 } 13363 13364 /* Get the total packet length in len, including headers. */ 13365 if (mp->b_cont) { 13366 multipkttcp: 13367 len = msgdsize(mp); 13368 } 13369 13370 /* 13371 * Check the TCP checksum by pulling together the pseudo- 13372 * header checksum, and passing it to ip_csum to be added in 13373 * with the TCP datagram. 13374 * 13375 * Since we are not using the hwcksum if available we must 13376 * clear the flag. We may come here via tcppullup or tcpoptions. 13377 * If either of these fails along the way the mblk is freed. 13378 * If this logic ever changes and mblk is reused to say send 13379 * ICMP's back, then this flag may need to be cleared in 13380 * other places as well. 13381 */ 13382 DB_CKSUMFLAGS(mp) = 0; 13383 13384 up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); 13385 13386 u1 = (uint32_t)(len - u1); /* TCP datagram length. 
*/ 13387 #ifdef _BIG_ENDIAN 13388 u1 += IPPROTO_TCP; 13389 #else 13390 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 13391 #endif 13392 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 13393 /* 13394 * Not M_DATA mblk or its a dup, so do the checksum now. 13395 */ 13396 IP_STAT(ipst, ip_in_sw_cksum); 13397 if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { 13398 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 13399 goto error; 13400 } 13401 13402 IP_STAT(ipst, ip_tcp_slow_path); 13403 goto try_again; 13404 #undef iphs 13405 #undef rptr 13406 13407 error: 13408 freemsg(first_mp); 13409 slow_done: 13410 return (NULL); 13411 } 13412 13413 /* ARGSUSED */ 13414 static void 13415 ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 13416 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) 13417 { 13418 conn_t *connp; 13419 uint32_t sum; 13420 uint32_t u1; 13421 ssize_t len; 13422 sctp_hdr_t *sctph; 13423 zoneid_t zoneid = ire->ire_zoneid; 13424 uint32_t pktsum; 13425 uint32_t calcsum; 13426 uint32_t ports; 13427 in6_addr_t map_src, map_dst; 13428 ill_t *ill = (ill_t *)q->q_ptr; 13429 ip_stack_t *ipst; 13430 sctp_stack_t *sctps; 13431 boolean_t sctp_csum_err = B_FALSE; 13432 13433 ASSERT(recv_ill != NULL); 13434 ipst = recv_ill->ill_ipst; 13435 sctps = ipst->ips_netstack->netstack_sctp; 13436 13437 #define rptr ((uchar_t *)ipha) 13438 13439 ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); 13440 ASSERT(ill != NULL); 13441 13442 /* u1 is # words of IP options */ 13443 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 13444 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 13445 13446 /* IP options present */ 13447 if (u1 > 0) { 13448 goto ipoptions; 13449 } else { 13450 /* Check the IP header checksum. 
*/ 13451 if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill) && 13452 !mctl_present) { 13453 #define uph ((uint16_t *)ipha) 13454 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 13455 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 13456 #undef uph 13457 /* finish doing IP checksum */ 13458 sum = (sum & 0xFFFF) + (sum >> 16); 13459 sum = ~(sum + (sum >> 16)) & 0xFFFF; 13460 /* 13461 * Don't verify header checksum if this packet 13462 * is coming back from AH/ESP as we already did it. 13463 */ 13464 if (sum != 0 && sum != 0xFFFF) { 13465 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 13466 goto error; 13467 } 13468 } 13469 /* 13470 * Since there is no SCTP h/w cksum support yet, just 13471 * clear the flag. 13472 */ 13473 DB_CKSUMFLAGS(mp) = 0; 13474 } 13475 13476 /* 13477 * Don't verify header checksum if this packet is coming 13478 * back from AH/ESP as we already did it. 13479 */ 13480 if (!mctl_present) { 13481 UPDATE_IB_PKT_COUNT(ire); 13482 ire->ire_last_used_time = lbolt; 13483 } 13484 13485 /* packet part of fragmented IP packet? */ 13486 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13487 if (u1 & (IPH_MF | IPH_OFFSET)) 13488 goto fragmented; 13489 13490 /* u1 = IP header length (20 bytes) */ 13491 u1 = IP_SIMPLE_HDR_LENGTH; 13492 13493 find_sctp_client: 13494 /* Pullup if we don't have the sctp common header. 
*/ 13495 len = MBLKL(mp); 13496 if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { 13497 if (mp->b_cont == NULL || 13498 !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { 13499 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13500 goto error; 13501 } 13502 ipha = (ipha_t *)mp->b_rptr; 13503 len = MBLKL(mp); 13504 } 13505 13506 sctph = (sctp_hdr_t *)(rptr + u1); 13507 #ifdef DEBUG 13508 if (!skip_sctp_cksum) { 13509 #endif 13510 pktsum = sctph->sh_chksum; 13511 sctph->sh_chksum = 0; 13512 calcsum = sctp_cksum(mp, u1); 13513 sctph->sh_chksum = pktsum; 13514 if (calcsum != pktsum) 13515 sctp_csum_err = B_TRUE; 13516 #ifdef DEBUG /* skip_sctp_cksum */ 13517 } 13518 #endif 13519 /* get the ports */ 13520 ports = *(uint32_t *)&sctph->sh_sport; 13521 13522 IRE_REFRELE(ire); 13523 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 13524 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 13525 if (sctp_csum_err) { 13526 /* 13527 * No potential sctp checksum errors go to the Sun 13528 * sctp stack however they might be Adler-32 summed 13529 * packets a userland stack bound to a raw IP socket 13530 * could reasonably use. Note though that Adler-32 is 13531 * a long deprecated algorithm and customer sctp 13532 * networks should eventually migrate to CRC-32 at 13533 * which time this facility should be removed. 
13534 */ 13535 flags |= IP_FF_SCTP_CSUM_ERR; 13536 goto no_conn; 13537 } 13538 if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, 13539 sctps)) == NULL) { 13540 /* Check for raw socket or OOTB handling */ 13541 goto no_conn; 13542 } 13543 13544 /* Found a client; up it goes */ 13545 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 13546 sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); 13547 return; 13548 13549 no_conn: 13550 ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, 13551 ports, mctl_present, flags, B_TRUE, zoneid); 13552 return; 13553 13554 ipoptions: 13555 DB_CKSUMFLAGS(mp) = 0; 13556 if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) 13557 goto slow_done; 13558 13559 UPDATE_IB_PKT_COUNT(ire); 13560 ire->ire_last_used_time = lbolt; 13561 13562 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 13563 if (u1 & (IPH_MF | IPH_OFFSET)) { 13564 fragmented: 13565 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) 13566 goto slow_done; 13567 /* 13568 * Make sure that first_mp points back to mp as 13569 * the mp we came in with could have changed in 13570 * ip_rput_fragment(). 13571 */ 13572 ASSERT(!mctl_present); 13573 ipha = (ipha_t *)mp->b_rptr; 13574 first_mp = mp; 13575 } 13576 13577 /* Now we have a complete datagram, destined for this machine. 
*/ 13578 u1 = IPH_HDR_LENGTH(ipha); 13579 goto find_sctp_client; 13580 #undef iphs 13581 #undef rptr 13582 13583 error: 13584 freemsg(first_mp); 13585 slow_done: 13586 IRE_REFRELE(ire); 13587 } 13588 13589 #define VER_BITS 0xF0 13590 #define VERSION_6 0x60 13591 13592 static boolean_t 13593 ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, 13594 ipaddr_t *dstp, ip_stack_t *ipst) 13595 { 13596 uint_t opt_len; 13597 ipha_t *ipha; 13598 ssize_t len; 13599 uint_t pkt_len; 13600 13601 ASSERT(ill != NULL); 13602 IP_STAT(ipst, ip_ipoptions); 13603 ipha = *iphapp; 13604 13605 #define rptr ((uchar_t *)ipha) 13606 /* Assume no IPv6 packets arrive over the IPv4 queue */ 13607 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 13608 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); 13609 freemsg(mp); 13610 return (B_FALSE); 13611 } 13612 13613 /* multiple mblk or too short */ 13614 pkt_len = ntohs(ipha->ipha_length); 13615 13616 /* Get the number of words of IP options in the IP header. */ 13617 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 13618 if (opt_len) { 13619 /* IP Options present! Validate and process. */ 13620 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 13621 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13622 goto done; 13623 } 13624 /* 13625 * Recompute complete header length and make sure we 13626 * have access to all of it. 13627 */ 13628 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 13629 if (len > (mp->b_wptr - rptr)) { 13630 if (len > pkt_len) { 13631 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13632 goto done; 13633 } 13634 if (!pullupmsg(mp, len)) { 13635 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13636 goto done; 13637 } 13638 ipha = (ipha_t *)mp->b_rptr; 13639 } 13640 /* 13641 * Go off to ip_rput_options which returns the next hop 13642 * destination address, which may have been affected 13643 * by source routing. 
13644 */ 13645 IP_STAT(ipst, ip_opt); 13646 if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { 13647 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13648 return (B_FALSE); 13649 } 13650 } 13651 *iphapp = ipha; 13652 return (B_TRUE); 13653 done: 13654 /* clear b_prev - used by ip_mroute_decap */ 13655 mp->b_prev = NULL; 13656 freemsg(mp); 13657 return (B_FALSE); 13658 #undef rptr 13659 } 13660 13661 /* 13662 * Deal with the fact that there is no ire for the destination. 13663 */ 13664 static ire_t * 13665 ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) 13666 { 13667 ipha_t *ipha; 13668 ill_t *ill; 13669 ire_t *ire; 13670 ip_stack_t *ipst; 13671 enum ire_forward_action ret_action; 13672 13673 ipha = (ipha_t *)mp->b_rptr; 13674 ill = (ill_t *)q->q_ptr; 13675 13676 ASSERT(ill != NULL); 13677 ipst = ill->ill_ipst; 13678 13679 /* 13680 * No IRE for this destination, so it can't be for us. 13681 * Unless we are forwarding, drop the packet. 13682 * We have to let source routed packets through 13683 * since we don't yet know if they are 'ping -l' 13684 * packets i.e. if they will go out over the 13685 * same interface as they came in on. 13686 */ 13687 if (ll_multicast) { 13688 freemsg(mp); 13689 return (NULL); 13690 } 13691 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 13692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13693 freemsg(mp); 13694 return (NULL); 13695 } 13696 13697 /* 13698 * Mark this packet as having originated externally. 13699 * 13700 * For non-forwarding code path, ire_send later double 13701 * checks this interface to see if it is still exists 13702 * post-ARP resolution. 13703 * 13704 * Also, IPQOS uses this to differentiate between 13705 * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP 13706 * QOS packet processing in ip_wput_attach_llhdr(). 13707 * The QoS module can mark the b_band for a fastpath message 13708 * or the dl_priority field in a unitdata_req header for 13709 * CoS marking. 
This info can only be found in 13710 * ip_wput_attach_llhdr(). 13711 */ 13712 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 13713 /* 13714 * Clear the indication that this may have a hardware checksum 13715 * as we are not using it 13716 */ 13717 DB_CKSUMFLAGS(mp) = 0; 13718 13719 ire = ire_forward(dst, &ret_action, NULL, NULL, 13720 MBLK_GETLABEL(mp), ipst); 13721 13722 if (ire == NULL && ret_action == Forward_check_multirt) { 13723 /* Let ip_newroute handle CGTP */ 13724 ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); 13725 return (NULL); 13726 } 13727 13728 if (ire != NULL) 13729 return (ire); 13730 13731 mp->b_prev = mp->b_next = 0; 13732 13733 if (ret_action == Forward_blackhole) { 13734 freemsg(mp); 13735 return (NULL); 13736 } 13737 /* send icmp unreachable */ 13738 q = WR(q); 13739 /* Sent by forwarding path, and router is global zone */ 13740 if (ip_source_routed(ipha, ipst)) { 13741 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, 13742 GLOBAL_ZONEID, ipst); 13743 } else { 13744 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, 13745 ipst); 13746 } 13747 13748 return (NULL); 13749 13750 } 13751 13752 /* 13753 * check ip header length and align it. 
13754 */ 13755 static boolean_t 13756 ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) 13757 { 13758 ssize_t len; 13759 ill_t *ill; 13760 ipha_t *ipha; 13761 13762 len = MBLKL(mp); 13763 13764 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13765 ill = (ill_t *)q->q_ptr; 13766 13767 if (!OK_32PTR(mp->b_rptr)) 13768 IP_STAT(ipst, ip_notaligned1); 13769 else 13770 IP_STAT(ipst, ip_notaligned2); 13771 /* Guard against bogus device drivers */ 13772 if (len < 0) { 13773 /* clear b_prev - used by ip_mroute_decap */ 13774 mp->b_prev = NULL; 13775 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 13776 freemsg(mp); 13777 return (B_FALSE); 13778 } 13779 13780 if (ip_rput_pullups++ == 0) { 13781 ipha = (ipha_t *)mp->b_rptr; 13782 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 13783 "ip_check_and_align_header: %s forced us to " 13784 " pullup pkt, hdr len %ld, hdr addr %p", 13785 ill->ill_name, len, (void *)ipha); 13786 } 13787 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 13788 /* clear b_prev - used by ip_mroute_decap */ 13789 mp->b_prev = NULL; 13790 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13791 freemsg(mp); 13792 return (B_FALSE); 13793 } 13794 } 13795 return (B_TRUE); 13796 } 13797 13798 ire_t * 13799 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) 13800 { 13801 ire_t *new_ire; 13802 ill_t *ire_ill; 13803 uint_t ifindex; 13804 ip_stack_t *ipst = ill->ill_ipst; 13805 boolean_t strict_check = B_FALSE; 13806 13807 /* 13808 * This packet came in on an interface other than the one associated 13809 * with the first ire we found for the destination address. We do 13810 * another ire lookup here, using the ingress ill, to see if the 13811 * interface is in an interface group. 13812 * As long as the ills belong to the same group, we don't consider 13813 * them to be arriving on the wrong interface. Thus, if the switch 13814 * is doing inbound load spreading, we won't drop packets when the 13815 * ip*_strict_dst_multihoming switch is on. 
Note, the same holds true 13816 * for 'usesrc groups' where the destination address may belong to 13817 * another interface to allow multipathing to happen. 13818 * We also need to check for IPIF_UNNUMBERED point2point interfaces 13819 * where the local address may not be unique. In this case we were 13820 * at the mercy of the initial ire cache lookup and the IRE_LOCAL it 13821 * actually returned. The new lookup, which is more specific, should 13822 * only find the IRE_LOCAL associated with the ingress ill if one 13823 * exists. 13824 */ 13825 13826 if (ire->ire_ipversion == IPV4_VERSION) { 13827 if (ipst->ips_ip_strict_dst_multihoming) 13828 strict_check = B_TRUE; 13829 new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, 13830 ill->ill_ipif, ALL_ZONES, NULL, 13831 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13832 } else { 13833 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); 13834 if (ipst->ips_ipv6_strict_dst_multihoming) 13835 strict_check = B_TRUE; 13836 new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, 13837 IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, 13838 (MATCH_IRE_TYPE|MATCH_IRE_ILL_GROUP), ipst); 13839 } 13840 /* 13841 * If the same ire that was returned in ip_input() is found then this 13842 * is an indication that interface groups are in use. The packet 13843 * arrived on a different ill in the group than the one associated with 13844 * the destination address. If a different ire was found then the same 13845 * IP address must be hosted on multiple ills. This is possible with 13846 * unnumbered point2point interfaces. We switch to use this new ire in 13847 * order to have accurate interface statistics. 
13848 */ 13849 if (new_ire != NULL) { 13850 if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { 13851 ire_refrele(ire); 13852 ire = new_ire; 13853 } else { 13854 ire_refrele(new_ire); 13855 } 13856 return (ire); 13857 } else if ((ire->ire_rfq == NULL) && 13858 (ire->ire_ipversion == IPV4_VERSION)) { 13859 /* 13860 * The best match could have been the original ire which 13861 * was created against an IRE_LOCAL on lo0. In the IPv4 case 13862 * the strict multihoming checks are irrelevant as we consider 13863 * local addresses hosted on lo0 to be interface agnostic. We 13864 * only expect a null ire_rfq on IREs which are associated with 13865 * lo0 hence we can return now. 13866 */ 13867 return (ire); 13868 } 13869 13870 /* 13871 * Chase pointers once and store locally. 13872 */ 13873 ire_ill = (ire->ire_rfq == NULL) ? NULL : 13874 (ill_t *)(ire->ire_rfq->q_ptr); 13875 ifindex = ill->ill_usesrc_ifindex; 13876 13877 /* 13878 * Check if it's a legal address on the 'usesrc' interface. 13879 */ 13880 if ((ifindex != 0) && (ire_ill != NULL) && 13881 (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { 13882 return (ire); 13883 } 13884 13885 /* 13886 * If the ip*_strict_dst_multihoming switch is on then we can 13887 * only accept this packet if the interface is marked as routing. 
13888 */ 13889 if (!(strict_check)) 13890 return (ire); 13891 13892 if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & 13893 ILLF_ROUTER) != 0) { 13894 return (ire); 13895 } 13896 13897 ire_refrele(ire); 13898 return (NULL); 13899 } 13900 13901 ire_t * 13902 ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) 13903 { 13904 ipha_t *ipha; 13905 ire_t *src_ire; 13906 ill_t *stq_ill; 13907 uint_t hlen; 13908 uint_t pkt_len; 13909 uint32_t sum; 13910 queue_t *dev_q; 13911 ip_stack_t *ipst = ill->ill_ipst; 13912 mblk_t *fpmp; 13913 enum ire_forward_action ret_action; 13914 13915 ipha = (ipha_t *)mp->b_rptr; 13916 13917 if (ire != NULL && 13918 ire->ire_zoneid != GLOBAL_ZONEID && 13919 ire->ire_zoneid != ALL_ZONES) { 13920 /* 13921 * Should only use IREs that are visible to the global 13922 * zone for forwarding. 13923 */ 13924 ire_refrele(ire); 13925 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); 13926 } 13927 13928 /* 13929 * Martian Address Filtering [RFC 1812, Section 5.3.7] 13930 * The loopback address check for both src and dst has already 13931 * been checked in ip_input 13932 */ 13933 13934 if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) { 13935 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13936 goto drop; 13937 } 13938 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 13939 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 13940 13941 if (src_ire != NULL) { 13942 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 13943 ire_refrele(src_ire); 13944 goto drop; 13945 } 13946 13947 /* No ire cache of nexthop. So first create one */ 13948 if (ire == NULL) { 13949 13950 ire = ire_forward(dst, &ret_action, NULL, NULL, 13951 NULL, ipst); 13952 /* 13953 * We only come to ip_fast_forward if ip_cgtp_filter 13954 * is not set. So ire_forward() should not return with 13955 * Forward_check_multirt as the next action. 
13956 */ 13957 ASSERT(ret_action != Forward_check_multirt); 13958 if (ire == NULL) { 13959 /* An attempt was made to forward the packet */ 13960 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 13961 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 13962 mp->b_prev = mp->b_next = 0; 13963 /* send icmp unreachable */ 13964 /* Sent by forwarding path, and router is global zone */ 13965 if (ret_action == Forward_ret_icmp_err) { 13966 if (ip_source_routed(ipha, ipst)) { 13967 icmp_unreachable(ill->ill_wq, mp, 13968 ICMP_SOURCE_ROUTE_FAILED, 13969 GLOBAL_ZONEID, ipst); 13970 } else { 13971 icmp_unreachable(ill->ill_wq, mp, 13972 ICMP_HOST_UNREACHABLE, 13973 GLOBAL_ZONEID, ipst); 13974 } 13975 } else { 13976 freemsg(mp); 13977 } 13978 return (NULL); 13979 } 13980 } 13981 13982 /* 13983 * Forwarding fastpath exception case: 13984 * If either of the follwoing case is true, we take 13985 * the slowpath 13986 * o forwarding is not enabled 13987 * o incoming and outgoing interface are the same, or the same 13988 * IPMP group 13989 * o corresponding ire is in incomplete state 13990 * o packet needs fragmentation 13991 * o ARP cache is not resolved 13992 * 13993 * The codeflow from here on is thus: 13994 * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4 13995 */ 13996 pkt_len = ntohs(ipha->ipha_length); 13997 stq_ill = (ill_t *)ire->ire_stq->q_ptr; 13998 if (!(stq_ill->ill_flags & ILLF_ROUTER) || 13999 !(ill->ill_flags & ILLF_ROUTER) || 14000 (ill == stq_ill) || 14001 (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || 14002 (ire->ire_nce == NULL) || 14003 (pkt_len > ire->ire_max_frag) || 14004 ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || 14005 ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || 14006 ipha->ipha_ttl <= 1) { 14007 ip_rput_process_forward(ill->ill_rq, mp, ire, 14008 ipha, ill, B_FALSE); 14009 return (ire); 14010 } 14011 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 14012 14013 DTRACE_PROBE4(ip4__forwarding__start, 14014 ill_t *, ill, 
ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); 14015 14016 FW_HOOKS(ipst->ips_ip4_forwarding_event, 14017 ipst->ips_ipv4firewall_forwarding, 14018 ill, stq_ill, ipha, mp, mp, 0, ipst); 14019 14020 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 14021 14022 if (mp == NULL) 14023 goto drop; 14024 14025 mp->b_datap->db_struioun.cksum.flags = 0; 14026 /* Adjust the checksum to reflect the ttl decrement. */ 14027 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 14028 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 14029 ipha->ipha_ttl--; 14030 14031 /* 14032 * Write the link layer header. We can do this safely here, 14033 * because we have already tested to make sure that the IP 14034 * policy is not set, and that we have a fast path destination 14035 * header. 14036 */ 14037 mp->b_rptr -= hlen; 14038 bcopy(fpmp->b_rptr, mp->b_rptr, hlen); 14039 14040 UPDATE_IB_PKT_COUNT(ire); 14041 ire->ire_last_used_time = lbolt; 14042 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 14043 BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 14044 UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); 14045 14046 dev_q = ire->ire_stq->q_next; 14047 if ((dev_q->q_next != NULL || dev_q->q_first != NULL) && 14048 !canputnext(ire->ire_stq)) { 14049 goto indiscard; 14050 } 14051 if (ILL_DLS_CAPABLE(stq_ill)) { 14052 /* 14053 * Send the packet directly to DLD, where it 14054 * may be queued depending on the availability 14055 * of transmit resources at the media layer. 
14056 */ 14057 IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst); 14058 } else { 14059 DTRACE_PROBE4(ip4__physical__out__start, 14060 ill_t *, NULL, ill_t *, stq_ill, 14061 ipha_t *, ipha, mblk_t *, mp); 14062 FW_HOOKS(ipst->ips_ip4_physical_out_event, 14063 ipst->ips_ipv4firewall_physical_out, 14064 NULL, stq_ill, ipha, mp, mp, 0, ipst); 14065 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 14066 if (mp == NULL) 14067 goto drop; 14068 14069 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 14070 ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, 14071 ip6_t *, NULL, int, 0); 14072 14073 putnext(ire->ire_stq, mp); 14074 } 14075 return (ire); 14076 14077 indiscard: 14078 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14079 drop: 14080 if (mp != NULL) 14081 freemsg(mp); 14082 return (ire); 14083 14084 } 14085 14086 /* 14087 * This function is called in the forwarding slowpath, when 14088 * either the ire lacks the link-layer address, or the packet needs 14089 * further processing(eg. fragmentation), before transmission. 14090 */ 14091 14092 static void 14093 ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, 14094 ill_t *ill, boolean_t ll_multicast) 14095 { 14096 ill_group_t *ill_group; 14097 ill_group_t *ire_group; 14098 queue_t *dev_q; 14099 ire_t *src_ire; 14100 ip_stack_t *ipst = ill->ill_ipst; 14101 14102 ASSERT(ire->ire_stq != NULL); 14103 14104 mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ 14105 mp->b_next = NULL; /* ip_rput_noire sets dst here */ 14106 14107 if (ll_multicast != 0) { 14108 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14109 goto drop_pkt; 14110 } 14111 14112 /* 14113 * check if ipha_src is a broadcast address. Note that this 14114 * check is redundant when we get here from ip_fast_forward() 14115 * which has already done this check. 
However, since we can 14116 * also get here from ip_rput_process_broadcast() or, for 14117 * for the slow path through ip_fast_forward(), we perform 14118 * the check again for code-reusability 14119 */ 14120 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 14121 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 14122 if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) { 14123 if (src_ire != NULL) 14124 ire_refrele(src_ire); 14125 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 14126 ip2dbg(("ip_rput_process_forward: Received packet with" 14127 " bad src/dst address on %s\n", ill->ill_name)); 14128 goto drop_pkt; 14129 } 14130 14131 ill_group = ill->ill_group; 14132 ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; 14133 /* 14134 * Check if we want to forward this one at this time. 14135 * We allow source routed packets on a host provided that 14136 * they go out the same interface or same interface group 14137 * as they came in on. 14138 * 14139 * XXX To be quicker, we may wish to not chase pointers to 14140 * get the ILLF_ROUTER flag and instead store the 14141 * forwarding policy in the ire. An unfortunate 14142 * side-effect of that would be requiring an ire flush 14143 * whenever the ILLF_ROUTER flag changes. 14144 */ 14145 if (((ill->ill_flags & 14146 ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & 14147 ILLF_ROUTER) == 0) && 14148 !(ip_source_routed(ipha, ipst) && (ire->ire_rfq == q || 14149 (ill_group != NULL && ill_group == ire_group)))) { 14150 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 14151 if (ip_source_routed(ipha, ipst)) { 14152 q = WR(q); 14153 /* 14154 * Clear the indication that this may have 14155 * hardware checksum as we are not using it. 
14156 */ 14157 DB_CKSUMFLAGS(mp) = 0; 14158 /* Sent by forwarding path, and router is global zone */ 14159 icmp_unreachable(q, mp, 14160 ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst); 14161 return; 14162 } 14163 goto drop_pkt; 14164 } 14165 14166 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 14167 14168 /* Packet is being forwarded. Turning off hwcksum flag. */ 14169 DB_CKSUMFLAGS(mp) = 0; 14170 if (ipst->ips_ip_g_send_redirects) { 14171 /* 14172 * Check whether the incoming interface and outgoing 14173 * interface is part of the same group. If so, 14174 * send redirects. 14175 * 14176 * Check the source address to see if it originated 14177 * on the same logical subnet it is going back out on. 14178 * If so, we should be able to send it a redirect. 14179 * Avoid sending a redirect if the destination 14180 * is directly connected (i.e., ipha_dst is the same 14181 * as ire_gateway_addr or the ire_addr of the 14182 * nexthop IRE_CACHE ), or if the packet was source 14183 * routed out this interface. 14184 */ 14185 ipaddr_t src, nhop; 14186 mblk_t *mp1; 14187 ire_t *nhop_ire = NULL; 14188 14189 /* 14190 * Check whether ire_rfq and q are from the same ill 14191 * or if they are not same, they at least belong 14192 * to the same group. If so, send redirects. 14193 */ 14194 if ((ire->ire_rfq == q || 14195 (ill_group != NULL && ill_group == ire_group)) && 14196 !ip_source_routed(ipha, ipst)) { 14197 14198 nhop = (ire->ire_gateway_addr != 0 ? 14199 ire->ire_gateway_addr : ire->ire_addr); 14200 14201 if (ipha->ipha_dst == nhop) { 14202 /* 14203 * We avoid sending a redirect if the 14204 * destination is directly connected 14205 * because it is possible that multiple 14206 * IP subnets may have been configured on 14207 * the link, and the source may not 14208 * be on the same subnet as ip destination, 14209 * even though they are on the same 14210 * physical link. 
14211 */ 14212 goto sendit; 14213 } 14214 14215 src = ipha->ipha_src; 14216 14217 /* 14218 * We look up the interface ire for the nexthop, 14219 * to see if ipha_src is in the same subnet 14220 * as the nexthop. 14221 * 14222 * Note that, if, in the future, IRE_CACHE entries 14223 * are obsoleted, this lookup will not be needed, 14224 * as the ire passed to this function will be the 14225 * same as the nhop_ire computed below. 14226 */ 14227 nhop_ire = ire_ftable_lookup(nhop, 0, 0, 14228 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 14229 0, NULL, MATCH_IRE_TYPE, ipst); 14230 14231 if (nhop_ire != NULL) { 14232 if ((src & nhop_ire->ire_mask) == 14233 (nhop & nhop_ire->ire_mask)) { 14234 /* 14235 * The source is directly connected. 14236 * Just copy the ip header (which is 14237 * in the first mblk) 14238 */ 14239 mp1 = copyb(mp); 14240 if (mp1 != NULL) { 14241 icmp_send_redirect(WR(q), mp1, 14242 nhop, ipst); 14243 } 14244 } 14245 ire_refrele(nhop_ire); 14246 } 14247 } 14248 } 14249 sendit: 14250 dev_q = ire->ire_stq->q_next; 14251 if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) { 14252 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14253 freemsg(mp); 14254 return; 14255 } 14256 14257 ip_rput_forward(ire, ipha, mp, ill); 14258 return; 14259 14260 drop_pkt: 14261 ip2dbg(("ip_rput_process_forward: drop pkt\n")); 14262 freemsg(mp); 14263 } 14264 14265 ire_t * 14266 ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, 14267 ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast) 14268 { 14269 queue_t *q; 14270 uint16_t hcksumflags; 14271 ip_stack_t *ipst = ill->ill_ipst; 14272 14273 q = *qp; 14274 14275 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); 14276 14277 /* 14278 * Clear the indication that this may have hardware 14279 * checksum as we are not using it for forwarding. 
14280 */ 14281 hcksumflags = DB_CKSUMFLAGS(mp); 14282 DB_CKSUMFLAGS(mp) = 0; 14283 14284 /* 14285 * Directed broadcast forwarding: if the packet came in over a 14286 * different interface then it is routed out over we can forward it. 14287 */ 14288 if (ipha->ipha_protocol == IPPROTO_TCP) { 14289 ire_refrele(ire); 14290 freemsg(mp); 14291 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14292 return (NULL); 14293 } 14294 /* 14295 * For multicast we have set dst to be INADDR_BROADCAST 14296 * for delivering to all STREAMS. IRE_MARK_NORECV is really 14297 * only for broadcast packets. 14298 */ 14299 if (!CLASSD(ipha->ipha_dst)) { 14300 ire_t *new_ire; 14301 ipif_t *ipif; 14302 /* 14303 * For ill groups, as the switch duplicates broadcasts 14304 * across all the ports, we need to filter out and 14305 * send up only one copy. There is one copy for every 14306 * broadcast address on each ill. Thus, we look for a 14307 * specific IRE on this ill and look at IRE_MARK_NORECV 14308 * later to see whether this ill is eligible to receive 14309 * them or not. ill_nominate_bcast_rcv() nominates only 14310 * one set of IREs for receiving. 14311 */ 14312 14313 ipif = ipif_get_next_ipif(NULL, ill); 14314 if (ipif == NULL) { 14315 ire_refrele(ire); 14316 freemsg(mp); 14317 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14318 return (NULL); 14319 } 14320 new_ire = ire_ctable_lookup(dst, 0, 0, 14321 ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst); 14322 ipif_refrele(ipif); 14323 14324 if (new_ire != NULL) { 14325 if (new_ire->ire_marks & IRE_MARK_NORECV) { 14326 ire_refrele(ire); 14327 ire_refrele(new_ire); 14328 freemsg(mp); 14329 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14330 return (NULL); 14331 } 14332 /* 14333 * In the special case of multirouted broadcast 14334 * packets, we unconditionally need to "gateway" 14335 * them to the appropriate interface here. 
14336 * In the normal case, this cannot happen, because 14337 * there is no broadcast IRE tagged with the 14338 * RTF_MULTIRT flag. 14339 */ 14340 if (new_ire->ire_flags & RTF_MULTIRT) { 14341 ire_refrele(new_ire); 14342 if (ire->ire_rfq != NULL) { 14343 q = ire->ire_rfq; 14344 *qp = q; 14345 } 14346 } else { 14347 ire_refrele(ire); 14348 ire = new_ire; 14349 } 14350 } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { 14351 if (!ipst->ips_ip_g_forward_directed_bcast) { 14352 /* 14353 * Free the message if 14354 * ip_g_forward_directed_bcast is turned 14355 * off for non-local broadcast. 14356 */ 14357 ire_refrele(ire); 14358 freemsg(mp); 14359 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14360 return (NULL); 14361 } 14362 } else { 14363 /* 14364 * This CGTP packet successfully passed the 14365 * CGTP filter, but the related CGTP 14366 * broadcast IRE has not been found, 14367 * meaning that the redundant ipif is 14368 * probably down. However, if we discarded 14369 * this packet, its duplicate would be 14370 * filtered out by the CGTP filter so none 14371 * of them would get through. So we keep 14372 * going with this one. 14373 */ 14374 ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); 14375 if (ire->ire_rfq != NULL) { 14376 q = ire->ire_rfq; 14377 *qp = q; 14378 } 14379 } 14380 } 14381 if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) { 14382 /* 14383 * Verify that there are not more then one 14384 * IRE_BROADCAST with this broadcast address which 14385 * has ire_stq set. 
14386 * TODO: simplify, loop over all IRE's 14387 */ 14388 ire_t *ire1; 14389 int num_stq = 0; 14390 mblk_t *mp1; 14391 14392 /* Find the first one with ire_stq set */ 14393 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 14394 for (ire1 = ire; ire1 && 14395 !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; 14396 ire1 = ire1->ire_next) 14397 ; 14398 if (ire1) { 14399 ire_refrele(ire); 14400 ire = ire1; 14401 IRE_REFHOLD(ire); 14402 } 14403 14404 /* Check if there are additional ones with stq set */ 14405 for (ire1 = ire; ire1; ire1 = ire1->ire_next) { 14406 if (ire->ire_addr != ire1->ire_addr) 14407 break; 14408 if (ire1->ire_stq) { 14409 num_stq++; 14410 break; 14411 } 14412 } 14413 rw_exit(&ire->ire_bucket->irb_lock); 14414 if (num_stq == 1 && ire->ire_stq != NULL) { 14415 ip1dbg(("ip_rput_process_broadcast: directed " 14416 "broadcast to 0x%x\n", 14417 ntohl(ire->ire_addr))); 14418 mp1 = copymsg(mp); 14419 if (mp1) { 14420 switch (ipha->ipha_protocol) { 14421 case IPPROTO_UDP: 14422 ip_udp_input(q, mp1, ipha, ire, ill); 14423 break; 14424 default: 14425 ip_proto_input(q, mp1, ipha, ire, ill, 14426 0); 14427 break; 14428 } 14429 } 14430 /* 14431 * Adjust ttl to 2 (1+1 - the forward engine 14432 * will decrement it by one. 
14433 */ 14434 if (ip_csum_hdr(ipha)) { 14435 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 14436 ip2dbg(("ip_rput_broadcast:drop pkt\n")); 14437 freemsg(mp); 14438 ire_refrele(ire); 14439 return (NULL); 14440 } 14441 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; 14442 ipha->ipha_hdr_checksum = 0; 14443 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 14444 ip_rput_process_forward(q, mp, ire, ipha, 14445 ill, ll_multicast); 14446 ire_refrele(ire); 14447 return (NULL); 14448 } 14449 ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", 14450 ntohl(ire->ire_addr))); 14451 } 14452 14453 14454 /* Restore any hardware checksum flags */ 14455 DB_CKSUMFLAGS(mp) = hcksumflags; 14456 return (ire); 14457 } 14458 14459 /* ARGSUSED */ 14460 static boolean_t 14461 ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 14462 int *ll_multicast, ipaddr_t *dstp) 14463 { 14464 ip_stack_t *ipst = ill->ill_ipst; 14465 14466 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); 14467 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, 14468 ntohs(ipha->ipha_length)); 14469 14470 /* 14471 * Forward packets only if we have joined the allmulti 14472 * group on this interface. 14473 */ 14474 if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) { 14475 int retval; 14476 14477 /* 14478 * Clear the indication that this may have hardware 14479 * checksum as we are not using it. 14480 */ 14481 DB_CKSUMFLAGS(mp) = 0; 14482 retval = ip_mforward(ill, ipha, mp); 14483 /* ip_mforward updates mib variables if needed */ 14484 /* clear b_prev - used by ip_mroute_decap */ 14485 mp->b_prev = NULL; 14486 14487 switch (retval) { 14488 case 0: 14489 /* 14490 * pkt is okay and arrived on phyint. 14491 * 14492 * If we are running as a multicast router 14493 * we need to see all IGMP and/or PIM packets. 
14494 */ 14495 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 14496 (ipha->ipha_protocol == IPPROTO_PIM)) { 14497 goto done; 14498 } 14499 break; 14500 case -1: 14501 /* pkt is mal-formed, toss it */ 14502 goto drop_pkt; 14503 case 1: 14504 /* pkt is okay and arrived on a tunnel */ 14505 /* 14506 * If we are running a multicast router 14507 * we need to see all igmp packets. 14508 */ 14509 if (ipha->ipha_protocol == IPPROTO_IGMP) { 14510 *dstp = INADDR_BROADCAST; 14511 *ll_multicast = 1; 14512 return (B_FALSE); 14513 } 14514 14515 goto drop_pkt; 14516 } 14517 } 14518 14519 ILM_WALKER_HOLD(ill); 14520 if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { 14521 /* 14522 * This might just be caused by the fact that 14523 * multiple IP Multicast addresses map to the same 14524 * link layer multicast - no need to increment counter! 14525 */ 14526 ILM_WALKER_RELE(ill); 14527 freemsg(mp); 14528 return (B_TRUE); 14529 } 14530 ILM_WALKER_RELE(ill); 14531 done: 14532 ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); 14533 /* 14534 * This assumes the we deliver to all streams for multicast 14535 * and broadcast packets. 14536 */ 14537 *dstp = INADDR_BROADCAST; 14538 *ll_multicast = 1; 14539 return (B_FALSE); 14540 drop_pkt: 14541 ip2dbg(("ip_rput: drop pkt\n")); 14542 freemsg(mp); 14543 return (B_TRUE); 14544 } 14545 14546 /* 14547 * This function is used to both return an indication of whether or not 14548 * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND) 14549 * and in doing so, determine whether or not it is broadcast vs multicast. 14550 * For it to be a broadcast packet, we must have the appropriate mblk_t 14551 * hanging off the ill_t. If this is either not present or doesn't match 14552 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed 14553 * to be multicast. 
Thus NICs that have no broadcast address (or no 14554 * capability for one, such as point to point links) cannot return as 14555 * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as 14556 * the return values simplifies the current use of the return value of this 14557 * function, which is to pass through the multicast/broadcast characteristic 14558 * to consumers of the netinfo/pfhooks API. While this is not cast in stone, 14559 * changing the return value to some other symbol demands the appropriate 14560 * "translation" when hpe_flags is set prior to calling hook_run() for 14561 * packet events. 14562 */ 14563 int 14564 ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) 14565 { 14566 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; 14567 mblk_t *bmp; 14568 14569 if (ind->dl_group_address) { 14570 if (ind->dl_dest_addr_offset > sizeof (*ind) && 14571 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < 14572 MBLKL(mb) && 14573 (bmp = ill->ill_bcast_mp) != NULL) { 14574 dl_unitdata_req_t *dlur; 14575 uint8_t *bphys_addr; 14576 14577 dlur = (dl_unitdata_req_t *)bmp->b_rptr; 14578 if (ill->ill_sap_length < 0) 14579 bphys_addr = (uchar_t *)dlur + 14580 dlur->dl_dest_addr_offset; 14581 else 14582 bphys_addr = (uchar_t *)dlur + 14583 dlur->dl_dest_addr_offset + 14584 ill->ill_sap_length; 14585 14586 if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, 14587 bphys_addr, ind->dl_dest_addr_length) == 0) { 14588 return (HPE_BROADCAST); 14589 } 14590 return (HPE_MULTICAST); 14591 } 14592 return (HPE_MULTICAST); 14593 } 14594 return (0); 14595 } 14596 14597 static boolean_t 14598 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 14599 int *ll_multicast, mblk_t **mpp) 14600 { 14601 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 14602 boolean_t must_copy = B_FALSE; 14603 struct iocblk *iocp; 14604 ipha_t *ipha; 14605 ip_stack_t *ipst = ill->ill_ipst; 14606 14607 #define rptr ((uchar_t *)ipha) 14608 14609 first_mp = *first_mpp; 14610 mp = *mpp; 
14611 14612 ASSERT(first_mp == mp); 14613 14614 /* 14615 * if db_ref > 1 then copymsg and free original. Packet may be 14616 * changed and do not want other entity who has a reference to this 14617 * message to trip over the changes. This is a blind change because 14618 * trying to catch all places that might change packet is too 14619 * difficult (since it may be a module above this one) 14620 * 14621 * This corresponds to the non-fast path case. We walk down the full 14622 * chain in this case, and check the db_ref count of all the dblks, 14623 * and do a copymsg if required. It is possible that the db_ref counts 14624 * of the data blocks in the mblk chain can be different. 14625 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 14626 * count of 1, followed by a M_DATA block with a ref count of 2, if 14627 * 'snoop' is running. 14628 */ 14629 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 14630 if (mp1->b_datap->db_ref > 1) { 14631 must_copy = B_TRUE; 14632 break; 14633 } 14634 } 14635 14636 if (must_copy) { 14637 mp1 = copymsg(mp); 14638 if (mp1 == NULL) { 14639 for (mp1 = mp; mp1 != NULL; 14640 mp1 = mp1->b_cont) { 14641 mp1->b_next = NULL; 14642 mp1->b_prev = NULL; 14643 } 14644 freemsg(mp); 14645 if (ill != NULL) { 14646 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14647 } else { 14648 BUMP_MIB(&ipst->ips_ip_mib, 14649 ipIfStatsInDiscards); 14650 } 14651 return (B_TRUE); 14652 } 14653 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 14654 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 14655 /* Copy b_prev - used by ip_mroute_decap */ 14656 to_mp->b_prev = from_mp->b_prev; 14657 from_mp->b_prev = NULL; 14658 } 14659 *first_mpp = first_mp = mp1; 14660 freemsg(mp); 14661 mp = mp1; 14662 *mpp = mp1; 14663 } 14664 14665 ipha = (ipha_t *)mp->b_rptr; 14666 14667 /* 14668 * previous code has a case for M_DATA. 14669 * We want to check how that happens. 
14670 */ 14671 ASSERT(first_mp->b_datap->db_type != M_DATA); 14672 switch (first_mp->b_datap->db_type) { 14673 case M_PROTO: 14674 case M_PCPROTO: 14675 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 14676 DL_UNITDATA_IND) { 14677 /* Go handle anything other than data elsewhere. */ 14678 ip_rput_dlpi(q, mp); 14679 return (B_TRUE); 14680 } 14681 14682 *ll_multicast = ip_get_dlpi_mbcast(ill, mp); 14683 /* Ditch the DLPI header. */ 14684 mp1 = mp->b_cont; 14685 ASSERT(first_mp == mp); 14686 *first_mpp = mp1; 14687 freeb(mp); 14688 *mpp = mp1; 14689 return (B_FALSE); 14690 case M_IOCACK: 14691 ip1dbg(("got iocack ")); 14692 iocp = (struct iocblk *)mp->b_rptr; 14693 switch (iocp->ioc_cmd) { 14694 case DL_IOC_HDR_INFO: 14695 ill = (ill_t *)q->q_ptr; 14696 ill_fastpath_ack(ill, mp); 14697 return (B_TRUE); 14698 case SIOCSTUNPARAM: 14699 case OSIOCSTUNPARAM: 14700 /* Go through qwriter_ip */ 14701 break; 14702 case SIOCGTUNPARAM: 14703 case OSIOCGTUNPARAM: 14704 ip_rput_other(NULL, q, mp, NULL); 14705 return (B_TRUE); 14706 default: 14707 putnext(q, mp); 14708 return (B_TRUE); 14709 } 14710 /* FALLTHRU */ 14711 case M_ERROR: 14712 case M_HANGUP: 14713 /* 14714 * Since this is on the ill stream we unconditionally 14715 * bump up the refcount 14716 */ 14717 ill_refhold(ill); 14718 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14719 return (B_TRUE); 14720 case M_CTL: 14721 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 14722 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 14723 IPHADA_M_CTL)) { 14724 /* 14725 * It's an IPsec accelerated packet. 14726 * Make sure that the ill from which we received the 14727 * packet has enabled IPsec hardware acceleration. 14728 */ 14729 if (!(ill->ill_capabilities & 14730 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 14731 /* IPsec kstats: bean counter */ 14732 freemsg(mp); 14733 return (B_TRUE); 14734 } 14735 14736 /* 14737 * Make mp point to the mblk following the M_CTL, 14738 * then process according to type of mp. 
14739 * After this processing, first_mp will point to 14740 * the data-attributes and mp to the pkt following 14741 * the M_CTL. 14742 */ 14743 mp = first_mp->b_cont; 14744 if (mp == NULL) { 14745 freemsg(first_mp); 14746 return (B_TRUE); 14747 } 14748 /* 14749 * A Hardware Accelerated packet can only be M_DATA 14750 * ESP or AH packet. 14751 */ 14752 if (mp->b_datap->db_type != M_DATA) { 14753 /* non-M_DATA IPsec accelerated packet */ 14754 IPSECHW_DEBUG(IPSECHW_PKT, 14755 ("non-M_DATA IPsec accelerated pkt\n")); 14756 freemsg(first_mp); 14757 return (B_TRUE); 14758 } 14759 ipha = (ipha_t *)mp->b_rptr; 14760 if (ipha->ipha_protocol != IPPROTO_AH && 14761 ipha->ipha_protocol != IPPROTO_ESP) { 14762 IPSECHW_DEBUG(IPSECHW_PKT, 14763 ("non-M_DATA IPsec accelerated pkt\n")); 14764 freemsg(first_mp); 14765 return (B_TRUE); 14766 } 14767 *mpp = mp; 14768 return (B_FALSE); 14769 } 14770 putnext(q, mp); 14771 return (B_TRUE); 14772 case M_IOCNAK: 14773 ip1dbg(("got iocnak ")); 14774 iocp = (struct iocblk *)mp->b_rptr; 14775 switch (iocp->ioc_cmd) { 14776 case SIOCSTUNPARAM: 14777 case OSIOCSTUNPARAM: 14778 /* 14779 * Since this is on the ill stream we unconditionally 14780 * bump up the refcount 14781 */ 14782 ill_refhold(ill); 14783 qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); 14784 return (B_TRUE); 14785 case DL_IOC_HDR_INFO: 14786 case SIOCGTUNPARAM: 14787 case OSIOCGTUNPARAM: 14788 ip_rput_other(NULL, q, mp, NULL); 14789 return (B_TRUE); 14790 default: 14791 break; 14792 } 14793 /* FALLTHRU */ 14794 default: 14795 putnext(q, mp); 14796 return (B_TRUE); 14797 } 14798 } 14799 14800 /* Read side put procedure. Packets coming from the wire arrive here. 
*/ 14801 void 14802 ip_rput(queue_t *q, mblk_t *mp) 14803 { 14804 ill_t *ill; 14805 union DL_primitives *dl; 14806 14807 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); 14808 14809 ill = (ill_t *)q->q_ptr; 14810 14811 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { 14812 /* 14813 * If things are opening or closing, only accept high-priority 14814 * DLPI messages. (On open ill->ill_ipif has not yet been 14815 * created; on close, things hanging off the ill may have been 14816 * freed already.) 14817 */ 14818 dl = (union DL_primitives *)mp->b_rptr; 14819 if (DB_TYPE(mp) != M_PCPROTO || 14820 dl->dl_primitive == DL_UNITDATA_IND) { 14821 /* 14822 * SIOC[GS]TUNPARAM ioctls can come here. 14823 */ 14824 inet_freemsg(mp); 14825 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14826 "ip_rput_end: q %p (%S)", q, "uninit"); 14827 return; 14828 } 14829 } 14830 14831 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14832 "ip_rput_end: q %p (%S)", q, "end"); 14833 14834 ip_input(ill, NULL, mp, NULL); 14835 } 14836 14837 static mblk_t * 14838 ip_fix_dbref(ill_t *ill, mblk_t *mp) 14839 { 14840 mblk_t *mp1; 14841 boolean_t adjusted = B_FALSE; 14842 ip_stack_t *ipst = ill->ill_ipst; 14843 14844 IP_STAT(ipst, ip_db_ref); 14845 /* 14846 * The IP_RECVSLLA option depends on having the 14847 * link layer header. First check that: 14848 * a> the underlying device is of type ether, 14849 * since this option is currently supported only 14850 * over ethernet. 14851 * b> there is enough room to copy over the link 14852 * layer header. 14853 * 14854 * Once the checks are done, adjust rptr so that 14855 * the link layer header will be copied via 14856 * copymsg. Note that, IFT_ETHER may be returned 14857 * by some non-ethernet drivers but in this case 14858 * the second check will fail. 
14859 */ 14860 if (ill->ill_type == IFT_ETHER && 14861 (mp->b_rptr - mp->b_datap->db_base) >= 14862 sizeof (struct ether_header)) { 14863 mp->b_rptr -= sizeof (struct ether_header); 14864 adjusted = B_TRUE; 14865 } 14866 mp1 = copymsg(mp); 14867 14868 if (mp1 == NULL) { 14869 mp->b_next = NULL; 14870 /* clear b_prev - used by ip_mroute_decap */ 14871 mp->b_prev = NULL; 14872 freemsg(mp); 14873 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 14874 return (NULL); 14875 } 14876 14877 if (adjusted) { 14878 /* 14879 * Copy is done. Restore the pointer in 14880 * the _new_ mblk 14881 */ 14882 mp1->b_rptr += sizeof (struct ether_header); 14883 } 14884 14885 /* Copy b_prev - used by ip_mroute_decap */ 14886 mp1->b_prev = mp->b_prev; 14887 mp->b_prev = NULL; 14888 14889 /* preserve the hardware checksum flags and data, if present */ 14890 if (DB_CKSUMFLAGS(mp) != 0) { 14891 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 14892 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 14893 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 14894 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 14895 DB_CKSUM16(mp1) = DB_CKSUM16(mp); 14896 } 14897 14898 freemsg(mp); 14899 return (mp1); 14900 } 14901 14902 /* 14903 * Direct read side procedure capable of dealing with chains. GLDv3 based 14904 * drivers call this function directly with mblk chains while STREAMS 14905 * read side procedure ip_rput() calls this for single packet with ip_ring 14906 * set to NULL to process one packet at a time. 14907 * 14908 * The ill will always be valid if this function is called directly from 14909 * the driver. 14910 * 14911 * If ip_input() is called from GLDv3: 14912 * 14913 * - This must be a non-VLAN IP stream. 14914 * - 'mp' is either an untagged or a special priority-tagged packet. 14915 * - Any VLAN tag that was in the MAC header has been stripped. 14916 * 14917 * If the IP header in packet is not 32-bit aligned, every message in the 14918 * chain will be aligned before further operations. This is required on SPARC 14919 * platform. 
14920 */ 14921 /* ARGSUSED */ 14922 void 14923 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 14924 struct mac_header_info_s *mhip) 14925 { 14926 ipaddr_t dst = NULL; 14927 ipaddr_t prev_dst; 14928 ire_t *ire = NULL; 14929 ipha_t *ipha; 14930 uint_t pkt_len; 14931 ssize_t len; 14932 uint_t opt_len; 14933 int ll_multicast; 14934 int cgtp_flt_pkt; 14935 queue_t *q = ill->ill_rq; 14936 squeue_t *curr_sqp = NULL; 14937 mblk_t *head = NULL; 14938 mblk_t *tail = NULL; 14939 mblk_t *first_mp; 14940 mblk_t *mp; 14941 mblk_t *dmp; 14942 int cnt = 0; 14943 ip_stack_t *ipst = ill->ill_ipst; 14944 14945 ASSERT(mp_chain != NULL); 14946 ASSERT(ill != NULL); 14947 14948 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); 14949 14950 #define rptr ((uchar_t *)ipha) 14951 14952 while (mp_chain != NULL) { 14953 first_mp = mp = mp_chain; 14954 mp_chain = mp_chain->b_next; 14955 mp->b_next = NULL; 14956 ll_multicast = 0; 14957 14958 /* 14959 * We do ire caching from one iteration to 14960 * another. In the event the packet chain contains 14961 * all packets from the same dst, this caching saves 14962 * an ire_cache_lookup for each of the succeeding 14963 * packets in a packet chain. 14964 */ 14965 prev_dst = dst; 14966 14967 /* 14968 * if db_ref > 1 then copymsg and free original. Packet 14969 * may be changed and we do not want the other entity 14970 * who has a reference to this message to trip over the 14971 * changes. This is a blind change because trying to 14972 * catch all places that might change the packet is too 14973 * difficult. 14974 * 14975 * This corresponds to the fast path case, where we have 14976 * a chain of M_DATA mblks. We check the db_ref count 14977 * of only the 1st data block in the mblk chain. There 14978 * doesn't seem to be a reason why a device driver would 14979 * send up data with varying db_ref counts in the mblk 14980 * chain. In any case the Fast path is a private 14981 * interface, and our drivers don't do such a thing. 
14982 * Given the above assumption, there is no need to walk 14983 * down the entire mblk chain (which could have a 14984 * potential performance problem) 14985 */ 14986 14987 if (DB_REF(mp) > 1) { 14988 if ((mp = ip_fix_dbref(ill, mp)) == NULL) 14989 continue; 14990 } 14991 14992 /* 14993 * Check and align the IP header. 14994 */ 14995 first_mp = mp; 14996 if (DB_TYPE(mp) == M_DATA) { 14997 dmp = mp; 14998 } else if (DB_TYPE(mp) == M_PROTO && 14999 *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { 15000 dmp = mp->b_cont; 15001 } else { 15002 dmp = NULL; 15003 } 15004 if (dmp != NULL) { 15005 /* 15006 * IP header ptr not aligned? 15007 * OR IP header not complete in first mblk 15008 */ 15009 if (!OK_32PTR(dmp->b_rptr) || 15010 MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { 15011 if (!ip_check_and_align_header(q, dmp, ipst)) 15012 continue; 15013 } 15014 } 15015 15016 /* 15017 * ip_input fast path 15018 */ 15019 15020 /* mblk type is not M_DATA */ 15021 if (DB_TYPE(mp) != M_DATA) { 15022 if (ip_rput_process_notdata(q, &first_mp, ill, 15023 &ll_multicast, &mp)) 15024 continue; 15025 15026 /* 15027 * The only way we can get here is if we had a 15028 * packet that was either a DL_UNITDATA_IND or 15029 * an M_CTL for an IPsec accelerated packet. 15030 * 15031 * In either case, the first_mp will point to 15032 * the leading M_PROTO or M_CTL. 15033 */ 15034 ASSERT(first_mp != NULL); 15035 } else if (mhip != NULL) { 15036 /* 15037 * ll_multicast is set here so that it is ready 15038 * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast 15039 * manipulates ll_multicast in the same fashion when 15040 * called from ip_rput_process_notdata. 
15041 */ 15042 switch (mhip->mhi_dsttype) { 15043 case MAC_ADDRTYPE_MULTICAST : 15044 ll_multicast = HPE_MULTICAST; 15045 break; 15046 case MAC_ADDRTYPE_BROADCAST : 15047 ll_multicast = HPE_BROADCAST; 15048 break; 15049 default : 15050 break; 15051 } 15052 } 15053 15054 /* Make sure its an M_DATA and that its aligned */ 15055 ASSERT(DB_TYPE(mp) == M_DATA); 15056 ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); 15057 15058 ipha = (ipha_t *)mp->b_rptr; 15059 len = mp->b_wptr - rptr; 15060 pkt_len = ntohs(ipha->ipha_length); 15061 15062 /* 15063 * We must count all incoming packets, even if they end 15064 * up being dropped later on. 15065 */ 15066 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 15067 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); 15068 15069 /* multiple mblk or too short */ 15070 len -= pkt_len; 15071 if (len != 0) { 15072 /* 15073 * Make sure we have data length consistent 15074 * with the IP header. 15075 */ 15076 if (mp->b_cont == NULL) { 15077 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15078 BUMP_MIB(ill->ill_ip_mib, 15079 ipIfStatsInHdrErrors); 15080 ip2dbg(("ip_input: drop pkt\n")); 15081 freemsg(mp); 15082 continue; 15083 } 15084 mp->b_wptr = rptr + pkt_len; 15085 } else if ((len += msgdsize(mp->b_cont)) != 0) { 15086 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 15087 BUMP_MIB(ill->ill_ip_mib, 15088 ipIfStatsInHdrErrors); 15089 ip2dbg(("ip_input: drop pkt\n")); 15090 freemsg(mp); 15091 continue; 15092 } 15093 (void) adjmsg(mp, -len); 15094 IP_STAT(ipst, ip_multimblk3); 15095 } 15096 } 15097 15098 /* Obtain the dst of the current packet */ 15099 dst = ipha->ipha_dst; 15100 15101 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, 15102 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, 15103 ipha, ip6_t *, NULL, int, 0); 15104 15105 /* 15106 * The following test for loopback is faster than 15107 * IP_LOOPBACK_ADDR(), because it avoids any bitwise 15108 * operations. 
15109 * Note that these addresses are always in network byte order 15110 */ 15111 if (((*(uchar_t *)&ipha->ipha_dst) == 127) || 15112 ((*(uchar_t *)&ipha->ipha_src) == 127)) { 15113 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 15114 freemsg(mp); 15115 continue; 15116 } 15117 15118 /* 15119 * The event for packets being received from a 'physical' 15120 * interface is placed after validation of the source and/or 15121 * destination address as being local so that packets can be 15122 * redirected to loopback addresses using ipnat. 15123 */ 15124 DTRACE_PROBE4(ip4__physical__in__start, 15125 ill_t *, ill, ill_t *, NULL, 15126 ipha_t *, ipha, mblk_t *, first_mp); 15127 15128 FW_HOOKS(ipst->ips_ip4_physical_in_event, 15129 ipst->ips_ipv4firewall_physical_in, 15130 ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); 15131 15132 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); 15133 15134 if (first_mp == NULL) { 15135 continue; 15136 } 15137 dst = ipha->ipha_dst; 15138 15139 /* 15140 * Attach any necessary label information to 15141 * this packet 15142 */ 15143 if (is_system_labeled() && 15144 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 15145 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 15146 freemsg(mp); 15147 continue; 15148 } 15149 15150 /* 15151 * Reuse the cached ire only if the ipha_dst of the previous 15152 * packet is the same as the current packet AND it is not 15153 * INADDR_ANY. 15154 */ 15155 if (!(dst == prev_dst && dst != INADDR_ANY) && 15156 (ire != NULL)) { 15157 ire_refrele(ire); 15158 ire = NULL; 15159 } 15160 opt_len = ipha->ipha_version_and_hdr_length - 15161 IP_SIMPLE_HDR_VERSION; 15162 15163 /* 15164 * Check to see if we can take the fastpath. 
15165 * That is possible if the following conditions are met 15166 * o Tsol disabled 15167 * o CGTP disabled 15168 * o ipp_action_count is 0 15169 * o no options in the packet 15170 * o not a RSVP packet 15171 * o not a multicast packet 15172 * o ill not in IP_DHCPINIT_IF mode 15173 */ 15174 if (!is_system_labeled() && 15175 !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 && 15176 opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && 15177 !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { 15178 if (ire == NULL) 15179 ire = ire_cache_lookup(dst, ALL_ZONES, NULL, 15180 ipst); 15181 15182 /* incoming packet is for forwarding */ 15183 if (ire == NULL || (ire->ire_type & IRE_CACHE)) { 15184 ire = ip_fast_forward(ire, dst, ill, mp); 15185 continue; 15186 } 15187 /* incoming packet is for local consumption */ 15188 if (ire->ire_type & IRE_LOCAL) 15189 goto local; 15190 } 15191 15192 /* 15193 * Disable ire caching for anything more complex 15194 * than the simple fast path case we checked for above. 15195 */ 15196 if (ire != NULL) { 15197 ire_refrele(ire); 15198 ire = NULL; 15199 } 15200 15201 /* 15202 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP 15203 * server to unicast DHCP packets to a DHCP client using the 15204 * IP address it is offering to the client. This can be 15205 * disabled through the "broadcast bit", but not all DHCP 15206 * servers honor that bit. Therefore, to interoperate with as 15207 * many DHCP servers as possible, the DHCP client allows the 15208 * server to unicast, but we treat those packets as broadcast 15209 * here. Note that we don't rewrite the packet itself since 15210 * (a) that would mess up the checksums and (b) the DHCP 15211 * client conn is bound to INADDR_ANY so ip_fanout_udp() will 15212 * hand it the packet regardless. 
15213 */ 15214 if (ill->ill_dhcpinit != 0 && 15215 IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP && 15216 pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) { 15217 udpha_t *udpha; 15218 15219 /* 15220 * Reload ipha since pullupmsg() can change b_rptr. 15221 */ 15222 ipha = (ipha_t *)mp->b_rptr; 15223 udpha = (udpha_t *)&ipha[1]; 15224 15225 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { 15226 DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, 15227 mblk_t *, mp); 15228 dst = INADDR_BROADCAST; 15229 } 15230 } 15231 15232 /* Full-blown slow path */ 15233 if (opt_len != 0) { 15234 if (len != 0) 15235 IP_STAT(ipst, ip_multimblk4); 15236 else 15237 IP_STAT(ipst, ip_ipoptions); 15238 if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha, 15239 &dst, ipst)) 15240 continue; 15241 } 15242 15243 /* 15244 * Invoke the CGTP (multirouting) filtering module to process 15245 * the incoming packet. Packets identified as duplicates 15246 * must be discarded. Filtering is active only if the 15247 * the ip_cgtp_filter ndd variable is non-zero. 15248 */ 15249 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 15250 if (ipst->ips_ip_cgtp_filter && 15251 ipst->ips_ip_cgtp_filter_ops != NULL) { 15252 netstackid_t stackid; 15253 15254 stackid = ipst->ips_netstack->netstack_stackid; 15255 cgtp_flt_pkt = 15256 ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 15257 ill->ill_phyint->phyint_ifindex, mp); 15258 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 15259 freemsg(first_mp); 15260 continue; 15261 } 15262 } 15263 15264 /* 15265 * If rsvpd is running, let RSVP daemon handle its processing 15266 * and forwarding of RSVP multicast/unicast packets. 15267 * If rsvpd is not running but mrouted is running, RSVP 15268 * multicast packets are forwarded as multicast traffic 15269 * and RSVP unicast packets are forwarded by unicast router. 
15270 * If neither rsvpd nor mrouted is running, RSVP multicast 15271 * packets are not forwarded, but the unicast packets are 15272 * forwarded like unicast traffic. 15273 */ 15274 if (ipha->ipha_protocol == IPPROTO_RSVP && 15275 ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != 15276 NULL) { 15277 /* RSVP packet and rsvpd running. Treat as ours */ 15278 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 15279 /* 15280 * This assumes that we deliver to all streams for 15281 * multicast and broadcast packets. 15282 * We have to force ll_multicast to 1 to handle the 15283 * M_DATA messages passed in from ip_mroute_decap. 15284 */ 15285 dst = INADDR_BROADCAST; 15286 ll_multicast = 1; 15287 } else if (CLASSD(dst)) { 15288 /* packet is multicast */ 15289 mp->b_next = NULL; 15290 if (ip_rput_process_multicast(q, mp, ill, ipha, 15291 &ll_multicast, &dst)) 15292 continue; 15293 } 15294 15295 if (ire == NULL) { 15296 ire = ire_cache_lookup(dst, ALL_ZONES, 15297 MBLK_GETLABEL(mp), ipst); 15298 } 15299 15300 if (ire != NULL && ire->ire_stq != NULL && 15301 ire->ire_zoneid != GLOBAL_ZONEID && 15302 ire->ire_zoneid != ALL_ZONES) { 15303 /* 15304 * Should only use IREs that are visible from the 15305 * global zone for forwarding. 15306 */ 15307 ire_refrele(ire); 15308 ire = ire_cache_lookup(dst, GLOBAL_ZONEID, 15309 MBLK_GETLABEL(mp), ipst); 15310 } 15311 15312 if (ire == NULL) { 15313 /* 15314 * No IRE for this destination, so it can't be for us. 15315 * Unless we are forwarding, drop the packet. 15316 * We have to let source routed packets through 15317 * since we don't yet know if they are 'ping -l' 15318 * packets i.e. if they will go out over the 15319 * same interface as they came in on. 
15320 */ 15321 ire = ip_rput_noire(q, mp, ll_multicast, dst); 15322 if (ire == NULL) 15323 continue; 15324 } 15325 15326 /* 15327 * Broadcast IRE may indicate either broadcast or 15328 * multicast packet 15329 */ 15330 if (ire->ire_type == IRE_BROADCAST) { 15331 /* 15332 * Skip broadcast checks if packet is UDP multicast; 15333 * we'd rather not enter ip_rput_process_broadcast() 15334 * unless the packet is broadcast for real, since 15335 * that routine is a no-op for multicast. 15336 */ 15337 if (ipha->ipha_protocol != IPPROTO_UDP || 15338 !CLASSD(ipha->ipha_dst)) { 15339 ire = ip_rput_process_broadcast(&q, mp, 15340 ire, ipha, ill, dst, cgtp_flt_pkt, 15341 ll_multicast); 15342 if (ire == NULL) 15343 continue; 15344 } 15345 } else if (ire->ire_stq != NULL) { 15346 /* fowarding? */ 15347 ip_rput_process_forward(q, mp, ire, ipha, ill, 15348 ll_multicast); 15349 /* ip_rput_process_forward consumed the packet */ 15350 continue; 15351 } 15352 15353 local: 15354 /* 15355 * If the queue in the ire is different to the ingress queue 15356 * then we need to check to see if we can accept the packet. 15357 * Note that for multicast packets and broadcast packets sent 15358 * to a broadcast address which is shared between multiple 15359 * interfaces we should not do this since we just got a random 15360 * broadcast ire. 
15361 */ 15362 if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { 15363 if ((ire = ip_check_multihome(&ipha->ipha_dst, ire, 15364 ill)) == NULL) { 15365 /* Drop packet */ 15366 BUMP_MIB(ill->ill_ip_mib, 15367 ipIfStatsForwProhibits); 15368 freemsg(mp); 15369 continue; 15370 } 15371 if (ire->ire_rfq != NULL) 15372 q = ire->ire_rfq; 15373 } 15374 15375 switch (ipha->ipha_protocol) { 15376 case IPPROTO_TCP: 15377 ASSERT(first_mp == mp); 15378 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, 15379 mp, 0, q, ip_ring)) != NULL) { 15380 if (curr_sqp == NULL) { 15381 curr_sqp = GET_SQUEUE(mp); 15382 ASSERT(cnt == 0); 15383 cnt++; 15384 head = tail = mp; 15385 } else if (curr_sqp == GET_SQUEUE(mp)) { 15386 ASSERT(tail != NULL); 15387 cnt++; 15388 tail->b_next = mp; 15389 tail = mp; 15390 } else { 15391 /* 15392 * A different squeue. Send the 15393 * chain for the previous squeue on 15394 * its way. This shouldn't happen 15395 * often unless interrupt binding 15396 * changes. 15397 */ 15398 IP_STAT(ipst, ip_input_multi_squeue); 15399 squeue_enter_chain(curr_sqp, head, 15400 tail, cnt, SQTAG_IP_INPUT); 15401 curr_sqp = GET_SQUEUE(mp); 15402 head = mp; 15403 tail = mp; 15404 cnt = 1; 15405 } 15406 } 15407 continue; 15408 case IPPROTO_UDP: 15409 ASSERT(first_mp == mp); 15410 ip_udp_input(q, mp, ipha, ire, ill); 15411 continue; 15412 case IPPROTO_SCTP: 15413 ASSERT(first_mp == mp); 15414 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, 15415 q, dst); 15416 /* ire has been released by ip_sctp_input */ 15417 ire = NULL; 15418 continue; 15419 default: 15420 ip_proto_input(q, first_mp, ipha, ire, ill, 0); 15421 continue; 15422 } 15423 } 15424 15425 if (ire != NULL) 15426 ire_refrele(ire); 15427 15428 if (head != NULL) 15429 squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); 15430 15431 /* 15432 * This code is there just to make netperf/ttcp look good. 
15433 * 15434 * Its possible that after being in polling mode (and having cleared 15435 * the backlog), squeues have turned the interrupt frequency higher 15436 * to improve latency at the expense of more CPU utilization (less 15437 * packets per interrupts or more number of interrupts). Workloads 15438 * like ttcp/netperf do manage to tickle polling once in a while 15439 * but for the remaining time, stay in higher interrupt mode since 15440 * their packet arrival rate is pretty uniform and this shows up 15441 * as higher CPU utilization. Since people care about CPU utilization 15442 * while running netperf/ttcp, turn the interrupt frequency back to 15443 * normal/default if polling has not been used in ip_poll_normal_ticks. 15444 */ 15445 if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { 15446 if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { 15447 ip_ring->rr_poll_state &= ~ILL_POLLING; 15448 ip_ring->rr_blank(ip_ring->rr_handle, 15449 ip_ring->rr_normal_blank_time, 15450 ip_ring->rr_normal_pkt_cnt); 15451 } 15452 } 15453 15454 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 15455 "ip_input_end: q %p (%S)", q, "end"); 15456 #undef rptr 15457 } 15458 15459 static void 15460 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, 15461 t_uscalar_t err) 15462 { 15463 if (dl_err == DL_SYSERR) { 15464 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15465 "%s: %s failed: DL_SYSERR (errno %u)\n", 15466 ill->ill_name, dl_primstr(prim), err); 15467 return; 15468 } 15469 15470 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 15471 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim), 15472 dl_errstr(dl_err)); 15473 } 15474 15475 /* 15476 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other 15477 * than DL_UNITDATA_IND messages. 
If we need to process this message 15478 * exclusively, we call qwriter_ip, in which case we also need to call 15479 * ill_refhold before that, since qwriter_ip does an ill_refrele. 15480 */ 15481 void 15482 ip_rput_dlpi(queue_t *q, mblk_t *mp) 15483 { 15484 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 15485 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 15486 ill_t *ill = q->q_ptr; 15487 t_uscalar_t prim = dloa->dl_primitive; 15488 t_uscalar_t reqprim = DL_PRIM_INVAL; 15489 15490 ip1dbg(("ip_rput_dlpi")); 15491 15492 /* 15493 * If we received an ACK but didn't send a request for it, then it 15494 * can't be part of any pending operation; discard up-front. 15495 */ 15496 switch (prim) { 15497 case DL_ERROR_ACK: 15498 reqprim = dlea->dl_error_primitive; 15499 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s " 15500 "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim), 15501 reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno, 15502 dlea->dl_unix_errno)); 15503 break; 15504 case DL_OK_ACK: 15505 reqprim = dloa->dl_correct_primitive; 15506 break; 15507 case DL_INFO_ACK: 15508 reqprim = DL_INFO_REQ; 15509 break; 15510 case DL_BIND_ACK: 15511 reqprim = DL_BIND_REQ; 15512 break; 15513 case DL_PHYS_ADDR_ACK: 15514 reqprim = DL_PHYS_ADDR_REQ; 15515 break; 15516 case DL_NOTIFY_ACK: 15517 reqprim = DL_NOTIFY_REQ; 15518 break; 15519 case DL_CONTROL_ACK: 15520 reqprim = DL_CONTROL_REQ; 15521 break; 15522 case DL_CAPABILITY_ACK: 15523 reqprim = DL_CAPABILITY_REQ; 15524 break; 15525 } 15526 15527 if (prim != DL_NOTIFY_IND) { 15528 if (reqprim == DL_PRIM_INVAL || 15529 !ill_dlpi_pending(ill, reqprim)) { 15530 /* Not a DLPI message we support or expected */ 15531 freemsg(mp); 15532 return; 15533 } 15534 ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim), 15535 dl_primstr(reqprim))); 15536 } 15537 15538 switch (reqprim) { 15539 case DL_UNBIND_REQ: 15540 /* 15541 * NOTE: we mark the unbind as complete even if we got a 15542 * DL_ERROR_ACK, since there's not much 
else we can do. 15543 */ 15544 mutex_enter(&ill->ill_lock); 15545 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 15546 cv_signal(&ill->ill_cv); 15547 mutex_exit(&ill->ill_lock); 15548 break; 15549 15550 case DL_ENABMULTI_REQ: 15551 if (prim == DL_OK_ACK) { 15552 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) 15553 ill->ill_dlpi_multicast_state = IDS_OK; 15554 } 15555 break; 15556 } 15557 15558 /* 15559 * The message is one we're waiting for (or DL_NOTIFY_IND), but we 15560 * need to become writer to continue to process it. Because an 15561 * exclusive operation doesn't complete until replies to all queued 15562 * DLPI messages have been received, we know we're in the middle of an 15563 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND). 15564 * 15565 * As required by qwriter_ip(), we refhold the ill; it will refrele. 15566 * Since this is on the ill stream we unconditionally bump up the 15567 * refcount without doing ILL_CAN_LOOKUP(). 15568 */ 15569 ill_refhold(ill); 15570 if (prim == DL_NOTIFY_IND) 15571 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE); 15572 else 15573 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE); 15574 } 15575 15576 /* 15577 * Handling of DLPI messages that require exclusive access to the ipsq. 15578 * 15579 * Need to do ill_pending_mp_release on ioctl completion, which could 15580 * happen here. 
* (along with mi_copy_done)
 *
 * This function is handed to qwriter_ip() by ip_rput_dlpi(). `q' is the
 * ill's queue (q->q_ptr is the ill_t) and `mp' is the DLPI acknowledgement
 * or DL_NOTIFY_IND to process; `dummy_arg' is unused.
 *
 * The body is one large dispatch on the primitive found in `mp'. On most
 * paths `mp' is freed before returning; the exceptions are the paths that
 * pass it to another routine and return, or that store it on the ill and
 * clear the local pointer. When a pending ioctl mblk is retrieved with
 * ipsq_pending_mp_get() (kept in `mp1'), that ioctl is completed at the
 * bottom of the function using the error accumulated in `err'.
 */
/* ARGSUSED */
static void
ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
	int		err = 0;
	ill_t		*ill;
	ipif_t		*ipif = NULL;
	mblk_t		*mp1 = NULL;
	conn_t		*connp = NULL;
	t_uscalar_t	paddrreq;
	mblk_t		*mp_hw;
	boolean_t	success;
	boolean_t	ioctl_aborted = B_FALSE;
	boolean_t	log = B_TRUE;
	ip_stack_t	*ipst;

	ip1dbg(("ip_rput_dlpi_writer .."));
	ill = (ill_t *)q->q_ptr;
	ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);

	ASSERT(IAM_WRITER_ILL(ill));

	ipst = ill->ill_ipst;

	/*
	 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e.
	 * both are null or non-null. However we can assert that only
	 * after grabbing the ipsq_lock. So we don't make any assertion
	 * here and in other places in the code.
	 */
	ipif = ipsq->ipsq_pending_ipif;
	/*
	 * The current ioctl could have been aborted by the user and a new
	 * ioctl to bring up another ill could have started. We could still
	 * get a response from the driver later.
	 */
	if (ipif != NULL && ipif->ipif_ill != ill)
		ioctl_aborted = B_TRUE;

	switch (dloa->dl_primitive) {
	case DL_ERROR_ACK:
		ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
		    dl_primstr(dlea->dl_error_primitive)));

		/* Dispatch on the request that was rejected. */
		switch (dlea->dl_error_primitive) {
		case DL_DISABMULTI_REQ:
			if (!ill->ill_isv6)
				ipsq_current_finish(ipsq);
			ill_dlpi_done(ill, dlea->dl_error_primitive);
			break;
		case DL_PROMISCON_REQ:
		case DL_PROMISCOFF_REQ:
		case DL_UNBIND_REQ:
		case DL_ATTACH_REQ:
		case DL_INFO_REQ:
			ill_dlpi_done(ill, dlea->dl_error_primitive);
			break;
		case DL_NOTIFY_REQ:
			ill_dlpi_done(ill, DL_NOTIFY_REQ);
			/* A rejected DL_NOTIFY_REQ is not logged. */
			log = B_FALSE;
			break;
		case DL_PHYS_ADDR_REQ:
			/*
			 * For IPv6 only, there are two additional
			 * phys_addr_req's sent to the driver to get the
			 * IPv6 token and lla. This allows IP to acquire
			 * the hardware address format for a given interface
			 * without having built in knowledge of the hardware
			 * address. ill_phys_addr_pend keeps track of the last
			 * DL_PAR sent so we know which response we are
			 * dealing with. ill_dlpi_done will update
			 * ill_phys_addr_pend when it sends the next req.
			 * We don't complete the IOCTL until all three DL_PARs
			 * have been attempted, so set *_len to 0 and break.
			 */
			paddrreq = ill->ill_phys_addr_pend;
			ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
			if (paddrreq == DL_IPV6_TOKEN) {
				ill->ill_token_length = 0;
				log = B_FALSE;
				break;
			} else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
				ill->ill_nd_lla_len = 0;
				log = B_FALSE;
				break;
			}
			/*
			 * Something went wrong with the DL_PHYS_ADDR_REQ.
			 * We presumably have an IOCTL hanging out waiting
			 * for completion. Find it and complete the IOCTL
			 * with the error noted.
			 * However, ill_dl_phys was called on an ill queue
			 * (from SIOCSLIFNAME), thus conn_pending_ill is not
			 * set. But the ioctl is known to be pending on ill_wq.
			 */
			if (!ill->ill_ifname_pending)
				break;
			ill->ill_ifname_pending = 0;
			if (!ioctl_aborted)
				mp1 = ipsq_pending_mp_get(ipsq, &connp);
			if (mp1 != NULL) {
				/*
				 * This operation (SIOCSLIFNAME) must have
				 * happened on the ill. Assert there is no conn
				 */
				ASSERT(connp == NULL);
				q = ill->ill_wq;
			}
			break;
		case DL_BIND_REQ:
			ill_dlpi_done(ill, DL_BIND_REQ);
			if (ill->ill_ifname_pending)
				break;
			/*
			 * Something went wrong with the bind. We presumably
			 * have an IOCTL hanging out waiting for completion.
			 * Find it, take down the interface that was coming
			 * up, and complete the IOCTL with the error noted.
			 */
			if (!ioctl_aborted)
				mp1 = ipsq_pending_mp_get(ipsq, &connp);
			if (mp1 != NULL) {
				/*
				 * This operation (SIOCSLIFFLAGS) must have
				 * happened from a conn.
				 */
				ASSERT(connp != NULL);
				q = CONNP_TO_WQ(connp);
				if (ill->ill_move_in_progress) {
					ILL_CLEAR_MOVE(ill);
				}
				(void) ipif_down(ipif, NULL, NULL);
				/* error is set below the switch */
			}
			break;
		case DL_ENABMULTI_REQ:
			if (!ill->ill_isv6)
				ipsq_current_finish(ipsq);
			ill_dlpi_done(ill, DL_ENABMULTI_REQ);

			if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
				ill->ill_dlpi_multicast_state = IDS_FAILED;
			if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
				/*
				 * NOTE(review): this declaration shadows the
				 * function-level ipif (the pending ipif) for
				 * the remainder of this block.
				 */
				ipif_t *ipif;

				printf("ip: joining multicasts failed (%d)"
				    " on %s - will use link layer "
				    "broadcasts for multicast\n",
				    dlea->dl_errno, ill->ill_name);

				/*
				 * Set up the multicast mapping alone.
				 * writer, so ok to access ill->ill_ipif
				 * without any lock.
				 */
				ipif = ill->ill_ipif;
				mutex_enter(&ill->ill_phyint->phyint_lock);
				ill->ill_phyint->phyint_flags |=
				    PHYI_MULTI_BCAST;
				mutex_exit(&ill->ill_phyint->phyint_lock);

				if (!ill->ill_isv6) {
					(void) ipif_arp_setup_multicast(ipif,
					    NULL);
				} else {
					(void) ipif_ndp_setup_multicast(ipif,
					    NULL);
				}
			}
			freemsg(mp);	/* Don't want to pass this up */
			return;

		case DL_CAPABILITY_REQ:
		case DL_CONTROL_REQ:
			ill_dlpi_done(ill, dlea->dl_error_primitive);
			ill->ill_dlpi_capab_state = IDS_FAILED;
			freemsg(mp);
			return;
		}
		/*
		 * Note the error for IOCTL completion (mp1 is set when
		 * ready to complete ioctl). If ill_ifname_pending_err is
		 * set, an error occurred during plumbing (ill_ifname_pending),
		 * so we want to report that error.
		 *
		 * NOTE: there are two additional DL_PHYS_ADDR_REQ's
		 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
		 * expected to get errack'd if the driver doesn't support
		 * these flags (e.g. ethernet). log will be set to B_FALSE
		 * if these error conditions are encountered.
		 */
		if (mp1 != NULL) {
			if (ill->ill_ifname_pending_err != 0) {
				err = ill->ill_ifname_pending_err;
				ill->ill_ifname_pending_err = 0;
			} else {
				/* Fall back to ENXIO if no errno was given. */
				err = dlea->dl_unix_errno ?
				    dlea->dl_unix_errno : ENXIO;
			}
		/*
		 * If we're plumbing an interface and an error hasn't already
		 * been saved, set ill_ifname_pending_err to the error passed
		 * up. Ignore the error if log is B_FALSE (see comment above).
		 */
		} else if (log && ill->ill_ifname_pending &&
		    ill->ill_ifname_pending_err == 0) {
			ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
			    dlea->dl_unix_errno : ENXIO;
		}

		if (log)
			ip_dlpi_error(ill, dlea->dl_error_primitive,
			    dlea->dl_errno, dlea->dl_unix_errno);
		break;
	case DL_CAPABILITY_ACK:
		/* Call a routine to handle this one. */
		ill_dlpi_done(ill, DL_CAPABILITY_REQ);
		ill_capability_ack(ill, mp);

		/*
		 * If the ack is due to renegotiation, we will need to send
		 * a new CAPABILITY_REQ to start the renegotiation.
		 */
		if (ill->ill_capab_reneg) {
			ill->ill_capab_reneg = B_FALSE;
			ill_capability_probe(ill);
		}
		break;
	case DL_CONTROL_ACK:
		/* We treat all of these as "fire and forget" */
		ill_dlpi_done(ill, DL_CONTROL_REQ);
		break;
	case DL_INFO_ACK:
		/* Call a routine to handle this one. */
		ill_dlpi_done(ill, DL_INFO_REQ);
		/*
		 * NOTE(review): mp is not freed on this path; presumably
		 * ip_ll_subnet_defaults() takes ownership of it -- confirm.
		 */
		ip_ll_subnet_defaults(ill, mp);
		ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
		return;
	case DL_BIND_ACK:
		/*
		 * We should have an IOCTL waiting on this unless
		 * sent by ill_dl_phys, in which case just return
		 */
		ill_dlpi_done(ill, DL_BIND_REQ);
		if (ill->ill_ifname_pending)
			break;

		if (!ioctl_aborted)
			mp1 = ipsq_pending_mp_get(ipsq, &connp);
		if (mp1 == NULL)
			break;
		/*
		 * Because mp1 was added by ill_dl_up(), and it always
		 * passes a valid connp, connp must be valid here.
		 */
		ASSERT(connp != NULL);
		q = CONNP_TO_WQ(connp);

		/*
		 * We are exclusive. So nothing can change even after
		 * we get the pending mp. If need be we can put it back
		 * and restart, as in calling ipif_arp_up() below.
		 */
		ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));

		mutex_enter(&ill->ill_lock);
		ill->ill_dl_up = 1;
		(void) ill_hook_event_create(ill, 0, NE_UP, NULL, 0);
		mutex_exit(&ill->ill_lock);

		/*
		 * Now bring up the resolver; when that is complete, we'll
		 * create IREs. Note that we intentionally mirror what
		 * ipif_up() would have done, because we got here by way of
		 * ill_dl_up(), which stopped ipif_up()'s processing.
		 */
		if (ill->ill_isv6) {
			/*
			 * v6 interfaces.
			 * Unlike ARP which has to do another bind
			 * and attach, once we get here we are
			 * done with NDP. Except in the case of
			 * ILLF_XRESOLV, in which case we send an
			 * AR_INTERFACE_UP to the external resolver.
			 * If all goes well, the ioctl will complete
			 * in ip_rput(). If there's an error, we
			 * complete it here.
			 */
			if ((err = ipif_ndp_up(ipif)) == 0) {
				if (ill->ill_flags & ILLF_XRESOLV) {
					/* Re-queue mp1 as the pending mp. */
					mutex_enter(&connp->conn_lock);
					mutex_enter(&ill->ill_lock);
					success = ipsq_pending_mp_add(
					    connp, ipif, q, mp1, 0);
					mutex_exit(&ill->ill_lock);
					mutex_exit(&connp->conn_lock);
					if (success) {
						err = ipif_resolver_up(ipif,
						    Res_act_initial);
						if (err == EINPROGRESS) {
							freemsg(mp);
							return;
						}
						ASSERT(err != 0);
						mp1 = ipsq_pending_mp_get(ipsq,
						    &connp);
						ASSERT(mp1 != NULL);
					} else {
						/* conn has started closing */
						err = EINTR;
					}
				} else { /* Non XRESOLV interface */
					(void) ipif_resolver_up(ipif,
					    Res_act_initial);
					err = ipif_up_done_v6(ipif);
				}
			}
		} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
			/*
			 * ARP and other v4 external resolvers.
			 * Leave the pending mblk intact so that
			 * the ioctl completes in ip_rput().
			 */
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);
			success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
			mutex_exit(&ill->ill_lock);
			mutex_exit(&connp->conn_lock);
			if (success) {
				err = ipif_resolver_up(ipif, Res_act_initial);
				if (err == EINPROGRESS) {
					freemsg(mp);
					return;
				}
				ASSERT(err != 0);
				mp1 = ipsq_pending_mp_get(ipsq, &connp);
			} else {
				/* The conn has started closing */
				err = EINTR;
			}
		} else {
			/*
			 * This one is complete. Reply to pending ioctl.
			 */
			(void) ipif_resolver_up(ipif, Res_act_initial);
			err = ipif_up_done(ipif);
		}

		if ((err == 0) && (ill->ill_up_ipifs)) {
			err = ill_up_ipifs(ill, q, mp1);
			if (err == EINPROGRESS) {
				freemsg(mp);
				return;
			}
		}

		if (ill->ill_up_ipifs) {
			ill_group_cleanup(ill);
		}

		break;
	case DL_NOTIFY_IND: {
		dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
		ire_t *ire;
		boolean_t need_ire_walk_v4 = B_FALSE;
		boolean_t need_ire_walk_v6 = B_FALSE;

		switch (notify->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			err = ill_set_phys_addr(ill, mp);
			break;

		case DL_NOTE_FASTPATH_FLUSH:
			ill_fastpath_flush(ill);
			break;

		case DL_NOTE_SDU_SIZE:
			/*
			 * Change the MTU size of the interface, of all
			 * attached ipif's, and of all relevant ire's. The
			 * new value's a uint32_t at notify->dl_data.
			 * Mtu change Vs. new ire creation - protocol below.
			 *
			 * a Mark the ipif as IPIF_CHANGING.
			 * b Set the new mtu in the ipif.
			 * c Change the ire_max_frag on all affected ires
			 * d Unmark the IPIF_CHANGING
			 *
			 * To see how the protocol works, assume an interface
			 * route is also being added simultaneously by
			 * ip_rt_add and let 'ipif' be the ipif referenced by
			 * the ire. If the ire is created before step a,
			 * it will be cleaned up by step c. If the ire is
			 * created after step d, it will see the new value of
			 * ipif_mtu. Any attempt to create the ire between
			 * steps a to d will fail because of the IPIF_CHANGING
			 * flag. Note that ire_create() is passed a pointer to
			 * the ipif_mtu, and not the value. During ire_add
			 * under the bucket lock, the ire_max_frag of the
			 * new ire being created is set from the ipif/ire from
			 * which it is being derived.
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_max_frag = (uint_t)notify->dl_data;

			/*
			 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu
			 * leave it alone
			 */
			if (ill->ill_mtu_userspecified) {
				mutex_exit(&ill->ill_lock);
				break;
			}
			/* Clamp the new MTU to the protocol minimum. */
			ill->ill_max_mtu = ill->ill_max_frag;
			if (ill->ill_isv6) {
				if (ill->ill_max_mtu < IPV6_MIN_MTU)
					ill->ill_max_mtu = IPV6_MIN_MTU;
			} else {
				if (ill->ill_max_mtu < IP_MIN_MTU)
					ill->ill_max_mtu = IP_MIN_MTU;
			}
			/*
			 * Note: this loop reuses the function-level ipif
			 * variable as its iterator.
			 */
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				/*
				 * Don't override the mtu if the user
				 * has explicitly set it.
				 */
				if (ipif->ipif_flags & IPIF_FIXEDMTU)
					continue;
				ipif->ipif_mtu = (uint_t)notify->dl_data;
				if (ipif->ipif_isv6)
					ire = ipif_to_ire_v6(ipif);
				else
					ire = ipif_to_ire(ipif);
				if (ire != NULL) {
					ire->ire_max_frag = ipif->ipif_mtu;
					ire_refrele(ire);
				}
				if (ipif->ipif_flags & IPIF_UP) {
					if (ill->ill_isv6)
						need_ire_walk_v6 = B_TRUE;
					else
						need_ire_walk_v4 = B_TRUE;
				}
			}
			mutex_exit(&ill->ill_lock);
			/* The ire walks are done after dropping ill_lock. */
			if (need_ire_walk_v4)
				ire_walk_v4(ill_mtu_change, (char *)ill,
				    ALL_ZONES, ipst);
			if (need_ire_walk_v6)
				ire_walk_v6(ill_mtu_change, (char *)ill,
				    ALL_ZONES, ipst);
			break;
		case DL_NOTE_LINK_UP:
		case DL_NOTE_LINK_DOWN: {
			/*
			 * We are writer. ill / phyint / ipsq assocs stable.
			 * The RUNNING flag reflects the state of the link.
			 */
			phyint_t *phyint = ill->ill_phyint;
			uint64_t new_phyint_flags;
			boolean_t changed = B_FALSE;
			boolean_t went_up;

			went_up = notify->dl_notification == DL_NOTE_LINK_UP;
			mutex_enter(&phyint->phyint_lock);
			new_phyint_flags = went_up ?
			    phyint->phyint_flags | PHYI_RUNNING :
			    phyint->phyint_flags & ~PHYI_RUNNING;
			if (new_phyint_flags != phyint->phyint_flags) {
				phyint->phyint_flags = new_phyint_flags;
				changed = B_TRUE;
			}
			mutex_exit(&phyint->phyint_lock);
			/*
			 * ill_restart_dad handles the DAD restart and routing
			 * socket notification logic.
			 */
			if (changed) {
				ill_restart_dad(phyint->phyint_illv4, went_up);
				ill_restart_dad(phyint->phyint_illv6, went_up);
			}
			break;
		}
		case DL_NOTE_PROMISC_ON_PHYS:
			IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: "
			    "got a DL_NOTE_PROMISC_ON_PHYS\n"));
			mutex_enter(&ill->ill_lock);
			ill->ill_promisc_on_phys = B_TRUE;
			mutex_exit(&ill->ill_lock);
			break;
		case DL_NOTE_PROMISC_OFF_PHYS:
			IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: "
			    "got a DL_NOTE_PROMISC_OFF_PHYS\n"));
			mutex_enter(&ill->ill_lock);
			ill->ill_promisc_on_phys = B_FALSE;
			mutex_exit(&ill->ill_lock);
			break;
		case DL_NOTE_CAPAB_RENEG:
			/*
			 * Something changed on the driver side.
			 * It wants us to renegotiate the capabilities
			 * on this ill. One possible cause is the aggregation
			 * interface under us where a port got added or
			 * went away.
			 *
			 * If the capability negotiation is already done
			 * or is in progress, reset the capabilities and
			 * mark the ill's ill_capab_reneg to be B_TRUE,
			 * so that when the ack comes back, we can start
			 * the renegotiation process.
			 *
			 * Note that if ill_capab_reneg is already B_TRUE
			 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case),
			 * the capability resetting request has been sent
			 * and the renegotiation has not been started yet;
			 * nothing needs to be done in this case.
			 */
			if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) {
				ill_capability_reset(ill);
				ill->ill_capab_reneg = B_TRUE;
			}
			break;
		default:
			ip0dbg(("ip_rput_dlpi_writer: unknown notification "
			    "type 0x%x for DL_NOTIFY_IND\n",
			    notify->dl_notification));
			break;
		}

		/*
		 * As this is an asynchronous operation, we
		 * should not call ill_dlpi_done
		 */
		break;
	}
	case DL_NOTIFY_ACK: {
		dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;

		/* Remember whether link up notifications were granted. */
		if (noteack->dl_notifications & DL_NOTE_LINK_UP)
			ill->ill_note_link = 1;
		ill_dlpi_done(ill, DL_NOTIFY_REQ);
		break;
	}
	case DL_PHYS_ADDR_ACK: {
		/*
		 * As part of plumbing the interface via SIOCSLIFNAME,
		 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
		 * whose answers we receive here. As each answer is received,
		 * we call ill_dlpi_done() to dispatch the next request as
		 * we're processing the current one. Once all answers have
		 * been received, we use ipsq_pending_mp_get() to dequeue the
		 * outstanding IOCTL and reply to it. (Because ill_dl_phys()
		 * is invoked from an ill queue, conn_oper_pending_ill is not
		 * available, but we know the ioctl is pending on ill_wq.)
		 */
		uint_t paddrlen, paddroff;

		paddrreq = ill->ill_phys_addr_pend;
		paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
		paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;

		ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
		if (paddrreq == DL_IPV6_TOKEN) {
			/*
			 * bcopy to low-order bits of ill_token
			 *
			 * XXX Temporary hack - currently, all known tokens
			 * are 64 bits, so I'll cheat for the moment.
			 *
			 * NOTE(review): paddrlen and paddroff come straight
			 * from the driver's ack and are not range-checked
			 * here; the copy starts at s6_addr32[2], so this
			 * assumes paddrlen <= 8 and that the address lies
			 * within mp -- confirm whether a caller validates.
			 */
			bcopy(mp->b_rptr + paddroff,
			    &ill->ill_token.s6_addr32[2], paddrlen);
			ill->ill_token_length = paddrlen;
			break;
		} else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
			ASSERT(ill->ill_nd_lla_mp == NULL);
			ill_set_ndmp(ill, mp, paddroff, paddrlen);
			/*
			 * mp was passed to ill_set_ndmp(); clearing the local
			 * pointer keeps the common freemsg() below off it.
			 */
			mp = NULL;
			break;
		}

		ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
		ASSERT(ill->ill_phys_addr_mp == NULL);
		if (!ill->ill_ifname_pending)
			break;
		ill->ill_ifname_pending = 0;
		if (!ioctl_aborted)
			mp1 = ipsq_pending_mp_get(ipsq, &connp);
		if (mp1 != NULL) {
			ASSERT(connp == NULL);
			q = ill->ill_wq;
		}
		/*
		 * If any error acks received during the plumbing sequence,
		 * ill_ifname_pending_err will be set. Break out and send up
		 * the error to the pending ioctl.
		 */
		if (ill->ill_ifname_pending_err != 0) {
			err = ill->ill_ifname_pending_err;
			ill->ill_ifname_pending_err = 0;
			break;
		}

		/*
		 * The ill keeps the ack mblk from here on; the local pointer
		 * is cleared so the common freemsg() below does not free it.
		 */
		ill->ill_phys_addr_mp = mp;
		ill->ill_phys_addr = mp->b_rptr + paddroff;
		mp = NULL;

		/*
		 * If paddrlen is zero, the DLPI provider doesn't support
		 * physical addresses. The other two tests were historical
		 * workarounds for bugs in our former PPP implementation, but
		 * now other things have grown dependencies on them -- e.g.,
		 * the tun module specifies a dl_addr_length of zero in its
		 * DL_BIND_ACK, but then specifies an incorrect value in its
		 * DL_PHYS_ADDR_ACK. These bogus checks need to be removed,
		 * but only after careful testing ensures that all dependent
		 * broken DLPI providers have been fixed.
		 */
		if (paddrlen == 0 || ill->ill_phys_addr_length == 0 ||
		    ill->ill_phys_addr_length == IP_ADDR_LEN) {
			ill->ill_phys_addr = NULL;
		} else if (paddrlen != ill->ill_phys_addr_length) {
			ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
			    paddrlen, ill->ill_phys_addr_length));
			err = EINVAL;
			break;
		}

		/*
		 * No link-layer address message has been set yet, so build
		 * one from a copy of the physical address ack.
		 */
		if (ill->ill_nd_lla_mp == NULL) {
			if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
				err = ENOMEM;
				break;
			}
			ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
		}

		/*
		 * Set the interface token. If the zeroth interface address
		 * is unspecified, then set it to the link local address.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
			(void) ill_setdefaulttoken(ill);

		ASSERT(ill->ill_ipif->ipif_id == 0);
		if (ipif != NULL &&
		    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
			(void) ipif_setlinklocal(ipif);
		}
		break;
	}
	case DL_OK_ACK:
		ip2dbg(("DL_OK_ACK %s (0x%x)\n",
		    dl_primstr((int)dloa->dl_correct_primitive),
		    dloa->dl_correct_primitive));
		switch (dloa->dl_correct_primitive) {
		case DL_ENABMULTI_REQ:
		case DL_DISABMULTI_REQ:
			if (!ill->ill_isv6)
				ipsq_current_finish(ipsq);
			ill_dlpi_done(ill, dloa->dl_correct_primitive);
			break;
		case DL_PROMISCON_REQ:
		case DL_PROMISCOFF_REQ:
		case DL_UNBIND_REQ:
		case DL_ATTACH_REQ:
			ill_dlpi_done(ill, dloa->dl_correct_primitive);
			break;
		}
		break;
	default:
		break;
	}

	/* Common exit: mp is NULL here if ownership was passed on above. */
	freemsg(mp);
	if (mp1 != NULL) {
		/*
		 * The operation must complete without EINPROGRESS
		 * since ipsq_pending_mp_get() has removed the mblk
		 * from ipsq_pending_mp. Otherwise, the operation
		 * will be stuck forever in the ipsq.
		 */
		ASSERT(err != EINPROGRESS);

		/*
		 * Complete the operation according to the ioctl recorded in
		 * the ipsq. SIOCLIFADDIF and SIOCSLIFNAME are finished with
		 * COPYOUT, every other ioctl with NO_COPYOUT. (A value of 0
		 * presumably means no ioctl is current -- confirm.)
		 */
		switch (ipsq->ipsq_current_ioctl) {
		case 0:
			ipsq_current_finish(ipsq);
			break;

		case SIOCLIFADDIF:
		case SIOCSLIFNAME:
			ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
			break;

		default:
			ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
			break;
		}
	}
}

/*
 * ip_rput_other is called by ip_rput to handle messages modifying the global
 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared)
 */
/* ARGSUSED */
void
ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t		*ill;
	struct iocblk	*iocp;
	mblk_t		*mp1;
	conn_t		*connp = NULL;

	ip1dbg(("ip_rput_other "));
	ill = (ill_t *)q->q_ptr;
	/*
	 * This routine is not a writer in the case of SIOCGTUNPARAM
	 * in which case ipsq is NULL.
	 */
	if (ipsq != NULL) {
		ASSERT(IAM_WRITER_IPSQ(ipsq));
		ASSERT(ipsq == ill->ill_phyint->phyint_ipsq);
	}

	switch (mp->b_datap->db_type) {
	case M_ERROR:
	case M_HANGUP:
		/*
		 * The device has a problem. We force the ILL down. It can
		 * be brought up again manually using SIOCSIFFLAGS (via
		 * ifconfig or equivalent).
		 */
		ASSERT(ipsq != NULL);
		if (mp->b_rptr < mp->b_wptr)
			ill->ill_error = (int)(*mp->b_rptr & 0xFF);
		if (ill->ill_error == 0)
			ill->ill_error = ENXIO;
		if (!ill_down_start(q, mp))
			return;
		ipif_all_down_tail(ipsq, q, mp, NULL);
		break;
	case M_IOCACK:
		iocp = (struct iocblk *)mp->b_rptr;
		ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO);
		switch (iocp->ioc_cmd) {
		case SIOCSTUNPARAM:
		case OSIOCSTUNPARAM:
			ASSERT(ipsq != NULL);
			/*
			 * Finish socket ioctl passed through to tun.
			 * We should have an IOCTL waiting on this.
16335 */ 16336 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16337 if (ill->ill_isv6) { 16338 struct iftun_req *ta; 16339 16340 /* 16341 * if a source or destination is 16342 * being set, try and set the link 16343 * local address for the tunnel 16344 */ 16345 ta = (struct iftun_req *)mp->b_cont-> 16346 b_cont->b_rptr; 16347 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 16348 ipif_set_tun_llink(ill, ta); 16349 } 16350 16351 } 16352 if (mp1 != NULL) { 16353 /* 16354 * Now copy back the b_next/b_prev used by 16355 * mi code for the mi_copy* functions. 16356 * See ip_sioctl_tunparam() for the reason. 16357 * Also protect against missing b_cont. 16358 */ 16359 if (mp->b_cont != NULL) { 16360 mp->b_cont->b_next = 16361 mp1->b_cont->b_next; 16362 mp->b_cont->b_prev = 16363 mp1->b_cont->b_prev; 16364 } 16365 inet_freemsg(mp1); 16366 ASSERT(connp != NULL); 16367 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16368 iocp->ioc_error, NO_COPYOUT, ipsq); 16369 } else { 16370 ASSERT(connp == NULL); 16371 putnext(q, mp); 16372 } 16373 break; 16374 case SIOCGTUNPARAM: 16375 case OSIOCGTUNPARAM: 16376 /* 16377 * This is really M_IOCDATA from the tunnel driver. 16378 * convert back and complete the ioctl. 16379 * We should have an IOCTL waiting on this. 16380 */ 16381 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 16382 if (mp1) { 16383 /* 16384 * Now copy back the b_next/b_prev used by 16385 * mi code for the mi_copy* functions. 16386 * See ip_sioctl_tunparam() for the reason. 16387 * Also protect against missing b_cont. 
16388 */ 16389 if (mp->b_cont != NULL) { 16390 mp->b_cont->b_next = 16391 mp1->b_cont->b_next; 16392 mp->b_cont->b_prev = 16393 mp1->b_cont->b_prev; 16394 } 16395 inet_freemsg(mp1); 16396 if (iocp->ioc_error == 0) 16397 mp->b_datap->db_type = M_IOCDATA; 16398 ASSERT(connp != NULL); 16399 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16400 iocp->ioc_error, COPYOUT, NULL); 16401 } else { 16402 ASSERT(connp == NULL); 16403 putnext(q, mp); 16404 } 16405 break; 16406 default: 16407 break; 16408 } 16409 break; 16410 case M_IOCNAK: 16411 iocp = (struct iocblk *)mp->b_rptr; 16412 16413 switch (iocp->ioc_cmd) { 16414 int mode; 16415 16416 case DL_IOC_HDR_INFO: 16417 /* 16418 * If this was the first attempt turn of the 16419 * fastpath probing. 16420 */ 16421 mutex_enter(&ill->ill_lock); 16422 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { 16423 ill->ill_dlpi_fastpath_state = IDS_FAILED; 16424 mutex_exit(&ill->ill_lock); 16425 ill_fastpath_nack(ill); 16426 ip1dbg(("ip_rput: DLPI fastpath off on " 16427 "interface %s\n", 16428 ill->ill_name)); 16429 } else { 16430 mutex_exit(&ill->ill_lock); 16431 } 16432 freemsg(mp); 16433 break; 16434 case SIOCSTUNPARAM: 16435 case OSIOCSTUNPARAM: 16436 ASSERT(ipsq != NULL); 16437 /* 16438 * Finish socket ioctl passed through to tun 16439 * We should have an IOCTL waiting on this. 16440 */ 16441 /* FALLTHRU */ 16442 case SIOCGTUNPARAM: 16443 case OSIOCGTUNPARAM: 16444 /* 16445 * This is really M_IOCDATA from the tunnel driver. 16446 * convert back and complete the ioctl. 16447 * We should have an IOCTL waiting on this. 16448 */ 16449 if (iocp->ioc_cmd == SIOCGTUNPARAM || 16450 iocp->ioc_cmd == OSIOCGTUNPARAM) { 16451 mp1 = ill_pending_mp_get(ill, &connp, 16452 iocp->ioc_id); 16453 mode = COPYOUT; 16454 ipsq = NULL; 16455 } else { 16456 mp1 = ipsq_pending_mp_get(ipsq, &connp); 16457 mode = NO_COPYOUT; 16458 } 16459 if (mp1 != NULL) { 16460 /* 16461 * Now copy back the b_next/b_prev used by 16462 * mi code for the mi_copy* functions. 
16463 * See ip_sioctl_tunparam() for the reason. 16464 * Also protect against missing b_cont. 16465 */ 16466 if (mp->b_cont != NULL) { 16467 mp->b_cont->b_next = 16468 mp1->b_cont->b_next; 16469 mp->b_cont->b_prev = 16470 mp1->b_cont->b_prev; 16471 } 16472 inet_freemsg(mp1); 16473 if (iocp->ioc_error == 0) 16474 iocp->ioc_error = EINVAL; 16475 ASSERT(connp != NULL); 16476 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 16477 iocp->ioc_error, mode, ipsq); 16478 } else { 16479 ASSERT(connp == NULL); 16480 putnext(q, mp); 16481 } 16482 break; 16483 default: 16484 break; 16485 } 16486 default: 16487 break; 16488 } 16489 } 16490 16491 /* 16492 * NOTE : This function does not ire_refrele the ire argument passed in. 16493 * 16494 * IPQoS notes 16495 * IP policy is invoked twice for a forwarded packet, once on the read side 16496 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are 16497 * enabled. An additional parameter, in_ill, has been added for this purpose. 16498 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 16499 * because ip_mroute drops this information. 16500 * 16501 */ 16502 void 16503 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 16504 { 16505 uint32_t old_pkt_len; 16506 uint32_t pkt_len; 16507 queue_t *q; 16508 uint32_t sum; 16509 #define rptr ((uchar_t *)ipha) 16510 uint32_t max_frag; 16511 uint32_t ill_index; 16512 ill_t *out_ill; 16513 mib2_ipIfStatsEntry_t *mibptr; 16514 ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; 16515 16516 /* Get the ill_index of the incoming ILL */ 16517 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 16518 mibptr = (in_ill != NULL) ? 
in_ill->ill_ip_mib : &ipst->ips_ip_mib; 16519 16520 /* Initiate Read side IPPF processing */ 16521 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 16522 ip_process(IPP_FWD_IN, &mp, ill_index); 16523 if (mp == NULL) { 16524 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 16525 "during IPPF processing\n")); 16526 return; 16527 } 16528 } 16529 16530 /* Adjust the checksum to reflect the ttl decrement. */ 16531 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 16532 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 16533 16534 if (ipha->ipha_ttl-- <= 1) { 16535 if (ip_csum_hdr(ipha)) { 16536 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16537 goto drop_pkt; 16538 } 16539 /* 16540 * Note: ire_stq this will be NULL for multicast 16541 * datagrams using the long path through arp (the IRE 16542 * is not an IRE_CACHE). This should not cause 16543 * problems since we don't generate ICMP errors for 16544 * multicast packets. 16545 */ 16546 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16547 q = ire->ire_stq; 16548 if (q != NULL) { 16549 /* Sent by forwarding path, and router is global zone */ 16550 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, 16551 GLOBAL_ZONEID, ipst); 16552 } else 16553 freemsg(mp); 16554 return; 16555 } 16556 16557 /* 16558 * Don't forward if the interface is down 16559 */ 16560 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 16561 BUMP_MIB(mibptr, ipIfStatsInDiscards); 16562 ip2dbg(("ip_rput_forward:interface is down\n")); 16563 goto drop_pkt; 16564 } 16565 16566 /* Get the ill_index of the outgoing ILL */ 16567 out_ill = ire_to_ill(ire); 16568 ill_index = out_ill->ill_phyint->phyint_ifindex; 16569 16570 DTRACE_PROBE4(ip4__forwarding__start, 16571 ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16572 16573 FW_HOOKS(ipst->ips_ip4_forwarding_event, 16574 ipst->ips_ipv4firewall_forwarding, 16575 in_ill, out_ill, ipha, mp, mp, 0, ipst); 16576 16577 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 16578 16579 if (mp == NULL) 16580 
return; 16581 old_pkt_len = pkt_len = ntohs(ipha->ipha_length); 16582 16583 if (is_system_labeled()) { 16584 mblk_t *mp1; 16585 16586 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 16587 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16588 goto drop_pkt; 16589 } 16590 /* Size may have changed */ 16591 mp = mp1; 16592 ipha = (ipha_t *)mp->b_rptr; 16593 pkt_len = ntohs(ipha->ipha_length); 16594 } 16595 16596 /* Check if there are options to update */ 16597 if (!IS_SIMPLE_IPH(ipha)) { 16598 if (ip_csum_hdr(ipha)) { 16599 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16600 goto drop_pkt; 16601 } 16602 if (ip_rput_forward_options(mp, ipha, ire, ipst)) { 16603 BUMP_MIB(mibptr, ipIfStatsForwProhibits); 16604 return; 16605 } 16606 16607 ipha->ipha_hdr_checksum = 0; 16608 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 16609 } 16610 max_frag = ire->ire_max_frag; 16611 if (pkt_len > max_frag) { 16612 /* 16613 * It needs fragging on its way out. We haven't 16614 * verified the header checksum yet. Since we 16615 * are going to put a surely good checksum in the 16616 * outgoing header, we have to make sure that it 16617 * was good coming in. 16618 */ 16619 if (ip_csum_hdr(ipha)) { 16620 BUMP_MIB(mibptr, ipIfStatsInCksumErrs); 16621 goto drop_pkt; 16622 } 16623 /* Initiate Write side IPPF processing */ 16624 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 16625 ip_process(IPP_FWD_OUT, &mp, ill_index); 16626 if (mp == NULL) { 16627 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 16628 " during IPPF processing\n")); 16629 return; 16630 } 16631 } 16632 /* 16633 * Handle labeled packet resizing. 16634 * 16635 * If we have added a label, inform ip_wput_frag() of its 16636 * effect on the MTU for ICMP messages. 
16637 */ 16638 if (pkt_len > old_pkt_len) { 16639 uint32_t secopt_size; 16640 16641 secopt_size = pkt_len - old_pkt_len; 16642 if (secopt_size < max_frag) 16643 max_frag -= secopt_size; 16644 } 16645 16646 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst); 16647 ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); 16648 return; 16649 } 16650 16651 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 16652 ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); 16653 FW_HOOKS(ipst->ips_ip4_physical_out_event, 16654 ipst->ips_ipv4firewall_physical_out, 16655 NULL, out_ill, ipha, mp, mp, 0, ipst); 16656 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 16657 if (mp == NULL) 16658 return; 16659 16660 mp->b_prev = (mblk_t *)IPP_FWD_OUT; 16661 ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); 16662 (void) ip_xmit_v4(mp, ire, NULL, B_FALSE); 16663 /* ip_xmit_v4 always consumes the packet */ 16664 return; 16665 16666 drop_pkt:; 16667 ip1dbg(("ip_rput_forward: drop pkt\n")); 16668 freemsg(mp); 16669 #undef rptr 16670 } 16671 16672 void 16673 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 16674 { 16675 ire_t *ire; 16676 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16677 16678 ASSERT(!ipif->ipif_isv6); 16679 /* 16680 * Find an IRE which matches the destination and the outgoing 16681 * queue in the cache table. All we need is an IRE_CACHE which 16682 * is pointing at ipif->ipif_ill. If it is part of some ill group, 16683 * then it is enough to have some IRE_CACHE in the group. 16684 */ 16685 if (ipif->ipif_flags & IPIF_POINTOPOINT) 16686 dst = ipif->ipif_pp_dst_addr; 16687 16688 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), 16689 MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR, ipst); 16690 if (ire == NULL) { 16691 /* 16692 * Mark this packet to make it be delivered to 16693 * ip_rput_forward after the new ire has been 16694 * created. 
16695 */ 16696 mp->b_prev = NULL; 16697 mp->b_next = mp; 16698 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 16699 NULL, 0, GLOBAL_ZONEID, &zero_info); 16700 } else { 16701 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 16702 IRE_REFRELE(ire); 16703 } 16704 } 16705 16706 /* Update any source route, record route or timestamp options */ 16707 static int 16708 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) 16709 { 16710 ipoptp_t opts; 16711 uchar_t *opt; 16712 uint8_t optval; 16713 uint8_t optlen; 16714 ipaddr_t dst; 16715 uint32_t ts; 16716 ire_t *dst_ire = NULL; 16717 ire_t *tmp_ire = NULL; 16718 timestruc_t now; 16719 16720 ip2dbg(("ip_rput_forward_options\n")); 16721 dst = ipha->ipha_dst; 16722 for (optval = ipoptp_first(&opts, ipha); 16723 optval != IPOPT_EOL; 16724 optval = ipoptp_next(&opts)) { 16725 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 16726 opt = opts.ipoptp_cur; 16727 optlen = opts.ipoptp_len; 16728 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 16729 optval, opts.ipoptp_len)); 16730 switch (optval) { 16731 uint32_t off; 16732 case IPOPT_SSRR: 16733 case IPOPT_LSRR: 16734 /* Check if adminstratively disabled */ 16735 if (!ipst->ips_ip_forward_src_routed) { 16736 if (ire->ire_stq != NULL) { 16737 /* 16738 * Sent by forwarding path, and router 16739 * is global zone 16740 */ 16741 icmp_unreachable(ire->ire_stq, mp, 16742 ICMP_SOURCE_ROUTE_FAILED, 16743 GLOBAL_ZONEID, ipst); 16744 } else { 16745 ip0dbg(("ip_rput_forward_options: " 16746 "unable to send unreach\n")); 16747 freemsg(mp); 16748 } 16749 return (-1); 16750 } 16751 16752 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16753 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16754 if (dst_ire == NULL) { 16755 /* 16756 * Must be partial since ip_rput_options 16757 * checked for strict. 
16758 */ 16759 break; 16760 } 16761 off = opt[IPOPT_OFFSET]; 16762 off--; 16763 redo_srr: 16764 if (optlen < IP_ADDR_LEN || 16765 off > optlen - IP_ADDR_LEN) { 16766 /* End of source route */ 16767 ip1dbg(( 16768 "ip_rput_forward_options: end of SR\n")); 16769 ire_refrele(dst_ire); 16770 break; 16771 } 16772 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16773 bcopy(&ire->ire_src_addr, (char *)opt + off, 16774 IP_ADDR_LEN); 16775 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 16776 ntohl(dst))); 16777 16778 /* 16779 * Check if our address is present more than 16780 * once as consecutive hops in source route. 16781 */ 16782 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16783 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 16784 if (tmp_ire != NULL) { 16785 ire_refrele(tmp_ire); 16786 off += IP_ADDR_LEN; 16787 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16788 goto redo_srr; 16789 } 16790 ipha->ipha_dst = dst; 16791 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16792 ire_refrele(dst_ire); 16793 break; 16794 case IPOPT_RR: 16795 off = opt[IPOPT_OFFSET]; 16796 off--; 16797 if (optlen < IP_ADDR_LEN || 16798 off > optlen - IP_ADDR_LEN) { 16799 /* No more room - ignore */ 16800 ip1dbg(( 16801 "ip_rput_forward_options: end of RR\n")); 16802 break; 16803 } 16804 bcopy(&ire->ire_src_addr, (char *)opt + off, 16805 IP_ADDR_LEN); 16806 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16807 break; 16808 case IPOPT_TS: 16809 /* Insert timestamp if there is room */ 16810 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16811 case IPOPT_TS_TSONLY: 16812 off = IPOPT_TS_TIMELEN; 16813 break; 16814 case IPOPT_TS_PRESPEC: 16815 case IPOPT_TS_PRESPEC_RFC791: 16816 /* Verify that the address matched */ 16817 off = opt[IPOPT_OFFSET] - 1; 16818 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16819 dst_ire = ire_ctable_lookup(dst, 0, 16820 IRE_LOCAL, NULL, ALL_ZONES, NULL, 16821 MATCH_IRE_TYPE, ipst); 16822 if (dst_ire == NULL) { 16823 /* Not for us */ 16824 break; 16825 } 16826 ire_refrele(dst_ire); 16827 /* FALLTHRU */ 16828 case 
IPOPT_TS_TSANDADDR: 16829 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16830 break; 16831 default: 16832 /* 16833 * ip_*put_options should have already 16834 * dropped this packet. 16835 */ 16836 cmn_err(CE_PANIC, "ip_rput_forward_options: " 16837 "unknown IT - bug in ip_rput_options?\n"); 16838 return (0); /* Keep "lint" happy */ 16839 } 16840 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 16841 /* Increase overflow counter */ 16842 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 16843 opt[IPOPT_POS_OV_FLG] = 16844 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 16845 (off << 4)); 16846 break; 16847 } 16848 off = opt[IPOPT_OFFSET] - 1; 16849 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16850 case IPOPT_TS_PRESPEC: 16851 case IPOPT_TS_PRESPEC_RFC791: 16852 case IPOPT_TS_TSANDADDR: 16853 bcopy(&ire->ire_src_addr, 16854 (char *)opt + off, IP_ADDR_LEN); 16855 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16856 /* FALLTHRU */ 16857 case IPOPT_TS_TSONLY: 16858 off = opt[IPOPT_OFFSET] - 1; 16859 /* Compute # of milliseconds since midnight */ 16860 gethrestime(&now); 16861 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 16862 now.tv_nsec / (NANOSEC / MILLISEC); 16863 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 16864 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 16865 break; 16866 } 16867 break; 16868 } 16869 } 16870 return (0); 16871 } 16872 16873 /* 16874 * This is called after processing at least one of AH/ESP headers. 16875 * 16876 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 16877 * the actual, physical interface on which the packet was received, 16878 * but, when ip_strict_dst_multihoming is set to 1, could be the 16879 * interface which had the ipha_dst configured when the packet went 16880 * through ip_rput. The ill_index corresponding to the recv_ill 16881 * is saved in ipsec_in_rill_index 16882 * 16883 * NOTE2: The "ire" argument is only used in IPv4 cases. This function 16884 * cannot assume "ire" points to valid data for any IPv6 cases. 
16885 */ 16886 void 16887 ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) 16888 { 16889 mblk_t *mp; 16890 ipaddr_t dst; 16891 in6_addr_t *v6dstp; 16892 ipha_t *ipha; 16893 ip6_t *ip6h; 16894 ipsec_in_t *ii; 16895 boolean_t ill_need_rele = B_FALSE; 16896 boolean_t rill_need_rele = B_FALSE; 16897 boolean_t ire_need_rele = B_FALSE; 16898 netstack_t *ns; 16899 ip_stack_t *ipst; 16900 16901 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 16902 ASSERT(ii->ipsec_in_ill_index != 0); 16903 ns = ii->ipsec_in_ns; 16904 ASSERT(ii->ipsec_in_ns != NULL); 16905 ipst = ns->netstack_ip; 16906 16907 mp = ipsec_mp->b_cont; 16908 ASSERT(mp != NULL); 16909 16910 16911 if (ill == NULL) { 16912 ASSERT(recv_ill == NULL); 16913 /* 16914 * We need to get the original queue on which ip_rput_local 16915 * or ip_rput_data_v6 was called. 16916 */ 16917 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 16918 !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst); 16919 ill_need_rele = B_TRUE; 16920 16921 if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) { 16922 recv_ill = ill_lookup_on_ifindex( 16923 ii->ipsec_in_rill_index, !ii->ipsec_in_v4, 16924 NULL, NULL, NULL, NULL, ipst); 16925 rill_need_rele = B_TRUE; 16926 } else { 16927 recv_ill = ill; 16928 } 16929 16930 if ((ill == NULL) || (recv_ill == NULL)) { 16931 ip0dbg(("ip_fanout_proto_again: interface " 16932 "disappeared\n")); 16933 if (ill != NULL) 16934 ill_refrele(ill); 16935 if (recv_ill != NULL) 16936 ill_refrele(recv_ill); 16937 freemsg(ipsec_mp); 16938 return; 16939 } 16940 } 16941 16942 ASSERT(ill != NULL && recv_ill != NULL); 16943 16944 if (mp->b_datap->db_type == M_CTL) { 16945 /* 16946 * AH/ESP is returning the ICMP message after 16947 * removing their headers. Fanout again till 16948 * it gets to the right protocol. 
16949 */ 16950 if (ii->ipsec_in_v4) { 16951 icmph_t *icmph; 16952 int iph_hdr_length; 16953 int hdr_length; 16954 16955 ipha = (ipha_t *)mp->b_rptr; 16956 iph_hdr_length = IPH_HDR_LENGTH(ipha); 16957 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 16958 ipha = (ipha_t *)&icmph[1]; 16959 hdr_length = IPH_HDR_LENGTH(ipha); 16960 /* 16961 * icmp_inbound_error_fanout may need to do pullupmsg. 16962 * Reset the type to M_DATA. 16963 */ 16964 mp->b_datap->db_type = M_DATA; 16965 icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp, 16966 icmph, ipha, iph_hdr_length, hdr_length, B_TRUE, 16967 B_FALSE, ill, ii->ipsec_in_zoneid); 16968 } else { 16969 icmp6_t *icmp6; 16970 int hdr_length; 16971 16972 ip6h = (ip6_t *)mp->b_rptr; 16973 /* Don't call hdr_length_v6() unless you have to. */ 16974 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) 16975 hdr_length = ip_hdr_length_v6(mp, ip6h); 16976 else 16977 hdr_length = IPV6_HDR_LEN; 16978 16979 icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); 16980 /* 16981 * icmp_inbound_error_fanout_v6 may need to do 16982 * pullupmsg. Reset the type to M_DATA. 16983 */ 16984 mp->b_datap->db_type = M_DATA; 16985 icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, 16986 ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid); 16987 } 16988 if (ill_need_rele) 16989 ill_refrele(ill); 16990 if (rill_need_rele) 16991 ill_refrele(recv_ill); 16992 return; 16993 } 16994 16995 if (ii->ipsec_in_v4) { 16996 ipha = (ipha_t *)mp->b_rptr; 16997 dst = ipha->ipha_dst; 16998 if (CLASSD(dst)) { 16999 /* 17000 * Multicast has to be delivered to all streams. 
17001 */ 17002 dst = INADDR_BROADCAST; 17003 } 17004 17005 if (ire == NULL) { 17006 ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid, 17007 MBLK_GETLABEL(mp), ipst); 17008 if (ire == NULL) { 17009 if (ill_need_rele) 17010 ill_refrele(ill); 17011 if (rill_need_rele) 17012 ill_refrele(recv_ill); 17013 ip1dbg(("ip_fanout_proto_again: " 17014 "IRE not found")); 17015 freemsg(ipsec_mp); 17016 return; 17017 } 17018 ire_need_rele = B_TRUE; 17019 } 17020 17021 switch (ipha->ipha_protocol) { 17022 case IPPROTO_UDP: 17023 ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, 17024 recv_ill); 17025 if (ire_need_rele) 17026 ire_refrele(ire); 17027 break; 17028 case IPPROTO_TCP: 17029 if (!ire_need_rele) 17030 IRE_REFHOLD(ire); 17031 mp = ip_tcp_input(mp, ipha, ill, B_TRUE, 17032 ire, ipsec_mp, 0, ill->ill_rq, NULL); 17033 IRE_REFRELE(ire); 17034 if (mp != NULL) 17035 squeue_enter_chain(GET_SQUEUE(mp), mp, 17036 mp, 1, SQTAG_IP_PROTO_AGAIN); 17037 break; 17038 case IPPROTO_SCTP: 17039 if (!ire_need_rele) 17040 IRE_REFHOLD(ire); 17041 ip_sctp_input(mp, ipha, ill, B_TRUE, ire, 17042 ipsec_mp, 0, ill->ill_rq, dst); 17043 break; 17044 default: 17045 ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, 17046 recv_ill, 0); 17047 if (ire_need_rele) 17048 ire_refrele(ire); 17049 break; 17050 } 17051 } else { 17052 uint32_t rput_flags = 0; 17053 17054 ip6h = (ip6_t *)mp->b_rptr; 17055 v6dstp = &ip6h->ip6_dst; 17056 /* 17057 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast 17058 * address. 17059 * 17060 * Currently, we don't store that state in the IPSEC_IN 17061 * message, and we may need to. 17062 */ 17063 rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? 17064 IP6_IN_LLMCAST : 0); 17065 ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, 17066 NULL, NULL); 17067 } 17068 if (ill_need_rele) 17069 ill_refrele(ill); 17070 if (rill_need_rele) 17071 ill_refrele(recv_ill); 17072 } 17073 17074 /* 17075 * Call ill_frag_timeout to do garbage collection. 
ill_frag_timeout 17076 * returns 'true' if there are still fragments left on the queue, in 17077 * which case we restart the timer. 17078 */ 17079 void 17080 ill_frag_timer(void *arg) 17081 { 17082 ill_t *ill = (ill_t *)arg; 17083 boolean_t frag_pending; 17084 ip_stack_t *ipst = ill->ill_ipst; 17085 17086 mutex_enter(&ill->ill_lock); 17087 ASSERT(!ill->ill_fragtimer_executing); 17088 if (ill->ill_state_flags & ILL_CONDEMNED) { 17089 ill->ill_frag_timer_id = 0; 17090 mutex_exit(&ill->ill_lock); 17091 return; 17092 } 17093 ill->ill_fragtimer_executing = 1; 17094 mutex_exit(&ill->ill_lock); 17095 17096 frag_pending = ill_frag_timeout(ill, ipst->ips_ip_g_frag_timeout); 17097 17098 /* 17099 * Restart the timer, if we have fragments pending or if someone 17100 * wanted us to be scheduled again. 17101 */ 17102 mutex_enter(&ill->ill_lock); 17103 ill->ill_fragtimer_executing = 0; 17104 ill->ill_frag_timer_id = 0; 17105 if (frag_pending || ill->ill_fragtimer_needrestart) 17106 ill_frag_timer_start(ill); 17107 mutex_exit(&ill->ill_lock); 17108 } 17109 17110 void 17111 ill_frag_timer_start(ill_t *ill) 17112 { 17113 ip_stack_t *ipst = ill->ill_ipst; 17114 17115 ASSERT(MUTEX_HELD(&ill->ill_lock)); 17116 17117 /* If the ill is closing or opening don't proceed */ 17118 if (ill->ill_state_flags & ILL_CONDEMNED) 17119 return; 17120 17121 if (ill->ill_fragtimer_executing) { 17122 /* 17123 * ill_frag_timer is currently executing. Just record the 17124 * the fact that we want the timer to be restarted. 17125 * ill_frag_timer will post a timeout before it returns, 17126 * ensuring it will be called again. 17127 */ 17128 ill->ill_fragtimer_needrestart = 1; 17129 return; 17130 } 17131 17132 if (ill->ill_frag_timer_id == 0) { 17133 /* 17134 * The timer is neither running nor is the timeout handler 17135 * executing. 
Post a timeout so that ill_frag_timer will be 17136 * called 17137 */ 17138 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 17139 MSEC_TO_TICK(ipst->ips_ip_g_frag_timo_ms >> 1)); 17140 ill->ill_fragtimer_needrestart = 0; 17141 } 17142 } 17143 17144 /* 17145 * This routine is needed for loopback when forwarding multicasts. 17146 * 17147 * IPQoS Notes: 17148 * IPPF processing is done in fanout routines. 17149 * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, 17150 * processing for IPsec packets is done when it comes back in clear. 17151 * NOTE : The callers of this function need to do the ire_refrele for the 17152 * ire that is being passed in. 17153 */ 17154 void 17155 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17156 ill_t *recv_ill, uint32_t esp_udp_ports) 17157 { 17158 boolean_t esp_in_udp_packet = (esp_udp_ports != 0); 17159 ill_t *ill = (ill_t *)q->q_ptr; 17160 uint32_t sum; 17161 uint32_t u1; 17162 uint32_t u2; 17163 int hdr_length; 17164 boolean_t mctl_present; 17165 mblk_t *first_mp = mp; 17166 mblk_t *hada_mp = NULL; 17167 ipha_t *inner_ipha; 17168 ip_stack_t *ipst; 17169 17170 ASSERT(recv_ill != NULL); 17171 ipst = recv_ill->ill_ipst; 17172 17173 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 17174 "ip_rput_locl_start: q %p", q); 17175 17176 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17177 ASSERT(ill != NULL); 17178 17179 17180 #define rptr ((uchar_t *)ipha) 17181 #define iphs ((uint16_t *)ipha) 17182 17183 /* 17184 * no UDP or TCP packet should come here anymore. 17185 */ 17186 ASSERT(ipha->ipha_protocol != IPPROTO_TCP && 17187 ipha->ipha_protocol != IPPROTO_UDP); 17188 17189 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 17190 if (mctl_present && 17191 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 17192 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 17193 17194 /* 17195 * It's an IPsec accelerated packet. 
17196 * Keep a pointer to the data attributes around until 17197 * we allocate the ipsec_info_t. 17198 */ 17199 IPSECHW_DEBUG(IPSECHW_PKT, 17200 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 17201 hada_mp = first_mp; 17202 hada_mp->b_cont = NULL; 17203 /* 17204 * Since it is accelerated, it comes directly from 17205 * the ill and the data attributes is followed by 17206 * the packet data. 17207 */ 17208 ASSERT(mp->b_datap->db_type != M_CTL); 17209 first_mp = mp; 17210 mctl_present = B_FALSE; 17211 } 17212 17213 /* 17214 * IF M_CTL is not present, then ipsec_in_is_secure 17215 * should return B_TRUE. There is a case where loopback 17216 * packets has an M_CTL in the front with all the 17217 * IPsec options set to IPSEC_PREF_NEVER - which means 17218 * ipsec_in_is_secure will return B_FALSE. As loopback 17219 * packets never comes here, it is safe to ASSERT the 17220 * following. 17221 */ 17222 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 17223 17224 /* 17225 * Also, we should never have an mctl_present if this is an 17226 * ESP-in-UDP packet. 17227 */ 17228 ASSERT(!mctl_present || !esp_in_udp_packet); 17229 17230 17231 /* u1 is # words of IP options */ 17232 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + 17233 IP_SIMPLE_HDR_LENGTH_IN_WORDS); 17234 17235 /* 17236 * Don't verify header checksum if we just removed UDP header or 17237 * packet is coming back from AH/ESP. 17238 */ 17239 if (!esp_in_udp_packet && !mctl_present) { 17240 if (u1) { 17241 if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { 17242 if (hada_mp != NULL) 17243 freemsg(hada_mp); 17244 return; 17245 } 17246 } else { 17247 /* Check the IP header checksum. 
*/ 17248 #define uph ((uint16_t *)ipha) 17249 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 17250 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 17251 #undef uph 17252 /* finish doing IP checksum */ 17253 sum = (sum & 0xFFFF) + (sum >> 16); 17254 sum = ~(sum + (sum >> 16)) & 0xFFFF; 17255 if (sum && sum != 0xFFFF) { 17256 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 17257 goto drop_pkt; 17258 } 17259 } 17260 } 17261 17262 /* 17263 * Count for SNMP of inbound packets for ire. As ip_proto_input 17264 * might be called more than once for secure packets, count only 17265 * the first time. 17266 */ 17267 if (!mctl_present) { 17268 UPDATE_IB_PKT_COUNT(ire); 17269 ire->ire_last_used_time = lbolt; 17270 } 17271 17272 /* Check for fragmentation offset. */ 17273 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 17274 u1 = u2 & (IPH_MF | IPH_OFFSET); 17275 if (u1) { 17276 /* 17277 * We re-assemble fragments before we do the AH/ESP 17278 * processing. Thus, M_CTL should not be present 17279 * while we are re-assembling. 17280 */ 17281 ASSERT(!mctl_present); 17282 ASSERT(first_mp == mp); 17283 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 17284 return; 17285 } 17286 /* 17287 * Make sure that first_mp points back to mp as 17288 * the mp we came in with could have changed in 17289 * ip_rput_fragment(). 17290 */ 17291 ipha = (ipha_t *)mp->b_rptr; 17292 first_mp = mp; 17293 } 17294 17295 /* 17296 * Clear hardware checksumming flag as it is currently only 17297 * used by TCP and UDP. 17298 */ 17299 DB_CKSUMFLAGS(mp) = 0; 17300 17301 /* Now we have a complete datagram, destined for this machine. 
*/ 17302 u1 = IPH_HDR_LENGTH(ipha); 17303 switch (ipha->ipha_protocol) { 17304 case IPPROTO_ICMP: { 17305 ire_t *ire_zone; 17306 ilm_t *ilm; 17307 mblk_t *mp1; 17308 zoneid_t last_zoneid; 17309 17310 if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { 17311 ASSERT(ire->ire_type == IRE_BROADCAST); 17312 /* 17313 * Inactive/Failed interfaces are not supposed to 17314 * respond to the multicast packets. 17315 */ 17316 if (ill_is_probeonly(ill)) { 17317 freemsg(first_mp); 17318 return; 17319 } 17320 17321 /* 17322 * In the multicast case, applications may have joined 17323 * the group from different zones, so we need to deliver 17324 * the packet to each of them. Loop through the 17325 * multicast memberships structures (ilm) on the receive 17326 * ill and send a copy of the packet up each matching 17327 * one. However, we don't do this for multicasts sent on 17328 * the loopback interface (PHYI_LOOPBACK flag set) as 17329 * they must stay in the sender's zone. 17330 * 17331 * ilm_add_v6() ensures that ilms in the same zone are 17332 * contiguous in the ill_ilm list. We use this property 17333 * to avoid sending duplicates needed when two 17334 * applications in the same zone join the same group on 17335 * different logical interfaces: we ignore the ilm if 17336 * its zoneid is the same as the last matching one. 17337 * In addition, the sending of the packet for 17338 * ire_zoneid is delayed until all of the other ilms 17339 * have been exhausted. 
17340 */ 17341 last_zoneid = -1; 17342 ILM_WALKER_HOLD(recv_ill); 17343 for (ilm = recv_ill->ill_ilm; ilm != NULL; 17344 ilm = ilm->ilm_next) { 17345 if ((ilm->ilm_flags & ILM_DELETED) || 17346 ipha->ipha_dst != ilm->ilm_addr || 17347 ilm->ilm_zoneid == last_zoneid || 17348 ilm->ilm_zoneid == ire->ire_zoneid || 17349 ilm->ilm_zoneid == ALL_ZONES || 17350 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 17351 continue; 17352 mp1 = ip_copymsg(first_mp); 17353 if (mp1 == NULL) 17354 continue; 17355 icmp_inbound(q, mp1, B_TRUE, ill, 17356 0, sum, mctl_present, B_TRUE, 17357 recv_ill, ilm->ilm_zoneid); 17358 last_zoneid = ilm->ilm_zoneid; 17359 } 17360 ILM_WALKER_RELE(recv_ill); 17361 } else if (ire->ire_type == IRE_BROADCAST) { 17362 /* 17363 * In the broadcast case, there may be many zones 17364 * which need a copy of the packet delivered to them. 17365 * There is one IRE_BROADCAST per broadcast address 17366 * and per zone; we walk those using a helper function. 17367 * In addition, the sending of the packet for ire is 17368 * delayed until all of the other ires have been 17369 * processed. 17370 */ 17371 IRB_REFHOLD(ire->ire_bucket); 17372 ire_zone = NULL; 17373 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 17374 ire)) != NULL) { 17375 mp1 = ip_copymsg(first_mp); 17376 if (mp1 == NULL) 17377 continue; 17378 17379 UPDATE_IB_PKT_COUNT(ire_zone); 17380 ire_zone->ire_last_used_time = lbolt; 17381 icmp_inbound(q, mp1, B_TRUE, ill, 17382 0, sum, mctl_present, B_TRUE, 17383 recv_ill, ire_zone->ire_zoneid); 17384 } 17385 IRB_REFRELE(ire->ire_bucket); 17386 } 17387 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 17388 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 17389 ire->ire_zoneid); 17390 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17391 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 17392 return; 17393 } 17394 case IPPROTO_IGMP: 17395 /* 17396 * If we are not willing to accept IGMP packets in clear, 17397 * then check with global policy. 
17398 */ 17399 if (ipst->ips_igmp_accept_clear_messages == 0) { 17400 first_mp = ipsec_check_global_policy(first_mp, NULL, 17401 ipha, NULL, mctl_present, ipst->ips_netstack); 17402 if (first_mp == NULL) 17403 return; 17404 } 17405 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17406 freemsg(first_mp); 17407 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 17408 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17409 return; 17410 } 17411 if ((mp = igmp_input(q, mp, ill)) == NULL) { 17412 /* Bad packet - discarded by igmp_input */ 17413 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17414 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 17415 if (mctl_present) 17416 freeb(first_mp); 17417 return; 17418 } 17419 /* 17420 * igmp_input() may have returned the pulled up message. 17421 * So first_mp and ipha need to be reinitialized. 17422 */ 17423 ipha = (ipha_t *)mp->b_rptr; 17424 if (mctl_present) 17425 first_mp->b_cont = mp; 17426 else 17427 first_mp = mp; 17428 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17429 connf_head != NULL) { 17430 /* No user-level listener for IGMP packets */ 17431 goto drop_pkt; 17432 } 17433 /* deliver to local raw users */ 17434 break; 17435 case IPPROTO_PIM: 17436 /* 17437 * If we are not willing to accept PIM packets in clear, 17438 * then check with global policy. 
17439 */ 17440 if (ipst->ips_pim_accept_clear_messages == 0) { 17441 first_mp = ipsec_check_global_policy(first_mp, NULL, 17442 ipha, NULL, mctl_present, ipst->ips_netstack); 17443 if (first_mp == NULL) 17444 return; 17445 } 17446 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 17447 freemsg(first_mp); 17448 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 17449 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17450 return; 17451 } 17452 if (pim_input(q, mp, ill) != 0) { 17453 /* Bad packet - discarded by pim_input */ 17454 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17455 "ip_rput_locl_end: q %p (%S)", q, "pim"); 17456 if (mctl_present) 17457 freeb(first_mp); 17458 return; 17459 } 17460 17461 /* 17462 * pim_input() may have pulled up the message so ipha needs to 17463 * be reinitialized. 17464 */ 17465 ipha = (ipha_t *)mp->b_rptr; 17466 if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 17467 connf_head != NULL) { 17468 /* No user-level listener for PIM packets */ 17469 goto drop_pkt; 17470 } 17471 /* deliver to local raw users */ 17472 break; 17473 case IPPROTO_ENCAP: 17474 /* 17475 * Handle self-encapsulated packets (IP-in-IP where 17476 * the inner addresses == the outer addresses). 17477 */ 17478 hdr_length = IPH_HDR_LENGTH(ipha); 17479 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 17480 mp->b_wptr) { 17481 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 17482 sizeof (ipha_t) - mp->b_rptr)) { 17483 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17484 freemsg(first_mp); 17485 return; 17486 } 17487 ipha = (ipha_t *)mp->b_rptr; 17488 } 17489 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 17490 /* 17491 * Check the sanity of the inner IP header. 
17492 */ 17493 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 17494 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17495 freemsg(first_mp); 17496 return; 17497 } 17498 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 17499 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17500 freemsg(first_mp); 17501 return; 17502 } 17503 if (inner_ipha->ipha_src == ipha->ipha_src && 17504 inner_ipha->ipha_dst == ipha->ipha_dst) { 17505 ipsec_in_t *ii; 17506 17507 /* 17508 * Self-encapsulated tunnel packet. Remove 17509 * the outer IP header and fanout again. 17510 * We also need to make sure that the inner 17511 * header is pulled up until options. 17512 */ 17513 mp->b_rptr = (uchar_t *)inner_ipha; 17514 ipha = inner_ipha; 17515 hdr_length = IPH_HDR_LENGTH(ipha); 17516 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 17517 if (!pullupmsg(mp, (uchar_t *)ipha + 17518 + hdr_length - mp->b_rptr)) { 17519 freemsg(first_mp); 17520 return; 17521 } 17522 ipha = (ipha_t *)mp->b_rptr; 17523 } 17524 if (hdr_length > sizeof (ipha_t)) { 17525 /* We got options on the inner packet. */ 17526 ipaddr_t dst = ipha->ipha_dst; 17527 17528 if (ip_rput_options(q, mp, ipha, &dst, ipst) == 17529 -1) { 17530 /* Bad options! */ 17531 return; 17532 } 17533 if (dst != ipha->ipha_dst) { 17534 /* 17535 * Someone put a source-route in 17536 * the inside header of a self- 17537 * encapsulated packet. Drop it 17538 * with extreme prejudice and let 17539 * the sender know. 17540 */ 17541 icmp_unreachable(q, first_mp, 17542 ICMP_SOURCE_ROUTE_FAILED, 17543 recv_ill->ill_zoneid, ipst); 17544 return; 17545 } 17546 } 17547 if (!mctl_present) { 17548 ASSERT(first_mp == mp); 17549 /* 17550 * This means that somebody is sending 17551 * Self-encapsualted packets without AH/ESP. 17552 * If AH/ESP was present, we would have already 17553 * allocated the first_mp. 17554 * 17555 * Send this packet to find a tunnel endpoint. 17556 * if I can't find one, an ICMP 17557 * PROTOCOL_UNREACHABLE will get sent. 
17558 */ 17559 goto fanout; 17560 } 17561 /* 17562 * We generally store the ill_index if we need to 17563 * do IPsec processing as we lose the ill queue when 17564 * we come back. But in this case, we never should 17565 * have to store the ill_index here as it should have 17566 * been stored previously when we processed the 17567 * AH/ESP header in this routine or for non-ipsec 17568 * cases, we still have the queue. But for some bad 17569 * packets from the wire, we can get to IPsec after 17570 * this and we better store the index for that case. 17571 */ 17572 ill = (ill_t *)q->q_ptr; 17573 ii = (ipsec_in_t *)first_mp->b_rptr; 17574 ii->ipsec_in_ill_index = 17575 ill->ill_phyint->phyint_ifindex; 17576 ii->ipsec_in_rill_index = 17577 recv_ill->ill_phyint->phyint_ifindex; 17578 if (ii->ipsec_in_decaps) { 17579 /* 17580 * This packet is self-encapsulated multiple 17581 * times. We don't want to recurse infinitely. 17582 * To keep it simple, drop the packet. 17583 */ 17584 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17585 freemsg(first_mp); 17586 return; 17587 } 17588 ii->ipsec_in_decaps = B_TRUE; 17589 ip_fanout_proto_again(first_mp, recv_ill, recv_ill, 17590 ire); 17591 return; 17592 } 17593 break; 17594 case IPPROTO_AH: 17595 case IPPROTO_ESP: { 17596 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 17597 17598 /* 17599 * Fast path for AH/ESP. If this is the first time 17600 * we are sending a datagram to AH/ESP, allocate 17601 * a IPSEC_IN message and prepend it. Otherwise, 17602 * just fanout. 
17603 */ 17604 17605 int ipsec_rc; 17606 ipsec_in_t *ii; 17607 netstack_t *ns = ipst->ips_netstack; 17608 17609 IP_STAT(ipst, ipsec_proto_ahesp); 17610 if (!mctl_present) { 17611 ASSERT(first_mp == mp); 17612 first_mp = ipsec_in_alloc(B_TRUE, ns); 17613 if (first_mp == NULL) { 17614 ip1dbg(("ip_proto_input: IPSEC_IN " 17615 "allocation failure.\n")); 17616 freemsg(hada_mp); /* okay ifnull */ 17617 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17618 freemsg(mp); 17619 return; 17620 } 17621 /* 17622 * Store the ill_index so that when we come back 17623 * from IPsec we ride on the same queue. 17624 */ 17625 ill = (ill_t *)q->q_ptr; 17626 ii = (ipsec_in_t *)first_mp->b_rptr; 17627 ii->ipsec_in_ill_index = 17628 ill->ill_phyint->phyint_ifindex; 17629 ii->ipsec_in_rill_index = 17630 recv_ill->ill_phyint->phyint_ifindex; 17631 first_mp->b_cont = mp; 17632 /* 17633 * Cache hardware acceleration info. 17634 */ 17635 if (hada_mp != NULL) { 17636 IPSECHW_DEBUG(IPSECHW_PKT, 17637 ("ip_rput_local: caching data attr.\n")); 17638 ii->ipsec_in_accelerated = B_TRUE; 17639 ii->ipsec_in_da = hada_mp; 17640 hada_mp = NULL; 17641 } 17642 } else { 17643 ii = (ipsec_in_t *)first_mp->b_rptr; 17644 } 17645 17646 ii->ipsec_in_esp_udp_ports = esp_udp_ports; 17647 17648 if (!ipsec_loaded(ipss)) { 17649 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 17650 ire->ire_zoneid, ipst); 17651 return; 17652 } 17653 17654 ns = ipst->ips_netstack; 17655 /* select inbound SA and have IPsec process the pkt */ 17656 if (ipha->ipha_protocol == IPPROTO_ESP) { 17657 esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); 17658 boolean_t esp_in_udp_sa; 17659 if (esph == NULL) 17660 return; 17661 ASSERT(ii->ipsec_in_esp_sa != NULL); 17662 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 17663 esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & 17664 IPSA_F_NATT) != 0); 17665 /* 17666 * The following is a fancy, but quick, way of saying: 17667 * ESP-in-UDP SA and Raw ESP packet --> drop 17668 * OR 17669 * ESP 
SA and ESP-in-UDP packet --> drop 17670 */ 17671 if (esp_in_udp_sa != esp_in_udp_packet) { 17672 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17673 ip_drop_packet(first_mp, B_TRUE, ill, NULL, 17674 DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), 17675 &ns->netstack_ipsec->ipsec_dropper); 17676 return; 17677 } 17678 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 17679 first_mp, esph); 17680 } else { 17681 ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); 17682 if (ah == NULL) 17683 return; 17684 ASSERT(ii->ipsec_in_ah_sa != NULL); 17685 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 17686 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 17687 first_mp, ah); 17688 } 17689 17690 switch (ipsec_rc) { 17691 case IPSEC_STATUS_SUCCESS: 17692 break; 17693 case IPSEC_STATUS_FAILED: 17694 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 17695 /* FALLTHRU */ 17696 case IPSEC_STATUS_PENDING: 17697 return; 17698 } 17699 /* we're done with IPsec processing, send it up */ 17700 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 17701 return; 17702 } 17703 default: 17704 break; 17705 } 17706 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 17707 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 17708 ire->ire_zoneid)); 17709 goto drop_pkt; 17710 } 17711 /* 17712 * Handle protocols with which IP is less intimate. There 17713 * can be more than one stream bound to a particular 17714 * protocol. When this is the case, each one gets a copy 17715 * of any incoming packets. 
17716 */ 17717 fanout: 17718 ip_fanout_proto(q, first_mp, ill, ipha, 17719 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, 17720 B_TRUE, recv_ill, ire->ire_zoneid); 17721 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17722 "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); 17723 return; 17724 17725 drop_pkt: 17726 freemsg(first_mp); 17727 if (hada_mp != NULL) 17728 freeb(hada_mp); 17729 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 17730 "ip_rput_locl_end: q %p (%S)", q, "droppkt"); 17731 #undef rptr 17732 #undef iphs 17733 17734 } 17735 17736 /* 17737 * Update any source route, record route or timestamp options. 17738 * Check that we are at end of strict source route. 17739 * The options have already been checked for sanity in ip_rput_options(). 17740 */ 17741 static boolean_t 17742 ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 17743 ip_stack_t *ipst) 17744 { 17745 ipoptp_t opts; 17746 uchar_t *opt; 17747 uint8_t optval; 17748 uint8_t optlen; 17749 ipaddr_t dst; 17750 uint32_t ts; 17751 ire_t *dst_ire; 17752 timestruc_t now; 17753 zoneid_t zoneid; 17754 ill_t *ill; 17755 17756 ASSERT(ire->ire_ipversion == IPV4_VERSION); 17757 17758 ip2dbg(("ip_rput_local_options\n")); 17759 17760 for (optval = ipoptp_first(&opts, ipha); 17761 optval != IPOPT_EOL; 17762 optval = ipoptp_next(&opts)) { 17763 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 17764 opt = opts.ipoptp_cur; 17765 optlen = opts.ipoptp_len; 17766 ip2dbg(("ip_rput_local_options: opt %d, len %d\n", 17767 optval, optlen)); 17768 switch (optval) { 17769 uint32_t off; 17770 case IPOPT_SSRR: 17771 case IPOPT_LSRR: 17772 off = opt[IPOPT_OFFSET]; 17773 off--; 17774 if (optlen < IP_ADDR_LEN || 17775 off > optlen - IP_ADDR_LEN) { 17776 /* End of source route */ 17777 ip1dbg(("ip_rput_local_options: end of SR\n")); 17778 break; 17779 } 17780 /* 17781 * This will only happen if two consecutive entries 17782 * in the source route contains our address or if 17783 * it is a packet with a loose 
source route which 17784 * reaches us before consuming the whole source route 17785 */ 17786 ip1dbg(("ip_rput_local_options: not end of SR\n")); 17787 if (optval == IPOPT_SSRR) { 17788 goto bad_src_route; 17789 } 17790 /* 17791 * Hack: instead of dropping the packet truncate the 17792 * source route to what has been used by filling the 17793 * rest with IPOPT_NOP. 17794 */ 17795 opt[IPOPT_OLEN] = (uint8_t)off; 17796 while (off < optlen) { 17797 opt[off++] = IPOPT_NOP; 17798 } 17799 break; 17800 case IPOPT_RR: 17801 off = opt[IPOPT_OFFSET]; 17802 off--; 17803 if (optlen < IP_ADDR_LEN || 17804 off > optlen - IP_ADDR_LEN) { 17805 /* No more room - ignore */ 17806 ip1dbg(( 17807 "ip_rput_local_options: end of RR\n")); 17808 break; 17809 } 17810 bcopy(&ire->ire_src_addr, (char *)opt + off, 17811 IP_ADDR_LEN); 17812 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17813 break; 17814 case IPOPT_TS: 17815 /* Insert timestamp if there is romm */ 17816 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17817 case IPOPT_TS_TSONLY: 17818 off = IPOPT_TS_TIMELEN; 17819 break; 17820 case IPOPT_TS_PRESPEC: 17821 case IPOPT_TS_PRESPEC_RFC791: 17822 /* Verify that the address matched */ 17823 off = opt[IPOPT_OFFSET] - 1; 17824 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 17825 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 17826 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 17827 ipst); 17828 if (dst_ire == NULL) { 17829 /* Not for us */ 17830 break; 17831 } 17832 ire_refrele(dst_ire); 17833 /* FALLTHRU */ 17834 case IPOPT_TS_TSANDADDR: 17835 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 17836 break; 17837 default: 17838 /* 17839 * ip_*put_options should have already 17840 * dropped this packet. 
17841 */ 17842 cmn_err(CE_PANIC, "ip_rput_local_options: " 17843 "unknown IT - bug in ip_rput_options?\n"); 17844 return (B_TRUE); /* Keep "lint" happy */ 17845 } 17846 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 17847 /* Increase overflow counter */ 17848 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 17849 opt[IPOPT_POS_OV_FLG] = 17850 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 17851 (off << 4)); 17852 break; 17853 } 17854 off = opt[IPOPT_OFFSET] - 1; 17855 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 17856 case IPOPT_TS_PRESPEC: 17857 case IPOPT_TS_PRESPEC_RFC791: 17858 case IPOPT_TS_TSANDADDR: 17859 bcopy(&ire->ire_src_addr, (char *)opt + off, 17860 IP_ADDR_LEN); 17861 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 17862 /* FALLTHRU */ 17863 case IPOPT_TS_TSONLY: 17864 off = opt[IPOPT_OFFSET] - 1; 17865 /* Compute # of milliseconds since midnight */ 17866 gethrestime(&now); 17867 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 17868 now.tv_nsec / (NANOSEC / MILLISEC); 17869 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 17870 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 17871 break; 17872 } 17873 break; 17874 } 17875 } 17876 return (B_TRUE); 17877 17878 bad_src_route: 17879 q = WR(q); 17880 if (q->q_next != NULL) 17881 ill = q->q_ptr; 17882 else 17883 ill = NULL; 17884 17885 /* make sure we clear any indication of a hardware checksum */ 17886 DB_CKSUMFLAGS(mp) = 0; 17887 zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); 17888 if (zoneid == ALL_ZONES) 17889 freemsg(mp); 17890 else 17891 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 17892 return (B_FALSE); 17893 17894 } 17895 17896 /* 17897 * Process IP options in an inbound packet. If an option affects the 17898 * effective destination address, return the next hop address via dstp. 17899 * Returns -1 if something fails in which case an ICMP error has been sent 17900 * and mp freed. 
17901 */
/*
 * ip_rput_options() only inspects the options; the header is not modified
 * here because the checksum has to be verified before anything is changed.
 * For a source route it extracts the next hop that is not one of our own
 * addresses and hands it back through dstp.
 *
 *	q	queue the packet arrived on; WR(q) is used for ICMP errors.
 *	mp	message holding the packet.
 *	ipha	IPv4 header to parse.
 *	dstp	set to the effective destination when 0 is returned.
 *	ipst	IP stack instance.
 *
 * Error exits either send an ICMP parameter problem (the offending header
 * offset is kept in "code") or an ICMP source route failed; when no zone
 * can be found for the destination the packet is freed silently.
 */
static int
ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
    ip_stack_t *ipst)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uchar_t		*hdr = (uchar_t *)ipha;	/* base for error offsets */
	uint8_t		optval;
	uint8_t		optlen;
	uint32_t	off;
	ipaddr_t	dst;
	intptr_t	code = 0;
	ire_t		*ire = NULL;
	zoneid_t	zoneid;
	ill_t		*ill;
	boolean_t	sr_exhausted;
	boolean_t	srr_failed = B_FALSE;

	ip2dbg(("ip_rput_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		ip2dbg(("ip_rput_options: opt %d, len %d\n",
		    optval, optlen));

		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/* Is the current destination one of our addresses? */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
			if (ire == NULL) {
				if (optval != IPOPT_SSRR) {
					ip2dbg(("ip_rput_options: "
					    "not next source route 0x%x\n",
					    ntohl(dst)));
					break;
				}
				ip1dbg(("ip_rput_options: not next"
				    " strict source route 0x%x\n",
				    ntohl(dst)));
				code = (uchar_t *)&ipha->ipha_dst - hdr;
				goto param_prob;	/* RouterReq's */
			}
			ire_refrele(ire);

			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = &opt[IPOPT_OLEN] - hdr;
				goto param_prob;
			}

			/*
			 * Walk the route from the 1-based pointer field,
			 * stepping over consecutive hops that are our own
			 * addresses.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			off = opt[IPOPT_OFFSET] - 1;
			sr_exhausted = B_FALSE;
			for (;;) {
				if (optlen < IP_ADDR_LEN ||
				    off > optlen - IP_ADDR_LEN) {
					/* End of source route */
					ip1dbg((
					    "ip_rput_options: end of SR\n"));
					sr_exhausted = B_TRUE;
					break;
				}
				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
				ip1dbg(("ip_rput_options: next hop 0x%x\n",
				    ntohl(dst)));

				ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
				    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
				    ipst);
				if (ire == NULL)
					break;
				ire_refrele(ire);
				off += IP_ADDR_LEN;
			}
			if (sr_exhausted)
				break;

			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_rput_options: loopback addr in "
				    "source route!\n"));
				goto bad_src_route;
			}
			/*
			 * For strict: verify that dst is directly
			 * reachable.
			 */
			if (optval == IPOPT_SSRR) {
				ire = ire_ftable_lookup(dst, 0, 0,
				    IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
				    MBLK_GETLABEL(mp),
				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
				if (ire == NULL) {
					ip1dbg(("ip_rput_options: SSRR not "
					    "directly reachable: 0x%x\n",
					    ntohl(dst)));
					goto bad_src_route;
				}
				ire_refrele(ire);
			}
			/*
			 * Defer update of the offset and the record route
			 * until the packet is forwarded.
			 */
			break;

		case IPOPT_RR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = &opt[IPOPT_OLEN] - hdr;
				goto param_prob;
			}
			break;

		case IPOPT_TS:
			/*
			 * Verify that length >= 5 and that there is either
			 * room for another timestamp or that the overflow
			 * counter is not maxed out.
			 */
			code = &opt[IPOPT_OLEN] - hdr;
			if (optlen < IPOPT_MINLEN_IT)
				goto param_prob;
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg((
				    "ip_rput_options: bad option offset\n"));
				code = &opt[IPOPT_OFFSET] - hdr;
				goto param_prob;
			}
			/* off = size of one entry for this flavour. */
			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
			case IPOPT_TS_TSONLY:
				off = IPOPT_TS_TIMELEN;
				break;
			case IPOPT_TS_TSANDADDR:
			case IPOPT_TS_PRESPEC:
			case IPOPT_TS_PRESPEC_RFC791:
				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
				break;
			default:
				code = &opt[IPOPT_POS_OV_FLG] - hdr;
				goto param_prob;
			}
			if ((opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0 &&
			    opt[IPOPT_OFFSET] - 1 + off > optlen) {
				/*
				 * No room and the overflow counter is 15
				 * already.
				 */
				goto param_prob;
			}
			break;
		}
	}

	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
		*dstp = dst;
		return (0);
	}

	ip1dbg(("ip_rput_options: error processing IP options."));
	code = &opt[IPOPT_OFFSET] - hdr;
	goto param_prob;

bad_src_route:
	srr_failed = B_TRUE;
	/* FALLTHRU */
param_prob:
	/* Both error flavours share everything but the ICMP type. */
	q = WR(q);
	ill = (q->q_next != NULL) ? q->q_ptr : NULL;

	/* make sure we clear any indication of a hardware checksum */
	DB_CKSUMFLAGS(mp) = 0;
	/* Don't know whether this is for non-global or global/forwarding */
	zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst);
	if (zoneid == ALL_ZONES) {
		freemsg(mp);
	} else if (srr_failed) {
		icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst);
	} else {
		icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst);
	}
	return (-1);
}

18107 18108 /* 18109 * IP & ICMP info in >=14 msg's ... 18110 * - ip fixed part (mib2_ip_t) 18111 * - icmp fixed part (mib2_icmp_t) 18112 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 18113 * - ipRouteEntryTable (ip 21) all IPv4 IREs 18114 * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] 18115 * - ipRouteAttributeTable (ip 102) labeled routes 18116 * - ip multicast membership (ip_member_t) 18117 * - ip multicast source filtering (ip_grpsrc_t) 18118 * - igmp fixed part (struct igmpstat) 18119 * - multicast routing stats (struct mrtstat) 18120 * - multicast routing vifs (array of struct vifctl) 18121 * - multicast routing routes (array of struct mfcctl) 18122 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 18123 * One per ill plus one generic 18124 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 18125 * One per ill plus one generic 18126 * - ipv6RouteEntry all IPv6 IREs 18127 * - ipv6RouteAttributeTable (ip6 102) labeled routes 18128 * - ipv6NetToMediaEntry all Neighbor Cache entries 18129 * - ipv6AddrEntry all IPv6 ipifs 18130 * - ipv6 multicast membership (ipv6_member_t) 18131 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 18132 * 18133 * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. 18134 * 18135 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 18136 * already filled in by the caller. 18137 * Return value of 0 indicates that no messages were sent and caller 18138 * should free mpctl.
18139 */ 18140 int 18141 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) 18142 { 18143 ip_stack_t *ipst; 18144 sctp_stack_t *sctps; 18145 18146 if (q->q_next != NULL) { 18147 ipst = ILLQ_TO_IPST(q); 18148 } else { 18149 ipst = CONNQ_TO_IPST(q); 18150 } 18151 ASSERT(ipst != NULL); 18152 sctps = ipst->ips_netstack->netstack_sctp; 18153 18154 if (mpctl == NULL || mpctl->b_cont == NULL) { 18155 return (0); 18156 } 18157 18158 /* 18159 * For the purposes of the (broken) packet shell use 18160 * of the level we make sure MIB2_TCP/MIB2_UDP can be used 18161 * to make TCP and UDP appear first in the list of mib items. 18162 * TBD: We could expand this and use it in netstat so that 18163 * the kernel doesn't have to produce large tables (connections, 18164 * routes, etc) when netstat only wants the statistics or a particular 18165 * table. 18166 */ 18167 if (!(level == MIB2_TCP || level == MIB2_UDP)) { 18168 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) { 18169 return (1); 18170 } 18171 } 18172 18173 if (level != MIB2_TCP) { 18174 if ((mpctl = udp_snmp_get(q, mpctl)) == NULL) { 18175 return (1); 18176 } 18177 } 18178 18179 if (level != MIB2_UDP) { 18180 if ((mpctl = tcp_snmp_get(q, mpctl)) == NULL) { 18181 return (1); 18182 } 18183 } 18184 18185 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl, 18186 ipst)) == NULL) { 18187 return (1); 18188 } 18189 18190 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst)) == NULL) { 18191 return (1); 18192 } 18193 18194 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) { 18195 return (1); 18196 } 18197 18198 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) { 18199 return (1); 18200 } 18201 18202 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) { 18203 return (1); 18204 } 18205 18206 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) { 18207 return (1); 18208 } 18209 18210 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst)) == NULL) { 18211 return (1); 18212 } 18213 18214 if ((mpctl 
= ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst)) == NULL) { 18215 return (1); 18216 } 18217 18218 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) { 18219 return (1); 18220 } 18221 18222 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) { 18223 return (1); 18224 } 18225 18226 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) { 18227 return (1); 18228 } 18229 18230 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) { 18231 return (1); 18232 } 18233 18234 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) { 18235 return (1); 18236 } 18237 18238 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) { 18239 return (1); 18240 } 18241 18242 if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, ipst)) == NULL) { 18243 return (1); 18244 } 18245 18246 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, ipst); 18247 if (mpctl == NULL) { 18248 return (1); 18249 } 18250 18251 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { 18252 return (1); 18253 } 18254 freemsg(mpctl); 18255 return (1); 18256 } 18257 18258 18259 /* Get global (legacy) IPv4 statistics */ 18260 static mblk_t * 18261 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, 18262 ip_stack_t *ipst) 18263 { 18264 mib2_ip_t old_ip_mib; 18265 struct opthdr *optp; 18266 mblk_t *mp2ctl; 18267 18268 /* 18269 * make a copy of the original message 18270 */ 18271 mp2ctl = copymsg(mpctl); 18272 18273 /* fixed length IP structure... */ 18274 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18275 optp->level = MIB2_IP; 18276 optp->name = 0; 18277 SET_MIB(old_ip_mib.ipForwarding, 18278 (WE_ARE_FORWARDING(ipst) ? 
1 : 2)); 18279 SET_MIB(old_ip_mib.ipDefaultTTL, 18280 (uint32_t)ipst->ips_ip_def_ttl); 18281 SET_MIB(old_ip_mib.ipReasmTimeout, 18282 ipst->ips_ip_g_frag_timeout); 18283 SET_MIB(old_ip_mib.ipAddrEntrySize, 18284 sizeof (mib2_ipAddrEntry_t)); 18285 SET_MIB(old_ip_mib.ipRouteEntrySize, 18286 sizeof (mib2_ipRouteEntry_t)); 18287 SET_MIB(old_ip_mib.ipNetToMediaEntrySize, 18288 sizeof (mib2_ipNetToMediaEntry_t)); 18289 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t)); 18290 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t)); 18291 SET_MIB(old_ip_mib.ipRouteAttributeSize, 18292 sizeof (mib2_ipAttributeEntry_t)); 18293 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); 18294 18295 /* 18296 * Grab the statistics from the new IP MIB 18297 */ 18298 SET_MIB(old_ip_mib.ipInReceives, 18299 (uint32_t)ipmib->ipIfStatsHCInReceives); 18300 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors); 18301 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors); 18302 SET_MIB(old_ip_mib.ipForwDatagrams, 18303 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams); 18304 SET_MIB(old_ip_mib.ipInUnknownProtos, 18305 ipmib->ipIfStatsInUnknownProtos); 18306 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards); 18307 SET_MIB(old_ip_mib.ipInDelivers, 18308 (uint32_t)ipmib->ipIfStatsHCInDelivers); 18309 SET_MIB(old_ip_mib.ipOutRequests, 18310 (uint32_t)ipmib->ipIfStatsHCOutRequests); 18311 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards); 18312 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes); 18313 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds); 18314 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs); 18315 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails); 18316 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs); 18317 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails); 18318 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates); 18319 18320 
/* ipRoutingDiscards is not being used */ 18321 SET_MIB(old_ip_mib.ipRoutingDiscards, 0); 18322 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs); 18323 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts); 18324 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs); 18325 SET_MIB(old_ip_mib.ipReasmDuplicates, 18326 ipmib->ipIfStatsReasmDuplicates); 18327 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups); 18328 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits); 18329 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs); 18330 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows); 18331 SET_MIB(old_ip_mib.rawipInOverflows, 18332 ipmib->rawipIfStatsInOverflows); 18333 18334 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded); 18335 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed); 18336 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion); 18337 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion); 18338 SET_MIB(old_ip_mib.ipOutSwitchIPv6, 18339 ipmib->ipIfStatsOutSwitchIPVersion); 18340 18341 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib, 18342 (int)sizeof (old_ip_mib))) { 18343 ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n", 18344 (uint_t)sizeof (old_ip_mib))); 18345 } 18346 18347 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18348 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n", 18349 (int)optp->level, (int)optp->name, (int)optp->len)); 18350 qreply(q, mpctl); 18351 return (mp2ctl); 18352 } 18353 18354 /* Per interface IPv4 statistics */ 18355 static mblk_t * 18356 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18357 { 18358 struct opthdr *optp; 18359 mblk_t *mp2ctl; 18360 ill_t *ill; 18361 ill_walk_context_t ctx; 18362 mblk_t *mp_tail = NULL; 18363 mib2_ipIfStatsEntry_t global_ip_mib; 18364 18365 /* 18366 * Make a copy of the original message 18367 */ 18368 mp2ctl = 
copymsg(mpctl); 18369 18370 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18371 optp->level = MIB2_IP; 18372 optp->name = MIB2_IP_TRAFFIC_STATS; 18373 /* Include "unknown interface" ip_mib */ 18374 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 18375 ipst->ips_ip_mib.ipIfStatsIfIndex = 18376 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */ 18377 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding, 18378 (ipst->ips_ip_g_forward ? 1 : 2)); 18379 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL, 18380 (uint32_t)ipst->ips_ip_def_ttl); 18381 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize, 18382 sizeof (mib2_ipIfStatsEntry_t)); 18383 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize, 18384 sizeof (mib2_ipAddrEntry_t)); 18385 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize, 18386 sizeof (mib2_ipRouteEntry_t)); 18387 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize, 18388 sizeof (mib2_ipNetToMediaEntry_t)); 18389 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize, 18390 sizeof (ip_member_t)); 18391 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize, 18392 sizeof (ip_grpsrc_t)); 18393 18394 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18395 (char *)&ipst->ips_ip_mib, (int)sizeof (ipst->ips_ip_mib))) { 18396 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18397 "failed to allocate %u bytes\n", 18398 (uint_t)sizeof (ipst->ips_ip_mib))); 18399 } 18400 18401 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib)); 18402 18403 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18404 ill = ILL_START_WALK_V4(&ctx, ipst); 18405 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18406 ill->ill_ip_mib->ipIfStatsIfIndex = 18407 ill->ill_phyint->phyint_ifindex; 18408 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding, 18409 (ipst->ips_ip_g_forward ? 
1 : 2)); 18410 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL, 18411 (uint32_t)ipst->ips_ip_def_ttl); 18412 18413 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib); 18414 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18415 (char *)ill->ill_ip_mib, 18416 (int)sizeof (*ill->ill_ip_mib))) { 18417 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18418 "failed to allocate %u bytes\n", 18419 (uint_t)sizeof (*ill->ill_ip_mib))); 18420 } 18421 } 18422 rw_exit(&ipst->ips_ill_g_lock); 18423 18424 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18425 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: " 18426 "level %d, name %d, len %d\n", 18427 (int)optp->level, (int)optp->name, (int)optp->len)); 18428 qreply(q, mpctl); 18429 18430 if (mp2ctl == NULL) 18431 return (NULL); 18432 18433 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst)); 18434 } 18435 18436 /* Global IPv4 ICMP statistics */ 18437 static mblk_t * 18438 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18439 { 18440 struct opthdr *optp; 18441 mblk_t *mp2ctl; 18442 18443 /* 18444 * Make a copy of the original message 18445 */ 18446 mp2ctl = copymsg(mpctl); 18447 18448 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18449 optp->level = MIB2_ICMP; 18450 optp->name = 0; 18451 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib, 18452 (int)sizeof (ipst->ips_icmp_mib))) { 18453 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n", 18454 (uint_t)sizeof (ipst->ips_icmp_mib))); 18455 } 18456 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18457 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n", 18458 (int)optp->level, (int)optp->name, (int)optp->len)); 18459 qreply(q, mpctl); 18460 return (mp2ctl); 18461 } 18462 18463 /* Global IPv4 IGMP statistics */ 18464 static mblk_t * 18465 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18466 { 18467 struct opthdr *optp; 18468 mblk_t *mp2ctl; 18469 18470 /* 18471 * make a copy of the 
original message 18472 */ 18473 mp2ctl = copymsg(mpctl); 18474 18475 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18476 optp->level = EXPER_IGMP; 18477 optp->name = 0; 18478 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat, 18479 (int)sizeof (ipst->ips_igmpstat))) { 18480 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n", 18481 (uint_t)sizeof (ipst->ips_igmpstat))); 18482 } 18483 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18484 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n", 18485 (int)optp->level, (int)optp->name, (int)optp->len)); 18486 qreply(q, mpctl); 18487 return (mp2ctl); 18488 } 18489 18490 /* Global IPv4 Multicast Routing statistics */ 18491 static mblk_t * 18492 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18493 { 18494 struct opthdr *optp; 18495 mblk_t *mp2ctl; 18496 18497 /* 18498 * make a copy of the original message 18499 */ 18500 mp2ctl = copymsg(mpctl); 18501 18502 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18503 optp->level = EXPER_DVMRP; 18504 optp->name = 0; 18505 if (!ip_mroute_stats(mpctl->b_cont, ipst)) { 18506 ip0dbg(("ip_mroute_stats: failed\n")); 18507 } 18508 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18509 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n", 18510 (int)optp->level, (int)optp->name, (int)optp->len)); 18511 qreply(q, mpctl); 18512 return (mp2ctl); 18513 } 18514 18515 /* IPv4 address information */ 18516 static mblk_t * 18517 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18518 { 18519 struct opthdr *optp; 18520 mblk_t *mp2ctl; 18521 mblk_t *mp_tail = NULL; 18522 ill_t *ill; 18523 ipif_t *ipif; 18524 uint_t bitval; 18525 mib2_ipAddrEntry_t mae; 18526 zoneid_t zoneid; 18527 ill_walk_context_t ctx; 18528 18529 /* 18530 * make a copy of the original message 18531 */ 18532 mp2ctl = copymsg(mpctl); 18533 18534 /* ipAddrEntryTable */ 18535 18536 optp = (struct opthdr 
*)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18537 optp->level = MIB2_IP; 18538 optp->name = MIB2_IP_ADDR; 18539 zoneid = Q_TO_CONN(q)->conn_zoneid; 18540 18541 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18542 ill = ILL_START_WALK_V4(&ctx, ipst); 18543 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18544 for (ipif = ill->ill_ipif; ipif != NULL; 18545 ipif = ipif->ipif_next) { 18546 if (ipif->ipif_zoneid != zoneid && 18547 ipif->ipif_zoneid != ALL_ZONES) 18548 continue; 18549 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18550 mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18551 mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18552 18553 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, 18554 OCTET_LENGTH); 18555 mae.ipAdEntIfIndex.o_length = 18556 mi_strlen(mae.ipAdEntIfIndex.o_bytes); 18557 mae.ipAdEntAddr = ipif->ipif_lcl_addr; 18558 mae.ipAdEntNetMask = ipif->ipif_net_mask; 18559 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; 18560 mae.ipAdEntInfo.ae_subnet_len = 18561 ip_mask_to_plen(ipif->ipif_net_mask); 18562 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; 18563 for (bitval = 1; 18564 bitval && 18565 !(bitval & ipif->ipif_brd_addr); 18566 bitval <<= 1) 18567 noop; 18568 mae.ipAdEntBcastAddr = bitval; 18569 mae.ipAdEntReasmMaxSize = IP_MAXPACKET; 18570 mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; 18571 mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; 18572 mae.ipAdEntInfo.ae_broadcast_addr = 18573 ipif->ipif_brd_addr; 18574 mae.ipAdEntInfo.ae_pp_dst_addr = 18575 ipif->ipif_pp_dst_addr; 18576 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | 18577 ill->ill_flags | ill->ill_phyint->phyint_flags; 18578 mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; 18579 18580 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18581 (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { 18582 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to " 18583 "allocate %u bytes\n", 18584 (uint_t)sizeof (mib2_ipAddrEntry_t))); 18585 } 18586 } 18587 } 18588 
rw_exit(&ipst->ips_ill_g_lock); 18589 18590 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18591 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n", 18592 (int)optp->level, (int)optp->name, (int)optp->len)); 18593 qreply(q, mpctl); 18594 return (mp2ctl); 18595 } 18596 18597 /* IPv6 address information */ 18598 static mblk_t * 18599 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18600 { 18601 struct opthdr *optp; 18602 mblk_t *mp2ctl; 18603 mblk_t *mp_tail = NULL; 18604 ill_t *ill; 18605 ipif_t *ipif; 18606 mib2_ipv6AddrEntry_t mae6; 18607 zoneid_t zoneid; 18608 ill_walk_context_t ctx; 18609 18610 /* 18611 * make a copy of the original message 18612 */ 18613 mp2ctl = copymsg(mpctl); 18614 18615 /* ipv6AddrEntryTable */ 18616 18617 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18618 optp->level = MIB2_IP6; 18619 optp->name = MIB2_IP6_ADDR; 18620 zoneid = Q_TO_CONN(q)->conn_zoneid; 18621 18622 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18623 ill = ILL_START_WALK_V6(&ctx, ipst); 18624 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18625 for (ipif = ill->ill_ipif; ipif != NULL; 18626 ipif = ipif->ipif_next) { 18627 if (ipif->ipif_zoneid != zoneid && 18628 ipif->ipif_zoneid != ALL_ZONES) 18629 continue; 18630 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 18631 mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 18632 mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; 18633 18634 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, 18635 OCTET_LENGTH); 18636 mae6.ipv6AddrIfIndex.o_length = 18637 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes); 18638 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr; 18639 mae6.ipv6AddrPfxLength = 18640 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 18641 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; 18642 mae6.ipv6AddrInfo.ae_subnet_len = 18643 mae6.ipv6AddrPfxLength; 18644 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; 18645 18646 /* Type: stateless(1), 
stateful(2), unknown(3) */ 18647 if (ipif->ipif_flags & IPIF_ADDRCONF) 18648 mae6.ipv6AddrType = 1; 18649 else 18650 mae6.ipv6AddrType = 2; 18651 /* Anycast: true(1), false(2) */ 18652 if (ipif->ipif_flags & IPIF_ANYCAST) 18653 mae6.ipv6AddrAnycastFlag = 1; 18654 else 18655 mae6.ipv6AddrAnycastFlag = 2; 18656 18657 /* 18658 * Address status: preferred(1), deprecated(2), 18659 * invalid(3), inaccessible(4), unknown(5) 18660 */ 18661 if (ipif->ipif_flags & IPIF_NOLOCAL) 18662 mae6.ipv6AddrStatus = 3; 18663 else if (ipif->ipif_flags & IPIF_DEPRECATED) 18664 mae6.ipv6AddrStatus = 2; 18665 else 18666 mae6.ipv6AddrStatus = 1; 18667 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; 18668 mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; 18669 mae6.ipv6AddrInfo.ae_pp_dst_addr = 18670 ipif->ipif_v6pp_dst_addr; 18671 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags | 18672 ill->ill_flags | ill->ill_phyint->phyint_flags; 18673 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET; 18674 mae6.ipv6AddrIdentifier = ill->ill_token; 18675 mae6.ipv6AddrIdentifierLen = ill->ill_token_length; 18676 mae6.ipv6AddrReachableTime = ill->ill_reachable_time; 18677 mae6.ipv6AddrRetransmitTime = 18678 ill->ill_reachable_retrans_time; 18679 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18680 (char *)&mae6, 18681 (int)sizeof (mib2_ipv6AddrEntry_t))) { 18682 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to " 18683 "allocate %u bytes\n", 18684 (uint_t)sizeof (mib2_ipv6AddrEntry_t))); 18685 } 18686 } 18687 } 18688 rw_exit(&ipst->ips_ill_g_lock); 18689 18690 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18691 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n", 18692 (int)optp->level, (int)optp->name, (int)optp->len)); 18693 qreply(q, mpctl); 18694 return (mp2ctl); 18695 } 18696 18697 /* IPv4 multicast group membership. 
*/ 18698 static mblk_t * 18699 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18700 { 18701 struct opthdr *optp; 18702 mblk_t *mp2ctl; 18703 ill_t *ill; 18704 ipif_t *ipif; 18705 ilm_t *ilm; 18706 ip_member_t ipm; 18707 mblk_t *mp_tail = NULL; 18708 ill_walk_context_t ctx; 18709 zoneid_t zoneid; 18710 18711 /* 18712 * make a copy of the original message 18713 */ 18714 mp2ctl = copymsg(mpctl); 18715 zoneid = Q_TO_CONN(q)->conn_zoneid; 18716 18717 /* ipGroupMember table */ 18718 optp = (struct opthdr *)&mpctl->b_rptr[ 18719 sizeof (struct T_optmgmt_ack)]; 18720 optp->level = MIB2_IP; 18721 optp->name = EXPER_IP_GROUP_MEMBERSHIP; 18722 18723 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18724 ill = ILL_START_WALK_V4(&ctx, ipst); 18725 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18726 ILM_WALKER_HOLD(ill); 18727 for (ipif = ill->ill_ipif; ipif != NULL; 18728 ipif = ipif->ipif_next) { 18729 if (ipif->ipif_zoneid != zoneid && 18730 ipif->ipif_zoneid != ALL_ZONES) 18731 continue; /* not this zone */ 18732 ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, 18733 OCTET_LENGTH); 18734 ipm.ipGroupMemberIfIndex.o_length = 18735 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); 18736 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18737 ASSERT(ilm->ilm_ipif != NULL); 18738 ASSERT(ilm->ilm_ill == NULL); 18739 if (ilm->ilm_ipif != ipif) 18740 continue; 18741 ipm.ipGroupMemberAddress = ilm->ilm_addr; 18742 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; 18743 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; 18744 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18745 (char *)&ipm, (int)sizeof (ipm))) { 18746 ip1dbg(("ip_snmp_get_mib2_ip_group: " 18747 "failed to allocate %u bytes\n", 18748 (uint_t)sizeof (ipm))); 18749 } 18750 } 18751 } 18752 ILM_WALKER_RELE(ill); 18753 } 18754 rw_exit(&ipst->ips_ill_g_lock); 18755 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18756 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18757 (int)optp->level, 
(int)optp->name, (int)optp->len)); 18758 qreply(q, mpctl); 18759 return (mp2ctl); 18760 } 18761 18762 /* IPv6 multicast group membership. */ 18763 static mblk_t * 18764 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18765 { 18766 struct opthdr *optp; 18767 mblk_t *mp2ctl; 18768 ill_t *ill; 18769 ilm_t *ilm; 18770 ipv6_member_t ipm6; 18771 mblk_t *mp_tail = NULL; 18772 ill_walk_context_t ctx; 18773 zoneid_t zoneid; 18774 18775 /* 18776 * make a copy of the original message 18777 */ 18778 mp2ctl = copymsg(mpctl); 18779 zoneid = Q_TO_CONN(q)->conn_zoneid; 18780 18781 /* ip6GroupMember table */ 18782 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18783 optp->level = MIB2_IP6; 18784 optp->name = EXPER_IP6_GROUP_MEMBERSHIP; 18785 18786 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18787 ill = ILL_START_WALK_V6(&ctx, ipst); 18788 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18789 ILM_WALKER_HOLD(ill); 18790 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; 18791 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18792 ASSERT(ilm->ilm_ipif == NULL); 18793 ASSERT(ilm->ilm_ill != NULL); 18794 if (ilm->ilm_zoneid != zoneid) 18795 continue; /* not this zone */ 18796 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; 18797 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; 18798 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode; 18799 if (!snmp_append_data2(mpctl->b_cont, 18800 &mp_tail, 18801 (char *)&ipm6, (int)sizeof (ipm6))) { 18802 ip1dbg(("ip_snmp_get_mib2_ip6_group: " 18803 "failed to allocate %u bytes\n", 18804 (uint_t)sizeof (ipm6))); 18805 } 18806 } 18807 ILM_WALKER_RELE(ill); 18808 } 18809 rw_exit(&ipst->ips_ill_g_lock); 18810 18811 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18812 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18813 (int)optp->level, (int)optp->name, (int)optp->len)); 18814 qreply(q, mpctl); 18815 return (mp2ctl); 18816 } 18817 18818 /* IP multicast filtered sources */ 18819 
static mblk_t * 18820 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18821 { 18822 struct opthdr *optp; 18823 mblk_t *mp2ctl; 18824 ill_t *ill; 18825 ipif_t *ipif; 18826 ilm_t *ilm; 18827 ip_grpsrc_t ips; 18828 mblk_t *mp_tail = NULL; 18829 ill_walk_context_t ctx; 18830 zoneid_t zoneid; 18831 int i; 18832 slist_t *sl; 18833 18834 /* 18835 * make a copy of the original message 18836 */ 18837 mp2ctl = copymsg(mpctl); 18838 zoneid = Q_TO_CONN(q)->conn_zoneid; 18839 18840 /* ipGroupSource table */ 18841 optp = (struct opthdr *)&mpctl->b_rptr[ 18842 sizeof (struct T_optmgmt_ack)]; 18843 optp->level = MIB2_IP; 18844 optp->name = EXPER_IP_GROUP_SOURCES; 18845 18846 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18847 ill = ILL_START_WALK_V4(&ctx, ipst); 18848 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18849 ILM_WALKER_HOLD(ill); 18850 for (ipif = ill->ill_ipif; ipif != NULL; 18851 ipif = ipif->ipif_next) { 18852 if (ipif->ipif_zoneid != zoneid) 18853 continue; /* not this zone */ 18854 ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, 18855 OCTET_LENGTH); 18856 ips.ipGroupSourceIfIndex.o_length = 18857 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); 18858 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18859 ASSERT(ilm->ilm_ipif != NULL); 18860 ASSERT(ilm->ilm_ill == NULL); 18861 sl = ilm->ilm_filter; 18862 if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) 18863 continue; 18864 ips.ipGroupSourceGroup = ilm->ilm_addr; 18865 for (i = 0; i < sl->sl_numsrc; i++) { 18866 if (!IN6_IS_ADDR_V4MAPPED( 18867 &sl->sl_addr[i])) 18868 continue; 18869 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], 18870 ips.ipGroupSourceAddress); 18871 if (snmp_append_data2(mpctl->b_cont, 18872 &mp_tail, (char *)&ips, 18873 (int)sizeof (ips)) == 0) { 18874 ip1dbg(("ip_snmp_get_mib2_" 18875 "ip_group_src: failed to " 18876 "allocate %u bytes\n", 18877 (uint_t)sizeof (ips))); 18878 } 18879 } 18880 } 18881 } 18882 ILM_WALKER_RELE(ill); 18883 } 18884 
rw_exit(&ipst->ips_ill_g_lock); 18885 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18886 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18887 (int)optp->level, (int)optp->name, (int)optp->len)); 18888 qreply(q, mpctl); 18889 return (mp2ctl); 18890 } 18891 18892 /* IPv6 multicast filtered sources. */ 18893 static mblk_t * 18894 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18895 { 18896 struct opthdr *optp; 18897 mblk_t *mp2ctl; 18898 ill_t *ill; 18899 ilm_t *ilm; 18900 ipv6_grpsrc_t ips6; 18901 mblk_t *mp_tail = NULL; 18902 ill_walk_context_t ctx; 18903 zoneid_t zoneid; 18904 int i; 18905 slist_t *sl; 18906 18907 /* 18908 * make a copy of the original message 18909 */ 18910 mp2ctl = copymsg(mpctl); 18911 zoneid = Q_TO_CONN(q)->conn_zoneid; 18912 18913 /* ip6GroupMember table */ 18914 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18915 optp->level = MIB2_IP6; 18916 optp->name = EXPER_IP6_GROUP_SOURCES; 18917 18918 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18919 ill = ILL_START_WALK_V6(&ctx, ipst); 18920 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18921 ILM_WALKER_HOLD(ill); 18922 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; 18923 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 18924 ASSERT(ilm->ilm_ipif == NULL); 18925 ASSERT(ilm->ilm_ill != NULL); 18926 sl = ilm->ilm_filter; 18927 if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) 18928 continue; 18929 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; 18930 for (i = 0; i < sl->sl_numsrc; i++) { 18931 ips6.ipv6GroupSourceAddress = sl->sl_addr[i]; 18932 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 18933 (char *)&ips6, (int)sizeof (ips6))) { 18934 ip1dbg(("ip_snmp_get_mib2_ip6_" 18935 "group_src: failed to allocate " 18936 "%u bytes\n", 18937 (uint_t)sizeof (ips6))); 18938 } 18939 } 18940 } 18941 ILM_WALKER_RELE(ill); 18942 } 18943 rw_exit(&ipst->ips_ill_g_lock); 18944 18945 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 
18946 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 18947 (int)optp->level, (int)optp->name, (int)optp->len)); 18948 qreply(q, mpctl); 18949 return (mp2ctl); 18950 } 18951 18952 /* Multicast routing virtual interface table. */ 18953 static mblk_t * 18954 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18955 { 18956 struct opthdr *optp; 18957 mblk_t *mp2ctl; 18958 18959 /* 18960 * make a copy of the original message 18961 */ 18962 mp2ctl = copymsg(mpctl); 18963 18964 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18965 optp->level = EXPER_DVMRP; 18966 optp->name = EXPER_DVMRP_VIF; 18967 if (!ip_mroute_vif(mpctl->b_cont, ipst)) { 18968 ip0dbg(("ip_mroute_vif: failed\n")); 18969 } 18970 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18971 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n", 18972 (int)optp->level, (int)optp->name, (int)optp->len)); 18973 qreply(q, mpctl); 18974 return (mp2ctl); 18975 } 18976 18977 /* Multicast routing table. */ 18978 static mblk_t * 18979 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) 18980 { 18981 struct opthdr *optp; 18982 mblk_t *mp2ctl; 18983 18984 /* 18985 * make a copy of the original message 18986 */ 18987 mp2ctl = copymsg(mpctl); 18988 18989 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 18990 optp->level = EXPER_DVMRP; 18991 optp->name = EXPER_DVMRP_MRT; 18992 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) { 18993 ip0dbg(("ip_mroute_mrt: failed\n")); 18994 } 18995 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 18996 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n", 18997 (int)optp->level, (int)optp->name, (int)optp->len)); 18998 qreply(q, mpctl); 18999 return (mp2ctl); 19000 } 19001 19002 /* 19003 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable 19004 * in one IRE walk. 
*/
static mblk_t *
ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;	/* Returned */
	mblk_t			*mp3ctl;	/* nettomedia */
	mblk_t			*mp4ctl;	/* routeattrs */
	iproutedata_t		ird;
	zoneid_t		zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipRouteEntryTable
	 *	- mp3ctl is sent upstream as ipNetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipRouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	/*
	 * Without both extra copies nothing is reported at all: every
	 * message, including the original request, is freed and NULL is
	 * returned.
	 */
	if (mp3ctl == NULL || mp4ctl == NULL) {
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	/* Each table is accumulated on the data part of its own reply. */
	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);

	/* ipRouteEntryTable in mpctl */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipNetToMediaEntryTable in mp3ctl */
	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = MIB2_IP_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipRouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* An empty attribute table is dropped rather than sent upstream. */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
 * ipv6NetToMediaEntryTable in an NDP walk.
 */
static mblk_t *
ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;	/* Returned */
	mblk_t			*mp3ctl;	/* nettomedia */
	mblk_t			*mp4ctl;	/* routeattrs */
	iproutedata_t		ird;
	zoneid_t		zoneid;

	/*
	 * make copies of the original message
	 *	- mp2ctl is returned unchanged to the caller for his use
	 *	- mpctl is sent upstream as ipv6RouteEntryTable
	 *	- mp3ctl is sent upstream as ipv6NetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipv6RouteAttributeTable
	 */
	mp2ctl = copymsg(mpctl);
	mp3ctl = copymsg(mpctl);
	mp4ctl = copymsg(mpctl);
	/*
	 * Without both extra copies nothing is reported at all: every
	 * message, including the original request, is freed and NULL is
	 * returned.
	 */
	if (mp3ctl == NULL || mp4ctl == NULL) {
		freemsg(mp4ctl);
		freemsg(mp3ctl);
		freemsg(mp2ctl);
		freemsg(mpctl);
		return (NULL);
	}

	bzero(&ird, sizeof (ird));

	/* Each table is accumulated on the data part of its own reply. */
	ird.ird_route.lp_head = mpctl->b_cont;
	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
	ird.ird_attrs.lp_head = mp4ctl->b_cont;

	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);

	/* ipv6RouteEntryTable in mpctl */
	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_ROUTE;
	optp->len = msgdsize(ird.ird_route.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	/* ipv6NetToMediaEntryTable in mp3ctl */
	ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);

	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = MIB2_IP6_MEDIA;
	optp->len = msgdsize(ird.ird_netmedia.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mp3ctl);

	/* ipv6RouteAttributeTable in mp4ctl */
	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_RTATTR;
	optp->len = msgdsize(ird.ird_attrs.lp_head);
	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	/* An empty attribute table is dropped rather than sent upstream. */
	if (optp->len == 0)
		freemsg(mp4ctl);
	else
		qreply(q, mp4ctl);

	return (mp2ctl);
}

/*
 * IPv6 mib: One per ill
 *
 * The reply sent upstream on mpctl holds the stack-wide ips_ip6_mib entry
 * (flagged with MIB2_UNKNOWN_INTERFACE) followed by the ill_ip_mib of each
 * IPv6 ill.  Returns the copy of the request made on entry.
 */
static mblk_t *
ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length IPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = 0;
	/* Include "unknown interface" ip6_mib */
	ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
	ipst->ips_ip6_mib.ipIfStatsIfIndex =
	    MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
	    ipst->ips_ipv6_forward ? 1 : 2);
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
	    ipst->ips_ipv6_def_hops);
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
	    sizeof (mib2_ipv6AddrEntry_t));
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
	    sizeof (mib2_ipv6RouteEntry_t));
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
	    sizeof (mib2_ipv6NetToMediaEntry_t));
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
	    sizeof (ipv6_member_t));
	SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
	    sizeof (ipv6_grpsrc_t));

	/*
	 * Synchronize 64- and 32-bit counters
	 */
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
	    ipIfStatsHCInReceives);
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
	    ipIfStatsHCInDelivers);
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
	    ipIfStatsHCOutRequests);
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
	    ipIfStatsHCOutForwDatagrams);
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
	    ipIfStatsHCOutMcastPkts);
	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
	    ipIfStatsHCInMcastPkts);

	if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
	    (char *)&ipst->ips_ip6_mib, (int)sizeof (ipst->ips_ip6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ipst->ips_ip6_mib)));
	}

	/* One entry per IPv6 ill, walked under the global ill lock. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_ip_mib->ipIfStatsIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
		    ipst->ips_ipv6_forward ? 1 : 2);
		SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
		    ill->ill_max_hops);

		/*
		 * Synchronize 64- and 32-bit counters
		 */
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
		    ipIfStatsHCInReceives);
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
		    ipIfStatsHCInDelivers);
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
		    ipIfStatsHCOutRequests);
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
		    ipIfStatsHCOutForwDatagrams);
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
		    ipIfStatsHCOutMcastPkts);
		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
		    ipIfStatsHCInMcastPkts);

		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_ip_mib,
		    (int)sizeof (*ill->ill_ip_mib))) {
			ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
			    "%u bytes\n", (uint_t)sizeof (*ill->ill_ip_mib)));
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ICMPv6 mib: One per ill
 *
 * The reply sent upstream on mpctl holds the stack-wide ips_icmp6_mib
 * entry (flagged with MIB2_UNKNOWN_INTERFACE) followed by the
 * ill_icmp6_mib of each IPv6 ill.  Returns the copy of the request made
 * on entry.
 */
static mblk_t *
ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	ill_t			*ill;
	ill_walk_context_t	ctx;
	mblk_t			*mp_tail = NULL;
	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* fixed length ICMPv6 structure ... */

	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_ICMP6;
	optp->name = 0;
	/* Include "unknown interface" icmp6_mib */
	ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
	    MIB2_UNKNOWN_INTERFACE; /* netstat flag */
	ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
	    (char *)&ipst->ips_icmp6_mib,
	    (int)sizeof (ipst->ips_icmp6_mib))) {
		ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
		    (uint_t)sizeof (ipst->ips_icmp6_mib)));
	}

	/* One entry per IPv6 ill, walked under the global ill lock. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
		    (char *)ill->ill_icmp6_mib,
		    (int)sizeof (*ill->ill_icmp6_mib))) {
			ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
			    "%u bytes\n",
			    (uint_t)sizeof (*ill->ill_icmp6_mib)));
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);

	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);
	return (mp2ctl);
}

/*
 * ire_walk routine to create both ipRouteEntryTable and
 * ipRouteAttributeTable in one IRE walk
 */
static void
ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
{
	ill_t				*ill;
	ipif_t				*ipif;
	mib2_ipRouteEntry_t		*re;
	mib2_ipAttributeEntry_t		*iae, *iaeptr;
	ipaddr_t			gw_addr;
	tsol_ire_gw_secattr_t		*attrp;
	tsol_gc_t			*gc = NULL;
	tsol_gcgrp_t			*gcgrp = NULL;
	uint_t				sacnt = 0;
	int				i;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);

	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
		return;

	if ((attrp = ire->ire_gw_secattr) != NULL) {
		mutex_enter(&attrp->igsa_lock);
		if ((gc = attrp->igsa_gc) != NULL) {
			gcgrp = gc->gc_grp;
			ASSERT(gcgrp != NULL);
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			sacnt = 1;
		} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
			gc = gcgrp->gcgrp_head;
			sacnt = gcgrp->gcgrp_count;
		}
		mutex_exit(&attrp->igsa_lock);

		/* do nothing if there's no gc to report */
		if (gc == NULL) {
			ASSERT(sacnt == 0);
			if (gcgrp != NULL) {
				/* we might as well drop the lock now */
				rw_exit(&gcgrp->gcgrp_rwlock);
				gcgrp = NULL;
			}
			attrp = NULL;
		}

		ASSERT(gc == NULL || (gcgrp != NULL &&
		    RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
	}
	ASSERT(sacnt == 0 || gc != NULL);

	if (sacnt != 0 &&
	    (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
		kmem_free(re, sizeof (*re));
		rw_exit(&gcgrp->gcgrp_rwlock);
		return;
	}

	/*
	 * Return all IRE types for route table... let caller pick and choose
	 */
	re->ipRouteDest = ire->ire_addr;
	ipif = ire->ire_ipif;
	re->ipRouteIfIndex.o_length = 0;
	if (ire->ire_type == IRE_CACHE) {
		ill = (ill_t *)ire->ire_stq->q_ptr;
		re->ipRouteIfIndex.o_length =
		    ill->ill_name_length == 0 ? 0 :
		    MIN(OCTET_LENGTH, ill->ill_name_length - 1);
		bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes,
		    re->ipRouteIfIndex.o_length);
	} else if (ipif != NULL) {
		ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
		re->ipRouteIfIndex.o_length =
		    mi_strlen(re->ipRouteIfIndex.o_bytes);
	}
	re->ipRouteMetric1 = -1;
	re->ipRouteMetric2 = -1;
	re->ipRouteMetric3 = -1;
	re->ipRouteMetric4 = -1;

	gw_addr = ire->ire_gateway_addr;

	if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST))
		re->ipRouteNextHop = ire->ire_src_addr;
	else
		re->ipRouteNextHop = gw_addr;
	/* indirect(4), direct(3), or invalid(2) */
	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
		re->ipRouteType = 2;
	else
		re->ipRouteType = (gw_addr != 0) ? 4 : 3;
	re->ipRouteProto = -1;
	re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
	re->ipRouteMask = ire->ire_mask;
	re->ipRouteMetric5 = -1;
	re->ipRouteInfo.re_max_frag = ire->ire_max_frag;
	re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag;
	re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
	re->ipRouteInfo.re_ref = ire->ire_refcnt;
	re->ipRouteInfo.re_src_addr = ire->ire_src_addr;
	re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
	re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
	re->ipRouteInfo.re_flags = ire->ire_flags;

	if (ire->ire_flags & RTF_DYNAMIC) {
		re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT;
	} else {
		re->ipRouteInfo.re_ire_type = ire->ire_type;
	}

	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
	    (char *)re, (int)sizeof (*re))) {
		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
		    (uint_t)sizeof (*re)));
	}

	for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
		iaeptr->iae_routeidx = ird->ird_idx;
iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19435 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19436 } 19437 19438 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19439 (char *)iae, sacnt * sizeof (*iae))) { 19440 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 19441 (unsigned)(sacnt * sizeof (*iae)))); 19442 } 19443 19444 /* bump route index for next pass */ 19445 ird->ird_idx++; 19446 19447 kmem_free(re, sizeof (*re)); 19448 if (sacnt != 0) 19449 kmem_free(iae, sacnt * sizeof (*iae)); 19450 19451 if (gcgrp != NULL) 19452 rw_exit(&gcgrp->gcgrp_rwlock); 19453 } 19454 19455 /* 19456 * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable. 19457 */ 19458 static void 19459 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) 19460 { 19461 ill_t *ill; 19462 ipif_t *ipif; 19463 mib2_ipv6RouteEntry_t *re; 19464 mib2_ipAttributeEntry_t *iae, *iaeptr; 19465 in6_addr_t gw_addr_v6; 19466 tsol_ire_gw_secattr_t *attrp; 19467 tsol_gc_t *gc = NULL; 19468 tsol_gcgrp_t *gcgrp = NULL; 19469 uint_t sacnt = 0; 19470 int i; 19471 19472 ASSERT(ire->ire_ipversion == IPV6_VERSION); 19473 19474 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 19475 return; 19476 19477 if ((attrp = ire->ire_gw_secattr) != NULL) { 19478 mutex_enter(&attrp->igsa_lock); 19479 if ((gc = attrp->igsa_gc) != NULL) { 19480 gcgrp = gc->gc_grp; 19481 ASSERT(gcgrp != NULL); 19482 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19483 sacnt = 1; 19484 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 19485 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 19486 gc = gcgrp->gcgrp_head; 19487 sacnt = gcgrp->gcgrp_count; 19488 } 19489 mutex_exit(&attrp->igsa_lock); 19490 19491 /* do nothing if there's no gc to report */ 19492 if (gc == NULL) { 19493 ASSERT(sacnt == 0); 19494 if (gcgrp != NULL) { 19495 /* we might as well drop the lock now */ 19496 rw_exit(&gcgrp->gcgrp_rwlock); 19497 gcgrp = NULL; 19498 } 19499 attrp = NULL; 19500 } 19501 19502 ASSERT(gc == NULL || (gcgrp != 
NULL && 19503 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 19504 } 19505 ASSERT(sacnt == 0 || gc != NULL); 19506 19507 if (sacnt != 0 && 19508 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 19509 kmem_free(re, sizeof (*re)); 19510 rw_exit(&gcgrp->gcgrp_rwlock); 19511 return; 19512 } 19513 19514 /* 19515 * Return all IRE types for route table... let caller pick and choose 19516 */ 19517 re->ipv6RouteDest = ire->ire_addr_v6; 19518 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); 19519 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ 19520 re->ipv6RouteIfIndex.o_length = 0; 19521 ipif = ire->ire_ipif; 19522 if (ire->ire_type == IRE_CACHE) { 19523 ill = (ill_t *)ire->ire_stq->q_ptr; 19524 re->ipv6RouteIfIndex.o_length = 19525 ill->ill_name_length == 0 ? 0 : 19526 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 19527 bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, 19528 re->ipv6RouteIfIndex.o_length); 19529 } else if (ipif != NULL) { 19530 ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); 19531 re->ipv6RouteIfIndex.o_length = 19532 mi_strlen(re->ipv6RouteIfIndex.o_bytes); 19533 } 19534 19535 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19536 19537 mutex_enter(&ire->ire_lock); 19538 gw_addr_v6 = ire->ire_gateway_addr_v6; 19539 mutex_exit(&ire->ire_lock); 19540 19541 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) 19542 re->ipv6RouteNextHop = ire->ire_src_addr_v6; 19543 else 19544 re->ipv6RouteNextHop = gw_addr_v6; 19545 19546 /* remote(4), local(3), or discard(2) */ 19547 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 19548 re->ipv6RouteType = 2; 19549 else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) 19550 re->ipv6RouteType = 3; 19551 else 19552 re->ipv6RouteType = 4; 19553 19554 re->ipv6RouteProtocol = -1; 19555 re->ipv6RoutePolicy = 0; 19556 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time; 19557 re->ipv6RouteNextHopRDI = 0; 19558 re->ipv6RouteWeight = 0; 19559 re->ipv6RouteMetric = 0; 19560 
re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; 19561 re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; 19562 re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 19563 re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; 19564 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; 19565 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 19566 re->ipv6RouteInfo.re_ref = ire->ire_refcnt; 19567 re->ipv6RouteInfo.re_flags = ire->ire_flags; 19568 19569 if (ire->ire_flags & RTF_DYNAMIC) { 19570 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; 19571 } else { 19572 re->ipv6RouteInfo.re_ire_type = ire->ire_type; 19573 } 19574 19575 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 19576 (char *)re, (int)sizeof (*re))) { 19577 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19578 (uint_t)sizeof (*re))); 19579 } 19580 19581 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 19582 iaeptr->iae_routeidx = ird->ird_idx; 19583 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 19584 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 19585 } 19586 19587 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 19588 (char *)iae, sacnt * sizeof (*iae))) { 19589 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 19590 (unsigned)(sacnt * sizeof (*iae)))); 19591 } 19592 19593 /* bump route index for next pass */ 19594 ird->ird_idx++; 19595 19596 kmem_free(re, sizeof (*re)); 19597 if (sacnt != 0) 19598 kmem_free(iae, sacnt * sizeof (*iae)); 19599 19600 if (gcgrp != NULL) 19601 rw_exit(&gcgrp->gcgrp_rwlock); 19602 } 19603 19604 /* 19605 * ndp_walk routine to create ipv6NetToMediaEntryTable 19606 */ 19607 static int 19608 ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) 19609 { 19610 ill_t *ill; 19611 mib2_ipv6NetToMediaEntry_t ntme; 19612 dl_unitdata_req_t *dl; 19613 19614 ill = nce->nce_ill; 19615 if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ 19616 return (0); 19617 19618 /* 19619 * Neighbor cache entry 
attached to IRE with on-link 19620 * destination. 19621 */ 19622 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; 19623 ntme.ipv6NetToMediaNetAddress = nce->nce_addr; 19624 if ((ill->ill_flags & ILLF_XRESOLV) && 19625 (nce->nce_res_mp != NULL)) { 19626 dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); 19627 ntme.ipv6NetToMediaPhysAddress.o_length = 19628 dl->dl_dest_addr_length; 19629 } else { 19630 ntme.ipv6NetToMediaPhysAddress.o_length = 19631 ill->ill_phys_addr_length; 19632 } 19633 if (nce->nce_res_mp != NULL) { 19634 bcopy((char *)nce->nce_res_mp->b_rptr + 19635 NCE_LL_ADDR_OFFSET(ill), 19636 ntme.ipv6NetToMediaPhysAddress.o_bytes, 19637 ntme.ipv6NetToMediaPhysAddress.o_length); 19638 } else { 19639 bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, 19640 ill->ill_phys_addr_length); 19641 } 19642 /* 19643 * Note: Returns ND_* states. Should be: 19644 * reachable(1), stale(2), delay(3), probe(4), 19645 * invalid(5), unknown(6) 19646 */ 19647 ntme.ipv6NetToMediaState = nce->nce_state; 19648 ntme.ipv6NetToMediaLastUpdated = 0; 19649 19650 /* other(1), dynamic(2), static(3), local(4) */ 19651 if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { 19652 ntme.ipv6NetToMediaType = 4; 19653 } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { 19654 ntme.ipv6NetToMediaType = 1; 19655 } else { 19656 ntme.ipv6NetToMediaType = 2; 19657 } 19658 19659 if (!snmp_append_data2(ird->ird_netmedia.lp_head, 19660 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { 19661 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n", 19662 (uint_t)sizeof (ntme))); 19663 } 19664 return (0); 19665 } 19666 19667 /* 19668 * return (0) if invalid set request, 1 otherwise, including non-tcp requests 19669 */ 19670 /* ARGSUSED */ 19671 int 19672 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 19673 { 19674 switch (level) { 19675 case MIB2_IP: 19676 case MIB2_ICMP: 19677 switch (name) { 19678 default: 19679 break; 19680 } 19681 return (1); 19682 default: 19683 
return (1); 19684 } 19685 } 19686 19687 /* 19688 * When there exists both a 64- and 32-bit counter of a particular type 19689 * (i.e., InReceives), only the 64-bit counters are added. 19690 */ 19691 void 19692 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2) 19693 { 19694 UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors); 19695 UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors); 19696 UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes); 19697 UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors); 19698 UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos); 19699 UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts); 19700 UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards); 19701 UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards); 19702 UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs); 19703 UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails); 19704 UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates); 19705 UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds); 19706 UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs); 19707 UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails); 19708 UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes); 19709 UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates); 19710 UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups); 19711 UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits); 19712 UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs); 19713 UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows); 19714 UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows); 19715 UPDATE_MIB(o1, ipIfStatsInWrongIPVersion, 19716 o2->ipIfStatsInWrongIPVersion); 19717 UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion, 19718 o2->ipIfStatsInWrongIPVersion); 19719 UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion, 19720 
o2->ipIfStatsOutSwitchIPVersion); 19721 UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives); 19722 UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets); 19723 UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams, 19724 o2->ipIfStatsHCInForwDatagrams); 19725 UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers); 19726 UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests); 19727 UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams, 19728 o2->ipIfStatsHCOutForwDatagrams); 19729 UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds); 19730 UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits); 19731 UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets); 19732 UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts); 19733 UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets); 19734 UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts); 19735 UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets, 19736 o2->ipIfStatsHCOutMcastOctets); 19737 UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts); 19738 UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts); 19739 UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded); 19740 UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed); 19741 UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs); 19742 UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs); 19743 UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts); 19744 } 19745 19746 void 19747 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2) 19748 { 19749 UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs); 19750 UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors); 19751 UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs); 19752 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs); 19753 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds); 19754 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, 
o2->ipv6IfIcmpInParmProblems); 19755 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs); 19756 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos); 19757 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies); 19758 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits, 19759 o2->ipv6IfIcmpInRouterSolicits); 19760 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements, 19761 o2->ipv6IfIcmpInRouterAdvertisements); 19762 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits, 19763 o2->ipv6IfIcmpInNeighborSolicits); 19764 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements, 19765 o2->ipv6IfIcmpInNeighborAdvertisements); 19766 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects); 19767 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries, 19768 o2->ipv6IfIcmpInGroupMembQueries); 19769 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses, 19770 o2->ipv6IfIcmpInGroupMembResponses); 19771 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions, 19772 o2->ipv6IfIcmpInGroupMembReductions); 19773 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs); 19774 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors); 19775 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs, 19776 o2->ipv6IfIcmpOutDestUnreachs); 19777 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs, 19778 o2->ipv6IfIcmpOutAdminProhibs); 19779 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds); 19780 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems, 19781 o2->ipv6IfIcmpOutParmProblems); 19782 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs); 19783 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos); 19784 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies); 19785 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits, 19786 o2->ipv6IfIcmpOutRouterSolicits); 19787 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements, 19788 o2->ipv6IfIcmpOutRouterAdvertisements); 19789 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits, 19790 o2->ipv6IfIcmpOutNeighborSolicits); 19791 UPDATE_MIB(o1, 
ipv6IfIcmpOutNeighborAdvertisements, 19792 o2->ipv6IfIcmpOutNeighborAdvertisements); 19793 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects); 19794 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries, 19795 o2->ipv6IfIcmpOutGroupMembQueries); 19796 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses, 19797 o2->ipv6IfIcmpOutGroupMembResponses); 19798 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions, 19799 o2->ipv6IfIcmpOutGroupMembReductions); 19800 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows); 19801 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit); 19802 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements, 19803 o2->ipv6IfIcmpInBadNeighborAdvertisements); 19804 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations, 19805 o2->ipv6IfIcmpInBadNeighborSolicitations); 19806 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects); 19807 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal, 19808 o2->ipv6IfIcmpInGroupMembTotal); 19809 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries, 19810 o2->ipv6IfIcmpInGroupMembBadQueries); 19811 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports, 19812 o2->ipv6IfIcmpInGroupMembBadReports); 19813 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports, 19814 o2->ipv6IfIcmpInGroupMembOurReports); 19815 } 19816 19817 /* 19818 * Called before the options are updated to check if this packet will 19819 * be source routed from here. 19820 * This routine assumes that the options are well formed i.e. that they 19821 * have already been checked. 
19822 */ 19823 static boolean_t 19824 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) 19825 { 19826 ipoptp_t opts; 19827 uchar_t *opt; 19828 uint8_t optval; 19829 uint8_t optlen; 19830 ipaddr_t dst; 19831 ire_t *ire; 19832 19833 if (IS_SIMPLE_IPH(ipha)) { 19834 ip2dbg(("not source routed\n")); 19835 return (B_FALSE); 19836 } 19837 dst = ipha->ipha_dst; 19838 for (optval = ipoptp_first(&opts, ipha); 19839 optval != IPOPT_EOL; 19840 optval = ipoptp_next(&opts)) { 19841 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 19842 opt = opts.ipoptp_cur; 19843 optlen = opts.ipoptp_len; 19844 ip2dbg(("ip_source_routed: opt %d, len %d\n", 19845 optval, optlen)); 19846 switch (optval) { 19847 uint32_t off; 19848 case IPOPT_SSRR: 19849 case IPOPT_LSRR: 19850 /* 19851 * If dst is one of our addresses and there are some 19852 * entries left in the source route return (true). 19853 */ 19854 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 19855 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19856 if (ire == NULL) { 19857 ip2dbg(("ip_source_routed: not next" 19858 " source route 0x%x\n", 19859 ntohl(dst))); 19860 return (B_FALSE); 19861 } 19862 ire_refrele(ire); 19863 off = opt[IPOPT_OFFSET]; 19864 off--; 19865 if (optlen < IP_ADDR_LEN || 19866 off > optlen - IP_ADDR_LEN) { 19867 /* End of source route */ 19868 ip1dbg(("ip_source_routed: end of SR\n")); 19869 return (B_FALSE); 19870 } 19871 return (B_TRUE); 19872 } 19873 } 19874 ip2dbg(("not source routed\n")); 19875 return (B_FALSE); 19876 } 19877 19878 /* 19879 * Check if the packet contains any source route. 
19880 */ 19881 static boolean_t 19882 ip_source_route_included(ipha_t *ipha) 19883 { 19884 ipoptp_t opts; 19885 uint8_t optval; 19886 19887 if (IS_SIMPLE_IPH(ipha)) 19888 return (B_FALSE); 19889 for (optval = ipoptp_first(&opts, ipha); 19890 optval != IPOPT_EOL; 19891 optval = ipoptp_next(&opts)) { 19892 switch (optval) { 19893 case IPOPT_SSRR: 19894 case IPOPT_LSRR: 19895 return (B_TRUE); 19896 } 19897 } 19898 return (B_FALSE); 19899 } 19900 19901 /* 19902 * Called when the IRE expiration timer fires. 19903 */ 19904 void 19905 ip_trash_timer_expire(void *args) 19906 { 19907 int flush_flag = 0; 19908 ire_expire_arg_t iea; 19909 ip_stack_t *ipst = (ip_stack_t *)args; 19910 19911 iea.iea_ipst = ipst; /* No netstack_hold */ 19912 19913 /* 19914 * ip_ire_expire_id is protected by ip_trash_timer_lock. 19915 * This lock makes sure that a new invocation of this function 19916 * that occurs due to an almost immediate timer firing will not 19917 * progress beyond this point until the current invocation is done 19918 */ 19919 mutex_enter(&ipst->ips_ip_trash_timer_lock); 19920 ipst->ips_ip_ire_expire_id = 0; 19921 mutex_exit(&ipst->ips_ip_trash_timer_lock); 19922 19923 /* Periodic timer */ 19924 if (ipst->ips_ip_ire_arp_time_elapsed >= 19925 ipst->ips_ip_ire_arp_interval) { 19926 /* 19927 * Remove all IRE_CACHE entries since they might 19928 * contain arp information. 
19929 */ 19930 flush_flag |= FLUSH_ARP_TIME; 19931 ipst->ips_ip_ire_arp_time_elapsed = 0; 19932 IP_STAT(ipst, ip_ire_arp_timer_expired); 19933 } 19934 if (ipst->ips_ip_ire_rd_time_elapsed >= 19935 ipst->ips_ip_ire_redir_interval) { 19936 /* Remove all redirects */ 19937 flush_flag |= FLUSH_REDIRECT_TIME; 19938 ipst->ips_ip_ire_rd_time_elapsed = 0; 19939 IP_STAT(ipst, ip_ire_redirect_timer_expired); 19940 } 19941 if (ipst->ips_ip_ire_pmtu_time_elapsed >= 19942 ipst->ips_ip_ire_pathmtu_interval) { 19943 /* Increase path mtu */ 19944 flush_flag |= FLUSH_MTU_TIME; 19945 ipst->ips_ip_ire_pmtu_time_elapsed = 0; 19946 IP_STAT(ipst, ip_ire_pmtu_timer_expired); 19947 } 19948 19949 /* 19950 * Optimize for the case when there are no redirects in the 19951 * ftable, that is, no need to walk the ftable in that case. 19952 */ 19953 if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { 19954 iea.iea_flush_flag = flush_flag; 19955 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, 19956 (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, 19957 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 19958 NULL, ALL_ZONES, ipst); 19959 } 19960 if ((flush_flag & FLUSH_REDIRECT_TIME) && 19961 ipst->ips_ip_redirect_cnt > 0) { 19962 iea.iea_flush_flag = flush_flag; 19963 ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, 19964 ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 19965 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); 19966 } 19967 if (flush_flag & FLUSH_MTU_TIME) { 19968 /* 19969 * Walk all IPv6 IRE's and update them 19970 * Note that ARP and redirect timers are not 19971 * needed since NUD handles stale entries. 
19972 */ 19973 flush_flag = FLUSH_MTU_TIME; 19974 iea.iea_flush_flag = flush_flag; 19975 ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, 19976 ALL_ZONES, ipst); 19977 } 19978 19979 ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; 19980 ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; 19981 ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; 19982 19983 /* 19984 * Hold the lock to serialize timeout calls and prevent 19985 * stale values in ip_ire_expire_id. Otherwise it is possible 19986 * for the timer to fire and a new invocation of this function 19987 * to start before the return value of timeout has been stored 19988 * in ip_ire_expire_id by the current invocation. 19989 */ 19990 mutex_enter(&ipst->ips_ip_trash_timer_lock); 19991 ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, 19992 (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 19993 mutex_exit(&ipst->ips_ip_trash_timer_lock); 19994 } 19995 19996 /* 19997 * Called by the memory allocator subsystem directly, when the system 19998 * is running low on memory. 19999 */ 20000 /* ARGSUSED */ 20001 void 20002 ip_trash_ire_reclaim(void *args) 20003 { 20004 netstack_handle_t nh; 20005 netstack_t *ns; 20006 20007 netstack_next_init(&nh); 20008 while ((ns = netstack_next(&nh)) != NULL) { 20009 ip_trash_ire_reclaim_stack(ns->netstack_ip); 20010 netstack_rele(ns); 20011 } 20012 netstack_next_fini(&nh); 20013 } 20014 20015 static void 20016 ip_trash_ire_reclaim_stack(ip_stack_t *ipst) 20017 { 20018 ire_cache_count_t icc; 20019 ire_cache_reclaim_t icr; 20020 ncc_cache_count_t ncc; 20021 nce_cache_reclaim_t ncr; 20022 uint_t delete_cnt; 20023 /* 20024 * Memory reclaim call back. 20025 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. 
20026 * Then, with a target of freeing 1/Nth of IRE_CACHE 20027 * entries, determine what fraction to free for 20028 * each category of IRE_CACHE entries giving absolute priority 20029 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu 20030 * entry will be freed unless all offlink entries are freed). 20031 */ 20032 icc.icc_total = 0; 20033 icc.icc_unused = 0; 20034 icc.icc_offlink = 0; 20035 icc.icc_pmtu = 0; 20036 icc.icc_onlink = 0; 20037 ire_walk(ire_cache_count, (char *)&icc, ipst); 20038 20039 /* 20040 * Free NCEs for IPv6 like the onlink ires. 20041 */ 20042 ncc.ncc_total = 0; 20043 ncc.ncc_host = 0; 20044 ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); 20045 20046 ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + 20047 icc.icc_pmtu + icc.icc_onlink); 20048 delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; 20049 IP_STAT(ipst, ip_trash_ire_reclaim_calls); 20050 if (delete_cnt == 0) 20051 return; 20052 IP_STAT(ipst, ip_trash_ire_reclaim_success); 20053 /* Always delete all unused offlink entries */ 20054 icr.icr_ipst = ipst; 20055 icr.icr_unused = 1; 20056 if (delete_cnt <= icc.icc_unused) { 20057 /* 20058 * Only need to free unused entries. In other words, 20059 * there are enough unused entries to free to meet our 20060 * target number of freed ire cache entries. 20061 */ 20062 icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; 20063 ncr.ncr_host = 0; 20064 } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { 20065 /* 20066 * Only need to free unused entries, plus a fraction of offlink 20067 * entries. It follows from the first if statement that 20068 * icc_offlink is non-zero, and that delete_cnt != icc_unused. 
20069 */ 20070 delete_cnt -= icc.icc_unused; 20071 /* Round up # deleted by truncating fraction */ 20072 icr.icr_offlink = icc.icc_offlink / delete_cnt; 20073 icr.icr_pmtu = icr.icr_onlink = 0; 20074 ncr.ncr_host = 0; 20075 } else if (delete_cnt <= 20076 icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { 20077 /* 20078 * Free all unused and offlink entries, plus a fraction of 20079 * pmtu entries. It follows from the previous if statement 20080 * that icc_pmtu is non-zero, and that 20081 * delete_cnt != icc_unused + icc_offlink. 20082 */ 20083 icr.icr_offlink = 1; 20084 delete_cnt -= icc.icc_unused + icc.icc_offlink; 20085 /* Round up # deleted by truncating fraction */ 20086 icr.icr_pmtu = icc.icc_pmtu / delete_cnt; 20087 icr.icr_onlink = 0; 20088 ncr.ncr_host = 0; 20089 } else { 20090 /* 20091 * Free all unused, offlink, and pmtu entries, plus a fraction 20092 * of onlink entries. If we're here, then we know that 20093 * icc_onlink is non-zero, and that 20094 * delete_cnt != icc_unused + icc_offlink + icc_pmtu. 
20095 */ 20096 icr.icr_offlink = icr.icr_pmtu = 1; 20097 delete_cnt -= icc.icc_unused + icc.icc_offlink + 20098 icc.icc_pmtu; 20099 /* Round up # deleted by truncating fraction */ 20100 icr.icr_onlink = icc.icc_onlink / delete_cnt; 20101 /* Using the same delete fraction as for onlink IREs */ 20102 ncr.ncr_host = ncc.ncc_host / delete_cnt; 20103 } 20104 #ifdef DEBUG 20105 ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " 20106 "fractions %d/%d/%d/%d\n", 20107 icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, 20108 icc.icc_unused, icc.icc_offlink, 20109 icc.icc_pmtu, icc.icc_onlink, 20110 icr.icr_unused, icr.icr_offlink, 20111 icr.icr_pmtu, icr.icr_onlink)); 20112 #endif 20113 ire_walk(ire_cache_reclaim, (char *)&icr, ipst); 20114 if (ncr.ncr_host != 0) 20115 ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, 20116 (uchar_t *)&ncr, ipst); 20117 #ifdef DEBUG 20118 icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; 20119 icc.icc_pmtu = 0; icc.icc_onlink = 0; 20120 ire_walk(ire_cache_count, (char *)&icc, ipst); 20121 ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", 20122 icc.icc_total, icc.icc_unused, icc.icc_offlink, 20123 icc.icc_pmtu, icc.icc_onlink)); 20124 #endif 20125 } 20126 20127 /* 20128 * ip_unbind is called when a copy of an unbind request is received from the 20129 * upper level protocol. We remove this conn from any fanout hash list it is 20130 * on, and zero out the bind information. No reply is expected up above. 
20131 */ 20132 mblk_t * 20133 ip_unbind(queue_t *q, mblk_t *mp) 20134 { 20135 conn_t *connp = Q_TO_CONN(q); 20136 20137 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 20138 20139 if (is_system_labeled() && connp->conn_anon_port) { 20140 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 20141 connp->conn_mlp_type, connp->conn_ulp, 20142 ntohs(connp->conn_lport), B_FALSE); 20143 connp->conn_anon_port = 0; 20144 } 20145 connp->conn_mlp_type = mlptSingle; 20146 20147 ipcl_hash_remove(connp); 20148 20149 ASSERT(mp->b_cont == NULL); 20150 /* 20151 * Convert mp into a T_OK_ACK 20152 */ 20153 mp = mi_tpi_ok_ack_alloc(mp); 20154 20155 /* 20156 * should not happen in practice... T_OK_ACK is smaller than the 20157 * original message. 20158 */ 20159 if (mp == NULL) 20160 return (NULL); 20161 20162 return (mp); 20163 } 20164 20165 /* 20166 * Write side put procedure. Outbound data, IOCTLs, responses from 20167 * resolvers, etc, come down through here. 20168 * 20169 * arg2 is always a queue_t *. 20170 * When that queue is an ill_t (i.e. q_next != NULL), then arg must be 20171 * the zoneid. 20172 * When that queue is not an ill_t, then arg must be a conn_t pointer. 20173 */ 20174 void 20175 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 20176 { 20177 ip_output_options(arg, mp, arg2, caller, &zero_info); 20178 } 20179 20180 void 20181 ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, 20182 ip_opt_info_t *infop) 20183 { 20184 conn_t *connp = NULL; 20185 queue_t *q = (queue_t *)arg2; 20186 ipha_t *ipha; 20187 #define rptr ((uchar_t *)ipha) 20188 ire_t *ire = NULL; 20189 ire_t *sctp_ire = NULL; 20190 uint32_t v_hlen_tos_len; 20191 ipaddr_t dst; 20192 mblk_t *first_mp = NULL; 20193 boolean_t mctl_present; 20194 ipsec_out_t *io; 20195 int match_flags; 20196 ill_t *attach_ill = NULL; 20197 /* Bind to IPIF_NOFAILOVER ill etc. */ 20198 ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. 
*/ 20199 ipif_t *dst_ipif; 20200 boolean_t multirt_need_resolve = B_FALSE; 20201 mblk_t *copy_mp = NULL; 20202 int err; 20203 zoneid_t zoneid; 20204 boolean_t need_decref = B_FALSE; 20205 boolean_t ignore_dontroute = B_FALSE; 20206 boolean_t ignore_nexthop = B_FALSE; 20207 boolean_t ip_nexthop = B_FALSE; 20208 ipaddr_t nexthop_addr; 20209 ip_stack_t *ipst; 20210 20211 #ifdef _BIG_ENDIAN 20212 #define V_HLEN (v_hlen_tos_len >> 24) 20213 #else 20214 #define V_HLEN (v_hlen_tos_len & 0xFF) 20215 #endif 20216 20217 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 20218 "ip_wput_start: q %p", q); 20219 20220 /* 20221 * ip_wput fast path 20222 */ 20223 20224 /* is packet from ARP ? */ 20225 if (q->q_next != NULL) { 20226 zoneid = (zoneid_t)(uintptr_t)arg; 20227 goto qnext; 20228 } 20229 20230 connp = (conn_t *)arg; 20231 ASSERT(connp != NULL); 20232 zoneid = connp->conn_zoneid; 20233 ipst = connp->conn_netstack->netstack_ip; 20234 20235 /* is queue flow controlled? */ 20236 if ((q->q_first != NULL || connp->conn_draining) && 20237 (caller == IP_WPUT)) { 20238 ASSERT(!need_decref); 20239 (void) putq(q, mp); 20240 return; 20241 } 20242 20243 /* Multidata transmit? */ 20244 if (DB_TYPE(mp) == M_MULTIDATA) { 20245 /* 20246 * We should never get here, since all Multidata messages 20247 * originating from tcp should have been directed over to 20248 * tcp_multisend() in the first place. 
20249 */ 20250 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20251 freemsg(mp); 20252 return; 20253 } else if (DB_TYPE(mp) != M_DATA) 20254 goto notdata; 20255 20256 if (mp->b_flag & MSGHASREF) { 20257 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20258 mp->b_flag &= ~MSGHASREF; 20259 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 20260 need_decref = B_TRUE; 20261 } 20262 ipha = (ipha_t *)mp->b_rptr; 20263 20264 /* is IP header non-aligned or mblk smaller than basic IP header */ 20265 #ifndef SAFETY_BEFORE_SPEED 20266 if (!OK_32PTR(rptr) || 20267 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 20268 goto hdrtoosmall; 20269 #endif 20270 20271 ASSERT(OK_32PTR(ipha)); 20272 20273 /* 20274 * This function assumes that mp points to an IPv4 packet. If it's the 20275 * wrong version, we'll catch it again in ip_output_v6. 20276 * 20277 * Note that this is *only* locally-generated output here, and never 20278 * forwarded data, and that we need to deal only with transports that 20279 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 20280 * label.) 20281 */ 20282 if (is_system_labeled() && 20283 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 20284 !connp->conn_ulp_labeled) { 20285 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20286 connp->conn_mac_exempt, ipst); 20287 ipha = (ipha_t *)mp->b_rptr; 20288 if (err != 0) { 20289 first_mp = mp; 20290 if (err == EINVAL) 20291 goto icmp_parameter_problem; 20292 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 20293 goto discard_pkt; 20294 } 20295 } 20296 20297 ASSERT(infop != NULL); 20298 20299 if (infop->ip_opt_flags & IP_VERIFY_SRC) { 20300 /* 20301 * IP_PKTINFO ancillary option is present. 20302 * IPCL_ZONEID is used to honor IP_ALLZONES option which 20303 * allows using address of any zone as the source address. 
20304 */ 20305 ire = ire_ctable_lookup(ipha->ipha_src, 0, 20306 (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), 20307 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 20308 if (ire == NULL) 20309 goto drop_pkt; 20310 ire_refrele(ire); 20311 ire = NULL; 20312 } 20313 20314 /* 20315 * IP_DONTFAILOVER_IF and IP_BOUND_IF have precedence over ill index 20316 * passed in IP_PKTINFO. 20317 */ 20318 if (infop->ip_opt_ill_index != 0 && 20319 connp->conn_outgoing_ill == NULL && 20320 connp->conn_nofailover_ill == NULL) { 20321 20322 xmit_ill = ill_lookup_on_ifindex( 20323 infop->ip_opt_ill_index, B_FALSE, NULL, NULL, NULL, NULL, 20324 ipst); 20325 20326 if (xmit_ill == NULL || IS_VNI(xmit_ill)) 20327 goto drop_pkt; 20328 /* 20329 * check that there is an ipif belonging 20330 * to our zone. IPCL_ZONEID is not used because 20331 * IP_ALLZONES option is valid only when the ill is 20332 * accessible from all zones i.e has a valid ipif in 20333 * all zones. 20334 */ 20335 if (!ipif_lookup_zoneid_group(xmit_ill, zoneid, 0, NULL)) { 20336 goto drop_pkt; 20337 } 20338 } 20339 20340 /* 20341 * If there is a policy, try to attach an ipsec_out in 20342 * the front. At the end, first_mp either points to a 20343 * M_DATA message or IPSEC_OUT message linked to a 20344 * M_DATA message. We have to do it now as we might 20345 * lose the "conn" if we go through ip_newroute. 
20346 */ 20347 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 20348 if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, 20349 ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { 20350 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20351 if (need_decref) 20352 CONN_DEC_REF(connp); 20353 return; 20354 } else { 20355 ASSERT(mp->b_datap->db_type == M_CTL); 20356 first_mp = mp; 20357 mp = mp->b_cont; 20358 mctl_present = B_TRUE; 20359 } 20360 } else { 20361 first_mp = mp; 20362 mctl_present = B_FALSE; 20363 } 20364 20365 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20366 20367 /* is wrong version or IP options present */ 20368 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 20369 goto version_hdrlen_check; 20370 dst = ipha->ipha_dst; 20371 20372 if (connp->conn_nofailover_ill != NULL) { 20373 attach_ill = conn_get_held_ill(connp, 20374 &connp->conn_nofailover_ill, &err); 20375 if (err == ILL_LOOKUP_FAILED) { 20376 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20377 if (need_decref) 20378 CONN_DEC_REF(connp); 20379 freemsg(first_mp); 20380 return; 20381 } 20382 } 20383 20384 /* If IP_BOUND_IF has been set, use that ill. */ 20385 if (connp->conn_outgoing_ill != NULL) { 20386 xmit_ill = conn_get_held_ill(connp, 20387 &connp->conn_outgoing_ill, &err); 20388 if (err == ILL_LOOKUP_FAILED) 20389 goto drop_pkt; 20390 20391 goto send_from_ill; 20392 } 20393 20394 /* is packet multicast? */ 20395 if (CLASSD(dst)) 20396 goto multicast; 20397 20398 /* 20399 * If xmit_ill is set above due to index passed in ip_pkt_info. It 20400 * takes precedence over conn_dontroute and conn_nexthop_set 20401 */ 20402 if (xmit_ill != NULL) 20403 goto send_from_ill; 20404 20405 if (connp->conn_dontroute || connp->conn_nexthop_set) { 20406 /* 20407 * If the destination is a broadcast, local, or loopback 20408 * address, SO_DONTROUTE and IP_NEXTHOP go through the 20409 * standard path. 
20410 */ 20411 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20412 if ((ire == NULL) || (ire->ire_type & 20413 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { 20414 if (ire != NULL) { 20415 ire_refrele(ire); 20416 /* No more access to ire */ 20417 ire = NULL; 20418 } 20419 /* 20420 * bypass routing checks and go directly to interface. 20421 */ 20422 if (connp->conn_dontroute) 20423 goto dontroute; 20424 20425 ASSERT(connp->conn_nexthop_set); 20426 ip_nexthop = B_TRUE; 20427 nexthop_addr = connp->conn_nexthop_v4; 20428 goto send_from_ill; 20429 } 20430 20431 /* Must be a broadcast, a loopback or a local ire */ 20432 ire_refrele(ire); 20433 /* No more access to ire */ 20434 ire = NULL; 20435 } 20436 20437 if (attach_ill != NULL) 20438 goto send_from_ill; 20439 20440 /* 20441 * We cache IRE_CACHEs to avoid lookups. We don't do 20442 * this for the tcp global queue and listen end point 20443 * as it does not really have a real destination to 20444 * talk to. This is also true for SCTP. 20445 */ 20446 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 20447 !connp->conn_fully_bound) { 20448 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20449 if (ire == NULL) 20450 goto noirefound; 20451 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20452 "ip_wput_end: q %p (%S)", q, "end"); 20453 20454 /* 20455 * Check if the ire has the RTF_MULTIRT flag, inherited 20456 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20457 */ 20458 if (ire->ire_flags & RTF_MULTIRT) { 20459 20460 /* 20461 * Force the TTL of multirouted packets if required. 20462 * The TTL of such packets is bounded by the 20463 * ip_multirt_ttl ndd variable. 
20464 */ 20465 if ((ipst->ips_ip_multirt_ttl > 0) && 20466 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20467 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20468 "(was %d), dst 0x%08x\n", 20469 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20470 ntohl(ire->ire_addr))); 20471 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20472 } 20473 /* 20474 * We look at this point if there are pending 20475 * unresolved routes. ire_multirt_resolvable() 20476 * checks in O(n) that all IRE_OFFSUBNET ire 20477 * entries for the packet's destination and 20478 * flagged RTF_MULTIRT are currently resolved. 20479 * If some remain unresolved, we make a copy 20480 * of the current message. It will be used 20481 * to initiate additional route resolutions. 20482 */ 20483 multirt_need_resolve = 20484 ire_multirt_need_resolve(ire->ire_addr, 20485 MBLK_GETLABEL(first_mp), ipst); 20486 ip2dbg(("ip_wput[TCP]: ire %p, " 20487 "multirt_need_resolve %d, first_mp %p\n", 20488 (void *)ire, multirt_need_resolve, 20489 (void *)first_mp)); 20490 if (multirt_need_resolve) { 20491 copy_mp = copymsg(first_mp); 20492 if (copy_mp != NULL) { 20493 MULTIRT_DEBUG_TAG(copy_mp); 20494 } 20495 } 20496 } 20497 20498 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20499 20500 /* 20501 * Try to resolve another multiroute if 20502 * ire_multirt_need_resolve() deemed it necessary. 20503 */ 20504 if (copy_mp != NULL) 20505 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20506 if (need_decref) 20507 CONN_DEC_REF(connp); 20508 return; 20509 } 20510 20511 /* 20512 * Access to conn_ire_cache. (protected by conn_lock) 20513 * 20514 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 20515 * the ire bucket lock here to check for CONDEMNED as it is okay to 20516 * send a packet or two with the IRE_CACHE that is going away. 20517 * Access to the ire requires an ire refhold on the ire prior to 20518 * its use since an interface unplumb thread may delete the cached 20519 * ire and release the refhold at any time. 
20520 * 20521 * Caching an ire in the conn_ire_cache 20522 * 20523 * o Caching an ire pointer in the conn requires a strict check for 20524 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 20525 * ires before cleaning up the conns. So the caching of an ire pointer 20526 * in the conn is done after making sure under the bucket lock that the 20527 * ire has not yet been marked CONDEMNED. Otherwise we will end up 20528 * caching an ire after the unplumb thread has cleaned up the conn. 20529 * If the conn does not send a packet subsequently the unplumb thread 20530 * will be hanging waiting for the ire count to drop to zero. 20531 * 20532 * o We also need to atomically test for a null conn_ire_cache and 20533 * set the conn_ire_cache under the the protection of the conn_lock 20534 * to avoid races among concurrent threads trying to simultaneously 20535 * cache an ire in the conn_ire_cache. 20536 */ 20537 mutex_enter(&connp->conn_lock); 20538 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 20539 20540 if (ire != NULL && ire->ire_addr == dst && 20541 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20542 20543 IRE_REFHOLD(ire); 20544 mutex_exit(&connp->conn_lock); 20545 20546 } else { 20547 boolean_t cached = B_FALSE; 20548 connp->conn_ire_cache = NULL; 20549 mutex_exit(&connp->conn_lock); 20550 /* Release the old ire */ 20551 if (ire != NULL && sctp_ire == NULL) 20552 IRE_REFRELE_NOTR(ire); 20553 20554 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 20555 if (ire == NULL) 20556 goto noirefound; 20557 IRE_REFHOLD_NOTR(ire); 20558 20559 mutex_enter(&connp->conn_lock); 20560 if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) { 20561 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20562 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 20563 if (connp->conn_ulp == IPPROTO_TCP) 20564 TCP_CHECK_IREINFO(connp->conn_tcp, ire); 20565 connp->conn_ire_cache = ire; 20566 cached = B_TRUE; 20567 } 20568 
rw_exit(&ire->ire_bucket->irb_lock); 20569 } 20570 mutex_exit(&connp->conn_lock); 20571 20572 /* 20573 * We can continue to use the ire but since it was 20574 * not cached, we should drop the extra reference. 20575 */ 20576 if (!cached) 20577 IRE_REFRELE_NOTR(ire); 20578 } 20579 20580 20581 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20582 "ip_wput_end: q %p (%S)", q, "end"); 20583 20584 /* 20585 * Check if the ire has the RTF_MULTIRT flag, inherited 20586 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 20587 */ 20588 if (ire->ire_flags & RTF_MULTIRT) { 20589 20590 /* 20591 * Force the TTL of multirouted packets if required. 20592 * The TTL of such packets is bounded by the 20593 * ip_multirt_ttl ndd variable. 20594 */ 20595 if ((ipst->ips_ip_multirt_ttl > 0) && 20596 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 20597 ip2dbg(("ip_wput: forcing multirt TTL to %d " 20598 "(was %d), dst 0x%08x\n", 20599 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 20600 ntohl(ire->ire_addr))); 20601 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 20602 } 20603 20604 /* 20605 * At this point, we check to see if there are any pending 20606 * unresolved routes. ire_multirt_resolvable() 20607 * checks in O(n) that all IRE_OFFSUBNET ire 20608 * entries for the packet's destination and 20609 * flagged RTF_MULTIRT are currently resolved. 20610 * If some remain unresolved, we make a copy 20611 * of the current message. It will be used 20612 * to initiate additional route resolutions. 
20613 */ 20614 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 20615 MBLK_GETLABEL(first_mp), ipst); 20616 ip2dbg(("ip_wput[not TCP]: ire %p, " 20617 "multirt_need_resolve %d, first_mp %p\n", 20618 (void *)ire, multirt_need_resolve, (void *)first_mp)); 20619 if (multirt_need_resolve) { 20620 copy_mp = copymsg(first_mp); 20621 if (copy_mp != NULL) { 20622 MULTIRT_DEBUG_TAG(copy_mp); 20623 } 20624 } 20625 } 20626 20627 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 20628 20629 /* 20630 * Try to resolve another multiroute if 20631 * ire_multirt_resolvable() deemed it necessary 20632 */ 20633 if (copy_mp != NULL) 20634 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 20635 if (need_decref) 20636 CONN_DEC_REF(connp); 20637 return; 20638 20639 qnext: 20640 /* 20641 * Upper Level Protocols pass down complete IP datagrams 20642 * as M_DATA messages. Everything else is a sideshow. 20643 * 20644 * 1) We could be re-entering ip_wput because of ip_neworute 20645 * in which case we could have a IPSEC_OUT message. We 20646 * need to pass through ip_wput like other datagrams and 20647 * hence cannot branch to ip_wput_nondata. 20648 * 20649 * 2) ARP, AH, ESP, and other clients who are on the module 20650 * instance of IP stream, give us something to deal with. 20651 * We will handle AH and ESP here and rest in ip_wput_nondata. 20652 * 20653 * 3) ICMP replies also could come here. 20654 */ 20655 ipst = ILLQ_TO_IPST(q); 20656 20657 if (DB_TYPE(mp) != M_DATA) { 20658 notdata: 20659 if (DB_TYPE(mp) == M_CTL) { 20660 /* 20661 * M_CTL messages are used by ARP, AH and ESP to 20662 * communicate with IP. We deal with IPSEC_IN and 20663 * IPSEC_OUT here. ip_wput_nondata handles other 20664 * cases. 
20665 */ 20666 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 20667 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 20668 first_mp = mp->b_cont; 20669 first_mp->b_flag &= ~MSGHASREF; 20670 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 20671 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 20672 CONN_DEC_REF(connp); 20673 connp = NULL; 20674 } 20675 if (ii->ipsec_info_type == IPSEC_IN) { 20676 /* 20677 * Either this message goes back to 20678 * IPsec for further processing or to 20679 * ULP after policy checks. 20680 */ 20681 ip_fanout_proto_again(mp, NULL, NULL, NULL); 20682 return; 20683 } else if (ii->ipsec_info_type == IPSEC_OUT) { 20684 io = (ipsec_out_t *)ii; 20685 if (io->ipsec_out_proc_begin) { 20686 /* 20687 * IPsec processing has already started. 20688 * Complete it. 20689 * IPQoS notes: We don't care what is 20690 * in ipsec_out_ill_index since this 20691 * won't be processed for IPQoS policies 20692 * in ipsec_out_process. 20693 */ 20694 ipsec_out_process(q, mp, NULL, 20695 io->ipsec_out_ill_index); 20696 return; 20697 } else { 20698 connp = (q->q_next != NULL) ? 20699 NULL : Q_TO_CONN(q); 20700 first_mp = mp; 20701 mp = mp->b_cont; 20702 mctl_present = B_TRUE; 20703 } 20704 zoneid = io->ipsec_out_zoneid; 20705 ASSERT(zoneid != ALL_ZONES); 20706 } else if (ii->ipsec_info_type == IPSEC_CTL) { 20707 /* 20708 * It's an IPsec control message requesting 20709 * an SADB update to be sent to the IPsec 20710 * hardware acceleration capable ills. 20711 */ 20712 ipsec_ctl_t *ipsec_ctl = 20713 (ipsec_ctl_t *)mp->b_rptr; 20714 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 20715 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 20716 mblk_t *cmp = mp->b_cont; 20717 20718 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 20719 ASSERT(cmp != NULL); 20720 20721 freeb(mp); 20722 ill_ipsec_capab_send_all(satype, cmp, sa, 20723 ipst->ips_netstack); 20724 return; 20725 } else { 20726 /* 20727 * This must be ARP or special TSOL signaling. 
20728 */ 20729 ip_wput_nondata(NULL, q, mp, NULL); 20730 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20731 "ip_wput_end: q %p (%S)", q, "nondata"); 20732 return; 20733 } 20734 } else { 20735 /* 20736 * This must be non-(ARP/AH/ESP) messages. 20737 */ 20738 ASSERT(!need_decref); 20739 ip_wput_nondata(NULL, q, mp, NULL); 20740 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20741 "ip_wput_end: q %p (%S)", q, "nondata"); 20742 return; 20743 } 20744 } else { 20745 first_mp = mp; 20746 mctl_present = B_FALSE; 20747 } 20748 20749 ASSERT(first_mp != NULL); 20750 /* 20751 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 20752 * to make sure that this packet goes out on the same interface it 20753 * came in. We handle that here. 20754 */ 20755 if (mctl_present) { 20756 uint_t ifindex; 20757 20758 io = (ipsec_out_t *)first_mp->b_rptr; 20759 if (io->ipsec_out_attach_if || io->ipsec_out_ip_nexthop) { 20760 /* 20761 * We may have lost the conn context if we are 20762 * coming here from ip_newroute(). Copy the 20763 * nexthop information. 20764 */ 20765 if (io->ipsec_out_ip_nexthop) { 20766 ip_nexthop = B_TRUE; 20767 nexthop_addr = io->ipsec_out_nexthop_addr; 20768 20769 ipha = (ipha_t *)mp->b_rptr; 20770 dst = ipha->ipha_dst; 20771 goto send_from_ill; 20772 } else { 20773 ASSERT(io->ipsec_out_ill_index != 0); 20774 ifindex = io->ipsec_out_ill_index; 20775 attach_ill = ill_lookup_on_ifindex(ifindex, 20776 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20777 if (attach_ill == NULL) { 20778 ASSERT(xmit_ill == NULL); 20779 ip1dbg(("ip_output: bad ifindex for " 20780 "(BIND TO IPIF_NOFAILOVER) %d\n", 20781 ifindex)); 20782 freemsg(first_mp); 20783 BUMP_MIB(&ipst->ips_ip_mib, 20784 ipIfStatsOutDiscards); 20785 ASSERT(!need_decref); 20786 return; 20787 } 20788 } 20789 } 20790 } 20791 20792 ASSERT(xmit_ill == NULL); 20793 20794 /* We have a complete IP datagram heading outbound. 
*/ 20795 ipha = (ipha_t *)mp->b_rptr; 20796 20797 #ifndef SPEED_BEFORE_SAFETY 20798 /* 20799 * Make sure we have a full-word aligned message and that at least 20800 * a simple IP header is accessible in the first message. If not, 20801 * try a pullup. For labeled systems we need to always take this 20802 * path as M_CTLs are "notdata" but have trailing data to process. 20803 */ 20804 if (!OK_32PTR(rptr) || 20805 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { 20806 hdrtoosmall: 20807 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 20808 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20809 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 20810 if (first_mp == NULL) 20811 first_mp = mp; 20812 goto discard_pkt; 20813 } 20814 20815 /* This function assumes that mp points to an IPv4 packet. */ 20816 if (is_system_labeled() && q->q_next == NULL && 20817 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 20818 !connp->conn_ulp_labeled) { 20819 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 20820 connp->conn_mac_exempt, ipst); 20821 ipha = (ipha_t *)mp->b_rptr; 20822 if (first_mp != NULL) 20823 first_mp->b_cont = mp; 20824 if (err != 0) { 20825 if (first_mp == NULL) 20826 first_mp = mp; 20827 if (err == EINVAL) 20828 goto icmp_parameter_problem; 20829 ip2dbg(("ip_wput: label check failed (%d)\n", 20830 err)); 20831 goto discard_pkt; 20832 } 20833 } 20834 20835 ipha = (ipha_t *)mp->b_rptr; 20836 if (first_mp == NULL) { 20837 ASSERT(attach_ill == NULL && xmit_ill == NULL); 20838 /* 20839 * If we got here because of "goto hdrtoosmall" 20840 * We need to attach a IPSEC_OUT. 
20841 */ 20842 if (connp->conn_out_enforce_policy) { 20843 if (((mp = ipsec_attach_ipsec_out(&mp, connp, 20844 NULL, ipha->ipha_protocol, 20845 ipst->ips_netstack)) == NULL)) { 20846 BUMP_MIB(&ipst->ips_ip_mib, 20847 ipIfStatsOutDiscards); 20848 if (need_decref) 20849 CONN_DEC_REF(connp); 20850 return; 20851 } else { 20852 ASSERT(mp->b_datap->db_type == M_CTL); 20853 first_mp = mp; 20854 mp = mp->b_cont; 20855 mctl_present = B_TRUE; 20856 } 20857 } else { 20858 first_mp = mp; 20859 mctl_present = B_FALSE; 20860 } 20861 } 20862 } 20863 #endif 20864 20865 /* Most of the code below is written for speed, not readability */ 20866 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20867 20868 /* 20869 * If ip_newroute() fails, we're going to need a full 20870 * header for the icmp wraparound. 20871 */ 20872 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 20873 uint_t v_hlen; 20874 version_hdrlen_check: 20875 ASSERT(first_mp != NULL); 20876 v_hlen = V_HLEN; 20877 /* 20878 * siphon off IPv6 packets coming down from transport 20879 * layer modules here. 20880 * Note: high-order bit carries NUD reachability confirmation 20881 */ 20882 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 20883 /* 20884 * FIXME: assume that callers of ip_output* call 20885 * the right version? 20886 */ 20887 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); 20888 ASSERT(xmit_ill == NULL); 20889 if (attach_ill != NULL) 20890 ill_refrele(attach_ill); 20891 if (need_decref) 20892 mp->b_flag |= MSGHASREF; 20893 (void) ip_output_v6(arg, first_mp, arg2, caller); 20894 return; 20895 } 20896 20897 if ((v_hlen >> 4) != IP_VERSION) { 20898 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20899 "ip_wput_end: q %p (%S)", q, "badvers"); 20900 goto discard_pkt; 20901 } 20902 /* 20903 * Is the header length at least 20 bytes? 20904 * 20905 * Are there enough bytes accessible in the header? If 20906 * not, try a pullup. 
20907 */ 20908 v_hlen &= 0xF; 20909 v_hlen <<= 2; 20910 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 20911 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20912 "ip_wput_end: q %p (%S)", q, "badlen"); 20913 goto discard_pkt; 20914 } 20915 if (v_hlen > (mp->b_wptr - rptr)) { 20916 if (!pullupmsg(mp, v_hlen)) { 20917 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20918 "ip_wput_end: q %p (%S)", q, "badpullup2"); 20919 goto discard_pkt; 20920 } 20921 ipha = (ipha_t *)mp->b_rptr; 20922 } 20923 /* 20924 * Move first entry from any source route into ipha_dst and 20925 * verify the options 20926 */ 20927 if (ip_wput_options(q, first_mp, ipha, mctl_present, 20928 zoneid, ipst)) { 20929 ASSERT(xmit_ill == NULL); 20930 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 20931 if (attach_ill != NULL) 20932 ill_refrele(attach_ill); 20933 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 20934 "ip_wput_end: q %p (%S)", q, "badopts"); 20935 if (need_decref) 20936 CONN_DEC_REF(connp); 20937 return; 20938 } 20939 } 20940 dst = ipha->ipha_dst; 20941 20942 /* 20943 * Try to get an IRE_CACHE for the destination address. If we can't, 20944 * we have to run the packet through ip_newroute which will take 20945 * the appropriate action to arrange for an IRE_CACHE, such as querying 20946 * a resolver, or assigning a default gateway, etc. 20947 */ 20948 if (CLASSD(dst)) { 20949 ipif_t *ipif; 20950 uint32_t setsrc = 0; 20951 20952 multicast: 20953 ASSERT(first_mp != NULL); 20954 ip2dbg(("ip_wput: CLASSD\n")); 20955 if (connp == NULL) { 20956 /* 20957 * Use the first good ipif on the ill. 20958 * XXX Should this ever happen? (Appears 20959 * to show up with just ppp and no ethernet due 20960 * to in.rdisc.) 20961 * However, ire_send should be able to 20962 * call ip_wput_ire directly. 20963 * 20964 * XXX Also, this can happen for ICMP and other packets 20965 * with multicast source addresses. Perhaps we should 20966 * fix things so that we drop the packet in question, 20967 * but for now, just run with it. 
20968 */ 20969 ill_t *ill = (ill_t *)q->q_ptr; 20970 20971 /* 20972 * Don't honor attach_if for this case. If ill 20973 * is part of the group, ipif could belong to 20974 * any ill and we cannot maintain attach_ill 20975 * and ipif_ill same anymore and the assert 20976 * below would fail. 20977 */ 20978 if (mctl_present && io->ipsec_out_attach_if) { 20979 io->ipsec_out_ill_index = 0; 20980 io->ipsec_out_attach_if = B_FALSE; 20981 ASSERT(attach_ill != NULL); 20982 ill_refrele(attach_ill); 20983 attach_ill = NULL; 20984 } 20985 20986 ASSERT(attach_ill == NULL); 20987 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 20988 if (ipif == NULL) { 20989 if (need_decref) 20990 CONN_DEC_REF(connp); 20991 freemsg(first_mp); 20992 return; 20993 } 20994 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 20995 ntohl(dst), ill->ill_name)); 20996 } else { 20997 /* 20998 * The order of precedence is IP_BOUND_IF, IP_PKTINFO 20999 * and IP_MULTICAST_IF. The block comment above this 21000 * function explains the locking mechanism used here. 
21001 */ 21002 if (xmit_ill == NULL) { 21003 xmit_ill = conn_get_held_ill(connp, 21004 &connp->conn_outgoing_ill, &err); 21005 if (err == ILL_LOOKUP_FAILED) { 21006 ip1dbg(("ip_wput: No ill for " 21007 "IP_BOUND_IF\n")); 21008 BUMP_MIB(&ipst->ips_ip_mib, 21009 ipIfStatsOutNoRoutes); 21010 goto drop_pkt; 21011 } 21012 } 21013 21014 if (xmit_ill == NULL) { 21015 ipif = conn_get_held_ipif(connp, 21016 &connp->conn_multicast_ipif, &err); 21017 if (err == IPIF_LOOKUP_FAILED) { 21018 ip1dbg(("ip_wput: No ipif for " 21019 "multicast\n")); 21020 BUMP_MIB(&ipst->ips_ip_mib, 21021 ipIfStatsOutNoRoutes); 21022 goto drop_pkt; 21023 } 21024 } 21025 if (xmit_ill != NULL) { 21026 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21027 if (ipif == NULL) { 21028 ip1dbg(("ip_wput: No ipif for " 21029 "xmit_ill\n")); 21030 BUMP_MIB(&ipst->ips_ip_mib, 21031 ipIfStatsOutNoRoutes); 21032 goto drop_pkt; 21033 } 21034 } else if (ipif == NULL || ipif->ipif_isv6) { 21035 /* 21036 * We must do this ipif determination here 21037 * else we could pass through ip_newroute 21038 * and come back here without the conn context. 21039 * 21040 * Note: we do late binding i.e. we bind to 21041 * the interface when the first packet is sent. 21042 * For performance reasons we do not rebind on 21043 * each packet but keep the binding until the 21044 * next IP_MULTICAST_IF option. 21045 * 21046 * conn_multicast_{ipif,ill} are shared between 21047 * IPv4 and IPv6 and AF_INET6 sockets can 21048 * send both IPv4 and IPv6 packets. Hence 21049 * we have to check that "isv6" matches above. 
21050 */ 21051 if (ipif != NULL) 21052 ipif_refrele(ipif); 21053 ipif = ipif_lookup_group(dst, zoneid, ipst); 21054 if (ipif == NULL) { 21055 ip1dbg(("ip_wput: No ipif for " 21056 "multicast\n")); 21057 BUMP_MIB(&ipst->ips_ip_mib, 21058 ipIfStatsOutNoRoutes); 21059 goto drop_pkt; 21060 } 21061 err = conn_set_held_ipif(connp, 21062 &connp->conn_multicast_ipif, ipif); 21063 if (err == IPIF_LOOKUP_FAILED) { 21064 ipif_refrele(ipif); 21065 ip1dbg(("ip_wput: No ipif for " 21066 "multicast\n")); 21067 BUMP_MIB(&ipst->ips_ip_mib, 21068 ipIfStatsOutNoRoutes); 21069 goto drop_pkt; 21070 } 21071 } 21072 } 21073 ASSERT(!ipif->ipif_isv6); 21074 /* 21075 * As we may lose the conn by the time we reach ip_wput_ire, 21076 * we copy conn_multicast_loop and conn_dontroute on to an 21077 * ipsec_out. In case if this datagram goes out secure, 21078 * we need the ill_index also. Copy that also into the 21079 * ipsec_out. 21080 */ 21081 if (mctl_present) { 21082 io = (ipsec_out_t *)first_mp->b_rptr; 21083 ASSERT(first_mp->b_datap->db_type == M_CTL); 21084 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21085 } else { 21086 ASSERT(mp == first_mp); 21087 if ((first_mp = allocb(sizeof (ipsec_info_t), 21088 BPRI_HI)) == NULL) { 21089 ipif_refrele(ipif); 21090 first_mp = mp; 21091 goto discard_pkt; 21092 } 21093 first_mp->b_datap->db_type = M_CTL; 21094 first_mp->b_wptr += sizeof (ipsec_info_t); 21095 /* ipsec_out_secure is B_FALSE now */ 21096 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 21097 io = (ipsec_out_t *)first_mp->b_rptr; 21098 io->ipsec_out_type = IPSEC_OUT; 21099 io->ipsec_out_len = sizeof (ipsec_out_t); 21100 io->ipsec_out_use_global_policy = B_TRUE; 21101 io->ipsec_out_ns = ipst->ips_netstack; 21102 first_mp->b_cont = mp; 21103 mctl_present = B_TRUE; 21104 } 21105 if (attach_ill != NULL) { 21106 ASSERT(attach_ill == ipif->ipif_ill); 21107 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21108 21109 /* 21110 * Check if we need an ire that will not be 21111 * looked up by anybody 
else i.e. HIDDEN. 21112 */ 21113 if (ill_is_probeonly(attach_ill)) { 21114 match_flags |= MATCH_IRE_MARK_HIDDEN; 21115 } 21116 io->ipsec_out_ill_index = 21117 attach_ill->ill_phyint->phyint_ifindex; 21118 io->ipsec_out_attach_if = B_TRUE; 21119 } else { 21120 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21121 io->ipsec_out_ill_index = 21122 ipif->ipif_ill->ill_phyint->phyint_ifindex; 21123 } 21124 if (connp != NULL) { 21125 io->ipsec_out_multicast_loop = 21126 connp->conn_multicast_loop; 21127 io->ipsec_out_dontroute = connp->conn_dontroute; 21128 io->ipsec_out_zoneid = connp->conn_zoneid; 21129 } 21130 /* 21131 * If the application uses IP_MULTICAST_IF with 21132 * different logical addresses of the same ILL, we 21133 * need to make sure that the soruce address of 21134 * the packet matches the logical IP address used 21135 * in the option. We do it by initializing ipha_src 21136 * here. This should keep IPsec also happy as 21137 * when we return from IPsec processing, we don't 21138 * have to worry about getting the right address on 21139 * the packet. Thus it is sufficient to look for 21140 * IRE_CACHE using MATCH_IRE_ILL rathen than 21141 * MATCH_IRE_IPIF. 21142 * 21143 * NOTE : We need to do it for non-secure case also as 21144 * this might go out secure if there is a global policy 21145 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 21146 * address, the source should be initialized already and 21147 * hence we won't be initializing here. 21148 * 21149 * As we do not have the ire yet, it is possible that 21150 * we set the source address here and then later discover 21151 * that the ire implies the source address to be assigned 21152 * through the RTF_SETSRC flag. 21153 * In that case, the setsrc variable will remind us 21154 * that overwritting the source address by the one 21155 * of the RTF_SETSRC-flagged ire is allowed. 
21156 */ 21157 if (ipha->ipha_src == INADDR_ANY && 21158 (connp == NULL || !connp->conn_unspec_src)) { 21159 ipha->ipha_src = ipif->ipif_src_addr; 21160 setsrc = RTF_SETSRC; 21161 } 21162 /* 21163 * Find an IRE which matches the destination and the outgoing 21164 * queue (i.e. the outgoing interface.) 21165 * For loopback use a unicast IP address for 21166 * the ire lookup. 21167 */ 21168 if (IS_LOOPBACK(ipif->ipif_ill)) 21169 dst = ipif->ipif_lcl_addr; 21170 21171 /* 21172 * If xmit_ill is set, we branch out to ip_newroute_ipif. 21173 * We don't need to lookup ire in ctable as the packet 21174 * needs to be sent to the destination through the specified 21175 * ill irrespective of ires in the cache table. 21176 */ 21177 ire = NULL; 21178 if (xmit_ill == NULL) { 21179 ire = ire_ctable_lookup(dst, 0, 0, ipif, 21180 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21181 } 21182 21183 /* 21184 * refrele attach_ill as its not needed anymore. 21185 */ 21186 if (attach_ill != NULL) { 21187 ill_refrele(attach_ill); 21188 attach_ill = NULL; 21189 } 21190 21191 if (ire == NULL) { 21192 /* 21193 * Multicast loopback and multicast forwarding is 21194 * done in ip_wput_ire. 21195 * 21196 * Mark this packet to make it be delivered to 21197 * ip_wput_ire after the new ire has been 21198 * created. 21199 * 21200 * The call to ip_newroute_ipif takes into account 21201 * the setsrc reminder. In any case, we take care 21202 * of the RTF_MULTIRT flag. 
21203 */ 21204 mp->b_prev = mp->b_next = NULL; 21205 if (xmit_ill == NULL || 21206 xmit_ill->ill_ipif_up_count > 0) { 21207 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 21208 setsrc | RTF_MULTIRT, zoneid, infop); 21209 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21210 "ip_wput_end: q %p (%S)", q, "noire"); 21211 } else { 21212 freemsg(first_mp); 21213 } 21214 ipif_refrele(ipif); 21215 if (xmit_ill != NULL) 21216 ill_refrele(xmit_ill); 21217 if (need_decref) 21218 CONN_DEC_REF(connp); 21219 return; 21220 } 21221 21222 ipif_refrele(ipif); 21223 ipif = NULL; 21224 ASSERT(xmit_ill == NULL); 21225 21226 /* 21227 * Honor the RTF_SETSRC flag for multicast packets, 21228 * if allowed by the setsrc reminder. 21229 */ 21230 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 21231 ipha->ipha_src = ire->ire_src_addr; 21232 } 21233 21234 /* 21235 * Unconditionally force the TTL to 1 for 21236 * multirouted multicast packets: 21237 * multirouted multicast should not cross 21238 * multicast routers. 21239 */ 21240 if (ire->ire_flags & RTF_MULTIRT) { 21241 if (ipha->ipha_ttl > 1) { 21242 ip2dbg(("ip_wput: forcing multicast " 21243 "multirt TTL to 1 (was %d), dst 0x%08x\n", 21244 ipha->ipha_ttl, ntohl(ire->ire_addr))); 21245 ipha->ipha_ttl = 1; 21246 } 21247 } 21248 } else { 21249 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), ipst); 21250 if ((ire != NULL) && (ire->ire_type & 21251 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 21252 ignore_dontroute = B_TRUE; 21253 ignore_nexthop = B_TRUE; 21254 } 21255 if (ire != NULL) { 21256 ire_refrele(ire); 21257 ire = NULL; 21258 } 21259 /* 21260 * Guard against coming in from arp in which case conn is NULL. 21261 * Also guard against non M_DATA with dontroute set but 21262 * destined to local, loopback or broadcast addresses. 
21263 */ 21264 if (connp != NULL && connp->conn_dontroute && 21265 !ignore_dontroute) { 21266 dontroute: 21267 /* 21268 * Set TTL to 1 if SO_DONTROUTE is set to prevent 21269 * routing protocols from seeing false direct 21270 * connectivity. 21271 */ 21272 ipha->ipha_ttl = 1; 21273 21274 /* If suitable ipif not found, drop packet */ 21275 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); 21276 if (dst_ipif == NULL) { 21277 noroute: 21278 ip1dbg(("ip_wput: no route for dst using" 21279 " SO_DONTROUTE\n")); 21280 BUMP_MIB(&ipst->ips_ip_mib, 21281 ipIfStatsOutNoRoutes); 21282 mp->b_prev = mp->b_next = NULL; 21283 if (first_mp == NULL) 21284 first_mp = mp; 21285 goto drop_pkt; 21286 } else { 21287 /* 21288 * If suitable ipif has been found, set 21289 * xmit_ill to the corresponding 21290 * ipif_ill because we'll be using the 21291 * send_from_ill logic below. 21292 */ 21293 ASSERT(xmit_ill == NULL); 21294 xmit_ill = dst_ipif->ipif_ill; 21295 mutex_enter(&xmit_ill->ill_lock); 21296 if (!ILL_CAN_LOOKUP(xmit_ill)) { 21297 mutex_exit(&xmit_ill->ill_lock); 21298 xmit_ill = NULL; 21299 ipif_refrele(dst_ipif); 21300 goto noroute; 21301 } 21302 ill_refhold_locked(xmit_ill); 21303 mutex_exit(&xmit_ill->ill_lock); 21304 ipif_refrele(dst_ipif); 21305 } 21306 } 21307 /* 21308 * If we are bound to IPIF_NOFAILOVER address, look for 21309 * an IRE_CACHE matching the ill. 21310 */ 21311 send_from_ill: 21312 if (attach_ill != NULL) { 21313 ipif_t *attach_ipif; 21314 21315 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 21316 21317 /* 21318 * Check if we need an ire that will not be 21319 * looked up by anybody else i.e. HIDDEN. 
21320 */ 21321 if (ill_is_probeonly(attach_ill)) { 21322 match_flags |= MATCH_IRE_MARK_HIDDEN; 21323 } 21324 21325 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 21326 if (attach_ipif == NULL) { 21327 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 21328 goto discard_pkt; 21329 } 21330 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 21331 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21332 ipif_refrele(attach_ipif); 21333 } else if (xmit_ill != NULL) { 21334 ipif_t *ipif; 21335 21336 /* 21337 * Mark this packet as originated locally 21338 */ 21339 mp->b_prev = mp->b_next = NULL; 21340 21341 /* 21342 * Could be SO_DONTROUTE case also. 21343 * Verify that at least one ipif is up on the ill. 21344 */ 21345 if (xmit_ill->ill_ipif_up_count == 0) { 21346 ip1dbg(("ip_output: xmit_ill %s is down\n", 21347 xmit_ill->ill_name)); 21348 goto drop_pkt; 21349 } 21350 21351 ipif = ipif_get_next_ipif(NULL, xmit_ill); 21352 if (ipif == NULL) { 21353 ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", 21354 xmit_ill->ill_name)); 21355 goto drop_pkt; 21356 } 21357 21358 /* 21359 * Look for a ire that is part of the group, 21360 * if found use it else call ip_newroute_ipif. 21361 * IPCL_ZONEID is not used for matching because 21362 * IP_ALLZONES option is valid only when the 21363 * ill is accessible from all zones i.e has a 21364 * valid ipif in all zones. 21365 */ 21366 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 21367 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 21368 MBLK_GETLABEL(mp), match_flags, ipst); 21369 /* 21370 * If an ire exists use it or else create 21371 * an ire but don't add it to the cache. 21372 * Adding an ire may cause issues with 21373 * asymmetric routing. 21374 * In case of multiroute always act as if 21375 * ire does not exist. 
21376 */ 21377 if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { 21378 if (ire != NULL) 21379 ire_refrele(ire); 21380 ip_newroute_ipif(q, first_mp, ipif, 21381 dst, connp, 0, zoneid, infop); 21382 ipif_refrele(ipif); 21383 ip1dbg(("ip_output: xmit_ill via %s\n", 21384 xmit_ill->ill_name)); 21385 ill_refrele(xmit_ill); 21386 if (need_decref) 21387 CONN_DEC_REF(connp); 21388 return; 21389 } 21390 ipif_refrele(ipif); 21391 } else if (ip_nexthop || (connp != NULL && 21392 (connp->conn_nexthop_set)) && !ignore_nexthop) { 21393 if (!ip_nexthop) { 21394 ip_nexthop = B_TRUE; 21395 nexthop_addr = connp->conn_nexthop_v4; 21396 } 21397 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 21398 MATCH_IRE_GW; 21399 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 21400 NULL, zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 21401 } else { 21402 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp), 21403 ipst); 21404 } 21405 if (!ire) { 21406 /* 21407 * Make sure we don't load spread if this 21408 * is IPIF_NOFAILOVER case. 
21409 */ 21410 if ((attach_ill != NULL) || 21411 (ip_nexthop && !ignore_nexthop)) { 21412 if (mctl_present) { 21413 io = (ipsec_out_t *)first_mp->b_rptr; 21414 ASSERT(first_mp->b_datap->db_type == 21415 M_CTL); 21416 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21417 } else { 21418 ASSERT(mp == first_mp); 21419 first_mp = allocb( 21420 sizeof (ipsec_info_t), BPRI_HI); 21421 if (first_mp == NULL) { 21422 first_mp = mp; 21423 goto discard_pkt; 21424 } 21425 first_mp->b_datap->db_type = M_CTL; 21426 first_mp->b_wptr += 21427 sizeof (ipsec_info_t); 21428 /* ipsec_out_secure is B_FALSE now */ 21429 bzero(first_mp->b_rptr, 21430 sizeof (ipsec_info_t)); 21431 io = (ipsec_out_t *)first_mp->b_rptr; 21432 io->ipsec_out_type = IPSEC_OUT; 21433 io->ipsec_out_len = 21434 sizeof (ipsec_out_t); 21435 io->ipsec_out_use_global_policy = 21436 B_TRUE; 21437 io->ipsec_out_ns = ipst->ips_netstack; 21438 first_mp->b_cont = mp; 21439 mctl_present = B_TRUE; 21440 } 21441 if (attach_ill != NULL) { 21442 io->ipsec_out_ill_index = attach_ill-> 21443 ill_phyint->phyint_ifindex; 21444 io->ipsec_out_attach_if = B_TRUE; 21445 } else { 21446 io->ipsec_out_ip_nexthop = ip_nexthop; 21447 io->ipsec_out_nexthop_addr = 21448 nexthop_addr; 21449 } 21450 } 21451 noirefound: 21452 /* 21453 * Mark this packet as having originated on 21454 * this machine. This will be noted in 21455 * ire_add_then_send, which needs to know 21456 * whether to run it back through ip_wput or 21457 * ip_rput following successful resolution. 21458 */ 21459 mp->b_prev = NULL; 21460 mp->b_next = NULL; 21461 ip_newroute(q, first_mp, dst, connp, zoneid, ipst); 21462 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21463 "ip_wput_end: q %p (%S)", q, "newroute"); 21464 if (attach_ill != NULL) 21465 ill_refrele(attach_ill); 21466 if (xmit_ill != NULL) 21467 ill_refrele(xmit_ill); 21468 if (need_decref) 21469 CONN_DEC_REF(connp); 21470 return; 21471 } 21472 } 21473 21474 /* We now know where we are going with it. 
*/ 21475 21476 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21477 "ip_wput_end: q %p (%S)", q, "end"); 21478 21479 /* 21480 * Check if the ire has the RTF_MULTIRT flag, inherited 21481 * from an IRE_OFFSUBNET ire entry in ip_newroute. 21482 */ 21483 if (ire->ire_flags & RTF_MULTIRT) { 21484 /* 21485 * Force the TTL of multirouted packets if required. 21486 * The TTL of such packets is bounded by the 21487 * ip_multirt_ttl ndd variable. 21488 */ 21489 if ((ipst->ips_ip_multirt_ttl > 0) && 21490 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 21491 ip2dbg(("ip_wput: forcing multirt TTL to %d " 21492 "(was %d), dst 0x%08x\n", 21493 ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, 21494 ntohl(ire->ire_addr))); 21495 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 21496 } 21497 /* 21498 * At this point, we check to see if there are any pending 21499 * unresolved routes. ire_multirt_resolvable() 21500 * checks in O(n) that all IRE_OFFSUBNET ire 21501 * entries for the packet's destination and 21502 * flagged RTF_MULTIRT are currently resolved. 21503 * If some remain unresolved, we make a copy 21504 * of the current message. It will be used 21505 * to initiate additional route resolutions. 21506 */ 21507 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 21508 MBLK_GETLABEL(first_mp), ipst); 21509 ip2dbg(("ip_wput[noirefound]: ire %p, " 21510 "multirt_need_resolve %d, first_mp %p\n", 21511 (void *)ire, multirt_need_resolve, (void *)first_mp)); 21512 if (multirt_need_resolve) { 21513 copy_mp = copymsg(first_mp); 21514 if (copy_mp != NULL) { 21515 MULTIRT_DEBUG_TAG(copy_mp); 21516 } 21517 } 21518 } 21519 21520 ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); 21521 /* 21522 * Try to resolve another multiroute if 21523 * ire_multirt_resolvable() deemed it necessary. 21524 * At this point, we need to distinguish 21525 * multicasts from other packets. For multicasts, 21526 * we call ip_newroute_ipif() and request that both 21527 * multirouting and setsrc flags are checked. 
21528 */ 21529 if (copy_mp != NULL) { 21530 if (CLASSD(dst)) { 21531 ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); 21532 if (ipif) { 21533 ASSERT(infop->ip_opt_ill_index == 0); 21534 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 21535 RTF_SETSRC | RTF_MULTIRT, zoneid, infop); 21536 ipif_refrele(ipif); 21537 } else { 21538 MULTIRT_DEBUG_UNTAG(copy_mp); 21539 freemsg(copy_mp); 21540 copy_mp = NULL; 21541 } 21542 } else { 21543 ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); 21544 } 21545 } 21546 if (attach_ill != NULL) 21547 ill_refrele(attach_ill); 21548 if (xmit_ill != NULL) 21549 ill_refrele(xmit_ill); 21550 if (need_decref) 21551 CONN_DEC_REF(connp); 21552 return; 21553 21554 icmp_parameter_problem: 21555 /* could not have originated externally */ 21556 ASSERT(mp->b_prev == NULL); 21557 if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { 21558 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 21559 /* it's the IP header length that's in trouble */ 21560 icmp_param_problem(q, first_mp, 0, zoneid, ipst); 21561 first_mp = NULL; 21562 } 21563 21564 discard_pkt: 21565 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 21566 drop_pkt: 21567 ip1dbg(("ip_wput: dropped packet\n")); 21568 if (ire != NULL) 21569 ire_refrele(ire); 21570 if (need_decref) 21571 CONN_DEC_REF(connp); 21572 freemsg(first_mp); 21573 if (attach_ill != NULL) 21574 ill_refrele(attach_ill); 21575 if (xmit_ill != NULL) 21576 ill_refrele(xmit_ill); 21577 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 21578 "ip_wput_end: q %p (%S)", q, "droppkt"); 21579 } 21580 21581 /* 21582 * If this is a conn_t queue, then we pass in the conn. This includes the 21583 * zoneid. 21584 * Otherwise, this is a message coming back from ARP or for an ill_t queue, 21585 * in which case we use the global zoneid since those are all part of 21586 * the global zone. 
21587 */ 21588 void 21589 ip_wput(queue_t *q, mblk_t *mp) 21590 { 21591 if (CONN_Q(q)) 21592 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 21593 else 21594 ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); 21595 } 21596 21597 /* 21598 * 21599 * The following rules must be observed when accessing any ipif or ill 21600 * that has been cached in the conn. Typically conn_nofailover_ill, 21601 * conn_outgoing_ill, conn_multicast_ipif and conn_multicast_ill. 21602 * 21603 * Access: The ipif or ill pointed to from the conn can be accessed under 21604 * the protection of the conn_lock or after it has been refheld under the 21605 * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or 21606 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. 21607 * The reason for this is that a concurrent unplumb could actually be 21608 * cleaning up these cached pointers by walking the conns and might have 21609 * finished cleaning up the conn in question. The macros check that an 21610 * unplumb has not yet started on the ipif or ill. 21611 * 21612 * Caching: An ipif or ill pointer may be cached in the conn only after 21613 * making sure that an unplumb has not started. So the caching is done 21614 * while holding both the conn_lock and the ill_lock and after using the 21615 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED 21616 * flag before starting the cleanup of conns. 21617 * 21618 * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock 21619 * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock 21620 * or a reference to the ipif or a reference to an ire that references the 21621 * ipif. An ipif does not change its ill except for failover/failback. Since 21622 * failover/failback happens only after bringing down the ipif and making sure 21623 * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock 21624 * the above holds. 
21625 */ 21626 ipif_t * 21627 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 21628 { 21629 ipif_t *ipif; 21630 ill_t *ill; 21631 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 21632 21633 *err = 0; 21634 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21635 mutex_enter(&connp->conn_lock); 21636 ipif = *ipifp; 21637 if (ipif != NULL) { 21638 ill = ipif->ipif_ill; 21639 mutex_enter(&ill->ill_lock); 21640 if (IPIF_CAN_LOOKUP(ipif)) { 21641 ipif_refhold_locked(ipif); 21642 mutex_exit(&ill->ill_lock); 21643 mutex_exit(&connp->conn_lock); 21644 rw_exit(&ipst->ips_ill_g_lock); 21645 return (ipif); 21646 } else { 21647 *err = IPIF_LOOKUP_FAILED; 21648 } 21649 mutex_exit(&ill->ill_lock); 21650 } 21651 mutex_exit(&connp->conn_lock); 21652 rw_exit(&ipst->ips_ill_g_lock); 21653 return (NULL); 21654 } 21655 21656 ill_t * 21657 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 21658 { 21659 ill_t *ill; 21660 21661 *err = 0; 21662 mutex_enter(&connp->conn_lock); 21663 ill = *illp; 21664 if (ill != NULL) { 21665 mutex_enter(&ill->ill_lock); 21666 if (ILL_CAN_LOOKUP(ill)) { 21667 ill_refhold_locked(ill); 21668 mutex_exit(&ill->ill_lock); 21669 mutex_exit(&connp->conn_lock); 21670 return (ill); 21671 } else { 21672 *err = ILL_LOOKUP_FAILED; 21673 } 21674 mutex_exit(&ill->ill_lock); 21675 } 21676 mutex_exit(&connp->conn_lock); 21677 return (NULL); 21678 } 21679 21680 static int 21681 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 21682 { 21683 ill_t *ill; 21684 21685 ill = ipif->ipif_ill; 21686 mutex_enter(&connp->conn_lock); 21687 mutex_enter(&ill->ill_lock); 21688 if (IPIF_CAN_LOOKUP(ipif)) { 21689 *ipifp = ipif; 21690 mutex_exit(&ill->ill_lock); 21691 mutex_exit(&connp->conn_lock); 21692 return (0); 21693 } 21694 mutex_exit(&ill->ill_lock); 21695 mutex_exit(&connp->conn_lock); 21696 return (IPIF_LOOKUP_FAILED); 21697 } 21698 21699 /* 21700 * This is called if the outbound datagram needs fragmentation. 
21701 * 21702 * NOTE : This function does not ire_refrele the ire argument passed in. 21703 */ 21704 static void 21705 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, 21706 ip_stack_t *ipst) 21707 { 21708 ipha_t *ipha; 21709 mblk_t *mp; 21710 uint32_t v_hlen_tos_len; 21711 uint32_t max_frag; 21712 uint32_t frag_flag; 21713 boolean_t dont_use; 21714 21715 if (ipsec_mp->b_datap->db_type == M_CTL) { 21716 mp = ipsec_mp->b_cont; 21717 } else { 21718 mp = ipsec_mp; 21719 } 21720 21721 ipha = (ipha_t *)mp->b_rptr; 21722 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 21723 21724 #ifdef _BIG_ENDIAN 21725 #define V_HLEN (v_hlen_tos_len >> 24) 21726 #define LENGTH (v_hlen_tos_len & 0xFFFF) 21727 #else 21728 #define V_HLEN (v_hlen_tos_len & 0xFF) 21729 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 21730 #endif 21731 21732 #ifndef SPEED_BEFORE_SAFETY 21733 /* 21734 * Check that ipha_length is consistent with 21735 * the mblk length 21736 */ 21737 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 21738 ip0dbg(("Packet length mismatch: %d, %ld\n", 21739 LENGTH, msgdsize(mp))); 21740 freemsg(ipsec_mp); 21741 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21742 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 21743 "packet length mismatch"); 21744 return; 21745 } 21746 #endif 21747 /* 21748 * Don't use frag_flag if pre-built packet or source 21749 * routed or if multicast (since multicast packets do not solicit 21750 * ICMP "packet too big" messages). Get the values of 21751 * max_frag and frag_flag atomically by acquiring the 21752 * ire_lock. 21753 */ 21754 mutex_enter(&ire->ire_lock); 21755 max_frag = ire->ire_max_frag; 21756 frag_flag = ire->ire_frag_flag; 21757 mutex_exit(&ire->ire_lock); 21758 21759 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 21760 (V_HLEN != IP_SIMPLE_HDR_VERSION && 21761 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 21762 21763 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 21764 (dont_use ? 
0 : frag_flag), zoneid, ipst); 21765 } 21766 21767 /* 21768 * Used for deciding the MSS size for the upper layer. Thus 21769 * we need to check the outbound policy values in the conn. 21770 */ 21771 int 21772 conn_ipsec_length(conn_t *connp) 21773 { 21774 ipsec_latch_t *ipl; 21775 21776 ipl = connp->conn_latch; 21777 if (ipl == NULL) 21778 return (0); 21779 21780 if (ipl->ipl_out_policy == NULL) 21781 return (0); 21782 21783 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 21784 } 21785 21786 /* 21787 * Returns an estimate of the IPsec headers size. This is used if 21788 * we don't want to call into IPsec to get the exact size. 21789 */ 21790 int 21791 ipsec_out_extra_length(mblk_t *ipsec_mp) 21792 { 21793 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 21794 ipsec_action_t *a; 21795 21796 ASSERT(io->ipsec_out_type == IPSEC_OUT); 21797 if (!io->ipsec_out_secure) 21798 return (0); 21799 21800 a = io->ipsec_out_act; 21801 21802 if (a == NULL) { 21803 ASSERT(io->ipsec_out_policy != NULL); 21804 a = io->ipsec_out_policy->ipsp_act; 21805 } 21806 ASSERT(a != NULL); 21807 21808 return (a->ipa_ovhd); 21809 } 21810 21811 /* 21812 * Returns an estimate of the IPsec headers size. This is used if 21813 * we don't want to call into IPsec to get the exact size. 21814 */ 21815 int 21816 ipsec_in_extra_length(mblk_t *ipsec_mp) 21817 { 21818 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21819 ipsec_action_t *a; 21820 21821 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21822 21823 a = ii->ipsec_in_action; 21824 return (a == NULL ? 0 : a->ipa_ovhd); 21825 } 21826 21827 /* 21828 * If there are any source route options, return the true final 21829 * destination. Otherwise, return the destination. 
21830 */ 21831 ipaddr_t 21832 ip_get_dst(ipha_t *ipha) 21833 { 21834 ipoptp_t opts; 21835 uchar_t *opt; 21836 uint8_t optval; 21837 uint8_t optlen; 21838 ipaddr_t dst; 21839 uint32_t off; 21840 21841 dst = ipha->ipha_dst; 21842 21843 if (IS_SIMPLE_IPH(ipha)) 21844 return (dst); 21845 21846 for (optval = ipoptp_first(&opts, ipha); 21847 optval != IPOPT_EOL; 21848 optval = ipoptp_next(&opts)) { 21849 opt = opts.ipoptp_cur; 21850 optlen = opts.ipoptp_len; 21851 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 21852 switch (optval) { 21853 case IPOPT_SSRR: 21854 case IPOPT_LSRR: 21855 off = opt[IPOPT_OFFSET]; 21856 /* 21857 * If one of the conditions is true, it means 21858 * end of options and dst already has the right 21859 * value. 21860 */ 21861 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 21862 off = optlen - IP_ADDR_LEN; 21863 bcopy(&opt[off], &dst, IP_ADDR_LEN); 21864 } 21865 return (dst); 21866 default: 21867 break; 21868 } 21869 } 21870 21871 return (dst); 21872 } 21873 21874 mblk_t * 21875 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 21876 conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) 21877 { 21878 ipsec_out_t *io; 21879 mblk_t *first_mp; 21880 boolean_t policy_present; 21881 ip_stack_t *ipst; 21882 ipsec_stack_t *ipss; 21883 21884 ASSERT(ire != NULL); 21885 ipst = ire->ire_ipst; 21886 ipss = ipst->ips_netstack->netstack_ipsec; 21887 21888 first_mp = mp; 21889 if (mp->b_datap->db_type == M_CTL) { 21890 io = (ipsec_out_t *)first_mp->b_rptr; 21891 /* 21892 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 21893 * 21894 * 1) There is per-socket policy (including cached global 21895 * policy) or a policy on the IP-in-IP tunnel. 21896 * 2) There is no per-socket policy, but it is 21897 * a multicast packet that needs to go out 21898 * on a specific interface. This is the case 21899 * where (ip_wput and ip_wput_multicast) attaches 21900 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 
21901 * 21902 * In case (2) we check with global policy to 21903 * see if there is a match and set the ill_index 21904 * appropriately so that we can lookup the ire 21905 * properly in ip_wput_ipsec_out. 21906 */ 21907 21908 /* 21909 * ipsec_out_use_global_policy is set to B_FALSE 21910 * in ipsec_in_to_out(). Refer to that function for 21911 * details. 21912 */ 21913 if ((io->ipsec_out_latch == NULL) && 21914 (io->ipsec_out_use_global_policy)) { 21915 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 21916 ire, connp, unspec_src, zoneid)); 21917 } 21918 if (!io->ipsec_out_secure) { 21919 /* 21920 * If this is not a secure packet, drop 21921 * the IPSEC_OUT mp and treat it as a clear 21922 * packet. This happens when we are sending 21923 * a ICMP reply back to a clear packet. See 21924 * ipsec_in_to_out() for details. 21925 */ 21926 mp = first_mp->b_cont; 21927 freeb(first_mp); 21928 } 21929 return (mp); 21930 } 21931 /* 21932 * See whether we need to attach a global policy here. We 21933 * don't depend on the conn (as it could be null) for deciding 21934 * what policy this datagram should go through because it 21935 * should have happened in ip_wput if there was some 21936 * policy. This normally happens for connections which are not 21937 * fully bound preventing us from caching policies in 21938 * ip_bind. Packets coming from the TCP listener/global queue 21939 * - which are non-hard_bound - could also be affected by 21940 * applying policy here. 21941 * 21942 * If this packet is coming from tcp global queue or listener, 21943 * we will be applying policy here. This may not be *right* 21944 * if these packets are coming from the detached connection as 21945 * it could have gone in clear before. This happens only if a 21946 * TCP connection started when there is no policy and somebody 21947 * added policy before it became detached. 
Thus packets of the 21948 * detached connection could go out secure and the other end 21949 * would drop it because it will be expecting in clear. The 21950 * converse is not true i.e if somebody starts a TCP 21951 * connection and deletes the policy, all the packets will 21952 * still go out with the policy that existed before deleting 21953 * because ip_unbind sends up policy information which is used 21954 * by TCP on subsequent ip_wputs. The right solution is to fix 21955 * TCP to attach a dummy IPSEC_OUT and set 21956 * ipsec_out_use_global_policy to B_FALSE. As this might 21957 * affect performance for normal cases, we are not doing it. 21958 * Thus, set policy before starting any TCP connections. 21959 * 21960 * NOTE - We might apply policy even for a hard bound connection 21961 * - for which we cached policy in ip_bind - if somebody added 21962 * global policy after we inherited the policy in ip_bind. 21963 * This means that the packets that were going out in clear 21964 * previously would start going secure and hence get dropped 21965 * on the other side. To fix this, TCP attaches a dummy 21966 * ipsec_out and make sure that we don't apply global policy. 
21967 */ 21968 if (ipha != NULL) 21969 policy_present = ipss->ipsec_outbound_v4_policy_present; 21970 else 21971 policy_present = ipss->ipsec_outbound_v6_policy_present; 21972 if (!policy_present) 21973 return (mp); 21974 21975 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, 21976 zoneid)); 21977 } 21978 21979 ire_t * 21980 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 21981 { 21982 ipaddr_t addr; 21983 ire_t *save_ire; 21984 irb_t *irb; 21985 ill_group_t *illgrp; 21986 int err; 21987 21988 save_ire = ire; 21989 addr = ire->ire_addr; 21990 21991 ASSERT(ire->ire_type == IRE_BROADCAST); 21992 21993 illgrp = connp->conn_outgoing_ill->ill_group; 21994 if (illgrp == NULL) { 21995 *conn_outgoing_ill = conn_get_held_ill(connp, 21996 &connp->conn_outgoing_ill, &err); 21997 if (err == ILL_LOOKUP_FAILED) { 21998 ire_refrele(save_ire); 21999 return (NULL); 22000 } 22001 return (save_ire); 22002 } 22003 /* 22004 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 22005 * If it is part of the group, we need to send on the ire 22006 * that has been cleared of IRE_MARK_NORECV and that belongs 22007 * to this group. This is okay as IP_BOUND_IF really means 22008 * any ill in the group. We depend on the fact that the 22009 * first ire in the group is always cleared of IRE_MARK_NORECV 22010 * if such an ire exists. This is possible only if you have 22011 * at least one ill in the group that has not failed. 22012 * 22013 * First get to the ire that matches the address and group. 22014 * 22015 * We don't look for an ire with a matching zoneid because a given zone 22016 * won't always have broadcast ires on all ills in the group. 
22017 */ 22018 irb = ire->ire_bucket; 22019 rw_enter(&irb->irb_lock, RW_READER); 22020 if (ire->ire_marks & IRE_MARK_NORECV) { 22021 /* 22022 * If the current zone only has an ire broadcast for this 22023 * address marked NORECV, the ire we want is ahead in the 22024 * bucket, so we look it up deliberately ignoring the zoneid. 22025 */ 22026 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 22027 if (ire->ire_addr != addr) 22028 continue; 22029 /* skip over deleted ires */ 22030 if (ire->ire_marks & IRE_MARK_CONDEMNED) 22031 continue; 22032 } 22033 } 22034 while (ire != NULL) { 22035 /* 22036 * If a new interface is coming up, we could end up 22037 * seeing the loopback ire and the non-loopback ire 22038 * may not have been added yet. So check for ire_stq 22039 */ 22040 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 22041 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 22042 break; 22043 } 22044 ire = ire->ire_next; 22045 } 22046 if (ire != NULL && ire->ire_addr == addr && 22047 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 22048 IRE_REFHOLD(ire); 22049 rw_exit(&irb->irb_lock); 22050 ire_refrele(save_ire); 22051 *conn_outgoing_ill = ire_to_ill(ire); 22052 /* 22053 * Refhold the ill to make the conn_outgoing_ill 22054 * independent of the ire. ip_wput_ire goes in a loop 22055 * and may refrele the ire. Since we have an ire at this 22056 * point we don't need to use ILL_CAN_LOOKUP on the ill. 22057 */ 22058 ill_refhold(*conn_outgoing_ill); 22059 return (ire); 22060 } 22061 rw_exit(&irb->irb_lock); 22062 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 22063 /* 22064 * If we can't find a suitable ire, return the original ire. 22065 */ 22066 return (save_ire); 22067 } 22068 22069 /* 22070 * This function does the ire_refrele of the ire passed in as the 22071 * argument. As this function looks up more ires i.e broadcast ires, 22072 * it needs to REFRELE them. 
Currently, for simplicity we don't 22073 * differentiate the one passed in and looked up here. We always 22074 * REFRELE. 22075 * IPQoS Notes: 22076 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 22077 * IPsec packets are done in ipsec_out_process. 22078 * 22079 */ 22080 void 22081 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, 22082 zoneid_t zoneid) 22083 { 22084 ipha_t *ipha; 22085 #define rptr ((uchar_t *)ipha) 22086 queue_t *stq; 22087 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 22088 uint32_t v_hlen_tos_len; 22089 uint32_t ttl_protocol; 22090 ipaddr_t src; 22091 ipaddr_t dst; 22092 uint32_t cksum; 22093 ipaddr_t orig_src; 22094 ire_t *ire1; 22095 mblk_t *next_mp; 22096 uint_t hlen; 22097 uint16_t *up; 22098 uint32_t max_frag = ire->ire_max_frag; 22099 ill_t *ill = ire_to_ill(ire); 22100 int clusterwide; 22101 uint16_t ip_hdr_included; /* IP header included by ULP? */ 22102 int ipsec_len; 22103 mblk_t *first_mp; 22104 ipsec_out_t *io; 22105 boolean_t conn_dontroute; /* conn value for multicast */ 22106 boolean_t conn_multicast_loop; /* conn value for multicast */ 22107 boolean_t multicast_forward; /* Should we forward ? */ 22108 boolean_t unspec_src; 22109 ill_t *conn_outgoing_ill = NULL; 22110 ill_t *ire_ill; 22111 ill_t *ire1_ill; 22112 ill_t *out_ill; 22113 uint32_t ill_index = 0; 22114 boolean_t multirt_send = B_FALSE; 22115 int err; 22116 ipxmit_state_t pktxmit_state; 22117 ip_stack_t *ipst = ire->ire_ipst; 22118 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 22119 22120 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 22121 "ip_wput_ire_start: q %p", q); 22122 22123 multicast_forward = B_FALSE; 22124 unspec_src = (connp != NULL && connp->conn_unspec_src); 22125 22126 if (ire->ire_flags & RTF_MULTIRT) { 22127 /* 22128 * Multirouting case. The bucket where ire is stored 22129 * probably holds other RTF_MULTIRT flagged ire 22130 * to the destination. 
In this call to ip_wput_ire, 22131 * we attempt to send the packet through all 22132 * those ires. Thus, we first ensure that ire is the 22133 * first RTF_MULTIRT ire in the bucket, 22134 * before walking the ire list. 22135 */ 22136 ire_t *first_ire; 22137 irb_t *irb = ire->ire_bucket; 22138 ASSERT(irb != NULL); 22139 22140 /* Make sure we do not omit any multiroute ire. */ 22141 IRB_REFHOLD(irb); 22142 for (first_ire = irb->irb_ire; 22143 first_ire != NULL; 22144 first_ire = first_ire->ire_next) { 22145 if ((first_ire->ire_flags & RTF_MULTIRT) && 22146 (first_ire->ire_addr == ire->ire_addr) && 22147 !(first_ire->ire_marks & 22148 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 22149 break; 22150 } 22151 } 22152 22153 if ((first_ire != NULL) && (first_ire != ire)) { 22154 IRE_REFHOLD(first_ire); 22155 ire_refrele(ire); 22156 ire = first_ire; 22157 ill = ire_to_ill(ire); 22158 } 22159 IRB_REFRELE(irb); 22160 } 22161 22162 /* 22163 * conn_outgoing_ill variable is used only in the broadcast loop. 22164 * for performance we don't grab the mutexs in the fastpath 22165 */ 22166 if ((connp != NULL) && 22167 (ire->ire_type == IRE_BROADCAST) && 22168 ((connp->conn_nofailover_ill != NULL) || 22169 (connp->conn_outgoing_ill != NULL))) { 22170 /* 22171 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 22172 * option. So, see if this endpoint is bound to a 22173 * IPIF_NOFAILOVER address. If so, honor it. This implies 22174 * that if the interface is failed, we will still send 22175 * the packet on the same ill which is what we want. 22176 */ 22177 conn_outgoing_ill = conn_get_held_ill(connp, 22178 &connp->conn_nofailover_ill, &err); 22179 if (err == ILL_LOOKUP_FAILED) { 22180 ire_refrele(ire); 22181 freemsg(mp); 22182 return; 22183 } 22184 if (conn_outgoing_ill == NULL) { 22185 /* 22186 * Choose a good ill in the group to send the 22187 * packets on. 
22188 */ 22189 ire = conn_set_outgoing_ill(connp, ire, 22190 &conn_outgoing_ill); 22191 if (ire == NULL) { 22192 freemsg(mp); 22193 return; 22194 } 22195 } 22196 } 22197 22198 if (mp->b_datap->db_type != M_CTL) { 22199 ipha = (ipha_t *)mp->b_rptr; 22200 } else { 22201 io = (ipsec_out_t *)mp->b_rptr; 22202 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22203 ASSERT(zoneid == io->ipsec_out_zoneid); 22204 ASSERT(zoneid != ALL_ZONES); 22205 ipha = (ipha_t *)mp->b_cont->b_rptr; 22206 dst = ipha->ipha_dst; 22207 /* 22208 * For the multicast case, ipsec_out carries conn_dontroute and 22209 * conn_multicast_loop as conn may not be available here. We 22210 * need this for multicast loopback and forwarding which is done 22211 * later in the code. 22212 */ 22213 if (CLASSD(dst)) { 22214 conn_dontroute = io->ipsec_out_dontroute; 22215 conn_multicast_loop = io->ipsec_out_multicast_loop; 22216 /* 22217 * If conn_dontroute is not set or conn_multicast_loop 22218 * is set, we need to do forwarding/loopback. For 22219 * datagrams from ip_wput_multicast, conn_dontroute is 22220 * set to B_TRUE and conn_multicast_loop is set to 22221 * B_FALSE so that we neither do forwarding nor 22222 * loopback. 22223 */ 22224 if (!conn_dontroute || conn_multicast_loop) 22225 multicast_forward = B_TRUE; 22226 } 22227 } 22228 22229 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 22230 ire->ire_zoneid != ALL_ZONES) { 22231 /* 22232 * When a zone sends a packet to another zone, we try to deliver 22233 * the packet under the same conditions as if the destination 22234 * was a real node on the network. To do so, we look for a 22235 * matching route in the forwarding table. 22236 * RTF_REJECT and RTF_BLACKHOLE are handled just like 22237 * ip_newroute() does. 22238 * Note that IRE_LOCAL are special, since they are used 22239 * when the zoneid doesn't match in some cases. 
This means that 22240 * we need to handle ipha_src differently since ire_src_addr 22241 * belongs to the receiving zone instead of the sending zone. 22242 * When ip_restrict_interzone_loopback is set, then 22243 * ire_cache_lookup() ensures that IRE_LOCAL are only used 22244 * for loopback between zones when the logical "Ethernet" would 22245 * have looped them back. 22246 */ 22247 ire_t *src_ire; 22248 22249 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 22250 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 22251 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); 22252 if (src_ire != NULL && 22253 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && 22254 (!ipst->ips_ip_restrict_interzone_loopback || 22255 ire_local_same_ill_group(ire, src_ire))) { 22256 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 22257 ipha->ipha_src = src_ire->ire_src_addr; 22258 ire_refrele(src_ire); 22259 } else { 22260 ire_refrele(ire); 22261 if (conn_outgoing_ill != NULL) 22262 ill_refrele(conn_outgoing_ill); 22263 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 22264 if (src_ire != NULL) { 22265 if (src_ire->ire_flags & RTF_BLACKHOLE) { 22266 ire_refrele(src_ire); 22267 freemsg(mp); 22268 return; 22269 } 22270 ire_refrele(src_ire); 22271 } 22272 if (ip_hdr_complete(ipha, zoneid, ipst)) { 22273 /* Failed */ 22274 freemsg(mp); 22275 return; 22276 } 22277 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, 22278 ipst); 22279 return; 22280 } 22281 } 22282 22283 if (mp->b_datap->db_type == M_CTL || 22284 ipss->ipsec_outbound_v4_policy_present) { 22285 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 22286 unspec_src, zoneid); 22287 if (mp == NULL) { 22288 ire_refrele(ire); 22289 if (conn_outgoing_ill != NULL) 22290 ill_refrele(conn_outgoing_ill); 22291 return; 22292 } 22293 /* 22294 * Trusted Extensions supports all-zones interfaces, so 22295 * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to 22296 * the global zone. 
22297 */ 22298 if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { 22299 io = (ipsec_out_t *)mp->b_rptr; 22300 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22301 zoneid = io->ipsec_out_zoneid; 22302 } 22303 } 22304 22305 first_mp = mp; 22306 ipsec_len = 0; 22307 22308 if (first_mp->b_datap->db_type == M_CTL) { 22309 io = (ipsec_out_t *)first_mp->b_rptr; 22310 ASSERT(io->ipsec_out_type == IPSEC_OUT); 22311 mp = first_mp->b_cont; 22312 ipsec_len = ipsec_out_extra_length(first_mp); 22313 ASSERT(ipsec_len >= 0); 22314 /* We already picked up the zoneid from the M_CTL above */ 22315 ASSERT(zoneid == io->ipsec_out_zoneid); 22316 ASSERT(zoneid != ALL_ZONES); 22317 22318 /* 22319 * Drop M_CTL here if IPsec processing is not needed. 22320 * (Non-IPsec use of M_CTL extracted any information it 22321 * needed above). 22322 */ 22323 if (ipsec_len == 0) { 22324 freeb(first_mp); 22325 first_mp = mp; 22326 } 22327 } 22328 22329 /* 22330 * Fast path for ip_wput_ire 22331 */ 22332 22333 ipha = (ipha_t *)mp->b_rptr; 22334 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22335 dst = ipha->ipha_dst; 22336 22337 /* 22338 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 22339 * if the socket is a SOCK_RAW type. The transport checksum should 22340 * be provided in the pre-built packet, so we don't need to compute it. 22341 * Also, other application set flags, like DF, should not be altered. 22342 * Other transport MUST pass down zero. 
22343 */ 22344 ip_hdr_included = ipha->ipha_ident; 22345 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 22346 22347 if (CLASSD(dst)) { 22348 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 22349 ntohl(dst), 22350 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 22351 ntohl(ire->ire_addr))); 22352 } 22353 22354 /* Macros to extract header fields from data already in registers */ 22355 #ifdef _BIG_ENDIAN 22356 #define V_HLEN (v_hlen_tos_len >> 24) 22357 #define LENGTH (v_hlen_tos_len & 0xFFFF) 22358 #define PROTO (ttl_protocol & 0xFF) 22359 #else 22360 #define V_HLEN (v_hlen_tos_len & 0xFF) 22361 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 22362 #define PROTO (ttl_protocol >> 8) 22363 #endif 22364 22365 22366 orig_src = src = ipha->ipha_src; 22367 /* (The loop back to "another" is explained down below.) */ 22368 another:; 22369 /* 22370 * Assign an ident value for this packet. We assign idents on 22371 * a per destination basis out of the IRE. There could be 22372 * other threads targeting the same destination, so we have to 22373 * arrange for a atomic increment. Note that we use a 32-bit 22374 * atomic add because it has better performance than its 22375 * 16-bit sibling. 22376 * 22377 * If running in cluster mode and if the source address 22378 * belongs to a replicated service then vector through 22379 * cl_inet_ipident vector to allocate ip identifier 22380 * NOTE: This is a contract private interface with the 22381 * clustering group. 
22382 */ 22383 clusterwide = 0; 22384 if (cl_inet_ipident) { 22385 ASSERT(cl_inet_isclusterwide); 22386 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 22387 AF_INET, (uint8_t *)(uintptr_t)src)) { 22388 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 22389 AF_INET, (uint8_t *)(uintptr_t)src, 22390 (uint8_t *)(uintptr_t)dst); 22391 clusterwide = 1; 22392 } 22393 } 22394 if (!clusterwide) { 22395 ipha->ipha_ident = 22396 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 22397 } 22398 22399 #ifndef _BIG_ENDIAN 22400 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 22401 #endif 22402 22403 /* 22404 * Set source address unless sent on an ill or conn_unspec_src is set. 22405 * This is needed to obey conn_unspec_src when packets go through 22406 * ip_newroute + arp. 22407 * Assumes ip_newroute{,_multi} sets the source address as well. 22408 */ 22409 if (src == INADDR_ANY && !unspec_src) { 22410 /* 22411 * Assign the appropriate source address from the IRE if none 22412 * was specified. 22413 */ 22414 ASSERT(ire->ire_ipversion == IPV4_VERSION); 22415 22416 /* 22417 * With IP multipathing, broadcast packets are sent on the ire 22418 * that has been cleared of IRE_MARK_NORECV and that belongs to 22419 * the group. However, this ire might not be in the same zone so 22420 * we can't always use its source address. We look for a 22421 * broadcast ire in the same group and in the right zone. 
22422 */ 22423 if (ire->ire_type == IRE_BROADCAST && 22424 ire->ire_zoneid != zoneid) { 22425 ire_t *src_ire = ire_ctable_lookup(dst, 0, 22426 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 22427 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 22428 if (src_ire != NULL) { 22429 src = src_ire->ire_src_addr; 22430 ire_refrele(src_ire); 22431 } else { 22432 ire_refrele(ire); 22433 if (conn_outgoing_ill != NULL) 22434 ill_refrele(conn_outgoing_ill); 22435 freemsg(first_mp); 22436 if (ill != NULL) { 22437 BUMP_MIB(ill->ill_ip_mib, 22438 ipIfStatsOutDiscards); 22439 } else { 22440 BUMP_MIB(&ipst->ips_ip_mib, 22441 ipIfStatsOutDiscards); 22442 } 22443 return; 22444 } 22445 } else { 22446 src = ire->ire_src_addr; 22447 } 22448 22449 if (connp == NULL) { 22450 ip1dbg(("ip_wput_ire: no connp and no src " 22451 "address for dst 0x%x, using src 0x%x\n", 22452 ntohl(dst), 22453 ntohl(src))); 22454 } 22455 ipha->ipha_src = src; 22456 } 22457 stq = ire->ire_stq; 22458 22459 /* 22460 * We only allow ire chains for broadcasts since there will 22461 * be multiple IRE_CACHE entries for the same multicast 22462 * address (one per ipif). 22463 */ 22464 next_mp = NULL; 22465 22466 /* broadcast packet */ 22467 if (ire->ire_type == IRE_BROADCAST) 22468 goto broadcast; 22469 22470 /* loopback ? 
*/ 22471 if (stq == NULL) 22472 goto nullstq; 22473 22474 /* The ill_index for outbound ILL */ 22475 ill_index = Q_TO_INDEX(stq); 22476 22477 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 22478 ttl_protocol = ((uint16_t *)ipha)[4]; 22479 22480 /* pseudo checksum (do it in parts for IP header checksum) */ 22481 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 22482 22483 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 22484 queue_t *dev_q = stq->q_next; 22485 22486 /* flow controlled */ 22487 if ((dev_q->q_next || dev_q->q_first) && 22488 !canput(dev_q)) 22489 goto blocked; 22490 if ((PROTO == IPPROTO_UDP) && 22491 (ip_hdr_included != IP_HDR_INCLUDED)) { 22492 hlen = (V_HLEN & 0xF) << 2; 22493 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 22494 if (*up != 0) { 22495 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 22496 hlen, LENGTH, max_frag, ipsec_len, cksum); 22497 /* Software checksum? */ 22498 if (DB_CKSUMFLAGS(mp) == 0) { 22499 IP_STAT(ipst, ip_out_sw_cksum); 22500 IP_STAT_UPDATE(ipst, 22501 ip_udp_out_sw_cksum_bytes, 22502 LENGTH - hlen); 22503 } 22504 } 22505 } 22506 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 22507 hlen = (V_HLEN & 0xF) << 2; 22508 if (PROTO == IPPROTO_TCP) { 22509 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22510 /* 22511 * The packet header is processed once and for all, even 22512 * in the multirouting case. We disable hardware 22513 * checksum if the packet is multirouted, as it will be 22514 * replicated via several interfaces, and not all of 22515 * them may have this capability. 22516 */ 22517 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 22518 LENGTH, max_frag, ipsec_len, cksum); 22519 /* Software checksum? 
*/ 22520 if (DB_CKSUMFLAGS(mp) == 0) { 22521 IP_STAT(ipst, ip_out_sw_cksum); 22522 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22523 LENGTH - hlen); 22524 } 22525 } else { 22526 sctp_hdr_t *sctph; 22527 22528 ASSERT(PROTO == IPPROTO_SCTP); 22529 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22530 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22531 /* 22532 * Zero out the checksum field to ensure proper 22533 * checksum calculation. 22534 */ 22535 sctph->sh_chksum = 0; 22536 #ifdef DEBUG 22537 if (!skip_sctp_cksum) 22538 #endif 22539 sctph->sh_chksum = sctp_cksum(mp, hlen); 22540 } 22541 } 22542 22543 /* 22544 * If this is a multicast packet and originated from ip_wput 22545 * we need to do loopback and forwarding checks. If it comes 22546 * from ip_wput_multicast, we SHOULD not do this. 22547 */ 22548 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 22549 22550 /* checksum */ 22551 cksum += ttl_protocol; 22552 22553 /* fragment the packet */ 22554 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 22555 goto fragmentit; 22556 /* 22557 * Don't use frag_flag if packet is pre-built or source 22558 * routed or if multicast (since multicast packets do 22559 * not solicit ICMP "packet too big" messages). 
22560 */ 22561 if ((ip_hdr_included != IP_HDR_INCLUDED) && 22562 (V_HLEN == IP_SIMPLE_HDR_VERSION || 22563 !ip_source_route_included(ipha)) && 22564 !CLASSD(ipha->ipha_dst)) 22565 ipha->ipha_fragment_offset_and_flags |= 22566 htons(ire->ire_frag_flag); 22567 22568 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 22569 /* calculate IP header checksum */ 22570 cksum += ipha->ipha_ident; 22571 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 22572 cksum += ipha->ipha_fragment_offset_and_flags; 22573 22574 /* IP options present */ 22575 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 22576 if (hlen) 22577 goto checksumoptions; 22578 22579 /* calculate hdr checksum */ 22580 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 22581 cksum = ~(cksum + (cksum >> 16)); 22582 ipha->ipha_hdr_checksum = (uint16_t)cksum; 22583 } 22584 if (ipsec_len != 0) { 22585 /* 22586 * We will do the rest of the processing after 22587 * we come back from IPsec in ip_wput_ipsec_out(). 22588 */ 22589 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 22590 22591 io = (ipsec_out_t *)first_mp->b_rptr; 22592 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 22593 ill_phyint->phyint_ifindex; 22594 22595 ipsec_out_process(q, first_mp, ire, ill_index); 22596 ire_refrele(ire); 22597 if (conn_outgoing_ill != NULL) 22598 ill_refrele(conn_outgoing_ill); 22599 return; 22600 } 22601 22602 /* 22603 * In most cases, the emission loop below is entered only 22604 * once. Only in the case where the ire holds the 22605 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 22606 * flagged ires in the bucket, and send the packet 22607 * through all crossed RTF_MULTIRT routes. 22608 */ 22609 if (ire->ire_flags & RTF_MULTIRT) { 22610 multirt_send = B_TRUE; 22611 } 22612 do { 22613 if (multirt_send) { 22614 irb_t *irb; 22615 /* 22616 * We are in a multiple send case, need to get 22617 * the next ire and make a duplicate of the packet. 22618 * ire1 holds here the next ire to process in the 22619 * bucket. 
If multirouting is expected, 22620 * any non-RTF_MULTIRT ire that has the 22621 * right destination address is ignored. 22622 */ 22623 irb = ire->ire_bucket; 22624 ASSERT(irb != NULL); 22625 22626 IRB_REFHOLD(irb); 22627 for (ire1 = ire->ire_next; 22628 ire1 != NULL; 22629 ire1 = ire1->ire_next) { 22630 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22631 continue; 22632 if (ire1->ire_addr != ire->ire_addr) 22633 continue; 22634 if (ire1->ire_marks & 22635 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22636 continue; 22637 22638 /* Got one */ 22639 IRE_REFHOLD(ire1); 22640 break; 22641 } 22642 IRB_REFRELE(irb); 22643 22644 if (ire1 != NULL) { 22645 next_mp = copyb(mp); 22646 if ((next_mp == NULL) || 22647 ((mp->b_cont != NULL) && 22648 ((next_mp->b_cont = 22649 dupmsg(mp->b_cont)) == NULL))) { 22650 freemsg(next_mp); 22651 next_mp = NULL; 22652 ire_refrele(ire1); 22653 ire1 = NULL; 22654 } 22655 } 22656 22657 /* Last multiroute ire; don't loop anymore. */ 22658 if (ire1 == NULL) { 22659 multirt_send = B_FALSE; 22660 } 22661 } 22662 22663 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 22664 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, 22665 mblk_t *, mp); 22666 FW_HOOKS(ipst->ips_ip4_physical_out_event, 22667 ipst->ips_ipv4firewall_physical_out, 22668 NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); 22669 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 22670 if (mp == NULL) 22671 goto release_ire_and_ill; 22672 22673 mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); 22674 DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); 22675 pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE); 22676 if ((pktxmit_state == SEND_FAILED) || 22677 (pktxmit_state == LLHDR_RESLV_FAILED)) { 22678 ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" 22679 "- packet dropped\n")); 22680 release_ire_and_ill: 22681 ire_refrele(ire); 22682 if (next_mp != NULL) { 22683 freemsg(next_mp); 22684 ire_refrele(ire1); 22685 } 22686 if (conn_outgoing_ill != NULL) 22687 ill_refrele(conn_outgoing_ill); 
22688 return; 22689 } 22690 22691 if (CLASSD(dst)) { 22692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); 22693 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, 22694 LENGTH); 22695 } 22696 22697 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22698 "ip_wput_ire_end: q %p (%S)", 22699 q, "last copy out"); 22700 IRE_REFRELE(ire); 22701 22702 if (multirt_send) { 22703 ASSERT(ire1); 22704 /* 22705 * Proceed with the next RTF_MULTIRT ire, 22706 * Also set up the send-to queue accordingly. 22707 */ 22708 ire = ire1; 22709 ire1 = NULL; 22710 stq = ire->ire_stq; 22711 mp = next_mp; 22712 next_mp = NULL; 22713 ipha = (ipha_t *)mp->b_rptr; 22714 ill_index = Q_TO_INDEX(stq); 22715 ill = (ill_t *)stq->q_ptr; 22716 } 22717 } while (multirt_send); 22718 if (conn_outgoing_ill != NULL) 22719 ill_refrele(conn_outgoing_ill); 22720 return; 22721 22722 /* 22723 * ire->ire_type == IRE_BROADCAST (minimize diffs) 22724 */ 22725 broadcast: 22726 { 22727 /* 22728 * To avoid broadcast storms, we usually set the TTL to 1 for 22729 * broadcasts. However, if SO_DONTROUTE isn't set, this value 22730 * can be overridden stack-wide through the ip_broadcast_ttl 22731 * ndd tunable, or on a per-connection basis through the 22732 * IP_BROADCAST_TTL socket option. 22733 * 22734 * In the event that we are replying to incoming ICMP packets, 22735 * connp could be NULL. 22736 */ 22737 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 22738 if (connp != NULL) { 22739 if (connp->conn_dontroute) 22740 ipha->ipha_ttl = 1; 22741 else if (connp->conn_broadcast_ttl != 0) 22742 ipha->ipha_ttl = connp->conn_broadcast_ttl; 22743 } 22744 22745 /* 22746 * Note that we are not doing a IRB_REFHOLD here. 22747 * Actually we don't care if the list changes i.e 22748 * if somebody deletes an IRE from the list while 22749 * we drop the lock, the next time we come around 22750 * ire_next will be NULL and hence we won't send 22751 * out multiple copies which is fine. 
22752 */ 22753 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 22754 ire1 = ire->ire_next; 22755 if (conn_outgoing_ill != NULL) { 22756 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 22757 ASSERT(ire1 == ire->ire_next); 22758 if (ire1 != NULL && ire1->ire_addr == dst) { 22759 ire_refrele(ire); 22760 ire = ire1; 22761 IRE_REFHOLD(ire); 22762 ire1 = ire->ire_next; 22763 continue; 22764 } 22765 rw_exit(&ire->ire_bucket->irb_lock); 22766 /* Did not find a matching ill */ 22767 ip1dbg(("ip_wput_ire: broadcast with no " 22768 "matching IP_BOUND_IF ill %s dst %x\n", 22769 conn_outgoing_ill->ill_name, dst)); 22770 freemsg(first_mp); 22771 if (ire != NULL) 22772 ire_refrele(ire); 22773 ill_refrele(conn_outgoing_ill); 22774 return; 22775 } 22776 } else if (ire1 != NULL && ire1->ire_addr == dst) { 22777 /* 22778 * If the next IRE has the same address and is not one 22779 * of the two copies that we need to send, try to see 22780 * whether this copy should be sent at all. This 22781 * assumes that we insert loopbacks first and then 22782 * non-loopbacks. This is acheived by inserting the 22783 * loopback always before non-loopback. 22784 * This is used to send a single copy of a broadcast 22785 * packet out all physical interfaces that have an 22786 * matching IRE_BROADCAST while also looping 22787 * back one copy (to ip_wput_local) for each 22788 * matching physical interface. However, we avoid 22789 * sending packets out different logical that match by 22790 * having ipif_up/ipif_down supress duplicate 22791 * IRE_BROADCASTS. 22792 * 22793 * This feature is currently used to get broadcasts 22794 * sent to multiple interfaces, when the broadcast 22795 * address being used applies to multiple interfaces. 22796 * For example, a whole net broadcast will be 22797 * replicated on every connected subnet of 22798 * the target net. 
22799 * 22800 * Each zone has its own set of IRE_BROADCASTs, so that 22801 * we're able to distribute inbound packets to multiple 22802 * zones who share a broadcast address. We avoid looping 22803 * back outbound packets in different zones but on the 22804 * same ill, as the application would see duplicates. 22805 * 22806 * If the interfaces are part of the same group, 22807 * we would want to send only one copy out for 22808 * whole group. 22809 * 22810 * This logic assumes that ire_add_v4() groups the 22811 * IRE_BROADCAST entries so that those with the same 22812 * ire_addr and ill_group are kept together. 22813 */ 22814 ire_ill = ire->ire_ipif->ipif_ill; 22815 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 22816 if (ire_ill->ill_group != NULL && 22817 (ire->ire_marks & IRE_MARK_NORECV)) { 22818 /* 22819 * If the current zone only has an ire 22820 * broadcast for this address marked 22821 * NORECV, the ire we want is ahead in 22822 * the bucket, so we look it up 22823 * deliberately ignoring the zoneid. 
22824 */ 22825 for (ire1 = ire->ire_bucket->irb_ire; 22826 ire1 != NULL; 22827 ire1 = ire1->ire_next) { 22828 ire1_ill = 22829 ire1->ire_ipif->ipif_ill; 22830 if (ire1->ire_addr != dst) 22831 continue; 22832 /* skip over the current ire */ 22833 if (ire1 == ire) 22834 continue; 22835 /* skip over deleted ires */ 22836 if (ire1->ire_marks & 22837 IRE_MARK_CONDEMNED) 22838 continue; 22839 /* 22840 * non-loopback ire in our 22841 * group: use it for the next 22842 * pass in the loop 22843 */ 22844 if (ire1->ire_stq != NULL && 22845 ire1_ill->ill_group == 22846 ire_ill->ill_group) 22847 break; 22848 } 22849 } 22850 } else { 22851 while (ire1 != NULL && ire1->ire_addr == dst) { 22852 ire1_ill = ire1->ire_ipif->ipif_ill; 22853 /* 22854 * We can have two broadcast ires on the 22855 * same ill in different zones; here 22856 * we'll send a copy of the packet on 22857 * each ill and the fanout code will 22858 * call conn_wantpacket() to check that 22859 * the zone has the broadcast address 22860 * configured on the ill. If the two 22861 * ires are in the same group we only 22862 * send one copy up. 22863 */ 22864 if (ire1_ill != ire_ill && 22865 (ire1_ill->ill_group == NULL || 22866 ire_ill->ill_group == NULL || 22867 ire1_ill->ill_group != 22868 ire_ill->ill_group)) { 22869 break; 22870 } 22871 ire1 = ire1->ire_next; 22872 } 22873 } 22874 } 22875 ASSERT(multirt_send == B_FALSE); 22876 if (ire1 != NULL && ire1->ire_addr == dst) { 22877 if ((ire->ire_flags & RTF_MULTIRT) && 22878 (ire1->ire_flags & RTF_MULTIRT)) { 22879 /* 22880 * We are in the multirouting case. 22881 * The message must be sent at least 22882 * on both ires. These ires have been 22883 * inserted AFTER the standard ones 22884 * in ip_rt_add(). There are thus no 22885 * other ire entries for the destination 22886 * address in the rest of the bucket 22887 * that do not have the RTF_MULTIRT 22888 * flag. We don't process a copy 22889 * of the message here. This will be 22890 * done in the final sending loop. 
22891 */ 22892 multirt_send = B_TRUE; 22893 } else { 22894 next_mp = ip_copymsg(first_mp); 22895 if (next_mp != NULL) 22896 IRE_REFHOLD(ire1); 22897 } 22898 } 22899 rw_exit(&ire->ire_bucket->irb_lock); 22900 } 22901 22902 if (stq) { 22903 /* 22904 * A non-NULL send-to queue means this packet is going 22905 * out of this machine. 22906 */ 22907 out_ill = (ill_t *)stq->q_ptr; 22908 22909 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); 22910 ttl_protocol = ((uint16_t *)ipha)[4]; 22911 /* 22912 * We accumulate the pseudo header checksum in cksum. 22913 * This is pretty hairy code, so watch close. One 22914 * thing to keep in mind is that UDP and TCP have 22915 * stored their respective datagram lengths in their 22916 * checksum fields. This lines things up real nice. 22917 */ 22918 cksum = (dst >> 16) + (dst & 0xFFFF) + 22919 (src >> 16) + (src & 0xFFFF); 22920 /* 22921 * We assume the udp checksum field contains the 22922 * length, so to compute the pseudo header checksum, 22923 * all we need is the protocol number and src/dst. 22924 */ 22925 /* Provide the checksums for UDP and TCP. 
*/ 22926 if ((PROTO == IPPROTO_TCP) && 22927 (ip_hdr_included != IP_HDR_INCLUDED)) { 22928 /* hlen gets the number of uchar_ts in the IP header */ 22929 hlen = (V_HLEN & 0xF) << 2; 22930 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 22931 IP_STAT(ipst, ip_out_sw_cksum); 22932 IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, 22933 LENGTH - hlen); 22934 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 22935 } else if (PROTO == IPPROTO_SCTP && 22936 (ip_hdr_included != IP_HDR_INCLUDED)) { 22937 sctp_hdr_t *sctph; 22938 22939 hlen = (V_HLEN & 0xF) << 2; 22940 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 22941 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 22942 sctph->sh_chksum = 0; 22943 #ifdef DEBUG 22944 if (!skip_sctp_cksum) 22945 #endif 22946 sctph->sh_chksum = sctp_cksum(mp, hlen); 22947 } else { 22948 queue_t *dev_q = stq->q_next; 22949 22950 if ((dev_q->q_next || dev_q->q_first) && 22951 !canput(dev_q)) { 22952 blocked: 22953 ipha->ipha_ident = ip_hdr_included; 22954 /* 22955 * If we don't have a conn to apply 22956 * backpressure, free the message. 22957 * In the ire_send path, we don't know 22958 * the position to requeue the packet. Rather 22959 * than reorder packets, we just drop this 22960 * packet. 22961 */ 22962 if (ipst->ips_ip_output_queue && 22963 connp != NULL && 22964 caller != IRE_SEND) { 22965 if (caller == IP_WSRV) { 22966 connp->conn_did_putbq = 1; 22967 (void) putbq(connp->conn_wq, 22968 first_mp); 22969 conn_drain_insert(connp); 22970 /* 22971 * This is the service thread, 22972 * and the queue is already 22973 * noenabled. The check for 22974 * canput and the putbq is not 22975 * atomic. So we need to check 22976 * again. 22977 */ 22978 if (canput(stq->q_next)) 22979 connp->conn_did_putbq 22980 = 0; 22981 IP_STAT(ipst, ip_conn_flputbq); 22982 } else { 22983 /* 22984 * We are not the service proc. 22985 * ip_wsrv will be scheduled or 22986 * is already running. 
22987 */ 22988 (void) putq(connp->conn_wq, 22989 first_mp); 22990 } 22991 } else { 22992 out_ill = (ill_t *)stq->q_ptr; 22993 BUMP_MIB(out_ill->ill_ip_mib, 22994 ipIfStatsOutDiscards); 22995 freemsg(first_mp); 22996 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 22997 "ip_wput_ire_end: q %p (%S)", 22998 q, "discard"); 22999 } 23000 ire_refrele(ire); 23001 if (next_mp) { 23002 ire_refrele(ire1); 23003 freemsg(next_mp); 23004 } 23005 if (conn_outgoing_ill != NULL) 23006 ill_refrele(conn_outgoing_ill); 23007 return; 23008 } 23009 if ((PROTO == IPPROTO_UDP) && 23010 (ip_hdr_included != IP_HDR_INCLUDED)) { 23011 /* 23012 * hlen gets the number of uchar_ts in the 23013 * IP header 23014 */ 23015 hlen = (V_HLEN & 0xF) << 2; 23016 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 23017 max_frag = ire->ire_max_frag; 23018 if (*up != 0) { 23019 IP_CKSUM_XMIT(out_ill, ire, mp, ipha, 23020 up, PROTO, hlen, LENGTH, max_frag, 23021 ipsec_len, cksum); 23022 /* Software checksum? */ 23023 if (DB_CKSUMFLAGS(mp) == 0) { 23024 IP_STAT(ipst, ip_out_sw_cksum); 23025 IP_STAT_UPDATE(ipst, 23026 ip_udp_out_sw_cksum_bytes, 23027 LENGTH - hlen); 23028 } 23029 } 23030 } 23031 } 23032 /* 23033 * Need to do this even when fragmenting. The local 23034 * loopback can be done without computing checksums 23035 * but forwarding out other interface must be done 23036 * after the IP checksum (and ULP checksums) have been 23037 * computed. 23038 * 23039 * NOTE : multicast_forward is set only if this packet 23040 * originated from ip_wput. For packets originating from 23041 * ip_wput_multicast, it is not set. 23042 */ 23043 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 23044 multi_loopback: 23045 ip2dbg(("ip_wput: multicast, loop %d\n", 23046 conn_multicast_loop)); 23047 23048 /* Forget header checksum offload */ 23049 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 23050 23051 /* 23052 * Local loopback of multicasts? Check the 23053 * ill. 
23054 * 23055 * Note that the loopback function will not come 23056 * in through ip_rput - it will only do the 23057 * client fanout thus we need to do an mforward 23058 * as well. The is different from the BSD 23059 * logic. 23060 */ 23061 if (ill != NULL) { 23062 ilm_t *ilm; 23063 23064 ILM_WALKER_HOLD(ill); 23065 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 23066 ALL_ZONES); 23067 ILM_WALKER_RELE(ill); 23068 if (ilm != NULL) { 23069 /* 23070 * Pass along the virtual output q. 23071 * ip_wput_local() will distribute the 23072 * packet to all the matching zones, 23073 * except the sending zone when 23074 * IP_MULTICAST_LOOP is false. 23075 */ 23076 ip_multicast_loopback(q, ill, first_mp, 23077 conn_multicast_loop ? 0 : 23078 IP_FF_NO_MCAST_LOOP, zoneid); 23079 } 23080 } 23081 if (ipha->ipha_ttl == 0) { 23082 /* 23083 * 0 => only to this host i.e. we are 23084 * done. We are also done if this was the 23085 * loopback interface since it is sufficient 23086 * to loopback one copy of a multicast packet. 23087 */ 23088 freemsg(first_mp); 23089 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23090 "ip_wput_ire_end: q %p (%S)", 23091 q, "loopback"); 23092 ire_refrele(ire); 23093 if (conn_outgoing_ill != NULL) 23094 ill_refrele(conn_outgoing_ill); 23095 return; 23096 } 23097 /* 23098 * ILLF_MULTICAST is checked in ip_newroute 23099 * i.e. we don't need to check it here since 23100 * all IRE_CACHEs come from ip_newroute. 23101 * For multicast traffic, SO_DONTROUTE is interpreted 23102 * to mean only send the packet out the interface 23103 * (optionally specified with IP_MULTICAST_IF) 23104 * and do not forward it out additional interfaces. 23105 * RSVP and the rsvp daemon is an example of a 23106 * protocol and user level process that 23107 * handles it's own routing. Hence, it uses the 23108 * SO_DONTROUTE option to accomplish this. 
23109 */ 23110 23111 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 23112 ill != NULL) { 23113 /* Unconditionally redo the checksum */ 23114 ipha->ipha_hdr_checksum = 0; 23115 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 23116 23117 /* 23118 * If this needs to go out secure, we need 23119 * to wait till we finish the IPsec 23120 * processing. 23121 */ 23122 if (ipsec_len == 0 && 23123 ip_mforward(ill, ipha, mp)) { 23124 freemsg(first_mp); 23125 ip1dbg(("ip_wput: mforward failed\n")); 23126 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23127 "ip_wput_ire_end: q %p (%S)", 23128 q, "mforward failed"); 23129 ire_refrele(ire); 23130 if (conn_outgoing_ill != NULL) 23131 ill_refrele(conn_outgoing_ill); 23132 return; 23133 } 23134 } 23135 } 23136 max_frag = ire->ire_max_frag; 23137 cksum += ttl_protocol; 23138 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 23139 /* No fragmentation required for this one. */ 23140 /* 23141 * Don't use frag_flag if packet is pre-built or source 23142 * routed or if multicast (since multicast packets do 23143 * not solicit ICMP "packet too big" messages). 23144 */ 23145 if ((ip_hdr_included != IP_HDR_INCLUDED) && 23146 (V_HLEN == IP_SIMPLE_HDR_VERSION || 23147 !ip_source_route_included(ipha)) && 23148 !CLASSD(ipha->ipha_dst)) 23149 ipha->ipha_fragment_offset_and_flags |= 23150 htons(ire->ire_frag_flag); 23151 23152 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 23153 /* Complete the IP header checksum. */ 23154 cksum += ipha->ipha_ident; 23155 cksum += (v_hlen_tos_len >> 16)+ 23156 (v_hlen_tos_len & 0xFFFF); 23157 cksum += ipha->ipha_fragment_offset_and_flags; 23158 hlen = (V_HLEN & 0xF) - 23159 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 23160 if (hlen) { 23161 checksumoptions: 23162 /* 23163 * Account for the IP Options in the IP 23164 * header checksum. 
23165 */ 23166 up = (uint16_t *)(rptr+ 23167 IP_SIMPLE_HDR_LENGTH); 23168 do { 23169 cksum += up[0]; 23170 cksum += up[1]; 23171 up += 2; 23172 } while (--hlen); 23173 } 23174 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 23175 cksum = ~(cksum + (cksum >> 16)); 23176 ipha->ipha_hdr_checksum = (uint16_t)cksum; 23177 } 23178 if (ipsec_len != 0) { 23179 ipsec_out_process(q, first_mp, ire, ill_index); 23180 if (!next_mp) { 23181 ire_refrele(ire); 23182 if (conn_outgoing_ill != NULL) 23183 ill_refrele(conn_outgoing_ill); 23184 return; 23185 } 23186 goto next; 23187 } 23188 23189 /* 23190 * multirt_send has already been handled 23191 * for broadcast, but not yet for multicast 23192 * or IP options. 23193 */ 23194 if (next_mp == NULL) { 23195 if (ire->ire_flags & RTF_MULTIRT) { 23196 multirt_send = B_TRUE; 23197 } 23198 } 23199 23200 /* 23201 * In most cases, the emission loop below is 23202 * entered only once. Only in the case where 23203 * the ire holds the RTF_MULTIRT flag, do we loop 23204 * to process all RTF_MULTIRT ires in the bucket, 23205 * and send the packet through all crossed 23206 * RTF_MULTIRT routes. 23207 */ 23208 do { 23209 if (multirt_send) { 23210 irb_t *irb; 23211 23212 irb = ire->ire_bucket; 23213 ASSERT(irb != NULL); 23214 /* 23215 * We are in a multiple send case, 23216 * need to get the next IRE and make 23217 * a duplicate of the packet. 
23218 */ 23219 IRB_REFHOLD(irb); 23220 for (ire1 = ire->ire_next; 23221 ire1 != NULL; 23222 ire1 = ire1->ire_next) { 23223 if (!(ire1->ire_flags & 23224 RTF_MULTIRT)) { 23225 continue; 23226 } 23227 if (ire1->ire_addr != 23228 ire->ire_addr) { 23229 continue; 23230 } 23231 if (ire1->ire_marks & 23232 (IRE_MARK_CONDEMNED| 23233 IRE_MARK_HIDDEN)) { 23234 continue; 23235 } 23236 23237 /* Got one */ 23238 IRE_REFHOLD(ire1); 23239 break; 23240 } 23241 IRB_REFRELE(irb); 23242 23243 if (ire1 != NULL) { 23244 next_mp = copyb(mp); 23245 if ((next_mp == NULL) || 23246 ((mp->b_cont != NULL) && 23247 ((next_mp->b_cont = 23248 dupmsg(mp->b_cont)) 23249 == NULL))) { 23250 freemsg(next_mp); 23251 next_mp = NULL; 23252 ire_refrele(ire1); 23253 ire1 = NULL; 23254 } 23255 } 23256 23257 /* 23258 * Last multiroute ire; don't loop 23259 * anymore. The emission is over 23260 * and next_mp is NULL. 23261 */ 23262 if (ire1 == NULL) { 23263 multirt_send = B_FALSE; 23264 } 23265 } 23266 23267 out_ill = ire_to_ill(ire); 23268 DTRACE_PROBE4(ip4__physical__out__start, 23269 ill_t *, NULL, 23270 ill_t *, out_ill, 23271 ipha_t *, ipha, mblk_t *, mp); 23272 FW_HOOKS(ipst->ips_ip4_physical_out_event, 23273 ipst->ips_ipv4firewall_physical_out, 23274 NULL, out_ill, ipha, mp, mp, 0, ipst); 23275 DTRACE_PROBE1(ip4__physical__out__end, 23276 mblk_t *, mp); 23277 if (mp == NULL) 23278 goto release_ire_and_ill_2; 23279 23280 ASSERT(ipsec_len == 0); 23281 mp->b_prev = 23282 SET_BPREV_FLAG(IPP_LOCAL_OUT); 23283 DTRACE_PROBE2(ip__xmit__2, 23284 mblk_t *, mp, ire_t *, ire); 23285 pktxmit_state = ip_xmit_v4(mp, ire, 23286 NULL, B_TRUE); 23287 if ((pktxmit_state == SEND_FAILED) || 23288 (pktxmit_state == LLHDR_RESLV_FAILED)) { 23289 release_ire_and_ill_2: 23290 if (next_mp) { 23291 freemsg(next_mp); 23292 ire_refrele(ire1); 23293 } 23294 ire_refrele(ire); 23295 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23296 "ip_wput_ire_end: q %p (%S)", 23297 q, "discard MDATA"); 23298 if (conn_outgoing_ill != NULL) 23299 
ill_refrele(conn_outgoing_ill); 23300 return; 23301 } 23302 23303 if (CLASSD(dst)) { 23304 BUMP_MIB(out_ill->ill_ip_mib, 23305 ipIfStatsHCOutMcastPkts); 23306 UPDATE_MIB(out_ill->ill_ip_mib, 23307 ipIfStatsHCOutMcastOctets, 23308 LENGTH); 23309 } else if (ire->ire_type == IRE_BROADCAST) { 23310 BUMP_MIB(out_ill->ill_ip_mib, 23311 ipIfStatsHCOutBcastPkts); 23312 } 23313 23314 if (multirt_send) { 23315 /* 23316 * We are in a multiple send case, 23317 * need to re-enter the sending loop 23318 * using the next ire. 23319 */ 23320 ire_refrele(ire); 23321 ire = ire1; 23322 stq = ire->ire_stq; 23323 mp = next_mp; 23324 next_mp = NULL; 23325 ipha = (ipha_t *)mp->b_rptr; 23326 ill_index = Q_TO_INDEX(stq); 23327 } 23328 } while (multirt_send); 23329 23330 if (!next_mp) { 23331 /* 23332 * Last copy going out (the ultra-common 23333 * case). Note that we intentionally replicate 23334 * the putnext rather than calling it before 23335 * the next_mp check in hopes of a little 23336 * tail-call action out of the compiler. 23337 */ 23338 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23339 "ip_wput_ire_end: q %p (%S)", 23340 q, "last copy out(1)"); 23341 ire_refrele(ire); 23342 if (conn_outgoing_ill != NULL) 23343 ill_refrele(conn_outgoing_ill); 23344 return; 23345 } 23346 /* More copies going out below. */ 23347 } else { 23348 int offset; 23349 fragmentit: 23350 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 23351 /* 23352 * If this would generate a icmp_frag_needed message, 23353 * we need to handle it before we do the IPsec 23354 * processing. Otherwise, we need to strip the IPsec 23355 * headers before we send up the message to the ULPs 23356 * which becomes messy and difficult. 
23357 */ 23358 if (ipsec_len != 0) { 23359 if ((max_frag < (unsigned int)(LENGTH + 23360 ipsec_len)) && (offset & IPH_DF)) { 23361 out_ill = (ill_t *)stq->q_ptr; 23362 BUMP_MIB(out_ill->ill_ip_mib, 23363 ipIfStatsOutFragFails); 23364 BUMP_MIB(out_ill->ill_ip_mib, 23365 ipIfStatsOutFragReqds); 23366 ipha->ipha_hdr_checksum = 0; 23367 ipha->ipha_hdr_checksum = 23368 (uint16_t)ip_csum_hdr(ipha); 23369 icmp_frag_needed(ire->ire_stq, first_mp, 23370 max_frag, zoneid, ipst); 23371 if (!next_mp) { 23372 ire_refrele(ire); 23373 if (conn_outgoing_ill != NULL) { 23374 ill_refrele( 23375 conn_outgoing_ill); 23376 } 23377 return; 23378 } 23379 } else { 23380 /* 23381 * This won't cause a icmp_frag_needed 23382 * message. to be generated. Send it on 23383 * the wire. Note that this could still 23384 * cause fragmentation and all we 23385 * do is the generation of the message 23386 * to the ULP if needed before IPsec. 23387 */ 23388 if (!next_mp) { 23389 ipsec_out_process(q, first_mp, 23390 ire, ill_index); 23391 TRACE_2(TR_FAC_IP, 23392 TR_IP_WPUT_IRE_END, 23393 "ip_wput_ire_end: q %p " 23394 "(%S)", q, 23395 "last ipsec_out_process"); 23396 ire_refrele(ire); 23397 if (conn_outgoing_ill != NULL) { 23398 ill_refrele( 23399 conn_outgoing_ill); 23400 } 23401 return; 23402 } 23403 ipsec_out_process(q, first_mp, 23404 ire, ill_index); 23405 } 23406 } else { 23407 /* 23408 * Initiate IPPF processing. 
For 23409 * fragmentable packets we finish 23410 * all QOS packet processing before 23411 * calling: 23412 * ip_wput_ire_fragmentit->ip_wput_frag 23413 */ 23414 23415 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23416 ip_process(IPP_LOCAL_OUT, &mp, 23417 ill_index); 23418 if (mp == NULL) { 23419 out_ill = (ill_t *)stq->q_ptr; 23420 BUMP_MIB(out_ill->ill_ip_mib, 23421 ipIfStatsOutDiscards); 23422 if (next_mp != NULL) { 23423 freemsg(next_mp); 23424 ire_refrele(ire1); 23425 } 23426 ire_refrele(ire); 23427 TRACE_2(TR_FAC_IP, 23428 TR_IP_WPUT_IRE_END, 23429 "ip_wput_ire: q %p (%S)", 23430 q, "discard MDATA"); 23431 if (conn_outgoing_ill != NULL) { 23432 ill_refrele( 23433 conn_outgoing_ill); 23434 } 23435 return; 23436 } 23437 } 23438 if (!next_mp) { 23439 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23440 "ip_wput_ire_end: q %p (%S)", 23441 q, "last fragmentation"); 23442 ip_wput_ire_fragmentit(mp, ire, 23443 zoneid, ipst); 23444 ire_refrele(ire); 23445 if (conn_outgoing_ill != NULL) 23446 ill_refrele(conn_outgoing_ill); 23447 return; 23448 } 23449 ip_wput_ire_fragmentit(mp, ire, zoneid, ipst); 23450 } 23451 } 23452 } else { 23453 nullstq: 23454 /* A NULL stq means the destination address is local. */ 23455 UPDATE_OB_PKT_COUNT(ire); 23456 ire->ire_last_used_time = lbolt; 23457 ASSERT(ire->ire_ipif != NULL); 23458 if (!next_mp) { 23459 /* 23460 * Is there an "in" and "out" for traffic local 23461 * to a host (loopback)? The code in Solaris doesn't 23462 * explicitly draw a line in its code for in vs out, 23463 * so we've had to draw a line in the sand: ip_wput_ire 23464 * is considered to be the "output" side and 23465 * ip_wput_local to be the "input" side. 23466 */ 23467 out_ill = ire_to_ill(ire); 23468 23469 /* 23470 * DTrace this as ip:::send. A blocked packet will 23471 * fire the send probe, but not the receive probe. 
23472 */ 23473 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23474 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23475 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23476 23477 DTRACE_PROBE4(ip4__loopback__out__start, 23478 ill_t *, NULL, ill_t *, out_ill, 23479 ipha_t *, ipha, mblk_t *, first_mp); 23480 23481 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23482 ipst->ips_ipv4firewall_loopback_out, 23483 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23484 23485 DTRACE_PROBE1(ip4__loopback__out_end, 23486 mblk_t *, first_mp); 23487 23488 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 23489 "ip_wput_ire_end: q %p (%S)", 23490 q, "local address"); 23491 23492 if (first_mp != NULL) 23493 ip_wput_local(q, out_ill, ipha, 23494 first_mp, ire, 0, ire->ire_zoneid); 23495 ire_refrele(ire); 23496 if (conn_outgoing_ill != NULL) 23497 ill_refrele(conn_outgoing_ill); 23498 return; 23499 } 23500 23501 out_ill = ire_to_ill(ire); 23502 23503 /* 23504 * DTrace this as ip:::send. A blocked packet will fire the 23505 * send probe, but not the receive probe. 23506 */ 23507 DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, 23508 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 23509 ipha_t *, ipha, ip6_t *, NULL, int, 1); 23510 23511 DTRACE_PROBE4(ip4__loopback__out__start, 23512 ill_t *, NULL, ill_t *, out_ill, 23513 ipha_t *, ipha, mblk_t *, first_mp); 23514 23515 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 23516 ipst->ips_ipv4firewall_loopback_out, 23517 NULL, out_ill, ipha, first_mp, mp, 0, ipst); 23518 23519 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); 23520 23521 if (first_mp != NULL) 23522 ip_wput_local(q, out_ill, ipha, 23523 first_mp, ire, 0, ire->ire_zoneid); 23524 } 23525 next: 23526 /* 23527 * More copies going out to additional interfaces. 23528 * ire1 has already been held. We don't need the 23529 * "ire" anymore. 
23530 */ 23531 ire_refrele(ire); 23532 ire = ire1; 23533 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 23534 mp = next_mp; 23535 ASSERT(ire->ire_ipversion == IPV4_VERSION); 23536 ill = ire_to_ill(ire); 23537 first_mp = mp; 23538 if (ipsec_len != 0) { 23539 ASSERT(first_mp->b_datap->db_type == M_CTL); 23540 mp = mp->b_cont; 23541 } 23542 dst = ire->ire_addr; 23543 ipha = (ipha_t *)mp->b_rptr; 23544 /* 23545 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 23546 * Restore ipha_ident "no checksum" flag. 23547 */ 23548 src = orig_src; 23549 ipha->ipha_ident = ip_hdr_included; 23550 goto another; 23551 23552 #undef rptr 23553 #undef Q_TO_INDEX 23554 } 23555 23556 /* 23557 * Routine to allocate a message that is used to notify the ULP about MDT. 23558 * The caller may provide a pointer to the link-layer MDT capabilities, 23559 * or NULL if MDT is to be disabled on the stream. 23560 */ 23561 mblk_t * 23562 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 23563 { 23564 mblk_t *mp; 23565 ip_mdt_info_t *mdti; 23566 ill_mdt_capab_t *idst; 23567 23568 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 23569 DB_TYPE(mp) = M_CTL; 23570 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 23571 mdti = (ip_mdt_info_t *)mp->b_rptr; 23572 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 23573 idst = &(mdti->mdt_capab); 23574 23575 /* 23576 * If the caller provides us with the capability, copy 23577 * it over into our notification message; otherwise 23578 * we zero out the capability portion. 23579 */ 23580 if (isrc != NULL) 23581 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23582 else 23583 bzero((caddr_t)idst, sizeof (*idst)); 23584 } 23585 return (mp); 23586 } 23587 23588 /* 23589 * Routine which determines whether MDT can be enabled on the destination 23590 * IRE and IPC combination, and if so, allocates and returns the MDT 23591 * notification mblk that may be used by ULP. 
We also check if we need to 23592 * turn MDT back to 'on' when certain restrictions prohibiting us to allow 23593 * MDT usage in the past have been lifted. This gets called during IP 23594 * and ULP binding. 23595 */ 23596 mblk_t * 23597 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23598 ill_mdt_capab_t *mdt_cap) 23599 { 23600 mblk_t *mp; 23601 boolean_t rc = B_FALSE; 23602 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23603 23604 ASSERT(dst_ire != NULL); 23605 ASSERT(connp != NULL); 23606 ASSERT(mdt_cap != NULL); 23607 23608 /* 23609 * Currently, we only support simple TCP/{IPv4,IPv6} with 23610 * Multidata, which is handled in tcp_multisend(). This 23611 * is the reason why we do all these checks here, to ensure 23612 * that we don't enable Multidata for the cases which we 23613 * can't handle at the moment. 23614 */ 23615 do { 23616 /* Only do TCP at the moment */ 23617 if (connp->conn_ulp != IPPROTO_TCP) 23618 break; 23619 23620 /* 23621 * IPsec outbound policy present? Note that we get here 23622 * after calling ipsec_conn_cache_policy() where the global 23623 * policy checking is performed. conn_latch will be 23624 * non-NULL as long as there's a policy defined, 23625 * i.e. conn_out_enforce_policy may be NULL in such case 23626 * when the connection is non-secure, and hence we check 23627 * further if the latch refers to an outbound policy. 23628 */ 23629 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 23630 break; 23631 23632 /* CGTP (multiroute) is enabled? */ 23633 if (dst_ire->ire_flags & RTF_MULTIRT) 23634 break; 23635 23636 /* Outbound IPQoS enabled? */ 23637 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23638 /* 23639 * In this case, we disable MDT for this and all 23640 * future connections going over the interface. 23641 */ 23642 mdt_cap->ill_mdt_on = 0; 23643 break; 23644 } 23645 23646 /* socket option(s) present? 
*/ 23647 if (!CONN_IS_LSO_MD_FASTPATH(connp)) 23648 break; 23649 23650 rc = B_TRUE; 23651 /* CONSTCOND */ 23652 } while (0); 23653 23654 /* Remember the result */ 23655 connp->conn_mdt_ok = rc; 23656 23657 if (!rc) 23658 return (NULL); 23659 else if (!mdt_cap->ill_mdt_on) { 23660 /* 23661 * If MDT has been previously turned off in the past, and we 23662 * currently can do MDT (due to IPQoS policy removal, etc.) 23663 * then enable it for this interface. 23664 */ 23665 mdt_cap->ill_mdt_on = 1; 23666 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 23667 "interface %s\n", ill_name)); 23668 } 23669 23670 /* Allocate the MDT info mblk */ 23671 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 23672 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 23673 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23674 return (NULL); 23675 } 23676 return (mp); 23677 } 23678 23679 /* 23680 * Routine to allocate a message that is used to notify the ULP about LSO. 23681 * The caller may provide a pointer to the link-layer LSO capabilities, 23682 * or NULL if LSO is to be disabled on the stream. 23683 */ 23684 mblk_t * 23685 ip_lsoinfo_alloc(ill_lso_capab_t *isrc) 23686 { 23687 mblk_t *mp; 23688 ip_lso_info_t *lsoi; 23689 ill_lso_capab_t *idst; 23690 23691 if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { 23692 DB_TYPE(mp) = M_CTL; 23693 mp->b_wptr = mp->b_rptr + sizeof (*lsoi); 23694 lsoi = (ip_lso_info_t *)mp->b_rptr; 23695 lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; 23696 idst = &(lsoi->lso_capab); 23697 23698 /* 23699 * If the caller provides us with the capability, copy 23700 * it over into our notification message; otherwise 23701 * we zero out the capability portion. 
23702 */ 23703 if (isrc != NULL) 23704 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 23705 else 23706 bzero((caddr_t)idst, sizeof (*idst)); 23707 } 23708 return (mp); 23709 } 23710 23711 /* 23712 * Routine which determines whether LSO can be enabled on the destination 23713 * IRE and IPC combination, and if so, allocates and returns the LSO 23714 * notification mblk that may be used by ULP. We also check if we need to 23715 * turn LSO back to 'on' when certain restrictions prohibiting us to allow 23716 * LSO usage in the past have been lifted. This gets called during IP 23717 * and ULP binding. 23718 */ 23719 mblk_t * 23720 ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 23721 ill_lso_capab_t *lso_cap) 23722 { 23723 mblk_t *mp; 23724 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 23725 23726 ASSERT(dst_ire != NULL); 23727 ASSERT(connp != NULL); 23728 ASSERT(lso_cap != NULL); 23729 23730 connp->conn_lso_ok = B_TRUE; 23731 23732 if ((connp->conn_ulp != IPPROTO_TCP) || 23733 CONN_IPSEC_OUT_ENCAPSULATED(connp) || 23734 (dst_ire->ire_flags & RTF_MULTIRT) || 23735 !CONN_IS_LSO_MD_FASTPATH(connp) || 23736 (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { 23737 connp->conn_lso_ok = B_FALSE; 23738 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 23739 /* 23740 * Disable LSO for this and all future connections going 23741 * over the interface. 23742 */ 23743 lso_cap->ill_lso_on = 0; 23744 } 23745 } 23746 23747 if (!connp->conn_lso_ok) 23748 return (NULL); 23749 else if (!lso_cap->ill_lso_on) { 23750 /* 23751 * If LSO has been previously turned off in the past, and we 23752 * currently can do LSO (due to IPQoS policy removal, etc.) 23753 * then enable it for this interface. 
23754 */ 23755 lso_cap->ill_lso_on = 1; 23756 ip1dbg(("ip_mdinfo_return: reenabling LSO for interface %s\n", 23757 ill_name)); 23758 } 23759 23760 /* Allocate the LSO info mblk */ 23761 if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) 23762 ip0dbg(("ip_lsoinfo_return: can't enable LSO for " 23763 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 23764 23765 return (mp); 23766 } 23767 23768 /* 23769 * Create destination address attribute, and fill it with the physical 23770 * destination address and SAP taken from the template DL_UNITDATA_REQ 23771 * message block. 23772 */ 23773 boolean_t 23774 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 23775 { 23776 dl_unitdata_req_t *dlurp; 23777 pattr_t *pa; 23778 pattrinfo_t pa_info; 23779 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 23780 uint_t das_len, das_off; 23781 23782 ASSERT(dlmp != NULL); 23783 23784 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 23785 das_len = dlurp->dl_dest_addr_length; 23786 das_off = dlurp->dl_dest_addr_offset; 23787 23788 pa_info.type = PATTR_DSTADDRSAP; 23789 pa_info.len = sizeof (**das) + das_len - 1; 23790 23791 /* create and associate the attribute */ 23792 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23793 if (pa != NULL) { 23794 ASSERT(*das != NULL); 23795 (*das)->addr_is_group = 0; 23796 (*das)->addr_len = (uint8_t)das_len; 23797 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 23798 } 23799 23800 return (pa != NULL); 23801 } 23802 23803 /* 23804 * Create hardware checksum attribute and fill it with the values passed. 
23805 */ 23806 boolean_t 23807 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 23808 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 23809 { 23810 pattr_t *pa; 23811 pattrinfo_t pa_info; 23812 23813 ASSERT(mmd != NULL); 23814 23815 pa_info.type = PATTR_HCKSUM; 23816 pa_info.len = sizeof (pattr_hcksum_t); 23817 23818 /* create and associate the attribute */ 23819 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23820 if (pa != NULL) { 23821 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 23822 23823 hck->hcksum_start_offset = start_offset; 23824 hck->hcksum_stuff_offset = stuff_offset; 23825 hck->hcksum_end_offset = end_offset; 23826 hck->hcksum_flags = flags; 23827 } 23828 return (pa != NULL); 23829 } 23830 23831 /* 23832 * Create zerocopy attribute and fill it with the specified flags 23833 */ 23834 boolean_t 23835 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 23836 { 23837 pattr_t *pa; 23838 pattrinfo_t pa_info; 23839 23840 ASSERT(mmd != NULL); 23841 pa_info.type = PATTR_ZCOPY; 23842 pa_info.len = sizeof (pattr_zcopy_t); 23843 23844 /* create and associate the attribute */ 23845 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 23846 if (pa != NULL) { 23847 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 23848 23849 zcopy->zcopy_flags = flags; 23850 } 23851 return (pa != NULL); 23852 } 23853 23854 /* 23855 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 23856 * block chain. We could rewrite to handle arbitrary message block chains but 23857 * that would make the code complicated and slow. Right now there three 23858 * restrictions: 23859 * 23860 * 1. The first message block must contain the complete IP header and 23861 * at least 1 byte of payload data. 23862 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 23863 * so that we can use a single Multidata message. 23864 * 3. 
No frag must be distributed over two or more message blocks so 23865 * that we don't need more than two packet descriptors per frag. 23866 * 23867 * The above restrictions allow us to support userland applications (which 23868 * will send down a single message block) and NFS over UDP (which will 23869 * send down a chain of at most three message blocks). 23870 * 23871 * We also don't use MDT for payloads with less than or equal to 23872 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 23873 */ 23874 boolean_t 23875 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 23876 { 23877 int blocks; 23878 ssize_t total, missing, size; 23879 23880 ASSERT(mp != NULL); 23881 ASSERT(hdr_len > 0); 23882 23883 size = MBLKL(mp) - hdr_len; 23884 if (size <= 0) 23885 return (B_FALSE); 23886 23887 /* The first mblk contains the header and some payload. */ 23888 blocks = 1; 23889 total = size; 23890 size %= len; 23891 missing = (size == 0) ? 0 : (len - size); 23892 mp = mp->b_cont; 23893 23894 while (mp != NULL) { 23895 /* 23896 * Give up if we encounter a zero length message block. 23897 * In practice, this should rarely happen and therefore 23898 * not worth the trouble of freeing and re-linking the 23899 * mblk from the chain to handle such case. 23900 */ 23901 if ((size = MBLKL(mp)) == 0) 23902 return (B_FALSE); 23903 23904 /* Too many payload buffers for a single Multidata message? */ 23905 if (++blocks > MULTIDATA_MAX_PBUFS) 23906 return (B_FALSE); 23907 23908 total += size; 23909 /* Is a frag distributed over two or more message blocks? */ 23910 if (missing > size) 23911 return (B_FALSE); 23912 size -= missing; 23913 23914 size %= len; 23915 missing = (size == 0) ? 0 : (len - size); 23916 23917 mp = mp->b_cont; 23918 } 23919 23920 return (total > ip_wput_frag_mdt_min); 23921 } 23922 23923 /* 23924 * Outbound IPv4 fragmentation routine using MDT. 
23925 */ 23926 static void 23927 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 23928 uint32_t frag_flag, int offset) 23929 { 23930 ipha_t *ipha_orig; 23931 int i1, ip_data_end; 23932 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 23933 mblk_t *hdr_mp, *md_mp = NULL; 23934 unsigned char *hdr_ptr, *pld_ptr; 23935 multidata_t *mmd; 23936 ip_pdescinfo_t pdi; 23937 ill_t *ill; 23938 ip_stack_t *ipst = ire->ire_ipst; 23939 23940 ASSERT(DB_TYPE(mp) == M_DATA); 23941 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 23942 23943 ill = ire_to_ill(ire); 23944 ASSERT(ill != NULL); 23945 23946 ipha_orig = (ipha_t *)mp->b_rptr; 23947 mp->b_rptr += sizeof (ipha_t); 23948 23949 /* Calculate how many packets we will send out */ 23950 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 23951 pkts = (i1 + len - 1) / len; 23952 ASSERT(pkts > 1); 23953 23954 /* Allocate a message block which will hold all the IP Headers. */ 23955 wroff = ipst->ips_ip_wroff_extra; 23956 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 23957 23958 i1 = pkts * hdr_chunk_len; 23959 /* 23960 * Create the header buffer, Multidata and destination address 23961 * and SAP attribute that should be associated with it. 23962 */ 23963 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 23964 ((hdr_mp->b_wptr += i1), 23965 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 23966 !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { 23967 freemsg(mp); 23968 if (md_mp == NULL) { 23969 freemsg(hdr_mp); 23970 } else { 23971 free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded); 23972 freemsg(md_mp); 23973 } 23974 IP_STAT(ipst, ip_frag_mdt_allocfail); 23975 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); 23976 return; 23977 } 23978 IP_STAT(ipst, ip_frag_mdt_allocd); 23979 23980 /* 23981 * Add a payload buffer to the Multidata; this operation must not 23982 * fail, or otherwise our logic in this routine is broken. 
There 23983 * is no memory allocation done by the routine, so any returned 23984 * failure simply tells us that we've done something wrong. 23985 * 23986 * A failure tells us that either we're adding the same payload 23987 * buffer more than once, or we're trying to add more buffers than 23988 * allowed. None of the above cases should happen, and we panic 23989 * because either there's horrible heap corruption, and/or 23990 * programming mistake. 23991 */ 23992 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 23993 goto pbuf_panic; 23994 23995 hdr_ptr = hdr_mp->b_rptr; 23996 pld_ptr = mp->b_rptr; 23997 23998 /* Establish the ending byte offset, based on the starting offset. */ 23999 offset <<= 3; 24000 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 24001 IP_SIMPLE_HDR_LENGTH; 24002 24003 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 24004 24005 while (pld_ptr < mp->b_wptr) { 24006 ipha_t *ipha; 24007 uint16_t offset_and_flags; 24008 uint16_t ip_len; 24009 int error; 24010 24011 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 24012 ipha = (ipha_t *)(hdr_ptr + wroff); 24013 ASSERT(OK_32PTR(ipha)); 24014 *ipha = *ipha_orig; 24015 24016 if (ip_data_end - offset > len) { 24017 offset_and_flags = IPH_MF; 24018 } else { 24019 /* 24020 * Last frag. Set len to the length of this last piece. 24021 */ 24022 len = ip_data_end - offset; 24023 /* A frag of a frag might have IPH_MF non-zero */ 24024 offset_and_flags = 24025 ntohs(ipha->ipha_fragment_offset_and_flags) & 24026 IPH_MF; 24027 } 24028 offset_and_flags |= (uint16_t)(offset >> 3); 24029 offset_and_flags |= (uint16_t)frag_flag; 24030 /* Store the offset and flags in the IP header. */ 24031 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24032 24033 /* Store the length in the IP header. */ 24034 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 24035 ipha->ipha_length = htons(ip_len); 24036 24037 /* 24038 * Set the IP header checksum. 
Note that mp is just 24039 * the header, so this is easy to pass to ip_csum. 24040 */ 24041 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24042 24043 DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *, 24044 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 24045 NULL, int, 0); 24046 24047 /* 24048 * Record offset and size of header and data of the next packet 24049 * in the multidata message. 24050 */ 24051 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 24052 PDESC_PLD_INIT(&pdi); 24053 i1 = MIN(mp->b_wptr - pld_ptr, len); 24054 ASSERT(i1 > 0); 24055 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 24056 if (i1 == len) { 24057 pld_ptr += len; 24058 } else { 24059 i1 = len - i1; 24060 mp = mp->b_cont; 24061 ASSERT(mp != NULL); 24062 ASSERT(MBLKL(mp) >= i1); 24063 /* 24064 * Attach the next payload message block to the 24065 * multidata message. 24066 */ 24067 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24068 goto pbuf_panic; 24069 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 24070 pld_ptr = mp->b_rptr + i1; 24071 } 24072 24073 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 24074 KM_NOSLEEP)) == NULL) { 24075 /* 24076 * Any failure other than ENOMEM indicates that we 24077 * have passed in invalid pdesc info or parameters 24078 * to mmd_addpdesc, which must not happen. 24079 * 24080 * EINVAL is a result of failure on boundary checks 24081 * against the pdesc info contents. It should not 24082 * happen, and we panic because either there's 24083 * horrible heap corruption, and/or programming 24084 * mistake. 
24085 */ 24086 if (error != ENOMEM) { 24087 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 24088 "pdesc logic error detected for " 24089 "mmd %p pinfo %p (%d)\n", 24090 (void *)mmd, (void *)&pdi, error); 24091 /* NOTREACHED */ 24092 } 24093 IP_STAT(ipst, ip_frag_mdt_addpdescfail); 24094 /* Free unattached payload message blocks as well */ 24095 md_mp->b_cont = mp->b_cont; 24096 goto free_mmd; 24097 } 24098 24099 /* Advance fragment offset. */ 24100 offset += len; 24101 24102 /* Advance to location for next header in the buffer. */ 24103 hdr_ptr += hdr_chunk_len; 24104 24105 /* Did we reach the next payload message block? */ 24106 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 24107 mp = mp->b_cont; 24108 /* 24109 * Attach the next message block with payload 24110 * data to the multidata message. 24111 */ 24112 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 24113 goto pbuf_panic; 24114 pld_ptr = mp->b_rptr; 24115 } 24116 } 24117 24118 ASSERT(hdr_mp->b_wptr == hdr_ptr); 24119 ASSERT(mp->b_wptr == pld_ptr); 24120 24121 /* Update IP statistics */ 24122 IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts); 24123 24124 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); 24125 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); 24126 24127 len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH; 24128 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); 24129 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len); 24130 24131 if (pkt_type == OB_PKT) { 24132 ire->ire_ob_pkt_count += pkts; 24133 if (ire->ire_ipif != NULL) 24134 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 24135 } else { 24136 /* The type is IB_PKT in the forwarding path. 
*/ 24137 ire->ire_ib_pkt_count += pkts; 24138 ASSERT(!IRE_IS_LOCAL(ire)); 24139 if (ire->ire_type & IRE_BROADCAST) { 24140 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 24141 } else { 24142 UPDATE_MIB(ill->ill_ip_mib, 24143 ipIfStatsHCOutForwDatagrams, pkts); 24144 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 24145 } 24146 } 24147 ire->ire_last_used_time = lbolt; 24148 /* Send it down */ 24149 putnext(ire->ire_stq, md_mp); 24150 return; 24151 24152 pbuf_panic: 24153 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 24154 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 24155 pbuf_idx); 24156 /* NOTREACHED */ 24157 } 24158 24159 /* 24160 * Outbound IP fragmentation routine. 24161 * 24162 * NOTE : This routine does not ire_refrele the ire that is passed in 24163 * as the argument. 24164 */ 24165 static void 24166 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 24167 uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst) 24168 { 24169 int i1; 24170 mblk_t *ll_hdr_mp; 24171 int ll_hdr_len; 24172 int hdr_len; 24173 mblk_t *hdr_mp; 24174 ipha_t *ipha; 24175 int ip_data_end; 24176 int len; 24177 mblk_t *mp = mp_orig, *mp1; 24178 int offset; 24179 queue_t *q; 24180 uint32_t v_hlen_tos_len; 24181 mblk_t *first_mp; 24182 boolean_t mctl_present; 24183 ill_t *ill; 24184 ill_t *out_ill; 24185 mblk_t *xmit_mp; 24186 mblk_t *carve_mp; 24187 ire_t *ire1 = NULL; 24188 ire_t *save_ire = NULL; 24189 mblk_t *next_mp = NULL; 24190 boolean_t last_frag = B_FALSE; 24191 boolean_t multirt_send = B_FALSE; 24192 ire_t *first_ire = NULL; 24193 irb_t *irb = NULL; 24194 mib2_ipIfStatsEntry_t *mibptr = NULL; 24195 24196 ill = ire_to_ill(ire); 24197 mibptr = (ill != NULL) ? 
ill->ill_ip_mib : &ipst->ips_ip_mib; 24198 24199 BUMP_MIB(mibptr, ipIfStatsOutFragReqds); 24200 24201 if (max_frag == 0) { 24202 ip1dbg(("ip_wput_frag: ire frag size is 0" 24203 " - dropping packet\n")); 24204 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24205 freemsg(mp); 24206 return; 24207 } 24208 24209 /* 24210 * IPsec does not allow hw accelerated packets to be fragmented 24211 * This check is made in ip_wput_ipsec_out prior to coming here 24212 * via ip_wput_ire_fragmentit. 24213 * 24214 * If at this point we have an ire whose ARP request has not 24215 * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger 24216 * sending of ARP query and change ire's state to ND_INCOMPLETE. 24217 * This packet and all fragmentable packets for this ire will 24218 * continue to get dropped while ire_nce->nce_state remains in 24219 * ND_INCOMPLETE. Post-ARP resolution, after ire's nce_state changes to 24220 * ND_REACHABLE, all subsquent large packets for this ire will 24221 * get fragemented and sent out by this function. 24222 */ 24223 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 24224 /* If nce_state is ND_INITIAL, trigger ARP query */ 24225 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 24226 ip1dbg(("ip_wput_frag: mac address for ire is unresolved" 24227 " - dropping packet\n")); 24228 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24229 freemsg(mp); 24230 return; 24231 } 24232 24233 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 24234 "ip_wput_frag_start:"); 24235 24236 if (mp->b_datap->db_type == M_CTL) { 24237 first_mp = mp; 24238 mp_orig = mp = mp->b_cont; 24239 mctl_present = B_TRUE; 24240 } else { 24241 first_mp = mp; 24242 mctl_present = B_FALSE; 24243 } 24244 24245 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 24246 ipha = (ipha_t *)mp->b_rptr; 24247 24248 /* 24249 * If the Don't Fragment flag is on, generate an ICMP destination 24250 * unreachable, fragmentation needed. 
24251 */ 24252 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 24253 if (offset & IPH_DF) { 24254 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24255 if (is_system_labeled()) { 24256 max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag, 24257 ire->ire_max_frag - max_frag, AF_INET); 24258 } 24259 /* 24260 * Need to compute hdr checksum if called from ip_wput_ire. 24261 * Note that ip_rput_forward verifies the checksum before 24262 * calling this routine so in that case this is a noop. 24263 */ 24264 ipha->ipha_hdr_checksum = 0; 24265 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24266 icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid, 24267 ipst); 24268 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24269 "ip_wput_frag_end:(%S)", 24270 "don't fragment"); 24271 return; 24272 } 24273 /* 24274 * Labeled systems adjust max_frag if they add a label 24275 * to send the correct path mtu. We need the real mtu since we 24276 * are fragmenting the packet after label adjustment. 24277 */ 24278 if (is_system_labeled()) 24279 max_frag = ire->ire_max_frag; 24280 if (mctl_present) 24281 freeb(first_mp); 24282 /* 24283 * Establish the starting offset. May not be zero if we are fragging 24284 * a fragment that is being forwarded. 24285 */ 24286 offset = offset & IPH_OFFSET; 24287 24288 /* TODO why is this test needed? */ 24289 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 24290 if (((max_frag - LENGTH) & ~7) < 8) { 24291 /* TODO: notify ulp somehow */ 24292 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24293 freemsg(mp); 24294 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24295 "ip_wput_frag_end:(%S)", 24296 "len < 8"); 24297 return; 24298 } 24299 24300 hdr_len = (V_HLEN & 0xF) << 2; 24301 24302 ipha->ipha_hdr_checksum = 0; 24303 24304 /* 24305 * Establish the number of bytes maximum per frag, after putting 24306 * in the header. 24307 */ 24308 len = (max_frag - hdr_len) & ~7; 24309 24310 /* Check if we can use MDT to send out the frags. 
*/ 24311 ASSERT(!IRE_IS_LOCAL(ire)); 24312 if (hdr_len == IP_SIMPLE_HDR_LENGTH && 24313 ipst->ips_ip_multidata_outbound && 24314 !(ire->ire_flags & RTF_MULTIRT) && 24315 !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && 24316 ill != NULL && ILL_MDT_CAPABLE(ill) && 24317 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 24318 ASSERT(ill->ill_mdt_capab != NULL); 24319 if (!ill->ill_mdt_capab->ill_mdt_on) { 24320 /* 24321 * If MDT has been previously turned off in the past, 24322 * and we currently can do MDT (due to IPQoS policy 24323 * removal, etc.) then enable it for this interface. 24324 */ 24325 ill->ill_mdt_capab->ill_mdt_on = 1; 24326 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 24327 ill->ill_name)); 24328 } 24329 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 24330 offset); 24331 return; 24332 } 24333 24334 /* Get a copy of the header for the trailing frags */ 24335 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst); 24336 if (!hdr_mp) { 24337 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24338 freemsg(mp); 24339 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24340 "ip_wput_frag_end:(%S)", 24341 "couldn't copy hdr"); 24342 return; 24343 } 24344 if (DB_CRED(mp) != NULL) 24345 mblk_setcred(hdr_mp, DB_CRED(mp)); 24346 24347 /* Store the starting offset, with the MoreFrags flag. */ 24348 i1 = offset | IPH_MF | frag_flag; 24349 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 24350 24351 /* Establish the ending byte offset, based on the starting offset. */ 24352 offset <<= 3; 24353 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 24354 24355 /* Store the length of the first fragment in the IP header. */ 24356 i1 = len + hdr_len; 24357 ASSERT(i1 <= IP_MAXPACKET); 24358 ipha->ipha_length = htons((uint16_t)i1); 24359 24360 /* 24361 * Compute the IP header checksum for the first frag. We have to 24362 * watch out that we stop at the end of the header. 
24363 */ 24364 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24365 24366 /* 24367 * Now carve off the first frag. Note that this will include the 24368 * original IP header. 24369 */ 24370 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 24371 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24372 freeb(hdr_mp); 24373 freemsg(mp_orig); 24374 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24375 "ip_wput_frag_end:(%S)", 24376 "couldn't carve first"); 24377 return; 24378 } 24379 24380 /* 24381 * Multirouting case. Each fragment is replicated 24382 * via all non-condemned RTF_MULTIRT routes 24383 * currently resolved. 24384 * We ensure that first_ire is the first RTF_MULTIRT 24385 * ire in the bucket. 24386 */ 24387 if (ire->ire_flags & RTF_MULTIRT) { 24388 irb = ire->ire_bucket; 24389 ASSERT(irb != NULL); 24390 24391 multirt_send = B_TRUE; 24392 24393 /* Make sure we do not omit any multiroute ire. */ 24394 IRB_REFHOLD(irb); 24395 for (first_ire = irb->irb_ire; 24396 first_ire != NULL; 24397 first_ire = first_ire->ire_next) { 24398 if ((first_ire->ire_flags & RTF_MULTIRT) && 24399 (first_ire->ire_addr == ire->ire_addr) && 24400 !(first_ire->ire_marks & 24401 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 24402 break; 24403 } 24404 } 24405 24406 if (first_ire != NULL) { 24407 if (first_ire != ire) { 24408 IRE_REFHOLD(first_ire); 24409 /* 24410 * Do not release the ire passed in 24411 * as the argument. 24412 */ 24413 ire = first_ire; 24414 } else { 24415 first_ire = NULL; 24416 } 24417 } 24418 IRB_REFRELE(irb); 24419 24420 /* 24421 * Save the first ire; we will need to restore it 24422 * for the trailing frags. 24423 * We REFHOLD save_ire, as each iterated ire will be 24424 * REFRELEd. 24425 */ 24426 save_ire = ire; 24427 IRE_REFHOLD(save_ire); 24428 } 24429 24430 /* 24431 * First fragment emission loop. 24432 * In most cases, the emission loop below is entered only 24433 * once. 
Only in the case where the ire holds the RTF_MULTIRT 24434 * flag, do we loop to process all RTF_MULTIRT ires in the 24435 * bucket, and send the fragment through all crossed 24436 * RTF_MULTIRT routes. 24437 */ 24438 do { 24439 if (ire->ire_flags & RTF_MULTIRT) { 24440 /* 24441 * We are in a multiple send case, need to get 24442 * the next ire and make a copy of the packet. 24443 * ire1 holds here the next ire to process in the 24444 * bucket. If multirouting is expected, 24445 * any non-RTF_MULTIRT ire that has the 24446 * right destination address is ignored. 24447 * 24448 * We have to take into account the MTU of 24449 * each walked ire. max_frag is set by the 24450 * the caller and generally refers to 24451 * the primary ire entry. Here we ensure that 24452 * no route with a lower MTU will be used, as 24453 * fragments are carved once for all ires, 24454 * then replicated. 24455 */ 24456 ASSERT(irb != NULL); 24457 IRB_REFHOLD(irb); 24458 for (ire1 = ire->ire_next; 24459 ire1 != NULL; 24460 ire1 = ire1->ire_next) { 24461 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24462 continue; 24463 if (ire1->ire_addr != ire->ire_addr) 24464 continue; 24465 if (ire1->ire_marks & 24466 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24467 continue; 24468 /* 24469 * Ensure we do not exceed the MTU 24470 * of the next route. 24471 */ 24472 if (ire1->ire_max_frag < max_frag) { 24473 ip_multirt_bad_mtu(ire1, max_frag); 24474 continue; 24475 } 24476 24477 /* Got one. */ 24478 IRE_REFHOLD(ire1); 24479 break; 24480 } 24481 IRB_REFRELE(irb); 24482 24483 if (ire1 != NULL) { 24484 next_mp = copyb(mp); 24485 if ((next_mp == NULL) || 24486 ((mp->b_cont != NULL) && 24487 ((next_mp->b_cont = 24488 dupmsg(mp->b_cont)) == NULL))) { 24489 freemsg(next_mp); 24490 next_mp = NULL; 24491 ire_refrele(ire1); 24492 ire1 = NULL; 24493 } 24494 } 24495 24496 /* Last multiroute ire; don't loop anymore. 
*/ 24497 if (ire1 == NULL) { 24498 multirt_send = B_FALSE; 24499 } 24500 } 24501 24502 ll_hdr_len = 0; 24503 LOCK_IRE_FP_MP(ire); 24504 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24505 if (ll_hdr_mp != NULL) { 24506 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24507 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 24508 } else { 24509 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24510 } 24511 24512 /* If there is a transmit header, get a copy for this frag. */ 24513 /* 24514 * TODO: should check db_ref before calling ip_carve_mp since 24515 * it might give us a dup. 24516 */ 24517 if (!ll_hdr_mp) { 24518 /* No xmit header. */ 24519 xmit_mp = mp; 24520 24521 /* We have a link-layer header that can fit in our mblk. */ 24522 } else if (mp->b_datap->db_ref == 1 && 24523 ll_hdr_len != 0 && 24524 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24525 /* M_DATA fastpath */ 24526 mp->b_rptr -= ll_hdr_len; 24527 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 24528 xmit_mp = mp; 24529 24530 /* Corner case if copyb has failed */ 24531 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 24532 UNLOCK_IRE_FP_MP(ire); 24533 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24534 freeb(hdr_mp); 24535 freemsg(mp); 24536 freemsg(mp_orig); 24537 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24538 "ip_wput_frag_end:(%S)", 24539 "discard"); 24540 24541 if (multirt_send) { 24542 ASSERT(ire1); 24543 ASSERT(next_mp); 24544 24545 freemsg(next_mp); 24546 ire_refrele(ire1); 24547 } 24548 if (save_ire != NULL) 24549 IRE_REFRELE(save_ire); 24550 24551 if (first_ire != NULL) 24552 ire_refrele(first_ire); 24553 return; 24554 24555 /* 24556 * Case of res_mp OR the fastpath mp can't fit 24557 * in the mblk 24558 */ 24559 } else { 24560 xmit_mp->b_cont = mp; 24561 if (DB_CRED(mp) != NULL) 24562 mblk_setcred(xmit_mp, DB_CRED(mp)); 24563 /* 24564 * Get priority marking, if any. 24565 * We propagate the CoS marking from the 24566 * original packet that went to QoS processing 24567 * in ip_wput_ire to the newly carved mp. 
24568 */ 24569 if (DB_TYPE(xmit_mp) == M_DATA) 24570 xmit_mp->b_band = mp->b_band; 24571 } 24572 UNLOCK_IRE_FP_MP(ire); 24573 24574 q = ire->ire_stq; 24575 out_ill = (ill_t *)q->q_ptr; 24576 24577 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24578 24579 DTRACE_PROBE4(ip4__physical__out__start, 24580 ill_t *, NULL, ill_t *, out_ill, 24581 ipha_t *, ipha, mblk_t *, xmit_mp); 24582 24583 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24584 ipst->ips_ipv4firewall_physical_out, 24585 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24586 24587 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp); 24588 24589 if (xmit_mp != NULL) { 24590 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL, 24591 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 24592 ipha_t *, ipha, ip6_t *, NULL, int, 0); 24593 24594 putnext(q, xmit_mp); 24595 24596 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); 24597 UPDATE_MIB(out_ill->ill_ip_mib, 24598 ipIfStatsHCOutOctets, i1); 24599 24600 if (pkt_type != OB_PKT) { 24601 /* 24602 * Update the packet count and MIB stats 24603 * of trailing RTF_MULTIRT ires. 24604 */ 24605 UPDATE_OB_PKT_COUNT(ire); 24606 BUMP_MIB(out_ill->ill_ip_mib, 24607 ipIfStatsOutFragReqds); 24608 } 24609 } 24610 24611 if (multirt_send) { 24612 /* 24613 * We are in a multiple send case; look for 24614 * the next ire and re-enter the loop. 
24615 */ 24616 ASSERT(ire1); 24617 ASSERT(next_mp); 24618 /* REFRELE the current ire before looping */ 24619 ire_refrele(ire); 24620 ire = ire1; 24621 ire1 = NULL; 24622 mp = next_mp; 24623 next_mp = NULL; 24624 } 24625 } while (multirt_send); 24626 24627 ASSERT(ire1 == NULL); 24628 24629 /* Restore the original ire; we need it for the trailing frags */ 24630 if (save_ire != NULL) { 24631 /* REFRELE the last iterated ire */ 24632 ire_refrele(ire); 24633 /* save_ire has been REFHOLDed */ 24634 ire = save_ire; 24635 save_ire = NULL; 24636 q = ire->ire_stq; 24637 } 24638 24639 if (pkt_type == OB_PKT) { 24640 UPDATE_OB_PKT_COUNT(ire); 24641 } else { 24642 out_ill = (ill_t *)q->q_ptr; 24643 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 24644 UPDATE_IB_PKT_COUNT(ire); 24645 } 24646 24647 /* Advance the offset to the second frag starting point. */ 24648 offset += len; 24649 /* 24650 * Update hdr_len from the copied header - there might be less options 24651 * in the later fragments. 24652 */ 24653 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 24654 /* Loop until done. */ 24655 for (;;) { 24656 uint16_t offset_and_flags; 24657 uint16_t ip_len; 24658 24659 if (ip_data_end - offset > len) { 24660 /* 24661 * Carve off the appropriate amount from the original 24662 * datagram. 24663 */ 24664 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24665 mp = NULL; 24666 break; 24667 } 24668 /* 24669 * More frags after this one. Get another copy 24670 * of the header. 24671 */ 24672 if (carve_mp->b_datap->db_ref == 1 && 24673 hdr_mp->b_wptr - hdr_mp->b_rptr < 24674 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24675 /* Inline IP header */ 24676 carve_mp->b_rptr -= hdr_mp->b_wptr - 24677 hdr_mp->b_rptr; 24678 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24679 hdr_mp->b_wptr - hdr_mp->b_rptr); 24680 mp = carve_mp; 24681 } else { 24682 if (!(mp = copyb(hdr_mp))) { 24683 freemsg(carve_mp); 24684 break; 24685 } 24686 /* Get priority marking, if any. 
*/ 24687 mp->b_band = carve_mp->b_band; 24688 mp->b_cont = carve_mp; 24689 } 24690 ipha = (ipha_t *)mp->b_rptr; 24691 offset_and_flags = IPH_MF; 24692 } else { 24693 /* 24694 * Last frag. Consume the header. Set len to 24695 * the length of this last piece. 24696 */ 24697 len = ip_data_end - offset; 24698 24699 /* 24700 * Carve off the appropriate amount from the original 24701 * datagram. 24702 */ 24703 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 24704 mp = NULL; 24705 break; 24706 } 24707 if (carve_mp->b_datap->db_ref == 1 && 24708 hdr_mp->b_wptr - hdr_mp->b_rptr < 24709 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 24710 /* Inline IP header */ 24711 carve_mp->b_rptr -= hdr_mp->b_wptr - 24712 hdr_mp->b_rptr; 24713 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 24714 hdr_mp->b_wptr - hdr_mp->b_rptr); 24715 mp = carve_mp; 24716 freeb(hdr_mp); 24717 hdr_mp = mp; 24718 } else { 24719 mp = hdr_mp; 24720 /* Get priority marking, if any. */ 24721 mp->b_band = carve_mp->b_band; 24722 mp->b_cont = carve_mp; 24723 } 24724 ipha = (ipha_t *)mp->b_rptr; 24725 /* A frag of a frag might have IPH_MF non-zero */ 24726 offset_and_flags = 24727 ntohs(ipha->ipha_fragment_offset_and_flags) & 24728 IPH_MF; 24729 } 24730 offset_and_flags |= (uint16_t)(offset >> 3); 24731 offset_and_flags |= (uint16_t)frag_flag; 24732 /* Store the offset and flags in the IP header. */ 24733 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 24734 24735 /* Store the length in the IP header. */ 24736 ip_len = (uint16_t)(len + hdr_len); 24737 ipha->ipha_length = htons(ip_len); 24738 24739 /* 24740 * Set the IP header checksum. Note that mp is just 24741 * the header, so this is easy to pass to ip_csum. 24742 */ 24743 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 24744 24745 /* Attach a transmit header, if any, and ship it. 
*/ 24746 if (pkt_type == OB_PKT) { 24747 UPDATE_OB_PKT_COUNT(ire); 24748 } else { 24749 out_ill = (ill_t *)q->q_ptr; 24750 BUMP_MIB(out_ill->ill_ip_mib, 24751 ipIfStatsHCOutForwDatagrams); 24752 UPDATE_IB_PKT_COUNT(ire); 24753 } 24754 24755 if (ire->ire_flags & RTF_MULTIRT) { 24756 irb = ire->ire_bucket; 24757 ASSERT(irb != NULL); 24758 24759 multirt_send = B_TRUE; 24760 24761 /* 24762 * Save the original ire; we will need to restore it 24763 * for the tailing frags. 24764 */ 24765 save_ire = ire; 24766 IRE_REFHOLD(save_ire); 24767 } 24768 /* 24769 * Emission loop for this fragment, similar 24770 * to what is done for the first fragment. 24771 */ 24772 do { 24773 if (multirt_send) { 24774 /* 24775 * We are in a multiple send case, need to get 24776 * the next ire and make a copy of the packet. 24777 */ 24778 ASSERT(irb != NULL); 24779 IRB_REFHOLD(irb); 24780 for (ire1 = ire->ire_next; 24781 ire1 != NULL; 24782 ire1 = ire1->ire_next) { 24783 if (!(ire1->ire_flags & RTF_MULTIRT)) 24784 continue; 24785 if (ire1->ire_addr != ire->ire_addr) 24786 continue; 24787 if (ire1->ire_marks & 24788 (IRE_MARK_CONDEMNED| 24789 IRE_MARK_HIDDEN)) { 24790 continue; 24791 } 24792 /* 24793 * Ensure we do not exceed the MTU 24794 * of the next route. 24795 */ 24796 if (ire1->ire_max_frag < max_frag) { 24797 ip_multirt_bad_mtu(ire1, 24798 max_frag); 24799 continue; 24800 } 24801 24802 /* Got one. */ 24803 IRE_REFHOLD(ire1); 24804 break; 24805 } 24806 IRB_REFRELE(irb); 24807 24808 if (ire1 != NULL) { 24809 next_mp = copyb(mp); 24810 if ((next_mp == NULL) || 24811 ((mp->b_cont != NULL) && 24812 ((next_mp->b_cont = 24813 dupmsg(mp->b_cont)) == NULL))) { 24814 freemsg(next_mp); 24815 next_mp = NULL; 24816 ire_refrele(ire1); 24817 ire1 = NULL; 24818 } 24819 } 24820 24821 /* Last multiroute ire; don't loop anymore. 
*/ 24822 if (ire1 == NULL) { 24823 multirt_send = B_FALSE; 24824 } 24825 } 24826 24827 /* Update transmit header */ 24828 ll_hdr_len = 0; 24829 LOCK_IRE_FP_MP(ire); 24830 ll_hdr_mp = ire->ire_nce->nce_fp_mp; 24831 if (ll_hdr_mp != NULL) { 24832 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 24833 ll_hdr_len = MBLKL(ll_hdr_mp); 24834 } else { 24835 ll_hdr_mp = ire->ire_nce->nce_res_mp; 24836 } 24837 24838 if (!ll_hdr_mp) { 24839 xmit_mp = mp; 24840 24841 /* 24842 * We have link-layer header that can fit in 24843 * our mblk. 24844 */ 24845 } else if (mp->b_datap->db_ref == 1 && 24846 ll_hdr_len != 0 && 24847 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 24848 /* M_DATA fastpath */ 24849 mp->b_rptr -= ll_hdr_len; 24850 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 24851 ll_hdr_len); 24852 xmit_mp = mp; 24853 24854 /* 24855 * Case of res_mp OR the fastpath mp can't fit 24856 * in the mblk 24857 */ 24858 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 24859 xmit_mp->b_cont = mp; 24860 if (DB_CRED(mp) != NULL) 24861 mblk_setcred(xmit_mp, DB_CRED(mp)); 24862 /* Get priority marking, if any. */ 24863 if (DB_TYPE(xmit_mp) == M_DATA) 24864 xmit_mp->b_band = mp->b_band; 24865 24866 /* Corner case if copyb failed */ 24867 } else { 24868 /* 24869 * Exit both the replication and 24870 * fragmentation loops. 
24871 */ 24872 UNLOCK_IRE_FP_MP(ire); 24873 goto drop_pkt; 24874 } 24875 UNLOCK_IRE_FP_MP(ire); 24876 24877 mp1 = mp; 24878 out_ill = (ill_t *)q->q_ptr; 24879 24880 BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); 24881 24882 DTRACE_PROBE4(ip4__physical__out__start, 24883 ill_t *, NULL, ill_t *, out_ill, 24884 ipha_t *, ipha, mblk_t *, xmit_mp); 24885 24886 FW_HOOKS(ipst->ips_ip4_physical_out_event, 24887 ipst->ips_ipv4firewall_physical_out, 24888 NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); 24889 24890 DTRACE_PROBE1(ip4__physical__out__end, 24891 mblk_t *, xmit_mp); 24892 24893 if (mp != mp1 && hdr_mp == mp1) 24894 hdr_mp = mp; 24895 if (mp != mp1 && mp_orig == mp1) 24896 mp_orig = mp; 24897 24898 if (xmit_mp != NULL) { 24899 DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, 24900 NULL, void_ip_t *, ipha, 24901 __dtrace_ipsr_ill_t *, out_ill, ipha_t *, 24902 ipha, ip6_t *, NULL, int, 0); 24903 24904 putnext(q, xmit_mp); 24905 24906 BUMP_MIB(out_ill->ill_ip_mib, 24907 ipIfStatsHCOutTransmits); 24908 UPDATE_MIB(out_ill->ill_ip_mib, 24909 ipIfStatsHCOutOctets, ip_len); 24910 24911 if (pkt_type != OB_PKT) { 24912 /* 24913 * Update the packet count of trailing 24914 * RTF_MULTIRT ires. 24915 */ 24916 UPDATE_OB_PKT_COUNT(ire); 24917 } 24918 } 24919 24920 /* All done if we just consumed the hdr_mp. */ 24921 if (mp == hdr_mp) { 24922 last_frag = B_TRUE; 24923 BUMP_MIB(out_ill->ill_ip_mib, 24924 ipIfStatsOutFragOKs); 24925 } 24926 24927 if (multirt_send) { 24928 /* 24929 * We are in a multiple send case; look for 24930 * the next ire and re-enter the loop. 
24931 */ 24932 ASSERT(ire1); 24933 ASSERT(next_mp); 24934 /* REFRELE the current ire before looping */ 24935 ire_refrele(ire); 24936 ire = ire1; 24937 ire1 = NULL; 24938 q = ire->ire_stq; 24939 mp = next_mp; 24940 next_mp = NULL; 24941 } 24942 } while (multirt_send); 24943 /* 24944 * Restore the original ire; we need it for the 24945 * trailing frags 24946 */ 24947 if (save_ire != NULL) { 24948 ASSERT(ire1 == NULL); 24949 /* REFRELE the last iterated ire */ 24950 ire_refrele(ire); 24951 /* save_ire has been REFHOLDed */ 24952 ire = save_ire; 24953 q = ire->ire_stq; 24954 save_ire = NULL; 24955 } 24956 24957 if (last_frag) { 24958 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24959 "ip_wput_frag_end:(%S)", 24960 "consumed hdr_mp"); 24961 24962 if (first_ire != NULL) 24963 ire_refrele(first_ire); 24964 return; 24965 } 24966 /* Otherwise, advance and loop. */ 24967 offset += len; 24968 } 24969 24970 drop_pkt: 24971 /* Clean up following allocation failure. */ 24972 BUMP_MIB(mibptr, ipIfStatsOutFragFails); 24973 freemsg(mp); 24974 if (mp != hdr_mp) 24975 freeb(hdr_mp); 24976 if (mp != mp_orig) 24977 freemsg(mp_orig); 24978 24979 if (save_ire != NULL) 24980 IRE_REFRELE(save_ire); 24981 if (first_ire != NULL) 24982 ire_refrele(first_ire); 24983 24984 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 24985 "ip_wput_frag_end:(%S)", 24986 "end--alloc failure"); 24987 } 24988 24989 /* 24990 * Copy the header plus those options which have the copy bit set 24991 */ 24992 static mblk_t * 24993 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst) 24994 { 24995 mblk_t *mp; 24996 uchar_t *up; 24997 24998 /* 24999 * Quick check if we need to look for options without the copy bit 25000 * set 25001 */ 25002 mp = allocb(ipst->ips_ip_wroff_extra + hdr_len, BPRI_HI); 25003 if (!mp) 25004 return (mp); 25005 mp->b_rptr += ipst->ips_ip_wroff_extra; 25006 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 25007 bcopy(rptr, mp->b_rptr, hdr_len); 25008 mp->b_wptr += hdr_len + 
ipst->ips_ip_wroff_extra; 25009 return (mp); 25010 } 25011 up = mp->b_rptr; 25012 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 25013 up += IP_SIMPLE_HDR_LENGTH; 25014 rptr += IP_SIMPLE_HDR_LENGTH; 25015 hdr_len -= IP_SIMPLE_HDR_LENGTH; 25016 while (hdr_len > 0) { 25017 uint32_t optval; 25018 uint32_t optlen; 25019 25020 optval = *rptr; 25021 if (optval == IPOPT_EOL) 25022 break; 25023 if (optval == IPOPT_NOP) 25024 optlen = 1; 25025 else 25026 optlen = rptr[1]; 25027 if (optval & IPOPT_COPY) { 25028 bcopy(rptr, up, optlen); 25029 up += optlen; 25030 } 25031 rptr += optlen; 25032 hdr_len -= optlen; 25033 } 25034 /* 25035 * Make sure that we drop an even number of words by filling 25036 * with EOL to the next word boundary. 25037 */ 25038 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH); 25039 hdr_len & 0x3; hdr_len++) 25040 *up++ = IPOPT_EOL; 25041 mp->b_wptr = up; 25042 /* Update header length */ 25043 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2)); 25044 return (mp); 25045 } 25046 25047 /* 25048 * Delivery to local recipients including fanout to multiple recipients. 25049 * Does not do checksumming of UDP/TCP. 25050 * Note: q should be the read side queue for either the ill or conn. 25051 * Note: rq should be the read side q for the lower (ill) stream. 25052 * We don't send packets to IPPF processing, thus the last argument 25053 * to all the fanout calls are B_FALSE. 25054 */ 25055 void 25056 ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, 25057 int fanout_flags, zoneid_t zoneid) 25058 { 25059 uint32_t protocol; 25060 mblk_t *first_mp; 25061 boolean_t mctl_present; 25062 int ire_type; 25063 #define rptr ((uchar_t *)ipha) 25064 ip_stack_t *ipst = ill->ill_ipst; 25065 25066 TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, 25067 "ip_wput_local_start: q %p", q); 25068 25069 if (ire != NULL) { 25070 ire_type = ire->ire_type; 25071 } else { 25072 /* 25073 * Only ip_multicast_loopback() calls us with a NULL ire. 
If the 25074 * packet is not multicast, we can't tell the ire type. 25075 */ 25076 ASSERT(CLASSD(ipha->ipha_dst)); 25077 ire_type = IRE_BROADCAST; 25078 } 25079 25080 first_mp = mp; 25081 if (first_mp->b_datap->db_type == M_CTL) { 25082 ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; 25083 if (!io->ipsec_out_secure) { 25084 /* 25085 * This ipsec_out_t was allocated in ip_wput 25086 * for multicast packets to store the ill_index. 25087 * As this is being delivered locally, we don't 25088 * need this anymore. 25089 */ 25090 mp = first_mp->b_cont; 25091 freeb(first_mp); 25092 first_mp = mp; 25093 mctl_present = B_FALSE; 25094 } else { 25095 /* 25096 * Convert IPSEC_OUT to IPSEC_IN, preserving all 25097 * security properties for the looped-back packet. 25098 */ 25099 mctl_present = B_TRUE; 25100 mp = first_mp->b_cont; 25101 ASSERT(mp != NULL); 25102 ipsec_out_to_in(first_mp); 25103 } 25104 } else { 25105 mctl_present = B_FALSE; 25106 } 25107 25108 DTRACE_PROBE4(ip4__loopback__in__start, 25109 ill_t *, ill, ill_t *, NULL, 25110 ipha_t *, ipha, mblk_t *, first_mp); 25111 25112 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 25113 ipst->ips_ipv4firewall_loopback_in, 25114 ill, NULL, ipha, first_mp, mp, 0, ipst); 25115 25116 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp); 25117 25118 if (first_mp == NULL) 25119 return; 25120 25121 DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, 25122 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 25123 int, 1); 25124 25125 ipst->ips_loopback_packets++; 25126 25127 ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", 25128 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); 25129 if (!IS_SIMPLE_IPH(ipha)) { 25130 ip_wput_local_options(ipha, ipst); 25131 } 25132 25133 protocol = ipha->ipha_protocol; 25134 switch (protocol) { 25135 case IPPROTO_ICMP: { 25136 ire_t *ire_zone; 25137 ilm_t *ilm; 25138 mblk_t *mp1; 25139 zoneid_t last_zoneid; 25140 25141 if (CLASSD(ipha->ipha_dst) && 
!IS_LOOPBACK(ill)) { 25142 ASSERT(ire_type == IRE_BROADCAST); 25143 /* 25144 * In the multicast case, applications may have joined 25145 * the group from different zones, so we need to deliver 25146 * the packet to each of them. Loop through the 25147 * multicast memberships structures (ilm) on the receive 25148 * ill and send a copy of the packet up each matching 25149 * one. However, we don't do this for multicasts sent on 25150 * the loopback interface (PHYI_LOOPBACK flag set) as 25151 * they must stay in the sender's zone. 25152 * 25153 * ilm_add_v6() ensures that ilms in the same zone are 25154 * contiguous in the ill_ilm list. We use this property 25155 * to avoid sending duplicates needed when two 25156 * applications in the same zone join the same group on 25157 * different logical interfaces: we ignore the ilm if 25158 * it's zoneid is the same as the last matching one. 25159 * In addition, the sending of the packet for 25160 * ire_zoneid is delayed until all of the other ilms 25161 * have been exhausted. 25162 */ 25163 last_zoneid = -1; 25164 ILM_WALKER_HOLD(ill); 25165 for (ilm = ill->ill_ilm; ilm != NULL; 25166 ilm = ilm->ilm_next) { 25167 if ((ilm->ilm_flags & ILM_DELETED) || 25168 ipha->ipha_dst != ilm->ilm_addr || 25169 ilm->ilm_zoneid == last_zoneid || 25170 ilm->ilm_zoneid == zoneid || 25171 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 25172 continue; 25173 mp1 = ip_copymsg(first_mp); 25174 if (mp1 == NULL) 25175 continue; 25176 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25177 mctl_present, B_FALSE, ill, 25178 ilm->ilm_zoneid); 25179 last_zoneid = ilm->ilm_zoneid; 25180 } 25181 ILM_WALKER_RELE(ill); 25182 /* 25183 * Loopback case: the sending endpoint has 25184 * IP_MULTICAST_LOOP disabled, therefore we don't 25185 * dispatch the multicast packet to the sending zone. 
25186 */ 25187 if (fanout_flags & IP_FF_NO_MCAST_LOOP) { 25188 freemsg(first_mp); 25189 return; 25190 } 25191 } else if (ire_type == IRE_BROADCAST) { 25192 /* 25193 * In the broadcast case, there may be many zones 25194 * which need a copy of the packet delivered to them. 25195 * There is one IRE_BROADCAST per broadcast address 25196 * and per zone; we walk those using a helper function. 25197 * In addition, the sending of the packet for zoneid is 25198 * delayed until all of the other ires have been 25199 * processed. 25200 */ 25201 IRB_REFHOLD(ire->ire_bucket); 25202 ire_zone = NULL; 25203 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 25204 ire)) != NULL) { 25205 mp1 = ip_copymsg(first_mp); 25206 if (mp1 == NULL) 25207 continue; 25208 25209 UPDATE_IB_PKT_COUNT(ire_zone); 25210 ire_zone->ire_last_used_time = lbolt; 25211 icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, 25212 mctl_present, B_FALSE, ill, 25213 ire_zone->ire_zoneid); 25214 } 25215 IRB_REFRELE(ire->ire_bucket); 25216 } 25217 icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, 25218 0, mctl_present, B_FALSE, ill, zoneid); 25219 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25220 "ip_wput_local_end: q %p (%S)", 25221 q, "icmp"); 25222 return; 25223 } 25224 case IPPROTO_IGMP: 25225 if ((mp = igmp_input(q, mp, ill)) == NULL) { 25226 /* Bad packet - discarded by igmp_input */ 25227 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25228 "ip_wput_local_end: q %p (%S)", 25229 q, "igmp_input--bad packet"); 25230 if (mctl_present) 25231 freeb(first_mp); 25232 return; 25233 } 25234 /* 25235 * igmp_input() may have returned the pulled up message. 25236 * So first_mp and ipha need to be reinitialized. 
25237 */ 25238 ipha = (ipha_t *)mp->b_rptr; 25239 if (mctl_present) 25240 first_mp->b_cont = mp; 25241 else 25242 first_mp = mp; 25243 /* deliver to local raw users */ 25244 break; 25245 case IPPROTO_ENCAP: 25246 /* 25247 * This case is covered by either ip_fanout_proto, or by 25248 * the above security processing for self-tunneled packets. 25249 */ 25250 break; 25251 case IPPROTO_UDP: { 25252 uint16_t *up; 25253 uint32_t ports; 25254 25255 up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + 25256 UDP_PORTS_OFFSET); 25257 /* Force a 'valid' checksum. */ 25258 up[3] = 0; 25259 25260 ports = *(uint32_t *)up; 25261 ip_fanout_udp(q, first_mp, ill, ipha, ports, 25262 (ire_type == IRE_BROADCAST), 25263 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25264 IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE, 25265 ill, zoneid); 25266 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25267 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); 25268 return; 25269 } 25270 case IPPROTO_TCP: { 25271 25272 /* 25273 * For TCP, discard broadcast packets. 25274 */ 25275 if ((ushort_t)ire_type == IRE_BROADCAST) { 25276 freemsg(first_mp); 25277 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 25278 ip2dbg(("ip_wput_local: discard broadcast\n")); 25279 return; 25280 } 25281 25282 if (mp->b_datap->db_type == M_DATA) { 25283 /* 25284 * M_DATA mblk, so init mblk (chain) for no struio(). 
25285 */ 25286 mblk_t *mp1 = mp; 25287 25288 do { 25289 mp1->b_datap->db_struioflag = 0; 25290 } while ((mp1 = mp1->b_cont) != NULL); 25291 } 25292 ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) 25293 <= mp->b_wptr); 25294 ip_fanout_tcp(q, first_mp, ill, ipha, 25295 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25296 IP_FF_SYN_ADDIRE | IP_FF_IPINFO, 25297 mctl_present, B_FALSE, zoneid); 25298 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25299 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); 25300 return; 25301 } 25302 case IPPROTO_SCTP: 25303 { 25304 uint32_t ports; 25305 25306 bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); 25307 ip_fanout_sctp(first_mp, ill, ipha, ports, 25308 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | 25309 IP_FF_IPINFO, mctl_present, B_FALSE, zoneid); 25310 return; 25311 } 25312 25313 default: 25314 break; 25315 } 25316 /* 25317 * Find a client for some other protocol. We give 25318 * copies to multiple clients, if more than one is 25319 * bound. 25320 */ 25321 ip_fanout_proto(q, first_mp, ill, ipha, 25322 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, 25323 mctl_present, B_FALSE, ill, zoneid); 25324 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 25325 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); 25326 #undef rptr 25327 } 25328 25329 /* 25330 * Update any source route, record route, or timestamp options. 25331 * Check that we are at end of strict source route. 25332 * The options have been sanity checked by ip_wput_options(). 
25333 */ 25334 static void 25335 ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) 25336 { 25337 ipoptp_t opts; 25338 uchar_t *opt; 25339 uint8_t optval; 25340 uint8_t optlen; 25341 ipaddr_t dst; 25342 uint32_t ts; 25343 ire_t *ire; 25344 timestruc_t now; 25345 25346 ip2dbg(("ip_wput_local_options\n")); 25347 for (optval = ipoptp_first(&opts, ipha); 25348 optval != IPOPT_EOL; 25349 optval = ipoptp_next(&opts)) { 25350 opt = opts.ipoptp_cur; 25351 optlen = opts.ipoptp_len; 25352 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 25353 switch (optval) { 25354 uint32_t off; 25355 case IPOPT_SSRR: 25356 case IPOPT_LSRR: 25357 off = opt[IPOPT_OFFSET]; 25358 off--; 25359 if (optlen < IP_ADDR_LEN || 25360 off > optlen - IP_ADDR_LEN) { 25361 /* End of source route */ 25362 break; 25363 } 25364 /* 25365 * This will only happen if two consecutive entries 25366 * in the source route contains our address or if 25367 * it is a packet with a loose source route which 25368 * reaches us before consuming the whole source route 25369 */ 25370 ip1dbg(("ip_wput_local_options: not end of SR\n")); 25371 if (optval == IPOPT_SSRR) { 25372 return; 25373 } 25374 /* 25375 * Hack: instead of dropping the packet truncate the 25376 * source route to what has been used by filling the 25377 * rest with IPOPT_NOP. 
25378 */ 25379 opt[IPOPT_OLEN] = (uint8_t)off; 25380 while (off < optlen) { 25381 opt[off++] = IPOPT_NOP; 25382 } 25383 break; 25384 case IPOPT_RR: 25385 off = opt[IPOPT_OFFSET]; 25386 off--; 25387 if (optlen < IP_ADDR_LEN || 25388 off > optlen - IP_ADDR_LEN) { 25389 /* No more room - ignore */ 25390 ip1dbg(( 25391 "ip_wput_forward_options: end of RR\n")); 25392 break; 25393 } 25394 dst = htonl(INADDR_LOOPBACK); 25395 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25396 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25397 break; 25398 case IPOPT_TS: 25399 /* Insert timestamp if there is romm */ 25400 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25401 case IPOPT_TS_TSONLY: 25402 off = IPOPT_TS_TIMELEN; 25403 break; 25404 case IPOPT_TS_PRESPEC: 25405 case IPOPT_TS_PRESPEC_RFC791: 25406 /* Verify that the address matched */ 25407 off = opt[IPOPT_OFFSET] - 1; 25408 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 25409 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 25410 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 25411 ipst); 25412 if (ire == NULL) { 25413 /* Not for us */ 25414 break; 25415 } 25416 ire_refrele(ire); 25417 /* FALLTHRU */ 25418 case IPOPT_TS_TSANDADDR: 25419 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 25420 break; 25421 default: 25422 /* 25423 * ip_*put_options should have already 25424 * dropped this packet. 
25425 */ 25426 cmn_err(CE_PANIC, "ip_wput_local_options: " 25427 "unknown IT - bug in ip_wput_options?\n"); 25428 return; /* Keep "lint" happy */ 25429 } 25430 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 25431 /* Increase overflow counter */ 25432 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 25433 opt[IPOPT_POS_OV_FLG] = (uint8_t) 25434 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 25435 (off << 4); 25436 break; 25437 } 25438 off = opt[IPOPT_OFFSET] - 1; 25439 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25440 case IPOPT_TS_PRESPEC: 25441 case IPOPT_TS_PRESPEC_RFC791: 25442 case IPOPT_TS_TSANDADDR: 25443 dst = htonl(INADDR_LOOPBACK); 25444 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 25445 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 25446 /* FALLTHRU */ 25447 case IPOPT_TS_TSONLY: 25448 off = opt[IPOPT_OFFSET] - 1; 25449 /* Compute # of milliseconds since midnight */ 25450 gethrestime(&now); 25451 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 25452 now.tv_nsec / (NANOSEC / MILLISEC); 25453 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 25454 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 25455 break; 25456 } 25457 break; 25458 } 25459 } 25460 } 25461 25462 /* 25463 * Send out a multicast packet on interface ipif. 25464 * The sender does not have an conn. 25465 * Caller verifies that this isn't a PHYI_LOOPBACK. 25466 */ 25467 void 25468 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) 25469 { 25470 ipha_t *ipha; 25471 ire_t *ire; 25472 ipaddr_t dst; 25473 mblk_t *first_mp; 25474 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 25475 25476 /* igmp_sendpkt always allocates a ipsec_out_t */ 25477 ASSERT(mp->b_datap->db_type == M_CTL); 25478 ASSERT(!ipif->ipif_isv6); 25479 ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); 25480 25481 first_mp = mp; 25482 mp = first_mp->b_cont; 25483 ASSERT(mp->b_datap->db_type == M_DATA); 25484 ipha = (ipha_t *)mp->b_rptr; 25485 25486 /* 25487 * Find an IRE which matches the destination and the outgoing 25488 * queue (i.e. the outgoing interface.) 
25489 */ 25490 if (ipif->ipif_flags & IPIF_POINTOPOINT) 25491 dst = ipif->ipif_pp_dst_addr; 25492 else 25493 dst = ipha->ipha_dst; 25494 /* 25495 * The source address has already been initialized by the 25496 * caller and hence matching on ILL (MATCH_IRE_ILL) would 25497 * be sufficient rather than MATCH_IRE_IPIF. 25498 * 25499 * This function is used for sending IGMP packets. We need 25500 * to make sure that we send the packet out of the interface 25501 * (ipif->ipif_ill) where we joined the group. This is to 25502 * prevent from switches doing IGMP snooping to send us multicast 25503 * packets for a given group on the interface we have joined. 25504 * If we can't find an ire, igmp_sendpkt has already initialized 25505 * ipsec_out_attach_if so that this will not be load spread in 25506 * ip_newroute_ipif. 25507 */ 25508 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, 25509 MATCH_IRE_ILL, ipst); 25510 if (!ire) { 25511 /* 25512 * Mark this packet to make it be delivered to 25513 * ip_wput_ire after the new ire has been 25514 * created. 25515 */ 25516 mp->b_prev = NULL; 25517 mp->b_next = NULL; 25518 ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, 25519 zoneid, &zero_info); 25520 return; 25521 } 25522 25523 /* 25524 * Honor the RTF_SETSRC flag; this is the only case 25525 * where we force this addr whatever the current src addr is, 25526 * because this address is set by igmp_sendpkt(), and 25527 * cannot be specified by any user. 25528 */ 25529 if (ire->ire_flags & RTF_SETSRC) { 25530 ipha->ipha_src = ire->ire_src_addr; 25531 } 25532 25533 ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); 25534 } 25535 25536 /* 25537 * NOTE : This function does not ire_refrele the ire argument passed in. 25538 * 25539 * Copy the link layer header and do IPQoS if needed. Frees the mblk on 25540 * failure. The nce_fp_mp can vanish any time in the case of 25541 * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. 
Hence we have to hold 25542 * the ire_lock to access the nce_fp_mp in this case. 25543 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are 25544 * prepending a fastpath message IPQoS processing must precede it, we also set 25545 * the b_band of the fastpath message to that of the mblk returned by IPQoS 25546 * (IPQoS might have set the b_band for CoS marking). 25547 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing 25548 * must follow it so that IPQoS can mark the dl_priority field for CoS 25549 * marking, if needed. 25550 */ 25551 static mblk_t * 25552 ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, 25553 uint32_t ill_index, ipha_t **iphap) 25554 { 25555 uint_t hlen; 25556 ipha_t *ipha; 25557 mblk_t *mp1; 25558 boolean_t qos_done = B_FALSE; 25559 uchar_t *ll_hdr; 25560 ip_stack_t *ipst = ire->ire_ipst; 25561 25562 #define rptr ((uchar_t *)ipha) 25563 25564 ipha = (ipha_t *)mp->b_rptr; 25565 hlen = 0; 25566 LOCK_IRE_FP_MP(ire); 25567 if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { 25568 ASSERT(DB_TYPE(mp1) == M_DATA); 25569 /* Initiate IPPF processing */ 25570 if ((proc != 0) && IPP_ENABLED(proc, ipst)) { 25571 UNLOCK_IRE_FP_MP(ire); 25572 ip_process(proc, &mp, ill_index); 25573 if (mp == NULL) 25574 return (NULL); 25575 25576 ipha = (ipha_t *)mp->b_rptr; 25577 LOCK_IRE_FP_MP(ire); 25578 if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { 25579 qos_done = B_TRUE; 25580 goto no_fp_mp; 25581 } 25582 ASSERT(DB_TYPE(mp1) == M_DATA); 25583 } 25584 hlen = MBLKL(mp1); 25585 /* 25586 * Check if we have enough room to prepend fastpath 25587 * header 25588 */ 25589 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { 25590 ll_hdr = rptr - hlen; 25591 bcopy(mp1->b_rptr, ll_hdr, hlen); 25592 /* 25593 * Set the b_rptr to the start of the link layer 25594 * header 25595 */ 25596 mp->b_rptr = ll_hdr; 25597 mp1 = mp; 25598 } else { 25599 mp1 = copyb(mp1); 25600 if (mp1 == NULL) 25601 goto unlock_err; 25602 mp1->b_band = 
mp->b_band; 25603 mp1->b_cont = mp; 25604 /* 25605 * certain system generated traffic may not 25606 * have cred/label in ip header block. This 25607 * is true even for a labeled system. But for 25608 * labeled traffic, inherit the label in the 25609 * new header. 25610 */ 25611 if (DB_CRED(mp) != NULL) 25612 mblk_setcred(mp1, DB_CRED(mp)); 25613 /* 25614 * XXX disable ICK_VALID and compute checksum 25615 * here; can happen if nce_fp_mp changes and 25616 * it can't be copied now due to insufficient 25617 * space. (unlikely, fp mp can change, but it 25618 * does not increase in length) 25619 */ 25620 } 25621 UNLOCK_IRE_FP_MP(ire); 25622 } else { 25623 no_fp_mp: 25624 mp1 = copyb(ire->ire_nce->nce_res_mp); 25625 if (mp1 == NULL) { 25626 unlock_err: 25627 UNLOCK_IRE_FP_MP(ire); 25628 freemsg(mp); 25629 return (NULL); 25630 } 25631 UNLOCK_IRE_FP_MP(ire); 25632 mp1->b_cont = mp; 25633 /* 25634 * certain system generated traffic may not 25635 * have cred/label in ip header block. This 25636 * is true even for a labeled system. But for 25637 * labeled traffic, inherit the label in the 25638 * new header. 25639 */ 25640 if (DB_CRED(mp) != NULL) 25641 mblk_setcred(mp1, DB_CRED(mp)); 25642 if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { 25643 ip_process(proc, &mp1, ill_index); 25644 if (mp1 == NULL) 25645 return (NULL); 25646 25647 if (mp1->b_cont == NULL) 25648 ipha = NULL; 25649 else 25650 ipha = (ipha_t *)mp1->b_cont->b_rptr; 25651 } 25652 } 25653 25654 *iphap = ipha; 25655 return (mp1); 25656 #undef rptr 25657 } 25658 25659 /* 25660 * Finish the outbound IPsec processing for an IPv6 packet. This function 25661 * is called from ipsec_out_process() if the IPsec packet was processed 25662 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25663 * asynchronously. 
25664 */ 25665 void 25666 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 25667 ire_t *ire_arg) 25668 { 25669 in6_addr_t *v6dstp; 25670 ire_t *ire; 25671 mblk_t *mp; 25672 ip6_t *ip6h1; 25673 uint_t ill_index; 25674 ipsec_out_t *io; 25675 boolean_t attach_if, hwaccel; 25676 uint32_t flags = IP6_NO_IPPOLICY; 25677 int match_flags; 25678 zoneid_t zoneid; 25679 boolean_t ill_need_rele = B_FALSE; 25680 boolean_t ire_need_rele = B_FALSE; 25681 ip_stack_t *ipst; 25682 25683 mp = ipsec_mp->b_cont; 25684 ip6h1 = (ip6_t *)mp->b_rptr; 25685 io = (ipsec_out_t *)ipsec_mp->b_rptr; 25686 ASSERT(io->ipsec_out_ns != NULL); 25687 ipst = io->ipsec_out_ns->netstack_ip; 25688 ill_index = io->ipsec_out_ill_index; 25689 if (io->ipsec_out_reachable) { 25690 flags |= IPV6_REACHABILITY_CONFIRMATION; 25691 } 25692 attach_if = io->ipsec_out_attach_if; 25693 hwaccel = io->ipsec_out_accelerated; 25694 zoneid = io->ipsec_out_zoneid; 25695 ASSERT(zoneid != ALL_ZONES); 25696 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 25697 /* Multicast addresses should have non-zero ill_index. */ 25698 v6dstp = &ip6h->ip6_dst; 25699 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 25700 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 25701 ASSERT(!attach_if || ill_index != 0); 25702 if (ill_index != 0) { 25703 if (ill == NULL) { 25704 ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, 25705 B_TRUE, ipst); 25706 25707 /* Failure case frees things for us. */ 25708 if (ill == NULL) 25709 return; 25710 25711 ill_need_rele = B_TRUE; 25712 } 25713 /* 25714 * If this packet needs to go out on a particular interface 25715 * honor it. 25716 */ 25717 if (attach_if) { 25718 match_flags = MATCH_IRE_ILL; 25719 25720 /* 25721 * Check if we need an ire that will not be 25722 * looked up by anybody else i.e. HIDDEN. 
25723 */ 25724 if (ill_is_probeonly(ill)) { 25725 match_flags |= MATCH_IRE_MARK_HIDDEN; 25726 } 25727 } 25728 } 25729 ASSERT(mp != NULL); 25730 25731 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 25732 boolean_t unspec_src; 25733 ipif_t *ipif; 25734 25735 /* 25736 * Use the ill_index to get the right ill. 25737 */ 25738 unspec_src = io->ipsec_out_unspec_src; 25739 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 25740 if (ipif == NULL) { 25741 if (ill_need_rele) 25742 ill_refrele(ill); 25743 freemsg(ipsec_mp); 25744 return; 25745 } 25746 25747 if (ire_arg != NULL) { 25748 ire = ire_arg; 25749 } else { 25750 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25751 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25752 ire_need_rele = B_TRUE; 25753 } 25754 if (ire != NULL) { 25755 ipif_refrele(ipif); 25756 /* 25757 * XXX Do the multicast forwarding now, as the IPsec 25758 * processing has been done. 25759 */ 25760 goto send; 25761 } 25762 25763 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 25764 mp->b_prev = NULL; 25765 mp->b_next = NULL; 25766 25767 /* 25768 * If the IPsec packet was processed asynchronously, 25769 * drop it now. 
25770 */ 25771 if (q == NULL) { 25772 if (ill_need_rele) 25773 ill_refrele(ill); 25774 freemsg(ipsec_mp); 25775 return; 25776 } 25777 25778 ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, 25779 unspec_src, zoneid); 25780 ipif_refrele(ipif); 25781 } else { 25782 if (attach_if) { 25783 ipif_t *ipif; 25784 25785 ipif = ipif_get_next_ipif(NULL, ill); 25786 if (ipif == NULL) { 25787 if (ill_need_rele) 25788 ill_refrele(ill); 25789 freemsg(ipsec_mp); 25790 return; 25791 } 25792 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 25793 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 25794 ire_need_rele = B_TRUE; 25795 ipif_refrele(ipif); 25796 } else { 25797 if (ire_arg != NULL) { 25798 ire = ire_arg; 25799 } else { 25800 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, 25801 ipst); 25802 ire_need_rele = B_TRUE; 25803 } 25804 } 25805 if (ire != NULL) 25806 goto send; 25807 /* 25808 * ire disappeared underneath. 25809 * 25810 * What we need to do here is the ip_newroute 25811 * logic to get the ire without doing the IPsec 25812 * processing. Follow the same old path. But this 25813 * time, ip_wput or ire_add_then_send will call us 25814 * directly as all the IPsec operations are done. 25815 */ 25816 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 25817 mp->b_prev = NULL; 25818 mp->b_next = NULL; 25819 25820 /* 25821 * If the IPsec packet was processed asynchronously, 25822 * drop it now. 
25823 */ 25824 if (q == NULL) { 25825 if (ill_need_rele) 25826 ill_refrele(ill); 25827 freemsg(ipsec_mp); 25828 return; 25829 } 25830 25831 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 25832 zoneid, ipst); 25833 } 25834 if (ill != NULL && ill_need_rele) 25835 ill_refrele(ill); 25836 return; 25837 send: 25838 if (ill != NULL && ill_need_rele) 25839 ill_refrele(ill); 25840 25841 /* Local delivery */ 25842 if (ire->ire_stq == NULL) { 25843 ill_t *out_ill; 25844 ASSERT(q != NULL); 25845 25846 /* PFHooks: LOOPBACK_OUT */ 25847 out_ill = ire_to_ill(ire); 25848 25849 /* 25850 * DTrace this as ip:::send. A blocked packet will fire the 25851 * send probe, but not the receive probe. 25852 */ 25853 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 25854 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, 25855 ipha_t *, NULL, ip6_t *, ip6h, int, 1); 25856 25857 DTRACE_PROBE4(ip6__loopback__out__start, 25858 ill_t *, NULL, ill_t *, out_ill, 25859 ip6_t *, ip6h1, mblk_t *, ipsec_mp); 25860 25861 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 25862 ipst->ips_ipv6firewall_loopback_out, 25863 NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); 25864 25865 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); 25866 25867 if (ipsec_mp != NULL) 25868 ip_wput_local_v6(RD(q), out_ill, 25869 ip6h, ipsec_mp, ire, 0); 25870 if (ire_need_rele) 25871 ire_refrele(ire); 25872 return; 25873 } 25874 /* 25875 * Everything is done. Send it out on the wire. 25876 * We force the insertion of a fragment header using the 25877 * IPH_FRAG_HDR flag in two cases: 25878 * - after reception of an ICMPv6 "packet too big" message 25879 * with a MTU < 1280 (cf. RFC 2460 section 5) 25880 * - for multirouted IPv6 packets, so that the receiver can 25881 * discard duplicates according to their fragment identifier 25882 */ 25883 /* XXX fix flow control problems. 
*/ 25884 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 25885 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 25886 if (hwaccel) { 25887 /* 25888 * hardware acceleration does not handle these 25889 * "slow path" cases. 25890 */ 25891 /* IPsec KSTATS: should bump bean counter here. */ 25892 if (ire_need_rele) 25893 ire_refrele(ire); 25894 freemsg(ipsec_mp); 25895 return; 25896 } 25897 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 25898 (mp->b_cont ? msgdsize(mp) : 25899 mp->b_wptr - (uchar_t *)ip6h)) { 25900 /* IPsec KSTATS: should bump bean counter here. */ 25901 ip0dbg(("Packet length mismatch: %d, %ld\n", 25902 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 25903 msgdsize(mp))); 25904 if (ire_need_rele) 25905 ire_refrele(ire); 25906 freemsg(ipsec_mp); 25907 return; 25908 } 25909 ASSERT(mp->b_prev == NULL); 25910 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 25911 ntohs(ip6h->ip6_plen) + 25912 IPV6_HDR_LEN, ire->ire_max_frag)); 25913 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 25914 ire->ire_max_frag); 25915 } else { 25916 UPDATE_OB_PKT_COUNT(ire); 25917 ire->ire_last_used_time = lbolt; 25918 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); 25919 } 25920 if (ire_need_rele) 25921 ire_refrele(ire); 25922 freeb(ipsec_mp); 25923 } 25924 25925 void 25926 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 25927 { 25928 mblk_t *hada_mp; /* attributes M_CTL mblk */ 25929 da_ipsec_t *hada; /* data attributes */ 25930 ill_t *ill = (ill_t *)q->q_ptr; 25931 25932 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 25933 25934 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 25935 /* IPsec KSTATS: Bump lose counter here! */ 25936 freemsg(mp); 25937 return; 25938 } 25939 25940 /* 25941 * It's an IPsec packet that must be 25942 * accelerated by the Provider, and the 25943 * outbound ill is IPsec acceleration capable. 25944 * Prepends the mblk with an IPHADA_M_CTL, and ship it 25945 * to the ill. 
25946 * IPsec KSTATS: should bump packet counter here. 25947 */ 25948 25949 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 25950 if (hada_mp == NULL) { 25951 /* IPsec KSTATS: should bump packet counter here. */ 25952 freemsg(mp); 25953 return; 25954 } 25955 25956 hada_mp->b_datap->db_type = M_CTL; 25957 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 25958 hada_mp->b_cont = mp; 25959 25960 hada = (da_ipsec_t *)hada_mp->b_rptr; 25961 bzero(hada, sizeof (da_ipsec_t)); 25962 hada->da_type = IPHADA_M_CTL; 25963 25964 putnext(q, hada_mp); 25965 } 25966 25967 /* 25968 * Finish the outbound IPsec processing. This function is called from 25969 * ipsec_out_process() if the IPsec packet was processed 25970 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 25971 * asynchronously. 25972 */ 25973 void 25974 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 25975 ire_t *ire_arg) 25976 { 25977 uint32_t v_hlen_tos_len; 25978 ipaddr_t dst; 25979 ipif_t *ipif = NULL; 25980 ire_t *ire; 25981 ire_t *ire1 = NULL; 25982 mblk_t *next_mp = NULL; 25983 uint32_t max_frag; 25984 boolean_t multirt_send = B_FALSE; 25985 mblk_t *mp; 25986 ipha_t *ipha1; 25987 uint_t ill_index; 25988 ipsec_out_t *io; 25989 boolean_t attach_if; 25990 int match_flags; 25991 irb_t *irb = NULL; 25992 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 25993 zoneid_t zoneid; 25994 ipxmit_state_t pktxmit_state; 25995 ip_stack_t *ipst; 25996 25997 #ifdef _BIG_ENDIAN 25998 #define LENGTH (v_hlen_tos_len & 0xFFFF) 25999 #else 26000 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 26001 #endif 26002 26003 mp = ipsec_mp->b_cont; 26004 ipha1 = (ipha_t *)mp->b_rptr; 26005 ASSERT(mp != NULL); 26006 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 26007 dst = ipha->ipha_dst; 26008 26009 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26010 ill_index = io->ipsec_out_ill_index; 26011 attach_if = io->ipsec_out_attach_if; 26012 zoneid = io->ipsec_out_zoneid; 26013 
ASSERT(zoneid != ALL_ZONES); 26014 ipst = io->ipsec_out_ns->netstack_ip; 26015 ASSERT(io->ipsec_out_ns != NULL); 26016 26017 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 26018 if (ill_index != 0) { 26019 if (ill == NULL) { 26020 ill = ip_grab_attach_ill(NULL, ipsec_mp, 26021 ill_index, B_FALSE, ipst); 26022 26023 /* Failure case frees things for us. */ 26024 if (ill == NULL) 26025 return; 26026 26027 ill_need_rele = B_TRUE; 26028 } 26029 /* 26030 * If this packet needs to go out on a particular interface 26031 * honor it. 26032 */ 26033 if (attach_if) { 26034 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 26035 26036 /* 26037 * Check if we need an ire that will not be 26038 * looked up by anybody else i.e. HIDDEN. 26039 */ 26040 if (ill_is_probeonly(ill)) { 26041 match_flags |= MATCH_IRE_MARK_HIDDEN; 26042 } 26043 } 26044 } 26045 26046 if (CLASSD(dst)) { 26047 boolean_t conn_dontroute; 26048 /* 26049 * Use the ill_index to get the right ipif. 26050 */ 26051 conn_dontroute = io->ipsec_out_dontroute; 26052 if (ill_index == 0) 26053 ipif = ipif_lookup_group(dst, zoneid, ipst); 26054 else 26055 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 26056 if (ipif == NULL) { 26057 ip1dbg(("ip_wput_ipsec_out: No ipif for" 26058 " multicast\n")); 26059 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 26060 freemsg(ipsec_mp); 26061 goto done; 26062 } 26063 /* 26064 * ipha_src has already been intialized with the 26065 * value of the ipif in ip_wput. All we need now is 26066 * an ire to send this downstream. 26067 */ 26068 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 26069 MBLK_GETLABEL(mp), match_flags, ipst); 26070 if (ire != NULL) { 26071 ill_t *ill1; 26072 /* 26073 * Do the multicast forwarding now, as the IPsec 26074 * processing has been done. 
26075 */ 26076 if (ipst->ips_ip_g_mrouter && !conn_dontroute && 26077 (ill1 = ire_to_ill(ire))) { 26078 if (ip_mforward(ill1, ipha, mp)) { 26079 freemsg(ipsec_mp); 26080 ip1dbg(("ip_wput_ipsec_out: mforward " 26081 "failed\n")); 26082 ire_refrele(ire); 26083 goto done; 26084 } 26085 } 26086 goto send; 26087 } 26088 26089 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 26090 mp->b_prev = NULL; 26091 mp->b_next = NULL; 26092 26093 /* 26094 * If the IPsec packet was processed asynchronously, 26095 * drop it now. 26096 */ 26097 if (q == NULL) { 26098 freemsg(ipsec_mp); 26099 goto done; 26100 } 26101 26102 /* 26103 * We may be using a wrong ipif to create the ire. 26104 * But it is okay as the source address is assigned 26105 * for the packet already. Next outbound packet would 26106 * create the IRE with the right IPIF in ip_wput. 26107 * 26108 * Also handle RTF_MULTIRT routes. 26109 */ 26110 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, 26111 zoneid, &zero_info); 26112 } else { 26113 if (attach_if) { 26114 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 26115 zoneid, MBLK_GETLABEL(mp), match_flags, ipst); 26116 } else { 26117 if (ire_arg != NULL) { 26118 ire = ire_arg; 26119 ire_need_rele = B_FALSE; 26120 } else { 26121 ire = ire_cache_lookup(dst, zoneid, 26122 MBLK_GETLABEL(mp), ipst); 26123 } 26124 } 26125 if (ire != NULL) { 26126 goto send; 26127 } 26128 26129 /* 26130 * ire disappeared underneath. 26131 * 26132 * What we need to do here is the ip_newroute 26133 * logic to get the ire without doing the IPsec 26134 * processing. Follow the same old path. But this 26135 * time, ip_wput or ire_add_then_put will call us 26136 * directly as all the IPsec operations are done. 26137 */ 26138 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 26139 mp->b_prev = NULL; 26140 mp->b_next = NULL; 26141 26142 /* 26143 * If the IPsec packet was processed asynchronously, 26144 * drop it now. 
26145 */ 26146 if (q == NULL) { 26147 freemsg(ipsec_mp); 26148 goto done; 26149 } 26150 26151 /* 26152 * Since we're going through ip_newroute() again, we 26153 * need to make sure we don't: 26154 * 26155 * 1.) Trigger the ASSERT() with the ipha_ident 26156 * overloading. 26157 * 2.) Redo transport-layer checksumming, since we've 26158 * already done all that to get this far. 26159 * 26160 * The easiest way not do either of the above is to set 26161 * the ipha_ident field to IP_HDR_INCLUDED. 26162 */ 26163 ipha->ipha_ident = IP_HDR_INCLUDED; 26164 ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL), 26165 zoneid, ipst); 26166 } 26167 goto done; 26168 send: 26169 if (ire->ire_stq == NULL) { 26170 ill_t *out_ill; 26171 /* 26172 * Loopbacks go through ip_wput_local except for one case. 26173 * We come here if we generate a icmp_frag_needed message 26174 * after IPsec processing is over. When this function calls 26175 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 26176 * icmp_frag_needed. The message generated comes back here 26177 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 26178 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 26179 * source address as it is usually set in ip_wput_ire. As 26180 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 26181 * and we end up here. We can't enter ip_wput_ire once the 26182 * IPsec processing is over and hence we need to do it here. 26183 */ 26184 ASSERT(q != NULL); 26185 UPDATE_OB_PKT_COUNT(ire); 26186 ire->ire_last_used_time = lbolt; 26187 if (ipha->ipha_src == 0) 26188 ipha->ipha_src = ire->ire_src_addr; 26189 26190 /* PFHooks: LOOPBACK_OUT */ 26191 out_ill = ire_to_ill(ire); 26192 26193 /* 26194 * DTrace this as ip:::send. A blocked packet will fire the 26195 * send probe, but not the receive probe. 
26196 */ 26197 DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, 26198 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, 26199 ipha_t *, ipha, ip6_t *, NULL, int, 1); 26200 26201 DTRACE_PROBE4(ip4__loopback__out__start, 26202 ill_t *, NULL, ill_t *, out_ill, 26203 ipha_t *, ipha1, mblk_t *, ipsec_mp); 26204 26205 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 26206 ipst->ips_ipv4firewall_loopback_out, 26207 NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); 26208 26209 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); 26210 26211 if (ipsec_mp != NULL) 26212 ip_wput_local(RD(q), out_ill, 26213 ipha, ipsec_mp, ire, 0, zoneid); 26214 if (ire_need_rele) 26215 ire_refrele(ire); 26216 goto done; 26217 } 26218 26219 if (ire->ire_max_frag < (unsigned int)LENGTH) { 26220 /* 26221 * We are through with IPsec processing. 26222 * Fragment this and send it on the wire. 26223 */ 26224 if (io->ipsec_out_accelerated) { 26225 /* 26226 * The packet has been accelerated but must 26227 * be fragmented. This should not happen 26228 * since AH and ESP must not accelerate 26229 * packets that need fragmentation, however 26230 * the configuration could have changed 26231 * since the AH or ESP processing. 26232 * Drop packet. 26233 * IPsec KSTATS: bump bean counter here. 26234 */ 26235 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 26236 "fragmented accelerated packet!\n")); 26237 freemsg(ipsec_mp); 26238 } else { 26239 ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst); 26240 } 26241 if (ire_need_rele) 26242 ire_refrele(ire); 26243 goto done; 26244 } 26245 26246 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 26247 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 26248 (void *)ire->ire_ipif, (void *)ipif)); 26249 26250 /* 26251 * Multiroute the secured packet, unless IPsec really 26252 * requires the packet to go out only through a particular 26253 * interface. 
26254 */ 26255 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 26256 ire_t *first_ire; 26257 irb = ire->ire_bucket; 26258 ASSERT(irb != NULL); 26259 /* 26260 * This ire has been looked up as the one that 26261 * goes through the given ipif; 26262 * make sure we do not omit any other multiroute ire 26263 * that may be present in the bucket before this one. 26264 */ 26265 IRB_REFHOLD(irb); 26266 for (first_ire = irb->irb_ire; 26267 first_ire != NULL; 26268 first_ire = first_ire->ire_next) { 26269 if ((first_ire->ire_flags & RTF_MULTIRT) && 26270 (first_ire->ire_addr == ire->ire_addr) && 26271 !(first_ire->ire_marks & 26272 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) { 26273 break; 26274 } 26275 } 26276 26277 if ((first_ire != NULL) && (first_ire != ire)) { 26278 /* 26279 * Don't change the ire if the packet must 26280 * be fragmented if sent via this new one. 26281 */ 26282 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 26283 IRE_REFHOLD(first_ire); 26284 if (ire_need_rele) 26285 ire_refrele(ire); 26286 else 26287 ire_need_rele = B_TRUE; 26288 ire = first_ire; 26289 } 26290 } 26291 IRB_REFRELE(irb); 26292 26293 multirt_send = B_TRUE; 26294 max_frag = ire->ire_max_frag; 26295 } else { 26296 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 26297 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 26298 "flag, attach_if %d\n", attach_if)); 26299 } 26300 } 26301 26302 /* 26303 * In most cases, the emission loop below is entered only once. 26304 * Only in the case where the ire holds the RTF_MULTIRT 26305 * flag, we loop to process all RTF_MULTIRT ires in the 26306 * bucket, and send the packet through all crossed 26307 * RTF_MULTIRT routes. 26308 */ 26309 do { 26310 if (multirt_send) { 26311 /* 26312 * ire1 holds here the next ire to process in the 26313 * bucket. If multirouting is expected, 26314 * any non-RTF_MULTIRT ire that has the 26315 * right destination address is ignored. 
26316 */ 26317 ASSERT(irb != NULL); 26318 IRB_REFHOLD(irb); 26319 for (ire1 = ire->ire_next; 26320 ire1 != NULL; 26321 ire1 = ire1->ire_next) { 26322 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 26323 continue; 26324 if (ire1->ire_addr != ire->ire_addr) 26325 continue; 26326 if (ire1->ire_marks & 26327 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 26328 continue; 26329 /* No loopback here */ 26330 if (ire1->ire_stq == NULL) 26331 continue; 26332 /* 26333 * Ensure we do not exceed the MTU 26334 * of the next route. 26335 */ 26336 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 26337 ip_multirt_bad_mtu(ire1, max_frag); 26338 continue; 26339 } 26340 26341 IRE_REFHOLD(ire1); 26342 break; 26343 } 26344 IRB_REFRELE(irb); 26345 if (ire1 != NULL) { 26346 /* 26347 * We are in a multiple send case, need to 26348 * make a copy of the packet. 26349 */ 26350 next_mp = copymsg(ipsec_mp); 26351 if (next_mp == NULL) { 26352 ire_refrele(ire1); 26353 ire1 = NULL; 26354 } 26355 } 26356 } 26357 /* 26358 * Everything is done. Send it out on the wire 26359 * 26360 * ip_xmit_v4 will call ip_wput_attach_llhdr and then 26361 * either send it on the wire or, in the case of 26362 * HW acceleration, call ipsec_hw_putnext. 26363 */ 26364 if (ire->ire_nce && 26365 ire->ire_nce->nce_state != ND_REACHABLE) { 26366 DTRACE_PROBE2(ip__wput__ipsec__bail, 26367 (ire_t *), ire, (mblk_t *), ipsec_mp); 26368 /* 26369 * If ire's link-layer is unresolved (this 26370 * would only happen if the incomplete ire 26371 * was added to cachetable via forwarding path) 26372 * don't bother going to ip_xmit_v4. Just drop the 26373 * packet. 26374 * There is a slight risk here, in that, if we 26375 * have the forwarding path create an incomplete 26376 * IRE, then until the IRE is completed, any 26377 * transmitted IPsec packets will be dropped 26378 * instead of being queued waiting for resolution. 
26379 * 26380 * But the likelihood of a forwarding packet and a wput 26381 * packet sending to the same dst at the same time 26382 * and there not yet be an ARP entry for it is small. 26383 * Furthermore, if this actually happens, it might 26384 * be likely that wput would generate multiple 26385 * packets (and forwarding would also have a train 26386 * of packets) for that destination. If this is 26387 * the case, some of them would have been dropped 26388 * anyway, since ARP only queues a few packets while 26389 * waiting for resolution 26390 * 26391 * NOTE: We should really call ip_xmit_v4, 26392 * and let it queue the packet and send the 26393 * ARP query and have ARP come back thus: 26394 * <ARP> ip_wput->ip_output->ip-wput_nondata-> 26395 * ip_xmit_v4->ip_wput_attach_llhdr + ipsec 26396 * hw accel work. But it's too complex to get 26397 * the IPsec hw acceleration approach to fit 26398 * well with ip_xmit_v4 doing ARP without 26399 * doing IPsec simplification. For now, we just 26400 * poke ip_xmit_v4 to trigger the arp resolve, so 26401 * that we can continue with the send on the next 26402 * attempt. 
26403 * 26404 * XXX THis should be revisited, when 26405 * the IPsec/IP interaction is cleaned up 26406 */ 26407 ip1dbg(("ip_wput_ipsec_out: ire is incomplete" 26408 " - dropping packet\n")); 26409 freemsg(ipsec_mp); 26410 /* 26411 * Call ip_xmit_v4() to trigger ARP query 26412 * in case the nce_state is ND_INITIAL 26413 */ 26414 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 26415 goto drop_pkt; 26416 } 26417 26418 DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, 26419 ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1, 26420 mblk_t *, ipsec_mp); 26421 FW_HOOKS(ipst->ips_ip4_physical_out_event, 26422 ipst->ips_ipv4firewall_physical_out, NULL, 26423 ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst); 26424 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp); 26425 if (ipsec_mp == NULL) 26426 goto drop_pkt; 26427 26428 ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); 26429 pktxmit_state = ip_xmit_v4(mp, ire, 26430 (io->ipsec_out_accelerated ? io : NULL), B_FALSE); 26431 26432 if ((pktxmit_state == SEND_FAILED) || 26433 (pktxmit_state == LLHDR_RESLV_FAILED)) { 26434 26435 freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */ 26436 drop_pkt: 26437 BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib, 26438 ipIfStatsOutDiscards); 26439 if (ire_need_rele) 26440 ire_refrele(ire); 26441 if (ire1 != NULL) { 26442 ire_refrele(ire1); 26443 freemsg(next_mp); 26444 } 26445 goto done; 26446 } 26447 26448 freeb(ipsec_mp); 26449 if (ire_need_rele) 26450 ire_refrele(ire); 26451 26452 if (ire1 != NULL) { 26453 ire = ire1; 26454 ire_need_rele = B_TRUE; 26455 ASSERT(next_mp); 26456 ipsec_mp = next_mp; 26457 mp = ipsec_mp->b_cont; 26458 ire1 = NULL; 26459 next_mp = NULL; 26460 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26461 } else { 26462 multirt_send = B_FALSE; 26463 } 26464 } while (multirt_send); 26465 done: 26466 if (ill != NULL && ill_need_rele) 26467 ill_refrele(ill); 26468 if (ipif != NULL) 26469 ipif_refrele(ipif); 26470 } 26471 26472 /* 26473 * Get the ill 
corresponding to the specified ire, and compare its 26474 * capabilities with the protocol and algorithms specified by the 26475 * the SA obtained from ipsec_out. If they match, annotate the 26476 * ipsec_out structure to indicate that the packet needs acceleration. 26477 * 26478 * 26479 * A packet is eligible for outbound hardware acceleration if the 26480 * following conditions are satisfied: 26481 * 26482 * 1. the packet will not be fragmented 26483 * 2. the provider supports the algorithm 26484 * 3. there is no pending control message being exchanged 26485 * 4. snoop is not attached 26486 * 5. the destination address is not a broadcast or multicast address. 26487 * 26488 * Rationale: 26489 * - Hardware drivers do not support fragmentation with 26490 * the current interface. 26491 * - snoop, multicast, and broadcast may result in exposure of 26492 * a cleartext datagram. 26493 * We check all five of these conditions here. 26494 * 26495 * XXX would like to nuke "ire_t *" parameter here; problem is that 26496 * IRE is only way to figure out if a v4 address is a broadcast and 26497 * thus ineligible for acceleration... 26498 */ 26499 static void 26500 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 26501 { 26502 ipsec_out_t *io; 26503 mblk_t *data_mp; 26504 uint_t plen, overhead; 26505 ip_stack_t *ipst; 26506 26507 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 26508 return; 26509 26510 if (ill == NULL) 26511 return; 26512 ipst = ill->ill_ipst; 26513 /* 26514 * Destination address is a broadcast or multicast. Punt. 
26515 */ 26516 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 26517 IRE_LOCAL))) 26518 return; 26519 26520 data_mp = ipsec_mp->b_cont; 26521 26522 if (ill->ill_isv6) { 26523 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 26524 26525 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 26526 return; 26527 26528 plen = ip6h->ip6_plen; 26529 } else { 26530 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 26531 26532 if (CLASSD(ipha->ipha_dst)) 26533 return; 26534 26535 plen = ipha->ipha_length; 26536 } 26537 /* 26538 * Is there a pending DLPI control message being exchanged 26539 * between IP/IPsec and the DLS Provider? If there is, it 26540 * could be a SADB update, and the state of the DLS Provider 26541 * SADB might not be in sync with the SADB maintained by 26542 * IPsec. To avoid dropping packets or using the wrong keying 26543 * material, we do not accelerate this packet. 26544 */ 26545 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 26546 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26547 "ill_dlpi_pending! don't accelerate packet\n")); 26548 return; 26549 } 26550 26551 /* 26552 * Is the Provider in promiscous mode? If it does, we don't 26553 * accelerate the packet since it will bounce back up to the 26554 * listeners in the clear. 26555 */ 26556 if (ill->ill_promisc_on_phys) { 26557 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 26558 "ill in promiscous mode, don't accelerate packet\n")); 26559 return; 26560 } 26561 26562 /* 26563 * Will the packet require fragmentation? 26564 */ 26565 26566 /* 26567 * IPsec ESP note: this is a pessimistic estimate, but the same 26568 * as is used elsewhere. 26569 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 26570 * + 2-byte trailer 26571 */ 26572 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? 
IPSEC_MAX_AH_HDR_SIZE : 26573 IPSEC_BASE_ESP_HDR_SIZE(sa); 26574 26575 if ((plen + overhead) > ill->ill_max_mtu) 26576 return; 26577 26578 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26579 26580 /* 26581 * Can the ill accelerate this IPsec protocol and algorithm 26582 * specified by the SA? 26583 */ 26584 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 26585 ill->ill_isv6, sa, ipst->ips_netstack)) { 26586 return; 26587 } 26588 26589 /* 26590 * Tell AH or ESP that the outbound ill is capable of 26591 * accelerating this packet. 26592 */ 26593 io->ipsec_out_is_capab_ill = B_TRUE; 26594 } 26595 26596 /* 26597 * Select which AH & ESP SA's to use (if any) for the outbound packet. 26598 * 26599 * If this function returns B_TRUE, the requested SA's have been filled 26600 * into the ipsec_out_*_sa pointers. 26601 * 26602 * If the function returns B_FALSE, the packet has been "consumed", most 26603 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 26604 * 26605 * The SA references created by the protocol-specific "select" 26606 * function will be released when the ipsec_mp is freed, thanks to the 26607 * ipsec_out_free destructor -- see spd.c. 26608 */ 26609 static boolean_t 26610 ipsec_out_select_sa(mblk_t *ipsec_mp) 26611 { 26612 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 26613 ipsec_out_t *io; 26614 ipsec_policy_t *pp; 26615 ipsec_action_t *ap; 26616 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26617 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26618 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26619 26620 if (!io->ipsec_out_secure) { 26621 /* 26622 * We came here by mistake. 26623 * Don't bother with ipsec processing 26624 * We should "discourage" this path in the future. 
26625 */ 26626 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26627 return (B_FALSE); 26628 } 26629 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26630 ASSERT((io->ipsec_out_policy != NULL) || 26631 (io->ipsec_out_act != NULL)); 26632 26633 ASSERT(io->ipsec_out_failed == B_FALSE); 26634 26635 /* 26636 * IPsec processing has started. 26637 */ 26638 io->ipsec_out_proc_begin = B_TRUE; 26639 ap = io->ipsec_out_act; 26640 if (ap == NULL) { 26641 pp = io->ipsec_out_policy; 26642 ASSERT(pp != NULL); 26643 ap = pp->ipsp_act; 26644 ASSERT(ap != NULL); 26645 } 26646 26647 /* 26648 * We have an action. now, let's select SA's. 26649 * (In the future, we can cache this in the conn_t..) 26650 */ 26651 if (ap->ipa_want_esp) { 26652 if (io->ipsec_out_esp_sa == NULL) { 26653 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 26654 IPPROTO_ESP); 26655 } 26656 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 26657 } 26658 26659 if (ap->ipa_want_ah) { 26660 if (io->ipsec_out_ah_sa == NULL) { 26661 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 26662 IPPROTO_AH); 26663 } 26664 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 26665 /* 26666 * The ESP and AH processing order needs to be preserved 26667 * when both protocols are required (ESP should be applied 26668 * before AH for an outbound packet). Force an ESP ACQUIRE 26669 * when both ESP and AH are required, and an AH ACQUIRE 26670 * is needed. 26671 */ 26672 if (ap->ipa_want_esp && need_ah_acquire) 26673 need_esp_acquire = B_TRUE; 26674 } 26675 26676 /* 26677 * Send an ACQUIRE (extended, regular, or both) if we need one. 26678 * Release SAs that got referenced, but will not be used until we 26679 * acquire _all_ of the SAs we need. 
26680 */ 26681 if (need_ah_acquire || need_esp_acquire) { 26682 if (io->ipsec_out_ah_sa != NULL) { 26683 IPSA_REFRELE(io->ipsec_out_ah_sa); 26684 io->ipsec_out_ah_sa = NULL; 26685 } 26686 if (io->ipsec_out_esp_sa != NULL) { 26687 IPSA_REFRELE(io->ipsec_out_esp_sa); 26688 io->ipsec_out_esp_sa = NULL; 26689 } 26690 26691 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 26692 return (B_FALSE); 26693 } 26694 26695 return (B_TRUE); 26696 } 26697 26698 /* 26699 * Process an IPSEC_OUT message and see what you can 26700 * do with it. 26701 * IPQoS Notes: 26702 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 26703 * IPsec. 26704 * XXX would like to nuke ire_t. 26705 * XXX ill_index better be "real" 26706 */ 26707 void 26708 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 26709 { 26710 ipsec_out_t *io; 26711 ipsec_policy_t *pp; 26712 ipsec_action_t *ap; 26713 ipha_t *ipha; 26714 ip6_t *ip6h; 26715 mblk_t *mp; 26716 ill_t *ill; 26717 zoneid_t zoneid; 26718 ipsec_status_t ipsec_rc; 26719 boolean_t ill_need_rele = B_FALSE; 26720 ip_stack_t *ipst; 26721 ipsec_stack_t *ipss; 26722 26723 io = (ipsec_out_t *)ipsec_mp->b_rptr; 26724 ASSERT(io->ipsec_out_type == IPSEC_OUT); 26725 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 26726 ipst = io->ipsec_out_ns->netstack_ip; 26727 mp = ipsec_mp->b_cont; 26728 26729 /* 26730 * Initiate IPPF processing. We do it here to account for packets 26731 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 26732 * We can check for ipsec_out_proc_begin even for such packets, as 26733 * they will always be false (asserted below). 26734 */ 26735 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { 26736 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? 
26737 io->ipsec_out_ill_index : ill_index); 26738 if (mp == NULL) { 26739 ip2dbg(("ipsec_out_process: packet dropped "\ 26740 "during IPPF processing\n")); 26741 freeb(ipsec_mp); 26742 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26743 return; 26744 } 26745 } 26746 26747 if (!io->ipsec_out_secure) { 26748 /* 26749 * We came here by mistake. 26750 * Don't bother with ipsec processing 26751 * Should "discourage" this path in the future. 26752 */ 26753 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 26754 goto done; 26755 } 26756 ASSERT(io->ipsec_out_need_policy == B_FALSE); 26757 ASSERT((io->ipsec_out_policy != NULL) || 26758 (io->ipsec_out_act != NULL)); 26759 ASSERT(io->ipsec_out_failed == B_FALSE); 26760 26761 ipss = ipst->ips_netstack->netstack_ipsec; 26762 if (!ipsec_loaded(ipss)) { 26763 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 26764 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26765 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 26766 } else { 26767 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 26768 } 26769 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 26770 DROPPER(ipss, ipds_ip_ipsec_not_loaded), 26771 &ipss->ipsec_dropper); 26772 return; 26773 } 26774 26775 /* 26776 * IPsec processing has started. 26777 */ 26778 io->ipsec_out_proc_begin = B_TRUE; 26779 ap = io->ipsec_out_act; 26780 if (ap == NULL) { 26781 pp = io->ipsec_out_policy; 26782 ASSERT(pp != NULL); 26783 ap = pp->ipsp_act; 26784 ASSERT(ap != NULL); 26785 } 26786 26787 /* 26788 * Save the outbound ill index. When the packet comes back 26789 * from IPsec, we make sure the ill hasn't changed or disappeared 26790 * before sending it the accelerated packet. 26791 */ 26792 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 26793 int ifindex; 26794 ill = ire_to_ill(ire); 26795 ifindex = ill->ill_phyint->phyint_ifindex; 26796 io->ipsec_out_capab_ill_index = ifindex; 26797 } 26798 26799 /* 26800 * The order of processing is first insert a IP header if needed. 
26801 * Then insert the ESP header and then the AH header. 26802 */ 26803 if ((io->ipsec_out_se_done == B_FALSE) && 26804 (ap->ipa_want_se)) { 26805 /* 26806 * First get the outer IP header before sending 26807 * it to ESP. 26808 */ 26809 ipha_t *oipha, *iipha; 26810 mblk_t *outer_mp, *inner_mp; 26811 26812 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { 26813 (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, 26814 "ipsec_out_process: " 26815 "Self-Encapsulation failed: Out of memory\n"); 26816 freemsg(ipsec_mp); 26817 if (ill != NULL) { 26818 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26819 } else { 26820 BUMP_MIB(&ipst->ips_ip_mib, 26821 ipIfStatsOutDiscards); 26822 } 26823 return; 26824 } 26825 inner_mp = ipsec_mp->b_cont; 26826 ASSERT(inner_mp->b_datap->db_type == M_DATA); 26827 oipha = (ipha_t *)outer_mp->b_rptr; 26828 iipha = (ipha_t *)inner_mp->b_rptr; 26829 *oipha = *iipha; 26830 outer_mp->b_wptr += sizeof (ipha_t); 26831 oipha->ipha_length = htons(ntohs(iipha->ipha_length) + 26832 sizeof (ipha_t)); 26833 oipha->ipha_protocol = IPPROTO_ENCAP; 26834 oipha->ipha_version_and_hdr_length = 26835 IP_SIMPLE_HDR_VERSION; 26836 oipha->ipha_hdr_checksum = 0; 26837 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); 26838 outer_mp->b_cont = inner_mp; 26839 ipsec_mp->b_cont = outer_mp; 26840 26841 io->ipsec_out_se_done = B_TRUE; 26842 io->ipsec_out_tunnel = B_TRUE; 26843 } 26844 26845 if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || 26846 (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && 26847 !ipsec_out_select_sa(ipsec_mp)) 26848 return; 26849 26850 /* 26851 * By now, we know what SA's to use. Toss over to ESP & AH 26852 * to do the heavy lifting. 
26853 */ 26854 zoneid = io->ipsec_out_zoneid; 26855 ASSERT(zoneid != ALL_ZONES); 26856 if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { 26857 ASSERT(io->ipsec_out_esp_sa != NULL); 26858 io->ipsec_out_esp_done = B_TRUE; 26859 /* 26860 * Note that since hw accel can only apply one transform, 26861 * not two, we skip hw accel for ESP if we also have AH 26862 * This is an design limitation of the interface 26863 * which should be revisited. 26864 */ 26865 ASSERT(ire != NULL); 26866 if (io->ipsec_out_ah_sa == NULL) { 26867 ill = (ill_t *)ire->ire_stq->q_ptr; 26868 ipsec_out_is_accelerated(ipsec_mp, 26869 io->ipsec_out_esp_sa, ill, ire); 26870 } 26871 26872 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 26873 switch (ipsec_rc) { 26874 case IPSEC_STATUS_SUCCESS: 26875 break; 26876 case IPSEC_STATUS_FAILED: 26877 if (ill != NULL) { 26878 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26879 } else { 26880 BUMP_MIB(&ipst->ips_ip_mib, 26881 ipIfStatsOutDiscards); 26882 } 26883 /* FALLTHRU */ 26884 case IPSEC_STATUS_PENDING: 26885 return; 26886 } 26887 } 26888 26889 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 26890 ASSERT(io->ipsec_out_ah_sa != NULL); 26891 io->ipsec_out_ah_done = B_TRUE; 26892 if (ire == NULL) { 26893 int idx = io->ipsec_out_capab_ill_index; 26894 ill = ill_lookup_on_ifindex(idx, B_FALSE, 26895 NULL, NULL, NULL, NULL, ipst); 26896 ill_need_rele = B_TRUE; 26897 } else { 26898 ill = (ill_t *)ire->ire_stq->q_ptr; 26899 } 26900 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 26901 ire); 26902 26903 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 26904 switch (ipsec_rc) { 26905 case IPSEC_STATUS_SUCCESS: 26906 break; 26907 case IPSEC_STATUS_FAILED: 26908 if (ill != NULL) { 26909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 26910 } else { 26911 BUMP_MIB(&ipst->ips_ip_mib, 26912 ipIfStatsOutDiscards); 26913 } 26914 /* FALLTHRU */ 26915 case IPSEC_STATUS_PENDING: 26916 if (ill != NULL && 
ill_need_rele) 26917 ill_refrele(ill); 26918 return; 26919 } 26920 } 26921 /* 26922 * We are done with IPsec processing. Send it over 26923 * the wire. 26924 */ 26925 done: 26926 mp = ipsec_mp->b_cont; 26927 ipha = (ipha_t *)mp->b_rptr; 26928 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 26929 ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); 26930 } else { 26931 ip6h = (ip6_t *)ipha; 26932 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); 26933 } 26934 if (ill != NULL && ill_need_rele) 26935 ill_refrele(ill); 26936 } 26937 26938 /* ARGSUSED */ 26939 void 26940 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 26941 { 26942 opt_restart_t *or; 26943 int err; 26944 conn_t *connp; 26945 26946 ASSERT(CONN_Q(q)); 26947 connp = Q_TO_CONN(q); 26948 26949 ASSERT(first_mp->b_datap->db_type == M_CTL); 26950 or = (opt_restart_t *)first_mp->b_rptr; 26951 /* 26952 * We don't need to pass any credentials here since this is just 26953 * a restart. The credentials are passed in when svr4_optcom_req 26954 * is called the first time (from ip_wput_nondata). 26955 */ 26956 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 26957 err = svr4_optcom_req(q, first_mp, NULL, 26958 &ip_opt_obj, B_FALSE); 26959 } else { 26960 ASSERT(or->or_type == T_OPTMGMT_REQ); 26961 err = tpi_optcom_req(q, first_mp, NULL, 26962 &ip_opt_obj, B_FALSE); 26963 } 26964 if (err != EINPROGRESS) { 26965 /* operation is done */ 26966 CONN_OPER_PENDING_DONE(connp); 26967 } 26968 } 26969 26970 /* 26971 * ioctls that go through a down/up sequence may need to wait for the down 26972 * to complete. This involves waiting for the ire and ipif refcnts to go down 26973 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
26974 */ 26975 /* ARGSUSED */ 26976 void 26977 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 26978 { 26979 struct iocblk *iocp; 26980 mblk_t *mp1; 26981 ip_ioctl_cmd_t *ipip; 26982 int err; 26983 sin_t *sin; 26984 struct lifreq *lifr; 26985 struct ifreq *ifr; 26986 26987 iocp = (struct iocblk *)mp->b_rptr; 26988 ASSERT(ipsq != NULL); 26989 /* Existence of mp1 verified in ip_wput_nondata */ 26990 mp1 = mp->b_cont->b_cont; 26991 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 26992 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { 26993 /* 26994 * Special case where ipsq_current_ipif is not set: 26995 * ill_phyint_reinit merged the v4 and v6 into a single ipsq. 26996 * ill could also have become part of a ipmp group in the 26997 * process, we are here as were not able to complete the 26998 * operation in ipif_set_values because we could not become 26999 * exclusive on the new ipsq, In such a case ipsq_current_ipif 27000 * will not be set so we need to set it. 
27001 */ 27002 ill_t *ill = q->q_ptr; 27003 ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd); 27004 } 27005 ASSERT(ipsq->ipsq_current_ipif != NULL); 27006 27007 if (ipip->ipi_cmd_type == IF_CMD) { 27008 /* This a old style SIOC[GS]IF* command */ 27009 ifr = (struct ifreq *)mp1->b_rptr; 27010 sin = (sin_t *)&ifr->ifr_addr; 27011 } else if (ipip->ipi_cmd_type == LIF_CMD) { 27012 /* This a new style SIOC[GS]LIF* command */ 27013 lifr = (struct lifreq *)mp1->b_rptr; 27014 sin = (sin_t *)&lifr->lifr_addr; 27015 } else { 27016 sin = NULL; 27017 } 27018 27019 err = (*ipip->ipi_func_restart)(ipsq->ipsq_current_ipif, sin, q, mp, 27020 ipip, mp1->b_rptr); 27021 27022 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 27023 } 27024 27025 /* 27026 * ioctl processing 27027 * 27028 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up 27029 * the ioctl command in the ioctl tables, determines the copyin data size 27030 * from the ipi_copyin_size field, and does an mi_copyin() of that size. 27031 * 27032 * ioctl processing then continues when the M_IOCDATA makes its way down to 27033 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its 27034 * associated 'conn' is refheld till the end of the ioctl and the general 27035 * ioctl processing function ip_process_ioctl() is called to extract the 27036 * arguments and process the ioctl. To simplify extraction, ioctl commands 27037 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a 27038 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq()) 27039 * is used to extract the ioctl's arguments. 27040 * 27041 * ip_process_ioctl determines if the ioctl needs to be serialized, and if 27042 * so goes thru the serialization primitive ipsq_try_enter. Then the 27043 * appropriate function to handle the ioctl is called based on the entry in 27044 * the ioctl table. 
ioctl completion is encapsulated in ip_ioctl_finish 27045 * which also refreleases the 'conn' that was refheld at the start of the 27046 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq. 27047 * 27048 * Many exclusive ioctls go thru an internal down up sequence as part of 27049 * the operation. For example an attempt to change the IP address of an 27050 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface 27051 * does all the cleanup such as deleting all ires that use this address. 27052 * Then we need to wait till all references to the interface go away. 27053 */ 27054 void 27055 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 27056 { 27057 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 27058 ip_ioctl_cmd_t *ipip = arg; 27059 ip_extract_func_t *extract_funcp; 27060 cmd_info_t ci; 27061 int err; 27062 27063 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); 27064 27065 if (ipip == NULL) 27066 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27067 27068 /* 27069 * SIOCLIFADDIF needs to go thru a special path since the 27070 * ill may not exist yet. This happens in the case of lo0 27071 * which is created using this ioctl. 27072 */ 27073 if (ipip->ipi_cmd == SIOCLIFADDIF) { 27074 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); 27075 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27076 return; 27077 } 27078 27079 ci.ci_ipif = NULL; 27080 if (ipip->ipi_cmd_type == MISC_CMD) { 27081 /* 27082 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 
27083 */ 27084 if (ipip->ipi_cmd == IF_UNITSEL) { 27085 /* ioctl comes down the ill */ 27086 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif; 27087 ipif_refhold(ci.ci_ipif); 27088 } 27089 err = 0; 27090 ci.ci_sin = NULL; 27091 ci.ci_sin6 = NULL; 27092 ci.ci_lifr = NULL; 27093 } else { 27094 switch (ipip->ipi_cmd_type) { 27095 case IF_CMD: 27096 case LIF_CMD: 27097 extract_funcp = ip_extract_lifreq; 27098 break; 27099 27100 case ARP_CMD: 27101 case XARP_CMD: 27102 extract_funcp = ip_extract_arpreq; 27103 break; 27104 27105 case TUN_CMD: 27106 extract_funcp = ip_extract_tunreq; 27107 break; 27108 27109 case MSFILT_CMD: 27110 extract_funcp = ip_extract_msfilter; 27111 break; 27112 27113 default: 27114 ASSERT(0); 27115 } 27116 27117 err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); 27118 if (err != 0) { 27119 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27120 return; 27121 } 27122 27123 /* 27124 * All of the extraction functions return a refheld ipif. 27125 */ 27126 ASSERT(ci.ci_ipif != NULL); 27127 } 27128 27129 if (!(ipip->ipi_flags & IPI_WR)) { 27130 /* 27131 * A return value of EINPROGRESS means the ioctl is 27132 * either queued and waiting for some reason or has 27133 * already completed. 27134 */ 27135 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 27136 ci.ci_lifr); 27137 if (ci.ci_ipif != NULL) 27138 ipif_refrele(ci.ci_ipif); 27139 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); 27140 return; 27141 } 27142 27143 /* 27144 * If ipsq is non-null, we are already being called exclusively on an 27145 * ill but in the case of a failover in progress it is the "from" ill, 27146 * rather than the "to" ill (which is the ill ptr passed in). 27147 * In order to ensure we are exclusive on both ILLs we rerun 27148 * ipsq_try_enter() here, ipsq's support recursive entry. 
27149 */ 27150 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 27151 ASSERT(ci.ci_ipif != NULL); 27152 27153 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl, 27154 NEW_OP, B_TRUE); 27155 27156 /* 27157 * Release the ipif so that ipif_down and friends that wait for 27158 * references to go away are not misled about the current ipif_refcnt 27159 * values. We are writer so we can access the ipif even after releasing 27160 * the ipif. 27161 */ 27162 ipif_refrele(ci.ci_ipif); 27163 if (ipsq == NULL) 27164 return; 27165 27166 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); 27167 27168 /* 27169 * For most set ioctls that come here, this serves as a single point 27170 * where we set the IPIF_CHANGING flag. This ensures that there won't 27171 * be any new references to the ipif. This helps functions that go 27172 * through this path and end up trying to wait for the refcnts 27173 * associated with the ipif to go down to zero. Some exceptions are 27174 * Failover, Failback, and Groupname commands that operate on more than 27175 * just the ci.ci_ipif. These commands internally determine the 27176 * set of ipif's they operate on and set and clear the IPIF_CHANGING 27177 * flags on that set. Another exception is the Removeif command that 27178 * sets the IPIF_CONDEMNED flag internally after identifying the right 27179 * ipif to operate on. 27180 */ 27181 mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); 27182 if (ipip->ipi_cmd != SIOCLIFREMOVEIF && 27183 ipip->ipi_cmd != SIOCLIFFAILOVER && 27184 ipip->ipi_cmd != SIOCLIFFAILBACK && 27185 ipip->ipi_cmd != SIOCSLIFGROUPNAME) 27186 (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; 27187 mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); 27188 27189 /* 27190 * A return value of EINPROGRESS means the ioctl is 27191 * either queued and waiting for some reason or has 27192 * already completed. 
27193 */ 27194 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); 27195 27196 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); 27197 27198 ipsq_exit(ipsq); 27199 } 27200 27201 /* 27202 * Complete the ioctl. Typically ioctls use the mi package and need to 27203 * do mi_copyout/mi_copy_done. 27204 */ 27205 void 27206 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) 27207 { 27208 conn_t *connp = NULL; 27209 27210 if (err == EINPROGRESS) 27211 return; 27212 27213 if (CONN_Q(q)) { 27214 connp = Q_TO_CONN(q); 27215 ASSERT(connp->conn_ref >= 2); 27216 } 27217 27218 switch (mode) { 27219 case COPYOUT: 27220 if (err == 0) 27221 mi_copyout(q, mp); 27222 else 27223 mi_copy_done(q, mp, err); 27224 break; 27225 27226 case NO_COPYOUT: 27227 mi_copy_done(q, mp, err); 27228 break; 27229 27230 default: 27231 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */ 27232 break; 27233 } 27234 27235 /* 27236 * The refhold placed at the start of the ioctl is released here. 27237 */ 27238 if (connp != NULL) 27239 CONN_OPER_PENDING_DONE(connp); 27240 27241 if (ipsq != NULL) 27242 ipsq_current_finish(ipsq); 27243 } 27244 27245 /* 27246 * This is called from ip_wput_nondata to resume a deferred TCP bind. 
27247 */ 27248 /* ARGSUSED */ 27249 void 27250 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) 27251 { 27252 conn_t *connp = arg; 27253 tcp_t *tcp; 27254 27255 ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); 27256 tcp = connp->conn_tcp; 27257 27258 if (connp->conn_tcp->tcp_state == TCPS_CLOSED) 27259 freemsg(mp); 27260 else 27261 tcp_rput_other(tcp, mp); 27262 CONN_OPER_PENDING_DONE(connp); 27263 } 27264 27265 /* Called from ip_wput for all non data messages */ 27266 /* ARGSUSED */ 27267 void 27268 ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 27269 { 27270 mblk_t *mp1; 27271 ire_t *ire, *fake_ire; 27272 ill_t *ill; 27273 struct iocblk *iocp; 27274 ip_ioctl_cmd_t *ipip; 27275 cred_t *cr; 27276 conn_t *connp; 27277 int err; 27278 nce_t *nce; 27279 ipif_t *ipif; 27280 ip_stack_t *ipst; 27281 char *proto_str; 27282 27283 if (CONN_Q(q)) { 27284 connp = Q_TO_CONN(q); 27285 ipst = connp->conn_netstack->netstack_ip; 27286 } else { 27287 connp = NULL; 27288 ipst = ILLQ_TO_IPST(q); 27289 } 27290 27291 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q)); 27292 27293 switch (DB_TYPE(mp)) { 27294 case M_IOCTL: 27295 /* 27296 * IOCTL processing begins in ip_sioctl_copyin_setup which 27297 * will arrange to copy in associated control structures. 27298 */ 27299 ip_sioctl_copyin_setup(q, mp); 27300 return; 27301 case M_IOCDATA: 27302 /* 27303 * Ensure that this is associated with one of our trans- 27304 * parent ioctls. If it's not ours, discard it if we're 27305 * running as a driver, or pass it on if we're a module. 
27306 */ 27307 iocp = (struct iocblk *)mp->b_rptr; 27308 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 27309 if (ipip == NULL) { 27310 if (q->q_next == NULL) { 27311 goto nak; 27312 } else { 27313 putnext(q, mp); 27314 } 27315 return; 27316 } 27317 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 27318 /* 27319 * the ioctl is one we recognise, but is not 27320 * consumed by IP as a module, pass M_IOCDATA 27321 * for processing downstream, but only for 27322 * common Streams ioctls. 27323 */ 27324 if (ipip->ipi_flags & IPI_PASS_DOWN) { 27325 putnext(q, mp); 27326 return; 27327 } else { 27328 goto nak; 27329 } 27330 } 27331 27332 /* IOCTL continuation following copyin or copyout. */ 27333 if (mi_copy_state(q, mp, NULL) == -1) { 27334 /* 27335 * The copy operation failed. mi_copy_state already 27336 * cleaned up, so we're out of here. 27337 */ 27338 return; 27339 } 27340 /* 27341 * If we just completed a copy in, we become writer and 27342 * continue processing in ip_sioctl_copyin_done. If it 27343 * was a copy out, we call mi_copyout again. If there is 27344 * nothing more to copy out, it will complete the IOCTL. 27345 */ 27346 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) { 27347 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { 27348 mi_copy_done(q, mp, EPROTO); 27349 return; 27350 } 27351 /* 27352 * Check for cases that need more copying. A return 27353 * value of 0 means a second copyin has been started, 27354 * so we return; a return value of 1 means no more 27355 * copying is needed, so we continue. 27356 */ 27357 if (ipip->ipi_cmd_type == MSFILT_CMD && 27358 MI_COPY_COUNT(mp) == 1) { 27359 if (ip_copyin_msfilter(q, mp) == 0) 27360 return; 27361 } 27362 /* 27363 * Refhold the conn, till the ioctl completes. This is 27364 * needed in case the ioctl ends up in the pending mp 27365 * list. Every mp in the ill_pending_mp list and 27366 * the ipsq_pending_mp must have a refhold on the conn 27367 * to resume processing. 
The refhold is released when 27368 * the ioctl completes. (normally or abnormally) 27369 * In all cases ip_ioctl_finish is called to finish 27370 * the ioctl. 27371 */ 27372 if (connp != NULL) { 27373 /* This is not a reentry */ 27374 ASSERT(ipsq == NULL); 27375 CONN_INC_REF(connp); 27376 } else { 27377 if (!(ipip->ipi_flags & IPI_MODOK)) { 27378 mi_copy_done(q, mp, EINVAL); 27379 return; 27380 } 27381 } 27382 27383 ip_process_ioctl(ipsq, q, mp, ipip); 27384 27385 } else { 27386 mi_copyout(q, mp); 27387 } 27388 return; 27389 nak: 27390 iocp->ioc_error = EINVAL; 27391 mp->b_datap->db_type = M_IOCNAK; 27392 iocp->ioc_count = 0; 27393 qreply(q, mp); 27394 return; 27395 27396 case M_IOCNAK: 27397 /* 27398 * The only way we could get here is if a resolver didn't like 27399 * an IOCTL we sent it. This shouldn't happen. 27400 */ 27401 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 27402 "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", 27403 ((struct iocblk *)mp->b_rptr)->ioc_cmd); 27404 freemsg(mp); 27405 return; 27406 case M_IOCACK: 27407 /* /dev/ip shouldn't see this */ 27408 if (CONN_Q(q)) 27409 goto nak; 27410 27411 /* Finish socket ioctls passed through to ARP. */ 27412 ip_sioctl_iocack(q, mp); 27413 return; 27414 case M_FLUSH: 27415 if (*mp->b_rptr & FLUSHW) 27416 flushq(q, FLUSHALL); 27417 if (q->q_next) { 27418 putnext(q, mp); 27419 return; 27420 } 27421 if (*mp->b_rptr & FLUSHR) { 27422 *mp->b_rptr &= ~FLUSHW; 27423 qreply(q, mp); 27424 return; 27425 } 27426 freemsg(mp); 27427 return; 27428 case IRE_DB_REQ_TYPE: 27429 if (connp == NULL) { 27430 proto_str = "IRE_DB_REQ_TYPE"; 27431 goto protonak; 27432 } 27433 /* An Upper Level Protocol wants a copy of an IRE. 
*/ 27434 ip_ire_req(q, mp); 27435 return; 27436 case M_CTL: 27437 if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) 27438 break; 27439 27440 if (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == 27441 TUN_HELLO) { 27442 ASSERT(connp != NULL); 27443 connp->conn_flags |= IPCL_IPTUN; 27444 freeb(mp); 27445 return; 27446 } 27447 27448 /* M_CTL messages are used by ARP to tell us things. */ 27449 if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) 27450 break; 27451 switch (((arc_t *)mp->b_rptr)->arc_cmd) { 27452 case AR_ENTRY_SQUERY: 27453 ip_wput_ctl(q, mp); 27454 return; 27455 case AR_CLIENT_NOTIFY: 27456 ip_arp_news(q, mp); 27457 return; 27458 case AR_DLPIOP_DONE: 27459 ASSERT(q->q_next != NULL); 27460 ill = (ill_t *)q->q_ptr; 27461 /* qwriter_ip releases the refhold */ 27462 /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ 27463 ill_refhold(ill); 27464 qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); 27465 return; 27466 case AR_ARP_CLOSING: 27467 /* 27468 * ARP (above us) is closing. If no ARP bringup is 27469 * currently pending, ack the message so that ARP 27470 * can complete its close. Also mark ill_arp_closing 27471 * so that new ARP bringups will fail. If any 27472 * ARP bringup is currently in progress, we will 27473 * ack this when the current ARP bringup completes. 27474 */ 27475 ASSERT(q->q_next != NULL); 27476 ill = (ill_t *)q->q_ptr; 27477 mutex_enter(&ill->ill_lock); 27478 ill->ill_arp_closing = 1; 27479 if (!ill->ill_arp_bringup_pending) { 27480 mutex_exit(&ill->ill_lock); 27481 qreply(q, mp); 27482 } else { 27483 mutex_exit(&ill->ill_lock); 27484 freemsg(mp); 27485 } 27486 return; 27487 case AR_ARP_EXTEND: 27488 /* 27489 * The ARP module above us is capable of duplicate 27490 * address detection. Old ATM drivers will not send 27491 * this message. 
27492 */ 27493 ASSERT(q->q_next != NULL); 27494 ill = (ill_t *)q->q_ptr; 27495 ill->ill_arp_extend = B_TRUE; 27496 freemsg(mp); 27497 return; 27498 default: 27499 break; 27500 } 27501 break; 27502 case M_PROTO: 27503 case M_PCPROTO: 27504 /* 27505 * The only PROTO messages we expect are ULP binds and 27506 * copies of option negotiation acknowledgements. 27507 */ 27508 switch (((union T_primitives *)mp->b_rptr)->type) { 27509 case O_T_BIND_REQ: 27510 case T_BIND_REQ: { 27511 /* Request can get queued in bind */ 27512 if (connp == NULL) { 27513 proto_str = "O_T_BIND_REQ/T_BIND_REQ"; 27514 goto protonak; 27515 } 27516 /* 27517 * The transports except SCTP call ip_bind_{v4,v6}() 27518 * directly instead of a a putnext. SCTP doesn't 27519 * generate any T_BIND_REQ since it has its own 27520 * fanout data structures. However, ESP and AH 27521 * come in for regular binds; all other cases are 27522 * bind retries. 27523 */ 27524 ASSERT(!IPCL_IS_SCTP(connp)); 27525 27526 /* Don't increment refcnt if this is a re-entry */ 27527 if (ipsq == NULL) 27528 CONN_INC_REF(connp); 27529 27530 mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, 27531 connp, NULL) : ip_bind_v4(q, mp, connp); 27532 if (mp == NULL) 27533 return; 27534 if (IPCL_IS_TCP(connp)) { 27535 /* 27536 * In the case of TCP endpoint we 27537 * come here only for bind retries 27538 */ 27539 ASSERT(ipsq != NULL); 27540 CONN_INC_REF(connp); 27541 squeue_fill(connp->conn_sqp, mp, 27542 ip_resume_tcp_bind, connp, 27543 SQTAG_BIND_RETRY); 27544 } else if (IPCL_IS_UDP(connp)) { 27545 /* 27546 * In the case of UDP endpoint we 27547 * come here only for bind retries 27548 */ 27549 ASSERT(ipsq != NULL); 27550 udp_resume_bind(connp, mp); 27551 } else if (IPCL_IS_RAWIP(connp)) { 27552 /* 27553 * In the case of RAWIP endpoint we 27554 * come here only for bind retries 27555 */ 27556 ASSERT(ipsq != NULL); 27557 rawip_resume_bind(connp, mp); 27558 } else { 27559 /* The case of AH and ESP */ 27560 qreply(q, mp); 27561 CONN_OPER_PENDING_DONE(connp); 27562 } 27563 return; 27564 } 27565 case T_SVR4_OPTMGMT_REQ: 27566 ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", 27567 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); 27568 27569 if (connp == NULL) { 27570 proto_str = "T_SVR4_OPTMGMT_REQ"; 27571 goto protonak; 27572 } 27573 27574 if (!snmpcom_req(q, mp, ip_snmp_set, 27575 ip_snmp_get, cr)) { 27576 /* 27577 * Call svr4_optcom_req so that it can 27578 * generate the ack. We don't come here 27579 * if this operation is being restarted. 27580 * ip_restart_optmgmt will drop the conn ref. 27581 * In the case of ipsec option after the ipsec 27582 * load is complete conn_restart_ipsec_waiter 27583 * drops the conn ref. 
27584 */ 27585 ASSERT(ipsq == NULL); 27586 CONN_INC_REF(connp); 27587 if (ip_check_for_ipsec_opt(q, mp)) 27588 return; 27589 err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, 27590 B_FALSE); 27591 if (err != EINPROGRESS) { 27592 /* Operation is done */ 27593 CONN_OPER_PENDING_DONE(connp); 27594 } 27595 } 27596 return; 27597 case T_OPTMGMT_REQ: 27598 ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); 27599 /* 27600 * Note: No snmpcom_req support through new 27601 * T_OPTMGMT_REQ. 27602 * Call tpi_optcom_req so that it can 27603 * generate the ack. 27604 */ 27605 if (connp == NULL) { 27606 proto_str = "T_OPTMGMT_REQ"; 27607 goto protonak; 27608 } 27609 27610 ASSERT(ipsq == NULL); 27611 /* 27612 * We don't come here for restart. ip_restart_optmgmt 27613 * will drop the conn ref. In the case of ipsec option 27614 * after the ipsec load is complete 27615 * conn_restart_ipsec_waiter drops the conn ref. 27616 */ 27617 CONN_INC_REF(connp); 27618 if (ip_check_for_ipsec_opt(q, mp)) 27619 return; 27620 err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); 27621 if (err != EINPROGRESS) { 27622 /* Operation is done */ 27623 CONN_OPER_PENDING_DONE(connp); 27624 } 27625 return; 27626 case T_UNBIND_REQ: 27627 if (connp == NULL) { 27628 proto_str = "T_UNBIND_REQ"; 27629 goto protonak; 27630 } 27631 mp = ip_unbind(q, mp); 27632 qreply(q, mp); 27633 return; 27634 default: 27635 /* 27636 * Have to drop any DLPI messages coming down from 27637 * arp (such as an info_req which would cause ip 27638 * to receive an extra info_ack if it was passed 27639 * through. 27640 */ 27641 ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", 27642 (int)*(uint_t *)mp->b_rptr)); 27643 freemsg(mp); 27644 return; 27645 } 27646 /* NOTREACHED */ 27647 case IRE_DB_TYPE: { 27648 nce_t *nce; 27649 ill_t *ill; 27650 in6_addr_t gw_addr_v6; 27651 27652 27653 /* 27654 * This is a response back from a resolver. 
It 27655 * consists of a message chain containing: 27656 * IRE_MBLK-->LL_HDR_MBLK->pkt 27657 * The IRE_MBLK is the one we allocated in ip_newroute. 27658 * The LL_HDR_MBLK is the DLPI header to use to get 27659 * the attached packet, and subsequent ones for the 27660 * same destination, transmitted. 27661 */ 27662 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ 27663 break; 27664 /* 27665 * First, check to make sure the resolution succeeded. 27666 * If it failed, the second mblk will be empty. 27667 * If it is, free the chain, dropping the packet. 27668 * (We must ire_delete the ire; that frees the ire mblk) 27669 * We're doing this now to support PVCs for ATM; it's 27670 * a partial xresolv implementation. When we fully implement 27671 * xresolv interfaces, instead of freeing everything here 27672 * we'll initiate neighbor discovery. 27673 * 27674 * For v4 (ARP and other external resolvers) the resolver 27675 * frees the message, so no check is needed. This check 27676 * is required, though, for a full xresolve implementation. 27677 * Including this code here now both shows how external 27678 * resolvers can NACK a resolution request using an 27679 * existing design that has no specific provisions for NACKs, 27680 * and also takes into account that the current non-ARP 27681 * external resolver has been coded to use this method of 27682 * NACKing for all IPv6 (xresolv) cases, 27683 * whether our xresolv implementation is complete or not. 27684 * 27685 */ 27686 ire = (ire_t *)mp->b_rptr; 27687 ill = ire_to_ill(ire); 27688 mp1 = mp->b_cont; /* dl_unitdata_req */ 27689 if (mp1->b_rptr == mp1->b_wptr) { 27690 if (ire->ire_ipversion == IPV6_VERSION) { 27691 /* 27692 * XRESOLV interface. 
27693 */ 27694 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27695 mutex_enter(&ire->ire_lock); 27696 gw_addr_v6 = ire->ire_gateway_addr_v6; 27697 mutex_exit(&ire->ire_lock); 27698 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27699 nce = ndp_lookup_v6(ill, 27700 &ire->ire_addr_v6, B_FALSE); 27701 } else { 27702 nce = ndp_lookup_v6(ill, &gw_addr_v6, 27703 B_FALSE); 27704 } 27705 if (nce != NULL) { 27706 nce_resolv_failed(nce); 27707 ndp_delete(nce); 27708 NCE_REFRELE(nce); 27709 } 27710 } 27711 mp->b_cont = NULL; 27712 freemsg(mp1); /* frees the pkt as well */ 27713 ASSERT(ire->ire_nce == NULL); 27714 ire_delete((ire_t *)mp->b_rptr); 27715 return; 27716 } 27717 27718 /* 27719 * Split them into IRE_MBLK and pkt and feed it into 27720 * ire_add_then_send. Then in ire_add_then_send 27721 * the IRE will be added, and then the packet will be 27722 * run back through ip_wput. This time it will make 27723 * it to the wire. 27724 */ 27725 mp->b_cont = NULL; 27726 mp = mp1->b_cont; /* now, mp points to pkt */ 27727 mp1->b_cont = NULL; 27728 ip1dbg(("ip_wput_nondata: reply from external resolver \n")); 27729 if (ire->ire_ipversion == IPV6_VERSION) { 27730 /* 27731 * XRESOLV interface. Find the nce and put a copy 27732 * of the dl_unitdata_req in nce_res_mp 27733 */ 27734 ASSERT(ill->ill_flags & ILLF_XRESOLV); 27735 mutex_enter(&ire->ire_lock); 27736 gw_addr_v6 = ire->ire_gateway_addr_v6; 27737 mutex_exit(&ire->ire_lock); 27738 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 27739 nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, 27740 B_FALSE); 27741 } else { 27742 nce = ndp_lookup_v6(ill, &gw_addr_v6, B_FALSE); 27743 } 27744 if (nce != NULL) { 27745 /* 27746 * We have to protect nce_res_mp here 27747 * from being accessed by other threads 27748 * while we change the mblk pointer. 27749 * Other functions will also lock the nce when 27750 * accessing nce_res_mp. 
27751 * 27752 * The reason we change the mblk pointer 27753 * here rather than copying the resolved address 27754 * into the template is that, unlike with 27755 * ethernet, we have no guarantee that the 27756 * resolved address length will be 27757 * smaller than or equal to the lla length 27758 * with which the template was allocated, 27759 * (for ethernet, they're equal) 27760 * so we have to use the actual resolved 27761 * address mblk - which holds the real 27762 * dl_unitdata_req with the resolved address. 27763 * 27764 * Doing this is the same behavior as was 27765 * previously used in the v4 ARP case. 27766 */ 27767 mutex_enter(&nce->nce_lock); 27768 if (nce->nce_res_mp != NULL) 27769 freemsg(nce->nce_res_mp); 27770 nce->nce_res_mp = mp1; 27771 mutex_exit(&nce->nce_lock); 27772 /* 27773 * We do a fastpath probe here because 27774 * we have resolved the address without 27775 * using Neighbor Discovery. 27776 * In the non-XRESOLV v6 case, the fastpath 27777 * probe is done right after neighbor 27778 * discovery completes. 27779 */ 27780 if (nce->nce_res_mp != NULL) { 27781 int res; 27782 nce_fastpath_list_add(nce); 27783 res = ill_fastpath_probe(ill, 27784 nce->nce_res_mp); 27785 if (res != 0 && res != EAGAIN) 27786 nce_fastpath_list_delete(nce); 27787 } 27788 27789 ire_add_then_send(q, ire, mp); 27790 /* 27791 * Now we have to clean out any packets 27792 * that may have been queued on the nce 27793 * while it was waiting for address resolution 27794 * to complete. 
27795 */ 27796 mutex_enter(&nce->nce_lock); 27797 mp1 = nce->nce_qd_mp; 27798 nce->nce_qd_mp = NULL; 27799 mutex_exit(&nce->nce_lock); 27800 while (mp1 != NULL) { 27801 mblk_t *nxt_mp; 27802 queue_t *fwdq = NULL; 27803 ill_t *inbound_ill; 27804 uint_t ifindex; 27805 27806 nxt_mp = mp1->b_next; 27807 mp1->b_next = NULL; 27808 /* 27809 * Retrieve ifindex stored in 27810 * ip_rput_data_v6() 27811 */ 27812 ifindex = 27813 (uint_t)(uintptr_t)mp1->b_prev; 27814 inbound_ill = 27815 ill_lookup_on_ifindex(ifindex, 27816 B_TRUE, NULL, NULL, NULL, 27817 NULL, ipst); 27818 mp1->b_prev = NULL; 27819 if (inbound_ill != NULL) 27820 fwdq = inbound_ill->ill_rq; 27821 27822 if (fwdq != NULL) { 27823 put(fwdq, mp1); 27824 ill_refrele(inbound_ill); 27825 } else 27826 put(WR(ill->ill_rq), mp1); 27827 mp1 = nxt_mp; 27828 } 27829 NCE_REFRELE(nce); 27830 } else { /* nce is NULL; clean up */ 27831 ire_delete(ire); 27832 freemsg(mp); 27833 freemsg(mp1); 27834 return; 27835 } 27836 } else { 27837 nce_t *arpce; 27838 /* 27839 * Link layer resolution succeeded. Recompute the 27840 * ire_nce. 27841 */ 27842 ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); 27843 if ((arpce = ndp_lookup_v4(ill, 27844 (ire->ire_gateway_addr != INADDR_ANY ? 27845 &ire->ire_gateway_addr : &ire->ire_addr), 27846 B_FALSE)) == NULL) { 27847 freeb(ire->ire_mp); 27848 freeb(mp1); 27849 freemsg(mp); 27850 return; 27851 } 27852 mutex_enter(&arpce->nce_lock); 27853 arpce->nce_last = TICK_TO_MSEC(lbolt64); 27854 if (arpce->nce_state == ND_REACHABLE) { 27855 /* 27856 * Someone resolved this before us; 27857 * cleanup the res_mp. Since ire has 27858 * not been added yet, the call to ire_add_v4 27859 * from ire_add_then_send (when a dup is 27860 * detected) will clean up the ire. 
27861 */ 27862 freeb(mp1); 27863 } else { 27864 ASSERT(arpce->nce_res_mp == NULL); 27865 arpce->nce_res_mp = mp1; 27866 arpce->nce_state = ND_REACHABLE; 27867 } 27868 mutex_exit(&arpce->nce_lock); 27869 if (ire->ire_marks & IRE_MARK_NOADD) { 27870 /* 27871 * this ire will not be added to the ire 27872 * cache table, so we can set the ire_nce 27873 * here, as there are no atomicity constraints. 27874 */ 27875 ire->ire_nce = arpce; 27876 /* 27877 * We are associating this nce with the ire 27878 * so change the nce ref taken in 27879 * ndp_lookup_v4() from 27880 * NCE_REFHOLD to NCE_REFHOLD_NOTR 27881 */ 27882 NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 27883 } else { 27884 NCE_REFRELE(arpce); 27885 } 27886 ire_add_then_send(q, ire, mp); 27887 } 27888 return; /* All is well, the packet has been sent. */ 27889 } 27890 case IRE_ARPRESOLVE_TYPE: { 27891 27892 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ 27893 break; 27894 mp1 = mp->b_cont; /* dl_unitdata_req */ 27895 mp->b_cont = NULL; 27896 /* 27897 * First, check to make sure the resolution succeeded. 27898 * If it failed, the second mblk will be empty. 27899 */ 27900 if (mp1->b_rptr == mp1->b_wptr) { 27901 /* cleanup the incomplete ire, free queued packets */ 27902 freemsg(mp); /* fake ire */ 27903 freeb(mp1); /* dl_unitdata response */ 27904 return; 27905 } 27906 27907 /* 27908 * Update any incomplete nce_t found. We search the ctable 27909 * and find the nce from the ire->ire_nce because we need 27910 * to pass the ire to ip_xmit_v4 later, and can find both 27911 * ire and nce in one lookup. 27912 */ 27913 fake_ire = (ire_t *)mp->b_rptr; 27914 27915 /* 27916 * By the time we come back here from ARP the incomplete ire 27917 * created in ire_forward() could have been removed. We use 27918 * the parameters stored in the fake_ire to specify the real 27919 * ire as explicitly as possible. 
This avoids problems when 27920 * IPMP groups are configured as an ipif can 'float' 27921 * across several ill queues. We can be confident that the 27922 * the inability to find an ire is because it no longer exists. 27923 */ 27924 ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, 27925 NULL, NULL, NULL, NULL, ipst); 27926 if (ill == NULL) { 27927 ip1dbg(("ill for incomplete ire vanished\n")); 27928 freemsg(mp); /* fake ire */ 27929 freeb(mp1); /* dl_unitdata response */ 27930 return; 27931 } 27932 27933 /* Get the outgoing ipif */ 27934 mutex_enter(&ill->ill_lock); 27935 ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); 27936 if (ipif == NULL) { 27937 mutex_exit(&ill->ill_lock); 27938 ill_refrele(ill); 27939 ip1dbg(("logical intrf to incomplete ire vanished\n")); 27940 freemsg(mp); /* fake_ire */ 27941 freeb(mp1); /* dl_unitdata response */ 27942 return; 27943 } 27944 27945 ipif_refhold_locked(ipif); 27946 mutex_exit(&ill->ill_lock); 27947 ill_refrele(ill); 27948 ire = ire_arpresolve_lookup(fake_ire->ire_addr, 27949 fake_ire->ire_gateway_addr, ipif, fake_ire->ire_zoneid, 27950 ipst, ((ill_t *)q->q_ptr)->ill_wq); 27951 ipif_refrele(ipif); 27952 if (ire == NULL) { 27953 /* 27954 * no ire was found; check if there is an nce 27955 * for this lookup; if it has no ire's pointing at it 27956 * cleanup. 27957 */ 27958 if ((nce = ndp_lookup_v4(q->q_ptr, 27959 (fake_ire->ire_gateway_addr != INADDR_ANY ? 27960 &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), 27961 B_FALSE)) != NULL) { 27962 /* 27963 * cleanup: 27964 * We check for refcnt 2 (one for the nce 27965 * hash list + 1 for the ref taken by 27966 * ndp_lookup_v4) to check that there are 27967 * no ire's pointing at the nce. 
27968 */ 27969 if (nce->nce_refcnt == 2) 27970 ndp_delete(nce); 27971 NCE_REFRELE(nce); 27972 } 27973 freeb(mp1); /* dl_unitdata response */ 27974 freemsg(mp); /* fake ire */ 27975 return; 27976 } 27977 nce = ire->ire_nce; 27978 DTRACE_PROBE2(ire__arpresolve__type, 27979 ire_t *, ire, nce_t *, nce); 27980 ASSERT(nce->nce_state != ND_INITIAL); 27981 mutex_enter(&nce->nce_lock); 27982 nce->nce_last = TICK_TO_MSEC(lbolt64); 27983 if (nce->nce_state == ND_REACHABLE) { 27984 /* 27985 * Someone resolved this before us; 27986 * our response is not needed any more. 27987 */ 27988 mutex_exit(&nce->nce_lock); 27989 freeb(mp1); /* dl_unitdata response */ 27990 } else { 27991 ASSERT(nce->nce_res_mp == NULL); 27992 nce->nce_res_mp = mp1; 27993 nce->nce_state = ND_REACHABLE; 27994 mutex_exit(&nce->nce_lock); 27995 nce_fastpath(nce); 27996 } 27997 /* 27998 * The cached nce_t has been updated to be reachable; 27999 * Clear the IRE_MARK_UNCACHED flag and free the fake_ire. 28000 */ 28001 fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; 28002 freemsg(mp); 28003 /* 28004 * send out queued packets. 28005 */ 28006 (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); 28007 28008 IRE_REFRELE(ire); 28009 return; 28010 } 28011 default: 28012 break; 28013 } 28014 if (q->q_next) { 28015 putnext(q, mp); 28016 } else 28017 freemsg(mp); 28018 return; 28019 28020 protonak: 28021 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); 28022 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) 28023 qreply(q, mp); 28024 } 28025 28026 /* 28027 * Process IP options in an outbound packet. Modify the destination if there 28028 * is a source route option. 28029 * Returns non-zero if something fails in which case an ICMP error has been 28030 * sent and mp freed. 
28031 */ 28032 static int 28033 ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, 28034 boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) 28035 { 28036 ipoptp_t opts; 28037 uchar_t *opt; 28038 uint8_t optval; 28039 uint8_t optlen; 28040 ipaddr_t dst; 28041 intptr_t code = 0; 28042 mblk_t *mp; 28043 ire_t *ire = NULL; 28044 28045 ip2dbg(("ip_wput_options\n")); 28046 mp = ipsec_mp; 28047 if (mctl_present) { 28048 mp = ipsec_mp->b_cont; 28049 } 28050 28051 dst = ipha->ipha_dst; 28052 for (optval = ipoptp_first(&opts, ipha); 28053 optval != IPOPT_EOL; 28054 optval = ipoptp_next(&opts)) { 28055 opt = opts.ipoptp_cur; 28056 optlen = opts.ipoptp_len; 28057 ip2dbg(("ip_wput_options: opt %d, len %d\n", 28058 optval, optlen)); 28059 switch (optval) { 28060 uint32_t off; 28061 case IPOPT_SSRR: 28062 case IPOPT_LSRR: 28063 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28064 ip1dbg(( 28065 "ip_wput_options: bad option offset\n")); 28066 code = (char *)&opt[IPOPT_OLEN] - 28067 (char *)ipha; 28068 goto param_prob; 28069 } 28070 off = opt[IPOPT_OFFSET]; 28071 ip1dbg(("ip_wput_options: next hop 0x%x\n", 28072 ntohl(dst))); 28073 /* 28074 * For strict: verify that dst is directly 28075 * reachable. 
28076 */ 28077 if (optval == IPOPT_SSRR) { 28078 ire = ire_ftable_lookup(dst, 0, 0, 28079 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 28080 MBLK_GETLABEL(mp), 28081 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 28082 if (ire == NULL) { 28083 ip1dbg(("ip_wput_options: SSRR not" 28084 " directly reachable: 0x%x\n", 28085 ntohl(dst))); 28086 goto bad_src_route; 28087 } 28088 ire_refrele(ire); 28089 } 28090 break; 28091 case IPOPT_RR: 28092 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28093 ip1dbg(( 28094 "ip_wput_options: bad option offset\n")); 28095 code = (char *)&opt[IPOPT_OLEN] - 28096 (char *)ipha; 28097 goto param_prob; 28098 } 28099 break; 28100 case IPOPT_TS: 28101 /* 28102 * Verify that length >=5 and that there is either 28103 * room for another timestamp or that the overflow 28104 * counter is not maxed out. 28105 */ 28106 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 28107 if (optlen < IPOPT_MINLEN_IT) { 28108 goto param_prob; 28109 } 28110 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 28111 ip1dbg(( 28112 "ip_wput_options: bad option offset\n")); 28113 code = (char *)&opt[IPOPT_OFFSET] - 28114 (char *)ipha; 28115 goto param_prob; 28116 } 28117 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 28118 case IPOPT_TS_TSONLY: 28119 off = IPOPT_TS_TIMELEN; 28120 break; 28121 case IPOPT_TS_TSANDADDR: 28122 case IPOPT_TS_PRESPEC: 28123 case IPOPT_TS_PRESPEC_RFC791: 28124 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 28125 break; 28126 default: 28127 code = (char *)&opt[IPOPT_POS_OV_FLG] - 28128 (char *)ipha; 28129 goto param_prob; 28130 } 28131 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 28132 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 28133 /* 28134 * No room and the overflow counter is 15 28135 * already. 
28136 */ 28137 goto param_prob; 28138 } 28139 break; 28140 } 28141 } 28142 28143 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) 28144 return (0); 28145 28146 ip1dbg(("ip_wput_options: error processing IP options.")); 28147 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 28148 28149 param_prob: 28150 /* 28151 * Since ip_wput() isn't close to finished, we fill 28152 * in enough of the header for credible error reporting. 28153 */ 28154 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28155 /* Failed */ 28156 freemsg(ipsec_mp); 28157 return (-1); 28158 } 28159 icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); 28160 return (-1); 28161 28162 bad_src_route: 28163 /* 28164 * Since ip_wput() isn't close to finished, we fill 28165 * in enough of the header for credible error reporting. 28166 */ 28167 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { 28168 /* Failed */ 28169 freemsg(ipsec_mp); 28170 return (-1); 28171 } 28172 icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); 28173 return (-1); 28174 } 28175 28176 /* 28177 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT. 28178 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads 28179 * thru /etc/system. 28180 */ 28181 #define CONN_MAXDRAINCNT 64 28182 28183 static void 28184 conn_drain_init(ip_stack_t *ipst) 28185 { 28186 int i; 28187 28188 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; 28189 28190 if ((ipst->ips_conn_drain_list_cnt == 0) || 28191 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) { 28192 /* 28193 * Default value of the number of drainers is the 28194 * number of cpus, subject to maximum of 8 drainers. 
28195 */ 28196 if (boot_max_ncpus != -1) 28197 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8); 28198 else 28199 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); 28200 } 28201 28202 ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt * 28203 sizeof (idl_t), KM_SLEEP); 28204 28205 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28206 mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL, 28207 MUTEX_DEFAULT, NULL); 28208 } 28209 } 28210 28211 static void 28212 conn_drain_fini(ip_stack_t *ipst) 28213 { 28214 int i; 28215 28216 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) 28217 mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock); 28218 kmem_free(ipst->ips_conn_drain_list, 28219 ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); 28220 ipst->ips_conn_drain_list = NULL; 28221 } 28222 28223 /* 28224 * Note: For an overview of how flowcontrol is handled in IP please see the 28225 * IP Flowcontrol notes at the top of this file. 28226 * 28227 * Flow control has blocked us from proceeding. Insert the given conn in one 28228 * of the conn drain lists. These conn wq's will be qenabled later on when 28229 * STREAMS flow control does a backenable. conn_walk_drain will enable 28230 * the first conn in each of these drain lists. Each of these qenabled conns 28231 * in turn enables the next in the list, after it runs, or when it closes, 28232 * thus sustaining the drain process. 28233 * 28234 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> 28235 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert 28236 * running at any time, on a given conn, since there can be only 1 service proc 28237 * running on a queue at any time. 
28238 */ 28239 void 28240 conn_drain_insert(conn_t *connp) 28241 { 28242 idl_t *idl; 28243 uint_t index; 28244 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28245 28246 mutex_enter(&connp->conn_lock); 28247 if (connp->conn_state_flags & CONN_CLOSING) { 28248 /* 28249 * The conn is closing as a result of which CONN_CLOSING 28250 * is set. Return. 28251 */ 28252 mutex_exit(&connp->conn_lock); 28253 return; 28254 } else if (connp->conn_idl == NULL) { 28255 /* 28256 * Assign the next drain list round robin. We dont' use 28257 * a lock, and thus it may not be strictly round robin. 28258 * Atomicity of load/stores is enough to make sure that 28259 * conn_drain_list_index is always within bounds. 28260 */ 28261 index = ipst->ips_conn_drain_list_index; 28262 ASSERT(index < ipst->ips_conn_drain_list_cnt); 28263 connp->conn_idl = &ipst->ips_conn_drain_list[index]; 28264 index++; 28265 if (index == ipst->ips_conn_drain_list_cnt) 28266 index = 0; 28267 ipst->ips_conn_drain_list_index = index; 28268 } 28269 mutex_exit(&connp->conn_lock); 28270 28271 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28272 if ((connp->conn_drain_prev != NULL) || 28273 (connp->conn_state_flags & CONN_CLOSING)) { 28274 /* 28275 * The conn is already in the drain list, OR 28276 * the conn is closing. We need to check again for 28277 * the closing case again since close can happen 28278 * after we drop the conn_lock, and before we 28279 * acquire the CONN_DRAIN_LIST_LOCK. 28280 */ 28281 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28282 return; 28283 } else { 28284 idl = connp->conn_idl; 28285 } 28286 28287 /* 28288 * The conn is not in the drain list. Insert it at the 28289 * tail of the drain list. The drain list is circular 28290 * and doubly linked. idl_conn points to the 1st element 28291 * in the list. 
28292 */ 28293 if (idl->idl_conn == NULL) { 28294 idl->idl_conn = connp; 28295 connp->conn_drain_next = connp; 28296 connp->conn_drain_prev = connp; 28297 } else { 28298 conn_t *head = idl->idl_conn; 28299 28300 connp->conn_drain_next = head; 28301 connp->conn_drain_prev = head->conn_drain_prev; 28302 head->conn_drain_prev->conn_drain_next = connp; 28303 head->conn_drain_prev = connp; 28304 } 28305 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28306 } 28307 28308 /* 28309 * This conn is closing, and we are called from ip_close. OR 28310 * This conn has been serviced by ip_wsrv, and we need to do the tail 28311 * processing. 28312 * If this conn is part of the drain list, we may need to sustain the drain 28313 * process by qenabling the next conn in the drain list. We may also need to 28314 * remove this conn from the list, if it is done. 28315 */ 28316 static void 28317 conn_drain_tail(conn_t *connp, boolean_t closing) 28318 { 28319 idl_t *idl; 28320 28321 /* 28322 * connp->conn_idl is stable at this point, and no lock is needed 28323 * to check it. If we are called from ip_close, close has already 28324 * set CONN_CLOSING, thus freezing the value of conn_idl, and 28325 * called us only because conn_idl is non-null. If we are called thru 28326 * service, conn_idl could be null, but it cannot change because 28327 * service is single-threaded per queue, and there cannot be another 28328 * instance of service trying to call conn_drain_insert on this conn 28329 * now. 28330 */ 28331 ASSERT(!closing || (connp->conn_idl != NULL)); 28332 28333 /* 28334 * If connp->conn_idl is null, the conn has not been inserted into any 28335 * drain list even once since creation of the conn. Just return. 28336 */ 28337 if (connp->conn_idl == NULL) 28338 return; 28339 28340 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 28341 28342 if (connp->conn_drain_prev == NULL) { 28343 /* This conn is currently not in the drain list. 
*/ 28344 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28345 return; 28346 } 28347 idl = connp->conn_idl; 28348 if (idl->idl_conn_draining == connp) { 28349 /* 28350 * This conn is the current drainer. If this is the last conn 28351 * in the drain list, we need to do more checks, in the 'if' 28352 * below. Otherwwise we need to just qenable the next conn, 28353 * to sustain the draining, and is handled in the 'else' 28354 * below. 28355 */ 28356 if (connp->conn_drain_next == idl->idl_conn) { 28357 /* 28358 * This conn is the last in this list. This round 28359 * of draining is complete. If idl_repeat is set, 28360 * it means another flow enabling has happened from 28361 * the driver/streams and we need to another round 28362 * of draining. 28363 * If there are more than 2 conns in the drain list, 28364 * do a left rotate by 1, so that all conns except the 28365 * conn at the head move towards the head by 1, and the 28366 * the conn at the head goes to the tail. This attempts 28367 * a more even share for all queues that are being 28368 * drained. 28369 */ 28370 if ((connp->conn_drain_next != connp) && 28371 (idl->idl_conn->conn_drain_next != connp)) { 28372 idl->idl_conn = idl->idl_conn->conn_drain_next; 28373 } 28374 if (idl->idl_repeat) { 28375 qenable(idl->idl_conn->conn_wq); 28376 idl->idl_conn_draining = idl->idl_conn; 28377 idl->idl_repeat = 0; 28378 } else { 28379 idl->idl_conn_draining = NULL; 28380 } 28381 } else { 28382 /* 28383 * If the next queue that we are now qenable'ing, 28384 * is closing, it will remove itself from this list 28385 * and qenable the subsequent queue in ip_close(). 28386 * Serialization is acheived thru idl_lock. 28387 */ 28388 qenable(connp->conn_drain_next->conn_wq); 28389 idl->idl_conn_draining = connp->conn_drain_next; 28390 } 28391 } 28392 if (!connp->conn_did_putbq || closing) { 28393 /* 28394 * Remove ourself from the drain list, if we did not do 28395 * a putbq, or if the conn is closing. 
28396 * Note: It is possible that q->q_first is non-null. It means 28397 * that these messages landed after we did a enableok() in 28398 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to 28399 * service them. 28400 */ 28401 if (connp->conn_drain_next == connp) { 28402 /* Singleton in the list */ 28403 ASSERT(connp->conn_drain_prev == connp); 28404 idl->idl_conn = NULL; 28405 idl->idl_conn_draining = NULL; 28406 } else { 28407 connp->conn_drain_prev->conn_drain_next = 28408 connp->conn_drain_next; 28409 connp->conn_drain_next->conn_drain_prev = 28410 connp->conn_drain_prev; 28411 if (idl->idl_conn == connp) 28412 idl->idl_conn = connp->conn_drain_next; 28413 ASSERT(idl->idl_conn_draining != connp); 28414 28415 } 28416 connp->conn_drain_next = NULL; 28417 connp->conn_drain_prev = NULL; 28418 } 28419 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 28420 } 28421 28422 /* 28423 * Write service routine. Shared perimeter entry point. 28424 * ip_wsrv can be called in any of the following ways. 28425 * 1. The device queue's messages has fallen below the low water mark 28426 * and STREAMS has backenabled the ill_wq. We walk thru all the 28427 * the drain lists and backenable the first conn in each list. 28428 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the 28429 * qenabled non-tcp upper layers. We start dequeing messages and call 28430 * ip_wput for each message. 28431 */ 28432 28433 void 28434 ip_wsrv(queue_t *q) 28435 { 28436 conn_t *connp; 28437 ill_t *ill; 28438 mblk_t *mp; 28439 28440 if (q->q_next) { 28441 ill = (ill_t *)q->q_ptr; 28442 if (ill->ill_state_flags == 0) { 28443 /* 28444 * The device flow control has opened up. 28445 * Walk through conn drain lists and qenable the 28446 * first conn in each list. This makes sense only 28447 * if the stream is fully plumbed and setup. 28448 * Hence the if check above. 
28449 */ 28450 ip1dbg(("ip_wsrv: walking\n")); 28451 conn_walk_drain(ill->ill_ipst); 28452 } 28453 return; 28454 } 28455 28456 connp = Q_TO_CONN(q); 28457 ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); 28458 28459 /* 28460 * 1. Set conn_draining flag to signal that service is active. 28461 * 28462 * 2. ip_output determines whether it has been called from service, 28463 * based on the last parameter. If it is IP_WSRV it concludes it 28464 * has been called from service. 28465 * 28466 * 3. Message ordering is preserved by the following logic. 28467 * i. A directly called ip_output (i.e. not thru service) will queue 28468 * the message at the tail, if conn_draining is set (i.e. service 28469 * is running) or if q->q_first is non-null. 28470 * 28471 * ii. If ip_output is called from service, and if ip_output cannot 28472 * putnext due to flow control, it does a putbq. 28473 * 28474 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable 28475 * (causing an infinite loop). 28476 */ 28477 ASSERT(!connp->conn_did_putbq); 28478 while ((q->q_first != NULL) && !connp->conn_did_putbq) { 28479 connp->conn_draining = 1; 28480 noenable(q); 28481 while ((mp = getq(q)) != NULL) { 28482 ASSERT(CONN_Q(q)); 28483 28484 ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); 28485 if (connp->conn_did_putbq) { 28486 /* ip_wput did a putbq */ 28487 break; 28488 } 28489 } 28490 /* 28491 * At this point, a thread coming down from top, calling 28492 * ip_wput, may end up queueing the message. We have not yet 28493 * enabled the queue, so ip_wsrv won't be called again. 28494 * To avoid this race, check q->q_first again (in the loop) 28495 * If the other thread queued the message before we call 28496 * enableok(), we will catch it in the q->q_first check. 28497 * If the other thread queues the message after we call 28498 * enableok(), ip_wsrv will be called again by STREAMS. 
28499 */ 28500 connp->conn_draining = 0; 28501 enableok(q); 28502 } 28503 28504 /* Enable the next conn for draining */ 28505 conn_drain_tail(connp, B_FALSE); 28506 28507 connp->conn_did_putbq = 0; 28508 } 28509 28510 /* 28511 * Walk the list of all conn's calling the function provided with the 28512 * specified argument for each. Note that this only walks conn's that 28513 * have been bound. 28514 * Applies to both IPv4 and IPv6. 28515 */ 28516 static void 28517 conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 28518 { 28519 conn_walk_fanout_table(ipst->ips_ipcl_udp_fanout, 28520 ipst->ips_ipcl_udp_fanout_size, 28521 func, arg, zoneid); 28522 conn_walk_fanout_table(ipst->ips_ipcl_conn_fanout, 28523 ipst->ips_ipcl_conn_fanout_size, 28524 func, arg, zoneid); 28525 conn_walk_fanout_table(ipst->ips_ipcl_bind_fanout, 28526 ipst->ips_ipcl_bind_fanout_size, 28527 func, arg, zoneid); 28528 conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout, 28529 IPPROTO_MAX, func, arg, zoneid); 28530 conn_walk_fanout_table(ipst->ips_ipcl_proto_fanout_v6, 28531 IPPROTO_MAX, func, arg, zoneid); 28532 } 28533 28534 /* 28535 * Flowcontrol has relieved, and STREAMS has backenabled us. For each list 28536 * of conns that need to be drained, check if drain is already in progress. 28537 * If so set the idl_repeat bit, indicating that the last conn in the list 28538 * needs to reinitiate the drain once again, for the list. If drain is not 28539 * in progress for the list, initiate the draining, by qenabling the 1st 28540 * conn in the list. The drain is self-sustaining, each qenabled conn will 28541 * in turn qenable the next conn, when it is done/blocked/closing. 
28542 */ 28543 static void 28544 conn_walk_drain(ip_stack_t *ipst) 28545 { 28546 int i; 28547 idl_t *idl; 28548 28549 IP_STAT(ipst, ip_conn_walk_drain); 28550 28551 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { 28552 idl = &ipst->ips_conn_drain_list[i]; 28553 mutex_enter(&idl->idl_lock); 28554 if (idl->idl_conn == NULL) { 28555 mutex_exit(&idl->idl_lock); 28556 continue; 28557 } 28558 /* 28559 * If this list is not being drained currently by 28560 * an ip_wsrv thread, start the process. 28561 */ 28562 if (idl->idl_conn_draining == NULL) { 28563 ASSERT(idl->idl_repeat == 0); 28564 qenable(idl->idl_conn->conn_wq); 28565 idl->idl_conn_draining = idl->idl_conn; 28566 } else { 28567 idl->idl_repeat = 1; 28568 } 28569 mutex_exit(&idl->idl_lock); 28570 } 28571 } 28572 28573 /* 28574 * Walk an conn hash table of `count' buckets, calling func for each entry. 28575 */ 28576 static void 28577 conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg, 28578 zoneid_t zoneid) 28579 { 28580 conn_t *connp; 28581 28582 while (count-- > 0) { 28583 mutex_enter(&connfp->connf_lock); 28584 for (connp = connfp->connf_head; connp != NULL; 28585 connp = connp->conn_next) { 28586 if (zoneid == GLOBAL_ZONEID || 28587 zoneid == connp->conn_zoneid) { 28588 CONN_INC_REF(connp); 28589 mutex_exit(&connfp->connf_lock); 28590 (*func)(connp, arg); 28591 mutex_enter(&connfp->connf_lock); 28592 CONN_DEC_REF(connp); 28593 } 28594 } 28595 mutex_exit(&connfp->connf_lock); 28596 connfp++; 28597 } 28598 } 28599 28600 /* conn_walk_fanout routine invoked for ip_conn_report for each conn. 
*/ 28601 static void 28602 conn_report1(conn_t *connp, void *mp) 28603 { 28604 char buf1[INET6_ADDRSTRLEN]; 28605 char buf2[INET6_ADDRSTRLEN]; 28606 uint_t print_len, buf_len; 28607 28608 ASSERT(connp != NULL); 28609 28610 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 28611 if (buf_len <= 0) 28612 return; 28613 (void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)); 28614 (void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)); 28615 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 28616 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 28617 "%5d %s/%05d %s/%05d\n", 28618 (void *)connp, (void *)CONNP_TO_RQ(connp), 28619 (void *)CONNP_TO_WQ(connp), connp->conn_zoneid, 28620 buf1, connp->conn_lport, 28621 buf2, connp->conn_fport); 28622 if (print_len < buf_len) { 28623 ((mblk_t *)mp)->b_wptr += print_len; 28624 } else { 28625 ((mblk_t *)mp)->b_wptr += buf_len; 28626 } 28627 } 28628 28629 /* 28630 * Named Dispatch routine to produce a formatted report on all conns 28631 * that are listed in one of the fanout tables. 28632 * This report is accessed by using the ndd utility to "get" ND variable 28633 * "ip_conn_status". 28634 */ 28635 /* ARGSUSED */ 28636 static int 28637 ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 28638 { 28639 conn_t *connp = Q_TO_CONN(q); 28640 28641 (void) mi_mpprintf(mp, 28642 "CONN " MI_COL_HDRPAD_STR 28643 "rfq " MI_COL_HDRPAD_STR 28644 "stq " MI_COL_HDRPAD_STR 28645 " zone local remote"); 28646 28647 /* 28648 * Because of the ndd constraint, at most we can have 64K buffer 28649 * to put in all conn info. So to be more efficient, just 28650 * allocate a 64K buffer here, assuming we need that large buffer. 28651 * This should be OK as only privileged processes can do ndd /dev/ip. 28652 */ 28653 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 28654 /* The following may work even if we cannot get a large buf. 
*/ 28655 (void) mi_mpprintf(mp, "<< Out of buffer >>\n"); 28656 return (0); 28657 } 28658 28659 conn_walk_fanout(conn_report1, mp->b_cont, connp->conn_zoneid, 28660 connp->conn_netstack->netstack_ip); 28661 return (0); 28662 } 28663 28664 /* 28665 * Determine if the ill and multicast aspects of that packets 28666 * "matches" the conn. 28667 */ 28668 boolean_t 28669 conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, 28670 zoneid_t zoneid) 28671 { 28672 ill_t *in_ill; 28673 boolean_t found; 28674 ipif_t *ipif; 28675 ire_t *ire; 28676 ipaddr_t dst, src; 28677 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 28678 28679 dst = ipha->ipha_dst; 28680 src = ipha->ipha_src; 28681 28682 /* 28683 * conn_incoming_ill is set by IP_BOUND_IF which limits 28684 * unicast, broadcast and multicast reception to 28685 * conn_incoming_ill. conn_wantpacket itself is called 28686 * only for BROADCAST and multicast. 28687 * 28688 * 1) ip_rput supresses duplicate broadcasts if the ill 28689 * is part of a group. Hence, we should be receiving 28690 * just one copy of broadcast for the whole group. 28691 * Thus, if it is part of the group the packet could 28692 * come on any ill of the group and hence we need a 28693 * match on the group. Otherwise, match on ill should 28694 * be sufficient. 28695 * 28696 * 2) ip_rput does not suppress duplicate multicast packets. 28697 * If there are two interfaces in a ill group and we have 28698 * 2 applications (conns) joined a multicast group G on 28699 * both the interfaces, ilm_lookup_ill filter in ip_rput 28700 * will give us two packets because we join G on both the 28701 * interfaces rather than nominating just one interface 28702 * for receiving multicast like broadcast above. So, 28703 * we have to call ilg_lookup_ill to filter out duplicate 28704 * copies, if ill is part of a group. 
28705 */ 28706 in_ill = connp->conn_incoming_ill; 28707 if (in_ill != NULL) { 28708 if (in_ill->ill_group == NULL) { 28709 if (in_ill != ill) 28710 return (B_FALSE); 28711 } else if (in_ill->ill_group != ill->ill_group) { 28712 return (B_FALSE); 28713 } 28714 } 28715 28716 if (!CLASSD(dst)) { 28717 if (IPCL_ZONE_MATCH(connp, zoneid)) 28718 return (B_TRUE); 28719 /* 28720 * The conn is in a different zone; we need to check that this 28721 * broadcast address is configured in the application's zone and 28722 * on one ill in the group. 28723 */ 28724 ipif = ipif_get_next_ipif(NULL, ill); 28725 if (ipif == NULL) 28726 return (B_FALSE); 28727 ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, 28728 connp->conn_zoneid, NULL, 28729 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP), ipst); 28730 ipif_refrele(ipif); 28731 if (ire != NULL) { 28732 ire_refrele(ire); 28733 return (B_TRUE); 28734 } else { 28735 return (B_FALSE); 28736 } 28737 } 28738 28739 if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && 28740 connp->conn_zoneid == zoneid) { 28741 /* 28742 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP 28743 * disabled, therefore we don't dispatch the multicast packet to 28744 * the sending zone. 28745 */ 28746 return (B_FALSE); 28747 } 28748 28749 if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { 28750 /* 28751 * Multicast packet on the loopback interface: we only match 28752 * conns who joined the group in the specified zone. 28753 */ 28754 return (B_FALSE); 28755 } 28756 28757 if (connp->conn_multi_router) { 28758 /* multicast packet and multicast router socket: send up */ 28759 return (B_TRUE); 28760 } 28761 28762 mutex_enter(&connp->conn_lock); 28763 found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); 28764 mutex_exit(&connp->conn_lock); 28765 return (found); 28766 } 28767 28768 /* 28769 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 
28770 */ 28771 /* ARGSUSED */ 28772 static void 28773 ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) 28774 { 28775 ill_t *ill = (ill_t *)q->q_ptr; 28776 mblk_t *mp1, *mp2; 28777 ipif_t *ipif; 28778 int err = 0; 28779 conn_t *connp = NULL; 28780 ipsq_t *ipsq; 28781 arc_t *arc; 28782 28783 ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); 28784 28785 ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); 28786 ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); 28787 28788 ASSERT(IAM_WRITER_ILL(ill)); 28789 mp2 = mp->b_cont; 28790 mp->b_cont = NULL; 28791 28792 /* 28793 * We have now received the arp bringup completion message 28794 * from ARP. Mark the arp bringup as done. Also if the arp 28795 * stream has already started closing, send up the AR_ARP_CLOSING 28796 * ack now since ARP is waiting in close for this ack. 28797 */ 28798 mutex_enter(&ill->ill_lock); 28799 ill->ill_arp_bringup_pending = 0; 28800 if (ill->ill_arp_closing) { 28801 mutex_exit(&ill->ill_lock); 28802 /* Let's reuse the mp for sending the ack */ 28803 arc = (arc_t *)mp->b_rptr; 28804 mp->b_wptr = mp->b_rptr + sizeof (arc_t); 28805 arc->arc_cmd = AR_ARP_CLOSING; 28806 qreply(q, mp); 28807 } else { 28808 mutex_exit(&ill->ill_lock); 28809 freeb(mp); 28810 } 28811 28812 ipsq = ill->ill_phyint->phyint_ipsq; 28813 ipif = ipsq->ipsq_pending_ipif; 28814 mp1 = ipsq_pending_mp_get(ipsq, &connp); 28815 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 28816 if (mp1 == NULL) { 28817 /* bringup was aborted by the user */ 28818 freemsg(mp2); 28819 return; 28820 } 28821 28822 /* 28823 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we 28824 * must have an associated conn_t. Otherwise, we're bringing this 28825 * interface back up as part of handling an asynchronous event (e.g., 28826 * physical address change). 
28827 */ 28828 if (ipsq->ipsq_current_ioctl != 0) { 28829 ASSERT(connp != NULL); 28830 q = CONNP_TO_WQ(connp); 28831 } else { 28832 ASSERT(connp == NULL); 28833 q = ill->ill_rq; 28834 } 28835 28836 /* 28837 * If the DL_BIND_REQ fails, it is noted 28838 * in arc_name_offset. 28839 */ 28840 err = *((int *)mp2->b_rptr); 28841 if (err == 0) { 28842 if (ipif->ipif_isv6) { 28843 if ((err = ipif_up_done_v6(ipif)) != 0) 28844 ip0dbg(("ip_arp_done: init failed\n")); 28845 } else { 28846 if ((err = ipif_up_done(ipif)) != 0) 28847 ip0dbg(("ip_arp_done: init failed\n")); 28848 } 28849 } else { 28850 ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); 28851 } 28852 28853 freemsg(mp2); 28854 28855 if ((err == 0) && (ill->ill_up_ipifs)) { 28856 err = ill_up_ipifs(ill, q, mp1); 28857 if (err == EINPROGRESS) 28858 return; 28859 } 28860 28861 if (ill->ill_up_ipifs) 28862 ill_group_cleanup(ill); 28863 28864 /* 28865 * The operation must complete without EINPROGRESS since 28866 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 28867 * Otherwise, the operation will be stuck forever in the ipsq. 28868 */ 28869 ASSERT(err != EINPROGRESS); 28870 if (ipsq->ipsq_current_ioctl != 0) 28871 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); 28872 else 28873 ipsq_current_finish(ipsq); 28874 } 28875 28876 /* Allocate the private structure */ 28877 static int 28878 ip_priv_alloc(void **bufp) 28879 { 28880 void *buf; 28881 28882 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL) 28883 return (ENOMEM); 28884 28885 *bufp = buf; 28886 return (0); 28887 } 28888 28889 /* Function to delete the private structure */ 28890 void 28891 ip_priv_free(void *buf) 28892 { 28893 ASSERT(buf != NULL); 28894 kmem_free(buf, sizeof (ip_priv_t)); 28895 } 28896 28897 /* 28898 * The entry point for IPPF processing. 28899 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the 28900 * routine just returns. 
28901 * 28902 * When called, ip_process generates an ipp_packet_t structure 28903 * which holds the state information for this packet and invokes the 28904 * the classifier (via ipp_packet_process). The classification, depending on 28905 * configured filters, results in a list of actions for this packet. Invoking 28906 * an action may cause the packet to be dropped, in which case the resulting 28907 * mblk (*mpp) is NULL. proc indicates the callout position for 28908 * this packet and ill_index is the interface this packet on or will leave 28909 * on (inbound and outbound resp.). 28910 */ 28911 void 28912 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 28913 { 28914 mblk_t *mp; 28915 ip_priv_t *priv; 28916 ipp_action_id_t aid; 28917 int rc = 0; 28918 ipp_packet_t *pp; 28919 #define IP_CLASS "ip" 28920 28921 /* If the classifier is not loaded, return */ 28922 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 28923 return; 28924 } 28925 28926 mp = *mpp; 28927 ASSERT(mp != NULL); 28928 28929 /* Allocate the packet structure */ 28930 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 28931 if (rc != 0) { 28932 *mpp = NULL; 28933 freemsg(mp); 28934 return; 28935 } 28936 28937 /* Allocate the private structure */ 28938 rc = ip_priv_alloc((void **)&priv); 28939 if (rc != 0) { 28940 *mpp = NULL; 28941 freemsg(mp); 28942 ipp_packet_free(pp); 28943 return; 28944 } 28945 priv->proc = proc; 28946 priv->ill_index = ill_index; 28947 ipp_packet_set_private(pp, priv, ip_priv_free); 28948 ipp_packet_set_data(pp, mp); 28949 28950 /* Invoke the classifier */ 28951 rc = ipp_packet_process(&pp); 28952 if (pp != NULL) { 28953 mp = ipp_packet_get_data(pp); 28954 ipp_packet_free(pp); 28955 if (rc != 0) { 28956 freemsg(mp); 28957 *mpp = NULL; 28958 } 28959 } else { 28960 *mpp = NULL; 28961 } 28962 #undef IP_CLASS 28963 } 28964 28965 /* 28966 * Propagate a multicast group membership operation (add/drop) on 28967 * all the interfaces crossed by the related multirt 
routes. 28968 * The call is considered successful if the operation succeeds 28969 * on at least one interface. 28970 */ 28971 static int 28972 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 28973 uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, 28974 boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, 28975 mblk_t *first_mp) 28976 { 28977 ire_t *ire_gw; 28978 irb_t *irb; 28979 int error = 0; 28980 opt_restart_t *or; 28981 ip_stack_t *ipst = ire->ire_ipst; 28982 28983 irb = ire->ire_bucket; 28984 ASSERT(irb != NULL); 28985 28986 ASSERT(DB_TYPE(first_mp) == M_CTL); 28987 28988 or = (opt_restart_t *)first_mp->b_rptr; 28989 IRB_REFHOLD(irb); 28990 for (; ire != NULL; ire = ire->ire_next) { 28991 if ((ire->ire_flags & RTF_MULTIRT) == 0) 28992 continue; 28993 if (ire->ire_addr != group) 28994 continue; 28995 28996 ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, 28997 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, 28998 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); 28999 /* No resolver exists for the gateway; skip this ire. */ 29000 if (ire_gw == NULL) 29001 continue; 29002 29003 /* 29004 * This function can return EINPROGRESS. If so the operation 29005 * will be restarted from ip_restart_optmgmt which will 29006 * call ip_opt_set and option processing will restart for 29007 * this option. So we may end up calling 'fn' more than once. 29008 * This requires that 'fn' is idempotent except for the 29009 * return value. The operation is considered a success if 29010 * it succeeds at least once on any one interface. 
29011 */ 29012 error = fn(connp, checkonly, group, ire_gw->ire_src_addr, 29013 NULL, fmode, src, first_mp); 29014 if (error == 0) 29015 or->or_private = CGTP_MCAST_SUCCESS; 29016 29017 if (ip_debug > 0) { 29018 ulong_t off; 29019 char *ksym; 29020 ksym = kobj_getsymname((uintptr_t)fn, &off); 29021 ip2dbg(("ip_multirt_apply_membership: " 29022 "called %s, multirt group 0x%08x via itf 0x%08x, " 29023 "error %d [success %u]\n", 29024 ksym ? ksym : "?", 29025 ntohl(group), ntohl(ire_gw->ire_src_addr), 29026 error, or->or_private)); 29027 } 29028 29029 ire_refrele(ire_gw); 29030 if (error == EINPROGRESS) { 29031 IRB_REFRELE(irb); 29032 return (error); 29033 } 29034 } 29035 IRB_REFRELE(irb); 29036 /* 29037 * Consider the call as successful if we succeeded on at least 29038 * one interface. Otherwise, return the last encountered error. 29039 */ 29040 return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); 29041 } 29042 29043 29044 /* 29045 * Issue a warning regarding a route crossing an interface with an 29046 * incorrect MTU. Only one message every 'ip_multirt_log_interval' 29047 * amount of time is logged. 29048 */ 29049 static void 29050 ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) 29051 { 29052 hrtime_t current = gethrtime(); 29053 char buf[INET_ADDRSTRLEN]; 29054 ip_stack_t *ipst = ire->ire_ipst; 29055 29056 /* Convert interval in ms to hrtime in ns */ 29057 if (ipst->ips_multirt_bad_mtu_last_time + 29058 ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= 29059 current) { 29060 cmn_err(CE_WARN, "ip: ignoring multiroute " 29061 "to %s, incorrect MTU %u (expected %u)\n", 29062 ip_dot_addr(ire->ire_addr, buf), 29063 ire->ire_max_frag, max_frag); 29064 29065 ipst->ips_multirt_bad_mtu_last_time = current; 29066 } 29067 } 29068 29069 29070 /* 29071 * Get the CGTP (multirouting) filtering status. 29072 * If 0, the CGTP hooks are transparent. 
29073 */ 29074 /* ARGSUSED */ 29075 static int 29076 ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 29077 { 29078 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29079 29080 (void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value); 29081 return (0); 29082 } 29083 29084 29085 /* 29086 * Set the CGTP (multirouting) filtering status. 29087 * If the status is changed from active to transparent 29088 * or from transparent to active, forward the new status 29089 * to the filtering module (if loaded). 29090 */ 29091 /* ARGSUSED */ 29092 static int 29093 ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 29094 cred_t *ioc_cr) 29095 { 29096 long new_value; 29097 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 29098 ip_stack_t *ipst = CONNQ_TO_IPST(q); 29099 29100 if (secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 29101 return (EPERM); 29102 29103 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 29104 new_value < 0 || new_value > 1) { 29105 return (EINVAL); 29106 } 29107 29108 if ((!*ip_cgtp_filter_value) && new_value) { 29109 cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s", 29110 ipst->ips_ip_cgtp_filter_ops == NULL ? 29111 " (module not loaded)" : ""); 29112 } 29113 if (*ip_cgtp_filter_value && (!new_value)) { 29114 cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s", 29115 ipst->ips_ip_cgtp_filter_ops == NULL ? 29116 " (module not loaded)" : ""); 29117 } 29118 29119 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29120 int res; 29121 netstackid_t stackid; 29122 29123 stackid = ipst->ips_netstack->netstack_stackid; 29124 res = ipst->ips_ip_cgtp_filter_ops->cfo_change_state(stackid, 29125 new_value); 29126 if (res) 29127 return (res); 29128 } 29129 29130 *ip_cgtp_filter_value = (boolean_t)new_value; 29131 29132 return (0); 29133 } 29134 29135 29136 /* 29137 * Return the expected CGTP hooks version number. 
29138 */ 29139 int 29140 ip_cgtp_filter_supported(void) 29141 { 29142 return (ip_cgtp_filter_rev); 29143 } 29144 29145 29146 /* 29147 * CGTP hooks can be registered by invoking this function. 29148 * Checks that the version number matches. 29149 */ 29150 int 29151 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) 29152 { 29153 netstack_t *ns; 29154 ip_stack_t *ipst; 29155 29156 if (ops->cfo_filter_rev != CGTP_FILTER_REV) 29157 return (ENOTSUP); 29158 29159 ns = netstack_find_by_stackid(stackid); 29160 if (ns == NULL) 29161 return (EINVAL); 29162 ipst = ns->netstack_ip; 29163 ASSERT(ipst != NULL); 29164 29165 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 29166 netstack_rele(ns); 29167 return (EALREADY); 29168 } 29169 29170 ipst->ips_ip_cgtp_filter_ops = ops; 29171 netstack_rele(ns); 29172 return (0); 29173 } 29174 29175 /* 29176 * CGTP hooks can be unregistered by invoking this function. 29177 * Returns ENXIO if there was no registration. 29178 * Returns EBUSY if the ndd variable has not been turned off. 29179 */ 29180 int 29181 ip_cgtp_filter_unregister(netstackid_t stackid) 29182 { 29183 netstack_t *ns; 29184 ip_stack_t *ipst; 29185 29186 ns = netstack_find_by_stackid(stackid); 29187 if (ns == NULL) 29188 return (EINVAL); 29189 ipst = ns->netstack_ip; 29190 ASSERT(ipst != NULL); 29191 29192 if (ipst->ips_ip_cgtp_filter) { 29193 netstack_rele(ns); 29194 return (EBUSY); 29195 } 29196 29197 if (ipst->ips_ip_cgtp_filter_ops == NULL) { 29198 netstack_rele(ns); 29199 return (ENXIO); 29200 } 29201 ipst->ips_ip_cgtp_filter_ops = NULL; 29202 netstack_rele(ns); 29203 return (0); 29204 } 29205 29206 /* 29207 * Check whether there is a CGTP filter registration. 29208 * Returns non-zero if there is a registration, otherwise returns zero. 29209 * Note: returns zero if bad stackid. 
29210 */ 29211 int 29212 ip_cgtp_filter_is_registered(netstackid_t stackid) 29213 { 29214 netstack_t *ns; 29215 ip_stack_t *ipst; 29216 int ret; 29217 29218 ns = netstack_find_by_stackid(stackid); 29219 if (ns == NULL) 29220 return (0); 29221 ipst = ns->netstack_ip; 29222 ASSERT(ipst != NULL); 29223 29224 if (ipst->ips_ip_cgtp_filter_ops != NULL) 29225 ret = 1; 29226 else 29227 ret = 0; 29228 29229 netstack_rele(ns); 29230 return (ret); 29231 } 29232 29233 static squeue_func_t 29234 ip_squeue_switch(int val) 29235 { 29236 squeue_func_t rval = squeue_fill; 29237 29238 switch (val) { 29239 case IP_SQUEUE_ENTER_NODRAIN: 29240 rval = squeue_enter_nodrain; 29241 break; 29242 case IP_SQUEUE_ENTER: 29243 rval = squeue_enter; 29244 break; 29245 default: 29246 break; 29247 } 29248 return (rval); 29249 } 29250 29251 /* ARGSUSED */ 29252 static int 29253 ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 29254 caddr_t addr, cred_t *cr) 29255 { 29256 int *v = (int *)addr; 29257 long new_value; 29258 29259 if (secpolicy_net_config(cr, B_FALSE) != 0) 29260 return (EPERM); 29261 29262 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29263 return (EINVAL); 29264 29265 ip_input_proc = ip_squeue_switch(new_value); 29266 *v = new_value; 29267 return (0); 29268 } 29269 29270 /* 29271 * Handle ndd set of variables which require PRIV_SYS_NET_CONFIG such as 29272 * ip_debug. 29273 */ 29274 /* ARGSUSED */ 29275 static int 29276 ip_int_set(queue_t *q, mblk_t *mp, char *value, 29277 caddr_t addr, cred_t *cr) 29278 { 29279 int *v = (int *)addr; 29280 long new_value; 29281 29282 if (secpolicy_net_config(cr, B_FALSE) != 0) 29283 return (EPERM); 29284 29285 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29286 return (EINVAL); 29287 29288 *v = new_value; 29289 return (0); 29290 } 29291 29292 /* 29293 * Handle changes to ipmp_hook_emulation ndd variable. 29294 * Need to update phyint_hook_ifindex. 29295 * Also generate a nic plumb event should a new ifidex be assigned to a group. 
29296 */ 29297 static void 29298 ipmp_hook_emulation_changed(ip_stack_t *ipst) 29299 { 29300 phyint_t *phyi; 29301 phyint_t *phyi_tmp; 29302 char *groupname; 29303 int namelen; 29304 ill_t *ill; 29305 boolean_t new_group; 29306 29307 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 29308 /* 29309 * Group indicies are stored in the phyint - a common structure 29310 * to both IPv4 and IPv6. 29311 */ 29312 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 29313 for (; phyi != NULL; 29314 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 29315 phyi, AVL_AFTER)) { 29316 /* Ignore the ones that do not have a group */ 29317 if (phyi->phyint_groupname_len == 0) 29318 continue; 29319 29320 /* 29321 * Look for other phyint in group. 29322 * Clear name/namelen so the lookup doesn't find ourselves. 29323 */ 29324 namelen = phyi->phyint_groupname_len; 29325 groupname = phyi->phyint_groupname; 29326 phyi->phyint_groupname_len = 0; 29327 phyi->phyint_groupname = NULL; 29328 29329 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); 29330 /* Restore */ 29331 phyi->phyint_groupname_len = namelen; 29332 phyi->phyint_groupname = groupname; 29333 29334 new_group = B_FALSE; 29335 if (ipst->ips_ipmp_hook_emulation) { 29336 /* 29337 * If the group already exists and has already 29338 * been assigned a group ifindex, we use the existing 29339 * group_ifindex, otherwise we pick a new group_ifindex 29340 * here. 29341 */ 29342 if (phyi_tmp != NULL && 29343 phyi_tmp->phyint_group_ifindex != 0) { 29344 phyi->phyint_group_ifindex = 29345 phyi_tmp->phyint_group_ifindex; 29346 } else { 29347 /* XXX We need a recovery strategy here. 
*/ 29348 if (!ip_assign_ifindex( 29349 &phyi->phyint_group_ifindex, ipst)) 29350 cmn_err(CE_PANIC, 29351 "ip_assign_ifindex() failed"); 29352 new_group = B_TRUE; 29353 } 29354 } else { 29355 phyi->phyint_group_ifindex = 0; 29356 } 29357 if (ipst->ips_ipmp_hook_emulation) 29358 phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; 29359 else 29360 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 29361 29362 /* 29363 * For IP Filter to find out the relationship between 29364 * names and interface indicies, we need to generate 29365 * a NE_PLUMB event when a new group can appear. 29366 * We always generate events when a new interface appears 29367 * (even when ipmp_hook_emulation is set) so there 29368 * is no need to generate NE_PLUMB events when 29369 * ipmp_hook_emulation is turned off. 29370 * And since it isn't critical for IP Filter to get 29371 * the NE_UNPLUMB events we skip those here. 29372 */ 29373 if (new_group) { 29374 /* 29375 * First phyint in group - generate group PLUMB event. 29376 * Since we are not running inside the ipsq we do 29377 * the dispatch immediately. 
29378 */ 29379 if (phyi->phyint_illv4 != NULL) 29380 ill = phyi->phyint_illv4; 29381 else 29382 ill = phyi->phyint_illv6; 29383 29384 if (ill != NULL) { 29385 mutex_enter(&ill->ill_lock); 29386 ill_nic_info_plumb(ill, B_TRUE); 29387 ill_nic_info_dispatch(ill); 29388 mutex_exit(&ill->ill_lock); 29389 } 29390 } 29391 } 29392 rw_exit(&ipst->ips_ill_g_lock); 29393 } 29394 29395 /* ARGSUSED */ 29396 static int 29397 ipmp_hook_emulation_set(queue_t *q, mblk_t *mp, char *value, 29398 caddr_t addr, cred_t *cr) 29399 { 29400 int *v = (int *)addr; 29401 long new_value; 29402 ip_stack_t *ipst = CONNQ_TO_IPST(q); 29403 29404 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 29405 return (EINVAL); 29406 29407 if (*v != new_value) { 29408 *v = new_value; 29409 ipmp_hook_emulation_changed(ipst); 29410 } 29411 return (0); 29412 } 29413 29414 static void * 29415 ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) 29416 { 29417 kstat_t *ksp; 29418 29419 ip_stat_t template = { 29420 { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, 29421 { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, 29422 { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, 29423 { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, 29424 { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, 29425 { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, 29426 { "ip_udp_input_err", KSTAT_DATA_UINT64 }, 29427 { "ip_tcppullup", KSTAT_DATA_UINT64 }, 29428 { "ip_tcpoptions", KSTAT_DATA_UINT64 }, 29429 { "ip_multipkttcp", KSTAT_DATA_UINT64 }, 29430 { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, 29431 { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, 29432 { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, 29433 { "ip_db_ref", KSTAT_DATA_UINT64 }, 29434 { "ip_notaligned1", KSTAT_DATA_UINT64 }, 29435 { "ip_notaligned2", KSTAT_DATA_UINT64 }, 29436 { "ip_multimblk3", KSTAT_DATA_UINT64 }, 29437 { "ip_multimblk4", KSTAT_DATA_UINT64 }, 29438 { "ip_ipoptions", KSTAT_DATA_UINT64 }, 29439 { "ip_classify_fail", KSTAT_DATA_UINT64 }, 29440 { "ip_opt", KSTAT_DATA_UINT64 }, 29441 { 
"ip_udp_rput_local", KSTAT_DATA_UINT64 }, 29442 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, 29443 { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, 29444 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, 29445 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, 29446 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, 29447 { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, 29448 { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, 29449 { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, 29450 { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, 29451 { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, 29452 { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, 29453 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 29454 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 29455 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 29456 { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 29457 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 29458 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 29459 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 29460 { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 29461 { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, 29462 { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, 29463 { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, 29464 { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 29465 { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, 29466 }; 29467 29468 ksp = kstat_create_netstack("ip", 0, "ipstat", "net", 29469 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 29470 KSTAT_FLAG_VIRTUAL, stackid); 29471 29472 if (ksp == NULL) 29473 return (NULL); 29474 29475 bcopy(&template, ip_statisticsp, sizeof (template)); 29476 ksp->ks_data = (void *)ip_statisticsp; 29477 ksp->ks_private = (void *)(uintptr_t)stackid; 29478 29479 kstat_install(ksp); 29480 return (ksp); 29481 } 29482 29483 static void 29484 ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp) 29485 { 29486 if (ksp != NULL) { 29487 ASSERT(stackid == 
(netstackid_t)(uintptr_t)ksp->ks_private); 29488 kstat_delete_netstack(ksp, stackid); 29489 } 29490 } 29491 29492 static void * 29493 ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst) 29494 { 29495 kstat_t *ksp; 29496 29497 ip_named_kstat_t template = { 29498 { "forwarding", KSTAT_DATA_UINT32, 0 }, 29499 { "defaultTTL", KSTAT_DATA_UINT32, 0 }, 29500 { "inReceives", KSTAT_DATA_UINT64, 0 }, 29501 { "inHdrErrors", KSTAT_DATA_UINT32, 0 }, 29502 { "inAddrErrors", KSTAT_DATA_UINT32, 0 }, 29503 { "forwDatagrams", KSTAT_DATA_UINT64, 0 }, 29504 { "inUnknownProtos", KSTAT_DATA_UINT32, 0 }, 29505 { "inDiscards", KSTAT_DATA_UINT32, 0 }, 29506 { "inDelivers", KSTAT_DATA_UINT64, 0 }, 29507 { "outRequests", KSTAT_DATA_UINT64, 0 }, 29508 { "outDiscards", KSTAT_DATA_UINT32, 0 }, 29509 { "outNoRoutes", KSTAT_DATA_UINT32, 0 }, 29510 { "reasmTimeout", KSTAT_DATA_UINT32, 0 }, 29511 { "reasmReqds", KSTAT_DATA_UINT32, 0 }, 29512 { "reasmOKs", KSTAT_DATA_UINT32, 0 }, 29513 { "reasmFails", KSTAT_DATA_UINT32, 0 }, 29514 { "fragOKs", KSTAT_DATA_UINT32, 0 }, 29515 { "fragFails", KSTAT_DATA_UINT32, 0 }, 29516 { "fragCreates", KSTAT_DATA_UINT32, 0 }, 29517 { "addrEntrySize", KSTAT_DATA_INT32, 0 }, 29518 { "routeEntrySize", KSTAT_DATA_INT32, 0 }, 29519 { "netToMediaEntrySize", KSTAT_DATA_INT32, 0 }, 29520 { "routingDiscards", KSTAT_DATA_UINT32, 0 }, 29521 { "inErrs", KSTAT_DATA_UINT32, 0 }, 29522 { "noPorts", KSTAT_DATA_UINT32, 0 }, 29523 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 29524 { "reasmDuplicates", KSTAT_DATA_UINT32, 0 }, 29525 { "reasmPartDups", KSTAT_DATA_UINT32, 0 }, 29526 { "forwProhibits", KSTAT_DATA_UINT32, 0 }, 29527 { "udpInCksumErrs", KSTAT_DATA_UINT32, 0 }, 29528 { "udpInOverflows", KSTAT_DATA_UINT32, 0 }, 29529 { "rawipInOverflows", KSTAT_DATA_UINT32, 0 }, 29530 { "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 }, 29531 { "ipsecInFailed", KSTAT_DATA_INT32, 0 }, 29532 { "memberEntrySize", KSTAT_DATA_INT32, 0 }, 29533 { "inIPv6", KSTAT_DATA_UINT32, 0 }, 29534 { "outIPv6", 
KSTAT_DATA_UINT32, 0 }, 29535 { "outSwitchIPv6", KSTAT_DATA_UINT32, 0 }, 29536 }; 29537 29538 ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED, 29539 NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid); 29540 if (ksp == NULL || ksp->ks_data == NULL) 29541 return (NULL); 29542 29543 template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2; 29544 template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl; 29545 template.reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout; 29546 template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t); 29547 template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t); 29548 29549 template.netToMediaEntrySize.value.i32 = 29550 sizeof (mib2_ipNetToMediaEntry_t); 29551 29552 template.memberEntrySize.value.i32 = sizeof (ipv6_member_t); 29553 29554 bcopy(&template, ksp->ks_data, sizeof (template)); 29555 ksp->ks_update = ip_kstat_update; 29556 ksp->ks_private = (void *)(uintptr_t)stackid; 29557 29558 kstat_install(ksp); 29559 return (ksp); 29560 } 29561 29562 static void 29563 ip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 29564 { 29565 if (ksp != NULL) { 29566 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 29567 kstat_delete_netstack(ksp, stackid); 29568 } 29569 } 29570 29571 static int 29572 ip_kstat_update(kstat_t *kp, int rw) 29573 { 29574 ip_named_kstat_t *ipkp; 29575 mib2_ipIfStatsEntry_t ipmib; 29576 ill_walk_context_t ctx; 29577 ill_t *ill; 29578 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private; 29579 netstack_t *ns; 29580 ip_stack_t *ipst; 29581 29582 if (kp == NULL || kp->ks_data == NULL) 29583 return (EIO); 29584 29585 if (rw == KSTAT_WRITE) 29586 return (EACCES); 29587 29588 ns = netstack_find_by_stackid(stackid); 29589 if (ns == NULL) 29590 return (-1); 29591 ipst = ns->netstack_ip; 29592 if (ipst == NULL) { 29593 netstack_rele(ns); 29594 return (-1); 29595 } 29596 ipkp = (ip_named_kstat_t *)kp->ks_data; 29597 29598 bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib)); 
29599 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 29600 ill = ILL_START_WALK_V4(&ctx, ipst); 29601 for (; ill != NULL; ill = ill_next(&ctx, ill)) 29602 ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib); 29603 rw_exit(&ipst->ips_ill_g_lock); 29604 29605 ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding; 29606 ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL; 29607 ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives; 29608 ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors; 29609 ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors; 29610 ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams; 29611 ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos; 29612 ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards; 29613 ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers; 29614 ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests; 29615 ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards; 29616 ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes; 29617 ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_g_frag_timeout; 29618 ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds; 29619 ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs; 29620 ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails; 29621 ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs; 29622 ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails; 29623 ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates; 29624 29625 ipkp->routingDiscards.value.ui32 = 0; 29626 ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs; 29627 ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts; 29628 ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs; 29629 ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates; 29630 ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups; 29631 ipkp->forwProhibits.value.ui32 = ipmib.ipIfStatsForwProhibits; 29632 ipkp->udpInCksumErrs.value.ui32 = ipmib.udpIfStatsInCksumErrs; 29633 
ipkp->udpInOverflows.value.ui32 = ipmib.udpIfStatsInOverflows; 29634 ipkp->rawipInOverflows.value.ui32 = ipmib.rawipIfStatsInOverflows; 29635 ipkp->ipsecInSucceeded.value.ui32 = ipmib.ipsecIfStatsInSucceeded; 29636 ipkp->ipsecInFailed.value.i32 = ipmib.ipsecIfStatsInFailed; 29637 29638 ipkp->inIPv6.value.ui32 = ipmib.ipIfStatsInWrongIPVersion; 29639 ipkp->outIPv6.value.ui32 = ipmib.ipIfStatsOutWrongIPVersion; 29640 ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion; 29641 29642 netstack_rele(ns); 29643 29644 return (0); 29645 } 29646 29647 static void * 29648 icmp_kstat_init(netstackid_t stackid) 29649 { 29650 kstat_t *ksp; 29651 29652 icmp_named_kstat_t template = { 29653 { "inMsgs", KSTAT_DATA_UINT32 }, 29654 { "inErrors", KSTAT_DATA_UINT32 }, 29655 { "inDestUnreachs", KSTAT_DATA_UINT32 }, 29656 { "inTimeExcds", KSTAT_DATA_UINT32 }, 29657 { "inParmProbs", KSTAT_DATA_UINT32 }, 29658 { "inSrcQuenchs", KSTAT_DATA_UINT32 }, 29659 { "inRedirects", KSTAT_DATA_UINT32 }, 29660 { "inEchos", KSTAT_DATA_UINT32 }, 29661 { "inEchoReps", KSTAT_DATA_UINT32 }, 29662 { "inTimestamps", KSTAT_DATA_UINT32 }, 29663 { "inTimestampReps", KSTAT_DATA_UINT32 }, 29664 { "inAddrMasks", KSTAT_DATA_UINT32 }, 29665 { "inAddrMaskReps", KSTAT_DATA_UINT32 }, 29666 { "outMsgs", KSTAT_DATA_UINT32 }, 29667 { "outErrors", KSTAT_DATA_UINT32 }, 29668 { "outDestUnreachs", KSTAT_DATA_UINT32 }, 29669 { "outTimeExcds", KSTAT_DATA_UINT32 }, 29670 { "outParmProbs", KSTAT_DATA_UINT32 }, 29671 { "outSrcQuenchs", KSTAT_DATA_UINT32 }, 29672 { "outRedirects", KSTAT_DATA_UINT32 }, 29673 { "outEchos", KSTAT_DATA_UINT32 }, 29674 { "outEchoReps", KSTAT_DATA_UINT32 }, 29675 { "outTimestamps", KSTAT_DATA_UINT32 }, 29676 { "outTimestampReps", KSTAT_DATA_UINT32 }, 29677 { "outAddrMasks", KSTAT_DATA_UINT32 }, 29678 { "outAddrMaskReps", KSTAT_DATA_UINT32 }, 29679 { "inChksumErrs", KSTAT_DATA_UINT32 }, 29680 { "inUnknowns", KSTAT_DATA_UINT32 }, 29681 { "inFragNeeded", KSTAT_DATA_UINT32 }, 29682 { 
"outFragNeeded", KSTAT_DATA_UINT32 }, 29683 { "outDrops", KSTAT_DATA_UINT32 }, 29684 { "inOverFlows", KSTAT_DATA_UINT32 }, 29685 { "inBadRedirects", KSTAT_DATA_UINT32 }, 29686 }; 29687 29688 ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED, 29689 NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid); 29690 if (ksp == NULL || ksp->ks_data == NULL) 29691 return (NULL); 29692 29693 bcopy(&template, ksp->ks_data, sizeof (template)); 29694 29695 ksp->ks_update = icmp_kstat_update; 29696 ksp->ks_private = (void *)(uintptr_t)stackid; 29697 29698 kstat_install(ksp); 29699 return (ksp); 29700 } 29701 29702 static void 29703 icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp) 29704 { 29705 if (ksp != NULL) { 29706 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 29707 kstat_delete_netstack(ksp, stackid); 29708 } 29709 } 29710 29711 static int 29712 icmp_kstat_update(kstat_t *kp, int rw) 29713 { 29714 icmp_named_kstat_t *icmpkp; 29715 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private; 29716 netstack_t *ns; 29717 ip_stack_t *ipst; 29718 29719 if ((kp == NULL) || (kp->ks_data == NULL)) 29720 return (EIO); 29721 29722 if (rw == KSTAT_WRITE) 29723 return (EACCES); 29724 29725 ns = netstack_find_by_stackid(stackid); 29726 if (ns == NULL) 29727 return (-1); 29728 ipst = ns->netstack_ip; 29729 if (ipst == NULL) { 29730 netstack_rele(ns); 29731 return (-1); 29732 } 29733 icmpkp = (icmp_named_kstat_t *)kp->ks_data; 29734 29735 icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs; 29736 icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors; 29737 icmpkp->inDestUnreachs.value.ui32 = 29738 ipst->ips_icmp_mib.icmpInDestUnreachs; 29739 icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds; 29740 icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs; 29741 icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs; 29742 icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects; 29743 
icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos; 29744 icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps; 29745 icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps; 29746 icmpkp->inTimestampReps.value.ui32 = 29747 ipst->ips_icmp_mib.icmpInTimestampReps; 29748 icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks; 29749 icmpkp->inAddrMaskReps.value.ui32 = 29750 ipst->ips_icmp_mib.icmpInAddrMaskReps; 29751 icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs; 29752 icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors; 29753 icmpkp->outDestUnreachs.value.ui32 = 29754 ipst->ips_icmp_mib.icmpOutDestUnreachs; 29755 icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds; 29756 icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs; 29757 icmpkp->outSrcQuenchs.value.ui32 = 29758 ipst->ips_icmp_mib.icmpOutSrcQuenchs; 29759 icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects; 29760 icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos; 29761 icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps; 29762 icmpkp->outTimestamps.value.ui32 = 29763 ipst->ips_icmp_mib.icmpOutTimestamps; 29764 icmpkp->outTimestampReps.value.ui32 = 29765 ipst->ips_icmp_mib.icmpOutTimestampReps; 29766 icmpkp->outAddrMasks.value.ui32 = 29767 ipst->ips_icmp_mib.icmpOutAddrMasks; 29768 icmpkp->outAddrMaskReps.value.ui32 = 29769 ipst->ips_icmp_mib.icmpOutAddrMaskReps; 29770 icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs; 29771 icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns; 29772 icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded; 29773 icmpkp->outFragNeeded.value.ui32 = 29774 ipst->ips_icmp_mib.icmpOutFragNeeded; 29775 icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops; 29776 icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows; 29777 icmpkp->inBadRedirects.value.ui32 = 29778 
ipst->ips_icmp_mib.icmpInBadRedirects; 29779 29780 netstack_rele(ns); 29781 return (0); 29782 } 29783 29784 /* 29785 * This is the fanout function for raw socket opened for SCTP. Note 29786 * that it is called after SCTP checks that there is no socket which 29787 * wants a packet. Then before SCTP handles this out of the blue packet, 29788 * this function is called to see if there is any raw socket for SCTP. 29789 * If there is and it is bound to the correct address, the packet will 29790 * be sent to that socket. Note that only one raw socket can be bound to 29791 * a port. This is assured in ipcl_sctp_hash_insert(); 29792 */ 29793 void 29794 ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, 29795 uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, 29796 zoneid_t zoneid) 29797 { 29798 conn_t *connp; 29799 queue_t *rq; 29800 mblk_t *first_mp; 29801 boolean_t secure; 29802 ip6_t *ip6h; 29803 ip_stack_t *ipst = recv_ill->ill_ipst; 29804 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 29805 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; 29806 boolean_t sctp_csum_err = B_FALSE; 29807 29808 if (flags & IP_FF_SCTP_CSUM_ERR) { 29809 sctp_csum_err = B_TRUE; 29810 flags &= ~IP_FF_SCTP_CSUM_ERR; 29811 } 29812 29813 first_mp = mp; 29814 if (mctl_present) { 29815 mp = first_mp->b_cont; 29816 secure = ipsec_in_is_secure(first_mp); 29817 ASSERT(mp != NULL); 29818 } else { 29819 secure = B_FALSE; 29820 } 29821 ip6h = (isv4) ? NULL : (ip6_t *)ipha; 29822 29823 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst); 29824 if (connp == NULL) { 29825 /* 29826 * Although raw sctp is not summed, OOB chunks must be. 29827 * Drop the packet here if the sctp checksum failed. 
29828 */ 29829 if (sctp_csum_err) { 29830 BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); 29831 freemsg(first_mp); 29832 return; 29833 } 29834 sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present); 29835 return; 29836 } 29837 rq = connp->conn_rq; 29838 if (!canputnext(rq)) { 29839 CONN_DEC_REF(connp); 29840 BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows); 29841 freemsg(first_mp); 29842 return; 29843 } 29844 if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) : 29845 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) { 29846 first_mp = ipsec_check_inbound_policy(first_mp, connp, 29847 (isv4 ? ipha : NULL), ip6h, mctl_present); 29848 if (first_mp == NULL) { 29849 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); 29850 CONN_DEC_REF(connp); 29851 return; 29852 } 29853 } 29854 /* 29855 * We probably should not send M_CTL message up to 29856 * raw socket. 29857 */ 29858 if (mctl_present) 29859 freeb(first_mp); 29860 29861 /* Initiate IPPF processing here if needed. */ 29862 if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) || 29863 (!isv4 && IP6_IN_IPP(flags, ipst))) { 29864 ip_process(IPP_LOCAL_IN, &mp, 29865 recv_ill->ill_phyint->phyint_ifindex); 29866 if (mp == NULL) { 29867 CONN_DEC_REF(connp); 29868 return; 29869 } 29870 } 29871 29872 if (connp->conn_recvif || connp->conn_recvslla || 29873 ((connp->conn_ip_recvpktinfo || 29874 (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && 29875 (flags & IP_FF_IPINFO))) { 29876 int in_flags = 0; 29877 29878 /* 29879 * Since sctp does not support IP_RECVPKTINFO for v4, only pass 29880 * IPF_RECVIF. 
29881 */ 29882 if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { 29883 in_flags = IPF_RECVIF; 29884 } 29885 if (connp->conn_recvslla) { 29886 in_flags |= IPF_RECVSLLA; 29887 } 29888 if (isv4) { 29889 mp = ip_add_info(mp, recv_ill, in_flags, 29890 IPCL_ZONEID(connp), ipst); 29891 } else { 29892 mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); 29893 if (mp == NULL) { 29894 BUMP_MIB(recv_ill->ill_ip_mib, 29895 ipIfStatsInDiscards); 29896 CONN_DEC_REF(connp); 29897 return; 29898 } 29899 } 29900 } 29901 29902 BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); 29903 /* 29904 * We are sending the IPSEC_IN message also up. Refer 29905 * to comments above this function. 29906 * This is the SOCK_RAW, IPPROTO_SCTP case. 29907 */ 29908 (connp->conn_recv)(connp, mp, NULL); 29909 CONN_DEC_REF(connp); 29910 } 29911 29912 #define UPDATE_IP_MIB_OB_COUNTERS(ill, len) \ 29913 { \ 29914 BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits); \ 29915 UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len)); \ 29916 } 29917 /* 29918 * This function should be called only if all packet processing 29919 * including fragmentation is complete. Callers of this function 29920 * must set mp->b_prev to one of these values: 29921 * {0, IPP_FWD_OUT, IPP_LOCAL_OUT} 29922 * prior to handing over the mp as first argument to this function. 29923 * 29924 * If the ire passed by caller is incomplete, this function 29925 * queues the packet and if necessary, sends ARP request and bails. 29926 * If the ire passed is fully resolved, we simply prepend 29927 * the link-layer header to the packet, do ipsec hw acceleration 29928 * work if necessary, and send the packet out on the wire. 29929 * 29930 * NOTE: IPsec will only call this function with fully resolved 29931 * ires if hw acceleration is involved. 
29932 * TODO list : 29933 * a Handle M_MULTIDATA so that 29934 * tcp_multisend->tcp_multisend_data can 29935 * call ip_xmit_v4 directly 29936 * b Handle post-ARP work for fragments so that 29937 * ip_wput_frag can call this function. 29938 */ 29939 ipxmit_state_t 29940 ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) 29941 { 29942 nce_t *arpce; 29943 ipha_t *ipha; 29944 queue_t *q; 29945 int ill_index; 29946 mblk_t *nxt_mp, *first_mp; 29947 boolean_t xmit_drop = B_FALSE; 29948 ip_proc_t proc; 29949 ill_t *out_ill; 29950 int pkt_len; 29951 29952 arpce = ire->ire_nce; 29953 ASSERT(arpce != NULL); 29954 29955 DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce); 29956 29957 mutex_enter(&arpce->nce_lock); 29958 switch (arpce->nce_state) { 29959 case ND_REACHABLE: 29960 /* If there are other queued packets, queue this packet */ 29961 if (arpce->nce_qd_mp != NULL) { 29962 if (mp != NULL) 29963 nce_queue_mp_common(arpce, mp, B_FALSE); 29964 mp = arpce->nce_qd_mp; 29965 } 29966 arpce->nce_qd_mp = NULL; 29967 mutex_exit(&arpce->nce_lock); 29968 29969 /* 29970 * Flush the queue. In the common case, where the 29971 * ARP is already resolved, it will go through the 29972 * while loop only once. 29973 */ 29974 while (mp != NULL) { 29975 29976 nxt_mp = mp->b_next; 29977 mp->b_next = NULL; 29978 ASSERT(mp->b_datap->db_type != M_CTL); 29979 pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length); 29980 /* 29981 * This info is needed for IPQOS to do COS marking 29982 * in ip_wput_attach_llhdr->ip_process. 
29983 */ 29984 proc = (ip_proc_t)(uintptr_t)mp->b_prev; 29985 mp->b_prev = NULL; 29986 29987 /* set up ill index for outbound qos processing */ 29988 out_ill = ire_to_ill(ire); 29989 ill_index = out_ill->ill_phyint->phyint_ifindex; 29990 first_mp = ip_wput_attach_llhdr(mp, ire, proc, 29991 ill_index, &ipha); 29992 if (first_mp == NULL) { 29993 xmit_drop = B_TRUE; 29994 BUMP_MIB(out_ill->ill_ip_mib, 29995 ipIfStatsOutDiscards); 29996 goto next_mp; 29997 } 29998 29999 /* non-ipsec hw accel case */ 30000 if (io == NULL || !io->ipsec_out_accelerated) { 30001 /* send it */ 30002 q = ire->ire_stq; 30003 if (proc == IPP_FWD_OUT) { 30004 UPDATE_IB_PKT_COUNT(ire); 30005 } else { 30006 UPDATE_OB_PKT_COUNT(ire); 30007 } 30008 ire->ire_last_used_time = lbolt; 30009 30010 if (flow_ctl_enabled || canputnext(q)) { 30011 if (proc == IPP_FWD_OUT) { 30012 30013 BUMP_MIB(out_ill->ill_ip_mib, 30014 ipIfStatsHCOutForwDatagrams); 30015 30016 } 30017 UPDATE_IP_MIB_OB_COUNTERS(out_ill, 30018 pkt_len); 30019 30020 DTRACE_IP7(send, mblk_t *, first_mp, 30021 conn_t *, NULL, void_ip_t *, ipha, 30022 __dtrace_ipsr_ill_t *, out_ill, 30023 ipha_t *, ipha, ip6_t *, NULL, int, 30024 0); 30025 30026 putnext(q, first_mp); 30027 } else { 30028 BUMP_MIB(out_ill->ill_ip_mib, 30029 ipIfStatsOutDiscards); 30030 xmit_drop = B_TRUE; 30031 freemsg(first_mp); 30032 } 30033 } else { 30034 /* 30035 * Safety Pup says: make sure this 30036 * is going to the right interface! 
30037 */ 30038 ill_t *ill1 = 30039 (ill_t *)ire->ire_stq->q_ptr; 30040 int ifindex = 30041 ill1->ill_phyint->phyint_ifindex; 30042 if (ifindex != 30043 io->ipsec_out_capab_ill_index) { 30044 xmit_drop = B_TRUE; 30045 freemsg(mp); 30046 } else { 30047 UPDATE_IP_MIB_OB_COUNTERS(ill1, 30048 pkt_len); 30049 30050 DTRACE_IP7(send, mblk_t *, first_mp, 30051 conn_t *, NULL, void_ip_t *, ipha, 30052 __dtrace_ipsr_ill_t *, ill1, 30053 ipha_t *, ipha, ip6_t *, NULL, 30054 int, 0); 30055 30056 ipsec_hw_putnext(ire->ire_stq, mp); 30057 } 30058 } 30059 next_mp: 30060 mp = nxt_mp; 30061 } /* while (mp != NULL) */ 30062 if (xmit_drop) 30063 return (SEND_FAILED); 30064 else 30065 return (SEND_PASSED); 30066 30067 case ND_INITIAL: 30068 case ND_INCOMPLETE: 30069 30070 /* 30071 * While we do send off packets to dests that 30072 * use fully-resolved CGTP routes, we do not 30073 * handle unresolved CGTP routes. 30074 */ 30075 ASSERT(!(ire->ire_flags & RTF_MULTIRT)); 30076 ASSERT(io == NULL || !io->ipsec_out_accelerated); 30077 30078 if (mp != NULL) { 30079 /* queue the packet */ 30080 nce_queue_mp_common(arpce, mp, B_FALSE); 30081 } 30082 30083 if (arpce->nce_state == ND_INCOMPLETE) { 30084 mutex_exit(&arpce->nce_lock); 30085 DTRACE_PROBE3(ip__xmit__incomplete, 30086 (ire_t *), ire, (mblk_t *), mp, 30087 (ipsec_out_t *), io); 30088 return (LOOKUP_IN_PROGRESS); 30089 } 30090 30091 arpce->nce_state = ND_INCOMPLETE; 30092 mutex_exit(&arpce->nce_lock); 30093 /* 30094 * Note that ire_add() (called from ire_forward()) 30095 * holds a ref on the ire until ARP is completed. 30096 */ 30097 30098 ire_arpresolve(ire, ire_to_ill(ire)); 30099 return (LOOKUP_IN_PROGRESS); 30100 default: 30101 ASSERT(0); 30102 mutex_exit(&arpce->nce_lock); 30103 return (LLHDR_RESLV_FAILED); 30104 } 30105 } 30106 30107 #undef UPDATE_IP_MIB_OB_COUNTERS 30108 30109 /* 30110 * Return B_TRUE if the buffers differ in length or content. 30111 * This is used for comparing extension header buffers. 
30112 * Note that an extension header would be declared different 30113 * even if all that changed was the next header value in that header i.e. 30114 * what really changed is the next extension header. 30115 */ 30116 boolean_t 30117 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf, 30118 uint_t blen) 30119 { 30120 if (!b_valid) 30121 blen = 0; 30122 30123 if (alen != blen) 30124 return (B_TRUE); 30125 if (alen == 0) 30126 return (B_FALSE); /* Both zero length */ 30127 return (bcmp(abuf, bbuf, alen)); 30128 } 30129 30130 /* 30131 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok. 30132 * Return B_FALSE if memory allocation fails - don't change any state! 30133 */ 30134 boolean_t 30135 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 30136 const void *src, uint_t srclen) 30137 { 30138 void *dst; 30139 30140 if (!src_valid) 30141 srclen = 0; 30142 30143 ASSERT(*dstlenp == 0); 30144 if (src != NULL && srclen != 0) { 30145 dst = mi_alloc(srclen, BPRI_MED); 30146 if (dst == NULL) 30147 return (B_FALSE); 30148 } else { 30149 dst = NULL; 30150 } 30151 if (*dstp != NULL) 30152 mi_free(*dstp); 30153 *dstp = dst; 30154 *dstlenp = dst == NULL ? 0 : srclen; 30155 return (B_TRUE); 30156 } 30157 30158 /* 30159 * Replace what is in *dst, *dstlen with the source. 30160 * Assumes ip_allocbuf has already been called. 30161 */ 30162 void 30163 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 30164 const void *src, uint_t srclen) 30165 { 30166 if (!src_valid) 30167 srclen = 0; 30168 30169 ASSERT(*dstlenp == srclen); 30170 if (src != NULL && srclen != 0) 30171 bcopy(src, *dstp, srclen); 30172 } 30173 30174 /* 30175 * Free the storage pointed to by the members of an ip6_pkt_t. 
30176 */ 30177 void 30178 ip6_pkt_free(ip6_pkt_t *ipp) 30179 { 30180 ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); 30181 30182 if (ipp->ipp_fields & IPPF_HOPOPTS) { 30183 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); 30184 ipp->ipp_hopopts = NULL; 30185 ipp->ipp_hopoptslen = 0; 30186 } 30187 if (ipp->ipp_fields & IPPF_RTDSTOPTS) { 30188 kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 30189 ipp->ipp_rtdstopts = NULL; 30190 ipp->ipp_rtdstoptslen = 0; 30191 } 30192 if (ipp->ipp_fields & IPPF_DSTOPTS) { 30193 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); 30194 ipp->ipp_dstopts = NULL; 30195 ipp->ipp_dstoptslen = 0; 30196 } 30197 if (ipp->ipp_fields & IPPF_RTHDR) { 30198 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); 30199 ipp->ipp_rthdr = NULL; 30200 ipp->ipp_rthdrlen = 0; 30201 } 30202 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 30203 IPPF_RTHDR); 30204 } 30205