1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/dlpi.h> 33 #include <sys/stropts.h> 34 #include <sys/sysmacros.h> 35 #include <sys/strsubr.h> 36 #include <sys/strlog.h> 37 #include <sys/strsun.h> 38 #include <sys/zone.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/xti_inet.h> 42 #include <sys/ddi.h> 43 #include <sys/sunddi.h> 44 #include <sys/cmn_err.h> 45 #include <sys/debug.h> 46 #include <sys/kobj.h> 47 #include <sys/modctl.h> 48 #include <sys/atomic.h> 49 #include <sys/policy.h> 50 #include <sys/priv.h> 51 52 #include <sys/systm.h> 53 #include <sys/param.h> 54 #include <sys/kmem.h> 55 #include <sys/socket.h> 56 #include <sys/vtrace.h> 57 #include <sys/isa_defs.h> 58 #include <net/if.h> 59 #include <net/if_arp.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <net/if_dl.h> 64 65 #include <inet/common.h> 66 #include <inet/mi.h> 67 #include <inet/mib2.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/snmpcom.h> 71 #include <inet/kstatcom.h> 72 73 #include <netinet/igmp_var.h> 74 #include <netinet/ip6.h> 75 #include <netinet/icmp6.h> 76 #include <netinet/sctp.h> 77 78 #include <inet/ip.h> 79 #include <inet/ip_impl.h> 80 #include <inet/ip6.h> 81 #include <inet/ip6_asp.h> 82 #include <inet/tcp.h> 83 #include <inet/tcp_impl.h> 84 #include <inet/ip_multi.h> 85 #include <inet/ip_if.h> 86 #include <inet/ip_ire.h> 87 #include <inet/ip_rts.h> 88 #include <inet/optcom.h> 89 #include <inet/ip_ndp.h> 90 #include <inet/ip_listutils.h> 91 #include <netinet/igmp.h> 92 #include <netinet/ip_mroute.h> 93 #include <inet/ipp_common.h> 94 95 #include <net/pfkeyv2.h> 96 #include <inet/ipsec_info.h> 97 #include <inet/sadb.h> 98 #include <inet/ipsec_impl.h> 99 #include <sys/iphada.h> 100 #include <inet/tun.h> 101 #include <inet/ipdrop.h> 102 103 #include <sys/ethernet.h> 104 #include <net/if_types.h> 105 #include <sys/cpuvar.h> 
106 107 #include <ipp/ipp.h> 108 #include <ipp/ipp_impl.h> 109 #include <ipp/ipgpc/ipgpc.h> 110 111 #include <sys/multidata.h> 112 #include <sys/pattr.h> 113 114 #include <inet/ipclassifier.h> 115 #include <inet/sctp_ip.h> 116 #include <inet/udp_impl.h> 117 118 #include <sys/tsol/label.h> 119 #include <sys/tsol/tnet.h> 120 121 #include <rpc/pmap_prot.h> 122 123 /* 124 * Values for squeue switch: 125 * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain 126 * IP_SQUEUE_ENTER: squeue_enter 127 * IP_SQUEUE_FILL: squeue_fill 128 */ 129 int ip_squeue_enter = 2; 130 squeue_func_t ip_input_proc; 131 /* 132 * IP statistics. 133 */ 134 #define IP_STAT(x) (ip_statistics.x.value.ui64++) 135 #define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n)) 136 137 typedef struct ip_stat { 138 kstat_named_t ipsec_fanout_proto; 139 kstat_named_t ip_udp_fannorm; 140 kstat_named_t ip_udp_fanmb; 141 kstat_named_t ip_udp_fanothers; 142 kstat_named_t ip_udp_fast_path; 143 kstat_named_t ip_udp_slow_path; 144 kstat_named_t ip_udp_input_err; 145 kstat_named_t ip_tcppullup; 146 kstat_named_t ip_tcpoptions; 147 kstat_named_t ip_multipkttcp; 148 kstat_named_t ip_tcp_fast_path; 149 kstat_named_t ip_tcp_slow_path; 150 kstat_named_t ip_tcp_input_error; 151 kstat_named_t ip_db_ref; 152 kstat_named_t ip_notaligned1; 153 kstat_named_t ip_notaligned2; 154 kstat_named_t ip_multimblk3; 155 kstat_named_t ip_multimblk4; 156 kstat_named_t ip_ipoptions; 157 kstat_named_t ip_classify_fail; 158 kstat_named_t ip_opt; 159 kstat_named_t ip_udp_rput_local; 160 kstat_named_t ipsec_proto_ahesp; 161 kstat_named_t ip_conn_flputbq; 162 kstat_named_t ip_conn_walk_drain; 163 kstat_named_t ip_out_sw_cksum; 164 kstat_named_t ip_in_sw_cksum; 165 kstat_named_t ip_trash_ire_reclaim_calls; 166 kstat_named_t ip_trash_ire_reclaim_success; 167 kstat_named_t ip_ire_arp_timer_expired; 168 kstat_named_t ip_ire_redirect_timer_expired; 169 kstat_named_t ip_ire_pmtu_timer_expired; 170 kstat_named_t ip_input_multi_squeue; 171 
kstat_named_t ip_tcp_in_full_hw_cksum_err; 172 kstat_named_t ip_tcp_in_part_hw_cksum_err; 173 kstat_named_t ip_tcp_in_sw_cksum_err; 174 kstat_named_t ip_tcp_out_sw_cksum_bytes; 175 kstat_named_t ip_udp_in_full_hw_cksum_err; 176 kstat_named_t ip_udp_in_part_hw_cksum_err; 177 kstat_named_t ip_udp_in_sw_cksum_err; 178 kstat_named_t ip_udp_out_sw_cksum_bytes; 179 kstat_named_t ip_frag_mdt_pkt_out; 180 kstat_named_t ip_frag_mdt_discarded; 181 kstat_named_t ip_frag_mdt_allocfail; 182 kstat_named_t ip_frag_mdt_addpdescfail; 183 kstat_named_t ip_frag_mdt_allocd; 184 } ip_stat_t; 185 186 static ip_stat_t ip_statistics = { 187 { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, 188 { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, 189 { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, 190 { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, 191 { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, 192 { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, 193 { "ip_udp_input_err", KSTAT_DATA_UINT64 }, 194 { "ip_tcppullup", KSTAT_DATA_UINT64 }, 195 { "ip_tcpoptions", KSTAT_DATA_UINT64 }, 196 { "ip_multipkttcp", KSTAT_DATA_UINT64 }, 197 { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, 198 { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, 199 { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, 200 { "ip_db_ref", KSTAT_DATA_UINT64 }, 201 { "ip_notaligned1", KSTAT_DATA_UINT64 }, 202 { "ip_notaligned2", KSTAT_DATA_UINT64 }, 203 { "ip_multimblk3", KSTAT_DATA_UINT64 }, 204 { "ip_multimblk4", KSTAT_DATA_UINT64 }, 205 { "ip_ipoptions", KSTAT_DATA_UINT64 }, 206 { "ip_classify_fail", KSTAT_DATA_UINT64 }, 207 { "ip_opt", KSTAT_DATA_UINT64 }, 208 { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, 209 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, 210 { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, 211 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, 212 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, 213 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, 214 { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, 215 { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, 216 { "ip_ire_arp_timer_expired", 
KSTAT_DATA_UINT64 }, 217 { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, 218 { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, 219 { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, 220 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 221 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 222 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 223 { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 224 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, 225 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, 226 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, 227 { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 228 { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, 229 { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, 230 { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, 231 { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 232 { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, 233 }; 234 235 static kstat_t *ip_kstat; 236 237 #define TCP6 "tcp6" 238 #define TCP "tcp" 239 #define SCTP "sctp" 240 #define SCTP6 "sctp6" 241 242 major_t TCP6_MAJ; 243 major_t TCP_MAJ; 244 major_t SCTP_MAJ; 245 major_t SCTP6_MAJ; 246 247 int ip_poll_normal_ms = 100; 248 int ip_poll_normal_ticks = 0; 249 250 /* 251 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. 252 */ 253 254 struct listptr_s { 255 mblk_t *lp_head; /* pointer to the head of the list */ 256 mblk_t *lp_tail; /* pointer to the tail of the list */ 257 }; 258 259 typedef struct listptr_s listptr_t; 260 261 /* 262 * This is used by ip_snmp_get_mib2_ip_route_media and 263 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. 264 */ 265 typedef struct iproutedata_s { 266 uint_t ird_idx; 267 listptr_t ird_route; /* ipRouteEntryTable */ 268 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 269 listptr_t ird_attrs; /* ipRouteAttributeTable */ 270 } iproutedata_t; 271 272 /* 273 * Cluster specific hooks. 
These should be NULL when booted as a non-cluster 274 */ 275 276 /* 277 * Hook functions to enable cluster networking 278 * On non-clustered systems these vectors must always be NULL. 279 * 280 * Hook function to Check ip specified ip address is a shared ip address 281 * in the cluster 282 * 283 */ 284 int (*cl_inet_isclusterwide)(uint8_t protocol, 285 sa_family_t addr_family, uint8_t *laddrp) = NULL; 286 287 /* 288 * Hook function to generate cluster wide ip fragment identifier 289 */ 290 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 291 uint8_t *laddrp, uint8_t *faddrp) = NULL; 292 293 /* 294 * Synchronization notes: 295 * 296 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 297 * MT level protection given by STREAMS. IP uses a combination of its own 298 * internal serialization mechanism and standard Solaris locking techniques. 299 * The internal serialization is per phyint (no IPMP) or per IPMP group. 300 * This is used to serialize plumbing operations, IPMP operations, certain 301 * multicast operations, most set ioctls, igmp/mld timers etc. 302 * 303 * Plumbing is a long sequence of operations involving message 304 * exchanges between IP, ARP and device drivers. Many set ioctls are typically 305 * involved in plumbing operations. A natural model is to serialize these 306 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 307 * parallel without any interference. But various set ioctls on hme0 are best 308 * serialized. However if the system uses IPMP, the operations are easier if 309 * they are serialized on a per IPMP group basis since IPMP operations 310 * happen across ill's of a group. Thus the lowest common denominator is to 311 * serialize most set ioctls, multicast join/leave operations, IPMP operations 312 * igmp/mld timer operations, and processing of DLPI control messages received 313 * from drivers on a per IPMP group basis. 
If the system does not employ 314 * IPMP the serialization is on a per phyint basis. This serialization is 315 * provided by the ipsq_t and primitives operating on this. Details can 316 * be found in ip_if.c above the core primitives operating on ipsq_t. 317 * 318 * Lookups of an ipif or ill by a thread return a refheld ipif / ill. 319 * Similarly lookup of an ire by a thread also returns a refheld ire. 320 * In addition ipif's and ill's referenced by the ire are also indirectly 321 * refheld. Thus no ipif or ill can vanish nor can critical parameters like 322 * the ipif's address or netmask change as long as an ipif is refheld 323 * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the 324 * address of an ipif has to go through the ipsq_t. This ensures that only 325 * 1 such exclusive operation proceeds at any time on the ipif. It then 326 * deletes all ires associated with this ipif, and waits for all refcnts 327 * associated with this ipif to come down to zero. The address is changed 328 * only after the ipif has been quiesced. Then the ipif is brought up again. 329 * More details are described above the comment in ip_sioctl_flags. 330 * 331 * Packet processing is based mostly on IREs and is fully multi-threaded 332 * using standard Solaris MT techniques. 333 * 334 * There are explicit locks in IP to handle: 335 * - The ip_g_head list maintained by mi_open_link() and friends. 336 * 337 * - The reassembly data structures (one lock per hash bucket) 338 * 339 * - conn_lock is meant to protect conn_t fields. The fields actually 340 * protected by conn_lock are documented in the conn_t definition. 341 * 342 * - ire_lock to protect some of the fields of the ire, IRE tables 343 * (one lock per hash bucket). Refer to ip_ire.c for details. 344 * 345 * - ndp_g_lock and nce_lock for protecting NCEs. 346 * 347 * - ill_lock protects fields of the ill and ipif. Details in ip.h 348 * 349 * - ill_g_lock: This is a global reader/writer lock.
Protects the following 350 * * The AVL tree based global multi list of all ills. 351 * * The linked list of all ipifs of an ill 352 * * The <ill-ipsq> mapping 353 * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next 354 * * The illgroup list threaded by ill_group_next. 355 * * <ill-phyint> association 356 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 357 * into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion 358 * of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill 359 * will all have to hold the ill_g_lock as writer for the actual duration 360 * of the insertion/deletion/change. More details about the <ill-ipsq> mapping 361 * may be found in the IPMP section. 362 * 363 * - ill_lock: This is a per ill mutex. 364 * It protects some members of the ill and is documented below. 365 * It also protects the <ill-ipsq> mapping 366 * It also protects the illgroup list threaded by ill_group_next. 367 * It also protects the <ill-phyint> assoc. 368 * It also protects the list of ipifs hanging off the ill. 369 * 370 * - ipsq_lock: This is a per ipsq_t mutex lock. 371 * This protects all the other members of the ipsq struct except 372 * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock 373 * 374 * - illgrp_lock: This is a per ill_group mutex lock. 375 * The only thing it protects is the illgrp_ill_schednext member of ill_group 376 * which dictates which is the next ill in an ill_group that is to be chosen 377 * for sending outgoing packets, through creation of an IRE_CACHE that 378 * references this ill. 379 * 380 * - phyint_lock: This is a per phyint mutex lock. Protects just the 381 * phyint_flags 382 * 383 * - ip_g_nd_lock: This is a global reader/writer lock. 384 * Any call to nd_load to load a new parameter to the ND table must hold the 385 * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 386 * as reader. 
387 * 388 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. 389 * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the 390 * uniqueness check also done atomically. 391 * 392 * - ipsec_capab_ills_lock: This readers/writer lock protects the global 393 * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken 394 * as a writer when adding or deleting elements from these lists, and 395 * as a reader when walking these lists to send a SADB update to the 396 * IPsec capable ills. 397 * 398 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc 399 * group list linked by ill_usesrc_grp_next. It also protects the 400 * ill_usesrc_ifindex field. It is taken as a writer when a member of the 401 * group is being added or deleted. This lock is taken as a reader when 402 * walking the list/group(eg: to get the number of members in a usesrc group). 403 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next 404 * field is changing state i.e from NULL to non-NULL or vice-versa. For 405 * example, it is not necessary to take this lock in the initial portion 406 * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and 407 * ip_sioctl_flags since these operations are executed exclusively and 408 * that ensures that the "usesrc group state" cannot change. The "usesrc 409 * group state" change can happen only in the latter part of 410 * ip_sioctl_slifusesrc and in ill_delete. 411 * 412 * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations. 413 * 414 * To change the <ill-phyint> association, the ill_g_lock must be held 415 * as writer, and the ill_locks of both the v4 and v6 instance of the ill 416 * must be held. 417 * 418 * To change the <ill-ipsq> association the ill_g_lock must be held as writer 419 * and the ill_lock of the ill in question must be held.
420 * 421 * To change the <ill-illgroup> association the ill_g_lock must be held as 422 * writer and the ill_lock of the ill in question must be held. 423 * 424 * To add or delete an ipif from the list of ipifs hanging off the ill, 425 * ill_g_lock (writer) and ill_lock must be held and the thread must be 426 * a writer on the associated ipsq. 427 * 428 * To add or delete an ill to the system, the ill_g_lock must be held as 429 * writer and the thread must be a writer on the associated ipsq. 430 * 431 * To add or delete an ilm to an ill, the ill_lock must be held and the thread 432 * must be a writer on the associated ipsq. 433 * 434 * Lock hierarchy 435 * 436 * Some lock hierarchy scenarios are listed below. 437 * 438 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 439 * ill_g_lock -> illgrp_lock -> ill_lock 440 * ill_g_lock -> ill_lock(s) -> phyint_lock 441 * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock 442 * ill_g_lock -> ip_addr_avail_lock 443 * conn_lock -> irb_lock -> ill_lock -> ire_lock 444 * ill_g_lock -> ip_g_nd_lock 445 * 446 * When more than 1 ill lock is needed to be held, all ill lock addresses 447 * are sorted on address and locked starting from highest addressed lock 448 * downward. 449 * 450 * Mobile-IP scenarios 451 * 452 * irb_lock -> ill_lock -> ire_mrtun_lock 453 * irb_lock -> ill_lock -> ire_srcif_table_lock 454 * 455 * IPsec scenarios 456 * 457 * ipsa_lock -> ill_g_lock -> ill_lock 458 * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock 459 * ipsec_capab_ills_lock -> ipsa_lock 460 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 461 * 462 * Trusted Solaris scenarios 463 * 464 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 465 * igsa_lock -> gcdb_lock 466 * gcgrp_rwlock -> ire_lock 467 * gcgrp_rwlock -> gcdb_lock 468 * 469 * IPSEC notes : 470 * 471 * IP interacts with the IPSEC code (AH/ESP) by tagging a M_CTL message 472 * in front of the actual packet.
For outbound datagrams, the M_CTL 473 * contains an ipsec_out_t (defined in ipsec_info.h), which has the 474 * information used by the IPSEC code for applying the right level of 475 * protection. The information initialized by IP in the ipsec_out_t 476 * is determined by the per-socket policy or global policy in the system. 477 * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in 478 * ipsec_info.h) which starts out with nothing in it. It gets filled 479 * with the right information if it goes through the AH/ESP code, which 480 * happens if the incoming packet is secure. The information initialized 481 * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether 482 * the policy requirements needed by per-socket policy or global policy 483 * is met or not. 484 * 485 * If there is both per-socket policy (set using setsockopt) and there 486 * is also global policy match for the 5 tuples of the socket, 487 * ipsec_override_policy() makes the decision of which one to use. 488 * 489 * For fully connected sockets i.e dst, src [addr, port] is known, 490 * conn_policy_cached is set indicating that policy has been cached. 491 * conn_in_enforce_policy may or may not be set depending on whether 492 * there is a global policy match or per-socket policy match. 493 * Policy inheriting happens in ip_bind during the ipa_conn_t bind. 494 * Once the right policy is set on the conn_t, policy cannot change for 495 * this socket. This makes life simpler for TCP (UDP ?) where 496 * re-transmissions go out with the same policy. For symmetry, policy 497 * is cached for fully connected UDP sockets also. Thus if policy is cached, 498 * it also implies that policy is latched i.e policy cannot change 499 * on these sockets. As we have the right policy on the conn, we don't 500 * have to lookup global policy for every outbound and inbound datagram 501 * and thus serving as an optimization.
Note that a global policy change 502 * does not affect fully connected sockets if they have policy. If fully 503 * connected sockets did not have any policy associated with it, global 504 * policy change may affect them. 505 * 506 * IP Flow control notes: 507 * 508 * Non-TCP streams are flow controlled by IP. On the send side, if the packet 509 * cannot be sent down to the driver by IP, because of a canput failure, IP 510 * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. 511 * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained 512 * when the flowcontrol condition subsides. Ultimately STREAMS backenables the 513 * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the 514 * first conn in the list of conn's to be drained. ip_wsrv on this conn drains 515 * the queued messages, and removes the conn from the drain list, if all 516 * messages were drained. It also qenables the next conn in the drain list to 517 * continue the drain process. 518 * 519 * In reality the drain list is not a single list, but a configurable number 520 * of lists. The ip_wsrv on the IP module, qenables the first conn in each 521 * list. If the ip_wsrv of the next qenabled conn does not run, because the 522 * stream closes, ip_close takes responsibility to qenable the next conn in 523 * the drain list. The directly called ip_wput path always does a putq, if 524 * it cannot putnext. Thus synchronization problems are handled between 525 * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only 526 * functions that manipulate this drain list. Furthermore conn_drain_insert 527 * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv 528 * running on a queue at any time. conn_drain_tail can be simultaneously called 529 * from both ip_wsrv and ip_close. 530 * 531 * IPQOS notes: 532 * 533 * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 534 * and IPQoS modules. 
IPPF includes hooks in IP at different control points 535 * (callout positions) which direct packets to IPQoS modules for policy 536 * processing. Policies, if present, are global. 537 * 538 * The callout positions are located in the following paths: 539 * o local_in (packets destined for this host) 540 * o local_out (packets originating from this host) 541 * o fwd_in (packets forwarded by this m/c - inbound) 542 * o fwd_out (packets forwarded by this m/c - outbound) 543 * Hooks at these callout points can be enabled/disabled using the ndd variable 544 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). 545 * By default all the callout positions are enabled. 546 * 547 * Outbound (local_out) 548 * Hooks are placed in ip_wput_ire and ipsec_out_process. 549 * 550 * Inbound (local_in) 551 * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and 552 * TCP and UDP fanout routines. 553 * 554 * Forwarding (in and out) 555 * Hooks are placed in ip_rput_forward and ip_mrtun_forward. 556 * 557 * IP Policy Framework processing (IPPF processing) 558 * Policy processing for a packet is initiated by ip_process, which ascertains 559 * that the classifier (ipgpc) is loaded and configured, failing which the 560 * packet resumes normal processing in IP. If the classifier is present, the 561 * packet is acted upon by one or more IPQoS modules (action instances), per 562 * filters configured in ipgpc and resumes normal IP processing thereafter. 563 * An action instance can drop a packet in course of its processing. 564 * 565 * A boolean variable, ip_policy, is used in all the fanout routines that can 566 * invoke ip_process for a packet. This variable indicates if the packet should 567 * be sent for policy processing. The variable is set to B_TRUE by default, 568 * i.e. when the routines are invoked in the normal ip processing path for a 569 * packet.
The two exceptions being ip_wput_local and icmp_inbound_error_fanout; 570 * ip_policy is set to B_FALSE for all the routines called in these two 571 * functions because, in the former case, we don't process loopback traffic 572 * currently while in the latter, the packets have already been processed in 573 * icmp_inbound. 574 * 575 * Zones notes: 576 * 577 * The partitioning rules for networking are as follows: 578 * 1) Packets coming from a zone must have a source address belonging to that 579 * zone. 580 * 2) Packets coming from a zone can only be sent on a physical interface on 581 * which the zone has an IP address. 582 * 3) Between two zones on the same machine, packet delivery is only allowed if 583 * there's a matching route for the destination and zone in the forwarding 584 * table. 585 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 586 * different zones can bind to the same port with the wildcard address 587 * (INADDR_ANY). 588 * 589 * The granularity of interface partitioning is at the logical interface level. 590 * Therefore, every zone has its own IP addresses, and incoming packets can be 591 * attributed to a zone unambiguously. A logical interface is placed into a zone 592 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 593 * structure. Rule (1) is implemented by modifying the source address selection 594 * algorithm so that the list of eligible addresses is filtered based on the 595 * sending process zone. 596 * 597 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 598 * across all zones, depending on their type. 
Here is the break-up: 599 * 600 * IRE type Shared/exclusive 601 * -------- ---------------- 602 * IRE_BROADCAST Exclusive 603 * IRE_DEFAULT (default routes) Shared (*) 604 * IRE_LOCAL Exclusive 605 * IRE_LOOPBACK Exclusive 606 * IRE_PREFIX (net routes) Shared (*) 607 * IRE_CACHE Exclusive 608 * IRE_IF_NORESOLVER (interface routes) Exclusive 609 * IRE_IF_RESOLVER (interface routes) Exclusive 610 * IRE_HOST (host routes) Shared (*) 611 * 612 * (*) A zone can only use a default or off-subnet route if the gateway is 613 * directly reachable from the zone, that is, if the gateway's address matches 614 * one of the zone's logical interfaces. 615 * 616 * Multiple zones can share a common broadcast address; typically all zones 617 * share the 255.255.255.255 address. Incoming as well as locally originated 618 * broadcast packets must be dispatched to all the zones on the broadcast 619 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial 620 * since some zones may not be on the 10.16.72/24 network. To handle this, each 621 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 622 * sent to every zone that has an IRE_BROADCAST entry for the destination 623 * address on the input ill, see conn_wantpacket(). 624 * 625 * Applications in different zones can join the same multicast group address. 626 * For IPv4, group memberships are per-logical interface, so they're already 627 * inherently part of a zone. For IPv6, group memberships are per-physical 628 * interface, so we distinguish IPv6 group memberships based on group address, 629 * interface and zoneid. In both cases, received multicast packets are sent to 630 * every zone for which a group membership entry exists. On IPv6 we need to 631 * check that the target zone still has an address on the receiving physical 632 * interface; it could have been removed since the application issued the 633 * IPV6_JOIN_GROUP. 634 */ 635 636 /* 637 * Squeue Fanout flags: 638 * 0: No fanout. 
639 * 1: Fanout across all squeues 640 */ 641 boolean_t ip_squeue_fanout = 0; 642 643 /* 644 * Maximum dups allowed per packet. 645 */ 646 uint_t ip_max_frag_dups = 10; 647 648 #define IS_SIMPLE_IPH(ipha) \ 649 ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) 650 651 /* RFC1122 Conformance */ 652 #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER 653 654 #define ILL_MAX_NAMELEN LIFNAMSIZ 655 656 /* Leave room for ip_newroute to tack on the src and target addresses */ 657 #define OK_RESOLVER_MP(mp) \ 658 ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) 659 660 static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); 661 662 static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t); 663 static void ip_ipsec_out_prepend(mblk_t *, mblk_t *, ill_t *); 664 665 static void icmp_frag_needed(queue_t *, mblk_t *, int); 666 static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, 667 uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); 668 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *); 669 static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, 670 icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, 671 ill_t *, zoneid_t); 672 static void icmp_options_update(ipha_t *); 673 static void icmp_param_problem(queue_t *, mblk_t *, uint8_t); 674 static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t); 675 static mblk_t *icmp_pkt_err_ok(mblk_t *); 676 static void icmp_redirect(mblk_t *); 677 static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t); 678 679 static void ip_arp_news(queue_t *, mblk_t *); 680 static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *); 681 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); 682 char *ip_dot_addr(ipaddr_t, char *); 683 mblk_t *ip_carve_mp(mblk_t **, ssize_t); 684 int ip_close(queue_t *, int); 685 static char *ip_dot_saddr(uchar_t *, char *); 686 static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 687 boolean_t, boolean_t, 
ill_t *, zoneid_t); 688 static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, 689 boolean_t, boolean_t, zoneid_t); 690 static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, 691 boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); 692 static void ip_lrput(queue_t *, mblk_t *); 693 ipaddr_t ip_massage_options(ipha_t *); 694 static void ip_mrtun_forward(ire_t *, ill_t *, mblk_t *); 695 ipaddr_t ip_net_mask(ipaddr_t); 696 void ip_newroute(queue_t *, mblk_t *, ipaddr_t, ill_t *, conn_t *); 697 static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, 698 conn_t *, uint32_t); 699 static int ip_hdr_complete(ipha_t *, zoneid_t); 700 char *ip_nv_lookup(nv_t *, int); 701 static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); 702 static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); 703 static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); 704 static boolean_t ip_param_register(ipparam_t *, size_t, ipndp_t *, 705 size_t); 706 static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 707 void ip_rput(queue_t *, mblk_t *); 708 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 709 void *dummy_arg); 710 void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); 711 static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *); 712 static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, 713 ire_t *); 714 static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *); 715 static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, 716 uint16_t *); 717 int ip_snmp_get(queue_t *, mblk_t *); 718 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *); 719 static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *); 720 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *); 721 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *); 722 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *); 723 
static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *); 724 static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *); 725 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *); 726 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *); 727 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *); 728 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *); 729 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *); 730 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *); 731 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *); 732 static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *); 733 static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *); 734 static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); 735 static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); 736 static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); 737 int ip_snmp_set(queue_t *, int, int, uchar_t *, int); 738 static boolean_t ip_source_routed(ipha_t *); 739 static boolean_t ip_source_route_included(ipha_t *); 740 741 static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t); 742 static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int); 743 static void ip_wput_local_options(ipha_t *); 744 static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, 745 zoneid_t); 746 747 static void conn_drain_init(void); 748 static void conn_drain_fini(void); 749 static void conn_drain_tail(conn_t *connp, boolean_t closing); 750 751 static void conn_walk_drain(void); 752 static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, 753 zoneid_t); 754 755 static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, 756 zoneid_t); 757 static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 758 void *dummy_arg); 759 760 static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 761 762 static int ip_multirt_apply_membership(int 
(*fn)(conn_t *, boolean_t, 763 ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, 764 conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); 765 static void ip_multirt_bad_mtu(ire_t *, uint32_t); 766 767 static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); 768 static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, 769 caddr_t, cred_t *); 770 extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, 771 caddr_t cp, cred_t *cr); 772 extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, 773 cred_t *); 774 static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 775 caddr_t cp, cred_t *cr); 776 static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, 777 cred_t *); 778 static squeue_func_t ip_squeue_switch(int); 779 780 static void ip_kstat_init(void); 781 static void ip_kstat_fini(void); 782 static int ip_kstat_update(kstat_t *kp, int rw); 783 static void icmp_kstat_init(void); 784 static void icmp_kstat_fini(void); 785 static int icmp_kstat_update(kstat_t *kp, int rw); 786 787 static int ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *); 788 789 static boolean_t ip_no_forward(ipha_t *, ill_t *); 790 static boolean_t ip_loopback_src_or_dst(ipha_t *, ill_t *); 791 792 static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, 793 ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); 794 795 void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, size_t); 796 797 timeout_id_t ip_ire_expire_id; /* IRE expiration timer. */ 798 static clock_t ip_ire_arp_time_elapsed; /* Time since IRE cache last flushed */ 799 static clock_t ip_ire_rd_time_elapsed; /* ... redirect IREs last flushed */ 800 static clock_t ip_ire_pmtu_time_elapsed; /* Time since path mtu increase */ 801 802 uint_t ip_ire_default_count; /* Number of IPv4 IRE_DEFAULT entries. 
*/ 803 uint_t ip_ire_default_index; /* Walking index used to mod in */ 804 805 ipaddr_t ip_g_all_ones = IP_HOST_MASK; 806 clock_t icmp_pkt_err_last = 0; /* Time since last icmp_pkt_err */ 807 uint_t icmp_pkt_err_sent = 0; /* Number of packets sent in burst */ 808 809 /* How long, in seconds, we allow frags to hang around. */ 810 #define IP_FRAG_TIMEOUT 60 811 812 time_t ip_g_frag_timeout = IP_FRAG_TIMEOUT; 813 clock_t ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; 814 815 /* 816 * Threshold which determines whether MDT should be used when 817 * generating IP fragments; payload size must be greater than 818 * this threshold for MDT to take place. 819 */ 820 #define IP_WPUT_FRAG_MDT_MIN 32768 821 822 int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; 823 824 /* Protected by ip_mi_lock */ 825 static void *ip_g_head; /* Instance Data List Head */ 826 kmutex_t ip_mi_lock; /* Lock for list of instances */ 827 828 /* Only modified during _init and _fini thus no locking is needed. */ 829 caddr_t ip_g_nd; /* Named Dispatch List Head */ 830 831 832 static long ip_rput_pullups; 833 int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ 834 835 vmem_t *ip_minor_arena; 836 837 /* 838 * MIB-2 stuff for SNMP (both IP and ICMP) 839 */ 840 mib2_ip_t ip_mib; 841 mib2_icmp_t icmp_mib; 842 843 #ifdef DEBUG 844 uint32_t ipsechw_debug = 0; 845 #endif 846 847 kstat_t *ip_mibkp; /* kstat exporting ip_mib data */ 848 kstat_t *icmp_mibkp; /* kstat exporting icmp_mib data */ 849 850 uint_t loopback_packets = 0; 851 852 /* 853 * Multirouting/CGTP stuff 854 */ 855 cgtp_filter_ops_t *ip_cgtp_filter_ops; /* CGTP hooks */ 856 int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ 857 boolean_t ip_cgtp_filter; /* Enable/disable CGTP hooks */ 858 /* Interval (in ms) between consecutive 'bad MTU' warnings */ 859 hrtime_t ip_multirt_log_interval = 1000; 860 /* Time since last warning issued. 
*/ 861 static hrtime_t multirt_bad_mtu_last_time = 0; 862 863 kmutex_t ip_trash_timer_lock; 864 krwlock_t ip_g_nd_lock; 865 866 /* 867 * XXX following really should only be in a header. Would need more 868 * header and .c clean up first. 869 */ 870 extern optdb_obj_t ip_opt_obj; 871 872 ulong_t ip_squeue_enter_unbound = 0; 873 874 /* 875 * Named Dispatch Parameter Table. 876 * All of these are alterable, within the min/max values given, at run time. 877 */ 878 static ipparam_t lcl_param_arr[] = { 879 /* min max value name */ 880 { 0, 1, 0, "ip_respond_to_address_mask_broadcast"}, 881 { 0, 1, 1, "ip_respond_to_echo_broadcast"}, 882 { 0, 1, 1, "ip_respond_to_echo_multicast"}, 883 { 0, 1, 0, "ip_respond_to_timestamp"}, 884 { 0, 1, 0, "ip_respond_to_timestamp_broadcast"}, 885 { 0, 1, 1, "ip_send_redirects"}, 886 { 0, 1, 0, "ip_forward_directed_broadcasts"}, 887 { 0, 10, 0, "ip_debug"}, 888 { 0, 10, 0, "ip_mrtdebug"}, 889 { 5000, 999999999, 60000, "ip_ire_timer_interval" }, 890 { 60000, 999999999, 1200000, "ip_ire_arp_interval" }, 891 { 60000, 999999999, 60000, "ip_ire_redirect_interval" }, 892 { 1, 255, 255, "ip_def_ttl" }, 893 { 0, 1, 0, "ip_forward_src_routed"}, 894 { 0, 256, 32, "ip_wroff_extra" }, 895 { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" }, 896 { 8, 65536, 64, "ip_icmp_return_data_bytes" }, 897 { 0, 1, 1, "ip_path_mtu_discovery" }, 898 { 0, 240, 30, "ip_ignore_delete_time" }, 899 { 0, 1, 0, "ip_ignore_redirect" }, 900 { 0, 1, 1, "ip_output_queue" }, 901 { 1, 254, 1, "ip_broadcast_ttl" }, 902 { 0, 99999, 100, "ip_icmp_err_interval" }, 903 { 1, 99999, 10, "ip_icmp_err_burst" }, 904 { 0, 999999999, 1000000, "ip_reass_queue_bytes" }, 905 { 0, 1, 0, "ip_strict_dst_multihoming" }, 906 { 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"}, 907 { 0, 1, 0, "ipsec_override_persocket_policy" }, 908 { 0, 1, 1, "icmp_accept_clear_messages" }, 909 { 0, 1, 1, "igmp_accept_clear_messages" }, 910 { 2, 999999999, ND_DELAY_FIRST_PROBE_TIME, 911 
"ip_ndp_delay_first_probe_time"}, 912 { 1, 999999999, ND_MAX_UNICAST_SOLICIT, 913 "ip_ndp_max_unicast_solicit"}, 914 { 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" }, 915 { 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" }, 916 { 0, 1, 0, "ip6_forward_src_routed"}, 917 { 0, 1, 1, "ip6_respond_to_echo_multicast"}, 918 { 0, 1, 1, "ip6_send_redirects"}, 919 { 0, 1, 0, "ip6_ignore_redirect" }, 920 { 0, 1, 0, "ip6_strict_dst_multihoming" }, 921 922 { 1, 8, 3, "ip_ire_reclaim_fraction" }, 923 924 { 0, 999999, 1000, "ipsec_policy_log_interval" }, 925 926 { 0, 1, 1, "pim_accept_clear_messages" }, 927 { 1000, 20000, 2000, "ip_ndp_unsolicit_interval" }, 928 { 1, 20, 3, "ip_ndp_unsolicit_count" }, 929 { 0, 1, 1, "ip6_ignore_home_address_opt" }, 930 { 0, 15, 0, "ip_policy_mask" }, 931 { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, 932 { 0, 255, 1, "ip_multirt_ttl" }, 933 { 0, 1, 1, "ip_multidata_outbound" }, 934 #ifdef DEBUG 935 { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, 936 #endif 937 }; 938 939 ipparam_t *ip_param_arr = lcl_param_arr; 940 941 /* Extended NDP table */ 942 static ipndp_t lcl_ndp_arr[] = { 943 /* getf setf data name */ 944 { ip_param_generic_get, ip_forward_set, (caddr_t)&ip_g_forward, 945 "ip_forwarding" }, 946 { ip_param_generic_get, ip_forward_set, (caddr_t)&ipv6_forward, 947 "ip6_forwarding" }, 948 { ip_ill_report, NULL, NULL, 949 "ip_ill_status" }, 950 { ip_ipif_report, NULL, NULL, 951 "ip_ipif_status" }, 952 { ip_ire_report, NULL, NULL, 953 "ipv4_ire_status" }, 954 { ip_ire_report_mrtun, NULL, NULL, 955 "ipv4_mrtun_ire_status" }, 956 { ip_ire_report_srcif, NULL, NULL, 957 "ipv4_srcif_ire_status" }, 958 { ip_ire_report_v6, NULL, NULL, 959 "ipv6_ire_status" }, 960 { ip_conn_report, NULL, NULL, 961 "ip_conn_status" }, 962 { nd_get_long, nd_set_long, (caddr_t)&ip_rput_pullups, 963 "ip_rput_pullups" }, 964 { ndp_report, NULL, NULL, 965 "ip_ndp_cache_report" }, 966 { ip_srcid_report, NULL, NULL, 967 "ip_srcid_status" }, 968 { ip_param_generic_get, 
ip_squeue_profile_set, 969 (caddr_t)&ip_squeue_profile, "ip_squeue_profile" }, 970 { ip_param_generic_get, ip_squeue_bind_set, 971 (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, 972 { ip_param_generic_get, ip_input_proc_set, 973 (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, 974 { ip_param_generic_get, ip_int_set, 975 (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, 976 { ip_cgtp_filter_get, ip_cgtp_filter_set, (caddr_t)&ip_cgtp_filter, 977 "ip_cgtp_filter" }, 978 { ip_param_generic_get, ip_int_set, 979 (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" } 980 }; 981 982 /* 983 * ip_g_forward controls IP forwarding. It takes two values: 984 * 0: IP_FORWARD_NEVER Don't forward packets ever. 985 * 1: IP_FORWARD_ALWAYS Forward packets for elsewhere. 986 * 987 * RFC1122 says there must be a configuration switch to control forwarding, 988 * but that the default MUST be to not forward packets ever. Implicit 989 * control based on configuration of multiple interfaces MUST NOT be 990 * implemented (Section 3.1). SunOS 4.1 did provide the "automatic" capability 991 * and, in fact, it was the default. That capability is now provided in the 992 * /etc/rc2.d/S69inet script. 993 */ 994 int ip_g_forward = IP_FORWARD_DEFAULT; 995 996 /* It also has an IPv6 counterpart. */ 997 998 int ipv6_forward = IP_FORWARD_DEFAULT; 999 1000 /* Following line is external, and in ip.h. Normally marked with * *. 
*/ 1001 #define ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value 1002 #define ip_g_resp_to_echo_bcast ip_param_arr[1].ip_param_value 1003 #define ip_g_resp_to_echo_mcast ip_param_arr[2].ip_param_value 1004 #define ip_g_resp_to_timestamp ip_param_arr[3].ip_param_value 1005 #define ip_g_resp_to_timestamp_bcast ip_param_arr[4].ip_param_value 1006 #define ip_g_send_redirects ip_param_arr[5].ip_param_value 1007 #define ip_g_forward_directed_bcast ip_param_arr[6].ip_param_value 1008 #define ip_debug ip_param_arr[7].ip_param_value /* */ 1009 #define ip_mrtdebug ip_param_arr[8].ip_param_value /* */ 1010 #define ip_timer_interval ip_param_arr[9].ip_param_value /* */ 1011 #define ip_ire_arp_interval ip_param_arr[10].ip_param_value /* */ 1012 #define ip_ire_redir_interval ip_param_arr[11].ip_param_value 1013 #define ip_def_ttl ip_param_arr[12].ip_param_value 1014 #define ip_forward_src_routed ip_param_arr[13].ip_param_value 1015 #define ip_wroff_extra ip_param_arr[14].ip_param_value 1016 #define ip_ire_pathmtu_interval ip_param_arr[15].ip_param_value 1017 #define ip_icmp_return ip_param_arr[16].ip_param_value 1018 #define ip_path_mtu_discovery ip_param_arr[17].ip_param_value /* */ 1019 #define ip_ignore_delete_time ip_param_arr[18].ip_param_value /* */ 1020 #define ip_ignore_redirect ip_param_arr[19].ip_param_value 1021 #define ip_output_queue ip_param_arr[20].ip_param_value 1022 #define ip_broadcast_ttl ip_param_arr[21].ip_param_value 1023 #define ip_icmp_err_interval ip_param_arr[22].ip_param_value 1024 #define ip_icmp_err_burst ip_param_arr[23].ip_param_value 1025 #define ip_reass_queue_bytes ip_param_arr[24].ip_param_value 1026 #define ip_strict_dst_multihoming ip_param_arr[25].ip_param_value 1027 #define ip_addrs_per_if ip_param_arr[26].ip_param_value 1028 #define ipsec_override_persocket_policy ip_param_arr[27].ip_param_value /* */ 1029 #define icmp_accept_clear_messages ip_param_arr[28].ip_param_value 1030 #define igmp_accept_clear_messages 
ip_param_arr[29].ip_param_value 1031 1032 /* IPv6 configuration knobs */ 1033 #define delay_first_probe_time ip_param_arr[30].ip_param_value 1034 #define max_unicast_solicit ip_param_arr[31].ip_param_value 1035 #define ipv6_def_hops ip_param_arr[32].ip_param_value 1036 #define ipv6_icmp_return ip_param_arr[33].ip_param_value 1037 #define ipv6_forward_src_routed ip_param_arr[34].ip_param_value 1038 #define ipv6_resp_echo_mcast ip_param_arr[35].ip_param_value 1039 #define ipv6_send_redirects ip_param_arr[36].ip_param_value 1040 #define ipv6_ignore_redirect ip_param_arr[37].ip_param_value 1041 #define ipv6_strict_dst_multihoming ip_param_arr[38].ip_param_value 1042 #define ip_ire_reclaim_fraction ip_param_arr[39].ip_param_value 1043 #define ipsec_policy_log_interval ip_param_arr[40].ip_param_value 1044 #define pim_accept_clear_messages ip_param_arr[41].ip_param_value 1045 #define ip_ndp_unsolicit_interval ip_param_arr[42].ip_param_value 1046 #define ip_ndp_unsolicit_count ip_param_arr[43].ip_param_value 1047 #define ipv6_ignore_home_address_opt ip_param_arr[44].ip_param_value 1048 #define ip_policy_mask ip_param_arr[45].ip_param_value 1049 #define ip_multirt_resolution_interval ip_param_arr[46].ip_param_value 1050 #define ip_multirt_ttl ip_param_arr[47].ip_param_value 1051 #define ip_multidata_outbound ip_param_arr[48].ip_param_value 1052 #ifdef DEBUG 1053 #define ipv6_drop_inbound_icmpv6 ip_param_arr[49].ip_param_value 1054 #else 1055 #define ipv6_drop_inbound_icmpv6 0 1056 #endif 1057 1058 1059 /* 1060 * Table of IP ioctls encoding the various properties of the ioctl and 1061 * indexed based on the last byte of the ioctl command. Occasionally there 1062 * is a clash, and there is more than 1 ioctl with the same last byte. 1063 * In such a case 1 ioctl is encoded in the ndx table and the remaining 1064 * ioctls are encoded in the misc table. 
An entry in the ndx table is 1065 * retrieved by indexing on the last byte of the ioctl command and comparing 1066 * the ioctl command with the value in the ndx table. In the event of a 1067 * mismatch the misc table is then searched sequentially for the desired 1068 * ioctl command. 1069 * 1070 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func> 1071 */ 1072 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { 1073 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1074 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1075 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1076 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1077 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1078 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1079 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1080 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1081 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1082 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1083 1084 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV, 1085 MISC_CMD, ip_siocaddrt, NULL }, 1086 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV, 1087 MISC_CMD, ip_siocdelrt, NULL }, 1088 1089 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1090 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 1091 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1092 IF_CMD, ip_sioctl_get_addr, NULL }, 1093 1094 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1095 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 1096 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), 1097 IPI_GET_CMD | IPI_REPL, 1098 IF_CMD, ip_sioctl_get_dstaddr, NULL }, 1099 1100 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), 1101 IPI_PRIV | IPI_WR | IPI_REPL, 1102 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 1103 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), 1104 IPI_MODOK | IPI_GET_CMD | IPI_REPL, 1105 IF_CMD, ip_sioctl_get_flags, NULL }, 1106 1107 /* 018 
*/ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1108 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1109 1110 /* copyin size cannot be coded for SIOCGIFCONF */ 1111 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL, 1112 MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1113 1114 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1115 IF_CMD, ip_sioctl_mtu, NULL }, 1116 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1117 IF_CMD, ip_sioctl_get_mtu, NULL }, 1118 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), 1119 IPI_GET_CMD | IPI_REPL, 1120 IF_CMD, ip_sioctl_get_brdaddr, NULL }, 1121 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1122 IF_CMD, ip_sioctl_brdaddr, NULL }, 1123 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), 1124 IPI_GET_CMD | IPI_REPL, 1125 IF_CMD, ip_sioctl_get_netmask, NULL }, 1126 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1127 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1128 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), 1129 IPI_GET_CMD | IPI_REPL, 1130 IF_CMD, ip_sioctl_get_metric, NULL }, 1131 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, 1132 IF_CMD, ip_sioctl_metric, NULL }, 1133 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1134 1135 /* See 166-168 below for extended SIOC*XARP ioctls */ 1136 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV, 1137 MISC_CMD, ip_sioctl_arp, NULL }, 1138 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL, 1139 MISC_CMD, ip_sioctl_arp, NULL }, 1140 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV, 1141 MISC_CMD, ip_sioctl_arp, NULL }, 1142 1143 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1144 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1145 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1146 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1147 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1148 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1149 /* 039 
*/ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1150 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1151 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1152 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1153 /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1154 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1155 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1156 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1157 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1158 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1159 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1160 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1161 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1162 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1163 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1164 1165 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK, 1166 MISC_CMD, if_unitsel, if_unitsel_restart }, 1167 1168 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1169 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1170 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1171 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1172 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1173 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1174 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1175 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1176 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1177 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1178 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1179 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1180 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1181 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1182 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1183 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1184 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1185 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1186 1187 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq), 1188 IPI_PRIV | IPI_WR | IPI_MODOK, 
1189 IF_CMD, ip_sioctl_sifname, NULL }, 1190 1191 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1192 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1193 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1194 /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1195 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1196 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1197 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1198 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1199 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1200 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1201 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1202 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1203 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1204 1205 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL, 1206 MISC_CMD, ip_sioctl_get_ifnum, NULL }, 1207 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1208 IF_CMD, ip_sioctl_get_muxid, NULL }, 1209 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), 1210 IPI_PRIV | IPI_WR | IPI_REPL, 1211 IF_CMD, ip_sioctl_muxid, NULL }, 1212 1213 /* Both if and lif variants share same func */ 1214 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL, 1215 IF_CMD, ip_sioctl_get_lifindex, NULL }, 1216 /* Both if and lif variants share same func */ 1217 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), 1218 IPI_PRIV | IPI_WR | IPI_REPL, 1219 IF_CMD, ip_sioctl_slifindex, NULL }, 1220 1221 /* copyin size cannot be coded for SIOCGIFCONF */ 1222 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL, 1223 MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1224 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1225 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1226 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1227 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1228 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1229 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1230 /* 099 */ { IPI_DONTCARE, 0, 0, 0, 
NULL, NULL }, 1231 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1232 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1233 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1234 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1235 /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1236 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1237 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1238 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1239 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1240 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1241 1242 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), 1243 IPI_PRIV | IPI_WR | IPI_REPL, 1244 LIF_CMD, ip_sioctl_removeif, 1245 ip_sioctl_removeif_restart }, 1246 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), 1247 IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL, 1248 LIF_CMD, ip_sioctl_addif, NULL }, 1249 #define SIOCLIFADDR_NDX 112 1250 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1251 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 1252 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), 1253 IPI_GET_CMD | IPI_REPL, 1254 LIF_CMD, ip_sioctl_get_addr, NULL }, 1255 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1256 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 1257 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), 1258 IPI_GET_CMD | IPI_REPL, 1259 LIF_CMD, ip_sioctl_get_dstaddr, NULL }, 1260 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), 1261 IPI_PRIV | IPI_WR | IPI_REPL, 1262 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 1263 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), 1264 IPI_GET_CMD | IPI_MODOK | IPI_REPL, 1265 LIF_CMD, ip_sioctl_get_flags, NULL }, 1266 1267 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1268 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1269 1270 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL, 1271 ip_sioctl_get_lifconf, NULL }, 1272 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV 
| IPI_WR, 1273 LIF_CMD, ip_sioctl_mtu, NULL }, 1274 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL, 1275 LIF_CMD, ip_sioctl_get_mtu, NULL }, 1276 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), 1277 IPI_GET_CMD | IPI_REPL, 1278 LIF_CMD, ip_sioctl_get_brdaddr, NULL }, 1279 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1280 LIF_CMD, ip_sioctl_brdaddr, NULL }, 1281 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), 1282 IPI_GET_CMD | IPI_REPL, 1283 LIF_CMD, ip_sioctl_get_netmask, NULL }, 1284 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1285 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1286 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), 1287 IPI_GET_CMD | IPI_REPL, 1288 LIF_CMD, ip_sioctl_get_metric, NULL }, 1289 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1290 LIF_CMD, ip_sioctl_metric, NULL }, 1291 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), 1292 IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL, 1293 LIF_CMD, ip_sioctl_slifname, 1294 ip_sioctl_slifname_restart }, 1295 1296 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL, 1297 MISC_CMD, ip_sioctl_get_lifnum, NULL }, 1298 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), 1299 IPI_GET_CMD | IPI_REPL, 1300 LIF_CMD, ip_sioctl_get_muxid, NULL }, 1301 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), 1302 IPI_PRIV | IPI_WR | IPI_REPL, 1303 LIF_CMD, ip_sioctl_muxid, NULL }, 1304 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), 1305 IPI_GET_CMD | IPI_REPL, 1306 LIF_CMD, ip_sioctl_get_lifindex, 0 }, 1307 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), 1308 IPI_PRIV | IPI_WR | IPI_REPL, 1309 LIF_CMD, ip_sioctl_slifindex, 0 }, 1310 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1311 LIF_CMD, ip_sioctl_token, NULL }, 1312 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), 1313 IPI_GET_CMD | IPI_REPL, 1314 LIF_CMD, ip_sioctl_get_token, NULL }, 1315 /* 
137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1316 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, 1317 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), 1318 IPI_GET_CMD | IPI_REPL, 1319 LIF_CMD, ip_sioctl_get_subnet, NULL }, 1320 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1321 LIF_CMD, ip_sioctl_lnkinfo, NULL }, 1322 1323 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), 1324 IPI_GET_CMD | IPI_REPL, 1325 LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, 1326 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, 1327 LIF_CMD, ip_siocdelndp_v6, NULL }, 1328 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, 1329 LIF_CMD, ip_siocqueryndp_v6, NULL }, 1330 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV, 1331 LIF_CMD, ip_siocsetndp_v6, NULL }, 1332 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1333 MISC_CMD, ip_sioctl_tmyaddr, NULL }, 1334 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1335 MISC_CMD, ip_sioctl_tonlink, NULL }, 1336 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, 1337 MISC_CMD, ip_sioctl_tmysite, NULL }, 1338 /* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL, 1339 TUN_CMD, ip_sioctl_tunparam, NULL }, 1340 /* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req), 1341 IPI_PRIV | IPI_WR, 1342 TUN_CMD, ip_sioctl_tunparam, NULL }, 1343 1344 /* IPSECioctls handled in ip_sioctl_copyin_setup itself */ 1345 /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1346 /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1347 /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1348 /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1349 1350 /* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq), 1351 IPI_PRIV | IPI_WR | IPI_REPL, 1352 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1353 /* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq), 1354 IPI_PRIV | IPI_WR | IPI_REPL, 
1355 LIF_CMD, ip_sioctl_move, ip_sioctl_move }, 1356 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), 1357 IPI_PRIV | IPI_WR, 1358 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 1359 /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), 1360 IPI_GET_CMD | IPI_REPL, 1361 LIF_CMD, ip_sioctl_get_groupname, NULL }, 1362 /* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq), 1363 IPI_GET_CMD | IPI_REPL, 1364 LIF_CMD, ip_sioctl_get_oindex, NULL }, 1365 1366 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ 1367 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1368 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1369 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1370 1371 /* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1372 LIF_CMD, ip_sioctl_slifoindex, NULL }, 1373 1374 /* These are handled in ip_sioctl_copyin_setup itself */ 1375 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, 1376 MISC_CMD, NULL, NULL }, 1377 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, 1378 MISC_CMD, NULL, NULL }, 1379 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, 1380 1381 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL, 1382 ip_sioctl_get_lifconf, NULL }, 1383 1384 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV, 1385 MISC_CMD, ip_sioctl_xarp, NULL }, 1386 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL, 1387 MISC_CMD, ip_sioctl_xarp, NULL }, 1388 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV, 1389 MISC_CMD, ip_sioctl_xarp, NULL }, 1390 1391 /* SIOCPOPSOCKFS is not handled by IP */ 1392 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, 1393 1394 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), 1395 IPI_GET_CMD | IPI_REPL, 1396 LIF_CMD, ip_sioctl_get_lifzone, NULL }, 1397 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), 1398 IPI_PRIV | IPI_WR | IPI_REPL, 1399 LIF_CMD, ip_sioctl_slifzone, 1400 ip_sioctl_slifzone_restart }, 1401 /* 172-174 
are SCTP ioctls and not handled by IP */ 1402 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1403 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1404 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1405 /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq), 1406 IPI_GET_CMD, LIF_CMD, 1407 ip_sioctl_get_lifusesrc, 0 }, 1408 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), 1409 IPI_PRIV | IPI_WR, 1410 LIF_CMD, ip_sioctl_slifusesrc, 1411 NULL }, 1412 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, 1413 ip_sioctl_get_lifsrcof, NULL }, 1414 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, 1415 MISC_CMD, ip_sioctl_msfilter, NULL }, 1416 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, 1417 MISC_CMD, ip_sioctl_msfilter, NULL }, 1418 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, 1419 MISC_CMD, ip_sioctl_msfilter, NULL }, 1420 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, 1421 MISC_CMD, ip_sioctl_msfilter, NULL }, 1422 /* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD, 1423 ip_sioctl_set_ipmpfailback, NULL } 1424 }; 1425 1426 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1427 1428 ip_ioctl_cmd_t ip_misc_ioctl_table[] = { 1429 { OSIOCGTUNPARAM, sizeof (struct old_iftun_req), 1430 IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL }, 1431 { OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR, 1432 TUN_CMD, ip_sioctl_tunparam, NULL }, 1433 { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1434 { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1435 { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1436 { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1437 { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, 1438 { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, 1439 { IP_IOCTL, 0, 0, 0, NULL, NULL }, 1440 { SIOCGETVIFCNT, sizeof (struct 
sioc_vif_req), IPI_REPL | IPI_GET_CMD, 1441 MISC_CMD, mrt_ioctl}, 1442 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD, 1443 MISC_CMD, mrt_ioctl}, 1444 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD, 1445 MISC_CMD, mrt_ioctl} 1446 }; 1447 1448 int ip_misc_ioctl_count = 1449 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1450 1451 static idl_t *conn_drain_list; /* The array of conn drain lists */ 1452 static uint_t conn_drain_list_cnt; /* Total count of conn_drain_list */ 1453 static int conn_drain_list_index; /* Next drain_list to be used */ 1454 int conn_drain_nthreads; /* Number of drainers reqd. */ 1455 /* Settable in /etc/system */ 1456 1457 /* Defined in ip_ire.c */ 1458 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1459 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1460 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1461 1462 static nv_t ire_nv_arr[] = { 1463 { IRE_BROADCAST, "BROADCAST" }, 1464 { IRE_LOCAL, "LOCAL" }, 1465 { IRE_LOOPBACK, "LOOPBACK" }, 1466 { IRE_CACHE, "CACHE" }, 1467 { IRE_DEFAULT, "DEFAULT" }, 1468 { IRE_PREFIX, "PREFIX" }, 1469 { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1470 { IRE_IF_RESOLVER, "IF_RESOLV" }, 1471 { IRE_HOST, "HOST" }, 1472 { IRE_HOST_REDIRECT, "HOST_REDIRECT" }, 1473 { 0 } 1474 }; 1475 1476 nv_t *ire_nv_tbl = ire_nv_arr; 1477 1478 /* Defined in ip_if.c, protect the list of IPsec capable ills */ 1479 extern krwlock_t ipsec_capab_ills_lock; 1480 1481 /* Packet dropper for IP IPsec processing failures */ 1482 ipdropper_t ip_dropper; 1483 1484 /* Simple ICMP IP Header Template */ 1485 static ipha_t icmp_ipha = { 1486 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1487 }; 1488 1489 struct module_info ip_mod_info = { 1490 IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 1491 }; 1492 1493 static struct qinit rinit = { 1494 (pfi_t)ip_rput, NULL, ip_open, ip_close, NULL, 1495 &ip_mod_info 1496 }; 1497 1498 static struct qinit winit = { 1499 
(pfi_t)ip_wput, (pfi_t)ip_wsrv, ip_open, ip_close, NULL, 1500 &ip_mod_info 1501 }; 1502 1503 static struct qinit lrinit = { 1504 (pfi_t)ip_lrput, NULL, ip_open, ip_close, NULL, 1505 &ip_mod_info 1506 }; 1507 1508 static struct qinit lwinit = { 1509 (pfi_t)ip_lwput, NULL, ip_open, ip_close, NULL, 1510 &ip_mod_info 1511 }; 1512 1513 struct streamtab ipinfo = { 1514 &rinit, &winit, &lrinit, &lwinit 1515 }; 1516 1517 #ifdef DEBUG 1518 static boolean_t skip_sctp_cksum = B_FALSE; 1519 #endif 1520 /* 1521 * Copy an M_CTL-tagged message, preserving reference counts appropriately. 1522 */ 1523 mblk_t * 1524 ip_copymsg(mblk_t *mp) 1525 { 1526 mblk_t *nmp; 1527 ipsec_info_t *in; 1528 1529 if (mp->b_datap->db_type != M_CTL) 1530 return (copymsg(mp)); 1531 1532 in = (ipsec_info_t *)mp->b_rptr; 1533 1534 /* 1535 * Note that M_CTL is also used for delivering ICMP error messages 1536 * upstream to transport layers. 1537 */ 1538 if (in->ipsec_info_type != IPSEC_OUT && 1539 in->ipsec_info_type != IPSEC_IN) 1540 return (copymsg(mp)); 1541 1542 nmp = copymsg(mp->b_cont); 1543 1544 if (in->ipsec_info_type == IPSEC_OUT) 1545 return (ipsec_out_tag(mp, nmp)); 1546 else 1547 return (ipsec_in_tag(mp, nmp)); 1548 } 1549 1550 /* Generate an ICMP fragmentation needed message. 
*/ 1551 static void 1552 icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu) 1553 { 1554 icmph_t icmph; 1555 mblk_t *first_mp; 1556 boolean_t mctl_present; 1557 1558 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 1559 1560 if (!(mp = icmp_pkt_err_ok(mp))) { 1561 if (mctl_present) 1562 freeb(first_mp); 1563 return; 1564 } 1565 1566 bzero(&icmph, sizeof (icmph_t)); 1567 icmph.icmph_type = ICMP_DEST_UNREACHABLE; 1568 icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED; 1569 icmph.icmph_du_mtu = htons((uint16_t)mtu); 1570 BUMP_MIB(&icmp_mib, icmpOutFragNeeded); 1571 BUMP_MIB(&icmp_mib, icmpOutDestUnreachs); 1572 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present); 1573 } 1574 1575 /* 1576 * icmp_inbound deals with ICMP messages in the following ways. 1577 * 1578 * 1) It needs to send a reply back and possibly delivering it 1579 * to the "interested" upper clients. 1580 * 2) It needs to send it to the upper clients only. 1581 * 3) It needs to change some values in IP only. 1582 * 4) It needs to change some values in IP and upper layers e.g TCP. 1583 * 1584 * We need to accomodate icmp messages coming in clear until we get 1585 * everything secure from the wire. If icmp_accept_clear_messages 1586 * is zero we check with the global policy and act accordingly. If 1587 * it is non-zero, we accept the message without any checks. But 1588 * *this does not mean* that this will be delivered to the upper 1589 * clients. By accepting we might send replies back, change our MTU 1590 * value etc. but delivery to the ULP/clients depends on their policy 1591 * dispositions. 1592 * 1593 * We handle the above 4 cases in the context of IPSEC in the 1594 * following way : 1595 * 1596 * 1) Send the reply back in the same way as the request came in. 1597 * If it came in encrypted, it goes out encrypted. If it came in 1598 * clear, it goes out in clear. Thus, this will prevent chosen 1599 * plain text attack. 1600 * 2) The client may or may not expect things to come in secure. 
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_clear_messages is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 * 	There are two cases.
 *
 * 	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though they
 *	   are *willing* to accept in *clear*. This is fine as our global
 *	   disposition to icmp messages asks us to reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_clear_messages is non-zero), and not
 *	   able to deliver it to ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DEST_UNREACHABLE messages with following code
 *	   values :
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and Upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels.
Currently, when we are unable to deliver 1634 * to the Upper Layer (due to policy failures) while IP has 1635 * adjusted ire_max_frag, the next outbound datagram would 1636 * generate a local ICMP_FRAGMENTATION_NEEDED message - which 1637 * will be with the right level of protection. Thus the right 1638 * value will be communicated even if we are not able to 1639 * communicate when we get from the wire initially. But this 1640 * assumes there would be at least one outbound datagram after 1641 * IP has adjusted its ire_max_frag value. To make things 1642 * simpler, we accept in clear after the validation of 1643 * AH/ESP headers. 1644 * 1645 * - Other ICMP ERRORS : We may not be able to deliver it to the 1646 * upper layer depending on the level of protection the upper 1647 * layer expects and the disposition in ipsec_inbound_accept_clear(). 1648 * ipsec_inbound_accept_clear() decides whether a given ICMP error 1649 * should be accepted in clear when the Upper layer expects secure. 1650 * Thus the communication may get aborted by some bad ICMP 1651 * packets. 1652 * 1653 * IPQoS Notes: 1654 * The only instance when a packet is sent for processing is when there 1655 * isn't an ICMP client and if we are interested in it. 1656 * If there is a client, IPPF processing will take place in the 1657 * ip_fanout_proto routine. 1658 * 1659 * Zones notes: 1660 * The packet is only processed in the context of the specified zone: typically 1661 * only this zone will reply to an echo request, and only interested clients in 1662 * this zone will receive a copy of the packet. This means that the caller must 1663 * call icmp_inbound() for each relevant zone. 
1664 */ 1665 static void 1666 icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, 1667 int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, 1668 ill_t *recv_ill, zoneid_t zoneid) 1669 { 1670 icmph_t *icmph; 1671 ipha_t *ipha; 1672 int iph_hdr_length; 1673 int hdr_length; 1674 boolean_t interested; 1675 uint32_t ts; 1676 uchar_t *wptr; 1677 ipif_t *ipif; 1678 mblk_t *first_mp; 1679 ipsec_in_t *ii; 1680 ire_t *src_ire; 1681 boolean_t onlink; 1682 timestruc_t now; 1683 uint32_t ill_index; 1684 1685 ASSERT(ill != NULL); 1686 1687 first_mp = mp; 1688 if (mctl_present) { 1689 mp = first_mp->b_cont; 1690 ASSERT(mp != NULL); 1691 } 1692 1693 ipha = (ipha_t *)mp->b_rptr; 1694 if (icmp_accept_clear_messages == 0) { 1695 first_mp = ipsec_check_global_policy(first_mp, NULL, 1696 ipha, NULL, mctl_present); 1697 if (first_mp == NULL) 1698 return; 1699 } 1700 1701 /* 1702 * On a labeled system, we have to check whether the zone itself is 1703 * permitted to receive raw traffic. 1704 */ 1705 if (is_system_labeled()) { 1706 if (zoneid == ALL_ZONES) 1707 zoneid = tsol_packet_to_zoneid(mp); 1708 if (!tsol_can_accept_raw(mp, B_FALSE)) { 1709 ip1dbg(("icmp_inbound: zone %d can't receive raw", 1710 zoneid)); 1711 BUMP_MIB(&icmp_mib, icmpInErrors); 1712 freemsg(first_mp); 1713 return; 1714 } 1715 } 1716 1717 /* 1718 * We have accepted the ICMP message. It means that we will 1719 * respond to the packet if needed. It may not be delivered 1720 * to the upper client depending on the policy constraints 1721 * and the disposition in ipsec_inbound_accept_clear. 1722 */ 1723 1724 ASSERT(ill != NULL); 1725 1726 BUMP_MIB(&icmp_mib, icmpInMsgs); 1727 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1728 if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { 1729 /* Last chance to get real. 
*/ 1730 if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { 1731 BUMP_MIB(&icmp_mib, icmpInErrors); 1732 freemsg(first_mp); 1733 return; 1734 } 1735 /* Refresh iph following the pullup. */ 1736 ipha = (ipha_t *)mp->b_rptr; 1737 } 1738 /* ICMP header checksum, including checksum field, should be zero. */ 1739 if (sum_valid ? (sum != 0 && sum != 0xFFFF) : 1740 IP_CSUM(mp, iph_hdr_length, 0)) { 1741 BUMP_MIB(&icmp_mib, icmpInCksumErrs); 1742 freemsg(first_mp); 1743 return; 1744 } 1745 /* The IP header will always be a multiple of four bytes */ 1746 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1747 ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, 1748 icmph->icmph_code)); 1749 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1750 /* We will set "interested" to "true" if we want a copy */ 1751 interested = B_FALSE; 1752 switch (icmph->icmph_type) { 1753 case ICMP_ECHO_REPLY: 1754 BUMP_MIB(&icmp_mib, icmpInEchoReps); 1755 break; 1756 case ICMP_DEST_UNREACHABLE: 1757 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) 1758 BUMP_MIB(&icmp_mib, icmpInFragNeeded); 1759 interested = B_TRUE; /* Pass up to transport */ 1760 BUMP_MIB(&icmp_mib, icmpInDestUnreachs); 1761 break; 1762 case ICMP_SOURCE_QUENCH: 1763 interested = B_TRUE; /* Pass up to transport */ 1764 BUMP_MIB(&icmp_mib, icmpInSrcQuenchs); 1765 break; 1766 case ICMP_REDIRECT: 1767 if (!ip_ignore_redirect) 1768 interested = B_TRUE; 1769 BUMP_MIB(&icmp_mib, icmpInRedirects); 1770 break; 1771 case ICMP_ECHO_REQUEST: 1772 /* 1773 * Whether to respond to echo requests that come in as IP 1774 * broadcasts or as IP multicast is subject to debate 1775 * (what isn't?). We aim to please, you pick it. 1776 * Default is do it. 
1777 */ 1778 if (!broadcast && !CLASSD(ipha->ipha_dst)) { 1779 /* unicast: always respond */ 1780 interested = B_TRUE; 1781 } else if (CLASSD(ipha->ipha_dst)) { 1782 /* multicast: respond based on tunable */ 1783 interested = ip_g_resp_to_echo_mcast; 1784 } else if (broadcast) { 1785 /* broadcast: respond based on tunable */ 1786 interested = ip_g_resp_to_echo_bcast; 1787 } 1788 BUMP_MIB(&icmp_mib, icmpInEchos); 1789 break; 1790 case ICMP_ROUTER_ADVERTISEMENT: 1791 case ICMP_ROUTER_SOLICITATION: 1792 break; 1793 case ICMP_TIME_EXCEEDED: 1794 interested = B_TRUE; /* Pass up to transport */ 1795 BUMP_MIB(&icmp_mib, icmpInTimeExcds); 1796 break; 1797 case ICMP_PARAM_PROBLEM: 1798 interested = B_TRUE; /* Pass up to transport */ 1799 BUMP_MIB(&icmp_mib, icmpInParmProbs); 1800 break; 1801 case ICMP_TIME_STAMP_REQUEST: 1802 /* Response to Time Stamp Requests is local policy. */ 1803 if (ip_g_resp_to_timestamp && 1804 /* So is whether to respond if it was an IP broadcast. */ 1805 (!broadcast || ip_g_resp_to_timestamp_bcast)) { 1806 int tstamp_len = 3 * sizeof (uint32_t); 1807 1808 if (wptr + tstamp_len > mp->b_wptr) { 1809 if (!pullupmsg(mp, wptr + tstamp_len - 1810 mp->b_rptr)) { 1811 BUMP_MIB(&ip_mib, ipInDiscards); 1812 freemsg(first_mp); 1813 return; 1814 } 1815 /* Refresh ipha following the pullup. */ 1816 ipha = (ipha_t *)mp->b_rptr; 1817 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1818 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1819 } 1820 interested = B_TRUE; 1821 } 1822 BUMP_MIB(&icmp_mib, icmpInTimestamps); 1823 break; 1824 case ICMP_TIME_STAMP_REPLY: 1825 BUMP_MIB(&icmp_mib, icmpInTimestampReps); 1826 break; 1827 case ICMP_INFO_REQUEST: 1828 /* Per RFC 1122 3.2.2.7, ignore this. */ 1829 case ICMP_INFO_REPLY: 1830 break; 1831 case ICMP_ADDRESS_MASK_REQUEST: 1832 if ((ip_respond_to_address_mask_broadcast || !broadcast) && 1833 /* TODO m_pullup of complete header? 
*/ 1834 (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) 1835 interested = B_TRUE; 1836 BUMP_MIB(&icmp_mib, icmpInAddrMasks); 1837 break; 1838 case ICMP_ADDRESS_MASK_REPLY: 1839 BUMP_MIB(&icmp_mib, icmpInAddrMaskReps); 1840 break; 1841 default: 1842 interested = B_TRUE; /* Pass up to transport */ 1843 BUMP_MIB(&icmp_mib, icmpInUnknowns); 1844 break; 1845 } 1846 /* See if there is an ICMP client. */ 1847 if (ipcl_proto_search(IPPROTO_ICMP) != NULL) { 1848 /* If there is an ICMP client and we want one too, copy it. */ 1849 mblk_t *first_mp1; 1850 1851 if (!interested) { 1852 ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, 1853 ip_policy, recv_ill, zoneid); 1854 return; 1855 } 1856 first_mp1 = ip_copymsg(first_mp); 1857 if (first_mp1 != NULL) { 1858 ip_fanout_proto(q, first_mp1, ill, ipha, 1859 0, mctl_present, ip_policy, recv_ill, zoneid); 1860 } 1861 } else if (!interested) { 1862 freemsg(first_mp); 1863 return; 1864 } else { 1865 /* 1866 * Initiate policy processing for this packet if ip_policy 1867 * is true. 1868 */ 1869 if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) { 1870 ill_index = ill->ill_phyint->phyint_ifindex; 1871 ip_process(IPP_LOCAL_IN, &mp, ill_index); 1872 if (mp == NULL) { 1873 if (mctl_present) { 1874 freeb(first_mp); 1875 } 1876 BUMP_MIB(&icmp_mib, icmpInErrors); 1877 return; 1878 } 1879 } 1880 } 1881 /* We want to do something with it. */ 1882 /* Check db_ref to make sure we can modify the packet. 
*/ 1883 if (mp->b_datap->db_ref > 1) { 1884 mblk_t *first_mp1; 1885 1886 first_mp1 = ip_copymsg(first_mp); 1887 freemsg(first_mp); 1888 if (!first_mp1) { 1889 BUMP_MIB(&icmp_mib, icmpOutDrops); 1890 return; 1891 } 1892 first_mp = first_mp1; 1893 if (mctl_present) { 1894 mp = first_mp->b_cont; 1895 ASSERT(mp != NULL); 1896 } else { 1897 mp = first_mp; 1898 } 1899 ipha = (ipha_t *)mp->b_rptr; 1900 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1901 wptr = (uchar_t *)icmph + ICMPH_SIZE; 1902 } 1903 switch (icmph->icmph_type) { 1904 case ICMP_ADDRESS_MASK_REQUEST: 1905 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1906 if (ipif == NULL) { 1907 freemsg(first_mp); 1908 return; 1909 } 1910 /* 1911 * outging interface must be IPv4 1912 */ 1913 ASSERT(ipif != NULL && !ipif->ipif_isv6); 1914 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 1915 bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); 1916 ipif_refrele(ipif); 1917 BUMP_MIB(&icmp_mib, icmpOutAddrMaskReps); 1918 break; 1919 case ICMP_ECHO_REQUEST: 1920 icmph->icmph_type = ICMP_ECHO_REPLY; 1921 BUMP_MIB(&icmp_mib, icmpOutEchoReps); 1922 break; 1923 case ICMP_TIME_STAMP_REQUEST: { 1924 uint32_t *tsp; 1925 1926 icmph->icmph_type = ICMP_TIME_STAMP_REPLY; 1927 tsp = (uint32_t *)wptr; 1928 tsp++; /* Skip past 'originate time' */ 1929 /* Compute # of milliseconds since midnight */ 1930 gethrestime(&now); 1931 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 1932 now.tv_nsec / (NANOSEC / MILLISEC); 1933 *tsp++ = htonl(ts); /* Lay in 'receive time' */ 1934 *tsp++ = htonl(ts); /* Lay in 'send time' */ 1935 BUMP_MIB(&icmp_mib, icmpOutTimestampReps); 1936 break; 1937 } 1938 default: 1939 ipha = (ipha_t *)&icmph[1]; 1940 if ((uchar_t *)&ipha[1] > mp->b_wptr) { 1941 if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { 1942 BUMP_MIB(&ip_mib, ipInDiscards); 1943 freemsg(first_mp); 1944 return; 1945 } 1946 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1947 ipha = (ipha_t *)&icmph[1]; 1948 } 1949 if ((IPH_HDR_VERSION(ipha) != 
IPV4_VERSION)) { 1950 BUMP_MIB(&ip_mib, ipInDiscards); 1951 freemsg(first_mp); 1952 return; 1953 } 1954 hdr_length = IPH_HDR_LENGTH(ipha); 1955 if (hdr_length < sizeof (ipha_t)) { 1956 BUMP_MIB(&ip_mib, ipInDiscards); 1957 freemsg(first_mp); 1958 return; 1959 } 1960 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 1961 if (!pullupmsg(mp, 1962 (uchar_t *)ipha + hdr_length - mp->b_rptr)) { 1963 BUMP_MIB(&ip_mib, ipInDiscards); 1964 freemsg(first_mp); 1965 return; 1966 } 1967 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1968 ipha = (ipha_t *)&icmph[1]; 1969 } 1970 switch (icmph->icmph_type) { 1971 case ICMP_REDIRECT: 1972 /* 1973 * As there is no upper client to deliver, we don't 1974 * need the first_mp any more. 1975 */ 1976 if (mctl_present) { 1977 freeb(first_mp); 1978 } 1979 icmp_redirect(mp); 1980 return; 1981 case ICMP_DEST_UNREACHABLE: 1982 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { 1983 if (!icmp_inbound_too_big(icmph, ipha)) { 1984 freemsg(first_mp); 1985 return; 1986 } 1987 } 1988 /* FALLTHRU */ 1989 default : 1990 /* 1991 * IPQoS notes: Since we have already done IPQoS 1992 * processing we don't want to do it again in 1993 * the fanout routines called by 1994 * icmp_inbound_error_fanout, hence the last 1995 * argument, ip_policy, is B_FALSE. 1996 */ 1997 icmp_inbound_error_fanout(q, ill, first_mp, icmph, 1998 ipha, iph_hdr_length, hdr_length, mctl_present, 1999 B_FALSE, recv_ill, zoneid); 2000 } 2001 return; 2002 } 2003 /* Send out an ICMP packet */ 2004 icmph->icmph_checksum = 0; 2005 icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); 2006 if (icmph->icmph_checksum == 0) 2007 icmph->icmph_checksum = 0xFFFF; 2008 if (broadcast || CLASSD(ipha->ipha_dst)) { 2009 ipif_t *ipif_chosen; 2010 /* 2011 * Make it look like it was directed to us, so we don't look 2012 * like a fool with a broadcast or multicast source address. 
2013 */ 2014 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 2015 /* 2016 * Make sure that we haven't grabbed an interface that's DOWN. 2017 */ 2018 if (ipif != NULL) { 2019 ipif_chosen = ipif_select_source(ipif->ipif_ill, 2020 ipha->ipha_src, zoneid); 2021 if (ipif_chosen != NULL) { 2022 ipif_refrele(ipif); 2023 ipif = ipif_chosen; 2024 } 2025 } 2026 if (ipif == NULL) { 2027 ip0dbg(("icmp_inbound: " 2028 "No source for broadcast/multicast:\n" 2029 "\tsrc 0x%x dst 0x%x ill %p " 2030 "ipif_lcl_addr 0x%x\n", 2031 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), 2032 (void *)ill, 2033 ill->ill_ipif->ipif_lcl_addr)); 2034 freemsg(first_mp); 2035 return; 2036 } 2037 ASSERT(ipif != NULL && !ipif->ipif_isv6); 2038 ipha->ipha_dst = ipif->ipif_src_addr; 2039 ipif_refrele(ipif); 2040 } 2041 /* Reset time to live. */ 2042 ipha->ipha_ttl = ip_def_ttl; 2043 { 2044 /* Swap source and destination addresses */ 2045 ipaddr_t tmp; 2046 2047 tmp = ipha->ipha_src; 2048 ipha->ipha_src = ipha->ipha_dst; 2049 ipha->ipha_dst = tmp; 2050 } 2051 ipha->ipha_ident = 0; 2052 if (!IS_SIMPLE_IPH(ipha)) 2053 icmp_options_update(ipha); 2054 2055 /* 2056 * ICMP echo replies should go out on the same interface 2057 * the request came on as probes used by in.mpathd for detecting 2058 * NIC failures are ECHO packets. We turn-off load spreading 2059 * by setting ipsec_in_attach_if to B_TRUE, which is copied 2060 * to ipsec_out_attach_if by ipsec_in_to_out called later in this 2061 * function. This is in turn handled by ip_wput and ip_newroute 2062 * to make sure that the packet goes out on the interface it came 2063 * in on. If we don't turnoff load spreading, the packets might get 2064 * dropped if there are no non-FAILED/INACTIVE interfaces for it 2065 * to go out and in.mpathd would wrongly detect a failure or 2066 * mis-detect a NIC failure for link failure. 
As load spreading 2067 * can happen only if ill_group is not NULL, we do only for 2068 * that case and this does not affect the normal case. 2069 * 2070 * We turn off load spreading only on echo packets that came from 2071 * on-link hosts. If the interface route has been deleted, this will 2072 * not be enforced as we can't do much. For off-link hosts, as the 2073 * default routes in IPv4 does not typically have an ire_ipif 2074 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. 2075 * Moreover, expecting a default route through this interface may 2076 * not be correct. We use ipha_dst because of the swap above. 2077 */ 2078 onlink = B_FALSE; 2079 if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { 2080 /* 2081 * First, we need to make sure that it is not one of our 2082 * local addresses. If we set onlink when it is one of 2083 * our local addresses, we will end up creating IRE_CACHES 2084 * for one of our local addresses. Then, we will never 2085 * accept packets for them afterwards. 2086 */ 2087 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, 2088 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 2089 if (src_ire == NULL) { 2090 ipif = ipif_get_next_ipif(NULL, ill); 2091 if (ipif == NULL) { 2092 BUMP_MIB(&ip_mib, ipInDiscards); 2093 freemsg(mp); 2094 return; 2095 } 2096 src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 2097 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 2098 NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE); 2099 ipif_refrele(ipif); 2100 if (src_ire != NULL) { 2101 onlink = B_TRUE; 2102 ire_refrele(src_ire); 2103 } 2104 } else { 2105 ire_refrele(src_ire); 2106 } 2107 } 2108 if (!mctl_present) { 2109 /* 2110 * This packet should go out the same way as it 2111 * came in i.e in clear. To make sure that global 2112 * policy will not be applied to this in ip_wput_ire, 2113 * we attach a IPSEC_IN mp and clear ipsec_in_secure. 
2114 */ 2115 ASSERT(first_mp == mp); 2116 if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) { 2117 BUMP_MIB(&ip_mib, ipInDiscards); 2118 freemsg(mp); 2119 return; 2120 } 2121 ii = (ipsec_in_t *)first_mp->b_rptr; 2122 2123 /* This is not a secure packet */ 2124 ii->ipsec_in_secure = B_FALSE; 2125 if (onlink) { 2126 ii->ipsec_in_attach_if = B_TRUE; 2127 ii->ipsec_in_ill_index = 2128 ill->ill_phyint->phyint_ifindex; 2129 ii->ipsec_in_rill_index = 2130 recv_ill->ill_phyint->phyint_ifindex; 2131 } 2132 first_mp->b_cont = mp; 2133 } else if (onlink) { 2134 ii = (ipsec_in_t *)first_mp->b_rptr; 2135 ii->ipsec_in_attach_if = B_TRUE; 2136 ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; 2137 ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; 2138 } else { 2139 ii = (ipsec_in_t *)first_mp->b_rptr; 2140 } 2141 ii->ipsec_in_zoneid = zoneid; 2142 ASSERT(zoneid != ALL_ZONES); 2143 if (!ipsec_in_to_out(first_mp, ipha, NULL)) { 2144 BUMP_MIB(&ip_mib, ipInDiscards); 2145 return; 2146 } 2147 BUMP_MIB(&icmp_mib, icmpOutMsgs); 2148 put(WR(q), first_mp); 2149 } 2150 2151 /* Table from RFC 1191 */ 2152 static int icmp_frag_size_table[] = 2153 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; 2154 2155 /* 2156 * Process received ICMP Packet too big. 2157 * After updating any IRE it does the fanout to any matching transport streams. 2158 * Assumes the message has been pulled up till the IP header that caused 2159 * the error. 2160 * 2161 * Returns B_FALSE on failure and B_TRUE on success. 
2162 */ 2163 static boolean_t 2164 icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha) 2165 { 2166 ire_t *ire, *first_ire; 2167 int mtu; 2168 int hdr_length; 2169 2170 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && 2171 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); 2172 2173 hdr_length = IPH_HDR_LENGTH(ipha); 2174 2175 first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, NULL, 2176 ALL_ZONES, NULL, MATCH_IRE_TYPE); 2177 2178 if (!first_ire) { 2179 ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", 2180 ntohl(ipha->ipha_dst))); 2181 return (B_FALSE); 2182 } 2183 /* Drop if the original packet contained a source route */ 2184 if (ip_source_route_included(ipha)) { 2185 ire_refrele(first_ire); 2186 return (B_FALSE); 2187 } 2188 /* Check for MTU discovery advice as described in RFC 1191 */ 2189 mtu = ntohs(icmph->icmph_du_mtu); 2190 rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); 2191 for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; 2192 ire = ire->ire_next) { 2193 mutex_enter(&ire->ire_lock); 2194 if (icmph->icmph_du_zero == 0 && mtu > 68) { 2195 /* Reduce the IRE max frag value as advised. */ 2196 ip1dbg(("Received mtu from router: %d (was %d)\n", 2197 mtu, ire->ire_max_frag)); 2198 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2199 } else { 2200 uint32_t length; 2201 int i; 2202 2203 /* 2204 * Use the table from RFC 1191 to figure out 2205 * the next "plateau" based on the length in 2206 * the original IP packet. 2207 */ 2208 length = ntohs(ipha->ipha_length); 2209 if (ire->ire_max_frag <= length && 2210 ire->ire_max_frag >= length - hdr_length) { 2211 /* 2212 * Handle broken BSD 4.2 systems that 2213 * return the wrong iph_length in ICMP 2214 * errors. 
2215 */ 2216 ip1dbg(("Wrong mtu: sent %d, ire %d\n", 2217 length, ire->ire_max_frag)); 2218 length -= hdr_length; 2219 } 2220 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { 2221 if (length > icmp_frag_size_table[i]) 2222 break; 2223 } 2224 if (i == A_CNT(icmp_frag_size_table)) { 2225 /* Smaller than 68! */ 2226 ip1dbg(("Too big for packet size %d\n", 2227 length)); 2228 ire->ire_max_frag = MIN(ire->ire_max_frag, 576); 2229 ire->ire_frag_flag = 0; 2230 } else { 2231 mtu = icmp_frag_size_table[i]; 2232 ip1dbg(("Calculated mtu %d, packet size %d, " 2233 "before %d", mtu, length, 2234 ire->ire_max_frag)); 2235 ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); 2236 ip1dbg((", after %d\n", ire->ire_max_frag)); 2237 } 2238 /* Record the new max frag size for the ULP. */ 2239 icmph->icmph_du_zero = 0; 2240 icmph->icmph_du_mtu = 2241 htons((uint16_t)ire->ire_max_frag); 2242 } 2243 mutex_exit(&ire->ire_lock); 2244 } 2245 rw_exit(&first_ire->ire_bucket->irb_lock); 2246 ire_refrele(first_ire); 2247 return (B_TRUE); 2248 } 2249 2250 /* 2251 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout 2252 * calls this function. 2253 */ 2254 static mblk_t * 2255 icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) 2256 { 2257 ipha_t *ipha; 2258 icmph_t *icmph; 2259 ipha_t *in_ipha; 2260 int length; 2261 2262 ASSERT(mp->b_datap->db_type == M_DATA); 2263 2264 /* 2265 * For Self-encapsulated packets, we added an extra IP header 2266 * without the options. Inner IP header is the one from which 2267 * the outer IP header was formed. Thus, we need to remove the 2268 * outer IP header. To do this, we pullup the whole message 2269 * and overlay whatever follows the outer IP header over the 2270 * outer IP header. 
2271 */ 2272 2273 if (!pullupmsg(mp, -1)) { 2274 BUMP_MIB(&ip_mib, ipInDiscards); 2275 return (NULL); 2276 } 2277 2278 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2279 ipha = (ipha_t *)&icmph[1]; 2280 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2281 2282 /* 2283 * The length that we want to overlay is following the inner 2284 * IP header. Subtracting the IP header + icmp header + outer 2285 * IP header's length should give us the length that we want to 2286 * overlay. 2287 */ 2288 length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - 2289 hdr_length; 2290 /* 2291 * Overlay whatever follows the inner header over the 2292 * outer header. 2293 */ 2294 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); 2295 2296 /* Set the wptr to account for the outer header */ 2297 mp->b_wptr -= hdr_length; 2298 return (mp); 2299 } 2300 2301 /* 2302 * Try to pass the ICMP message upstream in case the ULP cares. 2303 * 2304 * If the packet that caused the ICMP error is secure, we send 2305 * it to AH/ESP to make sure that the attached packet has a 2306 * valid association. ipha in the code below points to the 2307 * IP header of the packet that caused the error. 2308 * 2309 * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently 2310 * in the context of IPSEC. Normally we tell the upper layer 2311 * whenever we send the ire (including ip_bind), the IPSEC header 2312 * length in ire_ipsec_overhead. TCP can deduce the MSS as it 2313 * has both the MTU (ire_max_frag) and the ire_ipsec_overhead. 2314 * Similarly, we pass the new MTU icmph_du_mtu and TCP does the 2315 * same thing. As TCP has the IPSEC options size that needs to be 2316 * adjusted, we just pass the MTU unchanged. 2317 * 2318 * IFN could have been generated locally or by some router. 2319 * 2320 * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. 
2321 * This happens because IP adjusted its value of MTU on an 2322 * earlier IFN message and could not tell the upper layer, 2323 * the new adjusted value of MTU e.g. Packet was encrypted 2324 * or there was not enough information to fanout to upper 2325 * layers. Thus on the next outbound datagram, ip_wput_ire 2326 * generates the IFN, where IPSEC processing has *not* been 2327 * done. 2328 * 2329 * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed 2330 * could have generated this. This happens because ire_max_frag 2331 * value in IP was set to a new value, while the IPSEC processing 2332 * was being done and after we made the fragmentation check in 2333 * ip_wput_ire. Thus on return from IPSEC processing, 2334 * ip_wput_ipsec_out finds that the new length is > ire_max_frag 2335 * and generates the IFN. As IPSEC processing is over, we fanout 2336 * to AH/ESP to remove the header. 2337 * 2338 * In both these cases, ipsec_in_loopback will be set indicating 2339 * that IFN was generated locally. 2340 * 2341 * ROUTER : IFN could be secure or non-secure. 2342 * 2343 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2344 * packet in error has AH/ESP headers to validate the AH/ESP 2345 * headers. AH/ESP will verify whether there is a valid SA or 2346 * not and send it back. We will fanout again if we have more 2347 * data in the packet. 2348 * 2349 * If the packet in error does not have AH/ESP, we handle it 2350 * like any other case. 2351 * 2352 * * NON_SECURE : If the packet in error has AH/ESP headers, 2353 * we attach a dummy ipsec_in and send it up to AH/ESP 2354 * for validation. AH/ESP will verify whether there is a 2355 * valid SA or not and send it back. We will fanout again if 2356 * we have more data in the packet. 2357 * 2358 * If the packet in error does not have AH/ESP, we handle it 2359 * like any other case. 
 */
/*
 * Arguments, as they are used by the code below:
 *	q		- queue handed through to the fanout routines.
 *	ill, recv_ill	- interfaces handed to the fanout routines; their
 *			  ifindex values are recorded in the IPSEC_IN for
 *			  the AH/ESP case.
 *	mp		- the message; when mctl_present is set the first
 *			  mblk is an IPSEC_IN and the packet is in b_cont.
 *	icmph		- ICMP header, located iph_hdr_length bytes past
 *			  the packet's b_rptr.
 *	ipha		- IP header of the packet in error; it immediately
 *			  follows icmph.
 *	hdr_length	- offset from ipha to the transport (or inner IP)
 *			  header of the packet in error.
 *
 * Whenever pullupmsg() is called the data may move, so icmph and ipha
 * are re-derived from mp->b_rptr afterwards.  Paths that jump to drop_pkt
 * free the whole message (first_mp).
 */
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t *up;	/* Pointer to ports in ULP header */
	uint32_t ports;	/* reversed ports for fanout */
	ipha_t ripha;	/* With reversed addresses */
	mblk_t *first_mp;
	ipsec_in_t *ii;
	tcph_t *tcph;
	conn_t *connp;

	/* Split the optional IPSEC_IN from the packet itself. */
	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	} else {
		ii = NULL;
	}

	/* Dispatch on the protocol of the packet that caused the error. */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the IP_UDP_MATCH and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		/* Swap the two 16-bit ports to match the reversed addresses */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];
		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
		    ntohl(ipha->ipha_src), ntohs(up[0]),
		    ntohl(ipha->ipha_dst), ntohs(up[1]),
		    icmph->icmph_type, icmph->icmph_code));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN);
		if (connp == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			goto drop_pkt;
		}

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		/* Hand the error to TCP on the connection's squeue. */
		squeue_fill(connp->conn_sqp, first_mp, tcp_input,
		    connp, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The ripha header is only used for the matching and we
		 * only set the src and dst addresses, protocol, and version.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ripha.ipha_version_and_hdr_length =
		    ipha->ipha_version_and_hdr_length;
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
		    mctl_present, ip_policy, 0, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;

		/*
		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii != NULL) {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is a ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPSEC
			 * headers.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		} else {
			/*
			 * IPSEC_IN is not present. We attach a ipsec_in
			 * message and send up to IPSEC for validating
			 * and removing the IPSEC headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPSEC, we don't mistakenly think that this
			 * is a secure packet came from the network.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE);
			if (first_mp == NULL) {
				/* Only the bare packet exists; free it. */
				freemsg(mp);
				BUMP_MIB(&ip_mib, ipInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		ip2dbg(("icmp_inbound_error: ipsec\n"));

		if (!ipsec_loaded()) {
			ip_proto_not_sup(q, first_mp, 0, zoneid);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		/*
		 * NOTE(review): nothing is freed here on failure, so the
		 * AH/ESP routine presumably consumed the message -- confirm.
		 */
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	default:
		/*
		 * The ripha header is only used for the lookup and we
		 * only set the src and dst addresses and protocol.
		 */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;
		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
		    ripha.ipha_protocol, ntohl(ipha->ipha_src),
		    ntohl(ipha->ipha_dst),
		    icmph->icmph_type, icmph->icmph_code));
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			/* Make sure the whole inner IP header is present. */
			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
			    mp->b_wptr) {
				if (!pullupmsg(mp, (uchar_t *)ipha +
				    hdr_length + sizeof (ipha_t) -
				    mp->b_rptr)) {

					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that length has to be
			 * at least the size of IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
				BUMP_MIB(&ip_mib, ipInDiscards);
				goto drop_pkt;
			}
			/* Check for Self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto drop_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsulated.
				 * And we are finding it further encapsulated
				 * which we could not have possibly generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					BUMP_MIB(&ip_mib, ipInDiscards);
					goto drop_pkt;
				}
				/*
				 * Fan out again using the updated headers;
				 * the ENCAP check above keeps this from
				 * re-entering the self-encapsulation path.
				 */
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}
		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
		    ipha->ipha_protocol == IPPROTO_IPV6) &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    ii != NULL &&
		    ii->ipsec_in_loopback &&
		    ii->ipsec_in_secure) {
			/*
			 * For IP tunnels that get a looped-back
			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
			 * reported new MTU to take into account the IPsec
			 * headers protecting this configured tunnel.
			 *
			 * This allows the tunnel module (tun.c) to blindly
			 * accept the MTU reported in an ICMP "too big"
			 * message.
			 *
			 * Non-looped back ICMP messages will just be
			 * handled by the security protocols (if needed),
			 * and the first subsequent packet will hit this
			 * path.
			 */
			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
			    ipsec_in_extra_length(first_mp));
		}
		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
drop_pkt:;
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}

/*
 * Common IP options parser.
 *
 * Setup routine: fill in *optp with options-parsing state, then
 * tail-call ipoptp_next to return the first option.
2690 */ 2691 uint8_t 2692 ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2693 { 2694 uint32_t totallen; /* total length of all options */ 2695 2696 totallen = ipha->ipha_version_and_hdr_length - 2697 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2698 totallen <<= 2; 2699 optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2700 optp->ipoptp_end = optp->ipoptp_next + totallen; 2701 optp->ipoptp_flags = 0; 2702 return (ipoptp_next(optp)); 2703 } 2704 2705 /* 2706 * Common IP options parser: extract next option. 2707 */ 2708 uint8_t 2709 ipoptp_next(ipoptp_t *optp) 2710 { 2711 uint8_t *end = optp->ipoptp_end; 2712 uint8_t *cur = optp->ipoptp_next; 2713 uint8_t opt, len, pointer; 2714 2715 /* 2716 * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2717 * has been corrupted. 2718 */ 2719 ASSERT(cur <= end); 2720 2721 if (cur == end) 2722 return (IPOPT_EOL); 2723 2724 opt = cur[IPOPT_OPTVAL]; 2725 2726 /* 2727 * Skip any NOP options. 2728 */ 2729 while (opt == IPOPT_NOP) { 2730 cur++; 2731 if (cur == end) 2732 return (IPOPT_EOL); 2733 opt = cur[IPOPT_OPTVAL]; 2734 } 2735 2736 if (opt == IPOPT_EOL) 2737 return (IPOPT_EOL); 2738 2739 /* 2740 * Option requiring a length. 2741 */ 2742 if ((cur + 1) >= end) { 2743 optp->ipoptp_flags |= IPOPTP_ERROR; 2744 return (IPOPT_EOL); 2745 } 2746 len = cur[IPOPT_OLEN]; 2747 if (len < 2) { 2748 optp->ipoptp_flags |= IPOPTP_ERROR; 2749 return (IPOPT_EOL); 2750 } 2751 optp->ipoptp_cur = cur; 2752 optp->ipoptp_len = len; 2753 optp->ipoptp_next = cur + len; 2754 if (cur + len > end) { 2755 optp->ipoptp_flags |= IPOPTP_ERROR; 2756 return (IPOPT_EOL); 2757 } 2758 2759 /* 2760 * For the options which require a pointer field, make sure 2761 * its there, and make sure it points to either something 2762 * inside this option, or the end of the option. 
2763 */ 2764 switch (opt) { 2765 case IPOPT_RR: 2766 case IPOPT_TS: 2767 case IPOPT_LSRR: 2768 case IPOPT_SSRR: 2769 if (len <= IPOPT_OFFSET) { 2770 optp->ipoptp_flags |= IPOPTP_ERROR; 2771 return (opt); 2772 } 2773 pointer = cur[IPOPT_OFFSET]; 2774 if (pointer - 1 > len) { 2775 optp->ipoptp_flags |= IPOPTP_ERROR; 2776 return (opt); 2777 } 2778 break; 2779 } 2780 2781 /* 2782 * Sanity check the pointer field based on the type of the 2783 * option. 2784 */ 2785 switch (opt) { 2786 case IPOPT_RR: 2787 case IPOPT_SSRR: 2788 case IPOPT_LSRR: 2789 if (pointer < IPOPT_MINOFF_SR) 2790 optp->ipoptp_flags |= IPOPTP_ERROR; 2791 break; 2792 case IPOPT_TS: 2793 if (pointer < IPOPT_MINOFF_IT) 2794 optp->ipoptp_flags |= IPOPTP_ERROR; 2795 /* 2796 * Note that the Internet Timestamp option also 2797 * contains two four bit fields (the Overflow field, 2798 * and the Flag field), which follow the pointer 2799 * field. We don't need to check that these fields 2800 * fall within the length of the option because this 2801 * was implicitely done above. We've checked that the 2802 * pointer value is at least IPOPT_MINOFF_IT, and that 2803 * it falls within the option. Since IPOPT_MINOFF_IT > 2804 * IPOPT_POS_OV_FLG, we don't need the explicit check. 2805 */ 2806 ASSERT(len > IPOPT_POS_OV_FLG); 2807 break; 2808 } 2809 2810 return (opt); 2811 } 2812 2813 /* 2814 * Use the outgoing IP header to create an IP_OPTIONS option the way 2815 * it was passed down from the application. 2816 */ 2817 int 2818 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) 2819 { 2820 ipoptp_t opts; 2821 const uchar_t *opt; 2822 uint8_t optval; 2823 uint8_t optlen; 2824 uint32_t len = 0; 2825 uchar_t *buf1 = buf; 2826 2827 buf += IP_ADDR_LEN; /* Leave room for final destination */ 2828 len += IP_ADDR_LEN; 2829 bzero(buf1, IP_ADDR_LEN); 2830 2831 /* 2832 * OK to cast away const here, as we don't store through the returned 2833 * opts.ipoptp_cur pointer. 
2834 */ 2835 for (optval = ipoptp_first(&opts, (ipha_t *)ipha); 2836 optval != IPOPT_EOL; 2837 optval = ipoptp_next(&opts)) { 2838 int off; 2839 2840 opt = opts.ipoptp_cur; 2841 optlen = opts.ipoptp_len; 2842 switch (optval) { 2843 case IPOPT_SSRR: 2844 case IPOPT_LSRR: 2845 2846 /* 2847 * Insert ipha_dst as the first entry in the source 2848 * route and move down the entries on step. 2849 * The last entry gets placed at buf1. 2850 */ 2851 buf[IPOPT_OPTVAL] = optval; 2852 buf[IPOPT_OLEN] = optlen; 2853 buf[IPOPT_OFFSET] = optlen; 2854 2855 off = optlen - IP_ADDR_LEN; 2856 if (off < 0) { 2857 /* No entries in source route */ 2858 break; 2859 } 2860 /* Last entry in source route */ 2861 bcopy(opt + off, buf1, IP_ADDR_LEN); 2862 off -= IP_ADDR_LEN; 2863 2864 while (off > 0) { 2865 bcopy(opt + off, 2866 buf + off + IP_ADDR_LEN, 2867 IP_ADDR_LEN); 2868 off -= IP_ADDR_LEN; 2869 } 2870 /* ipha_dst into first slot */ 2871 bcopy(&ipha->ipha_dst, 2872 buf + off + IP_ADDR_LEN, 2873 IP_ADDR_LEN); 2874 buf += optlen; 2875 len += optlen; 2876 break; 2877 2878 case IPOPT_COMSEC: 2879 case IPOPT_SECURITY: 2880 /* if passing up a label is not ok, then remove */ 2881 if (is_system_labeled()) 2882 break; 2883 /* FALLTHROUGH */ 2884 default: 2885 bcopy(opt, buf, optlen); 2886 buf += optlen; 2887 len += optlen; 2888 break; 2889 } 2890 } 2891 done: 2892 /* Pad the resulting options */ 2893 while (len & 0x3) { 2894 *buf++ = IPOPT_EOL; 2895 len++; 2896 } 2897 return (len); 2898 } 2899 2900 /* 2901 * Update any record route or timestamp options to include this host. 2902 * Reverse any source route option. 2903 * This routine assumes that the options are well formed i.e. that they 2904 * have already been checked. 
2905 */ 2906 static void 2907 icmp_options_update(ipha_t *ipha) 2908 { 2909 ipoptp_t opts; 2910 uchar_t *opt; 2911 uint8_t optval; 2912 ipaddr_t src; /* Our local address */ 2913 ipaddr_t dst; 2914 2915 ip2dbg(("icmp_options_update\n")); 2916 src = ipha->ipha_src; 2917 dst = ipha->ipha_dst; 2918 2919 for (optval = ipoptp_first(&opts, ipha); 2920 optval != IPOPT_EOL; 2921 optval = ipoptp_next(&opts)) { 2922 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 2923 opt = opts.ipoptp_cur; 2924 ip2dbg(("icmp_options_update: opt %d, len %d\n", 2925 optval, opts.ipoptp_len)); 2926 switch (optval) { 2927 int off1, off2; 2928 case IPOPT_SSRR: 2929 case IPOPT_LSRR: 2930 /* 2931 * Reverse the source route. The first entry 2932 * should be the next to last one in the current 2933 * source route (the last entry is our address). 2934 * The last entry should be the final destination. 2935 */ 2936 off1 = IPOPT_MINOFF_SR - 1; 2937 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 2938 if (off2 < 0) { 2939 /* No entries in source route */ 2940 ip1dbg(( 2941 "icmp_options_update: bad src route\n")); 2942 break; 2943 } 2944 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN); 2945 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN); 2946 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN); 2947 off2 -= IP_ADDR_LEN; 2948 2949 while (off1 < off2) { 2950 bcopy((char *)opt + off1, &src, IP_ADDR_LEN); 2951 bcopy((char *)opt + off2, (char *)opt + off1, 2952 IP_ADDR_LEN); 2953 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN); 2954 off1 += IP_ADDR_LEN; 2955 off2 -= IP_ADDR_LEN; 2956 } 2957 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 2958 break; 2959 } 2960 } 2961 } 2962 2963 /* 2964 * Process received ICMP Redirect messages. 
2965 */ 2966 /* ARGSUSED */ 2967 static void 2968 icmp_redirect(mblk_t *mp) 2969 { 2970 ipha_t *ipha; 2971 int iph_hdr_length; 2972 icmph_t *icmph; 2973 ipha_t *ipha_err; 2974 ire_t *ire; 2975 ire_t *prev_ire; 2976 ire_t *save_ire; 2977 ipaddr_t src, dst, gateway; 2978 iulp_t ulp_info = { 0 }; 2979 int error; 2980 2981 ipha = (ipha_t *)mp->b_rptr; 2982 iph_hdr_length = IPH_HDR_LENGTH(ipha); 2983 if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < 2984 sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { 2985 BUMP_MIB(&icmp_mib, icmpInErrors); 2986 freemsg(mp); 2987 return; 2988 } 2989 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 2990 ipha_err = (ipha_t *)&icmph[1]; 2991 src = ipha->ipha_src; 2992 dst = ipha_err->ipha_dst; 2993 gateway = icmph->icmph_rd_gateway; 2994 /* Make sure the new gateway is reachable somehow. */ 2995 ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, 2996 ALL_ZONES, NULL, MATCH_IRE_TYPE); 2997 /* 2998 * Make sure we had a route for the dest in question and that 2999 * that route was pointing to the old gateway (the source of the 3000 * redirect packet.) 3001 */ 3002 prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES, 3003 NULL, MATCH_IRE_GW); 3004 /* 3005 * Check that 3006 * the redirect was not from ourselves 3007 * the new gateway and the old gateway are directly reachable 3008 */ 3009 if (!prev_ire || 3010 !ire || 3011 ire->ire_type == IRE_LOCAL) { 3012 BUMP_MIB(&icmp_mib, icmpInBadRedirects); 3013 freemsg(mp); 3014 if (ire != NULL) 3015 ire_refrele(ire); 3016 if (prev_ire != NULL) 3017 ire_refrele(prev_ire); 3018 return; 3019 } 3020 3021 /* 3022 * Should we use the old ULP info to create the new gateway? From 3023 * a user's perspective, we should inherit the info so that it 3024 * is a "smooth" transition. If we do not do that, then new 3025 * connections going thru the new gateway will have no route metrics, 3026 * which is counter-intuitive to user. 
From a network point of 3027 * view, this may or may not make sense even though the new gateway 3028 * is still directly connected to us so the route metrics should not 3029 * change much. 3030 * 3031 * But if the old ire_uinfo is not initialized, we do another 3032 * recursive lookup on the dest using the new gateway. There may 3033 * be a route to that. If so, use it to initialize the redirect 3034 * route. 3035 */ 3036 if (prev_ire->ire_uinfo.iulp_set) { 3037 bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); 3038 } else { 3039 ire_t *tmp_ire; 3040 ire_t *sire; 3041 3042 tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire, 3043 ALL_ZONES, 0, NULL, 3044 (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT)); 3045 if (sire != NULL) { 3046 bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); 3047 /* 3048 * If sire != NULL, ire_ftable_lookup() should not 3049 * return a NULL value. 3050 */ 3051 ASSERT(tmp_ire != NULL); 3052 ire_refrele(tmp_ire); 3053 ire_refrele(sire); 3054 } else if (tmp_ire != NULL) { 3055 bcopy(&tmp_ire->ire_uinfo, &ulp_info, 3056 sizeof (iulp_t)); 3057 ire_refrele(tmp_ire); 3058 } 3059 } 3060 if (prev_ire->ire_type == IRE_CACHE) 3061 ire_delete(prev_ire); 3062 ire_refrele(prev_ire); 3063 /* 3064 * TODO: more precise handling for cases 0, 2, 3, the latter two 3065 * require TOS routing 3066 */ 3067 switch (icmph->icmph_code) { 3068 case 0: 3069 case 1: 3070 /* TODO: TOS specificity for cases 2 and 3 */ 3071 case 2: 3072 case 3: 3073 break; 3074 default: 3075 freemsg(mp); 3076 BUMP_MIB(&icmp_mib, icmpInBadRedirects); 3077 ire_refrele(ire); 3078 return; 3079 } 3080 /* 3081 * Create a Route Association. This will allow us to remember that 3082 * someone we believe told us to use the particular gateway. 
3083 */ 3084 save_ire = ire; 3085 ire = ire_create( 3086 (uchar_t *)&dst, /* dest addr */ 3087 (uchar_t *)&ip_g_all_ones, /* mask */ 3088 (uchar_t *)&save_ire->ire_src_addr, /* source addr */ 3089 (uchar_t *)&gateway, /* gateway addr */ 3090 NULL, /* no in_srcaddr */ 3091 &save_ire->ire_max_frag, /* max frag */ 3092 NULL, /* Fast Path header */ 3093 NULL, /* no rfq */ 3094 NULL, /* no stq */ 3095 IRE_HOST_REDIRECT, 3096 NULL, 3097 NULL, 3098 NULL, 3099 0, 3100 0, 3101 0, 3102 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 3103 &ulp_info, 3104 NULL, 3105 NULL); 3106 3107 if (ire == NULL) { 3108 freemsg(mp); 3109 ire_refrele(save_ire); 3110 return; 3111 } 3112 error = ire_add(&ire, NULL, NULL, NULL); 3113 ire_refrele(save_ire); 3114 if (error == 0) { 3115 ire_refrele(ire); /* Held in ire_add_v4 */ 3116 /* tell routing sockets that we received a redirect */ 3117 ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src, 3118 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, 3119 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR)); 3120 } 3121 3122 /* 3123 * Delete any existing IRE_HOST_REDIRECT for this destination. 3124 * This together with the added IRE has the effect of 3125 * modifying an existing redirect. 3126 */ 3127 prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST_REDIRECT, NULL, NULL, 3128 ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE)); 3129 if (prev_ire) { 3130 ire_delete(prev_ire); 3131 ire_refrele(prev_ire); 3132 } 3133 3134 freemsg(mp); 3135 } 3136 3137 /* 3138 * Generate an ICMP parameter problem message. 
3139 */ 3140 static void 3141 icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr) 3142 { 3143 icmph_t icmph; 3144 boolean_t mctl_present; 3145 mblk_t *first_mp; 3146 3147 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3148 3149 if (!(mp = icmp_pkt_err_ok(mp))) { 3150 if (mctl_present) 3151 freeb(first_mp); 3152 return; 3153 } 3154 3155 bzero(&icmph, sizeof (icmph_t)); 3156 icmph.icmph_type = ICMP_PARAM_PROBLEM; 3157 icmph.icmph_pp_ptr = ptr; 3158 BUMP_MIB(&icmp_mib, icmpOutParmProbs); 3159 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present); 3160 } 3161 3162 /* 3163 * Build and ship an IPv4 ICMP message using the packet data in mp, and 3164 * the ICMP header pointed to by "stuff". (May be called as writer.) 3165 * Note: assumes that icmp_pkt_err_ok has been called to verify that 3166 * an icmp error packet can be sent. 3167 * Assigns an appropriate source address to the packet. If ipha_dst is 3168 * one of our addresses use it for source. Otherwise pick a source based 3169 * on a route lookup back to ipha_src. 3170 * Note that ipha_src must be set here since the 3171 * packet is likely to arrive on an ill queue in ip_wput() which will 3172 * not set a source address. 3173 */ 3174 static void 3175 icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, 3176 boolean_t mctl_present) 3177 { 3178 ipaddr_t dst; 3179 icmph_t *icmph; 3180 ipha_t *ipha; 3181 uint_t len_needed; 3182 size_t msg_len; 3183 mblk_t *mp1; 3184 ipaddr_t src; 3185 ire_t *ire; 3186 mblk_t *ipsec_mp; 3187 ipsec_out_t *io = NULL; 3188 boolean_t xmit_if_on = B_FALSE; 3189 zoneid_t zoneid; 3190 3191 if (mctl_present) { 3192 /* 3193 * If it is : 3194 * 3195 * 1) a IPSEC_OUT, then this is caused by outbound 3196 * datagram originating on this host. IPSEC processing 3197 * may or may not have been done. Refer to comments above 3198 * icmp_inbound_error_fanout for details. 
3199 * 3200 * 2) a IPSEC_IN if we are generating a icmp_message 3201 * for an incoming datagram destined for us i.e called 3202 * from ip_fanout_send_icmp. 3203 */ 3204 ipsec_info_t *in; 3205 ipsec_mp = mp; 3206 mp = ipsec_mp->b_cont; 3207 3208 in = (ipsec_info_t *)ipsec_mp->b_rptr; 3209 ipha = (ipha_t *)mp->b_rptr; 3210 3211 ASSERT(in->ipsec_info_type == IPSEC_OUT || 3212 in->ipsec_info_type == IPSEC_IN); 3213 3214 if (in->ipsec_info_type == IPSEC_IN) { 3215 /* 3216 * Convert the IPSEC_IN to IPSEC_OUT. 3217 */ 3218 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3219 BUMP_MIB(&ip_mib, ipOutDiscards); 3220 return; 3221 } 3222 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3223 } else { 3224 ASSERT(in->ipsec_info_type == IPSEC_OUT); 3225 io = (ipsec_out_t *)in; 3226 if (io->ipsec_out_xmit_if) 3227 xmit_if_on = B_TRUE; 3228 /* 3229 * Clear out ipsec_out_proc_begin, so we do a fresh 3230 * ire lookup. 3231 */ 3232 io->ipsec_out_proc_begin = B_FALSE; 3233 } 3234 zoneid = io->ipsec_out_zoneid; 3235 ASSERT(zoneid != ALL_ZONES); 3236 } else { 3237 /* 3238 * This is in clear. The icmp message we are building 3239 * here should go out in clear. 3240 * 3241 * Pardon the convolution of it all, but it's easier to 3242 * allocate a "use cleartext" IPSEC_IN message and convert 3243 * it than it is to allocate a new one. 3244 */ 3245 ipsec_in_t *ii; 3246 ASSERT(DB_TYPE(mp) == M_DATA); 3247 if ((ipsec_mp = ipsec_in_alloc(B_TRUE)) == NULL) { 3248 freemsg(mp); 3249 BUMP_MIB(&ip_mib, ipOutDiscards); 3250 return; 3251 } 3252 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 3253 3254 /* This is not a secure packet */ 3255 ii->ipsec_in_secure = B_FALSE; 3256 if (CONN_Q(q)) { 3257 zoneid = Q_TO_CONN(q)->conn_zoneid; 3258 } else { 3259 zoneid = GLOBAL_ZONEID; 3260 } 3261 ii->ipsec_in_zoneid = zoneid; 3262 ASSERT(zoneid != ALL_ZONES); 3263 ipsec_mp->b_cont = mp; 3264 ipha = (ipha_t *)mp->b_rptr; 3265 /* 3266 * Convert the IPSEC_IN to IPSEC_OUT. 
3267 */ 3268 if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) { 3269 BUMP_MIB(&ip_mib, ipOutDiscards); 3270 return; 3271 } 3272 io = (ipsec_out_t *)ipsec_mp->b_rptr; 3273 } 3274 3275 /* Remember our eventual destination */ 3276 dst = ipha->ipha_src; 3277 3278 ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), 3279 NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE); 3280 if (ire != NULL && 3281 (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) { 3282 src = ipha->ipha_dst; 3283 } else if (!xmit_if_on) { 3284 if (ire != NULL) 3285 ire_refrele(ire); 3286 ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL, 3287 (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY)); 3288 if (ire == NULL) { 3289 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3290 freemsg(ipsec_mp); 3291 return; 3292 } 3293 src = ire->ire_src_addr; 3294 } else { 3295 ipif_t *ipif = NULL; 3296 ill_t *ill; 3297 /* 3298 * This must be an ICMP error coming from 3299 * ip_mrtun_forward(). The src addr should 3300 * be equal to the IP-addr of the outgoing 3301 * interface. 3302 */ 3303 if (io == NULL) { 3304 /* This is not a IPSEC_OUT type control msg */ 3305 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3306 freemsg(ipsec_mp); 3307 return; 3308 } 3309 ill = ill_lookup_on_ifindex(io->ipsec_out_ill_index, B_FALSE, 3310 NULL, NULL, NULL, NULL); 3311 if (ill != NULL) { 3312 ipif = ipif_get_next_ipif(NULL, ill); 3313 ill_refrele(ill); 3314 } 3315 if (ipif == NULL) { 3316 BUMP_MIB(&ip_mib, ipOutNoRoutes); 3317 freemsg(ipsec_mp); 3318 return; 3319 } 3320 src = ipif->ipif_src_addr; 3321 ipif_refrele(ipif); 3322 } 3323 3324 if (ire != NULL) 3325 ire_refrele(ire); 3326 3327 /* 3328 * Check if we can send back more then 8 bytes in addition 3329 * to the IP header. We will include as much as 64 bytes. 
3330 */ 3331 len_needed = IPH_HDR_LENGTH(ipha); 3332 if (ipha->ipha_protocol == IPPROTO_ENCAP && 3333 (uchar_t *)ipha + len_needed + 1 <= mp->b_wptr) { 3334 len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha + len_needed)); 3335 } 3336 len_needed += ip_icmp_return; 3337 msg_len = msgdsize(mp); 3338 if (msg_len > len_needed) { 3339 (void) adjmsg(mp, len_needed - msg_len); 3340 msg_len = len_needed; 3341 } 3342 mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_HI); 3343 if (mp1 == NULL) { 3344 BUMP_MIB(&icmp_mib, icmpOutErrors); 3345 freemsg(ipsec_mp); 3346 return; 3347 } 3348 /* 3349 * On an unlabeled system, dblks don't necessarily have creds. 3350 */ 3351 ASSERT(!is_system_labeled() || DB_CRED(mp) != NULL); 3352 if (DB_CRED(mp) != NULL) 3353 mblk_setcred(mp1, DB_CRED(mp)); 3354 mp1->b_cont = mp; 3355 mp = mp1; 3356 ASSERT(ipsec_mp->b_datap->db_type == M_CTL && 3357 ipsec_mp->b_rptr == (uint8_t *)io && 3358 io->ipsec_out_type == IPSEC_OUT); 3359 ipsec_mp->b_cont = mp; 3360 3361 /* 3362 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this 3363 * node generates be accepted in peace by all on-host destinations. 3364 * If we do NOT assume that all on-host destinations trust 3365 * self-generated ICMP messages, then rework here, ip6.c, and spd.c. 3366 * (Look for ipsec_out_icmp_loopback). 
3367 */ 3368 io->ipsec_out_icmp_loopback = B_TRUE; 3369 3370 ipha = (ipha_t *)mp->b_rptr; 3371 mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len); 3372 *ipha = icmp_ipha; 3373 ipha->ipha_src = src; 3374 ipha->ipha_dst = dst; 3375 ipha->ipha_ttl = ip_def_ttl; 3376 msg_len += sizeof (icmp_ipha) + len; 3377 if (msg_len > IP_MAXPACKET) { 3378 (void) adjmsg(mp, IP_MAXPACKET - msg_len); 3379 msg_len = IP_MAXPACKET; 3380 } 3381 ipha->ipha_length = htons((uint16_t)msg_len); 3382 icmph = (icmph_t *)&ipha[1]; 3383 bcopy(stuff, icmph, len); 3384 icmph->icmph_checksum = 0; 3385 icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); 3386 if (icmph->icmph_checksum == 0) 3387 icmph->icmph_checksum = 0xFFFF; 3388 BUMP_MIB(&icmp_mib, icmpOutMsgs); 3389 put(q, ipsec_mp); 3390 } 3391 3392 /* 3393 * Determine if an ICMP error packet can be sent given the rate limit. 3394 * The limit consists of an average frequency (icmp_pkt_err_interval measured 3395 * in milliseconds) and a burst size. Burst size number of packets can 3396 * be sent arbitrarely closely spaced. 3397 * The state is tracked using two variables to implement an approximate 3398 * token bucket filter: 3399 * icmp_pkt_err_last - lbolt value when the last burst started 3400 * icmp_pkt_err_sent - number of packets sent in current burst 3401 */ 3402 boolean_t 3403 icmp_err_rate_limit(void) 3404 { 3405 clock_t now = TICK_TO_MSEC(lbolt); 3406 uint_t refilled; /* Number of packets refilled in tbf since last */ 3407 uint_t err_interval = ip_icmp_err_interval; /* Guard against changes */ 3408 3409 if (err_interval == 0) 3410 return (B_FALSE); 3411 3412 if (icmp_pkt_err_last > now) { 3413 /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */ 3414 icmp_pkt_err_last = 0; 3415 icmp_pkt_err_sent = 0; 3416 } 3417 /* 3418 * If we are in a burst update the token bucket filter. 3419 * Update the "last" time to be close to "now" but make sure 3420 * we don't loose precision. 
3421 */ 3422 if (icmp_pkt_err_sent != 0) { 3423 refilled = (now - icmp_pkt_err_last)/err_interval; 3424 if (refilled > icmp_pkt_err_sent) { 3425 icmp_pkt_err_sent = 0; 3426 } else { 3427 icmp_pkt_err_sent -= refilled; 3428 icmp_pkt_err_last += refilled * err_interval; 3429 } 3430 } 3431 if (icmp_pkt_err_sent == 0) { 3432 /* Start of new burst */ 3433 icmp_pkt_err_last = now; 3434 } 3435 if (icmp_pkt_err_sent < ip_icmp_err_burst) { 3436 icmp_pkt_err_sent++; 3437 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n", 3438 icmp_pkt_err_sent)); 3439 return (B_FALSE); 3440 } 3441 ip1dbg(("icmp_err_rate_limit: dropped\n")); 3442 return (B_TRUE); 3443 } 3444 3445 /* 3446 * Check if it is ok to send an IPv4 ICMP error packet in 3447 * response to the IPv4 packet in mp. 3448 * Free the message and return null if no 3449 * ICMP error packet should be sent. 3450 */ 3451 static mblk_t * 3452 icmp_pkt_err_ok(mblk_t *mp) 3453 { 3454 icmph_t *icmph; 3455 ipha_t *ipha; 3456 uint_t len_needed; 3457 ire_t *src_ire; 3458 ire_t *dst_ire; 3459 3460 if (!mp) 3461 return (NULL); 3462 ipha = (ipha_t *)mp->b_rptr; 3463 if (ip_csum_hdr(ipha)) { 3464 BUMP_MIB(&ip_mib, ipInCksumErrs); 3465 freemsg(mp); 3466 return (NULL); 3467 } 3468 src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, 3469 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 3470 dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, 3471 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 3472 if (src_ire != NULL || dst_ire != NULL || 3473 CLASSD(ipha->ipha_dst) || 3474 CLASSD(ipha->ipha_src) || 3475 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { 3476 /* Note: only errors to the fragment with offset 0 */ 3477 BUMP_MIB(&icmp_mib, icmpOutDrops); 3478 freemsg(mp); 3479 if (src_ire != NULL) 3480 ire_refrele(src_ire); 3481 if (dst_ire != NULL) 3482 ire_refrele(dst_ire); 3483 return (NULL); 3484 } 3485 if (ipha->ipha_protocol == IPPROTO_ICMP) { 3486 /* 3487 * Check the ICMP type. 
RFC 1122 sez: don't send ICMP 3488 * errors in response to any ICMP errors. 3489 */ 3490 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE; 3491 if (mp->b_wptr - mp->b_rptr < len_needed) { 3492 if (!pullupmsg(mp, len_needed)) { 3493 BUMP_MIB(&icmp_mib, icmpInErrors); 3494 freemsg(mp); 3495 return (NULL); 3496 } 3497 ipha = (ipha_t *)mp->b_rptr; 3498 } 3499 icmph = (icmph_t *) 3500 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]); 3501 switch (icmph->icmph_type) { 3502 case ICMP_DEST_UNREACHABLE: 3503 case ICMP_SOURCE_QUENCH: 3504 case ICMP_TIME_EXCEEDED: 3505 case ICMP_PARAM_PROBLEM: 3506 case ICMP_REDIRECT: 3507 BUMP_MIB(&icmp_mib, icmpOutDrops); 3508 freemsg(mp); 3509 return (NULL); 3510 default: 3511 break; 3512 } 3513 } 3514 /* 3515 * If this is a labeled system, then check to see if we're allowed to 3516 * send a response to this particular sender. If not, then just drop. 3517 */ 3518 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 3519 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); 3520 BUMP_MIB(&icmp_mib, icmpOutDrops); 3521 freemsg(mp); 3522 return (NULL); 3523 } 3524 if (icmp_err_rate_limit()) { 3525 /* 3526 * Only send ICMP error packets every so often. 3527 * This should be done on a per port/source basis, 3528 * but for now this will suffice. 3529 */ 3530 freemsg(mp); 3531 return (NULL); 3532 } 3533 return (mp); 3534 } 3535 3536 /* 3537 * Generate an ICMP redirect message. 3538 */ 3539 static void 3540 icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway) 3541 { 3542 icmph_t icmph; 3543 3544 /* 3545 * We are called from ip_rput where we could 3546 * not have attached an IPSEC_IN. 
3547 */ 3548 ASSERT(mp->b_datap->db_type == M_DATA); 3549 3550 if (!(mp = icmp_pkt_err_ok(mp))) { 3551 return; 3552 } 3553 3554 bzero(&icmph, sizeof (icmph_t)); 3555 icmph.icmph_type = ICMP_REDIRECT; 3556 icmph.icmph_code = 1; 3557 icmph.icmph_rd_gateway = gateway; 3558 BUMP_MIB(&icmp_mib, icmpOutRedirects); 3559 icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE); 3560 } 3561 3562 /* 3563 * Generate an ICMP time exceeded message. 3564 */ 3565 void 3566 icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code) 3567 { 3568 icmph_t icmph; 3569 boolean_t mctl_present; 3570 mblk_t *first_mp; 3571 3572 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3573 3574 if (!(mp = icmp_pkt_err_ok(mp))) { 3575 if (mctl_present) 3576 freeb(first_mp); 3577 return; 3578 } 3579 3580 bzero(&icmph, sizeof (icmph_t)); 3581 icmph.icmph_type = ICMP_TIME_EXCEEDED; 3582 icmph.icmph_code = code; 3583 BUMP_MIB(&icmp_mib, icmpOutTimeExcds); 3584 icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present); 3585 } 3586 3587 /* 3588 * Generate an ICMP unreachable message. 3589 */ 3590 void 3591 icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code) 3592 { 3593 icmph_t icmph; 3594 mblk_t *first_mp; 3595 boolean_t mctl_present; 3596 3597 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 3598 3599 if (!(mp = icmp_pkt_err_ok(mp))) { 3600 if (mctl_present) 3601 freeb(first_mp); 3602 return; 3603 } 3604 3605 bzero(&icmph, sizeof (icmph_t)); 3606 icmph.icmph_type = ICMP_DEST_UNREACHABLE; 3607 icmph.icmph_code = code; 3608 BUMP_MIB(&icmp_mib, icmpOutDestUnreachs); 3609 ip2dbg(("send icmp destination unreachable code %d\n", code)); 3610 icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present); 3611 } 3612 3613 /* 3614 * News from ARP. ARP sends notification of interesting events down 3615 * to its clients using M_CTL messages with the interesting ARP packet 3616 * attached via b_cont. 
3617 * The interesting event from a device comes up the corresponding ARP-IP-DEV 3618 * queue as opposed to ARP sending the message to all the clients, i.e. all 3619 * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache 3620 * table if a cache IRE is found to delete all the entries for the address in 3621 * the packet. 3622 */ 3623 static void 3624 ip_arp_news(queue_t *q, mblk_t *mp) 3625 { 3626 arcn_t *arcn; 3627 arh_t *arh; 3628 char *cp1; 3629 uchar_t *cp2; 3630 ire_t *ire = NULL; 3631 int i1; 3632 char hbuf[128]; 3633 char sbuf[16]; 3634 ipaddr_t src; 3635 in6_addr_t v6src; 3636 boolean_t isv6 = B_FALSE; 3637 3638 if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { 3639 if (q->q_next) { 3640 putnext(q, mp); 3641 } else 3642 freemsg(mp); 3643 return; 3644 } 3645 arh = (arh_t *)mp->b_cont->b_rptr; 3646 /* Is it one we are interested in? */ 3647 if (BE16_TO_U16(arh->arh_proto) == IP6_DL_SAP) { 3648 isv6 = B_TRUE; 3649 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, 3650 IPV6_ADDR_LEN); 3651 } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { 3652 bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, 3653 IP_ADDR_LEN); 3654 } else { 3655 freemsg(mp); 3656 return; 3657 } 3658 3659 arcn = (arcn_t *)mp->b_rptr; 3660 switch (arcn->arcn_code) { 3661 case AR_CN_BOGON: 3662 /* 3663 * Someone is sending ARP packets with a source protocol 3664 * address which we have published. Either they are 3665 * pretending to be us, or we have been asked to proxy 3666 * for a machine that can do fine for itself, or two 3667 * different machines are providing proxy service for the 3668 * same protocol address, or something. We try and do 3669 * something appropriate here. 
3670 */ 3671 cp2 = (uchar_t *)&arh[1]; 3672 cp1 = hbuf; 3673 *cp1 = '\0'; 3674 for (i1 = arh->arh_hlen; i1--; cp1 += 3) 3675 (void) sprintf(cp1, "%02x:", *cp2++ & 0xff); 3676 if (cp1 != hbuf) 3677 cp1[-1] = '\0'; 3678 (void) ip_dot_addr(src, sbuf); 3679 if (isv6) 3680 ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL); 3681 else 3682 ire = ire_cache_lookup(src, ALL_ZONES, NULL); 3683 3684 if (ire != NULL && IRE_IS_LOCAL(ire)) { 3685 cmn_err(CE_WARN, 3686 "IP: Hardware address '%s' trying" 3687 " to be our address %s!", 3688 hbuf, sbuf); 3689 } else { 3690 cmn_err(CE_WARN, 3691 "IP: Proxy ARP problem? " 3692 "Hardware address '%s' thinks it is %s", 3693 hbuf, sbuf); 3694 } 3695 if (ire != NULL) 3696 ire_refrele(ire); 3697 break; 3698 case AR_CN_ANNOUNCE: 3699 if (isv6) { 3700 /* 3701 * For XRESOLV interfaces. 3702 * Delete the IRE cache entry and NCE for this 3703 * v6 address 3704 */ 3705 ip_ire_clookup_and_delete_v6(&v6src); 3706 /* 3707 * If v6src is a non-zero, it's a router address 3708 * as below. Do the same sort of thing to clean 3709 * out off-net IRE_CACHE entries that go through 3710 * the router. 3711 */ 3712 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3713 ire_walk_v6(ire_delete_cache_gw_v6, 3714 (char *)&v6src, ALL_ZONES); 3715 } 3716 break; 3717 } 3718 /* 3719 * ARP gives us a copy of any broadcast packet with identical 3720 * sender and receiver protocol address, in 3721 * case we want to intuit something from it. Such a packet 3722 * usually means that a machine has just come up on the net. 3723 * If we have an IRE_CACHE, we blow it away. This way we will 3724 * immediately pick up the rare case of a host changing 3725 * hardware address. ip_ire_clookup_and_delete achieves this. 3726 * 3727 * The address in "src" may be an entry for a router. 3728 * (Default router, or non-default router.) If 3729 * that's true, then any off-net IRE_CACHE entries 3730 * that go through the router with address "src" 3731 * must be clobbered. 
Use ire_walk to achieve this 3732 * goal. 3733 * 3734 * It should be possible to determine if the address 3735 * in src is or is not for a router. This way, 3736 * the ire_walk() isn't called all of the time here. 3737 * Do not pass 'src' value of 0 to ire_delete_cache_gw, 3738 * as it would remove all IRE_CACHE entries for onlink 3739 * destinations. All onlink destinations have 3740 * ire_gateway_addr == 0. 3741 */ 3742 if ((ip_ire_clookup_and_delete(src, NULL) || 3743 (ire = ire_ftable_lookup(src, 0, 0, 0, NULL, NULL, NULL, 3744 0, NULL, MATCH_IRE_DSTONLY)) != NULL) && src != 0) { 3745 ire_walk_v4(ire_delete_cache_gw, (char *)&src, 3746 ALL_ZONES); 3747 } 3748 /* From ire_ftable_lookup */ 3749 if (ire != NULL) 3750 ire_refrele(ire); 3751 break; 3752 default: 3753 if (ire != NULL) 3754 ire_refrele(ire); 3755 break; 3756 } 3757 freemsg(mp); 3758 } 3759 3760 /* 3761 * Create a mblk suitable for carrying the interface index and/or source link 3762 * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used 3763 * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user 3764 * application. 
3765 */ 3766 mblk_t * 3767 ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags) 3768 { 3769 mblk_t *mp; 3770 in_pktinfo_t *pinfo; 3771 ipha_t *ipha; 3772 struct ether_header *pether; 3773 3774 mp = allocb(sizeof (in_pktinfo_t), BPRI_MED); 3775 if (mp == NULL) { 3776 ip1dbg(("ip_add_info: allocation failure.\n")); 3777 return (data_mp); 3778 } 3779 3780 ipha = (ipha_t *)data_mp->b_rptr; 3781 pinfo = (in_pktinfo_t *)mp->b_rptr; 3782 bzero(pinfo, sizeof (in_pktinfo_t)); 3783 pinfo->in_pkt_flags = (uchar_t)flags; 3784 pinfo->in_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ 3785 3786 if (flags & IPF_RECVIF) 3787 pinfo->in_pkt_ifindex = ill->ill_phyint->phyint_ifindex; 3788 3789 pether = (struct ether_header *)((char *)ipha 3790 - sizeof (struct ether_header)); 3791 /* 3792 * Make sure the interface is an ethernet type, since this option 3793 * is currently supported only on this type of interface. Also make 3794 * sure we are pointing correctly above db_base. 3795 */ 3796 3797 if ((flags & IPF_RECVSLLA) && 3798 ((uchar_t *)pether >= data_mp->b_datap->db_base) && 3799 (ill->ill_type == IFT_ETHER) && 3800 (ill->ill_net_type == IRE_IF_RESOLVER)) { 3801 3802 pinfo->in_pkt_slla.sdl_type = IFT_ETHER; 3803 bcopy((uchar_t *)pether->ether_shost.ether_addr_octet, 3804 (uchar_t *)pinfo->in_pkt_slla.sdl_data, ETHERADDRL); 3805 } else { 3806 /* 3807 * Clear the bit. Indicate to upper layer that IP is not 3808 * sending this ancillary info. 3809 */ 3810 pinfo->in_pkt_flags = pinfo->in_pkt_flags & ~IPF_RECVSLLA; 3811 } 3812 3813 mp->b_datap->db_type = M_CTL; 3814 mp->b_wptr += sizeof (in_pktinfo_t); 3815 mp->b_cont = data_mp; 3816 3817 return (mp); 3818 } 3819 3820 /* 3821 * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as 3822 * part of the bind request. 
3823 */ 3824 3825 boolean_t 3826 ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) 3827 { 3828 ipsec_in_t *ii; 3829 3830 ASSERT(policy_mp != NULL); 3831 ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); 3832 3833 ii = (ipsec_in_t *)policy_mp->b_rptr; 3834 ASSERT(ii->ipsec_in_type == IPSEC_IN); 3835 3836 connp->conn_policy = ii->ipsec_in_policy; 3837 ii->ipsec_in_policy = NULL; 3838 3839 if (ii->ipsec_in_action != NULL) { 3840 if (connp->conn_latch == NULL) { 3841 connp->conn_latch = iplatch_create(); 3842 if (connp->conn_latch == NULL) 3843 return (B_FALSE); 3844 } 3845 ipsec_latch_inbound(connp->conn_latch, ii); 3846 } 3847 return (B_TRUE); 3848 } 3849 3850 /* 3851 * Upper level protocols (ULP) pass through bind requests to IP for inspection 3852 * and to arrange for power-fanout assist. The ULP is identified by 3853 * adding a single byte at the end of the original bind message. 3854 * A ULP other than UDP or TCP that wishes to be recognized passes 3855 * down a bind with a zero length address. 3856 * 3857 * The binding works as follows: 3858 * - A zero byte address means just bind to the protocol. 3859 * - A four byte address is treated as a request to validate 3860 * that the address is a valid local address, appropriate for 3861 * an application to bind to. This does not affect any fanout 3862 * information in IP. 3863 * - A sizeof sin_t byte address is used to bind to only the local address 3864 * and port. 3865 * - A sizeof ipa_conn_t byte address contains complete fanout information 3866 * consisting of local and remote addresses and ports. In 3867 * this case, the addresses are both validated as appropriate 3868 * for this operation, and, if so, the information is retained 3869 * for use in the inbound fanout. 3870 * 3871 * The ULP (except in the zero-length bind) can append an 3872 * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the 3873 * T_BIND_REQ/O_T_BIND_REQ. 
IRE_DB_REQ_TYPE indicates that the ULP wants 3874 * a copy of the source or destination IRE (source for local bind; 3875 * destination for complete bind). IPSEC_POLICY_SET indicates that the 3876 * policy information contained should be copied on to the conn. 3877 * 3878 * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. 3879 */ 3880 mblk_t * 3881 ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) 3882 { 3883 ssize_t len; 3884 struct T_bind_req *tbr; 3885 sin_t *sin; 3886 ipa_conn_t *ac; 3887 uchar_t *ucp; 3888 mblk_t *mp1; 3889 boolean_t ire_requested; 3890 boolean_t ipsec_policy_set = B_FALSE; 3891 int error = 0; 3892 int protocol; 3893 ipa_conn_x_t *acx; 3894 3895 ASSERT(!connp->conn_af_isv6); 3896 connp->conn_pkt_isv6 = B_FALSE; 3897 3898 len = MBLKL(mp); 3899 if (len < (sizeof (*tbr) + 1)) { 3900 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 3901 "ip_bind: bogus msg, len %ld", len); 3902 /* XXX: Need to return something better */ 3903 goto bad_addr; 3904 } 3905 /* Back up and extract the protocol identifier. */ 3906 mp->b_wptr--; 3907 protocol = *mp->b_wptr & 0xFF; 3908 tbr = (struct T_bind_req *)mp->b_rptr; 3909 /* Reset the message type in preparation for shipping it back. */ 3910 DB_TYPE(mp) = M_PCPROTO; 3911 3912 connp->conn_ulp = (uint8_t)protocol; 3913 3914 /* 3915 * Check for a zero length address. This is from a protocol that 3916 * wants to register to receive all packets of its type. 3917 */ 3918 if (tbr->ADDR_length == 0) { 3919 /* 3920 * These protocols are now intercepted in ip_bind_v6(). 3921 * Reject protocol-level binds here for now. 3922 * 3923 * For SCTP raw socket, ICMP sends down a bind with sin_t 3924 * so that the protocol type cannot be SCTP. 
3925 */ 3926 if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || 3927 protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { 3928 goto bad_addr; 3929 } 3930 3931 /* 3932 * 3933 * The udp module never sends down a zero-length address, 3934 * and allowing this on a labeled system will break MLP 3935 * functionality. 3936 */ 3937 if (is_system_labeled() && protocol == IPPROTO_UDP) 3938 goto bad_addr; 3939 3940 if (connp->conn_mac_exempt) 3941 goto bad_addr; 3942 3943 /* No hash here really. The table is big enough. */ 3944 connp->conn_srcv6 = ipv6_all_zeros; 3945 3946 ipcl_proto_insert(connp, protocol); 3947 3948 tbr->PRIM_type = T_BIND_ACK; 3949 return (mp); 3950 } 3951 3952 /* Extract the address pointer from the message. */ 3953 ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, 3954 tbr->ADDR_length); 3955 if (ucp == NULL) { 3956 ip1dbg(("ip_bind: no address\n")); 3957 goto bad_addr; 3958 } 3959 if (!OK_32PTR(ucp)) { 3960 ip1dbg(("ip_bind: unaligned address\n")); 3961 goto bad_addr; 3962 } 3963 /* 3964 * Check for trailing mps. 3965 */ 3966 3967 mp1 = mp->b_cont; 3968 ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); 3969 ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); 3970 3971 switch (tbr->ADDR_length) { 3972 default: 3973 ip1dbg(("ip_bind: bad address length %d\n", 3974 (int)tbr->ADDR_length)); 3975 goto bad_addr; 3976 3977 case IP_ADDR_LEN: 3978 /* Verification of local address only */ 3979 error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0, 3980 ire_requested, ipsec_policy_set, B_FALSE); 3981 break; 3982 3983 case sizeof (sin_t): 3984 sin = (sin_t *)ucp; 3985 error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, 3986 sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); 3987 if (protocol == IPPROTO_TCP) 3988 connp->conn_recv = tcp_conn_request; 3989 break; 3990 3991 case sizeof (ipa_conn_t): 3992 ac = (ipa_conn_t *)ucp; 3993 /* For raw socket, the local port is not set. 
*/ 3994 if (ac->ac_lport == 0) 3995 ac->ac_lport = connp->conn_lport; 3996 /* Always verify destination reachability. */ 3997 error = ip_bind_connected(connp, mp, &ac->ac_laddr, 3998 ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested, 3999 ipsec_policy_set, B_TRUE, B_TRUE); 4000 if (protocol == IPPROTO_TCP) 4001 connp->conn_recv = tcp_input; 4002 break; 4003 4004 case sizeof (ipa_conn_x_t): 4005 acx = (ipa_conn_x_t *)ucp; 4006 /* 4007 * Whether or not to verify destination reachability depends 4008 * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. 4009 */ 4010 error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr, 4011 acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr, 4012 acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set, 4013 B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0); 4014 if (protocol == IPPROTO_TCP) 4015 connp->conn_recv = tcp_input; 4016 break; 4017 } 4018 if (error == EINPROGRESS) 4019 return (NULL); 4020 else if (error != 0) 4021 goto bad_addr; 4022 /* 4023 * Pass the IPSEC headers size in ire_ipsec_overhead. 4024 * We can't do this in ip_bind_insert_ire because the policy 4025 * may not have been inherited at that point in time and hence 4026 * conn_out_enforce_policy may not be set. 4027 */ 4028 mp1 = mp->b_cont; 4029 if (ire_requested && connp->conn_out_enforce_policy && 4030 mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) { 4031 ire_t *ire = (ire_t *)mp1->b_rptr; 4032 ASSERT(MBLKL(mp1) >= sizeof (ire_t)); 4033 ire->ire_ipsec_overhead = conn_ipsec_length(connp); 4034 } 4035 4036 /* Send it home. */ 4037 mp->b_datap->db_type = M_PCPROTO; 4038 tbr->PRIM_type = T_BIND_ACK; 4039 return (mp); 4040 4041 bad_addr: 4042 /* 4043 * If error = -1 then we generate a TBADADDR - otherwise error is 4044 * a unix errno. 
4045 */ 4046 if (error > 0) 4047 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 4048 else 4049 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 4050 return (mp); 4051 } 4052 4053 /* 4054 * Here address is verified to be a valid local address. 4055 * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast 4056 * address is also considered a valid local address. 4057 * In the case of a broadcast/multicast address, however, the 4058 * upper protocol is expected to reset the src address 4059 * to 0 if it sees a IRE_BROADCAST type returned so that 4060 * no packets are emitted with broadcast/multicast address as 4061 * source address (that violates hosts requirements RFC1122) 4062 * The addresses valid for bind are: 4063 * (1) - INADDR_ANY (0) 4064 * (2) - IP address of an UP interface 4065 * (3) - IP address of a DOWN interface 4066 * (4) - valid local IP broadcast addresses. In this case 4067 * the conn will only receive packets destined to 4068 * the specified broadcast address. 4069 * (5) - a multicast address. In this case 4070 * the conn will only receive packets destined to 4071 * the specified multicast address. Note: the 4072 * application still has to issue an 4073 * IP_ADD_MEMBERSHIP socket option. 4074 * 4075 * On error, return -1 for TBADADDR otherwise pass the 4076 * errno with TSYSERR reply. 4077 * 4078 * In all the above cases, the bound address must be valid in the current zone. 4079 * When the address is loopback, multicast or broadcast, there might be many 4080 * matching IREs so bind has to look up based on the zone. 4081 * 4082 * Note: lport is in network byte order. 
4083 */ 4084 int 4085 ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, 4086 boolean_t ire_requested, boolean_t ipsec_policy_set, 4087 boolean_t fanout_insert) 4088 { 4089 int error = 0; 4090 ire_t *src_ire; 4091 mblk_t *policy_mp; 4092 ipif_t *ipif; 4093 zoneid_t zoneid; 4094 4095 if (ipsec_policy_set) { 4096 policy_mp = mp->b_cont; 4097 } 4098 4099 /* 4100 * If it was previously connected, conn_fully_bound would have 4101 * been set. 4102 */ 4103 connp->conn_fully_bound = B_FALSE; 4104 4105 src_ire = NULL; 4106 ipif = NULL; 4107 4108 zoneid = connp->conn_zoneid; 4109 4110 if (src_addr) { 4111 src_ire = ire_route_lookup(src_addr, 0, 0, 0, 4112 NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY); 4113 /* 4114 * If an address other than 0.0.0.0 is requested, 4115 * we verify that it is a valid address for bind 4116 * Note: Following code is in if-else-if form for 4117 * readability compared to a condition check. 4118 */ 4119 /* LINTED - statement has no consequent */ 4120 if (IRE_IS_LOCAL(src_ire)) { 4121 /* 4122 * (2) Bind to address of local UP interface 4123 */ 4124 } else if (src_ire && src_ire->ire_type == IRE_BROADCAST) { 4125 /* 4126 * (4) Bind to broadcast address 4127 * Note: permitted only from transports that 4128 * request IRE 4129 */ 4130 if (!ire_requested) 4131 error = EADDRNOTAVAIL; 4132 } else { 4133 /* 4134 * (3) Bind to address of local DOWN interface 4135 * (ipif_lookup_addr() looks up all interfaces 4136 * but we do not get here for UP interfaces 4137 * - case (2) above) 4138 * We put the protocol byte back into the mblk 4139 * since we may come back via ip_wput_nondata() 4140 * later with this mblk if ipif_lookup_addr chooses 4141 * to defer processing. 
4142 */ 4143 *mp->b_wptr++ = (char)connp->conn_ulp; 4144 if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid, 4145 CONNP_TO_WQ(connp), mp, ip_wput_nondata, 4146 &error)) != NULL) { 4147 ipif_refrele(ipif); 4148 } else if (error == EINPROGRESS) { 4149 if (src_ire != NULL) 4150 ire_refrele(src_ire); 4151 return (EINPROGRESS); 4152 } else if (CLASSD(src_addr)) { 4153 error = 0; 4154 if (src_ire != NULL) 4155 ire_refrele(src_ire); 4156 /* 4157 * (5) bind to multicast address. 4158 * Fake out the IRE returned to upper 4159 * layer to be a broadcast IRE. 4160 */ 4161 src_ire = ire_ctable_lookup( 4162 INADDR_BROADCAST, INADDR_ANY, 4163 IRE_BROADCAST, NULL, zoneid, NULL, 4164 (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY)); 4165 if (src_ire == NULL || !ire_requested) 4166 error = EADDRNOTAVAIL; 4167 } else { 4168 /* 4169 * Not a valid address for bind 4170 */ 4171 error = EADDRNOTAVAIL; 4172 } 4173 /* 4174 * Just to keep it consistent with the processing in 4175 * ip_bind_v4() 4176 */ 4177 mp->b_wptr--; 4178 } 4179 if (error) { 4180 /* Red Alert! Attempting to be a bogon! */ 4181 ip1dbg(("ip_bind: bad src address 0x%x\n", 4182 ntohl(src_addr))); 4183 goto bad_addr; 4184 } 4185 } 4186 4187 /* 4188 * Allow setting new policies. For example, disconnects come 4189 * down as ipa_t bind. As we would have set conn_policy_cached 4190 * to B_TRUE before, we should set it to B_FALSE, so that policy 4191 * can change after the disconnect. 4192 */ 4193 connp->conn_policy_cached = B_FALSE; 4194 4195 /* 4196 * If not fanout_insert this was just an address verification 4197 */ 4198 if (fanout_insert) { 4199 /* 4200 * The addresses have been verified. Time to insert in 4201 * the correct fanout list. 
4202 */ 4203 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4204 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6); 4205 connp->conn_lport = lport; 4206 connp->conn_fport = 0; 4207 /* 4208 * Do we need to add a check to reject Multicast packets 4209 */ 4210 error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport); 4211 } 4212 4213 if (error == 0) { 4214 if (ire_requested) { 4215 if (!ip_bind_insert_ire(mp, src_ire, NULL)) { 4216 error = -1; 4217 /* Falls through to bad_addr */ 4218 } 4219 } else if (ipsec_policy_set) { 4220 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4221 error = -1; 4222 /* Falls through to bad_addr */ 4223 } 4224 } 4225 } 4226 bad_addr: 4227 if (error != 0) { 4228 if (connp->conn_anon_port) { 4229 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 4230 connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), 4231 B_FALSE); 4232 } 4233 connp->conn_mlp_type = mlptSingle; 4234 } 4235 if (src_ire != NULL) 4236 IRE_REFRELE(src_ire); 4237 if (ipsec_policy_set) { 4238 ASSERT(policy_mp == mp->b_cont); 4239 ASSERT(policy_mp != NULL); 4240 freeb(policy_mp); 4241 /* 4242 * As of now assume that nothing else accompanies 4243 * IPSEC_POLICY_SET. 4244 */ 4245 mp->b_cont = NULL; 4246 } 4247 return (error); 4248 } 4249 4250 /* 4251 * Verify that both the source and destination addresses 4252 * are valid. If verify_dst is false, then the destination address may be 4253 * unreachable, i.e. have no route to it. Protocols like TCP want to verify 4254 * destination reachability, while tunnels do not. 4255 * Note that we allow connect to broadcast and multicast 4256 * addresses when ire_requested is set. Thus the ULP 4257 * has to check for IRE_BROADCAST and multicast. 4258 * 4259 * Returns zero if ok. 4260 * On error: returns -1 to mean TBADADDR otherwise returns an errno 4261 * (for use with TSYSERR reply). 4262 * 4263 * Note: lport and fport are in network byte order. 
4264 */ 4265 int 4266 ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, 4267 uint16_t lport, ipaddr_t dst_addr, uint16_t fport, 4268 boolean_t ire_requested, boolean_t ipsec_policy_set, 4269 boolean_t fanout_insert, boolean_t verify_dst) 4270 { 4271 ire_t *src_ire; 4272 ire_t *dst_ire; 4273 int error = 0; 4274 int protocol; 4275 mblk_t *policy_mp; 4276 ire_t *sire = NULL; 4277 ire_t *md_dst_ire = NULL; 4278 ill_t *md_ill = NULL; 4279 zoneid_t zoneid; 4280 ipaddr_t src_addr = *src_addrp; 4281 4282 src_ire = dst_ire = NULL; 4283 protocol = *mp->b_wptr & 0xFF; 4284 4285 /* 4286 * If we never got a disconnect before, clear it now. 4287 */ 4288 connp->conn_fully_bound = B_FALSE; 4289 4290 if (ipsec_policy_set) { 4291 policy_mp = mp->b_cont; 4292 } 4293 4294 zoneid = connp->conn_zoneid; 4295 4296 if (CLASSD(dst_addr)) { 4297 /* Pick up an IRE_BROADCAST */ 4298 dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, 4299 NULL, zoneid, MBLK_GETLABEL(mp), 4300 (MATCH_IRE_RECURSIVE | 4301 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | 4302 MATCH_IRE_SECATTR)); 4303 } else { 4304 /* 4305 * If conn_dontroute is set or if conn_nexthop_set is set, 4306 * and onlink ipif is not found set ENETUNREACH error. 4307 */ 4308 if (connp->conn_dontroute || connp->conn_nexthop_set) { 4309 ipif_t *ipif; 4310 4311 ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? 
4312 dst_addr : connp->conn_nexthop_v4, zoneid); 4313 if (ipif == NULL) { 4314 error = ENETUNREACH; 4315 goto bad_addr; 4316 } 4317 ipif_refrele(ipif); 4318 } 4319 4320 if (connp->conn_nexthop_set) { 4321 dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, 4322 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp), 4323 MATCH_IRE_SECATTR); 4324 } else { 4325 dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, 4326 &sire, zoneid, MBLK_GETLABEL(mp), 4327 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4328 MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | 4329 MATCH_IRE_SECATTR)); 4330 } 4331 } 4332 /* 4333 * dst_ire can't be a broadcast when not ire_requested. 4334 * We also prevent ire's with src address INADDR_ANY to 4335 * be used, which are created temporarily for 4336 * sending out packets from endpoints that have 4337 * conn_unspec_src set. If verify_dst is true, the destination must be 4338 * reachable. If verify_dst is false, the destination needn't be 4339 * reachable. 4340 * 4341 * If we match on a reject or black hole, then we've got a 4342 * local failure. May as well fail out the connect() attempt, 4343 * since it's never going to succeed. 4344 */ 4345 if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || 4346 (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 4347 ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { 4348 /* 4349 * If we're verifying destination reachability, we always want 4350 * to complain here. 4351 * 4352 * If we're not verifying destination reachability but the 4353 * destination has a route, we still want to fail on the 4354 * temporary address and broadcast address tests. 
4355 */ 4356 if (verify_dst || (dst_ire != NULL)) { 4357 if (ip_debug > 2) { 4358 pr_addr_dbg("ip_bind_connected: bad connected " 4359 "dst %s\n", AF_INET, &dst_addr); 4360 } 4361 if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) 4362 error = ENETUNREACH; 4363 else 4364 error = EHOSTUNREACH; 4365 goto bad_addr; 4366 } 4367 } 4368 4369 /* 4370 * We now know that routing will allow us to reach the destination. 4371 * Check whether Trusted Solaris policy allows communication with this 4372 * host, and pretend that the destination is unreachable if not. 4373 * 4374 * This is never a problem for TCP, since that transport is known to 4375 * compute the label properly as part of the tcp_rput_other T_BIND_ACK 4376 * handling. If the remote is unreachable, it will be detected at that 4377 * point, so there's no reason to check it here. 4378 * 4379 * Note that for sendto (and other datagram-oriented friends), this 4380 * check is done as part of the data path label computation instead. 4381 * The check here is just to make non-TCP connect() report the right 4382 * error. 4383 */ 4384 if (dst_ire != NULL && is_system_labeled() && 4385 !IPCL_IS_TCP(connp) && 4386 tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst_addr, NULL, 4387 connp->conn_mac_exempt) != 0) { 4388 error = EHOSTUNREACH; 4389 if (ip_debug > 2) { 4390 pr_addr_dbg("ip_bind_connected: no label for dst %s\n", 4391 AF_INET, &dst_addr); 4392 } 4393 goto bad_addr; 4394 } 4395 4396 /* 4397 * If the app does a connect(), it means that it will most likely 4398 * send more than 1 packet to the destination. It makes sense 4399 * to clear the temporary flag. 
4400 */ 4401 if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && 4402 (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { 4403 irb_t *irb = dst_ire->ire_bucket; 4404 4405 rw_enter(&irb->irb_lock, RW_WRITER); 4406 dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; 4407 irb->irb_tmp_ire_cnt--; 4408 rw_exit(&irb->irb_lock); 4409 } 4410 4411 /* 4412 * See if we should notify ULP about MDT; we do this whether or not 4413 * ire_requested is TRUE, in order to handle active connects; MDT 4414 * eligibility tests for passive connects are handled separately 4415 * through tcp_adapt_ire(). We do this before the source address 4416 * selection, because dst_ire may change after a call to 4417 * ipif_select_source(). This is a best-effort check, as the 4418 * packet for this connection may not actually go through 4419 * dst_ire->ire_stq, and the exact IRE can only be known after 4420 * calling ip_newroute(). This is why we further check on the 4421 * IRE during Multidata packet transmission in tcp_multisend(). 4422 */ 4423 if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL && 4424 !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && 4425 (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && 4426 ILL_MDT_CAPABLE(md_ill)) { 4427 md_dst_ire = dst_ire; 4428 IRE_REFHOLD(md_dst_ire); 4429 } 4430 4431 if (dst_ire != NULL && 4432 dst_ire->ire_type == IRE_LOCAL && 4433 dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) { 4434 /* 4435 * If the IRE belongs to a different zone, look for a matching 4436 * route in the forwarding table and use the source address from 4437 * that route. 
4438 */ 4439 src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, 4440 zoneid, 0, NULL, 4441 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 4442 MATCH_IRE_RJ_BHOLE); 4443 if (src_ire == NULL) { 4444 error = EHOSTUNREACH; 4445 goto bad_addr; 4446 } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 4447 if (!(src_ire->ire_type & IRE_HOST)) 4448 error = ENETUNREACH; 4449 else 4450 error = EHOSTUNREACH; 4451 goto bad_addr; 4452 } 4453 if (src_addr == INADDR_ANY) 4454 src_addr = src_ire->ire_src_addr; 4455 ire_refrele(src_ire); 4456 src_ire = NULL; 4457 } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { 4458 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 4459 src_addr = sire->ire_src_addr; 4460 ire_refrele(dst_ire); 4461 dst_ire = sire; 4462 sire = NULL; 4463 } else { 4464 /* 4465 * Pick a source address so that a proper inbound 4466 * load spreading would happen. 4467 */ 4468 ill_t *dst_ill = dst_ire->ire_ipif->ipif_ill; 4469 ipif_t *src_ipif = NULL; 4470 ire_t *ipif_ire; 4471 4472 /* 4473 * Supply a local source address such that inbound 4474 * load spreading happens. 4475 * 4476 * Determine the best source address on this ill for 4477 * the destination. 4478 * 4479 * 1) For broadcast, we should return a broadcast ire 4480 * found above so that upper layers know that the 4481 * destination address is a broadcast address. 4482 * 4483 * 2) If this is part of a group, select a better 4484 * source address so that better inbound load 4485 * balancing happens. Do the same if the ipif 4486 * is DEPRECATED. 4487 * 4488 * 3) If the outgoing interface is part of a usesrc 4489 * group, then try selecting a source address from 4490 * the usesrc ILL. 
4491 */ 4492 if ((dst_ire->ire_zoneid != zoneid && 4493 dst_ire->ire_zoneid != ALL_ZONES) || 4494 (!(dst_ire->ire_type & IRE_BROADCAST) && 4495 ((dst_ill->ill_group != NULL) || 4496 (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 4497 (dst_ill->ill_usesrc_ifindex != 0)))) { 4498 /* 4499 * If the destination is reachable via a 4500 * given gateway, the selected source address 4501 * should be in the same subnet as the gateway. 4502 * Otherwise, the destination is not reachable. 4503 * 4504 * If there are no interfaces on the same subnet 4505 * as the destination, ipif_select_source gives 4506 * first non-deprecated interface which might be 4507 * on a different subnet than the gateway. 4508 * This is not desirable. Hence pass the dst_ire 4509 * source address to ipif_select_source. 4510 * It is sure that the destination is reachable 4511 * with the dst_ire source address subnet. 4512 * So passing dst_ire source address to 4513 * ipif_select_source will make sure that the 4514 * selected source will be on the same subnet 4515 * as dst_ire source address. 4516 */ 4517 ipaddr_t saddr = 4518 dst_ire->ire_ipif->ipif_src_addr; 4519 src_ipif = ipif_select_source(dst_ill, 4520 saddr, zoneid); 4521 if (src_ipif != NULL) { 4522 if (IS_VNI(src_ipif->ipif_ill)) { 4523 /* 4524 * For VNI there is no 4525 * interface route 4526 */ 4527 src_addr = 4528 src_ipif->ipif_src_addr; 4529 } else { 4530 ipif_ire = 4531 ipif_to_ire(src_ipif); 4532 if (ipif_ire != NULL) { 4533 IRE_REFRELE(dst_ire); 4534 dst_ire = ipif_ire; 4535 } 4536 src_addr = 4537 dst_ire->ire_src_addr; 4538 } 4539 ipif_refrele(src_ipif); 4540 } else { 4541 src_addr = dst_ire->ire_src_addr; 4542 } 4543 } else { 4544 src_addr = dst_ire->ire_src_addr; 4545 } 4546 } 4547 } 4548 4549 /* 4550 * We do ire_route_lookup() here (and not 4551 * interface lookup as we assert that 4552 * src_addr should only come from an 4553 * UP interface for hard binding. 
4554 */ 4555 ASSERT(src_ire == NULL); 4556 src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, 4557 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY); 4558 /* src_ire must be a local|loopback */ 4559 if (!IRE_IS_LOCAL(src_ire)) { 4560 if (ip_debug > 2) { 4561 pr_addr_dbg("ip_bind_connected: bad connected " 4562 "src %s\n", AF_INET, &src_addr); 4563 } 4564 error = EADDRNOTAVAIL; 4565 goto bad_addr; 4566 } 4567 4568 /* 4569 * If the source address is a loopback address, the 4570 * destination had best be local or multicast. 4571 * The transports that can't handle multicast will reject 4572 * those addresses. 4573 */ 4574 if (src_ire->ire_type == IRE_LOOPBACK && 4575 !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { 4576 ip1dbg(("ip_bind_connected: bad connected loopback\n")); 4577 error = -1; 4578 goto bad_addr; 4579 } 4580 4581 /* 4582 * Allow setting new policies. For example, disconnects come 4583 * down as ipa_t bind. As we would have set conn_policy_cached 4584 * to B_TRUE before, we should set it to B_FALSE, so that policy 4585 * can change after the disconnect. 4586 */ 4587 connp->conn_policy_cached = B_FALSE; 4588 4589 /* 4590 * Set the conn addresses/ports immediately, so the IPsec policy calls 4591 * can handle their passed-in conn's. 4592 */ 4593 4594 IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); 4595 IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); 4596 connp->conn_lport = lport; 4597 connp->conn_fport = fport; 4598 *src_addrp = src_addr; 4599 4600 ASSERT(!(ipsec_policy_set && ire_requested)); 4601 if (ire_requested) { 4602 iulp_t *ulp_info = NULL; 4603 4604 /* 4605 * Note that sire will not be NULL if this is an off-link 4606 * connection and there is not cache for that dest yet. 4607 * 4608 * XXX Because of an existing bug, if there are multiple 4609 * default routes, the IRE returned now may not be the actual 4610 * default route used (default routes are chosen in a 4611 * round robin fashion). 
So if the metrics for different 4612 * default routes are different, we may return the wrong 4613 * metrics. This will not be a problem if the existing 4614 * bug is fixed. 4615 */ 4616 if (sire != NULL) { 4617 ulp_info = &(sire->ire_uinfo); 4618 } 4619 if (!ip_bind_insert_ire(mp, dst_ire, ulp_info)) { 4620 error = -1; 4621 goto bad_addr; 4622 } 4623 } else if (ipsec_policy_set) { 4624 if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { 4625 error = -1; 4626 goto bad_addr; 4627 } 4628 } 4629 4630 /* 4631 * Cache IPsec policy in this conn. If we have per-socket policy, 4632 * we'll cache that. If we don't, we'll inherit global policy. 4633 * 4634 * We can't insert until the conn reflects the policy. Note that 4635 * conn_policy_cached is set by ipsec_conn_cache_policy() even for 4636 * connections where we don't have a policy. This is to prevent 4637 * global policy lookups in the inbound path. 4638 * 4639 * If we insert before we set conn_policy_cached, 4640 * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true 4641 * because global policy cound be non-empty. We normally call 4642 * ipsec_check_policy() for conn_policy_cached connections only if 4643 * ipc_in_enforce_policy is set. But in this case, 4644 * conn_policy_cached can get set anytime since we made the 4645 * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is 4646 * called, which will make the above assumption false. Thus, we 4647 * need to insert after we set conn_policy_cached. 4648 */ 4649 if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) 4650 goto bad_addr; 4651 4652 if (fanout_insert) { 4653 /* 4654 * The addresses have been verified. Time to insert in 4655 * the correct fanout list. 
		 */
		error = ipcl_conn_insert(connp, protocol, src_addr,
		    dst_addr, connp->conn_ports);
	}

	if (error == 0) {
		connp->conn_fully_bound = B_TRUE;
		/*
		 * Our initial checks for MDT have passed; the IRE is not
		 * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to
		 * be supporting MDT.  Pass the IRE, IPC and ILL into
		 * ip_mdinfo_return(), which performs further checks
		 * against them and upon success, returns the MDT info
		 * mblk which we will attach to the bind acknowledgment.
		 */
		if (md_dst_ire != NULL) {
			mblk_t *mdinfo_mp;

			ASSERT(md_ill != NULL);
			ASSERT(md_ill->ill_mdt_capab != NULL);
			if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
			    md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL)
				linkb(mp, mdinfo_mp);
		}
	}
bad_addr:
	if (ipsec_policy_set) {
		ASSERT(policy_mp == mp->b_cont);
		ASSERT(policy_mp != NULL);
		freeb(policy_mp);
		/*
		 * As of now assume that nothing else accompanies
		 * IPSEC_POLICY_SET.
		 */
		mp->b_cont = NULL;
	}
	if (src_ire != NULL)
		IRE_REFRELE(src_ire);
	if (dst_ire != NULL)
		IRE_REFRELE(dst_ire);
	if (sire != NULL)
		IRE_REFRELE(sire);
	if (md_dst_ire != NULL)
		IRE_REFRELE(md_dst_ire);
	return (error);
}

/*
 * Copy "ire" into the mblk hanging off mp->b_cont so that it can be
 * returned to the upper protocol with the bind acknowledgment.
 *
 * mp		- bind request; mp->b_cont must be non-NULL (asserted).
 * ire		- IRE to return, or NULL if none was found.
 * ulp_info	- optional metrics copied over the ire_uinfo of the returned
 *		  copy; may be NULL.
 *
 * Returns 0 (false) only when an IRE was supplied but the b_cont mblk does
 * not have room for a full ire_t; returns 1 (true) otherwise.  When "ire"
 * is NULL the b_cont mblk is unlinked from mp and freed.
 */
static boolean_t
ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info)
{
	mblk_t	*mp1;
	ire_t *ret_ire = NULL;

	mp1 = mp->b_cont;
	ASSERT(mp1 != NULL);

	if (ire != NULL) {
		/*
		 * mp1 initialized above to IRE_DB_REQ_TYPE
		 * appended mblk. It's <upper protocol>'s
		 * job to make sure there is room.
		 */
		if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t))
			return (0);

		/* Retype the mblk and overwrite its payload with the IRE. */
		mp1->b_datap->db_type = IRE_DB_TYPE;
		mp1->b_wptr = mp1->b_rptr + sizeof (ire_t);
		bcopy(ire, mp1->b_rptr, sizeof (ire_t));
		ret_ire = (ire_t *)mp1->b_rptr;
		/*
		 * Pass the latest setting of the ip_path_mtu_discovery and
		 * copy the ulp info if any.
		 */
		ret_ire->ire_frag_flag |= (ip_path_mtu_discovery) ?
		    IPH_DF : 0;
		if (ulp_info != NULL) {
			bcopy(ulp_info, &(ret_ire->ire_uinfo),
			    sizeof (iulp_t));
		}
		/* The copy lives in mp1; record that in the copy itself. */
		ret_ire->ire_mp = mp1;
	} else {
		/*
		 * No IRE was found. Remove IRE mblk.
		 */
		mp->b_cont = mp1->b_cont;
		freeb(mp1);
	}

	return (1);
}

/*
 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
 * the final piece where we don't.  Return a pointer to the first mblk in the
 * result, and update the pointer to the next mblk to chew on.
 *
 * Failure handling, as implemented below:
 *  - len == 0, mpp == NULL or *mpp == NULL: return NULL, nothing touched.
 *  - dupb() fails while splitting the FIRST mblk: return NULL; the chain is
 *    NOT freed and *mpp is left unchanged.
 *  - dupb() fails while splitting a later mblk, pullupmsg() fails, or the
 *    chain holds fewer than "len" bytes: everything is freed, *mpp is set
 *    to NULL and NULL is returned.
 *
 * Note that when only part of the first mblk is carved off, *mpp is not
 * rewritten; it still points at the first mblk, whose b_rptr has been
 * advanced past the carved bytes.
 */
mblk_t *
ip_carve_mp(mblk_t **mpp, ssize_t len)
{
	mblk_t	*mp0;
	mblk_t	*mp1;
	mblk_t	*mp2;

	if (!len || !mpp || !(mp0 = *mpp))
		return (NULL);
	/* If we aren't going to consume the first mblk, we need a dup. */
	if (mp0->b_wptr - mp0->b_rptr > len) {
		mp1 = dupb(mp0);
		if (mp1) {
			/* Partition the data between the two mblks. */
			mp1->b_wptr = mp1->b_rptr + len;
			mp0->b_rptr = mp1->b_wptr;
			/*
			 * after adjustments if mblk not consumed is now
			 * unaligned, try to align it. If this fails free
			 * all messages and let upper layer recover.
			 */
			if (!OK_32PTR(mp0->b_rptr)) {
				if (!pullupmsg(mp0, -1)) {
					freemsg(mp0);
					freemsg(mp1);
					*mpp = NULL;
					return (NULL);
				}
			}
		}
		/* mp1 is NULL here if dupb() failed; see header comment. */
		return (mp1);
	}
	/* Eat through as many mblks as we need to get len bytes. */
	len -= mp0->b_wptr - mp0->b_rptr;
	/* mp2 walks the chain; mp1 trails one mblk behind it. */
	for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
		if (mp2->b_wptr - mp2->b_rptr > len) {
			/*
			 * We won't consume the entire last mblk.  Like
			 * above, dup and partition it.
			 */
			mp1->b_cont = dupb(mp2);
			mp1 = mp1->b_cont;
			if (!mp1) {
				/*
				 * Trouble.  Rather than go to a lot of
				 * trouble to clean up, we free the messages.
				 * This won't be any worse than losing it on
				 * the wire.
				 */
				freemsg(mp0);
				freemsg(mp2);
				*mpp = NULL;
				return (NULL);
			}
			mp1->b_wptr = mp1->b_rptr + len;
			mp2->b_rptr = mp1->b_wptr;
			/*
			 * after adjustments if mblk not consumed is now
			 * unaligned, try to align it. If this fails free
			 * all messages and let upper layer recover.
			 */
			if (!OK_32PTR(mp2->b_rptr)) {
				if (!pullupmsg(mp2, -1)) {
					freemsg(mp0);
					freemsg(mp2);
					*mpp = NULL;
					return (NULL);
				}
			}
			*mpp = mp2;
			return (mp0);
		}
		/* Decrement len by the amount we just got. */
		len -= mp2->b_wptr - mp2->b_rptr;
	}
	/*
	 * len should be reduced to zero now.  If not our caller has
	 * screwed up.
	 */
	if (len) {
		/* Shouldn't happen! */
		freemsg(mp0);
		*mpp = NULL;
		return (NULL);
	}
	/*
	 * We consumed up to exactly the end of an mblk.  Detach the part
	 * we are returning from the rest of the chain.
	 */
	mp1->b_cont = NULL;
	*mpp = mp2;
	return (mp0);
}

/*
 * The ill stream is being unplumbed.  Called from ip_close.
 *
 * Enters the ipsq, marks the ill and all of its ipifs CONDEMNED, stops
 * fragment reassembly, tears the ill down (ill_delete / ill_delete_tail),
 * waits for it to become quiescent, and finally frees the per-instance data
 * and leaves the ipsq.  Always returns 0.
 */
int
ip_modclose(ill_t *ill)
{

	boolean_t success;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	queue_t	*q = ill->ill_rq;

	/*
	 * Forcibly enter the ipsq after some delay. This is to take
	 * care of the case when some ioctl does not complete because
	 * we sent a control message to the driver and it did not
	 * send us a reply. We want to be able to at least unplumb
	 * and replumb rather than force the user to reboot the system.
	 */
	success = ipsq_enter(ill, B_FALSE);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill.
	 * and cause ipsq_enter to return failure.
	 */
	ASSERT(success);
	/* Saved now: ill is freed below, before ipsq_exit() is called. */
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns error if ILL_CONDEMNED is set
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * If MOVE was in progress, clear the
	 * move_in_progress fields also.
	 */
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	}

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_quiescent checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away. Then we call ill_delete_tail which does the
	 * DL_UNBIND and DL_DETACH with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	while (!ill_is_quiescent(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/* qprocsoff is called in ill_delete_tail */
	ill_delete_tail(ill);

	/*
	 * Walk through all upper (conn) streams and qenable
	 * those that have queued data.
	 * close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	conn_walk_drain();

	mutex_enter(&ip_mi_lock);
	mi_close_unlink(&ip_g_head, (IDP)ill);
	mutex_exit(&ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	ipsq_exit(ipsq, B_TRUE, B_TRUE);

	return (0);
}

/*
 * This is called as part of close() for both IP and UDP
 * in order to quiesce the conn.  Must not be called for a TCP conn
 * (asserted).  On return the conn is marked CONN_CLOSING, CONN_CONDEMNED
 * and CONN_QUIESCED, and conn_ref has dropped to 1.
 */
void
ip_quiesce_conn(conn_t *connp)
{
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;

	ASSERT(!IPCL_IS_TCP(connp));

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 * Similarly ill_pending_mp_add() will not add any mp to
	 * the pending mp list, after this conn has started closing.
	 *
	 * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
	 * cannot get set henceforth.
	 */
	mutex_enter(&connp->conn_lock);
	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
	connp->conn_state_flags |= CONN_CLOSING;
	/* Snapshot, under conn_lock, which cleanups are needed below. */
	if (connp->conn_idl != NULL)
		drain_cleanup_reqd = B_TRUE;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	if (connp->conn_ilg_inuse != 0)
		ilg_cleanup_reqd = B_TRUE;
	mutex_exit(&connp->conn_lock);

	if (IPCL_IS_UDP(connp))
		udp_quiesce_conn(connp);

	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	/* On a labeled system, give back an anonymous MLP port binding. */
	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_ulp,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/*
	 * Remove this conn from any fanout list it is on.
	 * and then wait for any threads currently operating
	 * on this endpoint to finish
	 */
	ipcl_hash_remove(connp);

	/*
	 * Remove this conn from the drain list, and do
	 * any other cleanup that may be required.
	 * (Only non-tcp streams may have a non-null conn_idl.
	 * TCP streams are never flow controlled, and
	 * conn_idl will be null)
	 */
	if (drain_cleanup_reqd)
		conn_drain_tail(connp, B_TRUE);

	if (connp->conn_rq == ip_g_mrouter || connp->conn_wq == ip_g_mrouter)
		(void) ip_mrouter_done(NULL);

	if (ilg_cleanup_reqd)
		ilg_delete_all(connp);

	conn_delete_ire(connp, NULL);

	/*
	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
	 * callers from write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk
	 * which checks for the condemned flag.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	/* Wait until ours is the only remaining reference. */
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}

/*
 * STREAMS close entry point.  A module close (something is below us on the
 * write side) is handed to ip_modclose(); otherwise the conn is quiesced,
 * its IPsec state released, its minor number returned and the conn
 * destroyed.  Returns 0 for a device close, or whatever ip_modclose()
 * returns for a module close.
 */
/* ARGSUSED */
int
ip_close(queue_t *q, int flags)
{
	conn_t		*connp;

	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);

	/*
	 * Call the appropriate delete routine depending on whether this is
	 * a module or device.
	 */
	if (WR(q)->q_next != NULL) {
		/* This is a module close */
		return (ip_modclose((ill_t *)q->q_ptr));
	}

	connp = q->q_ptr;
	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 1 (our own reference, released below).
	 * We are already in close, so there cannot be any other thread
	 * from the top. qprocsoff has completed, and service has
	 * completed or won't run in future.
	 */
	ASSERT(connp->conn_ref == 1);

	/*
	 * A conn which was previously marked as IPCL_UDP cannot
	 * retain the flag because it would have been cleared by
	 * udp_close().
	 */
	ASSERT(!IPCL_IS_UDP(connp));

	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	inet_minor_free(ip_minor_arena, connp->conn_dev);

	/* Drop the last reference by hand and destroy the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Close routine for a TCP or UDP module instance (conn_flags must have
 * IPCL_TCPMOD or IPCL_UDPMOD set; asserted).  Turns off queue processing,
 * frees the UDP state for a UDP module instance, releases the conn's cred
 * and drops the conn reference.  Always returns 0.
 */
int
ip_snmpmod_close(queue_t *q)
{
	conn_t *connp = Q_TO_CONN(q);
	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));

	qprocsoff(q);

	if (connp->conn_flags & IPCL_UDPMOD)
		udp_close_free(connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}
	CONN_DEC_REF(connp);
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}

/*
 * Write side put procedure for TCP module or UDP module instance.  TCP/UDP
 * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
 * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
 * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
 * queues as we never enqueue messages there and we don't handle any ioctls.
 * Everything else is freed.
5144 */ 5145 void 5146 ip_snmpmod_wput(queue_t *q, mblk_t *mp) 5147 { 5148 conn_t *connp = q->q_ptr; 5149 pfi_t setfn; 5150 pfi_t getfn; 5151 5152 ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD)); 5153 5154 switch (DB_TYPE(mp)) { 5155 case M_PROTO: 5156 case M_PCPROTO: 5157 if ((MBLKL(mp) >= sizeof (t_scalar_t)) && 5158 ((((union T_primitives *)mp->b_rptr)->type == 5159 T_SVR4_OPTMGMT_REQ) || 5160 (((union T_primitives *)mp->b_rptr)->type == 5161 T_OPTMGMT_REQ))) { 5162 /* 5163 * This is the only TPI primitive supported. Its 5164 * handling does not require tcp_t, but it does require 5165 * conn_t to check permissions. 5166 */ 5167 cred_t *cr = DB_CREDDEF(mp, connp->conn_cred); 5168 5169 if (connp->conn_flags & IPCL_TCPMOD) { 5170 setfn = tcp_snmp_set; 5171 getfn = tcp_snmp_get; 5172 } else { 5173 setfn = udp_snmp_set; 5174 getfn = udp_snmp_get; 5175 } 5176 if (!snmpcom_req(q, mp, setfn, getfn, cr)) { 5177 freemsg(mp); 5178 return; 5179 } 5180 } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP)) 5181 != NULL) 5182 qreply(q, mp); 5183 break; 5184 case M_FLUSH: 5185 case M_IOCTL: 5186 putnext(q, mp); 5187 break; 5188 default: 5189 freemsg(mp); 5190 break; 5191 } 5192 } 5193 5194 /* Return the IP checksum for the IP header at "iph". 
*/ 5195 uint16_t 5196 ip_csum_hdr(ipha_t *ipha) 5197 { 5198 uint16_t *uph; 5199 uint32_t sum; 5200 int opt_len; 5201 5202 opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - 5203 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5204 uph = (uint16_t *)ipha; 5205 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 5206 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 5207 if (opt_len > 0) { 5208 do { 5209 sum += uph[10]; 5210 sum += uph[11]; 5211 uph += 2; 5212 } while (--opt_len); 5213 } 5214 sum = (sum & 0xFFFF) + (sum >> 16); 5215 sum = ~(sum + (sum >> 16)) & 0xFFFF; 5216 if (sum == 0xffff) 5217 sum = 0; 5218 return ((uint16_t)sum); 5219 } 5220 5221 void 5222 ip_ddi_destroy(void) 5223 { 5224 tnet_fini(); 5225 tcp_ddi_destroy(); 5226 sctp_ddi_destroy(); 5227 ipsec_loader_destroy(); 5228 ipsec_policy_destroy(); 5229 ipsec_kstat_destroy(); 5230 nd_free(&ip_g_nd); 5231 mutex_destroy(&igmp_timer_lock); 5232 mutex_destroy(&mld_timer_lock); 5233 mutex_destroy(&igmp_slowtimeout_lock); 5234 mutex_destroy(&mld_slowtimeout_lock); 5235 mutex_destroy(&ip_mi_lock); 5236 mutex_destroy(&rts_clients.connf_lock); 5237 ip_ire_fini(); 5238 ip6_asp_free(); 5239 conn_drain_fini(); 5240 ipcl_destroy(); 5241 inet_minor_destroy(ip_minor_arena); 5242 icmp_kstat_fini(); 5243 ip_kstat_fini(); 5244 rw_destroy(&ipsec_capab_ills_lock); 5245 rw_destroy(&ill_g_usesrc_lock); 5246 ip_drop_unregister(&ip_dropper); 5247 } 5248 5249 5250 void 5251 ip_ddi_init(void) 5252 { 5253 TCP6_MAJ = ddi_name_to_major(TCP6); 5254 TCP_MAJ = ddi_name_to_major(TCP); 5255 SCTP_MAJ = ddi_name_to_major(SCTP); 5256 SCTP6_MAJ = ddi_name_to_major(SCTP6); 5257 5258 ip_input_proc = ip_squeue_switch(ip_squeue_enter); 5259 5260 /* IP's IPsec code calls the packet dropper */ 5261 ip_drop_register(&ip_dropper, "IP IPsec processing"); 5262 5263 if (!ip_g_nd) { 5264 if (!ip_param_register(lcl_param_arr, A_CNT(lcl_param_arr), 5265 lcl_ndp_arr, A_CNT(lcl_ndp_arr))) { 5266 nd_free(&ip_g_nd); 5267 } 5268 } 5269 5270 ipsec_loader_init(); 5271 
ipsec_policy_init(); 5272 ipsec_kstat_init(); 5273 rw_init(&ip_g_nd_lock, NULL, RW_DEFAULT, NULL); 5274 mutex_init(&igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5275 mutex_init(&mld_timer_lock, NULL, MUTEX_DEFAULT, NULL); 5276 mutex_init(&igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5277 mutex_init(&mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL); 5278 mutex_init(&ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); 5279 mutex_init(&ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); 5280 rw_init(&ill_g_lock, NULL, RW_DEFAULT, NULL); 5281 rw_init(&ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); 5282 rw_init(&ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); 5283 5284 /* 5285 * For IP and TCP the minor numbers should start from 2 since we have 4 5286 * initial devices: ip, ip6, tcp, tcp6. 5287 */ 5288 if ((ip_minor_arena = inet_minor_create("ip_minor_arena", 5289 INET_MIN_DEV + 2, KM_SLEEP)) == NULL) { 5290 cmn_err(CE_PANIC, 5291 "ip_ddi_init: ip_minor_arena creation failed\n"); 5292 } 5293 5294 ipcl_init(); 5295 mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL); 5296 ip_ire_init(); 5297 ip6_asp_init(); 5298 ipif_init(); 5299 conn_drain_init(); 5300 tcp_ddi_init(); 5301 sctp_ddi_init(); 5302 5303 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms); 5304 5305 if ((ip_kstat = kstat_create("ip", 0, "ipstat", 5306 "net", KSTAT_TYPE_NAMED, 5307 sizeof (ip_statistics) / sizeof (kstat_named_t), 5308 KSTAT_FLAG_VIRTUAL)) != NULL) { 5309 ip_kstat->ks_data = &ip_statistics; 5310 kstat_install(ip_kstat); 5311 } 5312 ip_kstat_init(); 5313 ip6_kstat_init(); 5314 icmp_kstat_init(); 5315 ipsec_loader_start(); 5316 tnet_init(); 5317 } 5318 5319 /* 5320 * Allocate and initialize a DLPI template of the specified length. (May be 5321 * called as writer.) 
5322 */ 5323 mblk_t * 5324 ip_dlpi_alloc(size_t len, t_uscalar_t prim) 5325 { 5326 mblk_t *mp; 5327 5328 mp = allocb(len, BPRI_MED); 5329 if (!mp) 5330 return (NULL); 5331 5332 /* 5333 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter 5334 * of which we don't seem to use) are sent with M_PCPROTO, and 5335 * that other DLPI are M_PROTO. 5336 */ 5337 if (prim == DL_INFO_REQ) { 5338 mp->b_datap->db_type = M_PCPROTO; 5339 } else { 5340 mp->b_datap->db_type = M_PROTO; 5341 } 5342 5343 mp->b_wptr = mp->b_rptr + len; 5344 bzero(mp->b_rptr, len); 5345 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; 5346 return (mp); 5347 } 5348 5349 const char * 5350 dlpi_prim_str(int prim) 5351 { 5352 switch (prim) { 5353 case DL_INFO_REQ: return ("DL_INFO_REQ"); 5354 case DL_INFO_ACK: return ("DL_INFO_ACK"); 5355 case DL_ATTACH_REQ: return ("DL_ATTACH_REQ"); 5356 case DL_DETACH_REQ: return ("DL_DETACH_REQ"); 5357 case DL_BIND_REQ: return ("DL_BIND_REQ"); 5358 case DL_BIND_ACK: return ("DL_BIND_ACK"); 5359 case DL_UNBIND_REQ: return ("DL_UNBIND_REQ"); 5360 case DL_OK_ACK: return ("DL_OK_ACK"); 5361 case DL_ERROR_ACK: return ("DL_ERROR_ACK"); 5362 case DL_ENABMULTI_REQ: return ("DL_ENABMULTI_REQ"); 5363 case DL_DISABMULTI_REQ: return ("DL_DISABMULTI_REQ"); 5364 case DL_PROMISCON_REQ: return ("DL_PROMISCON_REQ"); 5365 case DL_PROMISCOFF_REQ: return ("DL_PROMISCOFF_REQ"); 5366 case DL_UNITDATA_REQ: return ("DL_UNITDATA_REQ"); 5367 case DL_UNITDATA_IND: return ("DL_UNITDATA_IND"); 5368 case DL_UDERROR_IND: return ("DL_UDERROR_IND"); 5369 case DL_PHYS_ADDR_REQ: return ("DL_PHYS_ADDR_REQ"); 5370 case DL_PHYS_ADDR_ACK: return ("DL_PHYS_ADDR_ACK"); 5371 case DL_SET_PHYS_ADDR_REQ: return ("DL_SET_PHYS_ADDR_REQ"); 5372 case DL_NOTIFY_REQ: return ("DL_NOTIFY_REQ"); 5373 case DL_NOTIFY_ACK: return ("DL_NOTIFY_ACK"); 5374 case DL_NOTIFY_IND: return ("DL_NOTIFY_IND"); 5375 case DL_CAPABILITY_REQ: return ("DL_CAPABILITY_REQ"); 5376 case DL_CAPABILITY_ACK: return ("DL_CAPABILITY_ACK"); 
5377 case DL_CONTROL_REQ: return ("DL_CONTROL_REQ"); 5378 case DL_CONTROL_ACK: return ("DL_CONTROL_ACK"); 5379 default: return ("<unknown primitive>"); 5380 } 5381 } 5382 5383 const char * 5384 dlpi_err_str(int err) 5385 { 5386 switch (err) { 5387 case DL_ACCESS: return ("DL_ACCESS"); 5388 case DL_BADADDR: return ("DL_BADADDR"); 5389 case DL_BADCORR: return ("DL_BADCORR"); 5390 case DL_BADDATA: return ("DL_BADDATA"); 5391 case DL_BADPPA: return ("DL_BADPPA"); 5392 case DL_BADPRIM: return ("DL_BADPRIM"); 5393 case DL_BADQOSPARAM: return ("DL_BADQOSPARAM"); 5394 case DL_BADQOSTYPE: return ("DL_BADQOSTYPE"); 5395 case DL_BADSAP: return ("DL_BADSAP"); 5396 case DL_BADTOKEN: return ("DL_BADTOKEN"); 5397 case DL_BOUND: return ("DL_BOUND"); 5398 case DL_INITFAILED: return ("DL_INITFAILED"); 5399 case DL_NOADDR: return ("DL_NOADDR"); 5400 case DL_NOTINIT: return ("DL_NOTINIT"); 5401 case DL_OUTSTATE: return ("DL_OUTSTATE"); 5402 case DL_SYSERR: return ("DL_SYSERR"); 5403 case DL_UNSUPPORTED: return ("DL_UNSUPPORTED"); 5404 case DL_UNDELIVERABLE: return ("DL_UNDELIVERABLE"); 5405 case DL_NOTSUPPORTED : return ("DL_NOTSUPPORTED "); 5406 case DL_TOOMANY: return ("DL_TOOMANY"); 5407 case DL_NOTENAB: return ("DL_NOTENAB"); 5408 case DL_BUSY: return ("DL_BUSY"); 5409 case DL_NOAUTO: return ("DL_NOAUTO"); 5410 case DL_NOXIDAUTO: return ("DL_NOXIDAUTO"); 5411 case DL_NOTESTAUTO: return ("DL_NOTESTAUTO"); 5412 case DL_XIDAUTO: return ("DL_XIDAUTO"); 5413 case DL_TESTAUTO: return ("DL_TESTAUTO"); 5414 case DL_PENDING: return ("DL_PENDING"); 5415 default: return ("<unknown error>"); 5416 } 5417 } 5418 5419 /* 5420 * Debug formatting routine. Returns a character string representation of the 5421 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 5422 * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. 
5423 */ 5424 char * 5425 ip_dot_addr(ipaddr_t addr, char *buf) 5426 { 5427 return (ip_dot_saddr((uchar_t *)&addr, buf)); 5428 } 5429 5430 /* 5431 * Debug formatting routine. Returns a character string representation of the 5432 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address 5433 * as a pointer. The "xxx" parts including left zero padding so the final 5434 * string will fit easily in tables. It would be nice to take a padding 5435 * length argument instead. 5436 */ 5437 static char * 5438 ip_dot_saddr(uchar_t *addr, char *buf) 5439 { 5440 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", 5441 addr[0] & 0xFF, addr[1] & 0xFF, addr[2] & 0xFF, addr[3] & 0xFF); 5442 return (buf); 5443 } 5444 5445 /* 5446 * Send an ICMP error after patching up the packet appropriately. Returns 5447 * non-zero if the appropriate MIB should be bumped; zero otherwise. 5448 */ 5449 static boolean_t 5450 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, 5451 uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid) 5452 { 5453 ipha_t *ipha; 5454 mblk_t *first_mp; 5455 boolean_t secure; 5456 unsigned char db_type; 5457 5458 first_mp = mp; 5459 if (mctl_present) { 5460 mp = mp->b_cont; 5461 secure = ipsec_in_is_secure(first_mp); 5462 ASSERT(mp != NULL); 5463 } else { 5464 /* 5465 * If this is an ICMP error being reported - which goes 5466 * up as M_CTLs, we need to convert them to M_DATA till 5467 * we finish checking with global policy because 5468 * ipsec_check_global_policy() assumes M_DATA as clear 5469 * and M_CTL as secure. 5470 */ 5471 db_type = DB_TYPE(mp); 5472 DB_TYPE(mp) = M_DATA; 5473 secure = B_FALSE; 5474 } 5475 /* 5476 * We are generating an icmp error for some inbound packet. 5477 * Called from all ip_fanout_(udp, tcp, proto) functions. 5478 * Before we generate an error, check with global policy 5479 * to see whether this is allowed to enter the system. As 5480 * there is no "conn", we are checking with global policy. 
5481 */ 5482 ipha = (ipha_t *)mp->b_rptr; 5483 if (secure || ipsec_inbound_v4_policy_present) { 5484 first_mp = ipsec_check_global_policy(first_mp, NULL, 5485 ipha, NULL, mctl_present); 5486 if (first_mp == NULL) 5487 return (B_FALSE); 5488 } 5489 5490 if (!mctl_present) 5491 DB_TYPE(mp) = db_type; 5492 5493 if (flags & IP_FF_SEND_ICMP) { 5494 if (flags & IP_FF_HDR_COMPLETE) { 5495 if (ip_hdr_complete(ipha, zoneid)) { 5496 freemsg(first_mp); 5497 return (B_TRUE); 5498 } 5499 } 5500 if (flags & IP_FF_CKSUM) { 5501 /* 5502 * Have to correct checksum since 5503 * the packet might have been 5504 * fragmented and the reassembly code in ip_rput 5505 * does not restore the IP checksum. 5506 */ 5507 ipha->ipha_hdr_checksum = 0; 5508 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 5509 } 5510 switch (icmp_type) { 5511 case ICMP_DEST_UNREACHABLE: 5512 icmp_unreachable(WR(q), first_mp, icmp_code); 5513 break; 5514 default: 5515 freemsg(first_mp); 5516 break; 5517 } 5518 } else { 5519 freemsg(first_mp); 5520 return (B_FALSE); 5521 } 5522 5523 return (B_TRUE); 5524 } 5525 5526 /* 5527 * Used to send an ICMP error message when a packet is received for 5528 * a protocol that is not supported. The mblk passed as argument 5529 * is consumed by this function. 5530 */ 5531 void 5532 ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid) 5533 { 5534 mblk_t *mp; 5535 ipha_t *ipha; 5536 ill_t *ill; 5537 ipsec_in_t *ii; 5538 5539 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 5540 ASSERT(ii->ipsec_in_type == IPSEC_IN); 5541 5542 mp = ipsec_mp->b_cont; 5543 ipsec_mp->b_cont = NULL; 5544 ipha = (ipha_t *)mp->b_rptr; 5545 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 5546 if (ip_fanout_send_icmp(q, mp, flags, ICMP_DEST_UNREACHABLE, 5547 ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid)) { 5548 BUMP_MIB(&ip_mib, ipInUnknownProtos); 5549 } 5550 } else { 5551 /* Get ill from index in ipsec_in_t. 
*/ 5552 ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, 5553 B_TRUE, NULL, NULL, NULL, NULL); 5554 if (ill != NULL) { 5555 if (ip_fanout_send_icmp_v6(q, mp, flags, 5556 ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, 5557 0, B_FALSE, zoneid)) { 5558 BUMP_MIB(ill->ill_ip6_mib, ipv6InUnknownProtos); 5559 } 5560 5561 ill_refrele(ill); 5562 } else { /* re-link for the freemsg() below. */ 5563 ipsec_mp->b_cont = mp; 5564 } 5565 } 5566 5567 /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ 5568 freemsg(ipsec_mp); 5569 } 5570 5571 /* 5572 * See if the inbound datagram has had IPsec processing applied to it. 5573 */ 5574 boolean_t 5575 ipsec_in_is_secure(mblk_t *ipsec_mp) 5576 { 5577 ipsec_in_t *ii; 5578 5579 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 5580 ASSERT(ii->ipsec_in_type == IPSEC_IN); 5581 5582 if (ii->ipsec_in_loopback) { 5583 return (ii->ipsec_in_secure); 5584 } else { 5585 return (ii->ipsec_in_ah_sa != NULL || 5586 ii->ipsec_in_esp_sa != NULL || 5587 ii->ipsec_in_decaps); 5588 } 5589 } 5590 5591 /* 5592 * Handle protocols with which IP is less intimate. There 5593 * can be more than one stream bound to a particular 5594 * protocol. When this is the case, normally each one gets a copy 5595 * of any incoming packets. 5596 * 5597 * IPSEC NOTE : 5598 * 5599 * Don't allow a secure packet going up a non-secure connection. 5600 * We don't allow this because 5601 * 5602 * 1) Reply might go out in clear which will be dropped at 5603 * the sending side. 5604 * 2) If the reply goes out in clear it will give the 5605 * adversary enough information for getting the key in 5606 * most of the cases. 5607 * 5608 * Moreover getting a secure packet when we expect clear 5609 * implies that SA's were added without checking for 5610 * policy on both ends. This should not happen once ISAKMP 5611 * is used to negotiate SAs as SAs will be added only after 5612 * verifying the policy. 
5613 * 5614 * NOTE : If the packet was tunneled and not multicast we only send 5615 * to it the first match. Unlike TCP and UDP fanouts this doesn't fall 5616 * back to delivering packets to AF_INET6 raw sockets. 5617 * 5618 * IPQoS Notes: 5619 * Once we have determined the client, invoke IPPF processing. 5620 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 5621 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 5622 * ip_policy will be false. 5623 * 5624 * Zones notes: 5625 * Currently only applications in the global zone can create raw sockets for 5626 * protocols other than ICMP. So unlike the broadcast / multicast case of 5627 * ip_fanout_udp(), we only send a copy of the packet to streams in the 5628 * specified zone. For ICMP, this is handled by the callers of icmp_inbound(). 5629 */ 5630 static void 5631 ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, 5632 boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, 5633 zoneid_t zoneid) 5634 { 5635 queue_t *rq; 5636 mblk_t *mp1, *first_mp1; 5637 uint_t protocol = ipha->ipha_protocol; 5638 ipaddr_t dst; 5639 boolean_t one_only; 5640 mblk_t *first_mp = mp; 5641 boolean_t secure; 5642 uint32_t ill_index; 5643 conn_t *connp, *first_connp, *next_connp; 5644 connf_t *connfp; 5645 boolean_t shared_addr; 5646 5647 if (mctl_present) { 5648 mp = first_mp->b_cont; 5649 secure = ipsec_in_is_secure(first_mp); 5650 ASSERT(mp != NULL); 5651 } else { 5652 secure = B_FALSE; 5653 } 5654 dst = ipha->ipha_dst; 5655 /* 5656 * If the packet was tunneled and not multicast we only send to it 5657 * the first match. 5658 */ 5659 one_only = ((protocol == IPPROTO_ENCAP || protocol == IPPROTO_IPV6) && 5660 !CLASSD(dst)); 5661 5662 shared_addr = (zoneid == ALL_ZONES); 5663 if (shared_addr) { 5664 /* 5665 * We don't allow multilevel ports for raw IP, so no need to 5666 * check for that here. 
5667 */ 5668 zoneid = tsol_packet_to_zoneid(mp); 5669 } 5670 5671 connfp = &ipcl_proto_fanout[protocol]; 5672 mutex_enter(&connfp->connf_lock); 5673 connp = connfp->connf_head; 5674 for (connp = connfp->connf_head; connp != NULL; 5675 connp = connp->conn_next) { 5676 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags, 5677 zoneid) && 5678 (!is_system_labeled() || 5679 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 5680 connp))) 5681 break; 5682 } 5683 5684 if (connp == NULL || connp->conn_upq == NULL) { 5685 /* 5686 * No one bound to these addresses. Is 5687 * there a client that wants all 5688 * unclaimed datagrams? 5689 */ 5690 mutex_exit(&connfp->connf_lock); 5691 /* 5692 * Check for IPPROTO_ENCAP... 5693 */ 5694 if (protocol == IPPROTO_ENCAP && ip_g_mrouter) { 5695 /* 5696 * XXX If an IPsec mblk is here on a multicast 5697 * tunnel (using ip_mroute stuff), what should 5698 * I do? 5699 * 5700 * For now, just free the IPsec mblk before 5701 * passing it up to the multicast routing 5702 * stuff. 5703 * 5704 * BTW, If I match a configured IP-in-IP 5705 * tunnel, ip_mroute_decap will never be 5706 * called. 5707 */ 5708 if (mp != first_mp) 5709 freeb(first_mp); 5710 ip_mroute_decap(q, mp); 5711 } else { 5712 /* 5713 * Otherwise send an ICMP protocol unreachable. 5714 */ 5715 if (ip_fanout_send_icmp(q, first_mp, flags, 5716 ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE, 5717 mctl_present, zoneid)) { 5718 BUMP_MIB(&ip_mib, ipInUnknownProtos); 5719 } 5720 } 5721 return; 5722 } 5723 CONN_INC_REF(connp); 5724 first_connp = connp; 5725 5726 /* 5727 * Only send message to one tunnel driver by immediately 5728 * terminating the loop. 5729 */ 5730 connp = one_only ? 
NULL : connp->conn_next; 5731 5732 for (;;) { 5733 while (connp != NULL) { 5734 if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, 5735 flags, zoneid) && 5736 (!is_system_labeled() || 5737 tsol_receive_local(mp, &dst, IPV4_VERSION, 5738 shared_addr, connp))) 5739 break; 5740 connp = connp->conn_next; 5741 } 5742 5743 /* 5744 * Copy the packet. 5745 */ 5746 if (connp == NULL || connp->conn_upq == NULL || 5747 (((first_mp1 = dupmsg(first_mp)) == NULL) && 5748 ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { 5749 /* 5750 * No more interested clients or memory 5751 * allocation failed 5752 */ 5753 connp = first_connp; 5754 break; 5755 } 5756 mp1 = mctl_present ? first_mp1->b_cont : first_mp1; 5757 CONN_INC_REF(connp); 5758 mutex_exit(&connfp->connf_lock); 5759 rq = connp->conn_rq; 5760 if (!canputnext(rq)) { 5761 if (flags & IP_FF_RAWIP) { 5762 BUMP_MIB(&ip_mib, rawipInOverflows); 5763 } else { 5764 BUMP_MIB(&icmp_mib, icmpInOverflows); 5765 } 5766 5767 freemsg(first_mp1); 5768 } else { 5769 if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) { 5770 first_mp1 = ipsec_check_inbound_policy 5771 (first_mp1, connp, ipha, NULL, 5772 mctl_present); 5773 } 5774 if (first_mp1 != NULL) { 5775 /* 5776 * ip_fanout_proto also gets called from 5777 * icmp_inbound_error_fanout, in which case 5778 * the msg type is M_CTL. Don't add info 5779 * in this case for the time being. In future 5780 * when there is a need for knowing the 5781 * inbound iface index for ICMP error msgs, 5782 * then this can be changed. 
5783 */ 5784 if ((connp->conn_recvif != 0) && 5785 (mp->b_datap->db_type != M_CTL)) { 5786 /* 5787 * the actual data will be 5788 * contained in b_cont upon 5789 * successful return of the 5790 * following call else 5791 * original mblk is returned 5792 */ 5793 ASSERT(recv_ill != NULL); 5794 mp1 = ip_add_info(mp1, recv_ill, 5795 IPF_RECVIF); 5796 } 5797 BUMP_MIB(&ip_mib, ipInDelivers); 5798 if (mctl_present) 5799 freeb(first_mp1); 5800 putnext(rq, mp1); 5801 } 5802 } 5803 mutex_enter(&connfp->connf_lock); 5804 /* Follow the next pointer before releasing the conn. */ 5805 next_connp = connp->conn_next; 5806 CONN_DEC_REF(connp); 5807 connp = next_connp; 5808 } 5809 5810 /* Last one. Send it upstream. */ 5811 mutex_exit(&connfp->connf_lock); 5812 5813 /* 5814 * If this packet is coming from icmp_inbound_error_fanout ip_policy 5815 * will be set to false. 5816 */ 5817 if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) { 5818 ill_index = ill->ill_phyint->phyint_ifindex; 5819 ip_process(IPP_LOCAL_IN, &mp, ill_index); 5820 if (mp == NULL) { 5821 CONN_DEC_REF(connp); 5822 if (mctl_present) { 5823 freeb(first_mp); 5824 } 5825 return; 5826 } 5827 } 5828 5829 rq = connp->conn_rq; 5830 if (!canputnext(rq)) { 5831 if (flags & IP_FF_RAWIP) { 5832 BUMP_MIB(&ip_mib, rawipInOverflows); 5833 } else { 5834 BUMP_MIB(&icmp_mib, icmpInOverflows); 5835 } 5836 5837 freemsg(first_mp); 5838 } else { 5839 if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) { 5840 first_mp = ipsec_check_inbound_policy(first_mp, connp, 5841 ipha, NULL, mctl_present); 5842 } 5843 if (first_mp != NULL) { 5844 /* 5845 * ip_fanout_proto also gets called 5846 * from icmp_inbound_error_fanout, in 5847 * which case the msg type is M_CTL. 5848 * Don't add info in this case for time 5849 * being. 
In future when there is a 5850 * need for knowing the inbound iface 5851 * index for ICMP error msgs, then this 5852 * can be changed 5853 */ 5854 if ((connp->conn_recvif != 0) && 5855 (mp->b_datap->db_type != M_CTL)) { 5856 /* 5857 * the actual data will be contained in 5858 * b_cont upon successful return 5859 * of the following call else original 5860 * mblk is returned 5861 */ 5862 ASSERT(recv_ill != NULL); 5863 mp = ip_add_info(mp, recv_ill, IPF_RECVIF); 5864 } 5865 BUMP_MIB(&ip_mib, ipInDelivers); 5866 putnext(rq, mp); 5867 if (mctl_present) 5868 freeb(first_mp); 5869 } 5870 } 5871 CONN_DEC_REF(connp); 5872 } 5873 5874 /* 5875 * Fanout for TCP packets 5876 * The caller puts <fport, lport> in the ports parameter. 5877 * 5878 * IPQoS Notes 5879 * Before sending it to the client, invoke IPPF processing. 5880 * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, 5881 * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local 5882 * ip_policy is false. 5883 */ 5884 static void 5885 ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, 5886 uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) 5887 { 5888 mblk_t *first_mp; 5889 boolean_t secure; 5890 uint32_t ill_index; 5891 int ip_hdr_len; 5892 tcph_t *tcph; 5893 boolean_t syn_present = B_FALSE; 5894 conn_t *connp; 5895 5896 first_mp = mp; 5897 if (mctl_present) { 5898 ASSERT(first_mp->b_datap->db_type == M_CTL); 5899 mp = first_mp->b_cont; 5900 secure = ipsec_in_is_secure(first_mp); 5901 ASSERT(mp != NULL); 5902 } else { 5903 secure = B_FALSE; 5904 } 5905 5906 ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); 5907 5908 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) == 5909 NULL) { 5910 /* 5911 * No connected connection or listener. Send a 5912 * TH_RST via tcp_xmit_listeners_reset. 5913 */ 5914 5915 /* Initiate IPPf processing, if needed. 
*/ 5916 if (IPP_ENABLED(IPP_LOCAL_IN)) { 5917 uint32_t ill_index; 5918 ill_index = recv_ill->ill_phyint->phyint_ifindex; 5919 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 5920 if (first_mp == NULL) 5921 return; 5922 } 5923 BUMP_MIB(&ip_mib, ipInDelivers); 5924 ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", 5925 zoneid)); 5926 tcp_xmit_listeners_reset(first_mp, ip_hdr_len); 5927 return; 5928 } 5929 5930 /* 5931 * Allocate the SYN for the TCP connection here itself 5932 */ 5933 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5934 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 5935 if (IPCL_IS_TCP(connp)) { 5936 squeue_t *sqp; 5937 5938 /* 5939 * For fused tcp loopback, assign the eager's 5940 * squeue to be that of the active connect's. 5941 * Note that we don't check for IP_FF_LOOPBACK 5942 * here since this routine gets called only 5943 * for loopback (unlike the IPv6 counterpart). 5944 */ 5945 ASSERT(Q_TO_CONN(q) != NULL); 5946 if (do_tcp_fusion && 5947 !CONN_INBOUND_POLICY_PRESENT(connp) && !secure && 5948 !IPP_ENABLED(IPP_LOCAL_IN) && !ip_policy && 5949 IPCL_IS_TCP(Q_TO_CONN(q))) { 5950 ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); 5951 sqp = Q_TO_CONN(q)->conn_sqp; 5952 } else { 5953 sqp = IP_SQUEUE_GET(lbolt); 5954 } 5955 5956 mp->b_datap->db_struioflag |= STRUIO_EAGER; 5957 DB_CKSUMSTART(mp) = (intptr_t)sqp; 5958 syn_present = B_TRUE; 5959 } 5960 } 5961 5962 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 5963 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 5964 if ((flags & TH_RST) || (flags & TH_URG)) { 5965 CONN_DEC_REF(connp); 5966 freemsg(first_mp); 5967 return; 5968 } 5969 if (flags & TH_ACK) { 5970 tcp_xmit_listeners_reset(first_mp, ip_hdr_len); 5971 CONN_DEC_REF(connp); 5972 return; 5973 } 5974 5975 CONN_DEC_REF(connp); 5976 freemsg(first_mp); 5977 return; 5978 } 5979 5980 if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) { 5981 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 5982 
NULL, mctl_present); 5983 if (first_mp == NULL) { 5984 CONN_DEC_REF(connp); 5985 return; 5986 } 5987 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 5988 ASSERT(syn_present); 5989 if (mctl_present) { 5990 ASSERT(first_mp != mp); 5991 first_mp->b_datap->db_struioflag |= 5992 STRUIO_POLICY; 5993 } else { 5994 ASSERT(first_mp == mp); 5995 mp->b_datap->db_struioflag &= 5996 ~STRUIO_EAGER; 5997 mp->b_datap->db_struioflag |= 5998 STRUIO_POLICY; 5999 } 6000 } else { 6001 /* 6002 * Discard first_mp early since we're dealing with a 6003 * fully-connected conn_t and tcp doesn't do policy in 6004 * this case. 6005 */ 6006 if (mctl_present) { 6007 freeb(first_mp); 6008 mctl_present = B_FALSE; 6009 } 6010 first_mp = mp; 6011 } 6012 } 6013 6014 /* 6015 * Initiate policy processing here if needed. If we get here from 6016 * icmp_inbound_error_fanout, ip_policy is false. 6017 */ 6018 if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) { 6019 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6020 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6021 if (mp == NULL) { 6022 CONN_DEC_REF(connp); 6023 if (mctl_present) 6024 freeb(first_mp); 6025 return; 6026 } else if (mctl_present) { 6027 ASSERT(first_mp != mp); 6028 first_mp->b_cont = mp; 6029 } else { 6030 first_mp = mp; 6031 } 6032 } 6033 6034 6035 6036 /* Handle IPv6 socket options. */ 6037 if (!syn_present && 6038 connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO)) { 6039 /* Add header */ 6040 ASSERT(recv_ill != NULL); 6041 mp = ip_add_info(mp, recv_ill, IPF_RECVIF); 6042 if (mp == NULL) { 6043 CONN_DEC_REF(connp); 6044 if (mctl_present) 6045 freeb(first_mp); 6046 return; 6047 } else if (mctl_present) { 6048 /* 6049 * ip_add_info might return a new mp. 
6050 */ 6051 ASSERT(first_mp != mp); 6052 first_mp->b_cont = mp; 6053 } else { 6054 first_mp = mp; 6055 } 6056 } 6057 6058 BUMP_MIB(&ip_mib, ipInDelivers); 6059 if (IPCL_IS_TCP(connp)) { 6060 (*ip_input_proc)(connp->conn_sqp, first_mp, 6061 connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); 6062 } else { 6063 putnext(connp->conn_rq, first_mp); 6064 CONN_DEC_REF(connp); 6065 } 6066 } 6067 6068 /* 6069 * Deliver a udp packet to the given conn, possibly applying ipsec policy. 6070 * We are responsible for disposing of mp, such as by freemsg() or putnext() 6071 * Caller is responsible for dropping references to the conn, and freeing 6072 * first_mp. 6073 * 6074 * IPQoS Notes 6075 * Before sending it to the client, invoke IPPF processing. Policy processing 6076 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and 6077 * ip_policy is true. If we get here from icmp_inbound_error_fanout or 6078 * ip_wput_local, ip_policy is false. 6079 */ 6080 static void 6081 ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, 6082 boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill, 6083 boolean_t ip_policy) 6084 { 6085 boolean_t mctl_present = (first_mp != NULL); 6086 uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ 6087 uint32_t ill_index; 6088 6089 if (mctl_present) 6090 first_mp->b_cont = mp; 6091 else 6092 first_mp = mp; 6093 6094 if (CONN_UDP_FLOWCTLD(connp)) { 6095 BUMP_MIB(&ip_mib, udpInOverflows); 6096 freemsg(first_mp); 6097 return; 6098 } 6099 6100 if (CONN_INBOUND_POLICY_PRESENT(connp) || secure) { 6101 first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, 6102 NULL, mctl_present); 6103 if (first_mp == NULL) 6104 return; /* Freed by ipsec_check_inbound_policy(). */ 6105 } 6106 if (mctl_present) 6107 freeb(first_mp); 6108 6109 if (connp->conn_recvif) 6110 in_flags = IPF_RECVIF; 6111 if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) 6112 in_flags |= IPF_RECVSLLA; 6113 6114 /* Handle IPv6 options. 
*/ 6115 if (connp->conn_ipv6_recvpktinfo && (flags & IP_FF_IP6INFO)) 6116 in_flags |= IPF_RECVIF; 6117 6118 /* 6119 * Initiate IPPF processing here, if needed. Note first_mp won't be 6120 * freed if the packet is dropped. The caller will do so. 6121 */ 6122 if (IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) { 6123 ill_index = recv_ill->ill_phyint->phyint_ifindex; 6124 ip_process(IPP_LOCAL_IN, &mp, ill_index); 6125 if (mp == NULL) { 6126 return; 6127 } 6128 } 6129 if ((in_flags != 0) && 6130 (mp->b_datap->db_type != M_CTL)) { 6131 /* 6132 * The actual data will be contained in b_cont 6133 * upon successful return of the following call 6134 * else original mblk is returned 6135 */ 6136 ASSERT(recv_ill != NULL); 6137 mp = ip_add_info(mp, recv_ill, in_flags); 6138 } 6139 BUMP_MIB(&ip_mib, ipInDelivers); 6140 6141 /* Send it upstream */ 6142 CONN_UDP_RECV(connp, mp); 6143 } 6144 6145 /* 6146 * Fanout for UDP packets. 6147 * The caller puts <fport, lport> in the ports parameter. 6148 * 6149 * If SO_REUSEADDR is set all multicast and broadcast packets 6150 * will be delivered to all streams bound to the same port. 6151 * 6152 * Zones notes: 6153 * Multicast and broadcast packets will be distributed to streams in all zones. 6154 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an 6155 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 6156 * packets. To maintain this behavior with multiple zones, the conns are grouped 6157 * by zone and the SO_REUSEADDR flag is checked for the first matching conn in 6158 * each zone. If unset, all the following conns in the same zone are skipped. 
6159 */ 6160 static void 6161 ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 6162 uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, 6163 boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) 6164 { 6165 uint32_t dstport, srcport; 6166 ipaddr_t dst; 6167 mblk_t *first_mp; 6168 boolean_t secure; 6169 in6_addr_t v6src; 6170 conn_t *connp; 6171 connf_t *connfp; 6172 conn_t *first_connp; 6173 conn_t *next_connp; 6174 mblk_t *mp1, *first_mp1; 6175 ipaddr_t src; 6176 zoneid_t last_zoneid; 6177 boolean_t reuseaddr; 6178 boolean_t shared_addr; 6179 6180 first_mp = mp; 6181 if (mctl_present) { 6182 mp = first_mp->b_cont; 6183 first_mp->b_cont = NULL; 6184 secure = ipsec_in_is_secure(first_mp); 6185 ASSERT(mp != NULL); 6186 } else { 6187 first_mp = NULL; 6188 secure = B_FALSE; 6189 } 6190 6191 /* Extract ports in net byte order */ 6192 dstport = htons(ntohl(ports) & 0xFFFF); 6193 srcport = htons(ntohl(ports) >> 16); 6194 dst = ipha->ipha_dst; 6195 src = ipha->ipha_src; 6196 6197 shared_addr = (zoneid == ALL_ZONES); 6198 if (shared_addr) { 6199 zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); 6200 if (zoneid == ALL_ZONES) 6201 zoneid = tsol_packet_to_zoneid(mp); 6202 } 6203 6204 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)]; 6205 mutex_enter(&connfp->connf_lock); 6206 connp = connfp->connf_head; 6207 if (!broadcast && !CLASSD(dst)) { 6208 /* 6209 * Not broadcast or multicast. Send to the one (first) 6210 * client we find. No need to check conn_wantpacket() 6211 * since IP_BOUND_IF/conn_incoming_ill does not apply to 6212 * IPv4 unicast packets. 
6213 */ 6214 while ((connp != NULL) && 6215 (!IPCL_UDP_MATCH(connp, dstport, dst, 6216 srcport, src) || connp->conn_zoneid != zoneid)) { 6217 connp = connp->conn_next; 6218 } 6219 6220 if (connp == NULL || connp->conn_upq == NULL) 6221 goto notfound; 6222 6223 if (is_system_labeled() && 6224 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6225 connp)) 6226 goto notfound; 6227 6228 CONN_INC_REF(connp); 6229 mutex_exit(&connfp->connf_lock); 6230 ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, 6231 recv_ill, ip_policy); 6232 IP_STAT(ip_udp_fannorm); 6233 CONN_DEC_REF(connp); 6234 return; 6235 } 6236 6237 /* 6238 * Broadcast and multicast case 6239 * 6240 * Need to check conn_wantpacket(). 6241 * If SO_REUSEADDR has been set on the first we send the 6242 * packet to all clients that have joined the group and 6243 * match the port. 6244 */ 6245 6246 while (connp != NULL) { 6247 if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && 6248 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 6249 (!is_system_labeled() || 6250 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6251 connp))) 6252 break; 6253 connp = connp->conn_next; 6254 } 6255 6256 if (connp == NULL || connp->conn_upq == NULL) 6257 goto notfound; 6258 6259 first_connp = connp; 6260 /* 6261 * When SO_REUSEADDR is not set, send the packet only to the first 6262 * matching connection in its zone by keeping track of the zoneid. 
6263 */ 6264 reuseaddr = first_connp->conn_reuseaddr; 6265 last_zoneid = first_connp->conn_zoneid; 6266 6267 CONN_INC_REF(connp); 6268 connp = connp->conn_next; 6269 for (;;) { 6270 while (connp != NULL) { 6271 if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && 6272 (reuseaddr || connp->conn_zoneid != last_zoneid) && 6273 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 6274 (!is_system_labeled() || 6275 tsol_receive_local(mp, &dst, IPV4_VERSION, 6276 shared_addr, connp))) 6277 break; 6278 connp = connp->conn_next; 6279 } 6280 /* 6281 * Just copy the data part alone. The mctl part is 6282 * needed just for verifying policy and it is never 6283 * sent up. 6284 */ 6285 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 6286 ((mp1 = copymsg(mp)) == NULL))) { 6287 /* 6288 * No more interested clients or memory 6289 * allocation failed 6290 */ 6291 connp = first_connp; 6292 break; 6293 } 6294 if (connp->conn_zoneid != last_zoneid) { 6295 /* 6296 * Update the zoneid so that the packet isn't sent to 6297 * any more conns in the same zone unless SO_REUSEADDR 6298 * is set. 6299 */ 6300 reuseaddr = connp->conn_reuseaddr; 6301 last_zoneid = connp->conn_zoneid; 6302 } 6303 if (first_mp != NULL) { 6304 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 6305 ipsec_info_type == IPSEC_IN); 6306 first_mp1 = ipsec_in_tag(first_mp, NULL); 6307 if (first_mp1 == NULL) { 6308 freemsg(mp1); 6309 connp = first_connp; 6310 break; 6311 } 6312 } else { 6313 first_mp1 = NULL; 6314 } 6315 CONN_INC_REF(connp); 6316 mutex_exit(&connfp->connf_lock); 6317 /* 6318 * IPQoS notes: We don't send the packet for policy 6319 * processing here, will do it for the last one (below). 6320 * i.e. we do it per-packet now, but if we do policy 6321 * processing per-conn, then we would need to do it 6322 * here too. 
6323 */ 6324 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, 6325 ipha, flags, recv_ill, B_FALSE); 6326 mutex_enter(&connfp->connf_lock); 6327 /* Follow the next pointer before releasing the conn. */ 6328 next_connp = connp->conn_next; 6329 IP_STAT(ip_udp_fanmb); 6330 CONN_DEC_REF(connp); 6331 connp = next_connp; 6332 } 6333 6334 /* Last one. Send it upstream. */ 6335 mutex_exit(&connfp->connf_lock); 6336 ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill, 6337 ip_policy); 6338 IP_STAT(ip_udp_fanmb); 6339 CONN_DEC_REF(connp); 6340 return; 6341 6342 notfound: 6343 6344 mutex_exit(&connfp->connf_lock); 6345 IP_STAT(ip_udp_fanothers); 6346 /* 6347 * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses 6348 * have already been matched above, since they live in the IPv4 6349 * fanout tables. This implies we only need to 6350 * check for IPv6 in6addr_any endpoints here. 6351 * Thus we compare using ipv6_all_zeros instead of the destination 6352 * address, except for the multicast group membership lookup which 6353 * uses the IPv4 destination. 6354 */ 6355 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 6356 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(dstport)]; 6357 mutex_enter(&connfp->connf_lock); 6358 connp = connfp->connf_head; 6359 if (!broadcast && !CLASSD(dst)) { 6360 while (connp != NULL) { 6361 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 6362 srcport, v6src) && connp->conn_zoneid == zoneid && 6363 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 6364 !connp->conn_ipv6_v6only) 6365 break; 6366 connp = connp->conn_next; 6367 } 6368 6369 if (connp != NULL && is_system_labeled() && 6370 !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6371 connp)) 6372 connp = NULL; 6373 6374 if (connp == NULL || connp->conn_upq == NULL) { 6375 /* 6376 * No one bound to this port. Is 6377 * there a client that wants all 6378 * unclaimed datagrams? 
6379 */ 6380 mutex_exit(&connfp->connf_lock); 6381 6382 if (mctl_present) 6383 first_mp->b_cont = mp; 6384 else 6385 first_mp = mp; 6386 if (ipcl_proto_search(IPPROTO_UDP) != NULL) { 6387 ip_fanout_proto(q, first_mp, ill, ipha, 6388 flags | IP_FF_RAWIP, mctl_present, 6389 ip_policy, recv_ill, zoneid); 6390 } else { 6391 if (ip_fanout_send_icmp(q, first_mp, flags, 6392 ICMP_DEST_UNREACHABLE, 6393 ICMP_PORT_UNREACHABLE, 6394 mctl_present, zoneid)) { 6395 BUMP_MIB(&ip_mib, udpNoPorts); 6396 } 6397 } 6398 return; 6399 } 6400 6401 CONN_INC_REF(connp); 6402 mutex_exit(&connfp->connf_lock); 6403 ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, 6404 recv_ill, ip_policy); 6405 CONN_DEC_REF(connp); 6406 return; 6407 } 6408 /* 6409 * IPv4 multicast packet being delivered to an AF_INET6 6410 * in6addr_any endpoint. 6411 * Need to check conn_wantpacket(). Note that we use conn_wantpacket() 6412 * and not conn_wantpacket_v6() since any multicast membership is 6413 * for an IPv4-mapped multicast address. 6414 * The packet is sent to all clients in all zones that have joined the 6415 * group and match the port. 6416 */ 6417 while (connp != NULL) { 6418 if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, 6419 srcport, v6src) && 6420 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 6421 (!is_system_labeled() || 6422 tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, 6423 connp))) 6424 break; 6425 connp = connp->conn_next; 6426 } 6427 6428 if (connp == NULL || connp->conn_upq == NULL) { 6429 /* 6430 * No one bound to this port. Is 6431 * there a client that wants all 6432 * unclaimed datagrams? 
6433 */ 6434 mutex_exit(&connfp->connf_lock); 6435 6436 if (mctl_present) 6437 first_mp->b_cont = mp; 6438 else 6439 first_mp = mp; 6440 if (ipcl_proto_search(IPPROTO_UDP) != NULL) { 6441 ip_fanout_proto(q, first_mp, ill, ipha, 6442 flags | IP_FF_RAWIP, mctl_present, ip_policy, 6443 recv_ill, zoneid); 6444 } else { 6445 /* 6446 * We used to attempt to send an icmp error here, but 6447 * since this is known to be a multicast packet 6448 * and we don't send icmp errors in response to 6449 * multicast, just drop the packet and give up sooner. 6450 */ 6451 BUMP_MIB(&ip_mib, udpNoPorts); 6452 freemsg(first_mp); 6453 } 6454 return; 6455 } 6456 6457 first_connp = connp; 6458 6459 CONN_INC_REF(connp); 6460 connp = connp->conn_next; 6461 for (;;) { 6462 while (connp != NULL) { 6463 if (IPCL_UDP_MATCH_V6(connp, dstport, 6464 ipv6_all_zeros, srcport, v6src) && 6465 conn_wantpacket(connp, ill, ipha, flags, zoneid) && 6466 (!is_system_labeled() || 6467 tsol_receive_local(mp, &dst, IPV4_VERSION, 6468 shared_addr, connp))) 6469 break; 6470 connp = connp->conn_next; 6471 } 6472 /* 6473 * Just copy the data part alone. The mctl part is 6474 * needed just for verifying policy and it is never 6475 * sent up. 6476 */ 6477 if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && 6478 ((mp1 = copymsg(mp)) == NULL))) { 6479 /* 6480 * No more intested clients or memory 6481 * allocation failed 6482 */ 6483 connp = first_connp; 6484 break; 6485 } 6486 if (first_mp != NULL) { 6487 ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> 6488 ipsec_info_type == IPSEC_IN); 6489 first_mp1 = ipsec_in_tag(first_mp, NULL); 6490 if (first_mp1 == NULL) { 6491 freemsg(mp1); 6492 connp = first_connp; 6493 break; 6494 } 6495 } else { 6496 first_mp1 = NULL; 6497 } 6498 CONN_INC_REF(connp); 6499 mutex_exit(&connfp->connf_lock); 6500 /* 6501 * IPQoS notes: We don't send the packet for policy 6502 * processing here, will do it for the last one (below). 6503 * i.e. 
we do it per-packet now, but if we do policy 6504 * processing per-conn, then we would need to do it 6505 * here too. 6506 */ 6507 ip_fanout_udp_conn(connp, first_mp1, mp1, secure, 6508 ipha, flags, recv_ill, B_FALSE); 6509 mutex_enter(&connfp->connf_lock); 6510 /* Follow the next pointer before releasing the conn. */ 6511 next_connp = connp->conn_next; 6512 CONN_DEC_REF(connp); 6513 connp = next_connp; 6514 } 6515 6516 /* Last one. Send it upstream. */ 6517 mutex_exit(&connfp->connf_lock); 6518 ip_fanout_udp_conn(connp, first_mp, mp, secure, ipha, flags, recv_ill, 6519 ip_policy); 6520 CONN_DEC_REF(connp); 6521 } 6522 6523 /* 6524 * Complete the ip_wput header so that it 6525 * is possible to generate ICMP 6526 * errors. 6527 */ 6528 static int 6529 ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid) 6530 { 6531 ire_t *ire; 6532 6533 if (ipha->ipha_src == INADDR_ANY) { 6534 ire = ire_lookup_local(zoneid); 6535 if (ire == NULL) { 6536 ip1dbg(("ip_hdr_complete: no source IRE\n")); 6537 return (1); 6538 } 6539 ipha->ipha_src = ire->ire_addr; 6540 ire_refrele(ire); 6541 } 6542 ipha->ipha_ttl = ip_def_ttl; 6543 ipha->ipha_hdr_checksum = 0; 6544 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 6545 return (0); 6546 } 6547 6548 /* 6549 * Nobody should be sending 6550 * packets up this stream 6551 */ 6552 static void 6553 ip_lrput(queue_t *q, mblk_t *mp) 6554 { 6555 mblk_t *mp1; 6556 6557 switch (mp->b_datap->db_type) { 6558 case M_FLUSH: 6559 /* Turn around */ 6560 if (*mp->b_rptr & FLUSHW) { 6561 *mp->b_rptr &= ~FLUSHR; 6562 qreply(q, mp); 6563 return; 6564 } 6565 break; 6566 } 6567 /* Could receive messages that passed through ar_rput */ 6568 for (mp1 = mp; mp1; mp1 = mp1->b_cont) 6569 mp1->b_prev = mp1->b_next = NULL; 6570 freemsg(mp); 6571 } 6572 6573 /* Nobody should be sending packets down this stream */ 6574 /* ARGSUSED */ 6575 void 6576 ip_lwput(queue_t *q, mblk_t *mp) 6577 { 6578 freemsg(mp); 6579 } 6580 6581 /* 6582 * Move the first hop in any source route to ipha_dst 
and remove that part of
 * the source route.  Called by other protocols.  Errors in option formatting
 * are ignored - they will be handled by ip_wput_options.  Returns the final
 * destination (either ipha_dst or the last entry in a source route).
 */
ipaddr_t
ip_massage_options(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	uint8_t		off;	/* byte offset of the next hop within opt */
	ipaddr_t	dst;
	int		i;
	ire_t		*ire;

	ip2dbg(("ip_massage_options\n"));
	dst = ipha->ipha_dst;
	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		opt = opts.ipoptp_cur;
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
				ip1dbg(("ip_massage_options: bad src route\n"));
				break;
			}
			optlen = opts.ipoptp_len;
			/* The option's pointer field is 1-based. */
			off = opt[IPOPT_OFFSET];
			off--;
		redo_srr:
			if (optlen < IP_ADDR_LEN ||
			    off > optlen - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_massage_options: end of SR\n"));
				break;
			}
			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
			ip1dbg(("ip_massage_options: next hop 0x%x\n",
			    ntohl(dst)));
			/*
			 * Check if our address is present more than
			 * once as consecutive hops in source route.
			 * XXX verify per-interface ip_forwarding
			 * for source route?
			 */
			ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (ire != NULL) {
				/* One of our own addresses; skip this hop. */
				ire_refrele(ire);
				off += IP_ADDR_LEN;
				goto redo_srr;
			}
			if (dst == htonl(INADDR_LOOPBACK)) {
				ip1dbg(("ip_massage_options: loopback addr in "
				    "source route!\n"));
				break;
			}
			/*
			 * Update ipha_dst to be the first hop and remove the
			 * first hop from the source route (by overwriting
			 * part of the option with NOP options).
			 */
			ipha->ipha_dst = dst;
			/* Put the last entry in dst */
			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
			    3;
			bcopy(&opt[off], &dst, IP_ADDR_LEN);

			ip1dbg(("ip_massage_options: last hop 0x%x\n",
			    ntohl(dst)));
			/* Move down and overwrite */
			opt[IP_ADDR_LEN] = opt[0];
			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
			for (i = 0; i < IP_ADDR_LEN; i++)
				opt[i] = IPOPT_NOP;
			break;
		}
	}
	return (dst);
}

/*
 * This function's job is to forward data to the reverse tunnel (FA->HA)
 * after doing a few checks. It is assumed that the incoming interface
 * of the packet is always different than the outgoing interface and the
 * ire_type of the found ire has to be a non-resolver type.
 *
 * IPQoS notes
 * IP policy is invoked twice for a forwarded packet, once on the read side
 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are
 * enabled.
6678 */ 6679 static void 6680 ip_mrtun_forward(ire_t *ire, ill_t *in_ill, mblk_t *mp) 6681 { 6682 ipha_t *ipha; 6683 queue_t *q; 6684 uint32_t pkt_len; 6685 #define rptr ((uchar_t *)ipha) 6686 uint32_t sum; 6687 uint32_t max_frag; 6688 mblk_t *first_mp; 6689 uint32_t ill_index; 6690 6691 ASSERT(ire != NULL); 6692 ASSERT(ire->ire_ipif->ipif_net_type == IRE_IF_NORESOLVER); 6693 ASSERT(ire->ire_stq != NULL); 6694 6695 /* Initiate read side IPPF processing */ 6696 if (IPP_ENABLED(IPP_FWD_IN)) { 6697 ill_index = in_ill->ill_phyint->phyint_ifindex; 6698 ip_process(IPP_FWD_IN, &mp, ill_index); 6699 if (mp == NULL) { 6700 ip2dbg(("ip_mrtun_forward: inbound pkt " 6701 "dropped during IPPF processing\n")); 6702 return; 6703 } 6704 } 6705 6706 if (((in_ill->ill_flags & ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & 6707 ILLF_ROUTER) == 0) || 6708 (in_ill == (ill_t *)ire->ire_stq->q_ptr)) { 6709 BUMP_MIB(&ip_mib, ipForwProhibits); 6710 ip0dbg(("ip_mrtun_forward: Can't forward :" 6711 "forwarding is not turned on\n")); 6712 goto drop_pkt; 6713 } 6714 6715 /* 6716 * Don't forward if the interface is down 6717 */ 6718 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 6719 BUMP_MIB(&ip_mib, ipInDiscards); 6720 goto drop_pkt; 6721 } 6722 6723 ipha = (ipha_t *)mp->b_rptr; 6724 pkt_len = ntohs(ipha->ipha_length); 6725 /* Adjust the checksum to reflect the ttl decrement. 
*/ 6726 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 6727 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 6728 if (ipha->ipha_ttl-- <= 1) { 6729 if (ip_csum_hdr(ipha)) { 6730 BUMP_MIB(&ip_mib, ipInCksumErrs); 6731 goto drop_pkt; 6732 } 6733 q = ire->ire_stq; 6734 if ((first_mp = allocb(sizeof (ipsec_info_t), 6735 BPRI_HI)) == NULL) { 6736 goto drop_pkt; 6737 } 6738 ip_ipsec_out_prepend(first_mp, mp, in_ill); 6739 icmp_time_exceeded(q, first_mp, ICMP_TTL_EXCEEDED); 6740 6741 return; 6742 } 6743 6744 /* Get the ill_index of the ILL */ 6745 ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; 6746 6747 /* 6748 * ip_mrtun_forward is only used by foreign agent to reverse 6749 * tunnel the incoming packet. So it does not do any option 6750 * processing for source routing. 6751 */ 6752 max_frag = ire->ire_max_frag; 6753 if (pkt_len > max_frag) { 6754 /* 6755 * It needs fragging on its way out. We haven't 6756 * verified the header checksum yet. Since we 6757 * are going to put a surely good checksum in the 6758 * outgoing header, we have to make sure that it 6759 * was good coming in. 
6760 */ 6761 if (ip_csum_hdr(ipha)) { 6762 BUMP_MIB(&ip_mib, ipInCksumErrs); 6763 goto drop_pkt; 6764 } 6765 6766 /* Initiate write side IPPF processing */ 6767 if (IPP_ENABLED(IPP_FWD_OUT)) { 6768 ip_process(IPP_FWD_OUT, &mp, ill_index); 6769 if (mp == NULL) { 6770 ip2dbg(("ip_mrtun_forward: outbound pkt "\ 6771 "dropped/deferred during ip policy "\ 6772 "processing\n")); 6773 return; 6774 } 6775 } 6776 if ((first_mp = allocb(sizeof (ipsec_info_t), 6777 BPRI_HI)) == NULL) { 6778 goto drop_pkt; 6779 } 6780 ip_ipsec_out_prepend(first_mp, mp, in_ill); 6781 mp = first_mp; 6782 6783 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0); 6784 return; 6785 } 6786 6787 ip2dbg(("ip_mrtun_forward: ire type (%d)\n", ire->ire_type)); 6788 6789 ASSERT(ire->ire_ipif != NULL); 6790 6791 mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index); 6792 if (mp == NULL) { 6793 BUMP_MIB(&ip_mib, ipInDiscards); 6794 return; 6795 } 6796 6797 /* Now send the packet to the tunnel interface */ 6798 q = ire->ire_stq; 6799 UPDATE_IB_PKT_COUNT(ire); 6800 ire->ire_last_used_time = lbolt; 6801 BUMP_MIB(&ip_mib, ipForwDatagrams); 6802 putnext(q, mp); 6803 ip2dbg(("ip_mrtun_forward: sent packet to ill %p\n", q->q_ptr)); 6804 return; 6805 6806 drop_pkt:; 6807 ip2dbg(("ip_mrtun_forward: dropping pkt\n")); 6808 freemsg(mp); 6809 #undef rptr 6810 } 6811 6812 /* 6813 * Fills the ipsec_out_t data structure with appropriate fields and 6814 * prepends it to mp which contains the IP hdr + data that was meant 6815 * to be forwarded. Please note that ipsec_out_info data structure 6816 * is used here to communicate the outgoing ill path at ip_wput() 6817 * for the ICMP error packet. This has nothing to do with ipsec IP 6818 * security. ipsec_out_t is really used to pass the info to the module 6819 * IP where this information cannot be extracted from conn. 6820 * This functions is called by ip_mrtun_forward(). 
6821 */ 6822 void 6823 ip_ipsec_out_prepend(mblk_t *first_mp, mblk_t *mp, ill_t *xmit_ill) 6824 { 6825 ipsec_out_t *io; 6826 6827 ASSERT(xmit_ill != NULL); 6828 first_mp->b_datap->db_type = M_CTL; 6829 first_mp->b_wptr += sizeof (ipsec_info_t); 6830 /* 6831 * This is to pass info to ip_wput in absence of conn. 6832 * ipsec_out_secure will be B_FALSE because of this. 6833 * Thus ipsec_out_secure being B_FALSE indicates that 6834 * this is not IPSEC security related information. 6835 */ 6836 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 6837 io = (ipsec_out_t *)first_mp->b_rptr; 6838 io->ipsec_out_type = IPSEC_OUT; 6839 io->ipsec_out_len = sizeof (ipsec_out_t); 6840 first_mp->b_cont = mp; 6841 io->ipsec_out_ill_index = 6842 xmit_ill->ill_phyint->phyint_ifindex; 6843 io->ipsec_out_xmit_if = B_TRUE; 6844 } 6845 6846 /* 6847 * Return the network mask 6848 * associated with the specified address. 6849 */ 6850 ipaddr_t 6851 ip_net_mask(ipaddr_t addr) 6852 { 6853 uchar_t *up = (uchar_t *)&addr; 6854 ipaddr_t mask = 0; 6855 uchar_t *maskp = (uchar_t *)&mask; 6856 6857 #if defined(__i386) || defined(__amd64) 6858 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER 6859 #endif 6860 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER 6861 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0; 6862 #endif 6863 if (CLASSD(addr)) { 6864 maskp[0] = 0xF0; 6865 return (mask); 6866 } 6867 if (addr == 0) 6868 return (0); 6869 maskp[0] = 0xFF; 6870 if ((up[0] & 0x80) == 0) 6871 return (mask); 6872 6873 maskp[1] = 0xFF; 6874 if ((up[0] & 0xC0) == 0x80) 6875 return (mask); 6876 6877 maskp[2] = 0xFF; 6878 if ((up[0] & 0xE0) == 0xC0) 6879 return (mask); 6880 6881 /* Must be experimental or multicast, indicate as much */ 6882 return ((ipaddr_t)0); 6883 } 6884 6885 /* 6886 * Select an ill for the packet by considering load spreading across 6887 * a different ill in the group if dst_ill is part of some group. 
6888 */ 6889 static ill_t * 6890 ip_newroute_get_dst_ill(ill_t *dst_ill) 6891 { 6892 ill_t *ill; 6893 6894 /* 6895 * We schedule irrespective of whether the source address is 6896 * INADDR_ANY or not. illgrp_scheduler returns a held ill. 6897 */ 6898 ill = illgrp_scheduler(dst_ill); 6899 if (ill == NULL) 6900 return (NULL); 6901 6902 /* 6903 * For groups with names ip_sioctl_groupname ensures that all 6904 * ills are of same type. For groups without names, ifgrp_insert 6905 * ensures this. 6906 */ 6907 ASSERT(dst_ill->ill_type == ill->ill_type); 6908 6909 return (ill); 6910 } 6911 6912 /* 6913 * Helper function for the IPIF_NOFAILOVER/ATTACH_IF interface attachment case. 6914 */ 6915 ill_t * 6916 ip_grab_attach_ill(ill_t *ill, mblk_t *first_mp, int ifindex, boolean_t isv6) 6917 { 6918 ill_t *ret_ill; 6919 6920 ASSERT(ifindex != 0); 6921 ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL); 6922 if (ret_ill == NULL || 6923 (ret_ill->ill_phyint->phyint_flags & PHYI_OFFLINE)) { 6924 if (isv6) { 6925 if (ill != NULL) { 6926 BUMP_MIB(ill->ill_ip6_mib, ipv6OutDiscards); 6927 } else { 6928 BUMP_MIB(&ip6_mib, ipv6OutDiscards); 6929 } 6930 ip1dbg(("ip_grab_attach_ill (IPv6): " 6931 "bad ifindex %d.\n", ifindex)); 6932 } else { 6933 BUMP_MIB(&ip_mib, ipOutDiscards); 6934 ip1dbg(("ip_grab_attach_ill (IPv4): " 6935 "bad ifindex %d.\n", ifindex)); 6936 } 6937 if (ret_ill != NULL) 6938 ill_refrele(ret_ill); 6939 freemsg(first_mp); 6940 return (NULL); 6941 } 6942 6943 return (ret_ill); 6944 } 6945 6946 /* 6947 * IPv4 - 6948 * ip_newroute is called by ip_rput or ip_wput whenever we need to send 6949 * out a packet to a destination address for which we do not have specific 6950 * (or sufficient) routing information. 6951 * 6952 * NOTE : These are the scopes of some of the variables that point at IRE, 6953 * which needs to be followed while making any future modifications 6954 * to avoid memory leaks. 
6955 * 6956 * - ire and sire are the entries looked up initially by 6957 * ire_ftable_lookup. 6958 * - ipif_ire is used to hold the interface ire associated with 6959 * the new cache ire. But it's scope is limited, so we always REFRELE 6960 * it before branching out to error paths. 6961 * - save_ire is initialized before ire_create, so that ire returned 6962 * by ire_create will not over-write the ire. We REFRELE save_ire 6963 * before breaking out of the switch. 6964 * 6965 * Thus on failures, we have to REFRELE only ire and sire, if they 6966 * are not NULL. 6967 */ 6968 void 6969 ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp) 6970 { 6971 areq_t *areq; 6972 ipaddr_t gw = 0; 6973 ire_t *ire = NULL; 6974 mblk_t *res_mp; 6975 ipaddr_t *addrp; 6976 ipaddr_t nexthop_addr; 6977 ipif_t *src_ipif = NULL; 6978 ill_t *dst_ill = NULL; 6979 ipha_t *ipha; 6980 ire_t *sire = NULL; 6981 mblk_t *first_mp; 6982 ire_t *save_ire; 6983 mblk_t *dlureq_mp; 6984 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER address */ 6985 ushort_t ire_marks = 0; 6986 boolean_t mctl_present; 6987 ipsec_out_t *io; 6988 mblk_t *saved_mp; 6989 ire_t *first_sire = NULL; 6990 mblk_t *copy_mp = NULL; 6991 mblk_t *xmit_mp = NULL; 6992 ipaddr_t save_dst; 6993 uint32_t multirt_flags = 6994 MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; 6995 boolean_t multirt_is_resolvable; 6996 boolean_t multirt_resolve_next; 6997 boolean_t do_attach_ill = B_FALSE; 6998 boolean_t ip_nexthop = B_FALSE; 6999 zoneid_t zoneid; 7000 tsol_ire_gw_secattr_t *attrp = NULL; 7001 tsol_gcgrp_t *gcgrp = NULL; 7002 tsol_gcgrp_addr_t ga; 7003 7004 if (ip_debug > 2) { 7005 /* ip1dbg */ 7006 pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); 7007 } 7008 7009 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 7010 if (mctl_present) { 7011 io = (ipsec_out_t *)first_mp->b_rptr; 7012 zoneid = io->ipsec_out_zoneid; 7013 ASSERT(zoneid != ALL_ZONES); 7014 } else if (connp != NULL) { 7015 zoneid = 
connp->conn_zoneid; 7016 } else { 7017 zoneid = GLOBAL_ZONEID; 7018 } 7019 7020 ipha = (ipha_t *)mp->b_rptr; 7021 7022 /* All multicast lookups come through ip_newroute_ipif() */ 7023 if (CLASSD(dst)) { 7024 ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", 7025 ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); 7026 freemsg(first_mp); 7027 return; 7028 } 7029 7030 if (ip_loopback_src_or_dst(ipha, NULL)) { 7031 goto icmp_err_ret; 7032 } 7033 7034 if (mctl_present && io->ipsec_out_attach_if) { 7035 /* ip_grab_attach_ill returns a held ill */ 7036 attach_ill = ip_grab_attach_ill(NULL, first_mp, 7037 io->ipsec_out_ill_index, B_FALSE); 7038 7039 /* Failure case frees things for us. */ 7040 if (attach_ill == NULL) 7041 return; 7042 7043 /* 7044 * Check if we need an ire that will not be 7045 * looked up by anybody else i.e. HIDDEN. 7046 */ 7047 if (ill_is_probeonly(attach_ill)) 7048 ire_marks = IRE_MARK_HIDDEN; 7049 } 7050 if (mctl_present && io->ipsec_out_ip_nexthop) { 7051 ip_nexthop = B_TRUE; 7052 nexthop_addr = io->ipsec_out_nexthop_addr; 7053 } 7054 /* 7055 * If this IRE is created for forwarding or it is not for 7056 * traffic for congestion controlled protocols, mark it as temporary. 7057 */ 7058 if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) 7059 ire_marks |= IRE_MARK_TEMPORARY; 7060 7061 /* 7062 * Get what we can from ire_ftable_lookup which will follow an IRE 7063 * chain until it gets the most specific information available. 7064 * For example, we know that there is no IRE_CACHE for this dest, 7065 * but there may be an IRE_OFFSUBNET which specifies a gateway. 7066 * ire_ftable_lookup will look up the gateway, etc. 7067 * Check if in_ill != NULL. If it is true, the packet must be 7068 * from an incoming interface where RTA_SRCIFP is set. 7069 * Otherwise, given ire_ftable_lookup algorithm, only one among routes 7070 * to the destination, of equal netmask length in the forward table, 7071 * will be recursively explored. 
If no information is available 7072 * for the final gateway of that route, we force the returned ire 7073 * to be equal to sire using MATCH_IRE_PARENT. 7074 * At least, in this case we have a starting point (in the buckets) 7075 * to look for other routes to the destination in the forward table. 7076 * This is actually used only for multirouting, where a list 7077 * of routes has to be processed in sequence. 7078 */ 7079 if (in_ill != NULL) { 7080 ire = ire_srcif_table_lookup(dst, IRE_IF_RESOLVER, NULL, 7081 in_ill, MATCH_IRE_TYPE); 7082 } else if (ip_nexthop) { 7083 /* 7084 * The first time we come here, we look for an IRE_INTERFACE 7085 * entry for the specified nexthop, set the dst to be the 7086 * nexthop address and create an IRE_CACHE entry for the 7087 * nexthop. The next time around, we are able to find an 7088 * IRE_CACHE entry for the nexthop, set the gateway to be the 7089 * nexthop address and create an IRE_CACHE entry for the 7090 * destination address via the specified nexthop. 7091 */ 7092 ire = ire_cache_lookup(nexthop_addr, zoneid, 7093 MBLK_GETLABEL(mp)); 7094 if (ire != NULL) { 7095 gw = nexthop_addr; 7096 ire_marks |= IRE_MARK_PRIVATE_ADDR; 7097 } else { 7098 ire = ire_ftable_lookup(nexthop_addr, 0, 0, 7099 IRE_INTERFACE, NULL, NULL, zoneid, 0, 7100 MBLK_GETLABEL(mp), 7101 MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 7102 if (ire != NULL) { 7103 dst = nexthop_addr; 7104 } 7105 } 7106 } else if (attach_ill == NULL) { 7107 ire = ire_ftable_lookup(dst, 0, 0, 0, 7108 NULL, &sire, zoneid, 0, MBLK_GETLABEL(mp), 7109 MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 7110 MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | 7111 MATCH_IRE_SECATTR); 7112 } else { 7113 /* 7114 * attach_ill is set only for communicating with 7115 * on-link hosts. So, don't look for DEFAULT. 
7116 */ 7117 ipif_t *attach_ipif; 7118 7119 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 7120 if (attach_ipif == NULL) { 7121 ill_refrele(attach_ill); 7122 goto icmp_err_ret; 7123 } 7124 ire = ire_ftable_lookup(dst, 0, 0, 0, attach_ipif, 7125 &sire, zoneid, 0, MBLK_GETLABEL(mp), 7126 MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL | 7127 MATCH_IRE_SECATTR); 7128 ipif_refrele(attach_ipif); 7129 } 7130 ip3dbg(("ip_newroute: ire_ftable_lookup() " 7131 "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); 7132 7133 /* 7134 * This loop is run only once in most cases. 7135 * We loop to resolve further routes only when the destination 7136 * can be reached through multiple RTF_MULTIRT-flagged ires. 7137 */ 7138 do { 7139 /* Clear the previous iteration's values */ 7140 if (src_ipif != NULL) { 7141 ipif_refrele(src_ipif); 7142 src_ipif = NULL; 7143 } 7144 if (dst_ill != NULL) { 7145 ill_refrele(dst_ill); 7146 dst_ill = NULL; 7147 } 7148 7149 multirt_resolve_next = B_FALSE; 7150 /* 7151 * We check if packets have to be multirouted. 7152 * In this case, given the current <ire, sire> couple, 7153 * we look for the next suitable <ire, sire>. 7154 * This check is done in ire_multirt_lookup(), 7155 * which applies various criteria to find the next route 7156 * to resolve. ire_multirt_lookup() leaves <ire, sire> 7157 * unchanged if it detects it has not been tried yet. 
7158 */ 7159 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7160 ip3dbg(("ip_newroute: starting next_resolution " 7161 "with first_mp %p, tag %d\n", 7162 (void *)first_mp, 7163 MULTIRT_DEBUG_TAGGED(first_mp))); 7164 7165 ASSERT(sire != NULL); 7166 multirt_is_resolvable = 7167 ire_multirt_lookup(&ire, &sire, multirt_flags, 7168 MBLK_GETLABEL(mp)); 7169 7170 ip3dbg(("ip_newroute: multirt_is_resolvable %d, " 7171 "ire %p, sire %p\n", 7172 multirt_is_resolvable, 7173 (void *)ire, (void *)sire)); 7174 7175 if (!multirt_is_resolvable) { 7176 /* 7177 * No more multirt route to resolve; give up 7178 * (all routes resolved or no more 7179 * resolvable routes). 7180 */ 7181 if (ire != NULL) { 7182 ire_refrele(ire); 7183 ire = NULL; 7184 } 7185 } else { 7186 ASSERT(sire != NULL); 7187 ASSERT(ire != NULL); 7188 /* 7189 * We simply use first_sire as a flag that 7190 * indicates if a resolvable multirt route 7191 * has already been found. 7192 * If it is not the case, we may have to send 7193 * an ICMP error to report that the 7194 * destination is unreachable. 7195 * We do not IRE_REFHOLD first_sire. 7196 */ 7197 if (first_sire == NULL) { 7198 first_sire = sire; 7199 } 7200 } 7201 } 7202 if (ire == NULL) { 7203 if (ip_debug > 3) { 7204 /* ip2dbg */ 7205 pr_addr_dbg("ip_newroute: " 7206 "can't resolve %s\n", AF_INET, &dst); 7207 } 7208 ip3dbg(("ip_newroute: " 7209 "ire %p, sire %p, first_sire %p\n", 7210 (void *)ire, (void *)sire, (void *)first_sire)); 7211 7212 if (sire != NULL) { 7213 ire_refrele(sire); 7214 sire = NULL; 7215 } 7216 7217 if (first_sire != NULL) { 7218 /* 7219 * At least one multirt route has been found 7220 * in the same call to ip_newroute(); 7221 * there is no need to report an ICMP error. 7222 * first_sire was not IRE_REFHOLDed. 
7223 */ 7224 MULTIRT_DEBUG_UNTAG(first_mp); 7225 freemsg(first_mp); 7226 return; 7227 } 7228 ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, 7229 RTA_DST); 7230 if (attach_ill != NULL) 7231 ill_refrele(attach_ill); 7232 goto icmp_err_ret; 7233 } 7234 7235 /* 7236 * When RTA_SRCIFP is used to add a route, then an interface 7237 * route is added in the source interface's routing table. 7238 * If the outgoing interface of this route is of type 7239 * IRE_IF_RESOLVER, then upon creation of the ire, 7240 * ire_dlureq_mp is set to NULL. Later, when this route is 7241 * first used for forwarding packet, ip_newroute() is called 7242 * to resolve the hardware address of the outgoing ipif. 7243 * We do not come here for IRE_IF_NORESOLVER entries in the 7244 * source interface based table. We only come here if the 7245 * outgoing interface is a resolver interface and we don't 7246 * have the ire_dlureq_mp information yet. 7247 * If in_ill is not null that means it is called from 7248 * ip_rput. 7249 */ 7250 7251 ASSERT(ire->ire_in_ill == NULL || 7252 (ire->ire_type == IRE_IF_RESOLVER && 7253 ire->ire_dlureq_mp == NULL)); 7254 7255 /* 7256 * Verify that the returned IRE does not have either 7257 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is 7258 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 7259 */ 7260 if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || 7261 (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { 7262 if (attach_ill != NULL) 7263 ill_refrele(attach_ill); 7264 goto icmp_err_ret; 7265 } 7266 /* 7267 * Increment the ire_ob_pkt_count field for ire if it is an 7268 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and 7269 * increment the same for the parent IRE, sire, if it is some 7270 * sort of prefix IRE (which includes DEFAULT, PREFIX, HOST 7271 * and HOST_REDIRECT). 
7272 */ 7273 if ((ire->ire_type & IRE_INTERFACE) != 0) { 7274 UPDATE_OB_PKT_COUNT(ire); 7275 ire->ire_last_used_time = lbolt; 7276 } 7277 7278 if (sire != NULL) { 7279 gw = sire->ire_gateway_addr; 7280 ASSERT((sire->ire_type & (IRE_CACHETABLE | 7281 IRE_INTERFACE)) == 0); 7282 UPDATE_OB_PKT_COUNT(sire); 7283 sire->ire_last_used_time = lbolt; 7284 } 7285 /* 7286 * We have a route to reach the destination. 7287 * 7288 * 1) If the interface is part of ill group, try to get a new 7289 * ill taking load spreading into account. 7290 * 7291 * 2) After selecting the ill, get a source address that 7292 * might create good inbound load spreading. 7293 * ipif_select_source does this for us. 7294 * 7295 * If the application specified the ill (ifindex), we still 7296 * load spread. Only if the packets needs to go out 7297 * specifically on a given ill e.g. binding to 7298 * IPIF_NOFAILOVER address, then we don't try to use a 7299 * different ill for load spreading. 7300 */ 7301 if (attach_ill == NULL) { 7302 /* 7303 * Don't perform outbound load spreading in the 7304 * case of an RTF_MULTIRT route, as we actually 7305 * typically want to replicate outgoing packets 7306 * through particular interfaces. 7307 */ 7308 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7309 dst_ill = ire->ire_ipif->ipif_ill; 7310 /* for uniformity */ 7311 ill_refhold(dst_ill); 7312 } else { 7313 /* 7314 * If we are here trying to create an IRE_CACHE 7315 * for an offlink destination and have the 7316 * IRE_CACHE for the next hop and the latter is 7317 * using virtual IP source address selection i.e 7318 * it's ire->ire_ipif is pointing to a virtual 7319 * network interface (vni) then 7320 * ip_newroute_get_dst_ll() will return the vni 7321 * interface as the dst_ill. Since the vni is 7322 * virtual i.e not associated with any physical 7323 * interface, it cannot be the dst_ill, hence 7324 * in such a case call ip_newroute_get_dst_ll() 7325 * with the stq_ill instead of the ire_ipif ILL. 
7326 * The function returns a refheld ill. 7327 */ 7328 if ((ire->ire_type == IRE_CACHE) && 7329 IS_VNI(ire->ire_ipif->ipif_ill)) 7330 dst_ill = ip_newroute_get_dst_ill( 7331 ire->ire_stq->q_ptr); 7332 else 7333 dst_ill = ip_newroute_get_dst_ill( 7334 ire->ire_ipif->ipif_ill); 7335 } 7336 if (dst_ill == NULL) { 7337 if (ip_debug > 2) { 7338 pr_addr_dbg("ip_newroute: " 7339 "no dst ill for dst" 7340 " %s\n", AF_INET, &dst); 7341 } 7342 goto icmp_err_ret; 7343 } 7344 } else { 7345 dst_ill = ire->ire_ipif->ipif_ill; 7346 /* for uniformity */ 7347 ill_refhold(dst_ill); 7348 /* 7349 * We should have found a route matching ill as we 7350 * called ire_ftable_lookup with MATCH_IRE_ILL. 7351 * Rather than asserting, when there is a mismatch, 7352 * we just drop the packet. 7353 */ 7354 if (dst_ill != attach_ill) { 7355 ip0dbg(("ip_newroute: Packet dropped as " 7356 "IPIF_NOFAILOVER ill is %s, " 7357 "ire->ire_ipif->ipif_ill is %s\n", 7358 attach_ill->ill_name, 7359 dst_ill->ill_name)); 7360 ill_refrele(attach_ill); 7361 goto icmp_err_ret; 7362 } 7363 } 7364 /* attach_ill can't go in loop. IPMP and CGTP are disjoint */ 7365 if (attach_ill != NULL) { 7366 ill_refrele(attach_ill); 7367 attach_ill = NULL; 7368 do_attach_ill = B_TRUE; 7369 } 7370 ASSERT(dst_ill != NULL); 7371 ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); 7372 7373 /* 7374 * Pick the best source address from dst_ill. 7375 * 7376 * 1) If it is part of a multipathing group, we would 7377 * like to spread the inbound packets across different 7378 * interfaces. ipif_select_source picks a random source 7379 * across the different ills in the group. 7380 * 7381 * 2) If it is not part of a multipathing group, we try 7382 * to pick the source address from the destination 7383 * route. Clustering assumes that when we have multiple 7384 * prefixes hosted on an interface, the prefix of the 7385 * source address matches the prefix of the destination 7386 * route. 
We do this only if the address is not 7387 * DEPRECATED. 7388 * 7389 * 3) If the conn is in a different zone than the ire, we 7390 * need to pick a source address from the right zone. 7391 * 7392 * NOTE : If we hit case (1) above, the prefix of the source 7393 * address picked may not match the prefix of the 7394 * destination routes prefix as ipif_select_source 7395 * does not look at "dst" while picking a source 7396 * address. 7397 * If we want the same behavior as (2), we will need 7398 * to change the behavior of ipif_select_source. 7399 */ 7400 ASSERT(src_ipif == NULL); 7401 if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { 7402 /* 7403 * The RTF_SETSRC flag is set in the parent ire (sire). 7404 * Check that the ipif matching the requested source 7405 * address still exists. 7406 */ 7407 src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, 7408 zoneid, NULL, NULL, NULL, NULL); 7409 } 7410 if (src_ipif == NULL) { 7411 ire_marks |= IRE_MARK_USESRC_CHECK; 7412 if ((dst_ill->ill_group != NULL) || 7413 (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || 7414 (connp != NULL && ire->ire_zoneid != zoneid && 7415 ire->ire_zoneid != ALL_ZONES) || 7416 (dst_ill->ill_usesrc_ifindex != 0)) { 7417 /* 7418 * If the destination is reachable via a 7419 * given gateway, the selected source address 7420 * should be in the same subnet as the gateway. 7421 * Otherwise, the destination is not reachable. 7422 * 7423 * If there are no interfaces on the same subnet 7424 * as the destination, ipif_select_source gives 7425 * first non-deprecated interface which might be 7426 * on a different subnet than the gateway. 7427 * This is not desirable. Hence pass the dst_ire 7428 * source address to ipif_select_source. 7429 * It is sure that the destination is reachable 7430 * with the dst_ire source address subnet. 
7431 * So passing dst_ire source address to 7432 * ipif_select_source will make sure that the 7433 * selected source will be on the same subnet 7434 * as dst_ire source address. 7435 */ 7436 ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; 7437 src_ipif = ipif_select_source(dst_ill, saddr, 7438 zoneid); 7439 if (src_ipif == NULL) { 7440 if (ip_debug > 2) { 7441 pr_addr_dbg("ip_newroute: " 7442 "no src for dst %s ", 7443 AF_INET, &dst); 7444 printf("through interface %s\n", 7445 dst_ill->ill_name); 7446 } 7447 goto icmp_err_ret; 7448 } 7449 } else { 7450 src_ipif = ire->ire_ipif; 7451 ASSERT(src_ipif != NULL); 7452 /* hold src_ipif for uniformity */ 7453 ipif_refhold(src_ipif); 7454 } 7455 } 7456 7457 /* 7458 * Assign a source address while we have the conn. 7459 * We can't have ip_wput_ire pick a source address when the 7460 * packet returns from arp since we need to look at 7461 * conn_unspec_src and conn_zoneid, and we lose the conn when 7462 * going through arp. 7463 * 7464 * NOTE : ip_newroute_v6 does not have this piece of code as 7465 * it uses ip6i to store this information. 7466 */ 7467 if (ipha->ipha_src == INADDR_ANY && 7468 (connp == NULL || !connp->conn_unspec_src)) { 7469 ipha->ipha_src = src_ipif->ipif_src_addr; 7470 } 7471 if (ip_debug > 3) { 7472 /* ip2dbg */ 7473 pr_addr_dbg("ip_newroute: first hop %s\n", 7474 AF_INET, &gw); 7475 } 7476 ip2dbg(("\tire type %s (%d)\n", 7477 ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); 7478 7479 /* 7480 * The TTL of multirouted packets is bounded by the 7481 * ip_multirt_ttl ndd variable. 
7482 */ 7483 if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { 7484 /* Force TTL of multirouted packets */ 7485 if ((ip_multirt_ttl > 0) && 7486 (ipha->ipha_ttl > ip_multirt_ttl)) { 7487 ip2dbg(("ip_newroute: forcing multirt TTL " 7488 "to %d (was %d), dst 0x%08x\n", 7489 ip_multirt_ttl, ipha->ipha_ttl, 7490 ntohl(sire->ire_addr))); 7491 ipha->ipha_ttl = ip_multirt_ttl; 7492 } 7493 } 7494 /* 7495 * At this point in ip_newroute(), ire is either the 7496 * IRE_CACHE of the next-hop gateway for an off-subnet 7497 * destination or an IRE_INTERFACE type that should be used 7498 * to resolve an on-subnet destination or an on-subnet 7499 * next-hop gateway. 7500 * 7501 * In the IRE_CACHE case, we have the following : 7502 * 7503 * 1) src_ipif - used for getting a source address. 7504 * 7505 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7506 * means packets using this IRE_CACHE will go out on 7507 * dst_ill. 7508 * 7509 * 3) The IRE sire will point to the prefix that is the 7510 * longest matching route for the destination. These 7511 * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST, 7512 * and IRE_HOST_REDIRECT. 7513 * 7514 * The newly created IRE_CACHE entry for the off-subnet 7515 * destination is tied to both the prefix route and the 7516 * interface route used to resolve the next-hop gateway 7517 * via the ire_phandle and ire_ihandle fields, 7518 * respectively. 7519 * 7520 * In the IRE_INTERFACE case, we have the following : 7521 * 7522 * 1) src_ipif - used for getting a source address. 7523 * 7524 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This 7525 * means packets using the IRE_CACHE that we will build 7526 * here will go out on dst_ill. 7527 * 7528 * 3) sire may or may not be NULL. But, the IRE_CACHE that is 7529 * to be created will only be tied to the IRE_INTERFACE 7530 * that was derived from the ire_ihandle field. 
7531 * 7532 * If sire is non-NULL, it means the destination is 7533 * off-link and we will first create the IRE_CACHE for the 7534 * gateway. Next time through ip_newroute, we will create 7535 * the IRE_CACHE for the final destination as described 7536 * above. 7537 * 7538 * In both cases, after the current resolution has been 7539 * completed (or possibly initialised, in the IRE_INTERFACE 7540 * case), the loop may be re-entered to attempt the resolution 7541 * of another RTF_MULTIRT route. 7542 * 7543 * When an IRE_CACHE entry for the off-subnet destination is 7544 * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, 7545 * for further processing in emission loops. 7546 */ 7547 save_ire = ire; 7548 switch (ire->ire_type) { 7549 case IRE_CACHE: { 7550 ire_t *ipif_ire; 7551 mblk_t *ire_fp_mp; 7552 7553 if (gw == 0) 7554 gw = ire->ire_gateway_addr; 7555 /* 7556 * We need 3 ire's to create a new cache ire for an 7557 * off-link destination from the cache ire of the 7558 * gateway. 7559 * 7560 * 1. The prefix ire 'sire' (Note that this does 7561 * not apply to the conn_nexthop_set case) 7562 * 2. The cache ire of the gateway 'ire' 7563 * 3. The interface ire 'ipif_ire' 7564 * 7565 * We have (1) and (2). We lookup (3) below. 7566 * 7567 * If there is no interface route to the gateway, 7568 * it is a race condition, where we found the cache 7569 * but the interface route has been deleted. 7570 */ 7571 if (ip_nexthop) { 7572 ipif_ire = ire_ihandle_lookup_onlink(ire); 7573 } else { 7574 ipif_ire = 7575 ire_ihandle_lookup_offlink(ire, sire); 7576 } 7577 if (ipif_ire == NULL) { 7578 ip1dbg(("ip_newroute: " 7579 "ire_ihandle_lookup_offlink failed\n")); 7580 goto icmp_err_ret; 7581 } 7582 /* 7583 * XXX We are using the same dlureq_mp 7584 * (DL_UNITDATA_REQ) though the save_ire is not 7585 * pointing at the same ill. 7586 * This is incorrect. We need to send it up to the 7587 * resolver to get the right dlureq_mp. 
For ethernets 7588 * this may be okay (ill_type == DL_ETHER). 7589 */ 7590 dlureq_mp = save_ire->ire_dlureq_mp; 7591 ire_fp_mp = NULL; 7592 /* 7593 * save_ire's ire_fp_mp can't change since it is 7594 * not an IRE_MIPRTUN or IRE_BROADCAST 7595 * LOCK_IRE_FP_MP does not do any useful work in 7596 * the case of IRE_CACHE. So we don't use it below. 7597 */ 7598 if (save_ire->ire_stq == dst_ill->ill_wq) 7599 ire_fp_mp = save_ire->ire_fp_mp; 7600 7601 /* 7602 * Check cached gateway IRE for any security 7603 * attributes; if found, associate the gateway 7604 * credentials group to the destination IRE. 7605 */ 7606 if ((attrp = save_ire->ire_gw_secattr) != NULL) { 7607 mutex_enter(&attrp->igsa_lock); 7608 if ((gcgrp = attrp->igsa_gcgrp) != NULL) 7609 GCGRP_REFHOLD(gcgrp); 7610 mutex_exit(&attrp->igsa_lock); 7611 } 7612 7613 ire = ire_create( 7614 (uchar_t *)&dst, /* dest address */ 7615 (uchar_t *)&ip_g_all_ones, /* mask */ 7616 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7617 (uchar_t *)&gw, /* gateway address */ 7618 NULL, 7619 &save_ire->ire_max_frag, 7620 ire_fp_mp, /* Fast Path header */ 7621 dst_ill->ill_rq, /* recv-from queue */ 7622 dst_ill->ill_wq, /* send-to queue */ 7623 IRE_CACHE, /* IRE type */ 7624 save_ire->ire_dlureq_mp, 7625 src_ipif, 7626 in_ill, /* incoming ill */ 7627 (sire != NULL) ? 7628 sire->ire_mask : 0, /* Parent mask */ 7629 (sire != NULL) ? 7630 sire->ire_phandle : 0, /* Parent handle */ 7631 ipif_ire->ire_ihandle, /* Interface handle */ 7632 (sire != NULL) ? (sire->ire_flags & 7633 (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ 7634 (sire != NULL) ? 
7635 &(sire->ire_uinfo) : &(save_ire->ire_uinfo), 7636 NULL, 7637 gcgrp); 7638 7639 if (ire == NULL) { 7640 if (gcgrp != NULL) { 7641 GCGRP_REFRELE(gcgrp); 7642 gcgrp = NULL; 7643 } 7644 ire_refrele(ipif_ire); 7645 ire_refrele(save_ire); 7646 break; 7647 } 7648 7649 /* reference now held by IRE */ 7650 gcgrp = NULL; 7651 7652 ire->ire_marks |= ire_marks; 7653 7654 /* 7655 * Prevent sire and ipif_ire from getting deleted. 7656 * The newly created ire is tied to both of them via 7657 * the phandle and ihandle respectively. 7658 */ 7659 if (sire != NULL) { 7660 IRB_REFHOLD(sire->ire_bucket); 7661 /* Has it been removed already ? */ 7662 if (sire->ire_marks & IRE_MARK_CONDEMNED) { 7663 IRB_REFRELE(sire->ire_bucket); 7664 ire_refrele(ipif_ire); 7665 ire_refrele(save_ire); 7666 break; 7667 } 7668 } 7669 7670 IRB_REFHOLD(ipif_ire->ire_bucket); 7671 /* Has it been removed already ? */ 7672 if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { 7673 IRB_REFRELE(ipif_ire->ire_bucket); 7674 if (sire != NULL) 7675 IRB_REFRELE(sire->ire_bucket); 7676 ire_refrele(ipif_ire); 7677 ire_refrele(save_ire); 7678 break; 7679 } 7680 7681 xmit_mp = first_mp; 7682 /* 7683 * In the case of multirouting, a copy 7684 * of the packet is done before its sending. 7685 * The copy is used to attempt another 7686 * route resolution, in a next loop. 7687 */ 7688 if (ire->ire_flags & RTF_MULTIRT) { 7689 copy_mp = copymsg(first_mp); 7690 if (copy_mp != NULL) { 7691 xmit_mp = copy_mp; 7692 MULTIRT_DEBUG_TAG(first_mp); 7693 } 7694 } 7695 ire_add_then_send(q, ire, xmit_mp); 7696 ire_refrele(save_ire); 7697 7698 /* Assert that sire is not deleted yet. */ 7699 if (sire != NULL) { 7700 ASSERT(sire->ire_ptpn != NULL); 7701 IRB_REFRELE(sire->ire_bucket); 7702 } 7703 7704 /* Assert that ipif_ire is not deleted yet. */ 7705 ASSERT(ipif_ire->ire_ptpn != NULL); 7706 IRB_REFRELE(ipif_ire->ire_bucket); 7707 ire_refrele(ipif_ire); 7708 7709 /* 7710 * If copy_mp is not NULL, multirouting was 7711 * requested. 
We loop to initiate a next 7712 * route resolution attempt, starting from sire. 7713 */ 7714 if (copy_mp != NULL) { 7715 /* 7716 * Search for the next unresolved 7717 * multirt route. 7718 */ 7719 copy_mp = NULL; 7720 ipif_ire = NULL; 7721 ire = NULL; 7722 multirt_resolve_next = B_TRUE; 7723 continue; 7724 } 7725 if (sire != NULL) 7726 ire_refrele(sire); 7727 ipif_refrele(src_ipif); 7728 ill_refrele(dst_ill); 7729 return; 7730 } 7731 case IRE_IF_NORESOLVER: { 7732 /* 7733 * We have what we need to build an IRE_CACHE. 7734 * 7735 * Create a new dlureq_mp with the IP gateway address 7736 * in destination address in the DLPI hdr if the 7737 * physical length is exactly 4 bytes. 7738 */ 7739 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) { 7740 uchar_t *addr; 7741 7742 if (gw) 7743 addr = (uchar_t *)&gw; 7744 else 7745 addr = (uchar_t *)&dst; 7746 7747 dlureq_mp = ill_dlur_gen(addr, 7748 dst_ill->ill_phys_addr_length, 7749 dst_ill->ill_sap, 7750 dst_ill->ill_sap_length); 7751 } else { 7752 dlureq_mp = ire->ire_dlureq_mp; 7753 } 7754 7755 if (dlureq_mp == NULL) { 7756 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 7757 break; 7758 } 7759 7760 /* 7761 * TSol note: We are creating the ire cache for the 7762 * destination 'dst'. If 'dst' is offlink, going 7763 * through the first hop 'gw', the security attributes 7764 * of 'dst' must be set to point to the gateway 7765 * credentials of gateway 'gw'. If 'dst' is onlink, it 7766 * is possible that 'dst' is a potential gateway that is 7767 * referenced by some route that has some security 7768 * attributes. Thus in the former case, we need to do a 7769 * gcgrp_lookup of 'gw' while in the latter case we 7770 * need to do gcgrp_lookup of 'dst' itself. 7771 */ 7772 ga.ga_af = AF_INET; 7773 IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, 7774 &ga.ga_addr); 7775 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7776 7777 ire = ire_create( 7778 (uchar_t *)&dst, /* dest address */ 7779 (uchar_t *)&ip_g_all_ones, /* mask */ 7780 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7781 (uchar_t *)&gw, /* gateway address */ 7782 NULL, 7783 &save_ire->ire_max_frag, 7784 NULL, /* Fast Path header */ 7785 dst_ill->ill_rq, /* recv-from queue */ 7786 dst_ill->ill_wq, /* send-to queue */ 7787 IRE_CACHE, 7788 dlureq_mp, 7789 src_ipif, 7790 in_ill, /* Incoming ill */ 7791 save_ire->ire_mask, /* Parent mask */ 7792 (sire != NULL) ? /* Parent handle */ 7793 sire->ire_phandle : 0, 7794 save_ire->ire_ihandle, /* Interface handle */ 7795 (sire != NULL) ? sire->ire_flags & 7796 (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ 7797 &(save_ire->ire_uinfo), 7798 NULL, 7799 gcgrp); 7800 7801 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) 7802 freeb(dlureq_mp); 7803 7804 if (ire == NULL) { 7805 if (gcgrp != NULL) { 7806 GCGRP_REFRELE(gcgrp); 7807 gcgrp = NULL; 7808 } 7809 ire_refrele(save_ire); 7810 break; 7811 } 7812 7813 /* reference now held by IRE */ 7814 gcgrp = NULL; 7815 7816 ire->ire_marks |= ire_marks; 7817 7818 /* Prevent save_ire from getting deleted */ 7819 IRB_REFHOLD(save_ire->ire_bucket); 7820 /* Has it been removed already ? */ 7821 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 7822 IRB_REFRELE(save_ire->ire_bucket); 7823 ire_refrele(save_ire); 7824 break; 7825 } 7826 7827 /* 7828 * In the case of multirouting, a copy 7829 * of the packet is made before it is sent. 7830 * The copy is used in the next 7831 * loop to attempt another resolution. 7832 */ 7833 xmit_mp = first_mp; 7834 if ((sire != NULL) && 7835 (sire->ire_flags & RTF_MULTIRT)) { 7836 copy_mp = copymsg(first_mp); 7837 if (copy_mp != NULL) { 7838 xmit_mp = copy_mp; 7839 MULTIRT_DEBUG_TAG(first_mp); 7840 } 7841 } 7842 ire_add_then_send(q, ire, xmit_mp); 7843 7844 /* Assert that it is not deleted yet. 
*/ 7845 ASSERT(save_ire->ire_ptpn != NULL); 7846 IRB_REFRELE(save_ire->ire_bucket); 7847 ire_refrele(save_ire); 7848 7849 if (copy_mp != NULL) { 7850 /* 7851 * If we found a (no)resolver, we ignore any 7852 * trailing top priority IRE_CACHE in further 7853 * loops. This ensures that we do not omit any 7854 * (no)resolver. 7855 * This IRE_CACHE, if any, will be processed 7856 * by another thread entering ip_newroute(). 7857 * IRE_CACHE entries, if any, will be processed 7858 * by another thread entering ip_newroute(), 7859 * (upon resolver response, for instance). 7860 * This aims to force parallel multirt 7861 * resolutions as soon as a packet must be sent. 7862 * In the best case, after the tx of only one 7863 * packet, all reachable routes are resolved. 7864 * Otherwise, the resolution of all RTF_MULTIRT 7865 * routes would require several emissions. 7866 */ 7867 multirt_flags &= ~MULTIRT_CACHEGW; 7868 7869 /* 7870 * Search for the next unresolved multirt 7871 * route. 7872 */ 7873 copy_mp = NULL; 7874 save_ire = NULL; 7875 ire = NULL; 7876 multirt_resolve_next = B_TRUE; 7877 continue; 7878 } 7879 7880 /* 7881 * Don't need sire anymore 7882 */ 7883 if (sire != NULL) 7884 ire_refrele(sire); 7885 7886 ipif_refrele(src_ipif); 7887 ill_refrele(dst_ill); 7888 return; 7889 } 7890 case IRE_IF_RESOLVER: 7891 /* 7892 * We can't build an IRE_CACHE yet, but at least we 7893 * found a resolver that can help. 7894 */ 7895 res_mp = dst_ill->ill_resolver_mp; 7896 if (!OK_RESOLVER_MP(res_mp)) 7897 break; 7898 7899 /* 7900 * To be at this point in the code with a non-zero gw 7901 * means that dst is reachable through a gateway that 7902 * we have never resolved. By changing dst to the gw 7903 * addr we resolve the gateway first. 7904 * When ire_add_then_send() tries to put the IP dg 7905 * to dst, it will reenter ip_newroute() at which 7906 * time we will find the IRE_CACHE for the gw and 7907 * create another IRE_CACHE in case IRE_CACHE above. 
7908 */ 7909 if (gw != INADDR_ANY) { 7910 /* 7911 * The source ipif that was determined above was 7912 * relative to the destination address, not the 7913 * gateway's. If src_ipif was not taken out of 7914 * the IRE_IF_RESOLVER entry, we'll need to call 7915 * ipif_select_source() again. 7916 */ 7917 if (src_ipif != ire->ire_ipif) { 7918 ipif_refrele(src_ipif); 7919 src_ipif = ipif_select_source(dst_ill, 7920 gw, zoneid); 7921 if (src_ipif == NULL) { 7922 if (ip_debug > 2) { 7923 pr_addr_dbg( 7924 "ip_newroute: no " 7925 "src for gw %s ", 7926 AF_INET, &gw); 7927 printf("through " 7928 "interface %s\n", 7929 dst_ill->ill_name); 7930 } 7931 goto icmp_err_ret; 7932 } 7933 } 7934 save_dst = dst; 7935 dst = gw; 7936 gw = INADDR_ANY; 7937 } 7938 7939 /* 7940 * TSol note: Please see the corresponding note 7941 * of the IRE_IF_NORESOLVER case 7942 */ 7943 ga.ga_af = AF_INET; 7944 IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr); 7945 gcgrp = gcgrp_lookup(&ga, B_FALSE); 7946 7947 /* 7948 * We obtain a partial IRE_CACHE which we will pass 7949 * along with the resolver query. When the response 7950 * comes back it will be there ready for us to add. 7951 * The ire_max_frag is atomically set under the 7952 * irebucket lock in ire_add_v[46]. 
7953 */ 7954 ire = ire_create_mp( 7955 (uchar_t *)&dst, /* dest address */ 7956 (uchar_t *)&ip_g_all_ones, /* mask */ 7957 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 7958 (uchar_t *)&gw, /* gateway address */ 7959 NULL, /* no in_src_addr */ 7960 NULL, /* ire_max_frag */ 7961 NULL, /* Fast Path header */ 7962 dst_ill->ill_rq, /* recv-from queue */ 7963 dst_ill->ill_wq, /* send-to queue */ 7964 IRE_CACHE, 7965 res_mp, 7966 src_ipif, /* Interface ipif */ 7967 in_ill, /* Incoming ILL */ 7968 save_ire->ire_mask, /* Parent mask */ 7969 0, 7970 save_ire->ire_ihandle, /* Interface handle */ 7971 0, /* flags if any */ 7972 &(save_ire->ire_uinfo), 7973 NULL, 7974 gcgrp); 7975 7976 if (ire == NULL) { 7977 ire_refrele(save_ire); 7978 if (gcgrp != NULL) { 7979 GCGRP_REFRELE(gcgrp); 7980 gcgrp = NULL; 7981 } 7982 break; 7983 } 7984 7985 /* reference now held by IRE */ 7986 gcgrp = NULL; 7987 7988 if ((sire != NULL) && 7989 (sire->ire_flags & RTF_MULTIRT)) { 7990 copy_mp = copymsg(first_mp); 7991 if (copy_mp != NULL) 7992 MULTIRT_DEBUG_TAG(copy_mp); 7993 } 7994 7995 ire->ire_marks |= ire_marks; 7996 7997 /* 7998 * Construct message chain for the resolver 7999 * of the form: 8000 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8001 * Packet could contain a IPSEC_OUT mp. 8002 * 8003 * NOTE : ire will be added later when the response 8004 * comes back from ARP. If the response does not 8005 * come back, ARP frees the packet. For this reason, 8006 * we can't REFHOLD the bucket of save_ire to prevent 8007 * deletions. We may not be able to REFRELE the bucket 8008 * if the response never comes back. Thus, before 8009 * adding the ire, ire_add_v4 will make sure that the 8010 * interface route does not get deleted. This is the 8011 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 8012 * where we can always prevent deletions because of 8013 * the synchronous nature of adding IRES i.e 8014 * ire_add_then_send is called after creating the IRE. 
8015 */ 8016 ASSERT(ire->ire_mp != NULL); 8017 ire->ire_mp->b_cont = first_mp; 8018 /* Have saved_mp handy, for cleanup if canput fails */ 8019 saved_mp = mp; 8020 mp = ire->ire_dlureq_mp; 8021 ASSERT(mp != NULL); 8022 ire->ire_dlureq_mp = NULL; 8023 linkb(mp, ire->ire_mp); 8024 8025 8026 /* 8027 * Fill in the source and dest addrs for the resolver. 8028 * NOTE: this depends on memory layouts imposed by 8029 * ill_init(). 8030 */ 8031 areq = (areq_t *)mp->b_rptr; 8032 addrp = (ipaddr_t *)((char *)areq + 8033 areq->areq_sender_addr_offset); 8034 if (do_attach_ill) { 8035 /* 8036 * This is bind to no failover case. 8037 * arp packet also must go out on attach_ill. 8038 */ 8039 ASSERT(ipha->ipha_src != NULL); 8040 *addrp = ipha->ipha_src; 8041 } else { 8042 *addrp = save_ire->ire_src_addr; 8043 } 8044 8045 ire_refrele(save_ire); 8046 addrp = (ipaddr_t *)((char *)areq + 8047 areq->areq_target_addr_offset); 8048 *addrp = dst; 8049 /* Up to the resolver. */ 8050 if (canputnext(dst_ill->ill_rq)) { 8051 putnext(dst_ill->ill_rq, mp); 8052 ire = NULL; 8053 if (copy_mp != NULL) { 8054 /* 8055 * If we found a resolver, we ignore 8056 * any trailing top priority IRE_CACHE 8057 * in the further loops. This ensures 8058 * that we do not omit any resolver. 8059 * IRE_CACHE entries, if any, will be 8060 * processed next time we enter 8061 * ip_newroute(). 8062 */ 8063 multirt_flags &= ~MULTIRT_CACHEGW; 8064 /* 8065 * Search for the next unresolved 8066 * multirt route. 8067 */ 8068 first_mp = copy_mp; 8069 copy_mp = NULL; 8070 /* Prepare the next resolution loop. 
*/ 8071 mp = first_mp; 8072 EXTRACT_PKT_MP(mp, first_mp, 8073 mctl_present); 8074 if (mctl_present) 8075 io = (ipsec_out_t *) 8076 first_mp->b_rptr; 8077 ipha = (ipha_t *)mp->b_rptr; 8078 8079 ASSERT(sire != NULL); 8080 8081 dst = save_dst; 8082 multirt_resolve_next = B_TRUE; 8083 continue; 8084 } 8085 8086 if (sire != NULL) 8087 ire_refrele(sire); 8088 8089 /* 8090 * The response will come back in ip_wput 8091 * with db_type IRE_DB_TYPE. 8092 */ 8093 ipif_refrele(src_ipif); 8094 ill_refrele(dst_ill); 8095 return; 8096 } else { 8097 /* Prepare for cleanup */ 8098 ire->ire_dlureq_mp = mp; 8099 mp->b_cont = NULL; 8100 ire_delete(ire); 8101 mp = saved_mp; 8102 ire = NULL; 8103 if (copy_mp != NULL) { 8104 MULTIRT_DEBUG_UNTAG(copy_mp); 8105 freemsg(copy_mp); 8106 copy_mp = NULL; 8107 } 8108 break; 8109 } 8110 default: 8111 break; 8112 } 8113 } while (multirt_resolve_next); 8114 8115 ip1dbg(("ip_newroute: dropped\n")); 8116 /* Did this packet originate externally? */ 8117 if (mp->b_prev) { 8118 mp->b_next = NULL; 8119 mp->b_prev = NULL; 8120 BUMP_MIB(&ip_mib, ipInDiscards); 8121 } else { 8122 BUMP_MIB(&ip_mib, ipOutDiscards); 8123 } 8124 ASSERT(copy_mp == NULL); 8125 MULTIRT_DEBUG_UNTAG(first_mp); 8126 freemsg(first_mp); 8127 if (ire != NULL) 8128 ire_refrele(ire); 8129 if (sire != NULL) 8130 ire_refrele(sire); 8131 if (src_ipif != NULL) 8132 ipif_refrele(src_ipif); 8133 if (dst_ill != NULL) 8134 ill_refrele(dst_ill); 8135 return; 8136 8137 icmp_err_ret: 8138 ip1dbg(("ip_newroute: no route\n")); 8139 if (src_ipif != NULL) 8140 ipif_refrele(src_ipif); 8141 if (dst_ill != NULL) 8142 ill_refrele(dst_ill); 8143 if (sire != NULL) 8144 ire_refrele(sire); 8145 /* Did this packet originate externally? */ 8146 if (mp->b_prev) { 8147 mp->b_next = NULL; 8148 mp->b_prev = NULL; 8149 /* XXX ipInNoRoutes */ 8150 q = WR(q); 8151 } else { 8152 /* 8153 * Since ip_wput() isn't close to finished, we fill 8154 * in enough of the header for credible error reporting. 
8155 */ 8156 if (ip_hdr_complete(ipha, zoneid)) { 8157 /* Failed */ 8158 MULTIRT_DEBUG_UNTAG(first_mp); 8159 freemsg(first_mp); 8160 if (ire != NULL) 8161 ire_refrele(ire); 8162 return; 8163 } 8164 } 8165 BUMP_MIB(&ip_mib, ipOutNoRoutes); 8166 8167 /* 8168 * At this point we will have ire only if RTF_BLACKHOLE 8169 * or RTF_REJECT flags are set on the IRE. It will not 8170 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 8171 */ 8172 if (ire != NULL) { 8173 if (ire->ire_flags & RTF_BLACKHOLE) { 8174 ire_refrele(ire); 8175 MULTIRT_DEBUG_UNTAG(first_mp); 8176 freemsg(first_mp); 8177 return; 8178 } 8179 ire_refrele(ire); 8180 } 8181 if (ip_source_routed(ipha)) { 8182 icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED); 8183 return; 8184 } 8185 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE); 8186 } 8187 8188 /* 8189 * IPv4 - 8190 * ip_newroute_ipif is called by ip_wput_multicast and 8191 * ip_rput_forward_multicast whenever we need to send 8192 * out a packet to a destination address for which we do not have specific 8193 * routing information. It is used when the packet will be sent out 8194 * on a specific interface. It is also called by ip_wput() when IP_XMIT_IF 8195 * socket option is set or icmp error message wants to go out on a particular 8196 * interface for a unicast packet. 8197 * 8198 * In most cases, the destination address is resolved thanks to the ipif 8199 * intrinsic resolver. However, there are some cases where the call to 8200 * ip_newroute_ipif must take into account the potential presence of 8201 * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire 8202 * that uses the interface. This is specified through flags, 8203 * which can be a combination of: 8204 * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC 8205 * flag, the resulting ire will inherit the IRE_OFFSUBNET source address 8206 * and flags. Additionally, the packet source address has to be set to 8207 * the specified address. 
The caller is thus expected to set this flag 8208 * if the packet has no specific source address yet. 8209 * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT 8210 * flag, the resulting ire will inherit the flag. All unresolved routes 8211 * to the destination must be explored in the same call to 8212 * ip_newroute_ipif(). 8213 */ 8214 static void 8215 ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, 8216 conn_t *connp, uint32_t flags) 8217 { 8218 areq_t *areq; 8219 ire_t *ire = NULL; 8220 mblk_t *res_mp; 8221 ipaddr_t *addrp; 8222 mblk_t *first_mp; 8223 ire_t *save_ire = NULL; 8224 ill_t *attach_ill = NULL; /* Bind to IPIF_NOFAILOVER */ 8225 ipif_t *src_ipif = NULL; 8226 ushort_t ire_marks = 0; 8227 ill_t *dst_ill = NULL; 8228 boolean_t mctl_present; 8229 ipsec_out_t *io; 8230 ipha_t *ipha; 8231 int ihandle = 0; 8232 mblk_t *saved_mp; 8233 ire_t *fire = NULL; 8234 mblk_t *copy_mp = NULL; 8235 boolean_t multirt_resolve_next; 8236 ipaddr_t ipha_dst; 8237 zoneid_t zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 8238 8239 /* 8240 * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold 8241 * here for uniformity 8242 */ 8243 ipif_refhold(ipif); 8244 8245 /* 8246 * This loop is run only once in most cases. 8247 * We loop to resolve further routes only when the destination 8248 * can be reached through multiple RTF_MULTIRT-flagged ires. 
8249 */ 8250 do { 8251 if (dst_ill != NULL) { 8252 ill_refrele(dst_ill); 8253 dst_ill = NULL; 8254 } 8255 if (src_ipif != NULL) { 8256 ipif_refrele(src_ipif); 8257 src_ipif = NULL; 8258 } 8259 multirt_resolve_next = B_FALSE; 8260 8261 ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), 8262 ipif->ipif_ill->ill_name)); 8263 8264 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 8265 if (mctl_present) 8266 io = (ipsec_out_t *)first_mp->b_rptr; 8267 8268 ipha = (ipha_t *)mp->b_rptr; 8269 8270 /* 8271 * Save the packet destination address, we may need it after 8272 * the packet has been consumed. 8273 */ 8274 ipha_dst = ipha->ipha_dst; 8275 8276 /* 8277 * If the interface is a pt-pt interface we look for an 8278 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the 8279 * local_address and the pt-pt destination address. Otherwise 8280 * we just match the local address. 8281 * NOTE: dst could be different than ipha->ipha_dst in case 8282 * of sending igmp multicast packets over a point-to-point 8283 * connection. 8284 * Thus we must be careful enough to check ipha_dst to be a 8285 * multicast address, otherwise it will take xmit_if path for 8286 * multicast packets resulting into kernel stack overflow by 8287 * repeated calls to ip_newroute_ipif from ire_send(). 8288 */ 8289 if (CLASSD(ipha_dst) && 8290 !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { 8291 goto err_ret; 8292 } 8293 8294 /* 8295 * We check if an IRE_OFFSUBNET for the addr that goes through 8296 * ipif exists. We need it to determine if the RTF_SETSRC and/or 8297 * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may 8298 * propagate its flags to the new ire. 
8299 */ 8300 if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { 8301 fire = ipif_lookup_multi_ire(ipif, ipha_dst); 8302 ip2dbg(("ip_newroute_ipif: " 8303 "ipif_lookup_multi_ire(" 8304 "ipif %p, dst %08x) = fire %p\n", 8305 (void *)ipif, ntohl(dst), (void *)fire)); 8306 } 8307 8308 if (mctl_present && io->ipsec_out_attach_if) { 8309 attach_ill = ip_grab_attach_ill(NULL, first_mp, 8310 io->ipsec_out_ill_index, B_FALSE); 8311 8312 /* Failure case frees things for us. */ 8313 if (attach_ill == NULL) { 8314 ipif_refrele(ipif); 8315 if (fire != NULL) 8316 ire_refrele(fire); 8317 return; 8318 } 8319 8320 /* 8321 * Check if we need an ire that will not be 8322 * looked up by anybody else i.e. HIDDEN. 8323 */ 8324 if (ill_is_probeonly(attach_ill)) { 8325 ire_marks = IRE_MARK_HIDDEN; 8326 } 8327 /* 8328 * ip_wput passes the right ipif for IPIF_NOFAILOVER 8329 * case. 8330 */ 8331 dst_ill = ipif->ipif_ill; 8332 /* attach_ill has been refheld by ip_grab_attach_ill */ 8333 ASSERT(dst_ill == attach_ill); 8334 } else { 8335 /* 8336 * If this is set by IP_XMIT_IF, then make sure that 8337 * ipif is pointing to the same ill as the IP_XMIT_IF 8338 * specified ill. 8339 */ 8340 ASSERT((connp == NULL) || 8341 (connp->conn_xmit_if_ill == NULL) || 8342 (connp->conn_xmit_if_ill == ipif->ipif_ill)); 8343 /* 8344 * If the interface belongs to an interface group, 8345 * make sure the next possible interface in the group 8346 * is used. This encourages load spreading among 8347 * peers in an interface group. 8348 * Note: load spreading is disabled for RTF_MULTIRT 8349 * routes. 8350 */ 8351 if ((flags & RTF_MULTIRT) && (fire != NULL) && 8352 (fire->ire_flags & RTF_MULTIRT)) { 8353 /* 8354 * Don't perform outbound load spreading 8355 * in the case of an RTF_MULTIRT issued route, 8356 * we actually typically want to replicate 8357 * outgoing packets through particular 8358 * interfaces. 
8359 */ 8360 dst_ill = ipif->ipif_ill; 8361 ill_refhold(dst_ill); 8362 } else { 8363 dst_ill = ip_newroute_get_dst_ill( 8364 ipif->ipif_ill); 8365 } 8366 if (dst_ill == NULL) { 8367 if (ip_debug > 2) { 8368 pr_addr_dbg("ip_newroute_ipif: " 8369 "no dst ill for dst %s\n", 8370 AF_INET, &dst); 8371 } 8372 goto err_ret; 8373 } 8374 } 8375 8376 /* 8377 * Pick a source address preferring non-deprecated ones. 8378 * Unlike ip_newroute, we don't do any source address 8379 * selection here since for multicast it really does not help 8380 * in inbound load spreading as in the unicast case. 8381 */ 8382 if ((flags & RTF_SETSRC) && (fire != NULL) && 8383 (fire->ire_flags & RTF_SETSRC)) { 8384 /* 8385 * As requested by flags, an IRE_OFFSUBNET was looked up 8386 * on that interface. This ire has RTF_SETSRC flag, so 8387 * the source address of the packet must be changed. 8388 * Check that the ipif matching the requested source 8389 * address still exists. 8390 */ 8391 src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, 8392 zoneid, NULL, NULL, NULL, NULL); 8393 } 8394 if (((ipif->ipif_flags & IPIF_DEPRECATED) || 8395 (connp != NULL && ipif->ipif_zoneid != zoneid && 8396 ipif->ipif_zoneid != ALL_ZONES)) && 8397 (src_ipif == NULL)) { 8398 src_ipif = ipif_select_source(dst_ill, dst, zoneid); 8399 if (src_ipif == NULL) { 8400 if (ip_debug > 2) { 8401 /* ip1dbg */ 8402 pr_addr_dbg("ip_newroute_ipif: " 8403 "no src for dst %s", 8404 AF_INET, &dst); 8405 } 8406 ip1dbg((" through interface %s\n", 8407 dst_ill->ill_name)); 8408 goto err_ret; 8409 } 8410 ipif_refrele(ipif); 8411 ipif = src_ipif; 8412 ipif_refhold(ipif); 8413 } 8414 if (src_ipif == NULL) { 8415 src_ipif = ipif; 8416 ipif_refhold(src_ipif); 8417 } 8418 8419 /* 8420 * Assign a source address while we have the conn. 8421 * We can't have ip_wput_ire pick a source address when the 8422 * packet returns from arp since conn_unspec_src might be set 8423 * and we loose the conn when going through arp. 
8424 */ 8425 if (ipha->ipha_src == INADDR_ANY && 8426 (connp == NULL || !connp->conn_unspec_src)) { 8427 ipha->ipha_src = src_ipif->ipif_src_addr; 8428 } 8429 8430 /* 8431 * In case of IP_XMIT_IF, it is possible that the outgoing 8432 * interface does not have an interface ire. 8433 * Example: Thousands of mobileip PPP interfaces to mobile 8434 * nodes. We don't want to create interface ires because 8435 * packets from other mobile nodes must not take the route 8436 * via interface ires to the visiting mobile node without 8437 * going through the home agent, in absence of mobileip 8438 * route optimization. 8439 */ 8440 if (CLASSD(ipha_dst) && (connp == NULL || 8441 connp->conn_xmit_if_ill == NULL)) { 8442 /* ipif_to_ire returns an held ire */ 8443 ire = ipif_to_ire(ipif); 8444 if (ire == NULL) 8445 goto err_ret; 8446 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 8447 goto err_ret; 8448 /* 8449 * ihandle is needed when the ire is added to 8450 * cache table. 8451 */ 8452 save_ire = ire; 8453 ihandle = save_ire->ire_ihandle; 8454 8455 ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " 8456 "flags %04x\n", 8457 (void *)ire, (void *)ipif, flags)); 8458 if ((flags & RTF_MULTIRT) && (fire != NULL) && 8459 (fire->ire_flags & RTF_MULTIRT)) { 8460 /* 8461 * As requested by flags, an IRE_OFFSUBNET was 8462 * looked up on that interface. This ire has 8463 * RTF_MULTIRT flag, so the resolution loop will 8464 * be re-entered to resolve additional routes on 8465 * other interfaces. For that purpose, a copy of 8466 * the packet is performed at this point. 8467 */ 8468 fire->ire_last_used_time = lbolt; 8469 copy_mp = copymsg(first_mp); 8470 if (copy_mp) { 8471 MULTIRT_DEBUG_TAG(copy_mp); 8472 } 8473 } 8474 if ((flags & RTF_SETSRC) && (fire != NULL) && 8475 (fire->ire_flags & RTF_SETSRC)) { 8476 /* 8477 * As requested by flags, an IRE_OFFSUBET was 8478 * looked up on that interface. This ire has 8479 * RTF_SETSRC flag, so the source address of the 8480 * packet must be changed. 
8481 */ 8482 ipha->ipha_src = fire->ire_src_addr; 8483 } 8484 } else { 8485 ASSERT((connp == NULL) || 8486 (connp->conn_xmit_if_ill != NULL) || 8487 (connp->conn_dontroute)); 8488 /* 8489 * The only ways we can come here are: 8490 * 1) IP_XMIT_IF socket option is set 8491 * 2) ICMP error message generated from 8492 * ip_mrtun_forward() routine and it needs 8493 * to go through the specified ill. 8494 * 3) SO_DONTROUTE socket option is set 8495 * In all cases, the new ire will not be added 8496 * into cache table. 8497 */ 8498 ire_marks |= IRE_MARK_NOADD; 8499 } 8500 8501 switch (ipif->ipif_net_type) { 8502 case IRE_IF_NORESOLVER: { 8503 /* We have what we need to build an IRE_CACHE. */ 8504 mblk_t *dlureq_mp; 8505 8506 /* 8507 * Create a new dlureq_mp with the 8508 * IP gateway address as destination address in the 8509 * DLPI hdr if the physical length is exactly 4 bytes. 8510 */ 8511 if (dst_ill->ill_phys_addr_length == IP_ADDR_LEN) { 8512 dlureq_mp = ill_dlur_gen((uchar_t *)&dst, 8513 dst_ill->ill_phys_addr_length, 8514 dst_ill->ill_sap, 8515 dst_ill->ill_sap_length); 8516 } else { 8517 /* use the value set in ip_ll_subnet_defaults */ 8518 dlureq_mp = ill_dlur_gen(NULL, 8519 dst_ill->ill_phys_addr_length, 8520 dst_ill->ill_sap, 8521 dst_ill->ill_sap_length); 8522 } 8523 8524 if (dlureq_mp == NULL) 8525 break; 8526 /* 8527 * The new ire inherits the IRE_OFFSUBNET flags 8528 * and source address, if this was requested. 8529 */ 8530 ire = ire_create( 8531 (uchar_t *)&dst, /* dest address */ 8532 (uchar_t *)&ip_g_all_ones, /* mask */ 8533 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8534 NULL, /* gateway address */ 8535 NULL, 8536 &ipif->ipif_mtu, 8537 NULL, /* Fast Path header */ 8538 dst_ill->ill_rq, /* recv-from queue */ 8539 dst_ill->ill_wq, /* send-to queue */ 8540 IRE_CACHE, 8541 dlureq_mp, 8542 src_ipif, 8543 NULL, 8544 (save_ire != NULL ? save_ire->ire_mask : 0), 8545 (fire != NULL) ? 
/* Parent handle */ 8546 fire->ire_phandle : 0, 8547 ihandle, /* Interface handle */ 8548 (fire != NULL) ? 8549 (fire->ire_flags & 8550 (RTF_SETSRC | RTF_MULTIRT)) : 0, 8551 (save_ire == NULL ? &ire_uinfo_null : 8552 &save_ire->ire_uinfo), 8553 NULL, 8554 NULL); 8555 8556 freeb(dlureq_mp); 8557 8558 if (ire == NULL) { 8559 if (save_ire != NULL) 8560 ire_refrele(save_ire); 8561 break; 8562 } 8563 8564 ire->ire_marks |= ire_marks; 8565 8566 /* 8567 * If IRE_MARK_NOADD is set then we need to convert 8568 * the max_fragp to a useable value now. This is 8569 * normally done in ire_add_v[46]. 8570 */ 8571 if (ire->ire_marks & IRE_MARK_NOADD) { 8572 uint_t max_frag; 8573 8574 max_frag = *ire->ire_max_fragp; 8575 ire->ire_max_fragp = NULL; 8576 ire->ire_max_frag = max_frag; 8577 } 8578 8579 /* Prevent save_ire from getting deleted */ 8580 if (save_ire != NULL) { 8581 IRB_REFHOLD(save_ire->ire_bucket); 8582 /* Has it been removed already ? */ 8583 if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { 8584 IRB_REFRELE(save_ire->ire_bucket); 8585 ire_refrele(save_ire); 8586 break; 8587 } 8588 } 8589 8590 ire_add_then_send(q, ire, first_mp); 8591 8592 /* Assert that save_ire is not deleted yet. */ 8593 if (save_ire != NULL) { 8594 ASSERT(save_ire->ire_ptpn != NULL); 8595 IRB_REFRELE(save_ire->ire_bucket); 8596 ire_refrele(save_ire); 8597 save_ire = NULL; 8598 } 8599 if (fire != NULL) { 8600 ire_refrele(fire); 8601 fire = NULL; 8602 } 8603 8604 /* 8605 * the resolution loop is re-entered if this 8606 * was requested through flags and if we 8607 * actually are in a multirouting case. 
8608 */ 8609 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 8610 boolean_t need_resolve = 8611 ire_multirt_need_resolve(ipha_dst, 8612 MBLK_GETLABEL(copy_mp)); 8613 if (!need_resolve) { 8614 MULTIRT_DEBUG_UNTAG(copy_mp); 8615 freemsg(copy_mp); 8616 copy_mp = NULL; 8617 } else { 8618 /* 8619 * ipif_lookup_group() calls 8620 * ire_lookup_multi() that uses 8621 * ire_ftable_lookup() to find 8622 * an IRE_INTERFACE for the group. 8623 * In the multirt case, 8624 * ire_lookup_multi() then invokes 8625 * ire_multirt_lookup() to find 8626 * the next resolvable ire. 8627 * As a result, we obtain an new 8628 * interface, derived from the 8629 * next ire. 8630 */ 8631 ipif_refrele(ipif); 8632 ipif = ipif_lookup_group(ipha_dst, 8633 zoneid); 8634 ip2dbg(("ip_newroute_ipif: " 8635 "multirt dst %08x, ipif %p\n", 8636 htonl(dst), (void *)ipif)); 8637 if (ipif != NULL) { 8638 mp = copy_mp; 8639 copy_mp = NULL; 8640 multirt_resolve_next = B_TRUE; 8641 continue; 8642 } else { 8643 freemsg(copy_mp); 8644 } 8645 } 8646 } 8647 if (ipif != NULL) 8648 ipif_refrele(ipif); 8649 ill_refrele(dst_ill); 8650 ipif_refrele(src_ipif); 8651 return; 8652 } 8653 case IRE_IF_RESOLVER: 8654 /* 8655 * We can't build an IRE_CACHE yet, but at least 8656 * we found a resolver that can help. 8657 */ 8658 res_mp = dst_ill->ill_resolver_mp; 8659 if (!OK_RESOLVER_MP(res_mp)) 8660 break; 8661 8662 /* 8663 * We obtain a partial IRE_CACHE which we will pass 8664 * along with the resolver query. When the response 8665 * comes back it will be there ready for us to add. 8666 * The new ire inherits the IRE_OFFSUBNET flags 8667 * and source address, if this was requested. 8668 * The ire_max_frag is atomically set under the 8669 * irebucket lock in ire_add_v[46]. Only in the 8670 * case of IRE_MARK_NOADD, we set it here itself. 
8671 */ 8672 ire = ire_create_mp( 8673 (uchar_t *)&dst, /* dest address */ 8674 (uchar_t *)&ip_g_all_ones, /* mask */ 8675 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 8676 NULL, /* gateway address */ 8677 NULL, /* no in_src_addr */ 8678 (ire_marks & IRE_MARK_NOADD) ? 8679 ipif->ipif_mtu : 0, /* max_frag */ 8680 NULL, /* Fast path header */ 8681 dst_ill->ill_rq, /* recv-from queue */ 8682 dst_ill->ill_wq, /* send-to queue */ 8683 IRE_CACHE, 8684 res_mp, 8685 src_ipif, 8686 NULL, 8687 (save_ire != NULL ? save_ire->ire_mask : 0), 8688 (fire != NULL) ? /* Parent handle */ 8689 fire->ire_phandle : 0, 8690 ihandle, /* Interface handle */ 8691 (fire != NULL) ? /* flags if any */ 8692 (fire->ire_flags & 8693 (RTF_SETSRC | RTF_MULTIRT)) : 0, 8694 (save_ire == NULL ? &ire_uinfo_null : 8695 &save_ire->ire_uinfo), 8696 NULL, 8697 NULL); 8698 8699 if (save_ire != NULL) { 8700 ire_refrele(save_ire); 8701 save_ire = NULL; 8702 } 8703 if (ire == NULL) 8704 break; 8705 8706 ire->ire_marks |= ire_marks; 8707 /* 8708 * Construct message chain for the resolver of the 8709 * form: 8710 * ARP_REQ_MBLK-->IRE_MBLK-->Packet 8711 * 8712 * NOTE : ire will be added later when the response 8713 * comes back from ARP. If the response does not 8714 * come back, ARP frees the packet. For this reason, 8715 * we can't REFHOLD the bucket of save_ire to prevent 8716 * deletions. We may not be able to REFRELE the 8717 * bucket if the response never comes back. 8718 * Thus, before adding the ire, ire_add_v4 will make 8719 * sure that the interface route does not get deleted. 8720 * This is the only case unlike ip_newroute_v6, 8721 * ip_newroute_ipif_v6 where we can always prevent 8722 * deletions because ire_add_then_send is called after 8723 * creating the IRE. 8724 * If IRE_MARK_NOADD is set, then ire_add_then_send 8725 * does not add this IRE into the IRE CACHE. 
8726 */ 8727 ASSERT(ire->ire_mp != NULL); 8728 ire->ire_mp->b_cont = first_mp; 8729 /* Have saved_mp handy, for cleanup if canput fails */ 8730 saved_mp = mp; 8731 mp = ire->ire_dlureq_mp; 8732 ASSERT(mp != NULL); 8733 ire->ire_dlureq_mp = NULL; 8734 linkb(mp, ire->ire_mp); 8735 8736 /* 8737 * Fill in the source and dest addrs for the resolver. 8738 * NOTE: this depends on memory layouts imposed by 8739 * ill_init(). 8740 */ 8741 areq = (areq_t *)mp->b_rptr; 8742 addrp = (ipaddr_t *)((char *)areq + 8743 areq->areq_sender_addr_offset); 8744 *addrp = ire->ire_src_addr; 8745 addrp = (ipaddr_t *)((char *)areq + 8746 areq->areq_target_addr_offset); 8747 *addrp = dst; 8748 /* Up to the resolver. */ 8749 if (canputnext(dst_ill->ill_rq)) { 8750 putnext(dst_ill->ill_rq, mp); 8751 /* 8752 * The response will come back in ip_wput 8753 * with db_type IRE_DB_TYPE. 8754 */ 8755 } else { 8756 ire->ire_dlureq_mp = mp; 8757 mp->b_cont = NULL; 8758 ire_delete(ire); 8759 saved_mp->b_next = NULL; 8760 saved_mp->b_prev = NULL; 8761 freemsg(first_mp); 8762 ip2dbg(("ip_newroute_ipif: dropped\n")); 8763 } 8764 8765 if (fire != NULL) { 8766 ire_refrele(fire); 8767 fire = NULL; 8768 } 8769 8770 8771 /* 8772 * The resolution loop is re-entered if this was 8773 * requested through flags and we actually are 8774 * in a multirouting case. 8775 */ 8776 if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { 8777 boolean_t need_resolve = 8778 ire_multirt_need_resolve(ipha_dst, 8779 MBLK_GETLABEL(copy_mp)); 8780 if (!need_resolve) { 8781 MULTIRT_DEBUG_UNTAG(copy_mp); 8782 freemsg(copy_mp); 8783 copy_mp = NULL; 8784 } else { 8785 /* 8786 * ipif_lookup_group() calls 8787 * ire_lookup_multi() that uses 8788 * ire_ftable_lookup() to find 8789 * an IRE_INTERFACE for the group. 8790 * In the multirt case, 8791 * ire_lookup_multi() then invokes 8792 * ire_multirt_lookup() to find 8793 * the next resolvable ire. 8794 * As a result, we obtain an new 8795 * interface, derived from the 8796 * next ire. 
8797 */ 8798 ipif_refrele(ipif); 8799 ipif = ipif_lookup_group(ipha_dst, 8800 zoneid); 8801 if (ipif != NULL) { 8802 mp = copy_mp; 8803 copy_mp = NULL; 8804 multirt_resolve_next = B_TRUE; 8805 continue; 8806 } else { 8807 freemsg(copy_mp); 8808 } 8809 } 8810 } 8811 if (ipif != NULL) 8812 ipif_refrele(ipif); 8813 ill_refrele(dst_ill); 8814 ipif_refrele(src_ipif); 8815 return; 8816 default: 8817 break; 8818 } 8819 } while (multirt_resolve_next); 8820 8821 err_ret: 8822 ip2dbg(("ip_newroute_ipif: dropped\n")); 8823 if (fire != NULL) 8824 ire_refrele(fire); 8825 ipif_refrele(ipif); 8826 /* Did this packet originate externally? */ 8827 if (dst_ill != NULL) 8828 ill_refrele(dst_ill); 8829 if (src_ipif != NULL) 8830 ipif_refrele(src_ipif); 8831 if (mp->b_prev || mp->b_next) { 8832 mp->b_next = NULL; 8833 mp->b_prev = NULL; 8834 } else { 8835 /* 8836 * Since ip_wput() isn't close to finished, we fill 8837 * in enough of the header for credible error reporting. 8838 */ 8839 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) { 8840 /* Failed */ 8841 freemsg(first_mp); 8842 if (ire != NULL) 8843 ire_refrele(ire); 8844 return; 8845 } 8846 } 8847 /* 8848 * At this point we will have ire only if RTF_BLACKHOLE 8849 * or RTF_REJECT flags are set on the IRE. It will not 8850 * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. 8851 */ 8852 if (ire != NULL) { 8853 if (ire->ire_flags & RTF_BLACKHOLE) { 8854 ire_refrele(ire); 8855 freemsg(first_mp); 8856 return; 8857 } 8858 ire_refrele(ire); 8859 } 8860 icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE); 8861 } 8862 8863 /* Name/Value Table Lookup Routine */ 8864 char * 8865 ip_nv_lookup(nv_t *nv, int value) 8866 { 8867 if (!nv) 8868 return (NULL); 8869 for (; nv->nv_name; nv++) { 8870 if (nv->nv_value == value) 8871 return (nv->nv_name); 8872 } 8873 return ("unknown"); 8874 } 8875 8876 /* 8877 * one day it can be patched to 1 from /etc/system for machines that have few 8878 * fast network interfaces feeding multiple cpus. 
8879 */ 8880 int ill_stream_putlocks = 0; 8881 8882 /* 8883 * This is a module open, i.e. this is a control stream for access 8884 * to a DLPI device. We allocate an ill_t as the instance data in 8885 * this case. 8886 */ 8887 int 8888 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 8889 { 8890 uint32_t mem_cnt; 8891 uint32_t cpu_cnt; 8892 uint32_t min_cnt; 8893 pgcnt_t mem_avail; 8894 extern uint32_t ip_cache_table_size, ip6_cache_table_size; 8895 ill_t *ill; 8896 int err; 8897 8898 /* 8899 * Prevent unprivileged processes from pushing IP so that 8900 * they can't send raw IP. 8901 */ 8902 if (secpolicy_net_rawaccess(credp) != 0) 8903 return (EPERM); 8904 8905 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t)); 8906 q->q_ptr = WR(q)->q_ptr = ill; 8907 8908 /* 8909 * ill_init initializes the ill fields and then sends down 8910 * down a DL_INFO_REQ after calling qprocson. 8911 */ 8912 err = ill_init(q, ill); 8913 if (err != 0) { 8914 mi_free(ill); 8915 q->q_ptr = NULL; 8916 WR(q)->q_ptr = NULL; 8917 return (err); 8918 } 8919 8920 /* ill_init initializes the ipsq marking this thread as writer */ 8921 ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE); 8922 /* Wait for the DL_INFO_ACK */ 8923 mutex_enter(&ill->ill_lock); 8924 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { 8925 /* 8926 * Return value of 0 indicates a pending signal. 8927 */ 8928 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); 8929 if (err == 0) { 8930 mutex_exit(&ill->ill_lock); 8931 (void) ip_close(q, 0); 8932 return (EINTR); 8933 } 8934 } 8935 mutex_exit(&ill->ill_lock); 8936 8937 /* 8938 * ip_rput_other could have set an error in ill_error on 8939 * receipt of M_ERROR. 8940 */ 8941 8942 err = ill->ill_error; 8943 if (err != 0) { 8944 (void) ip_close(q, 0); 8945 return (err); 8946 } 8947 8948 /* 8949 * ip_ire_max_bucket_cnt is sized below based on the memory 8950 * size and the cpu speed of the machine. 
This is upper 8951 * bounded by the compile time value of ip_ire_max_bucket_cnt 8952 * and is lower bounded by the compile time value of 8953 * ip_ire_min_bucket_cnt. Similar logic applies to 8954 * ip6_ire_max_bucket_cnt. 8955 */ 8956 mem_avail = kmem_avail(); 8957 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 8958 ip_cache_table_size / sizeof (ire_t); 8959 cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; 8960 8961 min_cnt = MIN(cpu_cnt, mem_cnt); 8962 if (min_cnt < ip_ire_min_bucket_cnt) 8963 min_cnt = ip_ire_min_bucket_cnt; 8964 if (ip_ire_max_bucket_cnt > min_cnt) { 8965 ip_ire_max_bucket_cnt = min_cnt; 8966 } 8967 8968 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 8969 ip6_cache_table_size / sizeof (ire_t); 8970 min_cnt = MIN(cpu_cnt, mem_cnt); 8971 if (min_cnt < ip6_ire_min_bucket_cnt) 8972 min_cnt = ip6_ire_min_bucket_cnt; 8973 if (ip6_ire_max_bucket_cnt > min_cnt) { 8974 ip6_ire_max_bucket_cnt = min_cnt; 8975 } 8976 8977 ill->ill_credp = credp; 8978 crhold(credp); 8979 8980 mutex_enter(&ip_mi_lock); 8981 err = mi_open_link(&ip_g_head, (IDP)ill, devp, flag, sflag, credp); 8982 mutex_exit(&ip_mi_lock); 8983 if (err) { 8984 (void) ip_close(q, 0); 8985 return (err); 8986 } 8987 return (0); 8988 } 8989 8990 /* IP open routine. */ 8991 int 8992 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 8993 { 8994 conn_t *connp; 8995 major_t maj; 8996 8997 TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); 8998 8999 /* Allow reopen. */ 9000 if (q->q_ptr != NULL) 9001 return (0); 9002 9003 if (sflag & MODOPEN) { 9004 /* This is a module open */ 9005 return (ip_modopen(q, devp, flag, sflag, credp)); 9006 } 9007 9008 /* 9009 * We are opening as a device. This is an IP client stream, and we 9010 * allocate an conn_t as the instance data. 
9011 */ 9012 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP); 9013 connp->conn_upq = q; 9014 q->q_ptr = WR(q)->q_ptr = connp; 9015 9016 if (flag & SO_SOCKSTR) 9017 connp->conn_flags |= IPCL_SOCKET; 9018 9019 /* Minor tells us which /dev entry was opened */ 9020 if (geteminor(*devp) == IPV6_MINOR) { 9021 connp->conn_flags |= IPCL_ISV6; 9022 connp->conn_af_isv6 = B_TRUE; 9023 ip_setqinfo(q, geteminor(*devp), B_FALSE); 9024 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9025 } else { 9026 connp->conn_af_isv6 = B_FALSE; 9027 connp->conn_pkt_isv6 = B_FALSE; 9028 } 9029 9030 if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { 9031 q->q_ptr = WR(q)->q_ptr = NULL; 9032 CONN_DEC_REF(connp); 9033 return (EBUSY); 9034 } 9035 9036 maj = getemajor(*devp); 9037 *devp = makedevice(maj, (minor_t)connp->conn_dev); 9038 9039 /* 9040 * connp->conn_cred is crfree()ed in ipcl_conn_destroy() 9041 */ 9042 connp->conn_cred = credp; 9043 crhold(connp->conn_cred); 9044 9045 /* 9046 * If the caller has the process-wide flag set, then default to MAC 9047 * exempt mode. This allows read-down to unlabeled hosts. 9048 */ 9049 if (getpflags(NET_MAC_AWARE, credp) != 0) 9050 connp->conn_mac_exempt = B_TRUE; 9051 9052 connp->conn_zoneid = getzoneid(); 9053 9054 /* 9055 * This should only happen for ndd, netstat, raw socket or other SCTP 9056 * administrative ops. In these cases, we just need a normal conn_t 9057 * with ulp set to IPPROTO_SCTP. All other ops are trapped and 9058 * an error will be returned. 
9059 */ 9060 if (maj != SCTP_MAJ && maj != SCTP6_MAJ) { 9061 connp->conn_rq = q; 9062 connp->conn_wq = WR(q); 9063 } else { 9064 connp->conn_ulp = IPPROTO_SCTP; 9065 connp->conn_rq = connp->conn_wq = NULL; 9066 } 9067 /* Non-zero default values */ 9068 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9069 9070 /* 9071 * Make the conn globally visible to walkers 9072 */ 9073 mutex_enter(&connp->conn_lock); 9074 connp->conn_state_flags &= ~CONN_INCIPIENT; 9075 mutex_exit(&connp->conn_lock); 9076 ASSERT(connp->conn_ref == 1); 9077 9078 qprocson(q); 9079 9080 return (0); 9081 } 9082 9083 /* 9084 * Change q_qinfo based on the value of isv6. 9085 * This can not called on an ill queue. 9086 * Note that there is no race since either q_qinfo works for conn queues - it 9087 * is just an optimization to enter the best wput routine directly. 9088 */ 9089 void 9090 ip_setqinfo(queue_t *q, minor_t minor, boolean_t bump_mib) 9091 { 9092 ASSERT(q->q_flag & QREADR); 9093 ASSERT(WR(q)->q_next == NULL); 9094 ASSERT(q->q_ptr != NULL); 9095 9096 if (minor == IPV6_MINOR) { 9097 if (bump_mib) 9098 BUMP_MIB(&ip6_mib, ipv6OutSwitchIPv4); 9099 q->q_qinfo = &rinit_ipv6; 9100 WR(q)->q_qinfo = &winit_ipv6; 9101 (Q_TO_CONN(q))->conn_pkt_isv6 = B_TRUE; 9102 } else { 9103 if (bump_mib) 9104 BUMP_MIB(&ip_mib, ipOutSwitchIPv6); 9105 q->q_qinfo = &rinit; 9106 WR(q)->q_qinfo = &winit; 9107 (Q_TO_CONN(q))->conn_pkt_isv6 = B_FALSE; 9108 } 9109 9110 } 9111 9112 /* 9113 * See if IPsec needs loading because of the options in mp. 9114 */ 9115 static boolean_t 9116 ipsec_opt_present(mblk_t *mp) 9117 { 9118 uint8_t *optcp, *next_optcp, *opt_endcp; 9119 struct opthdr *opt; 9120 struct T_opthdr *topt; 9121 int opthdr_len; 9122 t_uscalar_t optname, optlevel; 9123 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; 9124 ipsec_req_t *ipsr; 9125 9126 /* 9127 * Walk through the mess, and find IP_SEC_OPT. If it's there, 9128 * return TRUE. 
9129 */ 9130 9131 optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); 9132 opt_endcp = optcp + tor->OPT_length; 9133 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9134 opthdr_len = sizeof (struct T_opthdr); 9135 } else { /* O_OPTMGMT_REQ */ 9136 ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); 9137 opthdr_len = sizeof (struct opthdr); 9138 } 9139 for (; optcp < opt_endcp; optcp = next_optcp) { 9140 if (optcp + opthdr_len > opt_endcp) 9141 return (B_FALSE); /* Not enough option header. */ 9142 if (tor->PRIM_type == T_OPTMGMT_REQ) { 9143 topt = (struct T_opthdr *)optcp; 9144 optlevel = topt->level; 9145 optname = topt->name; 9146 next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); 9147 } else { 9148 opt = (struct opthdr *)optcp; 9149 optlevel = opt->level; 9150 optname = opt->name; 9151 next_optcp = optcp + opthdr_len + 9152 _TPI_ALIGN_OPT(opt->len); 9153 } 9154 if ((next_optcp < optcp) || /* wraparound pointer space */ 9155 ((next_optcp >= opt_endcp) && /* last option bad len */ 9156 ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) 9157 return (B_FALSE); /* bad option buffer */ 9158 if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || 9159 (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { 9160 /* 9161 * Check to see if it's an all-bypass or all-zeroes 9162 * IPsec request. Don't bother loading IPsec if 9163 * the socket doesn't want to use it. (A good example 9164 * is a bypass request.) 9165 * 9166 * Basically, if any of the non-NEVER bits are set, 9167 * load IPsec. 9168 */ 9169 ipsr = (ipsec_req_t *)(optcp + opthdr_len); 9170 if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || 9171 (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || 9172 (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) 9173 != 0) 9174 return (B_TRUE); 9175 } 9176 } 9177 return (B_FALSE); 9178 } 9179 9180 /* 9181 * If conn is is waiting for ipsec to finish loading, kick it. 
9182 */ 9183 /* ARGSUSED */ 9184 static void 9185 conn_restart_ipsec_waiter(conn_t *connp, void *arg) 9186 { 9187 t_scalar_t optreq_prim; 9188 mblk_t *mp; 9189 cred_t *cr; 9190 int err = 0; 9191 9192 /* 9193 * This function is called, after ipsec loading is complete. 9194 * Since IP checks exclusively and atomically (i.e it prevents 9195 * ipsec load from completing until ip_optcom_req completes) 9196 * whether ipsec load is complete, there cannot be a race with IP 9197 * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 9198 */ 9199 mutex_enter(&connp->conn_lock); 9200 if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { 9201 ASSERT(connp->conn_ipsec_opt_mp != NULL); 9202 mp = connp->conn_ipsec_opt_mp; 9203 connp->conn_ipsec_opt_mp = NULL; 9204 connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; 9205 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(CONNP_TO_WQ(connp))); 9206 mutex_exit(&connp->conn_lock); 9207 9208 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 9209 9210 optreq_prim = ((union T_primitives *)mp->b_rptr)->type; 9211 if (optreq_prim == T_OPTMGMT_REQ) { 9212 err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, 9213 &ip_opt_obj); 9214 } else { 9215 ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); 9216 err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, 9217 &ip_opt_obj); 9218 } 9219 if (err != EINPROGRESS) 9220 CONN_OPER_PENDING_DONE(connp); 9221 return; 9222 } 9223 mutex_exit(&connp->conn_lock); 9224 } 9225 9226 /* 9227 * Called from the ipsec_loader thread, outside any perimeter, to tell 9228 * ip qenable any of the queues waiting for the ipsec loader to 9229 * complete. 9230 * 9231 * Use ip_mi_lock to be safe here: all modifications of the mi lists 9232 * are done with this lock held, so it's guaranteed that none of the 9233 * links will change along the way. 9234 */ 9235 void 9236 ip_ipsec_load_complete() 9237 { 9238 ipcl_walk(conn_restart_ipsec_waiter, NULL); 9239 } 9240 9241 /* 9242 * Can't be used. Need to call svr4* -> optset directly. 
the leaf routine 9243 * determines the grp on which it has to become exclusive, queues the mp 9244 * and sq draining restarts the optmgmt 9245 */ 9246 static boolean_t 9247 ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) 9248 { 9249 conn_t *connp; 9250 9251 /* 9252 * Take IPsec requests and treat them special. 9253 */ 9254 if (ipsec_opt_present(mp)) { 9255 /* First check if IPsec is loaded. */ 9256 mutex_enter(&ipsec_loader_lock); 9257 if (ipsec_loader_state != IPSEC_LOADER_WAIT) { 9258 mutex_exit(&ipsec_loader_lock); 9259 return (B_FALSE); 9260 } 9261 connp = Q_TO_CONN(q); 9262 mutex_enter(&connp->conn_lock); 9263 connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; 9264 9265 ASSERT(connp->conn_ipsec_opt_mp == NULL); 9266 connp->conn_ipsec_opt_mp = mp; 9267 mutex_exit(&connp->conn_lock); 9268 mutex_exit(&ipsec_loader_lock); 9269 9270 ipsec_loader_loadnow(); 9271 return (B_TRUE); 9272 } 9273 return (B_FALSE); 9274 } 9275 9276 /* 9277 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, 9278 * all of them are copied to the conn_t. If the req is "zero", the policy is 9279 * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req 9280 * fields. 9281 * We keep only the latest setting of the policy and thus policy setting 9282 * is not incremental/cumulative. 9283 * 9284 * Requests to set policies with multiple alternative actions will 9285 * go through a different API. 
9286 */ 9287 int 9288 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) 9289 { 9290 uint_t ah_req = 0; 9291 uint_t esp_req = 0; 9292 uint_t se_req = 0; 9293 ipsec_selkey_t sel; 9294 ipsec_act_t *actp = NULL; 9295 uint_t nact; 9296 ipsec_policy_t *pin4 = NULL, *pout4 = NULL; 9297 ipsec_policy_t *pin6 = NULL, *pout6 = NULL; 9298 ipsec_policy_root_t *pr; 9299 ipsec_policy_head_t *ph; 9300 int fam; 9301 boolean_t is_pol_reset; 9302 int error = 0; 9303 9304 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER) 9305 9306 /* 9307 * The IP_SEC_OPT option does not allow variable length parameters, 9308 * hence a request cannot be NULL. 9309 */ 9310 if (req == NULL) 9311 return (EINVAL); 9312 9313 ah_req = req->ipsr_ah_req; 9314 esp_req = req->ipsr_esp_req; 9315 se_req = req->ipsr_self_encap_req; 9316 9317 /* 9318 * Are we dealing with a request to reset the policy (i.e. 9319 * zero requests). 9320 */ 9321 is_pol_reset = ((ah_req & REQ_MASK) == 0 && 9322 (esp_req & REQ_MASK) == 0 && 9323 (se_req & REQ_MASK) == 0); 9324 9325 if (!is_pol_reset) { 9326 /* 9327 * If we couldn't load IPsec, fail with "protocol 9328 * not supported". 9329 * IPsec may not have been loaded for a request with zero 9330 * policies, so we don't fail in this case. 9331 */ 9332 mutex_enter(&ipsec_loader_lock); 9333 if (ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) { 9334 mutex_exit(&ipsec_loader_lock); 9335 return (EPROTONOSUPPORT); 9336 } 9337 mutex_exit(&ipsec_loader_lock); 9338 9339 /* 9340 * Test for valid requests. Invalid algorithms 9341 * need to be tested by IPSEC code because new 9342 * algorithms can be added dynamically. 9343 */ 9344 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 9345 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 || 9346 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) { 9347 return (EINVAL); 9348 } 9349 9350 /* 9351 * Only privileged users can issue these 9352 * requests. 
9353 */ 9354 if (((ah_req & IPSEC_PREF_NEVER) || 9355 (esp_req & IPSEC_PREF_NEVER) || 9356 (se_req & IPSEC_PREF_NEVER)) && 9357 secpolicy_net_config(cr, B_FALSE) != 0) { 9358 return (EPERM); 9359 } 9360 9361 /* 9362 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER 9363 * are mutually exclusive. 9364 */ 9365 if (((ah_req & REQ_MASK) == REQ_MASK) || 9366 ((esp_req & REQ_MASK) == REQ_MASK) || 9367 ((se_req & REQ_MASK) == REQ_MASK)) { 9368 /* Both of them are set */ 9369 return (EINVAL); 9370 } 9371 } 9372 9373 mutex_enter(&connp->conn_lock); 9374 9375 /* 9376 * If we have already cached policies in ip_bind_connected*(), don't 9377 * let them change now. We cache policies for connections 9378 * whose src,dst [addr, port] is known. The exception to this is 9379 * tunnels. Tunnels are allowed to change policies after having 9380 * become fully bound. 9381 */ 9382 if (connp->conn_policy_cached && !IPCL_IS_IPTUN(connp)) { 9383 mutex_exit(&connp->conn_lock); 9384 return (EINVAL); 9385 } 9386 9387 /* 9388 * We have a zero policies, reset the connection policy if already 9389 * set. This will cause the connection to inherit the 9390 * global policy, if any. 9391 */ 9392 if (is_pol_reset) { 9393 if (connp->conn_policy != NULL) { 9394 IPPH_REFRELE(connp->conn_policy); 9395 connp->conn_policy = NULL; 9396 } 9397 connp->conn_flags &= ~IPCL_CHECK_POLICY; 9398 connp->conn_in_enforce_policy = B_FALSE; 9399 connp->conn_out_enforce_policy = B_FALSE; 9400 mutex_exit(&connp->conn_lock); 9401 return (0); 9402 } 9403 9404 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy); 9405 if (ph == NULL) 9406 goto enomem; 9407 9408 ipsec_actvec_from_req(req, &actp, &nact); 9409 if (actp == NULL) 9410 goto enomem; 9411 9412 /* 9413 * Always allocate IPv4 policy entries, since they can also 9414 * apply to ipv6 sockets being used in ipv4-compat mode. 
9415 */ 9416 bzero(&sel, sizeof (sel)); 9417 sel.ipsl_valid = IPSL_IPV4; 9418 9419 pin4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET); 9420 if (pin4 == NULL) 9421 goto enomem; 9422 9423 pout4 = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET); 9424 if (pout4 == NULL) 9425 goto enomem; 9426 9427 if (connp->conn_pkt_isv6) { 9428 /* 9429 * We're looking at a v6 socket, also allocate the 9430 * v6-specific entries... 9431 */ 9432 sel.ipsl_valid = IPSL_IPV6; 9433 pin6 = ipsec_policy_create(&sel, actp, nact, 9434 IPSEC_PRIO_SOCKET); 9435 if (pin6 == NULL) 9436 goto enomem; 9437 9438 pout6 = ipsec_policy_create(&sel, actp, nact, 9439 IPSEC_PRIO_SOCKET); 9440 if (pout6 == NULL) 9441 goto enomem; 9442 9443 /* 9444 * .. and file them away in the right place. 9445 */ 9446 fam = IPSEC_AF_V6; 9447 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 9448 HASHLIST_INSERT(pin6, ipsp_hash, pr->ipr_nonhash[fam]); 9449 ipsec_insert_always(&ph->iph_rulebyid, pin6); 9450 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 9451 HASHLIST_INSERT(pout6, ipsp_hash, pr->ipr_nonhash[fam]); 9452 ipsec_insert_always(&ph->iph_rulebyid, pout6); 9453 } 9454 9455 ipsec_actvec_free(actp, nact); 9456 9457 /* 9458 * File the v4 policies. 9459 */ 9460 fam = IPSEC_AF_V4; 9461 pr = &ph->iph_root[IPSEC_TYPE_INBOUND]; 9462 HASHLIST_INSERT(pin4, ipsp_hash, pr->ipr_nonhash[fam]); 9463 ipsec_insert_always(&ph->iph_rulebyid, pin4); 9464 9465 pr = &ph->iph_root[IPSEC_TYPE_OUTBOUND]; 9466 HASHLIST_INSERT(pout4, ipsp_hash, pr->ipr_nonhash[fam]); 9467 ipsec_insert_always(&ph->iph_rulebyid, pout4); 9468 9469 /* 9470 * If the requests need security, set enforce_policy. 9471 * If the requests are IPSEC_PREF_NEVER, one should 9472 * still set conn_out_enforce_policy so that an ipsec_out 9473 * gets attached in ip_wput. This is needed so that 9474 * for connections that we don't cache policy in ip_bind, 9475 * if global policy matches in ip_wput_attach_policy, we 9476 * don't wrongly inherit global policy. 
Similarly, we need 9477 * to set conn_in_enforce_policy also so that we don't verify 9478 * policy wrongly. 9479 */ 9480 if ((ah_req & REQ_MASK) != 0 || 9481 (esp_req & REQ_MASK) != 0 || 9482 (se_req & REQ_MASK) != 0) { 9483 connp->conn_in_enforce_policy = B_TRUE; 9484 connp->conn_out_enforce_policy = B_TRUE; 9485 connp->conn_flags |= IPCL_CHECK_POLICY; 9486 } 9487 9488 /* 9489 * Tunnels are allowed to set policy after having been fully bound. 9490 * If that's the case, cache policy here. 9491 */ 9492 if (IPCL_IS_IPTUN(connp) && connp->conn_fully_bound) 9493 error = ipsec_conn_cache_policy(connp, !connp->conn_af_isv6); 9494 9495 mutex_exit(&connp->conn_lock); 9496 return (error); 9497 #undef REQ_MASK 9498 9499 /* 9500 * Common memory-allocation-failure exit path. 9501 */ 9502 enomem: 9503 mutex_exit(&connp->conn_lock); 9504 if (actp != NULL) 9505 ipsec_actvec_free(actp, nact); 9506 if (pin4 != NULL) 9507 IPPOL_REFRELE(pin4); 9508 if (pout4 != NULL) 9509 IPPOL_REFRELE(pout4); 9510 if (pin6 != NULL) 9511 IPPOL_REFRELE(pin6); 9512 if (pout6 != NULL) 9513 IPPOL_REFRELE(pout6); 9514 return (ENOMEM); 9515 } 9516 9517 /* 9518 * Only for options that pass in an IP addr. Currently only V4 options 9519 * pass in an ipif. V6 options always pass an ifindex specifying the ill. 
9520 * So this function assumes level is IPPROTO_IP 9521 */ 9522 int 9523 ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, 9524 mblk_t *first_mp) 9525 { 9526 ipif_t *ipif = NULL; 9527 int error; 9528 ill_t *ill; 9529 9530 ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); 9531 9532 if (addr != INADDR_ANY || checkonly) { 9533 ASSERT(connp != NULL); 9534 if (option == IP_NEXTHOP) { 9535 ipif = 9536 ipif_lookup_onlink_addr(addr, connp->conn_zoneid); 9537 } else { 9538 ipif = ipif_lookup_addr(addr, NULL, connp->conn_zoneid, 9539 CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, 9540 &error); 9541 } 9542 if (ipif == NULL) { 9543 if (error == EINPROGRESS) 9544 return (error); 9545 else if ((option == IP_MULTICAST_IF) || 9546 (option == IP_NEXTHOP)) 9547 return (EHOSTUNREACH); 9548 else 9549 return (EINVAL); 9550 } else if (checkonly) { 9551 if (option == IP_MULTICAST_IF) { 9552 ill = ipif->ipif_ill; 9553 /* not supported by the virtual network iface */ 9554 if (IS_VNI(ill)) { 9555 ipif_refrele(ipif); 9556 return (EINVAL); 9557 } 9558 } 9559 ipif_refrele(ipif); 9560 return (0); 9561 } 9562 ill = ipif->ipif_ill; 9563 mutex_enter(&connp->conn_lock); 9564 mutex_enter(&ill->ill_lock); 9565 if ((ill->ill_state_flags & ILL_CONDEMNED) || 9566 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 9567 mutex_exit(&ill->ill_lock); 9568 mutex_exit(&connp->conn_lock); 9569 ipif_refrele(ipif); 9570 return (option == IP_MULTICAST_IF ? 9571 EHOSTUNREACH : EINVAL); 9572 } 9573 } else { 9574 mutex_enter(&connp->conn_lock); 9575 } 9576 9577 /* None of the options below are supported on the VNI */ 9578 if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { 9579 mutex_exit(&ill->ill_lock); 9580 mutex_exit(&connp->conn_lock); 9581 ipif_refrele(ipif); 9582 return (EINVAL); 9583 } 9584 9585 switch (option) { 9586 case IP_DONTFAILOVER_IF: 9587 /* 9588 * This option is used by in.mpathd to ensure 9589 * that IPMP probe packets only go out on the 9590 * test interfaces. 
in.mpathd sets this option 9591 * on the non-failover interfaces. 9592 * For backward compatibility, this option 9593 * implicitly sets IP_MULTICAST_IF, as used 9594 * be done in bind(), so that ip_wput gets 9595 * this ipif to send mcast packets. 9596 */ 9597 if (ipif != NULL) { 9598 ASSERT(addr != INADDR_ANY); 9599 connp->conn_nofailover_ill = ipif->ipif_ill; 9600 connp->conn_multicast_ipif = ipif; 9601 } else { 9602 ASSERT(addr == INADDR_ANY); 9603 connp->conn_nofailover_ill = NULL; 9604 connp->conn_multicast_ipif = NULL; 9605 } 9606 break; 9607 9608 case IP_MULTICAST_IF: 9609 connp->conn_multicast_ipif = ipif; 9610 break; 9611 case IP_NEXTHOP: 9612 connp->conn_nexthop_v4 = addr; 9613 connp->conn_nexthop_set = B_TRUE; 9614 break; 9615 } 9616 9617 if (ipif != NULL) { 9618 mutex_exit(&ill->ill_lock); 9619 mutex_exit(&connp->conn_lock); 9620 ipif_refrele(ipif); 9621 return (0); 9622 } 9623 mutex_exit(&connp->conn_lock); 9624 /* We succeded in cleared the option */ 9625 return (0); 9626 } 9627 9628 /* 9629 * For options that pass in an ifindex specifying the ill. V6 options always 9630 * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
9631 */ 9632 int 9633 ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, 9634 int level, int option, mblk_t *first_mp) 9635 { 9636 ill_t *ill = NULL; 9637 int error = 0; 9638 9639 ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); 9640 if (ifindex != 0) { 9641 ASSERT(connp != NULL); 9642 ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), 9643 first_mp, ip_restart_optmgmt, &error); 9644 if (ill != NULL) { 9645 if (checkonly) { 9646 /* not supported by the virtual network iface */ 9647 if (IS_VNI(ill)) { 9648 ill_refrele(ill); 9649 return (EINVAL); 9650 } 9651 ill_refrele(ill); 9652 return (0); 9653 } 9654 if (!ipif_lookup_zoneid_group(ill, connp->conn_zoneid, 9655 0, NULL)) { 9656 ill_refrele(ill); 9657 ill = NULL; 9658 mutex_enter(&connp->conn_lock); 9659 goto setit; 9660 } 9661 mutex_enter(&connp->conn_lock); 9662 mutex_enter(&ill->ill_lock); 9663 if (ill->ill_state_flags & ILL_CONDEMNED) { 9664 mutex_exit(&ill->ill_lock); 9665 mutex_exit(&connp->conn_lock); 9666 ill_refrele(ill); 9667 ill = NULL; 9668 mutex_enter(&connp->conn_lock); 9669 } 9670 goto setit; 9671 } else if (error == EINPROGRESS) { 9672 return (error); 9673 } else { 9674 error = 0; 9675 } 9676 } 9677 mutex_enter(&connp->conn_lock); 9678 setit: 9679 ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); 9680 9681 /* 9682 * The options below assume that the ILL (if any) transmits and/or 9683 * receives traffic. Neither of which is true for the virtual network 9684 * interface, so fail setting these on a VNI. 9685 */ 9686 if (IS_VNI(ill)) { 9687 ASSERT(ill != NULL); 9688 mutex_exit(&ill->ill_lock); 9689 mutex_exit(&connp->conn_lock); 9690 ill_refrele(ill); 9691 return (EINVAL); 9692 } 9693 9694 if (level == IPPROTO_IP) { 9695 switch (option) { 9696 case IP_BOUND_IF: 9697 connp->conn_incoming_ill = ill; 9698 connp->conn_outgoing_ill = ill; 9699 connp->conn_orig_bound_ifindex = (ill == NULL) ? 
9700 0 : ifindex; 9701 break; 9702 9703 case IP_XMIT_IF: 9704 /* 9705 * Similar to IP_BOUND_IF, but this only 9706 * determines the outgoing interface for 9707 * unicast packets. Also no IRE_CACHE entry 9708 * is added for the destination of the 9709 * outgoing packets. This feature is needed 9710 * for mobile IP. 9711 */ 9712 connp->conn_xmit_if_ill = ill; 9713 connp->conn_orig_xmit_ifindex = (ill == NULL) ? 9714 0 : ifindex; 9715 break; 9716 9717 case IP_MULTICAST_IF: 9718 /* 9719 * This option is an internal special. The socket 9720 * level IP_MULTICAST_IF specifies an 'ipaddr' and 9721 * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF 9722 * specifies an ifindex and we try first on V6 ill's. 9723 * If we don't find one, we they try using on v4 ill's 9724 * intenally and we come here. 9725 */ 9726 if (!checkonly && ill != NULL) { 9727 ipif_t *ipif; 9728 ipif = ill->ill_ipif; 9729 9730 if (ipif->ipif_state_flags & IPIF_CONDEMNED) { 9731 mutex_exit(&ill->ill_lock); 9732 mutex_exit(&connp->conn_lock); 9733 ill_refrele(ill); 9734 ill = NULL; 9735 mutex_enter(&connp->conn_lock); 9736 } else { 9737 connp->conn_multicast_ipif = ipif; 9738 } 9739 } 9740 break; 9741 } 9742 } else { 9743 switch (option) { 9744 case IPV6_BOUND_IF: 9745 connp->conn_incoming_ill = ill; 9746 connp->conn_outgoing_ill = ill; 9747 connp->conn_orig_bound_ifindex = (ill == NULL) ? 9748 0 : ifindex; 9749 break; 9750 9751 case IPV6_BOUND_PIF: 9752 /* 9753 * Limit all transmit to this ill. 9754 * Unlike IPV6_BOUND_IF, using this option 9755 * prevents load spreading and failover from 9756 * happening when the interface is part of the 9757 * group. That's why we don't need to remember 9758 * the ifindex in orig_bound_ifindex as in 9759 * IPV6_BOUND_IF. 9760 */ 9761 connp->conn_outgoing_pill = ill; 9762 break; 9763 9764 case IPV6_DONTFAILOVER_IF: 9765 /* 9766 * This option is used by in.mpathd to ensure 9767 * that IPMP probe packets only go out on the 9768 * test interfaces. 
in.mpathd sets this option 9769 * on the non-failover interfaces. 9770 */ 9771 connp->conn_nofailover_ill = ill; 9772 /* 9773 * For backward compatibility, this option 9774 * implicitly sets ip_multicast_ill as used in 9775 * IP_MULTICAST_IF so that ip_wput gets 9776 * this ipif to send mcast packets. 9777 */ 9778 connp->conn_multicast_ill = ill; 9779 connp->conn_orig_multicast_ifindex = (ill == NULL) ? 9780 0 : ifindex; 9781 break; 9782 9783 case IPV6_MULTICAST_IF: 9784 /* 9785 * Set conn_multicast_ill to be the IPv6 ill. 9786 * Set conn_multicast_ipif to be an IPv4 ipif 9787 * for ifindex to make IPv4 mapped addresses 9788 * on PF_INET6 sockets honor IPV6_MULTICAST_IF. 9789 * Even if no IPv6 ill exists for the ifindex 9790 * we need to check for an IPv4 ifindex in order 9791 * for this to work with mapped addresses. In that 9792 * case only set conn_multicast_ipif. 9793 */ 9794 if (!checkonly) { 9795 if (ifindex == 0) { 9796 connp->conn_multicast_ill = NULL; 9797 connp->conn_orig_multicast_ifindex = 0; 9798 connp->conn_multicast_ipif = NULL; 9799 } else if (ill != NULL) { 9800 connp->conn_multicast_ill = ill; 9801 connp->conn_orig_multicast_ifindex = 9802 ifindex; 9803 } 9804 } 9805 break; 9806 } 9807 } 9808 9809 if (ill != NULL) { 9810 mutex_exit(&ill->ill_lock); 9811 mutex_exit(&connp->conn_lock); 9812 ill_refrele(ill); 9813 return (0); 9814 } 9815 mutex_exit(&connp->conn_lock); 9816 /* 9817 * We succeeded in clearing the option (ifindex == 0) or failed to 9818 * locate the ill and could not set the option (ifindex != 0) 9819 */ 9820 return (ifindex == 0 ? 0 : EINVAL); 9821 } 9822 9823 /* This routine sets socket options. 
*/ 9824 /* ARGSUSED */ 9825 int 9826 ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, 9827 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 9828 void *dummy, cred_t *cr, mblk_t *first_mp) 9829 { 9830 int *i1 = (int *)invalp; 9831 conn_t *connp = Q_TO_CONN(q); 9832 int error = 0; 9833 boolean_t checkonly; 9834 ire_t *ire; 9835 boolean_t found; 9836 9837 switch (optset_context) { 9838 9839 case SETFN_OPTCOM_CHECKONLY: 9840 checkonly = B_TRUE; 9841 /* 9842 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 9843 * inlen != 0 implies value supplied and 9844 * we have to "pretend" to set it. 9845 * inlen == 0 implies that there is no 9846 * value part in T_CHECK request and just validation 9847 * done elsewhere should be enough, we just return here. 9848 */ 9849 if (inlen == 0) { 9850 *outlenp = 0; 9851 return (0); 9852 } 9853 break; 9854 case SETFN_OPTCOM_NEGOTIATE: 9855 case SETFN_UD_NEGOTIATE: 9856 case SETFN_CONN_NEGOTIATE: 9857 checkonly = B_FALSE; 9858 break; 9859 default: 9860 /* 9861 * We should never get here 9862 */ 9863 *outlenp = 0; 9864 return (EINVAL); 9865 } 9866 9867 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 9868 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 9869 9870 /* 9871 * For fixed length options, no sanity check 9872 * of passed in length is done. It is assumed *_optcom_req() 9873 * routines do the right thing. 9874 */ 9875 9876 switch (level) { 9877 case SOL_SOCKET: 9878 /* 9879 * conn_lock protects the bitfields, and is used to 9880 * set the fields atomically. 9881 */ 9882 switch (name) { 9883 case SO_BROADCAST: 9884 if (!checkonly) { 9885 /* TODO: use value someplace? */ 9886 mutex_enter(&connp->conn_lock); 9887 connp->conn_broadcast = *i1 ? 1 : 0; 9888 mutex_exit(&connp->conn_lock); 9889 } 9890 break; /* goto sizeof (int) option return */ 9891 case SO_USELOOPBACK: 9892 if (!checkonly) { 9893 /* TODO: use value someplace? 
*/ 9894 mutex_enter(&connp->conn_lock); 9895 connp->conn_loopback = *i1 ? 1 : 0; 9896 mutex_exit(&connp->conn_lock); 9897 } 9898 break; /* goto sizeof (int) option return */ 9899 case SO_DONTROUTE: 9900 if (!checkonly) { 9901 mutex_enter(&connp->conn_lock); 9902 connp->conn_dontroute = *i1 ? 1 : 0; 9903 mutex_exit(&connp->conn_lock); 9904 } 9905 break; /* goto sizeof (int) option return */ 9906 case SO_REUSEADDR: 9907 if (!checkonly) { 9908 mutex_enter(&connp->conn_lock); 9909 connp->conn_reuseaddr = *i1 ? 1 : 0; 9910 mutex_exit(&connp->conn_lock); 9911 } 9912 break; /* goto sizeof (int) option return */ 9913 case SO_PROTOTYPE: 9914 if (!checkonly) { 9915 mutex_enter(&connp->conn_lock); 9916 connp->conn_proto = *i1; 9917 mutex_exit(&connp->conn_lock); 9918 } 9919 break; /* goto sizeof (int) option return */ 9920 case SO_ANON_MLP: 9921 if (!checkonly) { 9922 mutex_enter(&connp->conn_lock); 9923 connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; 9924 mutex_exit(&connp->conn_lock); 9925 } 9926 break; /* goto sizeof (int) option return */ 9927 case SO_MAC_EXEMPT: 9928 if (secpolicy_net_mac_aware(cr) != 0 || 9929 IPCL_IS_BOUND(connp)) 9930 return (EACCES); 9931 if (!checkonly) { 9932 mutex_enter(&connp->conn_lock); 9933 connp->conn_mac_exempt = *i1 != 0 ? 
1 : 0; 9934 mutex_exit(&connp->conn_lock); 9935 } 9936 break; /* goto sizeof (int) option return */ 9937 default: 9938 /* 9939 * "soft" error (negative) 9940 * option not handled at this level 9941 * Note: Do not modify *outlenp 9942 */ 9943 return (-EINVAL); 9944 } 9945 break; 9946 case IPPROTO_IP: 9947 switch (name) { 9948 case IP_NEXTHOP: 9949 case IP_MULTICAST_IF: 9950 case IP_DONTFAILOVER_IF: { 9951 ipaddr_t addr = *i1; 9952 9953 error = ip_opt_set_ipif(connp, addr, checkonly, name, 9954 first_mp); 9955 if (error != 0) 9956 return (error); 9957 break; /* goto sizeof (int) option return */ 9958 } 9959 9960 case IP_MULTICAST_TTL: 9961 /* Recorded in transport above IP */ 9962 *outvalp = *invalp; 9963 *outlenp = sizeof (uchar_t); 9964 return (0); 9965 case IP_MULTICAST_LOOP: 9966 if (!checkonly) { 9967 mutex_enter(&connp->conn_lock); 9968 connp->conn_multicast_loop = *invalp ? 1 : 0; 9969 mutex_exit(&connp->conn_lock); 9970 } 9971 *outvalp = *invalp; 9972 *outlenp = sizeof (uchar_t); 9973 return (0); 9974 case IP_ADD_MEMBERSHIP: 9975 case MCAST_JOIN_GROUP: 9976 case IP_DROP_MEMBERSHIP: 9977 case MCAST_LEAVE_GROUP: { 9978 struct ip_mreq *mreqp; 9979 struct group_req *greqp; 9980 ire_t *ire; 9981 boolean_t done = B_FALSE; 9982 ipaddr_t group, ifaddr; 9983 struct sockaddr_in *sin; 9984 uint32_t *ifindexp; 9985 boolean_t mcast_opt = B_TRUE; 9986 mcast_record_t fmode; 9987 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 9988 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 9989 9990 switch (name) { 9991 case IP_ADD_MEMBERSHIP: 9992 mcast_opt = B_FALSE; 9993 /* FALLTHRU */ 9994 case MCAST_JOIN_GROUP: 9995 fmode = MODE_IS_EXCLUDE; 9996 optfn = ip_opt_add_group; 9997 break; 9998 9999 case IP_DROP_MEMBERSHIP: 10000 mcast_opt = B_FALSE; 10001 /* FALLTHRU */ 10002 case MCAST_LEAVE_GROUP: 10003 fmode = MODE_IS_INCLUDE; 10004 optfn = ip_opt_delete_group; 10005 break; 10006 } 10007 10008 if (mcast_opt) { 10009 greqp = (struct group_req *)i1; 10010 sin = (struct sockaddr_in 
*)&greqp->gr_group; 10011 if (sin->sin_family != AF_INET) { 10012 *outlenp = 0; 10013 return (ENOPROTOOPT); 10014 } 10015 group = (ipaddr_t)sin->sin_addr.s_addr; 10016 ifaddr = INADDR_ANY; 10017 ifindexp = &greqp->gr_interface; 10018 } else { 10019 mreqp = (struct ip_mreq *)i1; 10020 group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; 10021 ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; 10022 ifindexp = NULL; 10023 } 10024 10025 /* 10026 * In the multirouting case, we need to replicate 10027 * the request on all interfaces that will take part 10028 * in replication. We do so because multirouting is 10029 * reflective, thus we will probably receive multi- 10030 * casts on those interfaces. 10031 * The ip_multirt_apply_membership() succeeds if the 10032 * operation succeeds on at least one interface. 10033 */ 10034 ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, 10035 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10036 MATCH_IRE_MASK | MATCH_IRE_TYPE); 10037 if (ire != NULL) { 10038 if (ire->ire_flags & RTF_MULTIRT) { 10039 error = ip_multirt_apply_membership( 10040 optfn, ire, connp, checkonly, group, 10041 fmode, INADDR_ANY, first_mp); 10042 done = B_TRUE; 10043 } 10044 ire_refrele(ire); 10045 } 10046 if (!done) { 10047 error = optfn(connp, checkonly, group, ifaddr, 10048 ifindexp, fmode, INADDR_ANY, first_mp); 10049 } 10050 if (error) { 10051 /* 10052 * EINPROGRESS is a soft error, needs retry 10053 * so don't make *outlenp zero. 
10054 */ 10055 if (error != EINPROGRESS) 10056 *outlenp = 0; 10057 return (error); 10058 } 10059 /* OK return - copy input buffer into output buffer */ 10060 if (invalp != outvalp) { 10061 /* don't trust bcopy for identical src/dst */ 10062 bcopy(invalp, outvalp, inlen); 10063 } 10064 *outlenp = inlen; 10065 return (0); 10066 } 10067 case IP_BLOCK_SOURCE: 10068 case IP_UNBLOCK_SOURCE: 10069 case IP_ADD_SOURCE_MEMBERSHIP: 10070 case IP_DROP_SOURCE_MEMBERSHIP: 10071 case MCAST_BLOCK_SOURCE: 10072 case MCAST_UNBLOCK_SOURCE: 10073 case MCAST_JOIN_SOURCE_GROUP: 10074 case MCAST_LEAVE_SOURCE_GROUP: { 10075 struct ip_mreq_source *imreqp; 10076 struct group_source_req *gsreqp; 10077 in_addr_t grp, src, ifaddr = INADDR_ANY; 10078 uint32_t ifindex = 0; 10079 mcast_record_t fmode; 10080 struct sockaddr_in *sin; 10081 ire_t *ire; 10082 boolean_t mcast_opt = B_TRUE, done = B_FALSE; 10083 int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 10084 uint_t *, mcast_record_t, ipaddr_t, mblk_t *); 10085 10086 switch (name) { 10087 case IP_BLOCK_SOURCE: 10088 mcast_opt = B_FALSE; 10089 /* FALLTHRU */ 10090 case MCAST_BLOCK_SOURCE: 10091 fmode = MODE_IS_EXCLUDE; 10092 optfn = ip_opt_add_group; 10093 break; 10094 10095 case IP_UNBLOCK_SOURCE: 10096 mcast_opt = B_FALSE; 10097 /* FALLTHRU */ 10098 case MCAST_UNBLOCK_SOURCE: 10099 fmode = MODE_IS_EXCLUDE; 10100 optfn = ip_opt_delete_group; 10101 break; 10102 10103 case IP_ADD_SOURCE_MEMBERSHIP: 10104 mcast_opt = B_FALSE; 10105 /* FALLTHRU */ 10106 case MCAST_JOIN_SOURCE_GROUP: 10107 fmode = MODE_IS_INCLUDE; 10108 optfn = ip_opt_add_group; 10109 break; 10110 10111 case IP_DROP_SOURCE_MEMBERSHIP: 10112 mcast_opt = B_FALSE; 10113 /* FALLTHRU */ 10114 case MCAST_LEAVE_SOURCE_GROUP: 10115 fmode = MODE_IS_INCLUDE; 10116 optfn = ip_opt_delete_group; 10117 break; 10118 } 10119 10120 if (mcast_opt) { 10121 gsreqp = (struct group_source_req *)i1; 10122 if (gsreqp->gsr_group.ss_family != AF_INET) { 10123 *outlenp = 0; 10124 return (ENOPROTOOPT); 
10125 } 10126 sin = (struct sockaddr_in *)&gsreqp->gsr_group; 10127 grp = (ipaddr_t)sin->sin_addr.s_addr; 10128 sin = (struct sockaddr_in *)&gsreqp->gsr_source; 10129 src = (ipaddr_t)sin->sin_addr.s_addr; 10130 ifindex = gsreqp->gsr_interface; 10131 } else { 10132 imreqp = (struct ip_mreq_source *)i1; 10133 grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; 10134 src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; 10135 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; 10136 } 10137 10138 /* 10139 * In the multirouting case, we need to replicate 10140 * the request as noted in the mcast cases above. 10141 */ 10142 ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, 10143 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10144 MATCH_IRE_MASK | MATCH_IRE_TYPE); 10145 if (ire != NULL) { 10146 if (ire->ire_flags & RTF_MULTIRT) { 10147 error = ip_multirt_apply_membership( 10148 optfn, ire, connp, checkonly, grp, 10149 fmode, src, first_mp); 10150 done = B_TRUE; 10151 } 10152 ire_refrele(ire); 10153 } 10154 if (!done) { 10155 error = optfn(connp, checkonly, grp, ifaddr, 10156 &ifindex, fmode, src, first_mp); 10157 } 10158 if (error != 0) { 10159 /* 10160 * EINPROGRESS is a soft error, needs retry 10161 * so don't make *outlenp zero. 
10162 */ 10163 if (error != EINPROGRESS) 10164 *outlenp = 0; 10165 return (error); 10166 } 10167 /* OK return - copy input buffer into output buffer */ 10168 if (invalp != outvalp) { 10169 bcopy(invalp, outvalp, inlen); 10170 } 10171 *outlenp = inlen; 10172 return (0); 10173 } 10174 case IP_SEC_OPT: 10175 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 10176 if (error != 0) { 10177 *outlenp = 0; 10178 return (error); 10179 } 10180 break; 10181 case IP_HDRINCL: 10182 case IP_OPTIONS: 10183 case T_IP_OPTIONS: 10184 case IP_TOS: 10185 case T_IP_TOS: 10186 case IP_TTL: 10187 case IP_RECVDSTADDR: 10188 case IP_RECVOPTS: 10189 /* OK return - copy input buffer into output buffer */ 10190 if (invalp != outvalp) { 10191 /* don't trust bcopy for identical src/dst */ 10192 bcopy(invalp, outvalp, inlen); 10193 } 10194 *outlenp = inlen; 10195 return (0); 10196 case IP_RECVIF: 10197 /* Retrieve the inbound interface index */ 10198 if (!checkonly) { 10199 mutex_enter(&connp->conn_lock); 10200 connp->conn_recvif = *i1 ? 1 : 0; 10201 mutex_exit(&connp->conn_lock); 10202 } 10203 break; /* goto sizeof (int) option return */ 10204 case IP_RECVSLLA: 10205 /* Retrieve the source link layer address */ 10206 if (!checkonly) { 10207 mutex_enter(&connp->conn_lock); 10208 connp->conn_recvslla = *i1 ? 
1 : 0; 10209 mutex_exit(&connp->conn_lock); 10210 } 10211 break; /* goto sizeof (int) option return */ 10212 case MRT_INIT: 10213 case MRT_DONE: 10214 case MRT_ADD_VIF: 10215 case MRT_DEL_VIF: 10216 case MRT_ADD_MFC: 10217 case MRT_DEL_MFC: 10218 case MRT_ASSERT: 10219 if ((error = secpolicy_net_config(cr, B_FALSE)) != 0) { 10220 *outlenp = 0; 10221 return (error); 10222 } 10223 error = ip_mrouter_set((int)name, q, checkonly, 10224 (uchar_t *)invalp, inlen, first_mp); 10225 if (error) { 10226 *outlenp = 0; 10227 return (error); 10228 } 10229 /* OK return - copy input buffer into output buffer */ 10230 if (invalp != outvalp) { 10231 /* don't trust bcopy for identical src/dst */ 10232 bcopy(invalp, outvalp, inlen); 10233 } 10234 *outlenp = inlen; 10235 return (0); 10236 case IP_BOUND_IF: 10237 case IP_XMIT_IF: 10238 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 10239 level, name, first_mp); 10240 if (error != 0) 10241 return (error); 10242 break; /* goto sizeof (int) option return */ 10243 10244 case IP_UNSPEC_SRC: 10245 /* Allow sending with a zero source address */ 10246 if (!checkonly) { 10247 mutex_enter(&connp->conn_lock); 10248 connp->conn_unspec_src = *i1 ? 1 : 0; 10249 mutex_exit(&connp->conn_lock); 10250 } 10251 break; /* goto sizeof (int) option return */ 10252 default: 10253 /* 10254 * "soft" error (negative) 10255 * option not handled at this level 10256 * Note: Do not modify *outlenp 10257 */ 10258 return (-EINVAL); 10259 } 10260 break; 10261 case IPPROTO_IPV6: 10262 switch (name) { 10263 case IPV6_BOUND_IF: 10264 case IPV6_BOUND_PIF: 10265 case IPV6_DONTFAILOVER_IF: 10266 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 10267 level, name, first_mp); 10268 if (error != 0) 10269 return (error); 10270 break; /* goto sizeof (int) option return */ 10271 10272 case IPV6_MULTICAST_IF: 10273 /* 10274 * The only possible errors are EINPROGRESS and 10275 * EINVAL. EINPROGRESS will be restarted and is not 10276 * a hard error. 
We call this option on both V4 and V6 10277 * If both return EINVAL, then this call returns 10278 * EINVAL. If at least one of them succeeds we 10279 * return success. 10280 */ 10281 found = B_FALSE; 10282 error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, 10283 level, name, first_mp); 10284 if (error == EINPROGRESS) 10285 return (error); 10286 if (error == 0) 10287 found = B_TRUE; 10288 error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, 10289 IPPROTO_IP, IP_MULTICAST_IF, first_mp); 10290 if (error == 0) 10291 found = B_TRUE; 10292 if (!found) 10293 return (error); 10294 break; /* goto sizeof (int) option return */ 10295 10296 case IPV6_MULTICAST_HOPS: 10297 /* Recorded in transport above IP */ 10298 break; /* goto sizeof (int) option return */ 10299 case IPV6_MULTICAST_LOOP: 10300 if (!checkonly) { 10301 mutex_enter(&connp->conn_lock); 10302 connp->conn_multicast_loop = *i1; 10303 mutex_exit(&connp->conn_lock); 10304 } 10305 break; /* goto sizeof (int) option return */ 10306 case IPV6_JOIN_GROUP: 10307 case MCAST_JOIN_GROUP: 10308 case IPV6_LEAVE_GROUP: 10309 case MCAST_LEAVE_GROUP: { 10310 struct ipv6_mreq *ip_mreqp; 10311 struct group_req *greqp; 10312 ire_t *ire; 10313 boolean_t done = B_FALSE; 10314 in6_addr_t groupv6; 10315 uint32_t ifindex; 10316 boolean_t mcast_opt = B_TRUE; 10317 mcast_record_t fmode; 10318 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 10319 int, mcast_record_t, const in6_addr_t *, mblk_t *); 10320 10321 switch (name) { 10322 case IPV6_JOIN_GROUP: 10323 mcast_opt = B_FALSE; 10324 /* FALLTHRU */ 10325 case MCAST_JOIN_GROUP: 10326 fmode = MODE_IS_EXCLUDE; 10327 optfn = ip_opt_add_group_v6; 10328 break; 10329 10330 case IPV6_LEAVE_GROUP: 10331 mcast_opt = B_FALSE; 10332 /* FALLTHRU */ 10333 case MCAST_LEAVE_GROUP: 10334 fmode = MODE_IS_INCLUDE; 10335 optfn = ip_opt_delete_group_v6; 10336 break; 10337 } 10338 10339 if (mcast_opt) { 10340 struct sockaddr_in *sin; 10341 struct sockaddr_in6 *sin6; 10342 greqp = (struct group_req 
*)i1; 10343 if (greqp->gr_group.ss_family == AF_INET) { 10344 sin = (struct sockaddr_in *) 10345 &(greqp->gr_group); 10346 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 10347 &groupv6); 10348 } else { 10349 sin6 = (struct sockaddr_in6 *) 10350 &(greqp->gr_group); 10351 groupv6 = sin6->sin6_addr; 10352 } 10353 ifindex = greqp->gr_interface; 10354 } else { 10355 ip_mreqp = (struct ipv6_mreq *)i1; 10356 groupv6 = ip_mreqp->ipv6mr_multiaddr; 10357 ifindex = ip_mreqp->ipv6mr_interface; 10358 } 10359 /* 10360 * In the multirouting case, we need to replicate 10361 * the request on all interfaces that will take part 10362 * in replication. We do so because multirouting is 10363 * reflective, thus we will probably receive multi- 10364 * casts on those interfaces. 10365 * The ip_multirt_apply_membership_v6() succeeds if 10366 * the operation succeeds on at least one interface. 10367 */ 10368 ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, 10369 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10370 MATCH_IRE_MASK | MATCH_IRE_TYPE); 10371 if (ire != NULL) { 10372 if (ire->ire_flags & RTF_MULTIRT) { 10373 error = ip_multirt_apply_membership_v6( 10374 optfn, ire, connp, checkonly, 10375 &groupv6, fmode, &ipv6_all_zeros, 10376 first_mp); 10377 done = B_TRUE; 10378 } 10379 ire_refrele(ire); 10380 } 10381 if (!done) { 10382 error = optfn(connp, checkonly, &groupv6, 10383 ifindex, fmode, &ipv6_all_zeros, first_mp); 10384 } 10385 if (error) { 10386 /* 10387 * EINPROGRESS is a soft error, needs retry 10388 * so don't make *outlenp zero. 
10389 */ 10390 if (error != EINPROGRESS) 10391 *outlenp = 0; 10392 return (error); 10393 } 10394 /* OK return - copy input buffer into output buffer */ 10395 if (invalp != outvalp) { 10396 /* don't trust bcopy for identical src/dst */ 10397 bcopy(invalp, outvalp, inlen); 10398 } 10399 *outlenp = inlen; 10400 return (0); 10401 } 10402 case MCAST_BLOCK_SOURCE: 10403 case MCAST_UNBLOCK_SOURCE: 10404 case MCAST_JOIN_SOURCE_GROUP: 10405 case MCAST_LEAVE_SOURCE_GROUP: { 10406 struct group_source_req *gsreqp; 10407 in6_addr_t v6grp, v6src; 10408 uint32_t ifindex; 10409 mcast_record_t fmode; 10410 ire_t *ire; 10411 boolean_t done = B_FALSE; 10412 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, 10413 int, mcast_record_t, const in6_addr_t *, mblk_t *); 10414 10415 switch (name) { 10416 case MCAST_BLOCK_SOURCE: 10417 fmode = MODE_IS_EXCLUDE; 10418 optfn = ip_opt_add_group_v6; 10419 break; 10420 case MCAST_UNBLOCK_SOURCE: 10421 fmode = MODE_IS_EXCLUDE; 10422 optfn = ip_opt_delete_group_v6; 10423 break; 10424 case MCAST_JOIN_SOURCE_GROUP: 10425 fmode = MODE_IS_INCLUDE; 10426 optfn = ip_opt_add_group_v6; 10427 break; 10428 case MCAST_LEAVE_SOURCE_GROUP: 10429 fmode = MODE_IS_INCLUDE; 10430 optfn = ip_opt_delete_group_v6; 10431 break; 10432 } 10433 10434 gsreqp = (struct group_source_req *)i1; 10435 ifindex = gsreqp->gsr_interface; 10436 if (gsreqp->gsr_group.ss_family == AF_INET) { 10437 struct sockaddr_in *s; 10438 s = (struct sockaddr_in *)&gsreqp->gsr_group; 10439 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); 10440 s = (struct sockaddr_in *)&gsreqp->gsr_source; 10441 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); 10442 } else { 10443 struct sockaddr_in6 *s6; 10444 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; 10445 v6grp = s6->sin6_addr; 10446 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; 10447 v6src = s6->sin6_addr; 10448 } 10449 10450 /* 10451 * In the multirouting case, we need to replicate 10452 * the request as noted in the mcast cases above. 
10453 */ 10454 ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, 10455 IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, 10456 MATCH_IRE_MASK | MATCH_IRE_TYPE); 10457 if (ire != NULL) { 10458 if (ire->ire_flags & RTF_MULTIRT) { 10459 error = ip_multirt_apply_membership_v6( 10460 optfn, ire, connp, checkonly, 10461 &v6grp, fmode, &v6src, first_mp); 10462 done = B_TRUE; 10463 } 10464 ire_refrele(ire); 10465 } 10466 if (!done) { 10467 error = optfn(connp, checkonly, &v6grp, 10468 ifindex, fmode, &v6src, first_mp); 10469 } 10470 if (error != 0) { 10471 /* 10472 * EINPROGRESS is a soft error, needs retry 10473 * so don't make *outlenp zero. 10474 */ 10475 if (error != EINPROGRESS) 10476 *outlenp = 0; 10477 return (error); 10478 } 10479 /* OK return - copy input buffer into output buffer */ 10480 if (invalp != outvalp) { 10481 bcopy(invalp, outvalp, inlen); 10482 } 10483 *outlenp = inlen; 10484 return (0); 10485 } 10486 case IPV6_UNICAST_HOPS: 10487 /* Recorded in transport above IP */ 10488 break; /* goto sizeof (int) option return */ 10489 case IPV6_UNSPEC_SRC: 10490 /* Allow sending with a zero source address */ 10491 if (!checkonly) { 10492 mutex_enter(&connp->conn_lock); 10493 connp->conn_unspec_src = *i1 ? 1 : 0; 10494 mutex_exit(&connp->conn_lock); 10495 } 10496 break; /* goto sizeof (int) option return */ 10497 case IPV6_RECVPKTINFO: 10498 if (!checkonly) { 10499 mutex_enter(&connp->conn_lock); 10500 connp->conn_ipv6_recvpktinfo = *i1 ? 
1 : 0; 10501 mutex_exit(&connp->conn_lock); 10502 } 10503 break; /* goto sizeof (int) option return */ 10504 case IPV6_RECVTCLASS: 10505 if (!checkonly) { 10506 if (*i1 < 0 || *i1 > 1) { 10507 return (EINVAL); 10508 } 10509 mutex_enter(&connp->conn_lock); 10510 connp->conn_ipv6_recvtclass = *i1; 10511 mutex_exit(&connp->conn_lock); 10512 } 10513 break; 10514 case IPV6_RECVPATHMTU: 10515 if (!checkonly) { 10516 if (*i1 < 0 || *i1 > 1) { 10517 return (EINVAL); 10518 } 10519 mutex_enter(&connp->conn_lock); 10520 connp->conn_ipv6_recvpathmtu = *i1; 10521 mutex_exit(&connp->conn_lock); 10522 } 10523 break; 10524 case IPV6_RECVHOPLIMIT: 10525 if (!checkonly) { 10526 mutex_enter(&connp->conn_lock); 10527 connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; 10528 mutex_exit(&connp->conn_lock); 10529 } 10530 break; /* goto sizeof (int) option return */ 10531 case IPV6_RECVHOPOPTS: 10532 if (!checkonly) { 10533 mutex_enter(&connp->conn_lock); 10534 connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; 10535 mutex_exit(&connp->conn_lock); 10536 } 10537 break; /* goto sizeof (int) option return */ 10538 case IPV6_RECVDSTOPTS: 10539 if (!checkonly) { 10540 mutex_enter(&connp->conn_lock); 10541 connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; 10542 mutex_exit(&connp->conn_lock); 10543 } 10544 break; /* goto sizeof (int) option return */ 10545 case IPV6_RECVRTHDR: 10546 if (!checkonly) { 10547 mutex_enter(&connp->conn_lock); 10548 connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; 10549 mutex_exit(&connp->conn_lock); 10550 } 10551 break; /* goto sizeof (int) option return */ 10552 case IPV6_RECVRTHDRDSTOPTS: 10553 if (!checkonly) { 10554 mutex_enter(&connp->conn_lock); 10555 connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; 10556 mutex_exit(&connp->conn_lock); 10557 } 10558 break; /* goto sizeof (int) option return */ 10559 case IPV6_PKTINFO: 10560 if (inlen == 0) 10561 return (-EINVAL); /* clearing option */ 10562 error = ip6_set_pktinfo(cr, connp, 10563 (struct in6_pktinfo *)invalp, first_mp); 10564 if (error != 0) 10565 *outlenp = 0; 10566 else 10567 *outlenp = inlen; 10568 return (error); 10569 case IPV6_NEXTHOP: { 10570 struct sockaddr_in6 *sin6; 10571 10572 /* Verify that the nexthop is reachable */ 10573 if (inlen == 0) 10574 return (-EINVAL); /* clearing option */ 10575 10576 sin6 = (struct sockaddr_in6 *)invalp; 10577 ire = ire_route_lookup_v6(&sin6->sin6_addr, 10578 0, 0, 0, NULL, NULL, connp->conn_zoneid, 10579 NULL, MATCH_IRE_DEFAULT); 10580 10581 if (ire == NULL) { 10582 *outlenp = 0; 10583 return (EHOSTUNREACH); 10584 } 10585 ire_refrele(ire); 10586 return (-EINVAL); 10587 } 10588 case IPV6_SEC_OPT: 10589 error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); 10590 if (error != 0) { 10591 *outlenp = 0; 10592 return (error); 10593 } 10594 break; 10595 case IPV6_SRC_PREFERENCES: { 10596 /* 10597 * This is implemented strictly in the ip module 10598 * (here and in tcp_opt_*() to accomodate tcp 10599 * sockets). Modules above ip pass this option 10600 * down here since ip is the only one that needs to 10601 * be aware of source address preferences. 10602 * 10603 * This socket option only affects connected 10604 * sockets that haven't already bound to a specific 10605 * IPv6 address. In other words, sockets that 10606 * don't call bind() with an address other than the 10607 * unspecified address and that call connect(). 10608 * ip_bind_connected_v6() passes these preferences 10609 * to the ipif_select_source_v6() function. 
10610 */ 10611 if (inlen != sizeof (uint32_t)) 10612 return (EINVAL); 10613 error = ip6_set_src_preferences(connp, 10614 *(uint32_t *)invalp); 10615 if (error != 0) { 10616 *outlenp = 0; 10617 return (error); 10618 } else { 10619 *outlenp = sizeof (uint32_t); 10620 } 10621 break; 10622 } 10623 case IPV6_V6ONLY: 10624 if (*i1 < 0 || *i1 > 1) { 10625 return (EINVAL); 10626 } 10627 mutex_enter(&connp->conn_lock); 10628 connp->conn_ipv6_v6only = *i1; 10629 mutex_exit(&connp->conn_lock); 10630 break; 10631 default: 10632 return (-EINVAL); 10633 } 10634 break; 10635 default: 10636 /* 10637 * "soft" error (negative) 10638 * option not handled at this level 10639 * Note: Do not modify *outlenp 10640 */ 10641 return (-EINVAL); 10642 } 10643 /* 10644 * Common case of return from an option that is sizeof (int) 10645 */ 10646 *(int *)outvalp = *i1; 10647 *outlenp = sizeof (int); 10648 return (0); 10649 } 10650 10651 /* 10652 * This routine gets default values of certain options whose default 10653 * values are maintained by protocol specific code 10654 */ 10655 /* ARGSUSED */ 10656 int 10657 ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 10658 { 10659 int *i1 = (int *)ptr; 10660 10661 switch (level) { 10662 case IPPROTO_IP: 10663 switch (name) { 10664 case IP_MULTICAST_TTL: 10665 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 10666 return (sizeof (uchar_t)); 10667 case IP_MULTICAST_LOOP: 10668 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 10669 return (sizeof (uchar_t)); 10670 default: 10671 return (-1); 10672 } 10673 case IPPROTO_IPV6: 10674 switch (name) { 10675 case IPV6_UNICAST_HOPS: 10676 *i1 = ipv6_def_hops; 10677 return (sizeof (int)); 10678 case IPV6_MULTICAST_HOPS: 10679 *i1 = IP_DEFAULT_MULTICAST_TTL; 10680 return (sizeof (int)); 10681 case IPV6_MULTICAST_LOOP: 10682 *i1 = IP_DEFAULT_MULTICAST_LOOP; 10683 return (sizeof (int)); 10684 case IPV6_V6ONLY: 10685 *i1 = 1; 10686 return (sizeof (int)); 10687 default: 10688 return (-1); 10689 } 10690 default: 10691 
return (-1); 10692 } 10693 /* NOTREACHED */ 10694 } 10695 10696 /* 10697 * Given a destination address and a pointer to where to put the information 10698 * this routine fills in the mtuinfo. 10699 */ 10700 int 10701 ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, 10702 struct ip6_mtuinfo *mtuinfo) 10703 { 10704 ire_t *ire; 10705 10706 if (IN6_IS_ADDR_UNSPECIFIED(in6)) 10707 return (-1); 10708 10709 bzero(mtuinfo, sizeof (*mtuinfo)); 10710 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 10711 mtuinfo->ip6m_addr.sin6_port = port; 10712 mtuinfo->ip6m_addr.sin6_addr = *in6; 10713 10714 ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL); 10715 if (ire != NULL) { 10716 mtuinfo->ip6m_mtu = ire->ire_max_frag; 10717 ire_refrele(ire); 10718 } else { 10719 mtuinfo->ip6m_mtu = IPV6_MIN_MTU; 10720 } 10721 return (sizeof (struct ip6_mtuinfo)); 10722 } 10723 10724 /* 10725 * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error 10726 * checking of GET_QUEUE_CRED(q) and that ip_g_mrouter is set should be done and 10727 * isn't. This doesn't matter as the error checking is done properly for the 10728 * other MRT options coming in through ip_opt_set. 
10729 */ 10730 int 10731 ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 10732 { 10733 conn_t *connp = Q_TO_CONN(q); 10734 ipsec_req_t *req = (ipsec_req_t *)ptr; 10735 10736 switch (level) { 10737 case IPPROTO_IP: 10738 switch (name) { 10739 case MRT_VERSION: 10740 case MRT_ASSERT: 10741 (void) ip_mrouter_get(name, q, ptr); 10742 return (sizeof (int)); 10743 case IP_SEC_OPT: 10744 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); 10745 case IP_NEXTHOP: 10746 if (connp->conn_nexthop_set) { 10747 *(ipaddr_t *)ptr = connp->conn_nexthop_v4; 10748 return (sizeof (ipaddr_t)); 10749 } else 10750 return (0); 10751 default: 10752 break; 10753 } 10754 break; 10755 case IPPROTO_IPV6: 10756 switch (name) { 10757 case IPV6_SEC_OPT: 10758 return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); 10759 case IPV6_SRC_PREFERENCES: { 10760 return (ip6_get_src_preferences(connp, 10761 (uint32_t *)ptr)); 10762 } 10763 case IPV6_V6ONLY: 10764 *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; 10765 return (sizeof (int)); 10766 case IPV6_PATHMTU: 10767 return (ip_fill_mtuinfo(&connp->conn_remv6, 0, 10768 (struct ip6_mtuinfo *)ptr)); 10769 default: 10770 break; 10771 } 10772 break; 10773 default: 10774 break; 10775 } 10776 return (-1); 10777 } 10778 10779 /* Named Dispatch routine to get a current value out of our parameter table. */ 10780 /* ARGSUSED */ 10781 static int 10782 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 10783 { 10784 ipparam_t *ippa = (ipparam_t *)cp; 10785 10786 (void) mi_mpprintf(mp, "%d", ippa->ip_param_value); 10787 return (0); 10788 } 10789 10790 /* ARGSUSED */ 10791 static int 10792 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 10793 { 10794 10795 (void) mi_mpprintf(mp, "%d", *(int *)cp); 10796 return (0); 10797 } 10798 10799 /* 10800 * Set ip{,6}_forwarding values. This means walking through all of the 10801 * ill's and toggling their forwarding values. 
10802 */ 10803 /* ARGSUSED */ 10804 static int 10805 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10806 { 10807 long new_value; 10808 int *forwarding_value = (int *)cp; 10809 ill_t *walker; 10810 boolean_t isv6 = (forwarding_value == &ipv6_forward); 10811 ill_walk_context_t ctx; 10812 10813 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10814 new_value < 0 || new_value > 1) { 10815 return (EINVAL); 10816 } 10817 10818 *forwarding_value = new_value; 10819 10820 /* 10821 * Regardless of the current value of ip_forwarding, set all per-ill 10822 * values of ip_forwarding to the value being set. 10823 * 10824 * Bring all the ill's up to date with the new global value. 10825 */ 10826 rw_enter(&ill_g_lock, RW_READER); 10827 10828 if (isv6) 10829 walker = ILL_START_WALK_V6(&ctx); 10830 else 10831 walker = ILL_START_WALK_V4(&ctx); 10832 for (; walker != NULL; walker = ill_next(&ctx, walker)) { 10833 (void) ill_forward_set(q, mp, (new_value != 0), 10834 (caddr_t)walker); 10835 } 10836 rw_exit(&ill_g_lock); 10837 10838 return (0); 10839 } 10840 10841 /* 10842 * Walk through the param array specified registering each element with the 10843 * Named Dispatch handler. This is called only during init. 
So it is ok 10844 * not to acquire any locks 10845 */ 10846 static boolean_t 10847 ip_param_register(ipparam_t *ippa, size_t ippa_cnt, 10848 ipndp_t *ipnd, size_t ipnd_cnt) 10849 { 10850 for (; ippa_cnt-- > 0; ippa++) { 10851 if (ippa->ip_param_name && ippa->ip_param_name[0]) { 10852 if (!nd_load(&ip_g_nd, ippa->ip_param_name, 10853 ip_param_get, ip_param_set, (caddr_t)ippa)) { 10854 nd_free(&ip_g_nd); 10855 return (B_FALSE); 10856 } 10857 } 10858 } 10859 10860 for (; ipnd_cnt-- > 0; ipnd++) { 10861 if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) { 10862 if (!nd_load(&ip_g_nd, ipnd->ip_ndp_name, 10863 ipnd->ip_ndp_getf, ipnd->ip_ndp_setf, 10864 ipnd->ip_ndp_data)) { 10865 nd_free(&ip_g_nd); 10866 return (B_FALSE); 10867 } 10868 } 10869 } 10870 10871 return (B_TRUE); 10872 } 10873 10874 /* Named Dispatch routine to negotiate a new value for one of our parameters. */ 10875 /* ARGSUSED */ 10876 static int 10877 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr) 10878 { 10879 long new_value; 10880 ipparam_t *ippa = (ipparam_t *)cp; 10881 10882 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 10883 new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { 10884 return (EINVAL); 10885 } 10886 ippa->ip_param_value = new_value; 10887 return (0); 10888 } 10889 10890 /* 10891 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases, 10892 * When an ipf is passed here for the first time, if 10893 * we already have in-order fragments on the queue, we convert from the fast- 10894 * path reassembly scheme to the hard-case scheme. From then on, additional 10895 * fragments are reassembled here. We keep track of the start and end offsets 10896 * of each piece, and the number of holes in the chain. When the hole count 10897 * goes to zero, we are done! 
 *
 * The ipf_count will be updated to account for any mblk(s) added (pointed to
 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
 * ipfb_count and ill_frag_count by the difference of ipf_count before and
 * after the call to ip_reassemble().
 *
 * Arguments, as used by the code below:
 *	mp	- the new fragment, one or more mblks linked through b_cont;
 *		  each mblk is unlinked and inserted (or freed) individually.
 *	ipf	- reassembly header the fragment is added to.
 *	start	- byte offset of the fragment within the datagram.  When it
 *		  is 0 the first mblk also carries ipf_nf_hdr_len bytes of
 *		  header, which are excluded from its end offset.
 *	more	- zero if this fragment claims to be the last one.
 *	ill	- only used here to pick the IPv4 or IPv6 MIB counters.
 *	msg_len	- byte count charged to ipf_count for mp.
 *
 * Returns IP_REASS_FAILED when the fragment contradicts what is already
 * known about the end of the datagram, IP_REASS_PARTIAL while holes remain,
 * and IP_REASS_COMPLETE once ipf_hole_cnt has dropped to zero.
 */
int
ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
    size_t msg_len)
{
	uint_t	end;
	mblk_t	*next_mp;
	mblk_t	*mp1;
	uint_t	offset;
	boolean_t incr_dups = B_TRUE;
	boolean_t offset_zero_seen = B_FALSE;
	boolean_t pkt_boundary_checked = B_FALSE;

	/* If start == 0 then ipf_nf_hdr_len has to be set. */
	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);

	/* Add in byte count */
	ipf->ipf_count += msg_len;
	if (ipf->ipf_end) {
		/*
		 * We were part way through in-order reassembly, but now there
		 * is a hole.  We walk through messages already queued, and
		 * mark them for hard case reassembly.  We know that up till
		 * now they were in order starting from offset zero.
		 */
		offset = 0;
		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
			IP_REASS_SET_START(mp1, offset);
			if (offset == 0) {
				ASSERT(ipf->ipf_nf_hdr_len != 0);
				/*
				 * Bias the (unsigned) running offset by the
				 * header length so that, after the mblk
				 * length is added below, the END excludes
				 * the header bytes.
				 */
				offset = -ipf->ipf_nf_hdr_len;
			}
			offset += mp1->b_wptr - mp1->b_rptr;
			IP_REASS_SET_END(mp1, offset);
		}
		/* One hole at the end. */
		ipf->ipf_hole_cnt = 1;
		/* Brand it as a hard case, forever. */
		ipf->ipf_end = 0;
	}
	/* Walk through all the new pieces. */
	do {
		end = start + (mp->b_wptr - mp->b_rptr);
		/*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get wrong value in the
		 * second pass of the loop if first mblk is exactly the
		 * size of ipf_nf_hdr_len.
		 */
		if (start == 0 && !offset_zero_seen) {
			/* First segment */
			ASSERT(ipf->ipf_nf_hdr_len != 0);
			end -= ipf->ipf_nf_hdr_len;
			offset_zero_seen = B_TRUE;
		}
		/* Remember the successor now; mp->b_cont is rewritten below. */
		next_mp = mp->b_cont;
		/*
		 * We are checking to see if there is any interesting data
		 * to process.  If there isn't and the mblk isn't the
		 * one which carries the unfragmentable header then we
		 * drop it.  It's possible to have just the unfragmentable
		 * header come through without any data.  That needs to be
		 * saved.
		 *
		 * If the assert at the top of this function holds then the
		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
		 * is infrequently traveled enough that the test is left in
		 * to protect against future code changes which break that
		 * invariant.
		 */
		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
			/* Empty.  Blast it. */
			IP_REASS_SET_START(mp, 0);
			IP_REASS_SET_END(mp, 0);
			/*
			 * If the ipf points to the mblk we are about to free,
			 * update ipf to point to the next mblk (or NULL
			 * if none).
			 */
			if (ipf->ipf_mp->b_cont == mp)
				ipf->ipf_mp->b_cont = next_mp;
			freeb(mp);
			continue;
		}
		mp->b_cont = NULL;
		IP_REASS_SET_START(mp, start);
		IP_REASS_SET_END(mp, end);
		if (!ipf->ipf_tail_mp) {
			/* Nothing queued yet: this mblk becomes head and tail. */
			ipf->ipf_tail_mp = mp;
			ipf->ipf_mp->b_cont = mp;
			if (start == 0 || !more) {
				ipf->ipf_hole_cnt = 1;
				/*
				 * if the first fragment comes in more than one
				 * mblk, this loop will be executed for each
				 * mblk. Need to adjust hole count so exiting
				 * this routine will leave hole count at 1.
				 */
				if (next_mp)
					ipf->ipf_hole_cnt++;
			} else
				ipf->ipf_hole_cnt = 2;
			continue;
		} else if (ipf->ipf_last_frag_seen && !more &&
		    !pkt_boundary_checked) {
			/*
			 * We check datagram boundary only if this fragment
			 * claims to be the last fragment and we have seen a
			 * last fragment in the past too. We do this only
			 * once for a given fragment.
			 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 gets handled as a complete packet. These
			 * fragments should not reach here.
			 */

			if (start + msgdsize(mp) !=
			    IP_REASS_END(ipf->ipf_tail_mp)) {
				/*
				 * We have two fragments both of which claim
				 * to be the last fragment but gives conflicting
				 * information about the whole datagram size.
				 * Something fishy is going on. Drop the
				 * fragment and free up the reassembly list.
				 */
				return (IP_REASS_FAILED);
			}

			/*
			 * We shouldn't come to this code block again for this
			 * particular fragment.
			 */
			pkt_boundary_checked = B_TRUE;
		}

		/* New stuff at or beyond tail? */
		offset = IP_REASS_END(ipf->ipf_tail_mp);
		if (start >= offset) {
			if (ipf->ipf_last_frag_seen) {
				/* current fragment is beyond last fragment */
				return (IP_REASS_FAILED);
			}
			/* Link it on end. */
			ipf->ipf_tail_mp->b_cont = mp;
			ipf->ipf_tail_mp = mp;
			if (more) {
				if (start != offset)
					ipf->ipf_hole_cnt++;
			} else if (start == offset && next_mp == NULL)
					ipf->ipf_hole_cnt--;
			continue;
		}
		mp1 = ipf->ipf_mp->b_cont;
		offset = IP_REASS_START(mp1);
		/* New stuff at the front? */
		if (start < offset) {
			if (start == 0) {
				if (end >= offset) {
					/* Nailed the hole at the beginning. */
					ipf->ipf_hole_cnt--;
				}
			} else if (end < offset) {
				/*
				 * A hole, stuff, and a hole where there used
				 * to be just a hole.
				 */
				ipf->ipf_hole_cnt++;
			}
			mp->b_cont = mp1;
			/* Check for overlap. */
			while (end > offset) {
				if (end < IP_REASS_END(mp1)) {
					/* Partial overlap: trim our tail. */
					mp->b_wptr -= end - offset;
					IP_REASS_SET_END(mp, offset);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					break;
				}
				/* Did we cover another hole? */
				if ((mp1->b_cont &&
				    IP_REASS_END(mp1) !=
				    IP_REASS_START(mp1->b_cont) &&
				    end >= IP_REASS_START(mp1->b_cont)) ||
				    (!ipf->ipf_last_frag_seen && !more)) {
					ipf->ipf_hole_cnt--;
				}
				/* Clip out mp1. */
				if ((mp->b_cont = mp1->b_cont) == NULL) {
					/*
					 * After clipping out mp1, this guy
					 * is now hanging off the end.
					 */
					ipf->ipf_tail_mp = mp;
				}
				IP_REASS_SET_START(mp1, 0);
				IP_REASS_SET_END(mp1, 0);
				/* Subtract byte count */
				ipf->ipf_count -= mp1->b_datap->db_lim -
				    mp1->b_datap->db_base;
				freeb(mp1);
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				mp1 = mp->b_cont;
				if (!mp1)
					break;
				offset = IP_REASS_START(mp1);
			}
			ipf->ipf_mp->b_cont = mp;
			continue;
		}
		/*
		 * The new piece starts somewhere between the start of the head
		 * and before the end of the tail.
		 */
		for (; mp1; mp1 = mp1->b_cont) {
			offset = IP_REASS_END(mp1);
			if (start < offset) {
				if (end <= offset) {
					/* Nothing new. */
					IP_REASS_SET_START(mp, 0);
					IP_REASS_SET_END(mp, 0);
					/* Subtract byte count */
					ipf->ipf_count -= mp->b_datap->db_lim -
					    mp->b_datap->db_base;
					/*
					 * Count at most one duplicate per
					 * call, however many mblks of this
					 * fragment turn out to be dups.
					 */
					if (incr_dups) {
						ipf->ipf_num_dups++;
						incr_dups = B_FALSE;
					}
					freeb(mp);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmDuplicates);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmDuplicates);
					}
					break;
				}
				/*
				 * Trim redundant stuff off beginning of new
				 * piece.
				 */
				IP_REASS_SET_START(mp, offset);
				mp->b_rptr += offset - start;
				if (ill->ill_isv6) {
					BUMP_MIB(ill->ill_ip6_mib,
					    ipv6ReasmPartDups);
				} else {
					BUMP_MIB(&ip_mib, ipReasmPartDups);
				}
				start = offset;
				if (!mp1->b_cont) {
					/*
					 * After trimming, this guy is now
					 * hanging off the end.
					 */
					mp1->b_cont = mp;
					ipf->ipf_tail_mp = mp;
					if (!more) {
						ipf->ipf_hole_cnt--;
					}
					break;
				}
			}
			/* Not before the next piece yet: keep walking. */
			if (start >= IP_REASS_START(mp1->b_cont))
				continue;
			/* Fill a hole */
			if (start > offset)
				ipf->ipf_hole_cnt++;
			mp->b_cont = mp1->b_cont;
			mp1->b_cont = mp;
			mp1 = mp->b_cont;
			offset = IP_REASS_START(mp1);
			if (end >= offset) {
				ipf->ipf_hole_cnt--;
				/* Check for overlap. */
				while (end > offset) {
					if (end < IP_REASS_END(mp1)) {
						mp->b_wptr -= end - offset;
						IP_REASS_SET_END(mp, offset);
						/*
						 * TODO we might bump
						 * this up twice if there is
						 * overlap at both ends.
						 */
						if (ill->ill_isv6) {
							BUMP_MIB(
							    ill->ill_ip6_mib,
							    ipv6ReasmPartDups);
						} else {
							BUMP_MIB(&ip_mib,
							    ipReasmPartDups);
						}
						break;
					}
					/* Did we cover another hole? */
					if ((mp1->b_cont &&
					    IP_REASS_END(mp1)
					    != IP_REASS_START(mp1->b_cont) &&
					    end >=
					    IP_REASS_START(mp1->b_cont)) ||
					    (!ipf->ipf_last_frag_seen &&
					    !more)) {
						ipf->ipf_hole_cnt--;
					}
					/* Clip out mp1. */
					if ((mp->b_cont = mp1->b_cont) ==
					    NULL) {
						/*
						 * After clipping out mp1,
						 * this guy is now hanging
						 * off the end.
						 */
						ipf->ipf_tail_mp = mp;
					}
					IP_REASS_SET_START(mp1, 0);
					IP_REASS_SET_END(mp1, 0);
					/* Subtract byte count */
					ipf->ipf_count -=
					    mp1->b_datap->db_lim -
					    mp1->b_datap->db_base;
					freeb(mp1);
					if (ill->ill_isv6) {
						BUMP_MIB(ill->ill_ip6_mib,
						    ipv6ReasmPartDups);
					} else {
						BUMP_MIB(&ip_mib,
						    ipReasmPartDups);
					}
					mp1 = mp->b_cont;
					if (!mp1)
						break;
					offset = IP_REASS_START(mp1);
				}
			}
			break;
		}
	/* Advance: the next mblk starts where this one ended. */
	} while (start = end, mp = next_mp);

	/* Fragment just processed could be the last one. Remember this fact */
	if (!more)
		ipf->ipf_last_frag_seen = B_TRUE;

	/* Still got holes? */
	if (ipf->ipf_hole_cnt)
		return (IP_REASS_PARTIAL);
	/* Clean up overloaded fields to avoid upstream disasters.
*/ 11259 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) { 11260 IP_REASS_SET_START(mp1, 0); 11261 IP_REASS_SET_END(mp1, 0); 11262 } 11263 return (IP_REASS_COMPLETE); 11264 } 11265 11266 /* 11267 * ipsec processing for the fast path, used for input UDP Packets 11268 */ 11269 static boolean_t 11270 ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, 11271 mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present) 11272 { 11273 uint32_t ill_index; 11274 uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ 11275 11276 ASSERT(ipha->ipha_protocol == IPPROTO_UDP); 11277 /* The ill_index of the incoming ILL */ 11278 ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; 11279 11280 /* pass packet up to the transport */ 11281 if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) { 11282 *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, 11283 NULL, mctl_present); 11284 if (*first_mpp == NULL) { 11285 return (B_FALSE); 11286 } 11287 } 11288 11289 /* Initiate IPPF processing for fastpath UDP */ 11290 if (IPP_ENABLED(IPP_LOCAL_IN)) { 11291 ip_process(IPP_LOCAL_IN, mpp, ill_index); 11292 if (*mpp == NULL) { 11293 ip2dbg(("ip_input_ipsec_process: UDP pkt " 11294 "deferred/dropped during IPPF processing\n")); 11295 return (B_FALSE); 11296 } 11297 } 11298 /* 11299 * We make the checks as below since we are in the fast path 11300 * and want to minimize the number of checks if the IP_RECVIF and/or 11301 * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set 11302 */ 11303 if (connp->conn_recvif || connp->conn_recvslla || 11304 connp->conn_ipv6_recvpktinfo) { 11305 if (connp->conn_recvif || 11306 connp->conn_ipv6_recvpktinfo) { 11307 in_flags = IPF_RECVIF; 11308 } 11309 if (connp->conn_recvslla) { 11310 in_flags |= IPF_RECVSLLA; 11311 } 11312 /* 11313 * since in_flags are being set ill will be 11314 * referenced in ip_add_info, so it better not 11315 * be NULL. 
11316 */ 11317 /* 11318 * the actual data will be contained in b_cont 11319 * upon successful return of the following call. 11320 * If the call fails then the original mblk is 11321 * returned. 11322 */ 11323 *mpp = ip_add_info(*mpp, ill, in_flags); 11324 } 11325 11326 return (B_TRUE); 11327 } 11328 11329 /* 11330 * Fragmentation reassembly. Each ILL has a hash table for 11331 * queuing packets undergoing reassembly for all IPIFs 11332 * associated with the ILL. The hash is based on the packet 11333 * IP ident field. The ILL frag hash table was allocated 11334 * as a timer block at the time the ILL was created. Whenever 11335 * there is anything on the reassembly queue, the timer will 11336 * be running. Returns B_TRUE if successful else B_FALSE; 11337 * frees mp on failure. 11338 */ 11339 static boolean_t 11340 ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, 11341 uint32_t *cksum_val, uint16_t *cksum_flags) 11342 { 11343 uint32_t frag_offset_flags; 11344 ill_t *ill = (ill_t *)q->q_ptr; 11345 mblk_t *mp = *mpp; 11346 mblk_t *t_mp; 11347 ipaddr_t dst; 11348 uint8_t proto = ipha->ipha_protocol; 11349 uint32_t sum_val; 11350 uint16_t sum_flags; 11351 ipf_t *ipf; 11352 ipf_t **ipfp; 11353 ipfb_t *ipfb; 11354 uint16_t ident; 11355 uint32_t offset; 11356 ipaddr_t src; 11357 uint_t hdr_length; 11358 uint32_t end; 11359 mblk_t *mp1; 11360 mblk_t *tail_mp; 11361 size_t count; 11362 size_t msg_len; 11363 uint8_t ecn_info = 0; 11364 uint32_t packet_size; 11365 boolean_t pruned = B_FALSE; 11366 11367 if (cksum_val != NULL) 11368 *cksum_val = 0; 11369 if (cksum_flags != NULL) 11370 *cksum_flags = 0; 11371 11372 /* 11373 * Drop the fragmented as early as possible, if 11374 * we don't have resource(s) to re-assemble. 
	 */
	if (ip_reass_queue_bytes == 0) {
		freemsg(mp);
		return (B_FALSE);
	}

	/* Check for fragmentation offset; return if there's none */
	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
	    (IPH_MF | IPH_OFFSET)) == 0)
		return (B_TRUE);

	/*
	 * We utilize hardware computed checksum info only for UDP since
	 * IP fragmentation is a normal occurrence for the protocol.  In
	 * addition, checksum offload support for IP fragments carrying
	 * UDP payload is commonly implemented across network adapters.
	 */
	ASSERT(ill != NULL);
	if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
		/* Note: this mp1 shadows the function-scope mp1. */
		mblk_t *mp1 = mp->b_cont;
		int32_t len;

		/* Record checksum information from the packet */
		sum_val = (uint32_t)DB_CKSUM16(mp);
		sum_flags = DB_CKSUMFLAGS(mp);

		/* IP payload offset from beginning of mblk */
		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;

		if ((sum_flags & HCK_PARTIALCKSUM) &&
		    (mp1 == NULL || mp1->b_cont == NULL) &&
		    offset >= DB_CKSUMSTART(mp) &&
		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
			uint32_t adj;
			/*
			 * Partial checksum has been calculated by hardware
			 * and attached to the packet; in addition, any
			 * prepended extraneous data is even byte aligned.
			 * If any such data exists, we adjust the checksum;
			 * this would also handle any postpended data.
			 */
			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
			    mp, mp1, len, adj);

			/* One's complement subtract extraneous checksum */
			if (adj >= sum_val)
				sum_val = ~(adj - sum_val) & 0xFFFF;
			else
				sum_val -= adj;
		}
	} else {
		sum_val = 0;
		sum_flags = 0;
	}

	/* Clear hardware checksumming flag */
	DB_CKSUMFLAGS(mp) = 0;

	ident = ipha->ipha_ident;
	/*
	 * Fragment offset in bytes (the header field counts 8-byte units).
	 * NOTE(review): the 0xFFFF mask presumably strips the IPH_MF bit
	 * that the shift moves above bit 15; the calls to ip_reassemble()
	 * below compute the same quantity as
	 * (frag_offset_flags & IPH_OFFSET) << 3.
	 */
	offset = (frag_offset_flags << 3) & 0xFFFF;
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	hdr_length = IPH_HDR_LENGTH(ipha);
	end = ntohs(ipha->ipha_length) - hdr_length;

	/* If end == 0 then we have a packet with no data, so just free it */
	if (end == 0) {
		freemsg(mp);
		return (B_FALSE);
	}

	/* Record the ECN field info. */
	ecn_info = (ipha->ipha_type_of_service & 0x3);
	if (offset != 0) {
		/*
		 * If this isn't the first piece, strip the header, and
		 * add the offset to the end value.
		 */
		mp->b_rptr += hdr_length;
		end += offset;
	}

	/* Total MBLKSIZE() of the chain; also locate its last mblk. */
	msg_len = MBLKSIZE(mp);
	tail_mp = mp;
	while (tail_mp->b_cont != NULL) {
		tail_mp = tail_mp->b_cont;
		msg_len += MBLKSIZE(tail_mp);
	}

	/* If the reassembly list for this ILL will get too big, prune it */
	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
	    ip_reass_queue_bytes) {
		ill_frag_prune(ill,
		    (ip_reass_queue_bytes < msg_len) ? 0 :
		    (ip_reass_queue_bytes - msg_len));
		pruned = B_TRUE;
	}

	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
	mutex_enter(&ipfb->ipfb_lock);

	ipfp = &ipfb->ipfb_ipf;
	/* Try to find an existing fragment queue for this packet. */
	for (;;) {
		ipf = ipfp[0];
		if (ipf != NULL) {
			/*
			 * It has to match on ident, src/dst address
			 * and protocol.
			 */
			if (ipf->ipf_ident == ident &&
			    ipf->ipf_src == src &&
			    ipf->ipf_dst == dst &&
			    ipf->ipf_protocol == proto) {
				/*
				 * If we have received too many
				 * duplicate fragments for this packet
				 * free it.
				 */
				if (ipf->ipf_num_dups > ip_max_frag_dups) {
					ill_frag_free_pkts(ill, ipfb, ipf, 1);
					freemsg(mp);
					mutex_exit(&ipfb->ipfb_lock);
					return (B_FALSE);
				}
				/* Found it. */
				break;
			}
			ipfp = &ipf->ipf_hash_next;
			continue;
		}

		/*
		 * If we pruned the list, do we want to store this new
		 * fragment?. We apply an optimization here based on the
		 * fact that most fragments will be received in order.
		 * So if the offset of this incoming fragment is zero,
		 * it is the first fragment of a new packet. We will
		 * keep it.  Otherwise drop the fragment, as we have
		 * probably pruned the packet already (since the
		 * packet cannot be found).
		 */
		if (pruned && offset != 0) {
			mutex_exit(&ipfb->ipfb_lock);
			freemsg(mp);
			return (B_FALSE);
		}

		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
			/*
			 * Too many fragmented packets in this hash
			 * bucket. Free the oldest.
			 */
			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
		}

		/* New guy.  Allocate a frag message. */
		mp1 = allocb(sizeof (*ipf), BPRI_MED);
		if (mp1 == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			freemsg(mp);
			/*
			 * Shared exit for every path that still holds
			 * ipfb_lock and is not handing a completed datagram
			 * back.  It always returns B_FALSE, including after
			 * a fragment has been queued successfully but the
			 * datagram is not yet complete.
			 */
reass_done:
			mutex_exit(&ipfb->ipfb_lock);
			return (B_FALSE);
		}


		BUMP_MIB(&ip_mib, ipReasmReqds);
		mp1->b_cont = mp;

		/* Initialize the fragment header. */
		ipf = (ipf_t *)mp1->b_rptr;
		ipf->ipf_mp = mp1;
		ipf->ipf_ptphn = ipfp;
		ipfp[0] = ipf;
		ipf->ipf_hash_next = NULL;
		ipf->ipf_ident = ident;
		ipf->ipf_protocol = proto;
		ipf->ipf_src = src;
		ipf->ipf_dst = dst;
		ipf->ipf_nf_hdr_len = 0;
		/* Record reassembly start time. */
		ipf->ipf_timestamp = gethrestime_sec();
		/* Record ipf generation and account for frag header */
		ipf->ipf_gen = ill->ill_ipf_gen++;
		ipf->ipf_count = MBLKSIZE(mp1);
		ipf->ipf_last_frag_seen = B_FALSE;
		ipf->ipf_ecn = ecn_info;
		ipf->ipf_num_dups = 0;
		ipfb->ipfb_frag_pkts++;
		ipf->ipf_checksum = 0;
		ipf->ipf_checksum_flags = 0;

		/* Store checksum value in fragment header */
		if (sum_flags != 0) {
			/* Fold the 32-bit sum down to 16 bits (twice for carry). */
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
			ipf->ipf_checksum = sum_val;
			ipf->ipf_checksum_flags = sum_flags;
		}

		/*
		 * We handle reassembly two ways.  In the easy case,
		 * where all the fragments show up in order, we do
		 * minimal bookkeeping, and just clip new pieces on
		 * the end.  If we ever see a hole, then we go off
		 * to ip_reassemble which has to mark the pieces and
		 * keep track of the number of holes, etc.  Obviously,
		 * the point of having both mechanisms is so we can
		 * handle the easy case as efficiently as possible.
		 */
		if (offset == 0) {
			/* Easy case, in-order reassembly so far. */
			ipf->ipf_count += msg_len;
			ipf->ipf_tail_mp = tail_mp;
			/*
			 * Keep track of next expected offset in
			 * ipf_end.
			 */
			ipf->ipf_end = end;
			ipf->ipf_nf_hdr_len = hdr_length;
		} else {
			/* Hard case, hole at the beginning. */
			ipf->ipf_tail_mp = NULL;
			/*
			 * ipf_end == 0 means that we have given up
			 * on easy reassembly.
			 */
			ipf->ipf_end = 0;

			/* Forget checksum offload from now on */
			ipf->ipf_checksum_flags = 0;

			/*
			 * ipf_hole_cnt is set by ip_reassemble.
			 * ipf_count is updated by ip_reassemble.
			 * No need to check for return value here
			 * as we don't expect reassembly to complete
			 * or fail for the first fragment itself.
			 */
			(void) ip_reassemble(mp, ipf,
			    (frag_offset_flags & IPH_OFFSET) << 3,
			    (frag_offset_flags & IPH_MF), ill, msg_len);
		}
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += ipf->ipf_count;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		ill->ill_frag_count += ipf->ipf_count;
		ASSERT(ill->ill_frag_count > 0);	/* Wraparound */
		/* If the frag timer wasn't already going, start it. */
		mutex_enter(&ill->ill_lock);
		ill_frag_timer_start(ill);
		mutex_exit(&ill->ill_lock);
		goto reass_done;
	}

	/*
	 * If the packet's flag has changed (it could be coming up
	 * from an interface different than the previous, therefore
	 * possibly different checksum capability), then forget about
	 * any stored checksum states.  Otherwise add the value to
	 * the existing one stored in the fragment header.
	 */
	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
		sum_val += ipf->ipf_checksum;
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
		ipf->ipf_checksum = sum_val;
	} else if (ipf->ipf_checksum_flags != 0) {
		/* Forget checksum offload from now on */
		ipf->ipf_checksum_flags = 0;
	}

	/*
	 * We have a new piece of a datagram which is already being
	 * reassembled.  Update the ECN info if all IP fragments
	 * are ECN capable.  If there is one which is not, clear
	 * all the info.  If there is at least one which has CE
	 * code point, IP needs to report that up to transport.
	 */
	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
		if (ecn_info == IPH_ECN_CE)
			ipf->ipf_ecn = IPH_ECN_CE;
	} else {
		ipf->ipf_ecn = IPH_ECN_NECT;
	}
	if (offset && ipf->ipf_end == offset) {
		/* The new fragment fits at the end */
		ipf->ipf_tail_mp->b_cont = mp;
		/* Update the byte count */
		ipf->ipf_count += msg_len;
		/* Update per ipfb and ill byte counts */
		ipfb->ipfb_count += msg_len;
		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
		ill->ill_frag_count += msg_len;
		ASSERT(ill->ill_frag_count > 0);	/* Wraparound */
		if (frag_offset_flags & IPH_MF) {
			/* More to come. */
			ipf->ipf_end = end;
			ipf->ipf_tail_mp = tail_mp;
			goto reass_done;
		}
		/*
		 * MF is clear: the in-order last fragment has arrived, so
		 * fall through to the completion code below.
		 */
	} else {
		/* Go do the hard cases. */
		int ret;

		if (offset == 0)
			ipf->ipf_nf_hdr_len = hdr_length;

		/* Save current byte count */
		count = ipf->ipf_count;
		ret = ip_reassemble(mp, ipf,
		    (frag_offset_flags & IPH_OFFSET) << 3,
		    (frag_offset_flags & IPH_MF), ill, msg_len);
		/*
		 * Count of bytes added and subtracted (freeb()ed).
		 * NOTE(review): count is a size_t, so a net decrease is
		 * held as a wrapped value; this relies on the additions
		 * below wrapping back the same way - confirm the counter
		 * types.
		 */
		count = ipf->ipf_count - count;
		if (count) {
			/* Update per ipfb and ill byte counts */
			ipfb->ipfb_count += count;
			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
			ill->ill_frag_count += count;
			ASSERT(ill->ill_frag_count > 0);
		}
		if (ret == IP_REASS_PARTIAL) {
			goto reass_done;
		} else if (ret == IP_REASS_FAILED) {
			/* Reassembly failed. Free up all resources */
			ill_frag_free_pkts(ill, ipfb, ipf, 1);
			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
				IP_REASS_SET_START(t_mp, 0);
				IP_REASS_SET_END(t_mp, 0);
			}
			freemsg(mp);
			goto reass_done;
		}
		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
	}
	/*
	 * We have completed reassembly.  Unhook the frag header from
	 * the reassembly list.
	 *
	 * Before we free the frag header, record the ECN info
	 * to report back to the transport.
	 */
	ecn_info = ipf->ipf_ecn;
	BUMP_MIB(&ip_mib, ipReasmOKs);
	ipfp = ipf->ipf_ptphn;

	/* We need to supply these to caller */
	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
		sum_val = ipf->ipf_checksum;
	else
		sum_val = 0;

	mp1 = ipf->ipf_mp;
	count = ipf->ipf_count;
	/* From here on 'ipf' is the successor in the hash chain. */
	ipf = ipf->ipf_hash_next;
	if (ipf != NULL)
		ipf->ipf_ptphn = ipfp;
	ipfp[0] = ipf;
	ill->ill_frag_count -= count;
	ASSERT(ipfb->ipfb_count >= count);
	ipfb->ipfb_count -= count;
	ipfb->ipfb_frag_pkts--;
	mutex_exit(&ipfb->ipfb_lock);
	/* Ditch the frag header. */
	mp = mp1->b_cont;

	freeb(mp1);

	/* Restore original IP length in header. */
	packet_size = (uint32_t)msgdsize(mp);
	if (packet_size > IP_MAXPACKET) {
		freemsg(mp);
		BUMP_MIB(&ip_mib, ipInHdrErrors);
		return (B_FALSE);
	}

	/*
	 * The IP header is rewritten below; when DB_REF(mp) shows more than
	 * one reference, do that on a private copy of the message instead.
	 */
	if (DB_REF(mp) > 1) {
		mblk_t *mp2 = copymsg(mp);

		freemsg(mp);
		if (mp2 == NULL) {
			BUMP_MIB(&ip_mib, ipInDiscards);
			return (B_FALSE);
		}
		mp = mp2;
	}
	ipha = (ipha_t *)mp->b_rptr;

	ipha->ipha_length = htons((uint16_t)packet_size);
	/* We're now complete, zip the frag state */
	ipha->ipha_fragment_offset_and_flags = 0;
	/* Record the ECN info.
*/ 11769 ipha->ipha_type_of_service &= 0xFC; 11770 ipha->ipha_type_of_service |= ecn_info; 11771 *mpp = mp; 11772 11773 /* Reassembly is successful; return checksum information if needed */ 11774 if (cksum_val != NULL) 11775 *cksum_val = sum_val; 11776 if (cksum_flags != NULL) 11777 *cksum_flags = sum_flags; 11778 11779 return (B_TRUE); 11780 } 11781 11782 /* 11783 * Perform ip header check sum update local options. 11784 * return B_TRUE if all is well, else return B_FALSE and release 11785 * the mp. caller is responsible for decrementing ire ref cnt. 11786 */ 11787 static boolean_t 11788 ip_options_cksum(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire) 11789 { 11790 mblk_t *first_mp; 11791 boolean_t mctl_present; 11792 uint16_t sum; 11793 11794 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 11795 /* 11796 * Don't do the checksum if it has gone through AH/ESP 11797 * processing. 11798 */ 11799 if (!mctl_present) { 11800 sum = ip_csum_hdr(ipha); 11801 if (sum != 0) { 11802 BUMP_MIB(&ip_mib, ipInCksumErrs); 11803 freemsg(first_mp); 11804 return (B_FALSE); 11805 } 11806 } 11807 11808 if (!ip_rput_local_options(q, mp, ipha, ire)) { 11809 if (mctl_present) 11810 freeb(first_mp); 11811 return (B_FALSE); 11812 } 11813 11814 return (B_TRUE); 11815 } 11816 11817 /* 11818 * All udp packet are delivered to the local host via this routine. 
 */
void
ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
    ill_t *recv_ill)
{
	uint32_t	sum;
	uint32_t	u1;		/* scratch; meaning changes, see below */
	boolean_t	mctl_present;
	conn_t		*connp;
	mblk_t		*first_mp;
	uint16_t	*up;
	ill_t		*ill = (ill_t *)q->q_ptr;
	uint16_t	reass_hck_flags = 0;

#define	rptr    ((uchar_t *)ipha)

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
	ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
	ASSERT(ipha->ipha_protocol == IPPROTO_UDP);

	/*
	 * FAST PATH for udp packets
	 */

	/* u1 is # words of IP options */
	u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);

	/* IP options present */
	if (u1 != 0)
		goto ipoptions;

	/* Check the IP header checksum.  */
	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
		/* Clear the IP header h/w cksum flag */
		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
	} else {
		/* Sum the ten 16-bit words of an option-less IP header. */
#define	uph	((uint16_t *)ipha)
		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
		    uph[6] + uph[7] + uph[8] + uph[9];
#undef	uph
		/* finish doing IP checksum */
		sum = (sum & 0xFFFF) + (sum >> 16);
		sum = ~(sum + (sum >> 16)) & 0xFFFF;
		/*
		 * Don't verify header checksum if this packet is coming
		 * back from AH/ESP as we already did it.
		 */
		if (!mctl_present && sum != 0 && sum != 0xFFFF) {
			BUMP_MIB(&ip_mib, ipInCksumErrs);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * Count for SNMP of inbound packets for ire.
	 * if mctl is present this might be a secure packet and
	 * has already been counted for in ip_proto_input().
	 */
	if (!mctl_present) {
		UPDATE_IB_PKT_COUNT(ire);
		ire->ire_last_used_time = lbolt;
	}

	/* packet part of fragmented IP packet? */
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		goto fragmented;
	}

	/*
	 * u1 = IP header length (20 bytes).  The udppullup code jumped to
	 * below relies on u1 holding the header length.
	 */
	u1 = IP_SIMPLE_HDR_LENGTH;

	/* packet does not contain complete IP & UDP headers */
	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
		goto udppullup;

	/* up points to UDP header */
	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
	/* iphs stays defined for the slow path further down as well. */
#define	iphs    ((uint16_t *)ipha)

	/* if udp hdr cksum != 0, then need to checksum udp packet */
	if (up[3] != 0) {
		mblk_t *mp1 = mp->b_cont;
		boolean_t cksum_err;
		uint16_t hck_flags = 0;

		/* Pseudo-header checksum */
		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
		    iphs[9] + up[2];

		/*
		 * Revert to software checksum calculation if the interface
		 * isn't capable of checksum offload or if IPsec is present.
		 */
		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
			hck_flags = DB_CKSUMFLAGS(mp);

		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
			IP_STAT(ip_in_sw_cksum);

		IP_CKSUM_RECV(hck_flags, u1,
		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
		    (int32_t)((uchar_t *)up - rptr),
		    mp, mp1, cksum_err);

		if (cksum_err) {
			BUMP_MIB(&ip_mib, udpInCksumErrs);

			if (hck_flags & HCK_FULLCKSUM)
				IP_STAT(ip_udp_in_full_hw_cksum_err);
			else if (hck_flags & HCK_PARTIALCKSUM)
				IP_STAT(ip_udp_in_part_hw_cksum_err);
			else
				IP_STAT(ip_udp_in_sw_cksum_err);

			freemsg(first_mp);
			return;
		}
	}

	/* Non-fragmented broadcast or multicast packet? */
	if (ire->ire_type == IRE_BROADCAST)
		goto udpslowpath;

	if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
	    ire->ire_zoneid)) != NULL) {
		ASSERT(connp->conn_upq != NULL);
		IP_STAT(ip_udp_fast_path);

		if (CONN_UDP_FLOWCTLD(connp)) {
			/* Receiver is flow controlled: drop the data. */
			freemsg(mp);
			BUMP_MIB(&ip_mib, udpInOverflows);
		} else {
			if (!mctl_present) {
				BUMP_MIB(&ip_mib, ipInDelivers);
			}
			/*
			 * mp and first_mp can change.
			 */
			if (ip_udp_check(q, connp, recv_ill,
			    ipha, &mp, &first_mp, mctl_present)) {
				/* Send it upstream */
				CONN_UDP_RECV(connp, mp);
			}
		}
		/*
		 * freeb() cannot deal with null mblk being passed
		 * in and first_mp can be set to null in the call
		 * ip_udp_check()->ipsec_check_inbound_policy.
		 */
		if (mctl_present && first_mp != NULL) {
			freeb(first_mp);
		}
		CONN_DEC_REF(connp);
		return;
	}

	/*
	 * if we got here we know the packet is not fragmented and
	 * has no options. The classifier could not find a conn_t and
	 * most likely its an icmp packet so send it through slow path.
	 */

	goto udpslowpath;

	/* Slow path entry for packets that carry IP options. */
ipoptions:
	if (!ip_options_cksum(q, mp, ipha, ire)) {
		goto slow_done;
	}

	UPDATE_IB_PKT_COUNT(ire);
	ire->ire_last_used_time = lbolt;
	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
	if (u1 & (IPH_MF | IPH_OFFSET)) {
		/* Also entered directly from the fast path above. */
fragmented:
		/*
		 * "sum" and "reass_hck_flags" are non-zero if the
		 * reassembled packet has a valid hardware computed
		 * checksum information associated with it.
		 */
		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
			goto slow_done;
		/*
		 * Make sure that first_mp points back to mp as
		 * the mp we came in with could have changed in
		 * ip_rput_fragment().
12007 */ 12008 ASSERT(!mctl_present); 12009 ipha = (ipha_t *)mp->b_rptr; 12010 first_mp = mp; 12011 } 12012 12013 /* Now we have a complete datagram, destined for this machine. */ 12014 u1 = IPH_HDR_LENGTH(ipha); 12015 /* Pull up the UDP header, if necessary. */ 12016 if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { 12017 udppullup: 12018 if (!pullupmsg(mp, u1 + UDPH_SIZE)) { 12019 BUMP_MIB(&ip_mib, ipInDiscards); 12020 freemsg(first_mp); 12021 goto slow_done; 12022 } 12023 ipha = (ipha_t *)mp->b_rptr; 12024 } 12025 12026 /* 12027 * Validate the checksum for the reassembled packet; for the 12028 * pullup case we calculate the payload checksum in software. 12029 */ 12030 up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); 12031 if (up[3] != 0) { 12032 boolean_t cksum_err; 12033 12034 if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12035 IP_STAT(ip_in_sw_cksum); 12036 12037 IP_CKSUM_RECV_REASS(reass_hck_flags, 12038 (int32_t)((uchar_t *)up - (uchar_t *)ipha), 12039 IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 12040 iphs[9] + up[2], sum, cksum_err); 12041 12042 if (cksum_err) { 12043 BUMP_MIB(&ip_mib, udpInCksumErrs); 12044 12045 if (reass_hck_flags & HCK_FULLCKSUM) 12046 IP_STAT(ip_udp_in_full_hw_cksum_err); 12047 else if (reass_hck_flags & HCK_PARTIALCKSUM) 12048 IP_STAT(ip_udp_in_part_hw_cksum_err); 12049 else 12050 IP_STAT(ip_udp_in_sw_cksum_err); 12051 12052 freemsg(first_mp); 12053 goto slow_done; 12054 } 12055 } 12056 udpslowpath: 12057 12058 /* Clear hardware checksum flag to be safe */ 12059 DB_CKSUMFLAGS(mp) = 0; 12060 12061 ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, 12062 (ire->ire_type == IRE_BROADCAST), 12063 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO, 12064 mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); 12065 12066 slow_done: 12067 IP_STAT(ip_udp_slow_path); 12068 return; 12069 12070 #undef iphs 12071 #undef rptr 12072 } 12073 12074 /* ARGSUSED */ 12075 static mblk_t * 12076 ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t 
*recv_ill, boolean_t mctl_present, 12077 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, 12078 ill_rx_ring_t *ill_ring) 12079 { 12080 conn_t *connp; 12081 uint32_t sum; 12082 uint32_t u1; 12083 uint16_t *up; 12084 int offset; 12085 ssize_t len; 12086 mblk_t *mp1; 12087 boolean_t syn_present = B_FALSE; 12088 tcph_t *tcph; 12089 uint_t ip_hdr_len; 12090 ill_t *ill = (ill_t *)q->q_ptr; 12091 zoneid_t zoneid = ire->ire_zoneid; 12092 boolean_t cksum_err; 12093 uint16_t hck_flags = 0; 12094 12095 #define rptr ((uchar_t *)ipha) 12096 12097 ASSERT(ipha->ipha_protocol == IPPROTO_TCP); 12098 12099 /* 12100 * FAST PATH for tcp packets 12101 */ 12102 12103 /* u1 is # words of IP options */ 12104 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12105 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12106 12107 /* IP options present */ 12108 if (u1) { 12109 goto ipoptions; 12110 } else { 12111 /* Check the IP header checksum. */ 12112 if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12113 /* Clear the IP header h/w cksum flag */ 12114 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 12115 } else { 12116 #define uph ((uint16_t *)ipha) 12117 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 12118 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 12119 #undef uph 12120 /* finish doing IP checksum */ 12121 sum = (sum & 0xFFFF) + (sum >> 16); 12122 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12123 /* 12124 * Don't verify header checksum if this packet 12125 * is coming back from AH/ESP as we already did it. 12126 */ 12127 if (!mctl_present && (sum != 0) && sum != 0xFFFF) { 12128 BUMP_MIB(&ip_mib, ipInCksumErrs); 12129 goto error; 12130 } 12131 } 12132 } 12133 12134 if (!mctl_present) { 12135 UPDATE_IB_PKT_COUNT(ire); 12136 ire->ire_last_used_time = lbolt; 12137 } 12138 12139 /* packet part of fragmented IP packet? 
*/ 12140 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12141 if (u1 & (IPH_MF | IPH_OFFSET)) { 12142 goto fragmented; 12143 } 12144 12145 /* u1 = IP header length (20 bytes) */ 12146 u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; 12147 12148 /* does packet contain IP+TCP headers? */ 12149 len = mp->b_wptr - rptr; 12150 if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { 12151 IP_STAT(ip_tcppullup); 12152 goto tcppullup; 12153 } 12154 12155 /* TCP options present? */ 12156 offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; 12157 12158 /* 12159 * If options need to be pulled up, then goto tcpoptions. 12160 * otherwise we are still in the fast path 12161 */ 12162 if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { 12163 IP_STAT(ip_tcpoptions); 12164 goto tcpoptions; 12165 } 12166 12167 /* multiple mblks of tcp data? */ 12168 if ((mp1 = mp->b_cont) != NULL) { 12169 /* more then two? */ 12170 if (mp1->b_cont != NULL) { 12171 IP_STAT(ip_multipkttcp); 12172 goto multipkttcp; 12173 } 12174 len += mp1->b_wptr - mp1->b_rptr; 12175 } 12176 12177 up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); 12178 12179 /* part of pseudo checksum */ 12180 12181 /* TCP datagram length */ 12182 u1 = len - IP_SIMPLE_HDR_LENGTH; 12183 12184 #define iphs ((uint16_t *)ipha) 12185 12186 #ifdef _BIG_ENDIAN 12187 u1 += IPPROTO_TCP; 12188 #else 12189 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 12190 #endif 12191 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 12192 12193 /* 12194 * Revert to software checksum calculation if the interface 12195 * isn't capable of checksum offload or if IPsec is present. 
12196 */ 12197 if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) 12198 hck_flags = DB_CKSUMFLAGS(mp); 12199 12200 if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) 12201 IP_STAT(ip_in_sw_cksum); 12202 12203 IP_CKSUM_RECV(hck_flags, u1, 12204 (uchar_t *)(rptr + DB_CKSUMSTART(mp)), 12205 (int32_t)((uchar_t *)up - rptr), 12206 mp, mp1, cksum_err); 12207 12208 if (cksum_err) { 12209 BUMP_MIB(&ip_mib, tcpInErrs); 12210 12211 if (hck_flags & HCK_FULLCKSUM) 12212 IP_STAT(ip_tcp_in_full_hw_cksum_err); 12213 else if (hck_flags & HCK_PARTIALCKSUM) 12214 IP_STAT(ip_tcp_in_part_hw_cksum_err); 12215 else 12216 IP_STAT(ip_tcp_in_sw_cksum_err); 12217 12218 goto error; 12219 } 12220 12221 try_again: 12222 12223 if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, zoneid)) == 12224 NULL) { 12225 /* Send the TH_RST */ 12226 goto no_conn; 12227 } 12228 12229 /* 12230 * TCP FAST PATH for AF_INET socket. 12231 * 12232 * TCP fast path to avoid extra work. An AF_INET socket type 12233 * does not have facility to receive extra information via 12234 * ip_process or ip_add_info. Also, when the connection was 12235 * established, we made a check if this connection is impacted 12236 * by any global IPSec policy or per connection policy (a 12237 * policy that comes in effect later will not apply to this 12238 * connection). Since all this can be determined at the 12239 * connection establishment time, a quick check of flags 12240 * can avoid extra work. 
12241 */ 12242 if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && 12243 !IPP_ENABLED(IPP_LOCAL_IN)) { 12244 ASSERT(first_mp == mp); 12245 SET_SQUEUE(mp, tcp_rput_data, connp); 12246 return (mp); 12247 } 12248 12249 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 12250 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 12251 if (IPCL_IS_TCP(connp)) { 12252 mp->b_datap->db_struioflag |= STRUIO_EAGER; 12253 DB_CKSUMSTART(mp) = 12254 (intptr_t)ip_squeue_get(ill_ring); 12255 if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && 12256 !CONN_INBOUND_POLICY_PRESENT(connp)) { 12257 SET_SQUEUE(mp, connp->conn_recv, connp); 12258 return (mp); 12259 } else if (IPCL_IS_BOUND(connp) && !mctl_present && 12260 !CONN_INBOUND_POLICY_PRESENT(connp)) { 12261 ip_squeue_enter_unbound++; 12262 SET_SQUEUE(mp, tcp_conn_request_unbound, 12263 connp); 12264 return (mp); 12265 } 12266 syn_present = B_TRUE; 12267 } 12268 12269 } 12270 12271 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { 12272 uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; 12273 12274 /* No need to send this packet to TCP */ 12275 if ((flags & TH_RST) || (flags & TH_URG)) { 12276 CONN_DEC_REF(connp); 12277 freemsg(first_mp); 12278 return (NULL); 12279 } 12280 if (flags & TH_ACK) { 12281 tcp_xmit_listeners_reset(first_mp, ip_hdr_len); 12282 CONN_DEC_REF(connp); 12283 return (NULL); 12284 } 12285 12286 CONN_DEC_REF(connp); 12287 freemsg(first_mp); 12288 return (NULL); 12289 } 12290 12291 if (CONN_INBOUND_POLICY_PRESENT(connp) || mctl_present) { 12292 first_mp = ipsec_check_inbound_policy(first_mp, connp, 12293 ipha, NULL, mctl_present); 12294 if (first_mp == NULL) { 12295 CONN_DEC_REF(connp); 12296 return (NULL); 12297 } 12298 if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { 12299 ASSERT(syn_present); 12300 if (mctl_present) { 12301 ASSERT(first_mp != mp); 12302 first_mp->b_datap->db_struioflag |= 12303 STRUIO_POLICY; 12304 } else { 12305 ASSERT(first_mp == mp); 12306 
mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 12307 mp->b_datap->db_struioflag |= STRUIO_POLICY; 12308 } 12309 } else { 12310 /* 12311 * Discard first_mp early since we're dealing with a 12312 * fully-connected conn_t and tcp doesn't do policy in 12313 * this case. 12314 */ 12315 if (mctl_present) { 12316 freeb(first_mp); 12317 mctl_present = B_FALSE; 12318 } 12319 first_mp = mp; 12320 } 12321 } 12322 12323 /* Initiate IPPF processing for fastpath */ 12324 if (IPP_ENABLED(IPP_LOCAL_IN)) { 12325 uint32_t ill_index; 12326 12327 ill_index = recv_ill->ill_phyint->phyint_ifindex; 12328 ip_process(IPP_LOCAL_IN, &mp, ill_index); 12329 if (mp == NULL) { 12330 ip2dbg(("ip_input_ipsec_process: TCP pkt " 12331 "deferred/dropped during IPPF processing\n")); 12332 CONN_DEC_REF(connp); 12333 if (mctl_present) 12334 freeb(first_mp); 12335 return (NULL); 12336 } else if (mctl_present) { 12337 /* 12338 * ip_process might return a new mp. 12339 */ 12340 ASSERT(first_mp != mp); 12341 first_mp->b_cont = mp; 12342 } else { 12343 first_mp = mp; 12344 } 12345 12346 } 12347 12348 if (!syn_present && connp->conn_ipv6_recvpktinfo) { 12349 mp = ip_add_info(mp, recv_ill, flags); 12350 if (mp == NULL) { 12351 CONN_DEC_REF(connp); 12352 if (mctl_present) 12353 freeb(first_mp); 12354 return (NULL); 12355 } else if (mctl_present) { 12356 /* 12357 * ip_add_info might return a new mp. 12358 */ 12359 ASSERT(first_mp != mp); 12360 first_mp->b_cont = mp; 12361 } else { 12362 first_mp = mp; 12363 } 12364 } 12365 12366 if (IPCL_IS_TCP(connp)) { 12367 SET_SQUEUE(first_mp, connp->conn_recv, connp); 12368 return (first_mp); 12369 } else { 12370 putnext(connp->conn_rq, first_mp); 12371 CONN_DEC_REF(connp); 12372 return (NULL); 12373 } 12374 12375 no_conn: 12376 /* Initiate IPPf processing, if needed. 
*/ 12377 if (IPP_ENABLED(IPP_LOCAL_IN)) { 12378 uint32_t ill_index; 12379 ill_index = recv_ill->ill_phyint->phyint_ifindex; 12380 ip_process(IPP_LOCAL_IN, &first_mp, ill_index); 12381 if (first_mp == NULL) { 12382 return (NULL); 12383 } 12384 } 12385 BUMP_MIB(&ip_mib, ipInDelivers); 12386 tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr)); 12387 return (NULL); 12388 ipoptions: 12389 if (!ip_options_cksum(q, first_mp, ipha, ire)) { 12390 goto slow_done; 12391 } 12392 12393 UPDATE_IB_PKT_COUNT(ire); 12394 ire->ire_last_used_time = lbolt; 12395 12396 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12397 if (u1 & (IPH_MF | IPH_OFFSET)) { 12398 fragmented: 12399 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 12400 if (mctl_present) 12401 freeb(first_mp); 12402 goto slow_done; 12403 } 12404 /* 12405 * Make sure that first_mp points back to mp as 12406 * the mp we came in with could have changed in 12407 * ip_rput_fragment(). 12408 */ 12409 ASSERT(!mctl_present); 12410 ipha = (ipha_t *)mp->b_rptr; 12411 first_mp = mp; 12412 } 12413 12414 /* Now we have a complete datagram, destined for this machine. */ 12415 u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); 12416 12417 len = mp->b_wptr - mp->b_rptr; 12418 /* Pull up a minimal TCP header, if necessary. */ 12419 if (len < (u1 + 20)) { 12420 tcppullup: 12421 if (!pullupmsg(mp, u1 + 20)) { 12422 BUMP_MIB(&ip_mib, ipInDiscards); 12423 goto error; 12424 } 12425 ipha = (ipha_t *)mp->b_rptr; 12426 len = mp->b_wptr - mp->b_rptr; 12427 } 12428 12429 /* 12430 * Extract the offset field from the TCP header. As usual, we 12431 * try to help the compiler more than the reader. 12432 */ 12433 offset = ((uchar_t *)ipha)[u1 + 12] >> 4; 12434 if (offset != 5) { 12435 tcpoptions: 12436 if (offset < 5) { 12437 BUMP_MIB(&ip_mib, ipInDiscards); 12438 goto error; 12439 } 12440 /* 12441 * There must be TCP options. 12442 * Make sure we can grab them. 
12443 */ 12444 offset <<= 2; 12445 offset += u1; 12446 if (len < offset) { 12447 if (!pullupmsg(mp, offset)) { 12448 BUMP_MIB(&ip_mib, ipInDiscards); 12449 goto error; 12450 } 12451 ipha = (ipha_t *)mp->b_rptr; 12452 len = mp->b_wptr - rptr; 12453 } 12454 } 12455 12456 /* Get the total packet length in len, including headers. */ 12457 if (mp->b_cont) { 12458 multipkttcp: 12459 len = msgdsize(mp); 12460 } 12461 12462 /* 12463 * Check the TCP checksum by pulling together the pseudo- 12464 * header checksum, and passing it to ip_csum to be added in 12465 * with the TCP datagram. 12466 * 12467 * Since we are not using the hwcksum if available we must 12468 * clear the flag. We may come here via tcppullup or tcpoptions. 12469 * If either of these fails along the way the mblk is freed. 12470 * If this logic ever changes and mblk is reused to say send 12471 * ICMP's back, then this flag may need to be cleared in 12472 * other places as well. 12473 */ 12474 DB_CKSUMFLAGS(mp) = 0; 12475 12476 up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); 12477 12478 u1 = (uint32_t)(len - u1); /* TCP datagram length. */ 12479 #ifdef _BIG_ENDIAN 12480 u1 += IPPROTO_TCP; 12481 #else 12482 u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); 12483 #endif 12484 u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 12485 /* 12486 * Not M_DATA mblk or its a dup, so do the checksum now. 
12487 */ 12488 IP_STAT(ip_in_sw_cksum); 12489 if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { 12490 BUMP_MIB(&ip_mib, tcpInErrs); 12491 goto error; 12492 } 12493 12494 IP_STAT(ip_tcp_slow_path); 12495 goto try_again; 12496 #undef iphs 12497 #undef rptr 12498 12499 error: 12500 freemsg(first_mp); 12501 slow_done: 12502 return (NULL); 12503 } 12504 12505 /* ARGSUSED */ 12506 static void 12507 ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, 12508 ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) 12509 { 12510 conn_t *connp; 12511 uint32_t sum; 12512 uint32_t u1; 12513 ssize_t len; 12514 sctp_hdr_t *sctph; 12515 zoneid_t zoneid = ire->ire_zoneid; 12516 uint32_t pktsum; 12517 uint32_t calcsum; 12518 uint32_t ports; 12519 uint_t ipif_seqid; 12520 in6_addr_t map_src, map_dst; 12521 ill_t *ill = (ill_t *)q->q_ptr; 12522 12523 #define rptr ((uchar_t *)ipha) 12524 12525 ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); 12526 12527 /* u1 is # words of IP options */ 12528 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 12529 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 12530 12531 /* IP options present */ 12532 if (u1 > 0) { 12533 goto ipoptions; 12534 } else { 12535 /* Check the IP header checksum. */ 12536 if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { 12537 #define uph ((uint16_t *)ipha) 12538 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + 12539 uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; 12540 #undef uph 12541 /* finish doing IP checksum */ 12542 sum = (sum & 0xFFFF) + (sum >> 16); 12543 sum = ~(sum + (sum >> 16)) & 0xFFFF; 12544 /* 12545 * Don't verify header checksum if this packet 12546 * is coming back from AH/ESP as we already did it. 12547 */ 12548 if (!mctl_present && (sum != 0) && sum != 0xFFFF) { 12549 BUMP_MIB(&ip_mib, ipInCksumErrs); 12550 goto error; 12551 } 12552 } 12553 /* 12554 * Since there is no SCTP h/w cksum support yet, just 12555 * clear the flag. 
12556 */ 12557 DB_CKSUMFLAGS(mp) = 0; 12558 } 12559 12560 /* 12561 * Don't verify header checksum if this packet is coming 12562 * back from AH/ESP as we already did it. 12563 */ 12564 if (!mctl_present) { 12565 UPDATE_IB_PKT_COUNT(ire); 12566 ire->ire_last_used_time = lbolt; 12567 } 12568 12569 /* packet part of fragmented IP packet? */ 12570 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12571 if (u1 & (IPH_MF | IPH_OFFSET)) 12572 goto fragmented; 12573 12574 /* u1 = IP header length (20 bytes) */ 12575 u1 = IP_SIMPLE_HDR_LENGTH; 12576 12577 find_sctp_client: 12578 /* Pullup if we don't have the sctp common header. */ 12579 len = MBLKL(mp); 12580 if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { 12581 if (mp->b_cont == NULL || 12582 !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { 12583 BUMP_MIB(&ip_mib, ipInDiscards); 12584 goto error; 12585 } 12586 ipha = (ipha_t *)mp->b_rptr; 12587 len = MBLKL(mp); 12588 } 12589 12590 sctph = (sctp_hdr_t *)(rptr + u1); 12591 #ifdef DEBUG 12592 if (!skip_sctp_cksum) { 12593 #endif 12594 pktsum = sctph->sh_chksum; 12595 sctph->sh_chksum = 0; 12596 calcsum = sctp_cksum(mp, u1); 12597 if (calcsum != pktsum) { 12598 BUMP_MIB(&sctp_mib, sctpChecksumError); 12599 goto error; 12600 } 12601 sctph->sh_chksum = pktsum; 12602 #ifdef DEBUG /* skip_sctp_cksum */ 12603 } 12604 #endif 12605 /* get the ports */ 12606 ports = *(uint32_t *)&sctph->sh_sport; 12607 12608 ipif_seqid = ire->ire_ipif->ipif_seqid; 12609 IRE_REFRELE(ire); 12610 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 12611 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 12612 if ((connp = sctp_fanout(&map_src, &map_dst, ports, ipif_seqid, zoneid, 12613 mp)) == NULL) { 12614 /* Check for raw socket or OOTB handling */ 12615 goto no_conn; 12616 } 12617 12618 /* Found a client; up it goes */ 12619 BUMP_MIB(&ip_mib, ipInDelivers); 12620 sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); 12621 return; 12622 12623 no_conn: 12624 ip_fanout_sctp_raw(first_mp, 
recv_ill, ipha, B_TRUE, 12625 ports, mctl_present, flags, B_TRUE, ipif_seqid, zoneid); 12626 return; 12627 12628 ipoptions: 12629 DB_CKSUMFLAGS(mp) = 0; 12630 if (!ip_options_cksum(q, first_mp, ipha, ire)) 12631 goto slow_done; 12632 12633 UPDATE_IB_PKT_COUNT(ire); 12634 ire->ire_last_used_time = lbolt; 12635 12636 u1 = ntohs(ipha->ipha_fragment_offset_and_flags); 12637 if (u1 & (IPH_MF | IPH_OFFSET)) { 12638 fragmented: 12639 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) 12640 goto slow_done; 12641 /* 12642 * Make sure that first_mp points back to mp as 12643 * the mp we came in with could have changed in 12644 * ip_rput_fragment(). 12645 */ 12646 ASSERT(!mctl_present); 12647 ipha = (ipha_t *)mp->b_rptr; 12648 first_mp = mp; 12649 } 12650 12651 /* Now we have a complete datagram, destined for this machine. */ 12652 u1 = IPH_HDR_LENGTH(ipha); 12653 goto find_sctp_client; 12654 #undef iphs 12655 #undef rptr 12656 12657 error: 12658 freemsg(first_mp); 12659 slow_done: 12660 IRE_REFRELE(ire); 12661 } 12662 12663 #define VER_BITS 0xF0 12664 #define VERSION_6 0x60 12665 12666 static boolean_t 12667 ip_rput_multimblk_ipoptions(queue_t *q, mblk_t *mp, ipha_t **iphapp, 12668 ipaddr_t *dstp) 12669 { 12670 uint_t opt_len; 12671 ipha_t *ipha; 12672 ssize_t len; 12673 uint_t pkt_len; 12674 12675 IP_STAT(ip_ipoptions); 12676 ipha = *iphapp; 12677 12678 #define rptr ((uchar_t *)ipha) 12679 /* Assume no IPv6 packets arrive over the IPv4 queue */ 12680 if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { 12681 BUMP_MIB(&ip_mib, ipInIPv6); 12682 freemsg(mp); 12683 return (B_FALSE); 12684 } 12685 12686 /* multiple mblk or too short */ 12687 pkt_len = ntohs(ipha->ipha_length); 12688 12689 /* Get the number of words of IP options in the IP header. */ 12690 opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 12691 if (opt_len) { 12692 /* IP Options present! Validate and process. 
*/ 12693 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { 12694 BUMP_MIB(&ip_mib, ipInHdrErrors); 12695 goto done; 12696 } 12697 /* 12698 * Recompute complete header length and make sure we 12699 * have access to all of it. 12700 */ 12701 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; 12702 if (len > (mp->b_wptr - rptr)) { 12703 if (len > pkt_len) { 12704 BUMP_MIB(&ip_mib, ipInHdrErrors); 12705 goto done; 12706 } 12707 if (!pullupmsg(mp, len)) { 12708 BUMP_MIB(&ip_mib, ipInDiscards); 12709 goto done; 12710 } 12711 ipha = (ipha_t *)mp->b_rptr; 12712 } 12713 /* 12714 * Go off to ip_rput_options which returns the next hop 12715 * destination address, which may have been affected 12716 * by source routing. 12717 */ 12718 IP_STAT(ip_opt); 12719 if (ip_rput_options(q, mp, ipha, dstp) == -1) { 12720 return (B_FALSE); 12721 } 12722 } 12723 *iphapp = ipha; 12724 return (B_TRUE); 12725 done: 12726 /* clear b_prev - used by ip_mroute_decap */ 12727 mp->b_prev = NULL; 12728 freemsg(mp); 12729 return (B_FALSE); 12730 #undef rptr 12731 } 12732 12733 /* 12734 * Deal with the fact that there is no ire for the destination. 12735 * The incoming ill (in_ill) is passed in to ip_newroute only 12736 * in the case of packets coming from mobile ip forward tunnel. 12737 * It must be null otherwise. 12738 */ 12739 static void 12740 ip_rput_noire(queue_t *q, ill_t *in_ill, mblk_t *mp, int ll_multicast, 12741 ipaddr_t dst) 12742 { 12743 ipha_t *ipha; 12744 ill_t *ill; 12745 12746 ipha = (ipha_t *)mp->b_rptr; 12747 ill = (ill_t *)q->q_ptr; 12748 12749 ASSERT(ill != NULL); 12750 /* 12751 * No IRE for this destination, so it can't be for us. 12752 * Unless we are forwarding, drop the packet. 12753 * We have to let source routed packets through 12754 * since we don't yet know if they are 'ping -l' 12755 * packets i.e. if they will go out over the 12756 * same interface as they came in on. 
12757 */ 12758 if (ll_multicast) { 12759 freemsg(mp); 12760 return; 12761 } 12762 if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha)) { 12763 BUMP_MIB(&ip_mib, ipForwProhibits); 12764 freemsg(mp); 12765 return; 12766 } 12767 12768 /* Check for Martian addresses */ 12769 if ((in_ill == NULL) && (ip_no_forward(ipha, ill))) { 12770 freemsg(mp); 12771 return; 12772 } 12773 12774 /* Mark this packet as having originated externally */ 12775 mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; 12776 12777 /* 12778 * Clear the indication that this may have a hardware checksum 12779 * as we are not using it 12780 */ 12781 DB_CKSUMFLAGS(mp) = 0; 12782 12783 /* 12784 * Now hand the packet to ip_newroute. 12785 */ 12786 ip_newroute(q, mp, dst, in_ill, NULL); 12787 } 12788 12789 /* 12790 * check ip header length and align it. 12791 */ 12792 static boolean_t 12793 ip_check_and_align_header(queue_t *q, mblk_t *mp) 12794 { 12795 ssize_t len; 12796 ill_t *ill; 12797 ipha_t *ipha; 12798 12799 len = MBLKL(mp); 12800 12801 if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { 12802 if (!OK_32PTR(mp->b_rptr)) 12803 IP_STAT(ip_notaligned1); 12804 else 12805 IP_STAT(ip_notaligned2); 12806 /* Guard against bogus device drivers */ 12807 if (len < 0) { 12808 /* clear b_prev - used by ip_mroute_decap */ 12809 mp->b_prev = NULL; 12810 BUMP_MIB(&ip_mib, ipInHdrErrors); 12811 freemsg(mp); 12812 return (B_FALSE); 12813 } 12814 12815 if (ip_rput_pullups++ == 0) { 12816 ill = (ill_t *)q->q_ptr; 12817 ipha = (ipha_t *)mp->b_rptr; 12818 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 12819 "ip_check_and_align_header: %s forced us to " 12820 " pullup pkt, hdr len %ld, hdr addr %p", 12821 ill->ill_name, len, ipha); 12822 } 12823 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 12824 /* clear b_prev - used by ip_mroute_decap */ 12825 mp->b_prev = NULL; 12826 BUMP_MIB(&ip_mib, ipInDiscards); 12827 freemsg(mp); 12828 return (B_FALSE); 12829 } 12830 } 12831 return (B_TRUE); 12832 } 
12833 12834 static boolean_t 12835 ip_rput_notforus(queue_t **qp, mblk_t *mp, ire_t *ire, ill_t *ill) 12836 { 12837 ill_group_t *ill_group; 12838 ill_group_t *ire_group; 12839 queue_t *q; 12840 ill_t *ire_ill; 12841 uint_t ill_ifindex; 12842 12843 q = *qp; 12844 /* 12845 * We need to check to make sure the packet came in 12846 * on the queue associated with the destination IRE. 12847 * Note that for multicast packets and broadcast packets sent to 12848 * a broadcast address which is shared between multiple interfaces 12849 * we should not do this since we just got a random broadcast ire. 12850 */ 12851 if (ire->ire_rfq && ire->ire_type != IRE_BROADCAST) { 12852 boolean_t check_multi = B_TRUE; 12853 12854 /* 12855 * This packet came in on an interface other than the 12856 * one associated with the destination address. 12857 * "Gateway" it to the appropriate interface here. 12858 * As long as the ills belong to the same group, 12859 * we don't consider them to arriving on the wrong 12860 * interface. Thus, when the switch is doing inbound 12861 * load spreading, we won't drop packets when we 12862 * are doing strict multihoming checks. Note, the 12863 * same holds true for 'usesrc groups' where the 12864 * destination address may belong to another interface 12865 * to allow multipathing to happen 12866 */ 12867 ill_group = ill->ill_group; 12868 ire_ill = (ill_t *)(ire->ire_rfq)->q_ptr; 12869 ill_ifindex = ill->ill_usesrc_ifindex; 12870 ire_group = ire_ill->ill_group; 12871 12872 /* 12873 * If it's part of the same IPMP group, or if it's a legal 12874 * address on the 'usesrc' interface, then bypass strict 12875 * checks. 
12876 */ 12877 if (ill_group != NULL && ill_group == ire_group) { 12878 check_multi = B_FALSE; 12879 } else if (ill_ifindex != 0 && 12880 ill_ifindex == ire_ill->ill_phyint->phyint_ifindex) { 12881 check_multi = B_FALSE; 12882 } 12883 12884 if (check_multi && 12885 ip_strict_dst_multihoming && 12886 ((ill->ill_flags & 12887 ire->ire_ipif->ipif_ill->ill_flags & 12888 ILLF_ROUTER) == 0)) { 12889 /* Drop packet */ 12890 BUMP_MIB(&ip_mib, ipForwProhibits); 12891 freemsg(mp); 12892 ire_refrele(ire); 12893 return (B_TRUE); 12894 } 12895 12896 /* 12897 * Change the queue (for non-virtual destination network 12898 * interfaces) and ip_rput_local will be called with the right 12899 * queue 12900 */ 12901 q = ire->ire_rfq; 12902 } 12903 /* Must be broadcast. We'll take it. */ 12904 *qp = q; 12905 return (B_FALSE); 12906 } 12907 12908 static void 12909 ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, 12910 ill_t *ill, int ll_multicast) 12911 { 12912 ill_group_t *ill_group; 12913 ill_group_t *ire_group; 12914 queue_t *dev_q; 12915 12916 ASSERT(ire->ire_stq != NULL); 12917 if (ll_multicast != 0) 12918 goto drop_pkt; 12919 12920 if (ip_no_forward(ipha, ill)) 12921 goto drop_pkt; 12922 12923 ill_group = ill->ill_group; 12924 ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; 12925 /* 12926 * Check if we want to forward this one at this time. 12927 * We allow source routed packets on a host provided that 12928 * they go out the same interface or same interface group 12929 * as they came in on. 12930 * 12931 * XXX To be quicker, we may wish to not chase pointers to 12932 * get the ILLF_ROUTER flag and instead store the 12933 * forwarding policy in the ire. An unfortunate 12934 * side-effect of that would be requiring an ire flush 12935 * whenever the ILLF_ROUTER flag changes. 
12936 */ 12937 if (((ill->ill_flags & 12938 ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & 12939 ILLF_ROUTER) == 0) && 12940 !(ip_source_routed(ipha) && (ire->ire_rfq == q || 12941 (ill_group != NULL && ill_group == ire_group)))) { 12942 BUMP_MIB(&ip_mib, ipForwProhibits); 12943 if (ip_source_routed(ipha)) { 12944 q = WR(q); 12945 /* 12946 * Clear the indication that this may have 12947 * hardware checksum as we are not using it. 12948 */ 12949 DB_CKSUMFLAGS(mp) = 0; 12950 icmp_unreachable(q, mp, 12951 ICMP_SOURCE_ROUTE_FAILED); 12952 ire_refrele(ire); 12953 return; 12954 } 12955 goto drop_pkt; 12956 } 12957 12958 /* Packet is being forwarded. Turning off hwcksum flag. */ 12959 DB_CKSUMFLAGS(mp) = 0; 12960 if (ip_g_send_redirects) { 12961 /* 12962 * Check whether the incoming interface and outgoing 12963 * interface is part of the same group. If so, 12964 * send redirects. 12965 * 12966 * Check the source address to see if it originated 12967 * on the same logical subnet it is going back out on. 12968 * If so, we should be able to send it a redirect. 12969 * Avoid sending a redirect if the destination 12970 * is directly connected (gw_addr == 0), 12971 * or if the packet was source routed out this 12972 * interface. 12973 */ 12974 ipaddr_t src; 12975 mblk_t *mp1; 12976 ire_t *src_ire = NULL; 12977 12978 /* 12979 * Check whether ire_rfq and q are from the same ill 12980 * or if they are not same, they at least belong 12981 * to the same group. If so, send redirects. 12982 */ 12983 if ((ire->ire_rfq == q || 12984 (ill_group != NULL && ill_group == ire_group)) && 12985 (ire->ire_gateway_addr != 0) && 12986 !ip_source_routed(ipha)) { 12987 12988 src = ipha->ipha_src; 12989 src_ire = ire_ftable_lookup(src, 0, 0, 12990 IRE_INTERFACE, ire->ire_ipif, NULL, ALL_ZONES, 12991 0, NULL, MATCH_IRE_IPIF | MATCH_IRE_TYPE); 12992 12993 if (src_ire != NULL) { 12994 /* 12995 * The source is directly connected. 
12996 * Just copy the ip header (which is 12997 * in the first mblk) 12998 */ 12999 mp1 = copyb(mp); 13000 if (mp1 != NULL) { 13001 icmp_send_redirect(WR(q), mp1, 13002 ire->ire_gateway_addr); 13003 } 13004 ire_refrele(src_ire); 13005 } 13006 } 13007 } 13008 13009 dev_q = ire->ire_stq->q_next; 13010 if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) { 13011 BUMP_MIB(&ip_mib, ipInDiscards); 13012 freemsg(mp); 13013 ire_refrele(ire); 13014 return; 13015 } 13016 13017 ip_rput_forward(ire, ipha, mp, ill); 13018 IRE_REFRELE(ire); 13019 return; 13020 13021 drop_pkt: 13022 ire_refrele(ire); 13023 ip2dbg(("ip_rput_forward: drop pkt\n")); 13024 freemsg(mp); 13025 } 13026 13027 static boolean_t 13028 ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha, 13029 ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast) 13030 { 13031 queue_t *q; 13032 ire_t *ire; 13033 uint16_t hcksumflags; 13034 13035 q = *qp; 13036 ire = *irep; 13037 13038 /* 13039 * Clear the indication that this may have hardware 13040 * checksum as we are not using it for forwarding. 13041 */ 13042 hcksumflags = DB_CKSUMFLAGS(mp); 13043 DB_CKSUMFLAGS(mp) = 0; 13044 13045 /* 13046 * Directed broadcast forwarding: if the packet came in over a 13047 * different interface then it is routed out over we can forward it. 13048 */ 13049 if (ipha->ipha_protocol == IPPROTO_TCP) { 13050 ire_refrele(ire); 13051 freemsg(mp); 13052 BUMP_MIB(&ip_mib, ipInDiscards); 13053 return (B_TRUE); 13054 } 13055 /* 13056 * For multicast we have set dst to be INADDR_BROADCAST 13057 * for delivering to all STREAMS. IRE_MARK_NORECV is really 13058 * only for broadcast packets. 13059 */ 13060 if (!CLASSD(ipha->ipha_dst)) { 13061 ire_t *new_ire; 13062 ipif_t *ipif; 13063 /* 13064 * For ill groups, as the switch duplicates broadcasts 13065 * across all the ports, we need to filter out and 13066 * send up only one copy. There is one copy for every 13067 * broadcast address on each ill. 
Thus, we look for a 13068 * specific IRE on this ill and look at IRE_MARK_NORECV 13069 * later to see whether this ill is eligible to receive 13070 * them or not. ill_nominate_bcast_rcv() nominates only 13071 * one set of IREs for receiving. 13072 */ 13073 13074 ipif = ipif_get_next_ipif(NULL, ill); 13075 if (ipif == NULL) { 13076 ire_refrele(ire); 13077 freemsg(mp); 13078 BUMP_MIB(&ip_mib, ipInDiscards); 13079 return (B_TRUE); 13080 } 13081 new_ire = ire_ctable_lookup(dst, 0, 0, 13082 ipif, ALL_ZONES, NULL, MATCH_IRE_ILL); 13083 ipif_refrele(ipif); 13084 13085 if (new_ire != NULL) { 13086 if (new_ire->ire_marks & IRE_MARK_NORECV) { 13087 ire_refrele(ire); 13088 ire_refrele(new_ire); 13089 freemsg(mp); 13090 BUMP_MIB(&ip_mib, ipInDiscards); 13091 return (B_TRUE); 13092 } 13093 /* 13094 * In the special case of multirouted broadcast 13095 * packets, we unconditionally need to "gateway" 13096 * them to the appropriate interface here. 13097 * In the normal case, this cannot happen, because 13098 * there is no broadcast IRE tagged with the 13099 * RTF_MULTIRT flag. 13100 */ 13101 if (new_ire->ire_flags & RTF_MULTIRT) { 13102 ire_refrele(new_ire); 13103 if (ire->ire_rfq != NULL) { 13104 q = ire->ire_rfq; 13105 *qp = q; 13106 } 13107 } else { 13108 ire_refrele(ire); 13109 ire = new_ire; 13110 } 13111 } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { 13112 if (!ip_g_forward_directed_bcast) { 13113 /* 13114 * Free the message if 13115 * ip_g_forward_directed_bcast is turned 13116 * off for non-local broadcast. 13117 */ 13118 ire_refrele(ire); 13119 freemsg(mp); 13120 BUMP_MIB(&ip_mib, ipInDiscards); 13121 return (B_TRUE); 13122 } 13123 } else { 13124 /* 13125 * This CGTP packet successfully passed the 13126 * CGTP filter, but the related CGTP 13127 * broadcast IRE has not been found, 13128 * meaning that the redundant ipif is 13129 * probably down. 
However, if we discarded 13130 * this packet, its duplicate would be 13131 * filtered out by the CGTP filter so none 13132 * of them would get through. So we keep 13133 * going with this one. 13134 */ 13135 ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); 13136 if (ire->ire_rfq != NULL) { 13137 q = ire->ire_rfq; 13138 *qp = q; 13139 } 13140 } 13141 } 13142 if (ip_g_forward_directed_bcast && ll_multicast == 0) { 13143 /* 13144 * Verify that there are not more then one 13145 * IRE_BROADCAST with this broadcast address which 13146 * has ire_stq set. 13147 * TODO: simplify, loop over all IRE's 13148 */ 13149 ire_t *ire1; 13150 int num_stq = 0; 13151 mblk_t *mp1; 13152 13153 /* Find the first one with ire_stq set */ 13154 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 13155 for (ire1 = ire; ire1 && 13156 !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; 13157 ire1 = ire1->ire_next) 13158 ; 13159 if (ire1) { 13160 ire_refrele(ire); 13161 ire = ire1; 13162 IRE_REFHOLD(ire); 13163 } 13164 13165 /* Check if there are additional ones with stq set */ 13166 for (ire1 = ire; ire1; ire1 = ire1->ire_next) { 13167 if (ire->ire_addr != ire1->ire_addr) 13168 break; 13169 if (ire1->ire_stq) { 13170 num_stq++; 13171 break; 13172 } 13173 } 13174 rw_exit(&ire->ire_bucket->irb_lock); 13175 if (num_stq == 1 && ire->ire_stq != NULL) { 13176 ip1dbg(("ip_rput_process_broadcast: directed " 13177 "broadcast to 0x%x\n", 13178 ntohl(ire->ire_addr))); 13179 mp1 = copymsg(mp); 13180 if (mp1) { 13181 switch (ipha->ipha_protocol) { 13182 case IPPROTO_UDP: 13183 ip_udp_input(q, mp1, ipha, ire, ill); 13184 break; 13185 default: 13186 ip_proto_input(q, mp1, ipha, ire, ill); 13187 break; 13188 } 13189 } 13190 /* 13191 * Adjust ttl to 2 (1+1 - the forward engine 13192 * will decrement it by one. 
13193 */ 13194 if (ip_csum_hdr(ipha)) { 13195 BUMP_MIB(&ip_mib, ipInCksumErrs); 13196 ip2dbg(("ip_rput_broadcast:drop pkt\n")); 13197 freemsg(mp); 13198 ire_refrele(ire); 13199 return (B_TRUE); 13200 } 13201 ipha->ipha_ttl = ip_broadcast_ttl + 1; 13202 ipha->ipha_hdr_checksum = 0; 13203 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 13204 ip_rput_process_forward(q, mp, ire, ipha, 13205 ill, ll_multicast); 13206 return (B_TRUE); 13207 } 13208 ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", 13209 ntohl(ire->ire_addr))); 13210 } 13211 13212 *irep = ire; 13213 13214 /* Restore any hardware checksum flags */ 13215 DB_CKSUMFLAGS(mp) = hcksumflags; 13216 return (B_FALSE); 13217 } 13218 13219 /* ARGSUSED */ 13220 static boolean_t 13221 ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, 13222 int *ll_multicast, ipaddr_t *dstp) 13223 { 13224 /* 13225 * Forward packets only if we have joined the allmulti 13226 * group on this interface. 13227 */ 13228 if (ip_g_mrouter && ill->ill_join_allmulti) { 13229 int retval; 13230 13231 /* 13232 * Clear the indication that this may have hardware 13233 * checksum as we are not using it. 13234 */ 13235 DB_CKSUMFLAGS(mp) = 0; 13236 retval = ip_mforward(ill, ipha, mp); 13237 /* ip_mforward updates mib variables if needed */ 13238 /* clear b_prev - used by ip_mroute_decap */ 13239 mp->b_prev = NULL; 13240 13241 switch (retval) { 13242 case 0: 13243 /* 13244 * pkt is okay and arrived on phyint. 13245 * 13246 * If we are running as a multicast router 13247 * we need to see all IGMP and/or PIM packets. 13248 */ 13249 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 13250 (ipha->ipha_protocol == IPPROTO_PIM)) { 13251 goto done; 13252 } 13253 break; 13254 case -1: 13255 /* pkt is mal-formed, toss it */ 13256 goto drop_pkt; 13257 case 1: 13258 /* pkt is okay and arrived on a tunnel */ 13259 /* 13260 * If we are running a multicast router 13261 * we need to see all igmp packets. 
13262 */ 13263 if (ipha->ipha_protocol == IPPROTO_IGMP) { 13264 *dstp = INADDR_BROADCAST; 13265 *ll_multicast = 1; 13266 return (B_FALSE); 13267 } 13268 13269 goto drop_pkt; 13270 } 13271 } 13272 13273 ILM_WALKER_HOLD(ill); 13274 if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { 13275 /* 13276 * This might just be caused by the fact that 13277 * multiple IP Multicast addresses map to the same 13278 * link layer multicast - no need to increment counter! 13279 */ 13280 ILM_WALKER_RELE(ill); 13281 freemsg(mp); 13282 return (B_TRUE); 13283 } 13284 ILM_WALKER_RELE(ill); 13285 done: 13286 ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); 13287 /* 13288 * This assumes the we deliver to all streams for multicast 13289 * and broadcast packets. 13290 */ 13291 *dstp = INADDR_BROADCAST; 13292 *ll_multicast = 1; 13293 return (B_FALSE); 13294 drop_pkt: 13295 ip2dbg(("ip_rput: drop pkt\n")); 13296 freemsg(mp); 13297 return (B_TRUE); 13298 } 13299 13300 static boolean_t 13301 ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, 13302 int *ll_multicast, mblk_t **mpp) 13303 { 13304 mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; 13305 boolean_t must_copy = B_FALSE; 13306 struct iocblk *iocp; 13307 ipha_t *ipha; 13308 13309 #define rptr ((uchar_t *)ipha) 13310 13311 first_mp = *first_mpp; 13312 mp = *mpp; 13313 13314 ASSERT(first_mp == mp); 13315 13316 /* 13317 * if db_ref > 1 then copymsg and free original. Packet may be 13318 * changed and do not want other entity who has a reference to this 13319 * message to trip over the changes. This is a blind change because 13320 * trying to catch all places that might change packet is too 13321 * difficult (since it may be a module above this one) 13322 * 13323 * This corresponds to the non-fast path case. We walk down the full 13324 * chain in this case, and check the db_ref count of all the dblks, 13325 * and do a copymsg if required. 
It is possible that the db_ref counts 13326 * of the data blocks in the mblk chain can be different. 13327 * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref 13328 * count of 1, followed by a M_DATA block with a ref count of 2, if 13329 * 'snoop' is running. 13330 */ 13331 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 13332 if (mp1->b_datap->db_ref > 1) { 13333 must_copy = B_TRUE; 13334 break; 13335 } 13336 } 13337 13338 if (must_copy) { 13339 mp1 = copymsg(mp); 13340 if (mp1 == NULL) { 13341 for (mp1 = mp; mp1 != NULL; 13342 mp1 = mp1->b_cont) { 13343 mp1->b_next = NULL; 13344 mp1->b_prev = NULL; 13345 } 13346 freemsg(mp); 13347 BUMP_MIB(&ip_mib, ipInDiscards); 13348 return (B_TRUE); 13349 } 13350 for (from_mp = mp, to_mp = mp1; from_mp != NULL; 13351 from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { 13352 /* Copy b_next - used in M_BREAK messages */ 13353 to_mp->b_next = from_mp->b_next; 13354 from_mp->b_next = NULL; 13355 /* Copy b_prev - used by ip_mroute_decap */ 13356 to_mp->b_prev = from_mp->b_prev; 13357 from_mp->b_prev = NULL; 13358 } 13359 *first_mpp = first_mp = mp1; 13360 freemsg(mp); 13361 mp = mp1; 13362 *mpp = mp1; 13363 } 13364 13365 ipha = (ipha_t *)mp->b_rptr; 13366 13367 /* 13368 * previous code has a case for M_DATA. 13369 * We want to check how that happens. 13370 */ 13371 ASSERT(first_mp->b_datap->db_type != M_DATA); 13372 switch (first_mp->b_datap->db_type) { 13373 case M_PROTO: 13374 case M_PCPROTO: 13375 if (((dl_unitdata_ind_t *)rptr)->dl_primitive != 13376 DL_UNITDATA_IND) { 13377 /* Go handle anything other than data elsewhere. */ 13378 ip_rput_dlpi(q, mp); 13379 return (B_TRUE); 13380 } 13381 *ll_multicast = ((dl_unitdata_ind_t *)rptr)->dl_group_address; 13382 /* Ditch the DLPI header. 
*/ 13383 mp1 = mp->b_cont; 13384 ASSERT(first_mp == mp); 13385 *first_mpp = mp1; 13386 freeb(mp); 13387 *mpp = mp1; 13388 return (B_FALSE); 13389 case M_BREAK: 13390 /* 13391 * A packet arrives as M_BREAK following a cycle through 13392 * ip_rput, ip_newroute, ... and finally ire_add_then_send. 13393 * This is an IP datagram sans lower level header. 13394 * M_BREAK are also used to pass back in multicast packets 13395 * that are encapsulated with a source route. 13396 */ 13397 /* Ditch the M_BREAK mblk */ 13398 mp1 = mp->b_cont; 13399 ASSERT(first_mp == mp); 13400 *first_mpp = mp1; 13401 freeb(mp); 13402 mp = mp1; 13403 mp->b_next = NULL; 13404 *mpp = mp; 13405 *ll_multicast = 0; 13406 return (B_FALSE); 13407 case M_IOCACK: 13408 ip1dbg(("got iocack ")); 13409 iocp = (struct iocblk *)mp->b_rptr; 13410 switch (iocp->ioc_cmd) { 13411 case DL_IOC_HDR_INFO: 13412 ill = (ill_t *)q->q_ptr; 13413 ill_fastpath_ack(ill, mp); 13414 return (B_TRUE); 13415 case SIOCSTUNPARAM: 13416 case OSIOCSTUNPARAM: 13417 /* Go through qwriter_ip */ 13418 break; 13419 case SIOCGTUNPARAM: 13420 case OSIOCGTUNPARAM: 13421 ip_rput_other(NULL, q, mp, NULL); 13422 return (B_TRUE); 13423 default: 13424 putnext(q, mp); 13425 return (B_TRUE); 13426 } 13427 /* FALLTHRU */ 13428 case M_ERROR: 13429 case M_HANGUP: 13430 /* 13431 * Since this is on the ill stream we unconditionally 13432 * bump up the refcount 13433 */ 13434 ill_refhold(ill); 13435 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_other, CUR_OP, 13436 B_FALSE); 13437 return (B_TRUE); 13438 case M_CTL: 13439 if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && 13440 (((da_ipsec_t *)first_mp->b_rptr)->da_type == 13441 IPHADA_M_CTL)) { 13442 /* 13443 * It's an IPsec accelerated packet. 13444 * Make sure that the ill from which we received the 13445 * packet has enabled IPsec hardware acceleration. 
13446 */ 13447 if (!(ill->ill_capabilities & 13448 (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { 13449 /* IPsec kstats: bean counter */ 13450 freemsg(mp); 13451 return (B_TRUE); 13452 } 13453 13454 /* 13455 * Make mp point to the mblk following the M_CTL, 13456 * then process according to type of mp. 13457 * After this processing, first_mp will point to 13458 * the data-attributes and mp to the pkt following 13459 * the M_CTL. 13460 */ 13461 mp = first_mp->b_cont; 13462 if (mp == NULL) { 13463 freemsg(first_mp); 13464 return (B_TRUE); 13465 } 13466 /* 13467 * A Hardware Accelerated packet can only be M_DATA 13468 * ESP or AH packet. 13469 */ 13470 if (mp->b_datap->db_type != M_DATA) { 13471 /* non-M_DATA IPsec accelerated packet */ 13472 IPSECHW_DEBUG(IPSECHW_PKT, 13473 ("non-M_DATA IPsec accelerated pkt\n")); 13474 freemsg(first_mp); 13475 return (B_TRUE); 13476 } 13477 ipha = (ipha_t *)mp->b_rptr; 13478 if (ipha->ipha_protocol != IPPROTO_AH && 13479 ipha->ipha_protocol != IPPROTO_ESP) { 13480 IPSECHW_DEBUG(IPSECHW_PKT, 13481 ("non-M_DATA IPsec accelerated pkt\n")); 13482 freemsg(first_mp); 13483 return (B_TRUE); 13484 } 13485 *mpp = mp; 13486 return (B_FALSE); 13487 } 13488 putnext(q, mp); 13489 return (B_TRUE); 13490 case M_FLUSH: 13491 if (*mp->b_rptr & FLUSHW) { 13492 *mp->b_rptr &= ~FLUSHR; 13493 qreply(q, mp); 13494 return (B_TRUE); 13495 } 13496 freemsg(mp); 13497 return (B_TRUE); 13498 case M_IOCNAK: 13499 ip1dbg(("got iocnak ")); 13500 iocp = (struct iocblk *)mp->b_rptr; 13501 switch (iocp->ioc_cmd) { 13502 case DL_IOC_HDR_INFO: 13503 case SIOCSTUNPARAM: 13504 case OSIOCSTUNPARAM: 13505 /* 13506 * Since this is on the ill stream we unconditionally 13507 * bump up the refcount 13508 */ 13509 ill_refhold(ill); 13510 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_other, 13511 CUR_OP, B_FALSE); 13512 return (B_TRUE); 13513 case SIOCGTUNPARAM: 13514 case OSIOCGTUNPARAM: 13515 ip_rput_other(NULL, q, mp, NULL); 13516 return (B_TRUE); 13517 default: 13518 break; 13519 } 13520 /* 
FALLTHRU */ 13521 default: 13522 putnext(q, mp); 13523 return (B_TRUE); 13524 } 13525 } 13526 13527 /* Read side put procedure. Packets coming from the wire arrive here. */ 13528 void 13529 ip_rput(queue_t *q, mblk_t *mp) 13530 { 13531 ill_t *ill; 13532 13533 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); 13534 13535 ill = (ill_t *)q->q_ptr; 13536 13537 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { 13538 union DL_primitives *dl; 13539 13540 /* 13541 * Things are opening or closing. Only accept DLPI control 13542 * messages. In the open case, the ill->ill_ipif has not yet 13543 * been created. In the close case, things hanging off the 13544 * ill could have been freed already. In either case it 13545 * may not be safe to proceed further. 13546 */ 13547 13548 dl = (union DL_primitives *)mp->b_rptr; 13549 if ((mp->b_datap->db_type != M_PCPROTO) || 13550 (dl->dl_primitive == DL_UNITDATA_IND)) { 13551 /* 13552 * Also SIOC[GS]TUN* ioctls can come here. 13553 */ 13554 inet_freemsg(mp); 13555 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 13556 "ip_input_end: q %p (%S)", q, "uninit"); 13557 return; 13558 } 13559 } 13560 13561 /* 13562 * if db_ref > 1 then copymsg and free original. Packet may be 13563 * changed and we do not want the other entity who has a reference to 13564 * this message to trip over the changes. This is a blind change because 13565 * trying to catch all places that might change the packet is too 13566 * difficult. 13567 * 13568 * This corresponds to the fast path case, where we have a chain of 13569 * M_DATA mblks. We check the db_ref count of only the 1st data block 13570 * in the mblk chain. There doesn't seem to be a reason why a device 13571 * driver would send up data with varying db_ref counts in the mblk 13572 * chain. In any case the Fast path is a private interface, and our 13573 * drivers don't do such a thing. 
Given the above assumption, there is 13574 * no need to walk down the entire mblk chain (which could have a 13575 * potential performance problem) 13576 */ 13577 if (mp->b_datap->db_ref > 1) { 13578 mblk_t *mp1; 13579 boolean_t adjusted = B_FALSE; 13580 IP_STAT(ip_db_ref); 13581 13582 /* 13583 * The IP_RECVSLLA option depends on having the link layer 13584 * header. First check that: 13585 * a> the underlying device is of type ether, since this 13586 * option is currently supported only over ethernet. 13587 * b> there is enough room to copy over the link layer header. 13588 * 13589 * Once the checks are done, adjust rptr so that the link layer 13590 * header will be copied via copymsg. Note that, IFT_ETHER may 13591 * be returned by some non-ethernet drivers but in this case the 13592 * second check will fail. 13593 */ 13594 if (ill->ill_type == IFT_ETHER && 13595 (mp->b_rptr - mp->b_datap->db_base) >= 13596 sizeof (struct ether_header)) { 13597 mp->b_rptr -= sizeof (struct ether_header); 13598 adjusted = B_TRUE; 13599 } 13600 mp1 = copymsg(mp); 13601 if (mp1 == NULL) { 13602 /* Clear b_next - used in M_BREAK messages */ 13603 mp->b_next = NULL; 13604 /* clear b_prev - used by ip_mroute_decap */ 13605 mp->b_prev = NULL; 13606 freemsg(mp); 13607 BUMP_MIB(&ip_mib, ipInDiscards); 13608 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 13609 "ip_rput_end: q %p (%S)", q, "copymsg"); 13610 return; 13611 } 13612 if (adjusted) { 13613 /* 13614 * Copy is done. 
Restore the pointer in the _new_ mblk 13615 */ 13616 mp1->b_rptr += sizeof (struct ether_header); 13617 } 13618 /* Copy b_next - used in M_BREAK messages */ 13619 mp1->b_next = mp->b_next; 13620 mp->b_next = NULL; 13621 /* Copy b_prev - used by ip_mroute_decap */ 13622 mp1->b_prev = mp->b_prev; 13623 mp->b_prev = NULL; 13624 freemsg(mp); 13625 mp = mp1; 13626 } 13627 13628 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 13629 "ip_rput_end: q %p (%S)", q, "end"); 13630 13631 ip_input(ill, NULL, mp, 0); 13632 } 13633 13634 /* 13635 * Direct read side procedure capable of dealing with chains. GLDv3 based 13636 * drivers call this function directly with mblk chains while STREAMS 13637 * read side procedure ip_rput() calls this for single packet with ip_ring 13638 * set to NULL to process one packet at a time. 13639 * 13640 * The ill will always be valid if this function is called directly from 13641 * the driver. 13642 */ 13643 /*ARGSUSED*/ 13644 void 13645 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, size_t hdrlen) 13646 { 13647 ipaddr_t dst; 13648 ire_t *ire; 13649 ipha_t *ipha; 13650 uint_t pkt_len; 13651 ssize_t len; 13652 uint_t opt_len; 13653 int ll_multicast; 13654 int cgtp_flt_pkt; 13655 queue_t *q = ill->ill_rq; 13656 squeue_t *curr_sqp = NULL; 13657 mblk_t *head = NULL; 13658 mblk_t *tail = NULL; 13659 mblk_t *first_mp; 13660 mblk_t *mp; 13661 int cnt = 0; 13662 13663 ASSERT(mp_chain != NULL); 13664 ASSERT(ill != NULL); 13665 13666 TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); 13667 13668 #define rptr ((uchar_t *)ipha) 13669 13670 while (mp_chain != NULL) { 13671 first_mp = mp = mp_chain; 13672 mp_chain = mp_chain->b_next; 13673 mp->b_next = NULL; 13674 ll_multicast = 0; 13675 ire = NULL; 13676 13677 /* 13678 * ip_input fast path 13679 */ 13680 13681 /* mblk type is not M_DATA */ 13682 if (mp->b_datap->db_type != M_DATA) { 13683 if (ip_rput_process_notdata(q, &first_mp, ill, 13684 &ll_multicast, &mp)) 13685 continue; 13686 } 13687 
13688 ASSERT(mp->b_datap->db_type == M_DATA); 13689 ASSERT(mp->b_datap->db_ref == 1); 13690 13691 13692 ipha = (ipha_t *)mp->b_rptr; 13693 len = mp->b_wptr - rptr; 13694 13695 BUMP_MIB(&ip_mib, ipInReceives); 13696 13697 /* 13698 * IP header ptr not aligned? 13699 * OR IP header not complete in first mblk 13700 */ 13701 if (!OK_32PTR(rptr) || len < IP_SIMPLE_HDR_LENGTH) { 13702 if (!ip_check_and_align_header(q, mp)) 13703 continue; 13704 ipha = (ipha_t *)mp->b_rptr; 13705 len = mp->b_wptr - rptr; 13706 } 13707 13708 /* multiple mblk or too short */ 13709 pkt_len = ntohs(ipha->ipha_length); 13710 len -= pkt_len; 13711 if (len != 0) { 13712 /* 13713 * Make sure we have data length consistent 13714 * with the IP header. 13715 */ 13716 if (mp->b_cont == NULL) { 13717 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 13718 BUMP_MIB(&ip_mib, ipInHdrErrors); 13719 ip2dbg(("ip_input: drop pkt\n")); 13720 freemsg(mp); 13721 continue; 13722 } 13723 mp->b_wptr = rptr + pkt_len; 13724 } else if (len += msgdsize(mp->b_cont)) { 13725 if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { 13726 BUMP_MIB(&ip_mib, ipInHdrErrors); 13727 ip2dbg(("ip_input: drop pkt\n")); 13728 freemsg(mp); 13729 continue; 13730 } 13731 (void) adjmsg(mp, -len); 13732 IP_STAT(ip_multimblk3); 13733 } 13734 } 13735 13736 if (ip_loopback_src_or_dst(ipha, ill)) { 13737 ip2dbg(("ip_input: drop pkt\n")); 13738 freemsg(mp); 13739 continue; 13740 } 13741 13742 /* 13743 * Attach any necessary label information to this packet. 
13744 */ 13745 if (is_system_labeled() && 13746 !tsol_get_pkt_label(mp, IPV4_VERSION)) { 13747 BUMP_MIB(&ip_mib, ipInDiscards); 13748 freemsg(mp); 13749 continue; 13750 } 13751 13752 opt_len = ipha->ipha_version_and_hdr_length - 13753 IP_SIMPLE_HDR_VERSION; 13754 /* IP version bad or there are IP options */ 13755 if (opt_len) { 13756 if (len != 0) 13757 IP_STAT(ip_multimblk4); 13758 else 13759 IP_STAT(ip_ipoptions); 13760 if (!ip_rput_multimblk_ipoptions(q, mp, &ipha, &dst)) 13761 continue; 13762 } else { 13763 dst = ipha->ipha_dst; 13764 } 13765 13766 /* 13767 * Invoke the CGTP (multirouting) filtering module to process 13768 * the incoming packet. Packets identified as duplicates 13769 * must be discarded. Filtering is active only if the 13770 * the ip_cgtp_filter ndd variable is non-zero. 13771 */ 13772 cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 13773 if (ip_cgtp_filter && (ip_cgtp_filter_ops != NULL)) { 13774 cgtp_flt_pkt = 13775 ip_cgtp_filter_ops->cfo_filter(q, mp); 13776 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 13777 freemsg(first_mp); 13778 continue; 13779 } 13780 } 13781 13782 /* 13783 * If rsvpd is running, let RSVP daemon handle its processing 13784 * and forwarding of RSVP multicast/unicast packets. 13785 * If rsvpd is not running but mrouted is running, RSVP 13786 * multicast packets are forwarded as multicast traffic 13787 * and RSVP unicast packets are forwarded by unicast router. 13788 * If neither rsvpd nor mrouted is running, RSVP multicast 13789 * packets are not forwarded, but the unicast packets are 13790 * forwarded like unicast traffic. 13791 */ 13792 if (ipha->ipha_protocol == IPPROTO_RSVP && 13793 ipcl_proto_search(IPPROTO_RSVP) != NULL) { 13794 /* RSVP packet and rsvpd running. Treat as ours */ 13795 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); 13796 /* 13797 * This assumes that we deliver to all streams for 13798 * multicast and broadcast packets. 
13799 * We have to force ll_multicast to 1 to handle the 13800 * M_DATA messages passed in from ip_mroute_decap. 13801 */ 13802 dst = INADDR_BROADCAST; 13803 ll_multicast = 1; 13804 } else if (CLASSD(dst)) { 13805 /* packet is multicast */ 13806 mp->b_next = NULL; 13807 if (ip_rput_process_multicast(q, mp, ill, ipha, 13808 &ll_multicast, &dst)) 13809 continue; 13810 } 13811 13812 13813 /* 13814 * Check if the packet is coming from the Mobile IP 13815 * forward tunnel interface 13816 */ 13817 if (ill->ill_srcif_refcnt > 0) { 13818 ire = ire_srcif_table_lookup(dst, IRE_INTERFACE, 13819 NULL, ill, MATCH_IRE_TYPE); 13820 if (ire != NULL && ire->ire_dlureq_mp == NULL && 13821 ire->ire_ipif->ipif_net_type == 13822 IRE_IF_RESOLVER) { 13823 /* We need to resolve the link layer info */ 13824 ire_refrele(ire); 13825 ip_rput_noire(q, (ill_t *)q->q_ptr, mp, 13826 ll_multicast, dst); 13827 continue; 13828 } 13829 } 13830 13831 if (ire == NULL) { 13832 ire = ire_cache_lookup(dst, ALL_ZONES, 13833 MBLK_GETLABEL(mp)); 13834 } 13835 13836 /* 13837 * If mipagent is running and reverse tunnel is created as per 13838 * mobile node request, then any packet coming through the 13839 * incoming interface from the mobile-node, should be reverse 13840 * tunneled to it's home agent except those that are destined 13841 * to foreign agent only. 13842 * This needs source address based ire lookup. The routing 13843 * entries for source address based lookup are only created by 13844 * mipagent program only when a reverse tunnel is created. 13845 * Reference : RFC2002, RFC2344 13846 */ 13847 if (ill->ill_mrtun_refcnt > 0) { 13848 ipaddr_t srcaddr; 13849 ire_t *tmp_ire; 13850 13851 tmp_ire = ire; /* Save, we might need it later */ 13852 if (ire == NULL || (ire->ire_type != IRE_LOCAL && 13853 ire->ire_type != IRE_BROADCAST)) { 13854 srcaddr = ipha->ipha_src; 13855 ire = ire_mrtun_lookup(srcaddr, ill); 13856 if (ire != NULL) { 13857 /* 13858 * Should not be getting iphada packet 13859 * here. 
we should only get those for 13860 * IRE_LOCAL traffic, excluded above. 13861 * Fail-safe (drop packet) in the event 13862 * hardware is misbehaving. 13863 */ 13864 if (first_mp != mp) { 13865 /* IPsec KSTATS: beancount me */ 13866 freemsg(first_mp); 13867 } else { 13868 /* 13869 * This packet must be forwarded 13870 * to Reverse Tunnel 13871 */ 13872 ip_mrtun_forward(ire, ill, mp); 13873 } 13874 ire_refrele(ire); 13875 if (tmp_ire != NULL) 13876 ire_refrele(tmp_ire); 13877 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 13878 "ip_input_end: q %p (%S)", 13879 q, "uninit"); 13880 continue; 13881 } 13882 } 13883 /* 13884 * If this packet is from a non-mobilenode or a 13885 * mobile-node which does not request reverse 13886 * tunnel service 13887 */ 13888 ire = tmp_ire; 13889 } 13890 13891 13892 /* 13893 * If we reach here that means the incoming packet satisfies 13894 * one of the following conditions: 13895 * - packet is from a mobile node which does not request 13896 * reverse tunnel 13897 * - packet is from a non-mobile node, which is the most 13898 * common case 13899 * - packet is from a reverse tunnel enabled mobile node 13900 * and destined to foreign agent only 13901 */ 13902 13903 if (ire == NULL) { 13904 /* 13905 * No IRE for this destination, so it can't be for us. 13906 * Unless we are forwarding, drop the packet. 13907 * We have to let source routed packets through 13908 * since we don't yet know if they are 'ping -l' 13909 * packets i.e. if they will go out over the 13910 * same interface as they came in on. 13911 */ 13912 ip_rput_noire(q, NULL, mp, ll_multicast, dst); 13913 continue; 13914 } 13915 13916 /* 13917 * Broadcast IRE may indicate either broadcast or 13918 * multicast packet 13919 */ 13920 if (ire->ire_type == IRE_BROADCAST) { 13921 /* 13922 * Skip broadcast checks if packet is UDP multicast; 13923 * we'd rather not enter ip_rput_process_broadcast() 13924 * unless the packet is broadcast for real, since 13925 * that routine is a no-op for multicast. 
13926 */ 13927 if ((ipha->ipha_protocol != IPPROTO_UDP || 13928 !CLASSD(ipha->ipha_dst)) && 13929 ip_rput_process_broadcast(&q, mp, &ire, ipha, ill, 13930 dst, cgtp_flt_pkt, ll_multicast)) { 13931 continue; 13932 } 13933 } else if (ire->ire_stq != NULL) { 13934 /* fowarding? */ 13935 ip_rput_process_forward(q, mp, ire, ipha, ill, 13936 ll_multicast); 13937 continue; 13938 } 13939 13940 /* packet not for us */ 13941 if (ire->ire_rfq != q) { 13942 if (ip_rput_notforus(&q, mp, ire, ill)) { 13943 continue; 13944 } 13945 } 13946 13947 switch (ipha->ipha_protocol) { 13948 case IPPROTO_TCP: 13949 ASSERT(first_mp == mp); 13950 if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, 13951 mp, 0, q, ip_ring)) != NULL) { 13952 if (curr_sqp == NULL) { 13953 curr_sqp = GET_SQUEUE(mp); 13954 ASSERT(cnt == 0); 13955 cnt++; 13956 head = tail = mp; 13957 } else if (curr_sqp == GET_SQUEUE(mp)) { 13958 ASSERT(tail != NULL); 13959 cnt++; 13960 tail->b_next = mp; 13961 tail = mp; 13962 } else { 13963 /* 13964 * A different squeue. Send the 13965 * chain for the previous squeue on 13966 * its way. This shouldn't happen 13967 * often unless interrupt binding 13968 * changes. 
13969 */ 13970 IP_STAT(ip_input_multi_squeue); 13971 squeue_enter_chain(curr_sqp, head, 13972 tail, cnt, SQTAG_IP_INPUT); 13973 curr_sqp = GET_SQUEUE(mp); 13974 head = mp; 13975 tail = mp; 13976 cnt = 1; 13977 } 13978 } 13979 IRE_REFRELE(ire); 13980 continue; 13981 case IPPROTO_UDP: 13982 ASSERT(first_mp == mp); 13983 ip_udp_input(q, mp, ipha, ire, ill); 13984 IRE_REFRELE(ire); 13985 continue; 13986 case IPPROTO_SCTP: 13987 ASSERT(first_mp == mp); 13988 ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, 13989 q, dst); 13990 continue; 13991 default: 13992 ip_proto_input(q, first_mp, ipha, ire, ill); 13993 IRE_REFRELE(ire); 13994 continue; 13995 } 13996 } 13997 13998 if (head != NULL) 13999 squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); 14000 14001 /* 14002 * This code is there just to make netperf/ttcp look good. 14003 * 14004 * Its possible that after being in polling mode (and having cleared 14005 * the backlog), squeues have turned the interrupt frequency higher 14006 * to improve latency at the expense of more CPU utilization (less 14007 * packets per interrupts or more number of interrupts). Workloads 14008 * like ttcp/netperf do manage to tickle polling once in a while 14009 * but for the remaining time, stay in higher interrupt mode since 14010 * their packet arrival rate is pretty uniform and this shows up 14011 * as higher CPU utilization. Since people care about CPU utilization 14012 * while running netperf/ttcp, turn the interrupt frequency back to 14013 * normal/default if polling has not been used in ip_poll_normal_ticks. 
14014 */ 14015 if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { 14016 if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { 14017 ip_ring->rr_poll_state &= ~ILL_POLLING; 14018 ip_ring->rr_blank(ip_ring->rr_handle, 14019 ip_ring->rr_normal_blank_time, 14020 ip_ring->rr_normal_pkt_cnt); 14021 } 14022 } 14023 14024 TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, 14025 "ip_input_end: q %p (%S)", q, "end"); 14026 #undef rptr 14027 } 14028 14029 static void 14030 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, 14031 t_uscalar_t err) 14032 { 14033 if (dl_err == DL_SYSERR) { 14034 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14035 "%s: %s failed: DL_SYSERR (errno %u)\n", 14036 ill->ill_name, dlpi_prim_str(prim), err); 14037 return; 14038 } 14039 14040 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14041 "%s: %s failed: %s\n", ill->ill_name, dlpi_prim_str(prim), 14042 dlpi_err_str(dl_err)); 14043 } 14044 14045 /* 14046 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other 14047 * than DL_UNITDATA_IND messages. If we need to process this message 14048 * exclusively, we call qwriter_ip, in which case we also need to call 14049 * ill_refhold before that, since qwriter_ip does an ill_refrele. 
14050 */ 14051 void 14052 ip_rput_dlpi(queue_t *q, mblk_t *mp) 14053 { 14054 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 14055 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 14056 ill_t *ill; 14057 14058 ip1dbg(("ip_rput_dlpi")); 14059 ill = (ill_t *)q->q_ptr; 14060 switch (dloa->dl_primitive) { 14061 case DL_ERROR_ACK: 14062 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK %s (0x%x): " 14063 "%s (0x%x), unix %u\n", ill->ill_name, 14064 dlpi_prim_str(dlea->dl_error_primitive), 14065 dlea->dl_error_primitive, 14066 dlpi_err_str(dlea->dl_errno), 14067 dlea->dl_errno, 14068 dlea->dl_unix_errno)); 14069 switch (dlea->dl_error_primitive) { 14070 case DL_UNBIND_REQ: 14071 mutex_enter(&ill->ill_lock); 14072 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 14073 cv_signal(&ill->ill_cv); 14074 mutex_exit(&ill->ill_lock); 14075 /* FALLTHRU */ 14076 case DL_NOTIFY_REQ: 14077 case DL_ATTACH_REQ: 14078 case DL_DETACH_REQ: 14079 case DL_INFO_REQ: 14080 case DL_BIND_REQ: 14081 case DL_ENABMULTI_REQ: 14082 case DL_PHYS_ADDR_REQ: 14083 case DL_CAPABILITY_REQ: 14084 case DL_CONTROL_REQ: 14085 /* 14086 * Refhold the ill to match qwriter_ip which does a 14087 * refrele. 
Since this is on the ill stream we 14088 * unconditionally bump up the refcount without 14089 * checking for ILL_CAN_LOOKUP 14090 */ 14091 ill_refhold(ill); 14092 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer, 14093 CUR_OP, B_FALSE); 14094 return; 14095 case DL_DISABMULTI_REQ: 14096 freemsg(mp); /* Don't want to pass this up */ 14097 return; 14098 default: 14099 break; 14100 } 14101 ip_dlpi_error(ill, dlea->dl_error_primitive, 14102 dlea->dl_errno, dlea->dl_unix_errno); 14103 freemsg(mp); 14104 return; 14105 case DL_INFO_ACK: 14106 case DL_BIND_ACK: 14107 case DL_PHYS_ADDR_ACK: 14108 case DL_NOTIFY_ACK: 14109 case DL_CAPABILITY_ACK: 14110 case DL_CONTROL_ACK: 14111 /* 14112 * Refhold the ill to match qwriter_ip which does a refrele 14113 * Since this is on the ill stream we unconditionally 14114 * bump up the refcount without doing ILL_CAN_LOOKUP. 14115 */ 14116 ill_refhold(ill); 14117 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer, 14118 CUR_OP, B_FALSE); 14119 return; 14120 case DL_NOTIFY_IND: 14121 ill_refhold(ill); 14122 /* 14123 * The DL_NOTIFY_IND is an asynchronous message that has no 14124 * relation to the current ioctl in progress (if any). Hence we 14125 * pass in NEW_OP in this case. 14126 */ 14127 (void) qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer, 14128 NEW_OP, B_FALSE); 14129 return; 14130 case DL_OK_ACK: 14131 ip1dbg(("ip_rput: DL_OK_ACK for %s\n", 14132 dlpi_prim_str((int)dloa->dl_correct_primitive))); 14133 switch (dloa->dl_correct_primitive) { 14134 case DL_UNBIND_REQ: 14135 mutex_enter(&ill->ill_lock); 14136 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 14137 cv_signal(&ill->ill_cv); 14138 mutex_exit(&ill->ill_lock); 14139 /* FALLTHRU */ 14140 case DL_ATTACH_REQ: 14141 case DL_DETACH_REQ: 14142 /* 14143 * Refhold the ill to match qwriter_ip which does a 14144 * refrele. 
Since this is on the ill stream we 14145 * unconditionally bump up the refcount 14146 */ 14147 ill_refhold(ill); 14148 qwriter_ip(NULL, ill, q, mp, ip_rput_dlpi_writer, 14149 CUR_OP, B_FALSE); 14150 return; 14151 case DL_ENABMULTI_REQ: 14152 if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS) 14153 ill->ill_dlpi_multicast_state = IDMS_OK; 14154 break; 14155 14156 } 14157 break; 14158 default: 14159 break; 14160 } 14161 freemsg(mp); 14162 } 14163 14164 /* 14165 * Handling of DLPI messages that require exclusive access to the ipsq. 14166 * 14167 * Need to do ill_pending_mp_release on ioctl completion, which could 14168 * happen here. (along with mi_copy_done) 14169 */ 14170 /* ARGSUSED */ 14171 static void 14172 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 14173 { 14174 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; 14175 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; 14176 int err = 0; 14177 ill_t *ill; 14178 ipif_t *ipif = NULL; 14179 mblk_t *mp1 = NULL; 14180 conn_t *connp = NULL; 14181 t_uscalar_t physaddr_req; 14182 mblk_t *mp_hw; 14183 union DL_primitives *dlp; 14184 boolean_t success; 14185 boolean_t ioctl_aborted = B_FALSE; 14186 boolean_t log = B_TRUE; 14187 14188 ip1dbg(("ip_rput_dlpi_writer ..")); 14189 ill = (ill_t *)q->q_ptr; 14190 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 14191 14192 ASSERT(IAM_WRITER_ILL(ill)); 14193 14194 /* 14195 * ipsq_pending_mp and ipsq_pending_ipif track each other. i.e. 14196 * both are null or non-null. However we can assert that only 14197 * after grabbing the ipsq_lock. So we don't make any assertion 14198 * here and in other places in the code. 14199 */ 14200 ipif = ipsq->ipsq_pending_ipif; 14201 /* 14202 * The current ioctl could have been aborted by the user and a new 14203 * ioctl to bring up another ill could have started. We could still 14204 * get a response from the driver later. 
14205 */ 14206 if (ipif != NULL && ipif->ipif_ill != ill) 14207 ioctl_aborted = B_TRUE; 14208 14209 switch (dloa->dl_primitive) { 14210 case DL_ERROR_ACK: 14211 switch (dlea->dl_error_primitive) { 14212 case DL_UNBIND_REQ: 14213 case DL_ATTACH_REQ: 14214 case DL_DETACH_REQ: 14215 case DL_INFO_REQ: 14216 ill_dlpi_done(ill, dlea->dl_error_primitive); 14217 break; 14218 case DL_NOTIFY_REQ: 14219 ill_dlpi_done(ill, DL_NOTIFY_REQ); 14220 log = B_FALSE; 14221 break; 14222 case DL_PHYS_ADDR_REQ: 14223 /* 14224 * For IPv6 only, there are two additional 14225 * phys_addr_req's sent to the driver to get the 14226 * IPv6 token and lla. This allows IP to acquire 14227 * the hardware address format for a given interface 14228 * without having built in knowledge of the hardware 14229 * address. ill_phys_addr_pend keeps track of the last 14230 * DL_PAR sent so we know which response we are 14231 * dealing with. ill_dlpi_done will update 14232 * ill_phys_addr_pend when it sends the next req. 14233 * We don't complete the IOCTL until all three DL_PARs 14234 * have been attempted, so set *_len to 0 and break. 14235 */ 14236 physaddr_req = ill->ill_phys_addr_pend; 14237 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 14238 if (physaddr_req == DL_IPV6_TOKEN) { 14239 ill->ill_token_length = 0; 14240 log = B_FALSE; 14241 break; 14242 } else if (physaddr_req == DL_IPV6_LINK_LAYER_ADDR) { 14243 ill->ill_nd_lla_len = 0; 14244 log = B_FALSE; 14245 break; 14246 } 14247 /* 14248 * Something went wrong with the DL_PHYS_ADDR_REQ. 14249 * We presumably have an IOCTL hanging out waiting 14250 * for completion. Find it and complete the IOCTL 14251 * with the error noted. 14252 * However, ill_dl_phys was called on an ill queue 14253 * (from SIOCSLIFNAME), thus conn_pending_ill is not 14254 * set. But the ioctl is known to be pending on ill_wq. 
14255 */ 14256 if (!ill->ill_ifname_pending) 14257 break; 14258 ill->ill_ifname_pending = 0; 14259 if (!ioctl_aborted) 14260 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14261 if (mp1 != NULL) { 14262 /* 14263 * This operation (SIOCSLIFNAME) must have 14264 * happened on the ill. Assert there is no conn 14265 */ 14266 ASSERT(connp == NULL); 14267 q = ill->ill_wq; 14268 } 14269 break; 14270 case DL_BIND_REQ: 14271 ill_dlpi_done(ill, DL_BIND_REQ); 14272 if (ill->ill_ifname_pending) 14273 break; 14274 /* 14275 * Something went wrong with the bind. We presumably 14276 * have an IOCTL hanging out waiting for completion. 14277 * Find it, take down the interface that was coming 14278 * up, and complete the IOCTL with the error noted. 14279 */ 14280 if (!ioctl_aborted) 14281 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14282 if (mp1 != NULL) { 14283 /* 14284 * This operation (SIOCSLIFFLAGS) must have 14285 * happened from a conn. 14286 */ 14287 ASSERT(connp != NULL); 14288 q = CONNP_TO_WQ(connp); 14289 if (ill->ill_move_in_progress) { 14290 ILL_CLEAR_MOVE(ill); 14291 } 14292 (void) ipif_down(ipif, NULL, NULL); 14293 /* error is set below the switch */ 14294 } 14295 break; 14296 case DL_ENABMULTI_REQ: 14297 ip1dbg(("DL_ERROR_ACK to enabmulti\n")); 14298 14299 if (ill->ill_dlpi_multicast_state == IDMS_INPROGRESS) 14300 ill->ill_dlpi_multicast_state = IDMS_FAILED; 14301 if (ill->ill_dlpi_multicast_state == IDMS_FAILED) { 14302 ipif_t *ipif; 14303 14304 log = B_FALSE; 14305 printf("ip: joining multicasts failed (%d)" 14306 " on %s - will use link layer " 14307 "broadcasts for multicast\n", 14308 dlea->dl_errno, ill->ill_name); 14309 14310 /* 14311 * Set up the multicast mapping alone. 14312 * writer, so ok to access ill->ill_ipif 14313 * without any lock. 
14314 */ 14315 ipif = ill->ill_ipif; 14316 mutex_enter(&ill->ill_phyint->phyint_lock); 14317 ill->ill_phyint->phyint_flags |= 14318 PHYI_MULTI_BCAST; 14319 mutex_exit(&ill->ill_phyint->phyint_lock); 14320 14321 if (!ill->ill_isv6) { 14322 (void) ipif_arp_setup_multicast(ipif, 14323 NULL); 14324 } else { 14325 (void) ipif_ndp_setup_multicast(ipif, 14326 NULL); 14327 } 14328 } 14329 freemsg(mp); /* Don't want to pass this up */ 14330 return; 14331 case DL_CAPABILITY_REQ: 14332 case DL_CONTROL_REQ: 14333 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " 14334 "DL_CAPABILITY/CONTROL REQ\n")); 14335 ill_dlpi_done(ill, dlea->dl_error_primitive); 14336 ill->ill_capab_state = IDMS_FAILED; 14337 freemsg(mp); 14338 return; 14339 } 14340 /* 14341 * Note the error for IOCTL completion (mp1 is set when 14342 * ready to complete ioctl). If ill_ifname_pending_err is 14343 * set, an error occured during plumbing (ill_ifname_pending), 14344 * so we want to report that error. 14345 * 14346 * NOTE: there are two addtional DL_PHYS_ADDR_REQ's 14347 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are 14348 * expected to get errack'd if the driver doesn't support 14349 * these flags (e.g. ethernet). log will be set to B_FALSE 14350 * if these error conditions are encountered. 14351 */ 14352 if (mp1 != NULL) { 14353 if (ill->ill_ifname_pending_err != 0) { 14354 err = ill->ill_ifname_pending_err; 14355 ill->ill_ifname_pending_err = 0; 14356 } else { 14357 err = dlea->dl_unix_errno ? 14358 dlea->dl_unix_errno : ENXIO; 14359 } 14360 /* 14361 * If we're plumbing an interface and an error hasn't already 14362 * been saved, set ill_ifname_pending_err to the error passed 14363 * up. Ignore the error if log is B_FALSE (see comment above). 14364 */ 14365 } else if (log && ill->ill_ifname_pending && 14366 ill->ill_ifname_pending_err == 0) { 14367 ill->ill_ifname_pending_err = dlea->dl_unix_errno ? 
14368 dlea->dl_unix_errno : ENXIO; 14369 } 14370 14371 if (log) 14372 ip_dlpi_error(ill, dlea->dl_error_primitive, 14373 dlea->dl_errno, dlea->dl_unix_errno); 14374 break; 14375 case DL_CAPABILITY_ACK: { 14376 boolean_t reneg_flag = B_FALSE; 14377 /* Call a routine to handle this one. */ 14378 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14379 /* 14380 * Check if the ACK is due to renegotiation case since we 14381 * will need to send a new CAPABILITY_REQ later. 14382 */ 14383 if (ill->ill_capab_state == IDMS_RENEG) { 14384 /* This is the ack for a renogiation case */ 14385 reneg_flag = B_TRUE; 14386 ill->ill_capab_state = IDMS_UNKNOWN; 14387 } 14388 ill_capability_ack(ill, mp); 14389 if (reneg_flag) 14390 ill_capability_probe(ill); 14391 break; 14392 } 14393 case DL_CONTROL_ACK: 14394 /* We treat all of these as "fire and forget" */ 14395 ill_dlpi_done(ill, DL_CONTROL_REQ); 14396 break; 14397 case DL_INFO_ACK: 14398 /* Call a routine to handle this one. */ 14399 ill_dlpi_done(ill, DL_INFO_REQ); 14400 ip_ll_subnet_defaults(ill, mp); 14401 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock)); 14402 return; 14403 case DL_BIND_ACK: 14404 /* 14405 * We should have an IOCTL waiting on this unless 14406 * sent by ill_dl_phys, in which case just return 14407 */ 14408 ill_dlpi_done(ill, DL_BIND_REQ); 14409 if (ill->ill_ifname_pending) 14410 break; 14411 14412 if (!ioctl_aborted) 14413 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14414 if (mp1 == NULL) 14415 break; 14416 ASSERT(connp != NULL); 14417 q = CONNP_TO_WQ(connp); 14418 14419 /* 14420 * We are exclusive. So nothing can change even after 14421 * we get the pending mp. If need be we can put it back 14422 * and restart, as in calling ipif_arp_up() below. 
14423 */ 14424 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); 14425 14426 mutex_enter(&ill->ill_lock); 14427 ill->ill_dl_up = 1; 14428 mutex_exit(&ill->ill_lock); 14429 14430 /* 14431 * Now bring up the resolver, when that is 14432 * done we'll create IREs and we are done. 14433 */ 14434 if (ill->ill_isv6) { 14435 /* 14436 * v6 interfaces. 14437 * Unlike ARP which has to do another bind 14438 * and attach, once we get here we are 14439 * done withh NDP. Except in the case of 14440 * ILLF_XRESOLV, in which case we send an 14441 * AR_INTERFACE_UP to the external resolver. 14442 * If all goes well, the ioctl will complete 14443 * in ip_rput(). If there's an error, we 14444 * complete it here. 14445 */ 14446 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 14447 B_FALSE); 14448 if (err == 0) { 14449 if (ill->ill_flags & ILLF_XRESOLV) { 14450 mutex_enter(&connp->conn_lock); 14451 mutex_enter(&ill->ill_lock); 14452 success = ipsq_pending_mp_add( 14453 connp, ipif, q, mp1, 0); 14454 mutex_exit(&ill->ill_lock); 14455 mutex_exit(&connp->conn_lock); 14456 if (success) { 14457 err = ipif_resolver_up(ipif, 14458 B_FALSE); 14459 if (err == EINPROGRESS) { 14460 freemsg(mp); 14461 return; 14462 } 14463 ASSERT(err != 0); 14464 mp1 = ipsq_pending_mp_get(ipsq, 14465 &connp); 14466 ASSERT(mp1 != NULL); 14467 } else { 14468 /* conn has started closing */ 14469 err = EINTR; 14470 } 14471 } else { /* Non XRESOLV interface */ 14472 err = ipif_up_done_v6(ipif); 14473 } 14474 } 14475 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 14476 /* 14477 * ARP and other v4 external resolvers. 14478 * Leave the pending mblk intact so that 14479 * the ioctl completes in ip_rput(). 
14480 */ 14481 mutex_enter(&connp->conn_lock); 14482 mutex_enter(&ill->ill_lock); 14483 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); 14484 mutex_exit(&ill->ill_lock); 14485 mutex_exit(&connp->conn_lock); 14486 if (success) { 14487 err = ipif_resolver_up(ipif, B_FALSE); 14488 if (err == EINPROGRESS) { 14489 freemsg(mp); 14490 return; 14491 } 14492 ASSERT(err != 0); 14493 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14494 } else { 14495 /* The conn has started closing */ 14496 err = EINTR; 14497 } 14498 } else { 14499 /* 14500 * This one is complete. Reply to pending ioctl. 14501 */ 14502 err = ipif_up_done(ipif); 14503 } 14504 14505 if ((err == 0) && (ill->ill_up_ipifs)) { 14506 err = ill_up_ipifs(ill, q, mp1); 14507 if (err == EINPROGRESS) { 14508 freemsg(mp); 14509 return; 14510 } 14511 } 14512 14513 if (ill->ill_up_ipifs) { 14514 ill_group_cleanup(ill); 14515 } 14516 14517 break; 14518 case DL_NOTIFY_IND: { 14519 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; 14520 ire_t *ire; 14521 boolean_t need_ire_walk_v4 = B_FALSE; 14522 boolean_t need_ire_walk_v6 = B_FALSE; 14523 14524 /* 14525 * Change the address everywhere we need to. 14526 * What we're getting here is a link-level addr or phys addr. 14527 * The new addr is at notify + notify->dl_addr_offset 14528 * The address length is notify->dl_addr_length; 14529 */ 14530 switch (notify->dl_notification) { 14531 case DL_NOTE_PHYS_ADDR: 14532 mp_hw = copyb(mp); 14533 if (mp_hw == NULL) { 14534 err = ENOMEM; 14535 break; 14536 } 14537 dlp = (union DL_primitives *)mp_hw->b_rptr; 14538 /* 14539 * We currently don't support changing 14540 * the token via DL_NOTIFY_IND. 14541 * When we do support it, we have to consider 14542 * what the implications are with respect to 14543 * the token and the link local address. 
14544 */ 14545 mutex_enter(&ill->ill_lock); 14546 if (dlp->notify_ind.dl_data == 14547 DL_IPV6_LINK_LAYER_ADDR) { 14548 if (ill->ill_nd_lla_mp != NULL) 14549 freemsg(ill->ill_nd_lla_mp); 14550 ill->ill_nd_lla_mp = mp_hw; 14551 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14552 dlp->notify_ind.dl_addr_offset; 14553 ill->ill_nd_lla_len = 14554 dlp->notify_ind.dl_addr_length - 14555 ABS(ill->ill_sap_length); 14556 mutex_exit(&ill->ill_lock); 14557 break; 14558 } else if (dlp->notify_ind.dl_data == 14559 DL_CURR_PHYS_ADDR) { 14560 if (ill->ill_phys_addr_mp != NULL) 14561 freemsg(ill->ill_phys_addr_mp); 14562 ill->ill_phys_addr_mp = mp_hw; 14563 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14564 dlp->notify_ind.dl_addr_offset; 14565 ill->ill_phys_addr_length = 14566 dlp->notify_ind.dl_addr_length - 14567 ABS(ill->ill_sap_length); 14568 if (ill->ill_isv6 && 14569 !(ill->ill_flags & ILLF_XRESOLV)) { 14570 if (ill->ill_nd_lla_mp != NULL) 14571 freemsg(ill->ill_nd_lla_mp); 14572 ill->ill_nd_lla_mp = copyb(mp_hw); 14573 ill->ill_nd_lla = (uchar_t *) 14574 ill->ill_nd_lla_mp->b_rptr + 14575 dlp->notify_ind.dl_addr_offset; 14576 ill->ill_nd_lla_len = 14577 ill->ill_phys_addr_length; 14578 } 14579 } 14580 mutex_exit(&ill->ill_lock); 14581 /* 14582 * Send out gratuitous arp request for our new 14583 * hardware address. 14584 */ 14585 for (ipif = ill->ill_ipif; ipif != NULL; 14586 ipif = ipif->ipif_next) { 14587 if (!(ipif->ipif_flags & IPIF_UP)) 14588 continue; 14589 if (ill->ill_isv6) { 14590 ipif_ndp_down(ipif); 14591 /* 14592 * Set B_TRUE to enable 14593 * ipif_ndp_up() to send out 14594 * unsolicited advertisements. 
14595 */ 14596 err = ipif_ndp_up(ipif, 14597 &ipif->ipif_v6lcl_addr, 14598 B_TRUE); 14599 if (err) { 14600 ip1dbg(( 14601 "ip_rput_dlpi_writer: " 14602 "Failed to update ndp " 14603 "err %d\n", err)); 14604 } 14605 } else { 14606 /* 14607 * IPv4 ARP case 14608 * 14609 * Set B_TRUE, as we only want 14610 * ipif_resolver_up to send an 14611 * AR_ENTRY_ADD request up to 14612 * ARP. 14613 */ 14614 err = ipif_resolver_up(ipif, 14615 B_TRUE); 14616 if (err) { 14617 ip1dbg(( 14618 "ip_rput_dlpi_writer: " 14619 "Failed to update arp " 14620 "err %d\n", err)); 14621 } 14622 } 14623 } 14624 /* 14625 * Allow "fall through" to the DL_NOTE_FASTPATH_FLUSH 14626 * case so that all old fastpath information can be 14627 * purged from IRE caches. 14628 */ 14629 /* FALLTHRU */ 14630 case DL_NOTE_FASTPATH_FLUSH: 14631 /* 14632 * Any fastpath probe sent henceforth will get the 14633 * new fp mp. So we first delete any ires that are 14634 * waiting for the fastpath. Then walk all ires and 14635 * delete the ire or delete the fp mp. In the case of 14636 * IRE_MIPRTUN and IRE_BROADCAST it is difficult to 14637 * recreate the ire's without going through a complex 14638 * ipif up/down dance. So we don't delete the ire 14639 * itself, but just the ire_fp_mp for these 2 ire's 14640 * In the case of the other ire's we delete the ire's 14641 * themselves. Access to ire_fp_mp is completely 14642 * protected by ire_lock for IRE_MIPRTUN and 14643 * IRE_BROADCAST. Deleting the ire is preferable in the 14644 * other cases for performance. 
14645 */ 14646 if (ill->ill_isv6) { 14647 nce_fastpath_list_dispatch(ill, NULL, NULL); 14648 ndp_walk(ill, (pfi_t)ndp_fastpath_flush, 14649 NULL); 14650 } else { 14651 ire_fastpath_list_dispatch(ill, NULL, NULL); 14652 ire_walk_ill_v4(MATCH_IRE_WQ | MATCH_IRE_TYPE, 14653 IRE_CACHE | IRE_BROADCAST, 14654 ire_fastpath_flush, NULL, ill); 14655 mutex_enter(&ire_mrtun_lock); 14656 if (ire_mrtun_count != 0) { 14657 mutex_exit(&ire_mrtun_lock); 14658 ire_walk_ill_mrtun(MATCH_IRE_WQ, 14659 IRE_MIPRTUN, ire_fastpath_flush, 14660 NULL, ill); 14661 } else { 14662 mutex_exit(&ire_mrtun_lock); 14663 } 14664 } 14665 break; 14666 case DL_NOTE_SDU_SIZE: 14667 /* 14668 * Change the MTU size of the interface, of all 14669 * attached ipif's, and of all relevant ire's. The 14670 * new value's a uint32_t at notify->dl_data. 14671 * Mtu change Vs. new ire creation - protocol below. 14672 * 14673 * a Mark the ipif as IPIF_CHANGING. 14674 * b Set the new mtu in the ipif. 14675 * c Change the ire_max_frag on all affected ires 14676 * d Unmark the IPIF_CHANGING 14677 * 14678 * To see how the protocol works, assume an interface 14679 * route is also being added simultaneously by 14680 * ip_rt_add and let 'ipif' be the ipif referenced by 14681 * the ire. If the ire is created before step a, 14682 * it will be cleaned up by step c. If the ire is 14683 * created after step d, it will see the new value of 14684 * ipif_mtu. Any attempt to create the ire between 14685 * steps a to d will fail because of the IPIF_CHANGING 14686 * flag. Note that ire_create() is passed a pointer to 14687 * the ipif_mtu, and not the value. During ire_add 14688 * under the bucket lock, the ire_max_frag of the 14689 * new ire being created is set from the ipif/ire from 14690 * which it is being derived. 
14691 */ 14692 mutex_enter(&ill->ill_lock); 14693 ill->ill_max_frag = (uint_t)notify->dl_data; 14694 14695 /* 14696 * If an SIOCSLIFLNKINFO has changed the ill_max_mtu 14697 * leave it alone 14698 */ 14699 if (ill->ill_mtu_userspecified) { 14700 mutex_exit(&ill->ill_lock); 14701 break; 14702 } 14703 ill->ill_max_mtu = ill->ill_max_frag; 14704 if (ill->ill_isv6) { 14705 if (ill->ill_max_mtu < IPV6_MIN_MTU) 14706 ill->ill_max_mtu = IPV6_MIN_MTU; 14707 } else { 14708 if (ill->ill_max_mtu < IP_MIN_MTU) 14709 ill->ill_max_mtu = IP_MIN_MTU; 14710 } 14711 for (ipif = ill->ill_ipif; ipif != NULL; 14712 ipif = ipif->ipif_next) { 14713 /* 14714 * Don't override the mtu if the user 14715 * has explicitly set it. 14716 */ 14717 if (ipif->ipif_flags & IPIF_FIXEDMTU) 14718 continue; 14719 ipif->ipif_mtu = (uint_t)notify->dl_data; 14720 if (ipif->ipif_isv6) 14721 ire = ipif_to_ire_v6(ipif); 14722 else 14723 ire = ipif_to_ire(ipif); 14724 if (ire != NULL) { 14725 ire->ire_max_frag = ipif->ipif_mtu; 14726 ire_refrele(ire); 14727 } 14728 if (ipif->ipif_flags & IPIF_UP) { 14729 if (ill->ill_isv6) 14730 need_ire_walk_v6 = B_TRUE; 14731 else 14732 need_ire_walk_v4 = B_TRUE; 14733 } 14734 } 14735 mutex_exit(&ill->ill_lock); 14736 if (need_ire_walk_v4) 14737 ire_walk_v4(ill_mtu_change, (char *)ill, 14738 ALL_ZONES); 14739 if (need_ire_walk_v6) 14740 ire_walk_v6(ill_mtu_change, (char *)ill, 14741 ALL_ZONES); 14742 break; 14743 case DL_NOTE_LINK_UP: 14744 case DL_NOTE_LINK_DOWN: { 14745 /* 14746 * We are writer. ill / phyint / ipsq assocs stable. 14747 * The RUNNING flag reflects the state of the link. 14748 */ 14749 phyint_t *phyint = ill->ill_phyint; 14750 uint64_t new_phyint_flags; 14751 boolean_t changed = B_FALSE; 14752 14753 mutex_enter(&phyint->phyint_lock); 14754 new_phyint_flags = 14755 (notify->dl_notification == DL_NOTE_LINK_UP) ? 
14756 phyint->phyint_flags | PHYI_RUNNING : 14757 phyint->phyint_flags & ~PHYI_RUNNING; 14758 if (new_phyint_flags != phyint->phyint_flags) { 14759 phyint->phyint_flags = new_phyint_flags; 14760 changed = B_TRUE; 14761 } 14762 mutex_exit(&phyint->phyint_lock); 14763 /* 14764 * If the flags have changed, send a message to 14765 * the routing socket. 14766 */ 14767 if (changed) { 14768 if (phyint->phyint_illv4 != NULL) { 14769 ip_rts_ifmsg( 14770 phyint->phyint_illv4->ill_ipif); 14771 } 14772 if (phyint->phyint_illv6 != NULL) { 14773 ip_rts_ifmsg( 14774 phyint->phyint_illv6->ill_ipif); 14775 } 14776 } 14777 break; 14778 } 14779 case DL_NOTE_PROMISC_ON_PHYS: 14780 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14781 "got a DL_NOTE_PROMISC_ON_PHYS\n")); 14782 mutex_enter(&ill->ill_lock); 14783 ill->ill_promisc_on_phys = B_TRUE; 14784 mutex_exit(&ill->ill_lock); 14785 break; 14786 case DL_NOTE_PROMISC_OFF_PHYS: 14787 IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " 14788 "got a DL_NOTE_PROMISC_OFF_PHYS\n")); 14789 mutex_enter(&ill->ill_lock); 14790 ill->ill_promisc_on_phys = B_FALSE; 14791 mutex_exit(&ill->ill_lock); 14792 break; 14793 case DL_NOTE_CAPAB_RENEG: 14794 /* 14795 * Something changed on the driver side. 14796 * It wants us to renegotiate the capabilities 14797 * on this ill. The most likely cause is the 14798 * aggregation interface under us where a 14799 * port got added or went away. 14800 * 14801 * We reset the capabilities and set the 14802 * state to IDMS_RENG so that when the ack 14803 * comes back, we can start the 14804 * renegotiation process. 
14805 */ 14806 ill_capability_reset(ill); 14807 ill->ill_capab_state = IDMS_RENEG; 14808 break; 14809 default: 14810 ip0dbg(("ip_rput_dlpi_writer: unknown notification " 14811 "type 0x%x for DL_NOTIFY_IND\n", 14812 notify->dl_notification)); 14813 break; 14814 } 14815 14816 /* 14817 * As this is an asynchronous operation, we 14818 * should not call ill_dlpi_done 14819 */ 14820 break; 14821 } 14822 case DL_NOTIFY_ACK: 14823 /* 14824 * Don't really need to check for what notifications 14825 * are supported; we'll process what gets sent upstream, 14826 * and we know it'll be something we support changing 14827 * based on our DL_NOTIFY_REQ. 14828 */ 14829 ill_dlpi_done(ill, DL_NOTIFY_REQ); 14830 break; 14831 case DL_PHYS_ADDR_ACK: { 14832 /* 14833 * We should have an IOCTL waiting on this when request 14834 * sent by ill_dl_phys. 14835 * However, ill_dl_phys was called on an ill queue (from 14836 * SIOCSLIFNAME), thus conn_pending_ill is not set. But the 14837 * ioctl is known to be pending on ill_wq. 14838 * There are two additional phys_addr_req's sent to the 14839 * driver to get the token and lla. ill_phys_addr_pend 14840 * keeps track of the last one sent so we know which 14841 * response we are dealing with. ill_dlpi_done will 14842 * update ill_phys_addr_pend when it sends the next req. 14843 * We don't complete the IOCTL until all three DL_PARs 14844 * have been attempted. 14845 * 14846 * We don't need any lock to update ill_nd_lla* fields, 14847 * since the ill is not yet up, We grab the lock just 14848 * for uniformity with other code that accesses ill_nd_lla. 
14849 */ 14850 physaddr_req = ill->ill_phys_addr_pend; 14851 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ); 14852 if (physaddr_req == DL_IPV6_TOKEN || 14853 physaddr_req == DL_IPV6_LINK_LAYER_ADDR) { 14854 if (physaddr_req == DL_IPV6_TOKEN) { 14855 /* 14856 * bcopy to low-order bits of ill_token 14857 * 14858 * XXX Temporary hack - currently, 14859 * all known tokens are 64 bits, 14860 * so I'll cheat for the moment. 14861 */ 14862 dlp = (union DL_primitives *)mp->b_rptr; 14863 14864 mutex_enter(&ill->ill_lock); 14865 bcopy((uchar_t *)(mp->b_rptr + 14866 dlp->physaddr_ack.dl_addr_offset), 14867 (void *)&ill->ill_token.s6_addr32[2], 14868 dlp->physaddr_ack.dl_addr_length); 14869 ill->ill_token_length = 14870 dlp->physaddr_ack.dl_addr_length; 14871 mutex_exit(&ill->ill_lock); 14872 } else { 14873 ASSERT(ill->ill_nd_lla_mp == NULL); 14874 mp_hw = copyb(mp); 14875 if (mp_hw == NULL) { 14876 err = ENOMEM; 14877 break; 14878 } 14879 dlp = (union DL_primitives *)mp_hw->b_rptr; 14880 mutex_enter(&ill->ill_lock); 14881 ill->ill_nd_lla_mp = mp_hw; 14882 ill->ill_nd_lla = (uchar_t *)mp_hw->b_rptr + 14883 dlp->physaddr_ack.dl_addr_offset; 14884 ill->ill_nd_lla_len = 14885 dlp->physaddr_ack.dl_addr_length; 14886 mutex_exit(&ill->ill_lock); 14887 } 14888 break; 14889 } 14890 ASSERT(physaddr_req == DL_CURR_PHYS_ADDR); 14891 ASSERT(ill->ill_phys_addr_mp == NULL); 14892 if (!ill->ill_ifname_pending) 14893 break; 14894 ill->ill_ifname_pending = 0; 14895 if (!ioctl_aborted) 14896 mp1 = ipsq_pending_mp_get(ipsq, &connp); 14897 if (mp1 != NULL) { 14898 ASSERT(connp == NULL); 14899 q = ill->ill_wq; 14900 } 14901 /* 14902 * If any error acks received during the plumbing sequence, 14903 * ill_ifname_pending_err will be set. Break out and send up 14904 * the error to the pending ioctl. 14905 */ 14906 if (ill->ill_ifname_pending_err != 0) { 14907 err = ill->ill_ifname_pending_err; 14908 ill->ill_ifname_pending_err = 0; 14909 break; 14910 } 14911 /* 14912 * Get the interface token. 
If the zeroth interface 14913 * address is zero then set the address to the link local 14914 * address 14915 */ 14916 mp_hw = copyb(mp); 14917 if (mp_hw == NULL) { 14918 err = ENOMEM; 14919 break; 14920 } 14921 dlp = (union DL_primitives *)mp_hw->b_rptr; 14922 ill->ill_phys_addr_mp = mp_hw; 14923 ill->ill_phys_addr = (uchar_t *)mp_hw->b_rptr + 14924 dlp->physaddr_ack.dl_addr_offset; 14925 if (dlp->physaddr_ack.dl_addr_length == 0 || 14926 ill->ill_phys_addr_length == 0 || 14927 ill->ill_phys_addr_length == IP_ADDR_LEN) { 14928 /* 14929 * Compatibility: atun driver returns a length of 0. 14930 * ipdptp has an ill_phys_addr_length of zero(from 14931 * DL_BIND_ACK) but a non-zero length here. 14932 * ipd has an ill_phys_addr_length of 4(from 14933 * DL_BIND_ACK) but a non-zero length here. 14934 */ 14935 ill->ill_phys_addr = NULL; 14936 } else if (dlp->physaddr_ack.dl_addr_length != 14937 ill->ill_phys_addr_length) { 14938 ip0dbg(("DL_PHYS_ADDR_ACK: " 14939 "Address length mismatch %d %d\n", 14940 dlp->physaddr_ack.dl_addr_length, 14941 ill->ill_phys_addr_length)); 14942 err = EINVAL; 14943 break; 14944 } 14945 mutex_enter(&ill->ill_lock); 14946 if (ill->ill_nd_lla_mp == NULL) { 14947 ill->ill_nd_lla_mp = copyb(mp_hw); 14948 if (ill->ill_nd_lla_mp == NULL) { 14949 err = ENOMEM; 14950 mutex_exit(&ill->ill_lock); 14951 break; 14952 } 14953 ill->ill_nd_lla = 14954 (uchar_t *)ill->ill_nd_lla_mp->b_rptr + 14955 dlp->physaddr_ack.dl_addr_offset; 14956 ill->ill_nd_lla_len = ill->ill_phys_addr_length; 14957 } 14958 mutex_exit(&ill->ill_lock); 14959 if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 14960 (void) ill_setdefaulttoken(ill); 14961 14962 /* 14963 * If the ill zero interface has a zero address assign 14964 * it the proper link local address. 
14965 */ 14966 ASSERT(ill->ill_ipif->ipif_id == 0); 14967 if (ipif != NULL && 14968 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 14969 (void) ipif_setlinklocal(ipif); 14970 break; 14971 } 14972 case DL_OK_ACK: 14973 ip2dbg(("DL_OK_ACK %s (0x%x)\n", 14974 dlpi_prim_str((int)dloa->dl_correct_primitive), 14975 dloa->dl_correct_primitive)); 14976 switch (dloa->dl_correct_primitive) { 14977 case DL_UNBIND_REQ: 14978 case DL_ATTACH_REQ: 14979 case DL_DETACH_REQ: 14980 ill_dlpi_done(ill, dloa->dl_correct_primitive); 14981 break; 14982 } 14983 break; 14984 default: 14985 break; 14986 } 14987 14988 freemsg(mp); 14989 if (mp1) { 14990 struct iocblk *iocp; 14991 int mode; 14992 14993 /* 14994 * Complete the waiting IOCTL. For SIOCLIFADDIF or 14995 * SIOCSLIFNAME do a copyout. 14996 */ 14997 iocp = (struct iocblk *)mp1->b_rptr; 14998 14999 if (iocp->ioc_cmd == SIOCLIFADDIF || 15000 iocp->ioc_cmd == SIOCSLIFNAME) 15001 mode = COPYOUT; 15002 else 15003 mode = NO_COPYOUT; 15004 /* 15005 * The ioctl must complete now without EINPROGRESS 15006 * since ipsq_pending_mp_get has removed the ioctl mblk 15007 * from ipsq_pending_mp. Otherwise the ioctl will be 15008 * stuck for ever in the ipsq. 15009 */ 15010 ASSERT(err != EINPROGRESS); 15011 ip_ioctl_finish(q, mp1, err, mode, ipif, ipsq); 15012 15013 } 15014 } 15015 15016 /* 15017 * ip_rput_other is called by ip_rput to handle messages modifying the global 15018 * state in IP. Normally called as writer. Exception SIOCGTUNPARAM (shared) 15019 */ 15020 /* ARGSUSED */ 15021 void 15022 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 15023 { 15024 ill_t *ill; 15025 struct iocblk *iocp; 15026 mblk_t *mp1; 15027 conn_t *connp = NULL; 15028 15029 ip1dbg(("ip_rput_other ")); 15030 ill = (ill_t *)q->q_ptr; 15031 /* 15032 * This routine is not a writer in the case of SIOCGTUNPARAM 15033 * in which case ipsq is NULL. 
15034 */ 15035 if (ipsq != NULL) { 15036 ASSERT(IAM_WRITER_IPSQ(ipsq)); 15037 ASSERT(ipsq == ill->ill_phyint->phyint_ipsq); 15038 } 15039 15040 switch (mp->b_datap->db_type) { 15041 case M_ERROR: 15042 case M_HANGUP: 15043 /* 15044 * The device has a problem. We force the ILL down. It can 15045 * be brought up again manually using SIOCSIFFLAGS (via 15046 * ifconfig or equivalent). 15047 */ 15048 ASSERT(ipsq != NULL); 15049 if (mp->b_rptr < mp->b_wptr) 15050 ill->ill_error = (int)(*mp->b_rptr & 0xFF); 15051 if (ill->ill_error == 0) 15052 ill->ill_error = ENXIO; 15053 if (!ill_down_start(q, mp)) 15054 return; 15055 ipif_all_down_tail(ipsq, q, mp, NULL); 15056 break; 15057 case M_IOCACK: 15058 iocp = (struct iocblk *)mp->b_rptr; 15059 ASSERT(iocp->ioc_cmd != DL_IOC_HDR_INFO); 15060 switch (iocp->ioc_cmd) { 15061 case SIOCSTUNPARAM: 15062 case OSIOCSTUNPARAM: 15063 ASSERT(ipsq != NULL); 15064 /* 15065 * Finish socket ioctl passed through to tun. 15066 * We should have an IOCTL waiting on this. 15067 */ 15068 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15069 if (ill->ill_isv6) { 15070 struct iftun_req *ta; 15071 15072 /* 15073 * if a source or destination is 15074 * being set, try and set the link 15075 * local address for the tunnel 15076 */ 15077 ta = (struct iftun_req *)mp->b_cont-> 15078 b_cont->b_rptr; 15079 if (ta->ifta_flags & (IFTUN_SRC | IFTUN_DST)) { 15080 ipif_set_tun_llink(ill, ta); 15081 } 15082 15083 } 15084 if (mp1 != NULL) { 15085 /* 15086 * Now copy back the b_next/b_prev used by 15087 * mi code for the mi_copy* functions. 15088 * See ip_sioctl_tunparam() for the reason. 15089 * Also protect against missing b_cont. 
15090 */ 15091 if (mp->b_cont != NULL) { 15092 mp->b_cont->b_next = 15093 mp1->b_cont->b_next; 15094 mp->b_cont->b_prev = 15095 mp1->b_cont->b_prev; 15096 } 15097 inet_freemsg(mp1); 15098 ASSERT(ipsq->ipsq_current_ipif != NULL); 15099 ASSERT(connp != NULL); 15100 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15101 iocp->ioc_error, NO_COPYOUT, 15102 ipsq->ipsq_current_ipif, ipsq); 15103 } else { 15104 ASSERT(connp == NULL); 15105 putnext(q, mp); 15106 } 15107 break; 15108 case SIOCGTUNPARAM: 15109 case OSIOCGTUNPARAM: 15110 /* 15111 * This is really M_IOCDATA from the tunnel driver. 15112 * convert back and complete the ioctl. 15113 * We should have an IOCTL waiting on this. 15114 */ 15115 mp1 = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 15116 if (mp1) { 15117 /* 15118 * Now copy back the b_next/b_prev used by 15119 * mi code for the mi_copy* functions. 15120 * See ip_sioctl_tunparam() for the reason. 15121 * Also protect against missing b_cont. 15122 */ 15123 if (mp->b_cont != NULL) { 15124 mp->b_cont->b_next = 15125 mp1->b_cont->b_next; 15126 mp->b_cont->b_prev = 15127 mp1->b_cont->b_prev; 15128 } 15129 inet_freemsg(mp1); 15130 if (iocp->ioc_error == 0) 15131 mp->b_datap->db_type = M_IOCDATA; 15132 ASSERT(connp != NULL); 15133 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15134 iocp->ioc_error, COPYOUT, NULL, NULL); 15135 } else { 15136 ASSERT(connp == NULL); 15137 putnext(q, mp); 15138 } 15139 break; 15140 default: 15141 break; 15142 } 15143 break; 15144 case M_IOCNAK: 15145 iocp = (struct iocblk *)mp->b_rptr; 15146 15147 switch (iocp->ioc_cmd) { 15148 int mode; 15149 ipif_t *ipif; 15150 15151 case DL_IOC_HDR_INFO: 15152 /* 15153 * If this was the first attempt turn of the 15154 * fastpath probing. 
15155 */ 15156 mutex_enter(&ill->ill_lock); 15157 if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) { 15158 ill->ill_dlpi_fastpath_state = IDMS_FAILED; 15159 mutex_exit(&ill->ill_lock); 15160 ill_fastpath_nack(ill); 15161 ip1dbg(("ip_rput: DLPI fastpath off on " 15162 "interface %s\n", 15163 ill->ill_name)); 15164 } else { 15165 mutex_exit(&ill->ill_lock); 15166 } 15167 freemsg(mp); 15168 break; 15169 case SIOCSTUNPARAM: 15170 case OSIOCSTUNPARAM: 15171 ASSERT(ipsq != NULL); 15172 /* 15173 * Finish socket ioctl passed through to tun 15174 * We should have an IOCTL waiting on this. 15175 */ 15176 /* FALLTHRU */ 15177 case SIOCGTUNPARAM: 15178 case OSIOCGTUNPARAM: 15179 /* 15180 * This is really M_IOCDATA from the tunnel driver. 15181 * convert back and complete the ioctl. 15182 * We should have an IOCTL waiting on this. 15183 */ 15184 if (iocp->ioc_cmd == SIOCGTUNPARAM || 15185 iocp->ioc_cmd == OSIOCGTUNPARAM) { 15186 mp1 = ill_pending_mp_get(ill, &connp, 15187 iocp->ioc_id); 15188 mode = COPYOUT; 15189 ipsq = NULL; 15190 ipif = NULL; 15191 } else { 15192 mp1 = ipsq_pending_mp_get(ipsq, &connp); 15193 mode = NO_COPYOUT; 15194 ASSERT(ipsq->ipsq_current_ipif != NULL); 15195 ipif = ipsq->ipsq_current_ipif; 15196 } 15197 if (mp1 != NULL) { 15198 /* 15199 * Now copy back the b_next/b_prev used by 15200 * mi code for the mi_copy* functions. 15201 * See ip_sioctl_tunparam() for the reason. 15202 * Also protect against missing b_cont. 
15203 */ 15204 if (mp->b_cont != NULL) { 15205 mp->b_cont->b_next = 15206 mp1->b_cont->b_next; 15207 mp->b_cont->b_prev = 15208 mp1->b_cont->b_prev; 15209 } 15210 inet_freemsg(mp1); 15211 if (iocp->ioc_error == 0) 15212 iocp->ioc_error = EINVAL; 15213 ASSERT(connp != NULL); 15214 ip_ioctl_finish(CONNP_TO_WQ(connp), mp, 15215 iocp->ioc_error, mode, ipif, ipsq); 15216 } else { 15217 ASSERT(connp == NULL); 15218 putnext(q, mp); 15219 } 15220 break; 15221 default: 15222 break; 15223 } 15224 default: 15225 break; 15226 } 15227 } 15228 15229 /* 15230 * NOTE : This function does not ire_refrele the ire argument passed in. 15231 * 15232 * IPQoS notes 15233 * IP policy is invoked twice for a forwarded packet, once on the read side 15234 * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are 15235 * enabled. An additional parameter, in_ill, has been added for this purpose. 15236 * Note that in_ill could be NULL when called from ip_rput_forward_multicast 15237 * because ip_mroute drops this information. 15238 * 15239 */ 15240 void 15241 ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) 15242 { 15243 uint32_t pkt_len; 15244 queue_t *q; 15245 uint32_t sum; 15246 #define rptr ((uchar_t *)ipha) 15247 uint32_t max_frag; 15248 uint32_t ill_index; 15249 15250 /* Get the ill_index of the incoming ILL */ 15251 ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; 15252 15253 /* Initiate Read side IPPF processing */ 15254 if (IPP_ENABLED(IPP_FWD_IN)) { 15255 ip_process(IPP_FWD_IN, &mp, ill_index); 15256 if (mp == NULL) { 15257 ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ 15258 "during IPPF processing\n")); 15259 return; 15260 } 15261 } 15262 pkt_len = ntohs(ipha->ipha_length); 15263 15264 /* Adjust the checksum to reflect the ttl decrement. 
*/ 15265 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 15266 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 15267 15268 if (ipha->ipha_ttl-- <= 1) { 15269 if (ip_csum_hdr(ipha)) { 15270 BUMP_MIB(&ip_mib, ipInCksumErrs); 15271 goto drop_pkt; 15272 } 15273 /* 15274 * Note: ire_stq this will be NULL for multicast 15275 * datagrams using the long path through arp (the IRE 15276 * is not an IRE_CACHE). This should not cause 15277 * problems since we don't generate ICMP errors for 15278 * multicast packets. 15279 */ 15280 q = ire->ire_stq; 15281 if (q) 15282 icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED); 15283 else 15284 freemsg(mp); 15285 return; 15286 } 15287 15288 /* 15289 * Don't forward if the interface is down 15290 */ 15291 if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { 15292 BUMP_MIB(&ip_mib, ipInDiscards); 15293 goto drop_pkt; 15294 } 15295 15296 /* Get the ill_index of the outgoing ILL */ 15297 ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; 15298 15299 if (is_system_labeled()) { 15300 mblk_t *mp1; 15301 15302 if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { 15303 BUMP_MIB(&ip_mib, ipForwProhibits); 15304 goto drop_pkt; 15305 } 15306 /* Size may have changed */ 15307 mp = mp1; 15308 ipha = (ipha_t *)mp->b_rptr; 15309 pkt_len = ntohs(ipha->ipha_length); 15310 } 15311 15312 /* Check if there are options to update */ 15313 if (!IS_SIMPLE_IPH(ipha)) { 15314 if (ip_csum_hdr(ipha)) { 15315 BUMP_MIB(&ip_mib, ipInCksumErrs); 15316 goto drop_pkt; 15317 } 15318 if (ip_rput_forward_options(mp, ipha, ire)) { 15319 return; 15320 } 15321 15322 ipha->ipha_hdr_checksum = 0; 15323 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 15324 } 15325 max_frag = ire->ire_max_frag; 15326 if (pkt_len > max_frag) { 15327 /* 15328 * It needs fragging on its way out. We haven't 15329 * verified the header checksum yet. 
Since we 15330 * are going to put a surely good checksum in the 15331 * outgoing header, we have to make sure that it 15332 * was good coming in. 15333 */ 15334 if (ip_csum_hdr(ipha)) { 15335 BUMP_MIB(&ip_mib, ipInCksumErrs); 15336 goto drop_pkt; 15337 } 15338 /* Initiate Write side IPPF processing */ 15339 if (IPP_ENABLED(IPP_FWD_OUT)) { 15340 ip_process(IPP_FWD_OUT, &mp, ill_index); 15341 if (mp == NULL) { 15342 ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ 15343 " during IPPF processing\n")); 15344 return; 15345 } 15346 } 15347 ip_wput_frag(ire, mp, IB_PKT, max_frag, 0); 15348 return; 15349 } 15350 15351 mp = ip_wput_attach_llhdr(mp, ire, IPP_FWD_OUT, ill_index); 15352 if (mp == NULL) { 15353 BUMP_MIB(&ip_mib, ipInDiscards); 15354 return; 15355 } 15356 15357 q = ire->ire_stq; 15358 UPDATE_IB_PKT_COUNT(ire); 15359 ire->ire_last_used_time = lbolt; 15360 BUMP_MIB(&ip_mib, ipForwDatagrams); 15361 putnext(q, mp); 15362 return; 15363 15364 drop_pkt:; 15365 ip1dbg(("ip_rput_forward: drop pkt\n")); 15366 freemsg(mp); 15367 #undef rptr 15368 } 15369 15370 void 15371 ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) 15372 { 15373 ire_t *ire; 15374 15375 ASSERT(!ipif->ipif_isv6); 15376 /* 15377 * Find an IRE which matches the destination and the outgoing 15378 * queue in the cache table. All we need is an IRE_CACHE which 15379 * is pointing at ipif->ipif_ill. If it is part of some ill group, 15380 * then it is enough to have some IRE_CACHE in the group. 15381 */ 15382 if (ipif->ipif_flags & IPIF_POINTOPOINT) 15383 dst = ipif->ipif_pp_dst_addr; 15384 ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, MBLK_GETLABEL(mp), 15385 MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR); 15386 if (ire == NULL) { 15387 /* 15388 * Mark this packet to make it be delivered to 15389 * ip_rput_forward after the new ire has been 15390 * created. 
15391 */ 15392 mp->b_prev = NULL; 15393 mp->b_next = mp; 15394 ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, 15395 NULL, 0); 15396 } else { 15397 ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); 15398 IRE_REFRELE(ire); 15399 } 15400 } 15401 15402 /* Update any source route, record route or timestamp options */ 15403 static int 15404 ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire) 15405 { 15406 ipoptp_t opts; 15407 uchar_t *opt; 15408 uint8_t optval; 15409 uint8_t optlen; 15410 ipaddr_t dst; 15411 uint32_t ts; 15412 ire_t *dst_ire = NULL; 15413 ire_t *tmp_ire = NULL; 15414 timestruc_t now; 15415 15416 ip2dbg(("ip_rput_forward_options\n")); 15417 dst = ipha->ipha_dst; 15418 for (optval = ipoptp_first(&opts, ipha); 15419 optval != IPOPT_EOL; 15420 optval = ipoptp_next(&opts)) { 15421 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 15422 opt = opts.ipoptp_cur; 15423 optlen = opts.ipoptp_len; 15424 ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", 15425 optval, opts.ipoptp_len)); 15426 switch (optval) { 15427 uint32_t off; 15428 case IPOPT_SSRR: 15429 case IPOPT_LSRR: 15430 /* Check if adminstratively disabled */ 15431 if (!ip_forward_src_routed) { 15432 BUMP_MIB(&ip_mib, ipForwProhibits); 15433 if (ire->ire_stq) 15434 icmp_unreachable(ire->ire_stq, mp, 15435 ICMP_SOURCE_ROUTE_FAILED); 15436 else { 15437 ip0dbg(("ip_rput_forward_options: " 15438 "unable to send unreach\n")); 15439 freemsg(mp); 15440 } 15441 return (-1); 15442 } 15443 15444 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 15445 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 15446 if (dst_ire == NULL) { 15447 /* 15448 * Must be partial since ip_rput_options 15449 * checked for strict. 
15450 */ 15451 break; 15452 } 15453 off = opt[IPOPT_OFFSET]; 15454 off--; 15455 redo_srr: 15456 if (optlen < IP_ADDR_LEN || 15457 off > optlen - IP_ADDR_LEN) { 15458 /* End of source route */ 15459 ip1dbg(( 15460 "ip_rput_forward_options: end of SR\n")); 15461 ire_refrele(dst_ire); 15462 break; 15463 } 15464 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 15465 bcopy(&ire->ire_src_addr, (char *)opt + off, 15466 IP_ADDR_LEN); 15467 ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", 15468 ntohl(dst))); 15469 15470 /* 15471 * Check if our address is present more than 15472 * once as consecutive hops in source route. 15473 */ 15474 tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 15475 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 15476 if (tmp_ire != NULL) { 15477 ire_refrele(tmp_ire); 15478 off += IP_ADDR_LEN; 15479 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15480 goto redo_srr; 15481 } 15482 ipha->ipha_dst = dst; 15483 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15484 ire_refrele(dst_ire); 15485 break; 15486 case IPOPT_RR: 15487 off = opt[IPOPT_OFFSET]; 15488 off--; 15489 if (optlen < IP_ADDR_LEN || 15490 off > optlen - IP_ADDR_LEN) { 15491 /* No more room - ignore */ 15492 ip1dbg(( 15493 "ip_rput_forward_options: end of RR\n")); 15494 break; 15495 } 15496 bcopy(&ire->ire_src_addr, (char *)opt + off, 15497 IP_ADDR_LEN); 15498 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15499 break; 15500 case IPOPT_TS: 15501 /* Insert timestamp if there is room */ 15502 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 15503 case IPOPT_TS_TSONLY: 15504 off = IPOPT_TS_TIMELEN; 15505 break; 15506 case IPOPT_TS_PRESPEC: 15507 case IPOPT_TS_PRESPEC_RFC791: 15508 /* Verify that the address matched */ 15509 off = opt[IPOPT_OFFSET] - 1; 15510 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 15511 dst_ire = ire_ctable_lookup(dst, 0, 15512 IRE_LOCAL, NULL, ALL_ZONES, NULL, 15513 MATCH_IRE_TYPE); 15514 15515 if (dst_ire == NULL) { 15516 /* Not for us */ 15517 break; 15518 } 15519 ire_refrele(dst_ire); 15520 /* FALLTHRU */ 15521 case 
IPOPT_TS_TSANDADDR: 15522 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 15523 break; 15524 default: 15525 /* 15526 * ip_*put_options should have already 15527 * dropped this packet. 15528 */ 15529 cmn_err(CE_PANIC, "ip_rput_forward_options: " 15530 "unknown IT - bug in ip_rput_options?\n"); 15531 return (0); /* Keep "lint" happy */ 15532 } 15533 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 15534 /* Increase overflow counter */ 15535 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 15536 opt[IPOPT_POS_OV_FLG] = 15537 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 15538 (off << 4)); 15539 break; 15540 } 15541 off = opt[IPOPT_OFFSET] - 1; 15542 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 15543 case IPOPT_TS_PRESPEC: 15544 case IPOPT_TS_PRESPEC_RFC791: 15545 case IPOPT_TS_TSANDADDR: 15546 bcopy(&ire->ire_src_addr, 15547 (char *)opt + off, IP_ADDR_LEN); 15548 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 15549 /* FALLTHRU */ 15550 case IPOPT_TS_TSONLY: 15551 off = opt[IPOPT_OFFSET] - 1; 15552 /* Compute # of milliseconds since midnight */ 15553 gethrestime(&now); 15554 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 15555 now.tv_nsec / (NANOSEC / MILLISEC); 15556 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 15557 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 15558 break; 15559 } 15560 break; 15561 } 15562 } 15563 return (0); 15564 } 15565 15566 /* 15567 * This is called after processing at least one of AH/ESP headers. 15568 * 15569 * NOTE: the ill corresponding to ipsec_in_ill_index may not be 15570 * the actual, physical interface on which the packet was received, 15571 * but, when ip_strict_dst_multihoming is set to 1, could be the 15572 * interface which had the ipha_dst configured when the packet went 15573 * through ip_rput. 
The ill_index corresponding to the recv_ill
 * is saved in ipsec_in_rill_index
 */
/*
 * Parameters:
 *	ipsec_mp - IPSEC_IN message; its b_cont is the packet itself.
 *	ill, recv_ill - may both be NULL, in which case they are looked up
 *		from the ifindexes saved in the IPSEC_IN and released again
 *		before returning.
 *	ire	- may be NULL for IPv4, in which case it is looked up in the
 *		cache and released here.
 */
void
ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
{
	ipsec_in_t	*ii = (ipsec_in_t *)ipsec_mp->b_rptr;
	mblk_t		*mp;
	ipha_t		*ipha;
	ip6_t		*ip6h;
	ipaddr_t	dst;
	boolean_t	held_ill = B_FALSE;	/* we looked up ill */
	boolean_t	held_recv_ill = B_FALSE; /* we looked up recv_ill */
	boolean_t	held_ire = B_FALSE;	/* we looked up ire */

	ASSERT(ii->ipsec_in_ill_index != 0);

	mp = ipsec_mp->b_cont;
	ASSERT(mp != NULL);

	if (ill == NULL) {
		ASSERT(recv_ill == NULL);
		/*
		 * Recover the ill whose queue ip_rput_local or
		 * ip_rput_data_v6 was originally invoked on.
		 */
		ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
		    !ii->ipsec_in_v4, NULL, NULL, NULL, NULL);
		held_ill = B_TRUE;

		if (ii->ipsec_in_ill_index == ii->ipsec_in_rill_index) {
			/* Same interface: share the one reference. */
			recv_ill = ill;
		} else {
			recv_ill = ill_lookup_on_ifindex(
			    ii->ipsec_in_rill_index, !ii->ipsec_in_v4,
			    NULL, NULL, NULL, NULL);
			held_recv_ill = B_TRUE;
		}

		if (ill == NULL || recv_ill == NULL) {
			ip0dbg(("ip_fanout_proto_again: interface "
			    "disappeared\n"));
			if (ill != NULL)
				ill_refrele(ill);
			if (recv_ill != NULL)
				ill_refrele(recv_ill);
			freemsg(ipsec_mp);
			return;
		}
	}

	ASSERT(ill != NULL && recv_ill != NULL);

	if (mp->b_datap->db_type == M_CTL) {
		/*
		 * An ICMP message is coming back from AH/ESP with their
		 * headers stripped.  Keep fanning out until it reaches
		 * the right protocol.
		 */
		if (ii->ipsec_in_v4) {
			ipha_t	*outer_ipha;
			ipha_t	*inner_ipha;
			icmph_t	*icmph;
			int	outer_len;
			int	inner_len;

			outer_ipha = (ipha_t *)mp->b_rptr;
			outer_len = IPH_HDR_LENGTH(outer_ipha);
			icmph = (icmph_t *)(mp->b_rptr + outer_len);
			/* The offending packet's header follows the ICMP one */
			inner_ipha = (ipha_t *)(icmph + 1);
			inner_len = IPH_HDR_LENGTH(inner_ipha);
			/*
			 * Switch the block back to M_DATA since
			 * icmp_inbound_error_fanout may need to pullupmsg.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp,
			    icmph, inner_ipha, outer_len, inner_len, B_TRUE,
			    B_FALSE, ill, ii->ipsec_in_zoneid);
		} else {
			icmp6_t	*icmp6;
			int	hdr_length;

			ip6h = (ip6_t *)mp->b_rptr;
			/* Only walk the headers when it cannot be avoided. */
			if (ip6h->ip6_nxt == IPPROTO_ICMPV6)
				hdr_length = IPV6_HDR_LEN;
			else
				hdr_length = ip_hdr_length_v6(mp, ip6h);

			icmp6 = (icmp6_t *)(mp->b_rptr + hdr_length);
			/*
			 * Switch the block back to M_DATA since
			 * icmp_inbound_error_fanout_v6 may need to pullupmsg.
			 */
			mp->b_datap->db_type = M_DATA;
			icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
			    ip6h, icmp6, ill, B_TRUE, ii->ipsec_in_zoneid);
		}
		goto done;
	}

	if (!ii->ipsec_in_v4) {
		uint32_t rput_flags = 0;

		ip6h = (ip6_t *)mp->b_rptr;
		/*
		 * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast
		 * address.
		 *
		 * Currently, we don't store that state in the IPSEC_IN
		 * message, and we may need to.
		 */
		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
			rput_flags |= IP6_IN_LLMCAST;
		ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags,
		    NULL);
		goto done;
	}

	/* IPv4 data packet. */
	ipha = (ipha_t *)mp->b_rptr;
	dst = ipha->ipha_dst;
	if (CLASSD(dst)) {
		/* Multicast has to be delivered to all streams. */
		dst = INADDR_BROADCAST;
	}

	if (ire == NULL) {
		ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid,
		    MBLK_GETLABEL(mp));
		if (ire == NULL) {
			if (held_ill)
				ill_refrele(ill);
			if (held_recv_ill)
				ill_refrele(recv_ill);
			ip1dbg(("ip_fanout_proto_again: IRE not found"));
			freemsg(ipsec_mp);
			return;
		}
		held_ire = B_TRUE;
	}

	switch (ipha->ipha_protocol) {
	case IPPROTO_TCP:
		/* An extra hold is taken when the ire is the caller's. */
		if (!held_ire)
			IRE_REFHOLD(ire);
		mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
		    ire, ipsec_mp, 0, ill->ill_rq, NULL);
		IRE_REFRELE(ire);
		if (mp != NULL)
			squeue_enter_chain(GET_SQUEUE(mp), mp,
			    mp, 1, SQTAG_IP_PROTO_AGAIN);
		break;
	case IPPROTO_SCTP:
		/* No release follows the call in this function. */
		if (!held_ire)
			IRE_REFHOLD(ire);
		ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
		    ipsec_mp, 0, ill->ill_rq, dst);
		break;
	case IPPROTO_UDP:
		ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
		    recv_ill);
		if (held_ire)
			ire_refrele(ire);
		break;
	default:
		ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
		    recv_ill);
		if (held_ire)
			ire_refrele(ire);
		break;
	}

done:
	/* Drop only the ill references this function acquired itself. */
	if (held_ill)
		ill_refrele(ill);
	if (held_recv_ill)
		ill_refrele(recv_ill);
}

/*
 * Call ill_frag_timeout to do garbage collection.
ill_frag_timeout 15761 * returns 'true' if there are still fragments left on the queue, in 15762 * which case we restart the timer. 15763 */ 15764 void 15765 ill_frag_timer(void *arg) 15766 { 15767 ill_t *ill = (ill_t *)arg; 15768 boolean_t frag_pending; 15769 15770 mutex_enter(&ill->ill_lock); 15771 ASSERT(!ill->ill_fragtimer_executing); 15772 if (ill->ill_state_flags & ILL_CONDEMNED) { 15773 ill->ill_frag_timer_id = 0; 15774 mutex_exit(&ill->ill_lock); 15775 return; 15776 } 15777 ill->ill_fragtimer_executing = 1; 15778 mutex_exit(&ill->ill_lock); 15779 15780 frag_pending = ill_frag_timeout(ill, ip_g_frag_timeout); 15781 15782 /* 15783 * Restart the timer, if we have fragments pending or if someone 15784 * wanted us to be scheduled again. 15785 */ 15786 mutex_enter(&ill->ill_lock); 15787 ill->ill_fragtimer_executing = 0; 15788 ill->ill_frag_timer_id = 0; 15789 if (frag_pending || ill->ill_fragtimer_needrestart) 15790 ill_frag_timer_start(ill); 15791 mutex_exit(&ill->ill_lock); 15792 } 15793 15794 void 15795 ill_frag_timer_start(ill_t *ill) 15796 { 15797 ASSERT(MUTEX_HELD(&ill->ill_lock)); 15798 15799 /* If the ill is closing or opening don't proceed */ 15800 if (ill->ill_state_flags & ILL_CONDEMNED) 15801 return; 15802 15803 if (ill->ill_fragtimer_executing) { 15804 /* 15805 * ill_frag_timer is currently executing. Just record the 15806 * the fact that we want the timer to be restarted. 15807 * ill_frag_timer will post a timeout before it returns, 15808 * ensuring it will be called again. 15809 */ 15810 ill->ill_fragtimer_needrestart = 1; 15811 return; 15812 } 15813 15814 if (ill->ill_frag_timer_id == 0) { 15815 /* 15816 * The timer is neither running nor is the timeout handler 15817 * executing. 
Post a timeout so that ill_frag_timer will be 15818 * called 15819 */ 15820 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill, 15821 MSEC_TO_TICK(ip_g_frag_timo_ms >> 1)); 15822 ill->ill_fragtimer_needrestart = 0; 15823 } 15824 } 15825 15826 /* 15827 * This routine is needed for loopback when forwarding multicasts. 15828 * 15829 * IPQoS Notes: 15830 * IPPF processing is done in fanout routines. 15831 * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, 15832 * processing for IPSec packets is done when it comes back in clear. 15833 * NOTE : The callers of this function need to do the ire_refrele for the 15834 * ire that is being passed in. 15835 */ 15836 void 15837 ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, 15838 ill_t *recv_ill) 15839 { 15840 ill_t *ill = (ill_t *)q->q_ptr; 15841 uint32_t sum; 15842 uint32_t u1; 15843 uint32_t u2; 15844 int hdr_length; 15845 boolean_t mctl_present; 15846 mblk_t *first_mp = mp; 15847 mblk_t *hada_mp = NULL; 15848 ipha_t *inner_ipha; 15849 15850 TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, 15851 "ip_rput_locl_start: q %p", q); 15852 15853 ASSERT(ire->ire_ipversion == IPV4_VERSION); 15854 15855 15856 #define rptr ((uchar_t *)ipha) 15857 #define iphs ((uint16_t *)ipha) 15858 15859 /* 15860 * no UDP or TCP packet should come here anymore. 15861 */ 15862 ASSERT((ipha->ipha_protocol != IPPROTO_TCP) && 15863 (ipha->ipha_protocol != IPPROTO_UDP)); 15864 15865 EXTRACT_PKT_MP(mp, first_mp, mctl_present); 15866 if (mctl_present && 15867 ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { 15868 ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); 15869 15870 /* 15871 * It's an IPsec accelerated packet. 15872 * Keep a pointer to the data attributes around until 15873 * we allocate the ipsec_info_t. 
15874 */ 15875 IPSECHW_DEBUG(IPSECHW_PKT, 15876 ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); 15877 hada_mp = first_mp; 15878 hada_mp->b_cont = NULL; 15879 /* 15880 * Since it is accelerated, it comes directly from 15881 * the ill and the data attributes is followed by 15882 * the packet data. 15883 */ 15884 ASSERT(mp->b_datap->db_type != M_CTL); 15885 first_mp = mp; 15886 mctl_present = B_FALSE; 15887 } 15888 15889 /* 15890 * IF M_CTL is not present, then ipsec_in_is_secure 15891 * should return B_TRUE. There is a case where loopback 15892 * packets has an M_CTL in the front with all the 15893 * IPSEC options set to IPSEC_PREF_NEVER - which means 15894 * ipsec_in_is_secure will return B_FALSE. As loopback 15895 * packets never comes here, it is safe to ASSERT the 15896 * following. 15897 */ 15898 ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); 15899 15900 15901 /* u1 is # words of IP options */ 15902 u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) 15903 + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 15904 15905 if (u1) { 15906 if (!ip_options_cksum(q, mp, ipha, ire)) { 15907 if (hada_mp != NULL) 15908 freemsg(hada_mp); 15909 return; 15910 } 15911 } else { 15912 /* Check the IP header checksum. */ 15913 #define uph ((uint16_t *)ipha) 15914 sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + 15915 uph[6] + uph[7] + uph[8] + uph[9]; 15916 #undef uph 15917 /* finish doing IP checksum */ 15918 sum = (sum & 0xFFFF) + (sum >> 16); 15919 sum = ~(sum + (sum >> 16)) & 0xFFFF; 15920 /* 15921 * Don't verify header checksum if this packet is coming 15922 * back from AH/ESP as we already did it. 15923 */ 15924 if (!mctl_present && (sum && sum != 0xFFFF)) { 15925 BUMP_MIB(&ip_mib, ipInCksumErrs); 15926 goto drop_pkt; 15927 } 15928 } 15929 15930 /* 15931 * Count for SNMP of inbound packets for ire. As ip_proto_input 15932 * might be called more than once for secure packets, count only 15933 * the first time. 
15934 */ 15935 if (!mctl_present) { 15936 UPDATE_IB_PKT_COUNT(ire); 15937 ire->ire_last_used_time = lbolt; 15938 } 15939 15940 /* Check for fragmentation offset. */ 15941 u2 = ntohs(ipha->ipha_fragment_offset_and_flags); 15942 u1 = u2 & (IPH_MF | IPH_OFFSET); 15943 if (u1) { 15944 /* 15945 * We re-assemble fragments before we do the AH/ESP 15946 * processing. Thus, M_CTL should not be present 15947 * while we are re-assembling. 15948 */ 15949 ASSERT(!mctl_present); 15950 ASSERT(first_mp == mp); 15951 if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { 15952 return; 15953 } 15954 /* 15955 * Make sure that first_mp points back to mp as 15956 * the mp we came in with could have changed in 15957 * ip_rput_fragment(). 15958 */ 15959 ipha = (ipha_t *)mp->b_rptr; 15960 first_mp = mp; 15961 } 15962 15963 /* 15964 * Clear hardware checksumming flag as it is currently only 15965 * used by TCP and UDP. 15966 */ 15967 DB_CKSUMFLAGS(mp) = 0; 15968 15969 /* Now we have a complete datagram, destined for this machine. */ 15970 u1 = IPH_HDR_LENGTH(ipha); 15971 switch (ipha->ipha_protocol) { 15972 case IPPROTO_ICMP: { 15973 ire_t *ire_zone; 15974 ilm_t *ilm; 15975 mblk_t *mp1; 15976 zoneid_t last_zoneid; 15977 15978 if (CLASSD(ipha->ipha_dst) && 15979 !(recv_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) { 15980 ASSERT(ire->ire_type == IRE_BROADCAST); 15981 /* 15982 * In the multicast case, applications may have joined 15983 * the group from different zones, so we need to deliver 15984 * the packet to each of them. Loop through the 15985 * multicast memberships structures (ilm) on the receive 15986 * ill and send a copy of the packet up each matching 15987 * one. However, we don't do this for multicasts sent on 15988 * the loopback interface (PHYI_LOOPBACK flag set) as 15989 * they must stay in the sender's zone. 15990 * 15991 * ilm_add_v6() ensures that ilms in the same zone are 15992 * contiguous in the ill_ilm list. 
We use this property 15993 * to avoid sending duplicates needed when two 15994 * applications in the same zone join the same group on 15995 * different logical interfaces: we ignore the ilm if 15996 * its zoneid is the same as the last matching one. 15997 * In addition, the sending of the packet for 15998 * ire_zoneid is delayed until all of the other ilms 15999 * have been exhausted. 16000 */ 16001 last_zoneid = -1; 16002 ILM_WALKER_HOLD(recv_ill); 16003 for (ilm = recv_ill->ill_ilm; ilm != NULL; 16004 ilm = ilm->ilm_next) { 16005 if ((ilm->ilm_flags & ILM_DELETED) || 16006 ipha->ipha_dst != ilm->ilm_addr || 16007 ilm->ilm_zoneid == last_zoneid || 16008 ilm->ilm_zoneid == ire->ire_zoneid || 16009 ilm->ilm_zoneid == ALL_ZONES || 16010 !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) 16011 continue; 16012 mp1 = ip_copymsg(first_mp); 16013 if (mp1 == NULL) 16014 continue; 16015 icmp_inbound(q, mp1, B_TRUE, ill, 16016 0, sum, mctl_present, B_TRUE, 16017 recv_ill, ilm->ilm_zoneid); 16018 last_zoneid = ilm->ilm_zoneid; 16019 } 16020 ILM_WALKER_RELE(recv_ill); 16021 } else if (ire->ire_type == IRE_BROADCAST) { 16022 /* 16023 * In the broadcast case, there may be many zones 16024 * which need a copy of the packet delivered to them. 16025 * There is one IRE_BROADCAST per broadcast address 16026 * and per zone; we walk those using a helper function. 16027 * In addition, the sending of the packet for ire is 16028 * delayed until all of the other ires have been 16029 * processed. 
16030 */ 16031 IRB_REFHOLD(ire->ire_bucket); 16032 ire_zone = NULL; 16033 while ((ire_zone = ire_get_next_bcast_ire(ire_zone, 16034 ire)) != NULL) { 16035 mp1 = ip_copymsg(first_mp); 16036 if (mp1 == NULL) 16037 continue; 16038 16039 UPDATE_IB_PKT_COUNT(ire_zone); 16040 ire_zone->ire_last_used_time = lbolt; 16041 icmp_inbound(q, mp1, B_TRUE, ill, 16042 0, sum, mctl_present, B_TRUE, 16043 recv_ill, ire_zone->ire_zoneid); 16044 } 16045 IRB_REFRELE(ire->ire_bucket); 16046 } 16047 icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), 16048 ill, 0, sum, mctl_present, B_TRUE, recv_ill, 16049 ire->ire_zoneid); 16050 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16051 "ip_rput_locl_end: q %p (%S)", q, "icmp"); 16052 return; 16053 } 16054 case IPPROTO_IGMP: 16055 /* 16056 * If we are not willing to accept IGMP packets in clear, 16057 * then check with global policy. 16058 */ 16059 if (igmp_accept_clear_messages == 0) { 16060 first_mp = ipsec_check_global_policy(first_mp, NULL, 16061 ipha, NULL, mctl_present); 16062 if (first_mp == NULL) 16063 return; 16064 } 16065 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 16066 freemsg(first_mp); 16067 ip1dbg(("ip_proto_input: zone all cannot accept raw")); 16068 BUMP_MIB(&ip_mib, ipInDiscards); 16069 return; 16070 } 16071 if (igmp_input(q, mp, ill)) { 16072 /* Bad packet - discarded by igmp_input */ 16073 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16074 "ip_rput_locl_end: q %p (%S)", q, "igmp"); 16075 if (mctl_present) 16076 freeb(first_mp); 16077 return; 16078 } 16079 /* 16080 * igmp_input() may have pulled up the message so ipha needs to 16081 * be reinitialized. 
16082 */ 16083 ipha = (ipha_t *)mp->b_rptr; 16084 if (ipcl_proto_search(ipha->ipha_protocol) == NULL) { 16085 /* No user-level listener for IGMP packets */ 16086 goto drop_pkt; 16087 } 16088 /* deliver to local raw users */ 16089 break; 16090 case IPPROTO_PIM: 16091 /* 16092 * If we are not willing to accept PIM packets in clear, 16093 * then check with global policy. 16094 */ 16095 if (pim_accept_clear_messages == 0) { 16096 first_mp = ipsec_check_global_policy(first_mp, NULL, 16097 ipha, NULL, mctl_present); 16098 if (first_mp == NULL) 16099 return; 16100 } 16101 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { 16102 freemsg(first_mp); 16103 ip1dbg(("ip_proto_input: zone all cannot accept PIM")); 16104 BUMP_MIB(&ip_mib, ipInDiscards); 16105 return; 16106 } 16107 if (pim_input(q, mp) != 0) { 16108 /* Bad packet - discarded by pim_input */ 16109 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16110 "ip_rput_locl_end: q %p (%S)", q, "pim"); 16111 if (mctl_present) 16112 freeb(first_mp); 16113 return; 16114 } 16115 16116 /* 16117 * pim_input() may have pulled up the message so ipha needs to 16118 * be reinitialized. 16119 */ 16120 ipha = (ipha_t *)mp->b_rptr; 16121 if (ipcl_proto_search(ipha->ipha_protocol) == NULL) { 16122 /* No user-level listener for PIM packets */ 16123 goto drop_pkt; 16124 } 16125 /* deliver to local raw users */ 16126 break; 16127 case IPPROTO_ENCAP: 16128 /* 16129 * Handle self-encapsulated packets (IP-in-IP where 16130 * the inner addresses == the outer addresses). 16131 */ 16132 hdr_length = IPH_HDR_LENGTH(ipha); 16133 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 16134 mp->b_wptr) { 16135 if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + 16136 sizeof (ipha_t) - mp->b_rptr)) { 16137 BUMP_MIB(&ip_mib, ipInDiscards); 16138 freemsg(first_mp); 16139 return; 16140 } 16141 ipha = (ipha_t *)mp->b_rptr; 16142 } 16143 inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 16144 /* 16145 * Check the sanity of the inner IP header. 
16146 */ 16147 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 16148 BUMP_MIB(&ip_mib, ipInDiscards); 16149 freemsg(first_mp); 16150 return; 16151 } 16152 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 16153 BUMP_MIB(&ip_mib, ipInDiscards); 16154 freemsg(first_mp); 16155 return; 16156 } 16157 if (inner_ipha->ipha_src == ipha->ipha_src && 16158 inner_ipha->ipha_dst == ipha->ipha_dst) { 16159 ipsec_in_t *ii; 16160 16161 /* 16162 * Self-encapsulated tunnel packet. Remove 16163 * the outer IP header and fanout again. 16164 * We also need to make sure that the inner 16165 * header is pulled up until options. 16166 */ 16167 mp->b_rptr = (uchar_t *)inner_ipha; 16168 ipha = inner_ipha; 16169 hdr_length = IPH_HDR_LENGTH(ipha); 16170 if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { 16171 if (!pullupmsg(mp, (uchar_t *)ipha + 16172 + hdr_length - mp->b_rptr)) { 16173 freemsg(first_mp); 16174 return; 16175 } 16176 ipha = (ipha_t *)mp->b_rptr; 16177 } 16178 if (!mctl_present) { 16179 ASSERT(first_mp == mp); 16180 /* 16181 * This means that somebody is sending 16182 * Self-encapsualted packets without AH/ESP. 16183 * If AH/ESP was present, we would have already 16184 * allocated the first_mp. 16185 */ 16186 if ((first_mp = ipsec_in_alloc(B_TRUE)) == 16187 NULL) { 16188 ip1dbg(("ip_proto_input: IPSEC_IN " 16189 "allocation failure.\n")); 16190 BUMP_MIB(&ip_mib, ipInDiscards); 16191 freemsg(mp); 16192 return; 16193 } 16194 first_mp->b_cont = mp; 16195 } 16196 /* 16197 * We generally store the ill_index if we need to 16198 * do IPSEC processing as we lose the ill queue when 16199 * we come back. But in this case, we never should 16200 * have to store the ill_index here as it should have 16201 * been stored previously when we processed the 16202 * AH/ESP header in this routine or for non-ipsec 16203 * cases, we still have the queue. But for some bad 16204 * packets from the wire, we can get to IPSEC after 16205 * this and we better store the index for that case. 
16206 */ 16207 ill = (ill_t *)q->q_ptr; 16208 ii = (ipsec_in_t *)first_mp->b_rptr; 16209 ii->ipsec_in_ill_index = 16210 ill->ill_phyint->phyint_ifindex; 16211 ii->ipsec_in_rill_index = 16212 recv_ill->ill_phyint->phyint_ifindex; 16213 if (ii->ipsec_in_decaps) { 16214 /* 16215 * This packet is self-encapsulated multiple 16216 * times. We don't want to recurse infinitely. 16217 * To keep it simple, drop the packet. 16218 */ 16219 BUMP_MIB(&ip_mib, ipInDiscards); 16220 freemsg(first_mp); 16221 return; 16222 } 16223 ii->ipsec_in_decaps = B_TRUE; 16224 ip_proto_input(q, first_mp, ipha, ire, recv_ill); 16225 return; 16226 } 16227 break; 16228 case IPPROTO_AH: 16229 case IPPROTO_ESP: { 16230 /* 16231 * Fast path for AH/ESP. If this is the first time 16232 * we are sending a datagram to AH/ESP, allocate 16233 * a IPSEC_IN message and prepend it. Otherwise, 16234 * just fanout. 16235 */ 16236 16237 int ipsec_rc; 16238 ipsec_in_t *ii; 16239 16240 IP_STAT(ipsec_proto_ahesp); 16241 if (!mctl_present) { 16242 ASSERT(first_mp == mp); 16243 if ((first_mp = ipsec_in_alloc(B_TRUE)) == NULL) { 16244 ip1dbg(("ip_proto_input: IPSEC_IN " 16245 "allocation failure.\n")); 16246 freemsg(hada_mp); /* okay ifnull */ 16247 BUMP_MIB(&ip_mib, ipInDiscards); 16248 freemsg(mp); 16249 return; 16250 } 16251 /* 16252 * Store the ill_index so that when we come back 16253 * from IPSEC we ride on the same queue. 16254 */ 16255 ill = (ill_t *)q->q_ptr; 16256 ii = (ipsec_in_t *)first_mp->b_rptr; 16257 ii->ipsec_in_ill_index = 16258 ill->ill_phyint->phyint_ifindex; 16259 ii->ipsec_in_rill_index = 16260 recv_ill->ill_phyint->phyint_ifindex; 16261 first_mp->b_cont = mp; 16262 /* 16263 * Cache hardware acceleration info. 
16264 */ 16265 if (hada_mp != NULL) { 16266 IPSECHW_DEBUG(IPSECHW_PKT, 16267 ("ip_rput_local: caching data attr.\n")); 16268 ii->ipsec_in_accelerated = B_TRUE; 16269 ii->ipsec_in_da = hada_mp; 16270 hada_mp = NULL; 16271 } 16272 } else { 16273 ii = (ipsec_in_t *)first_mp->b_rptr; 16274 } 16275 16276 if (!ipsec_loaded()) { 16277 ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, 16278 ire->ire_zoneid); 16279 return; 16280 } 16281 16282 /* select inbound SA and have IPsec process the pkt */ 16283 if (ipha->ipha_protocol == IPPROTO_ESP) { 16284 esph_t *esph = ipsec_inbound_esp_sa(first_mp); 16285 if (esph == NULL) 16286 return; 16287 ASSERT(ii->ipsec_in_esp_sa != NULL); 16288 ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); 16289 ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( 16290 first_mp, esph); 16291 } else { 16292 ah_t *ah = ipsec_inbound_ah_sa(first_mp); 16293 if (ah == NULL) 16294 return; 16295 ASSERT(ii->ipsec_in_ah_sa != NULL); 16296 ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); 16297 ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( 16298 first_mp, ah); 16299 } 16300 16301 switch (ipsec_rc) { 16302 case IPSEC_STATUS_SUCCESS: 16303 break; 16304 case IPSEC_STATUS_FAILED: 16305 BUMP_MIB(&ip_mib, ipInDiscards); 16306 /* FALLTHRU */ 16307 case IPSEC_STATUS_PENDING: 16308 return; 16309 } 16310 /* we're done with IPsec processing, send it up */ 16311 ip_fanout_proto_again(first_mp, ill, recv_ill, ire); 16312 return; 16313 } 16314 default: 16315 break; 16316 } 16317 if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { 16318 ip1dbg(("ip_proto_input: zone %d cannot accept raw IP", 16319 ire->ire_zoneid)); 16320 goto drop_pkt; 16321 } 16322 /* 16323 * Handle protocols with which IP is less intimate. There 16324 * can be more than one stream bound to a particular 16325 * protocol. When this is the case, each one gets a copy 16326 * of any incoming packets. 
16327 */ 16328 ip_fanout_proto(q, first_mp, ill, ipha, 16329 IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, 16330 B_TRUE, recv_ill, ire->ire_zoneid); 16331 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16332 "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); 16333 return; 16334 16335 drop_pkt: 16336 freemsg(first_mp); 16337 if (hada_mp != NULL) 16338 freeb(hada_mp); 16339 TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, 16340 "ip_rput_locl_end: q %p (%S)", q, "droppkt"); 16341 #undef rptr 16342 #undef iphs 16343 16344 } 16345 16346 /* 16347 * Update any source route, record route or timestamp options. 16348 * Check that we are at end of strict source route. 16349 * The options have already been checked for sanity in ip_rput_options(). 16350 */ 16351 static boolean_t 16352 ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire) 16353 { 16354 ipoptp_t opts; 16355 uchar_t *opt; 16356 uint8_t optval; 16357 uint8_t optlen; 16358 ipaddr_t dst; 16359 uint32_t ts; 16360 ire_t *dst_ire; 16361 timestruc_t now; 16362 16363 ASSERT(ire->ire_ipversion == IPV4_VERSION); 16364 16365 ip2dbg(("ip_rput_local_options\n")); 16366 16367 for (optval = ipoptp_first(&opts, ipha); 16368 optval != IPOPT_EOL; 16369 optval = ipoptp_next(&opts)) { 16370 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 16371 opt = opts.ipoptp_cur; 16372 optlen = opts.ipoptp_len; 16373 ip2dbg(("ip_rput_local_options: opt %d, len %d\n", 16374 optval, optlen)); 16375 switch (optval) { 16376 uint32_t off; 16377 case IPOPT_SSRR: 16378 case IPOPT_LSRR: 16379 off = opt[IPOPT_OFFSET]; 16380 off--; 16381 if (optlen < IP_ADDR_LEN || 16382 off > optlen - IP_ADDR_LEN) { 16383 /* End of source route */ 16384 ip1dbg(("ip_rput_local_options: end of SR\n")); 16385 break; 16386 } 16387 /* 16388 * This will only happen if two consecutive entries 16389 * in the source route contains our address or if 16390 * it is a packet with a loose source route which 16391 * reaches us before consuming the whole source route 
16392 */ 16393 ip1dbg(("ip_rput_local_options: not end of SR\n")); 16394 if (optval == IPOPT_SSRR) { 16395 goto bad_src_route; 16396 } 16397 /* 16398 * Hack: instead of dropping the packet truncate the 16399 * source route to what has been used by filling the 16400 * rest with IPOPT_NOP. 16401 */ 16402 opt[IPOPT_OLEN] = (uint8_t)off; 16403 while (off < optlen) { 16404 opt[off++] = IPOPT_NOP; 16405 } 16406 break; 16407 case IPOPT_RR: 16408 off = opt[IPOPT_OFFSET]; 16409 off--; 16410 if (optlen < IP_ADDR_LEN || 16411 off > optlen - IP_ADDR_LEN) { 16412 /* No more room - ignore */ 16413 ip1dbg(( 16414 "ip_rput_local_options: end of RR\n")); 16415 break; 16416 } 16417 bcopy(&ire->ire_src_addr, (char *)opt + off, 16418 IP_ADDR_LEN); 16419 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16420 break; 16421 case IPOPT_TS: 16422 /* Insert timestamp if there is romm */ 16423 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16424 case IPOPT_TS_TSONLY: 16425 off = IPOPT_TS_TIMELEN; 16426 break; 16427 case IPOPT_TS_PRESPEC: 16428 case IPOPT_TS_PRESPEC_RFC791: 16429 /* Verify that the address matched */ 16430 off = opt[IPOPT_OFFSET] - 1; 16431 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16432 dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 16433 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 16434 if (dst_ire == NULL) { 16435 /* Not for us */ 16436 break; 16437 } 16438 ire_refrele(dst_ire); 16439 /* FALLTHRU */ 16440 case IPOPT_TS_TSANDADDR: 16441 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16442 break; 16443 default: 16444 /* 16445 * ip_*put_options should have already 16446 * dropped this packet. 
16447 */ 16448 cmn_err(CE_PANIC, "ip_rput_local_options: " 16449 "unknown IT - bug in ip_rput_options?\n"); 16450 return (B_TRUE); /* Keep "lint" happy */ 16451 } 16452 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 16453 /* Increase overflow counter */ 16454 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 16455 opt[IPOPT_POS_OV_FLG] = 16456 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) | 16457 (off << 4)); 16458 break; 16459 } 16460 off = opt[IPOPT_OFFSET] - 1; 16461 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16462 case IPOPT_TS_PRESPEC: 16463 case IPOPT_TS_PRESPEC_RFC791: 16464 case IPOPT_TS_TSANDADDR: 16465 bcopy(&ire->ire_src_addr, (char *)opt + off, 16466 IP_ADDR_LEN); 16467 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 16468 /* FALLTHRU */ 16469 case IPOPT_TS_TSONLY: 16470 off = opt[IPOPT_OFFSET] - 1; 16471 /* Compute # of milliseconds since midnight */ 16472 gethrestime(&now); 16473 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 16474 now.tv_nsec / (NANOSEC / MILLISEC); 16475 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 16476 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 16477 break; 16478 } 16479 break; 16480 } 16481 } 16482 return (B_TRUE); 16483 16484 bad_src_route: 16485 q = WR(q); 16486 /* make sure we clear any indication of a hardware checksum */ 16487 DB_CKSUMFLAGS(mp) = 0; 16488 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED); 16489 return (B_FALSE); 16490 16491 } 16492 16493 /* 16494 * Process IP options in an inbound packet. If an option affects the 16495 * effective destination address, return the next hop address via dstp. 16496 * Returns -1 if something fails in which case an ICMP error has been sent 16497 * and mp freed. 
16498 */ 16499 static int 16500 ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp) 16501 { 16502 ipoptp_t opts; 16503 uchar_t *opt; 16504 uint8_t optval; 16505 uint8_t optlen; 16506 ipaddr_t dst; 16507 intptr_t code = 0; 16508 ire_t *ire = NULL; 16509 16510 ip2dbg(("ip_rput_options\n")); 16511 dst = ipha->ipha_dst; 16512 for (optval = ipoptp_first(&opts, ipha); 16513 optval != IPOPT_EOL; 16514 optval = ipoptp_next(&opts)) { 16515 opt = opts.ipoptp_cur; 16516 optlen = opts.ipoptp_len; 16517 ip2dbg(("ip_rput_options: opt %d, len %d\n", 16518 optval, optlen)); 16519 /* 16520 * Note: we need to verify the checksum before we 16521 * modify anything thus this routine only extracts the next 16522 * hop dst from any source route. 16523 */ 16524 switch (optval) { 16525 uint32_t off; 16526 case IPOPT_SSRR: 16527 case IPOPT_LSRR: 16528 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 16529 ALL_ZONES, NULL, MATCH_IRE_TYPE); 16530 if (ire == NULL) { 16531 if (optval == IPOPT_SSRR) { 16532 ip1dbg(("ip_rput_options: not next" 16533 " strict source route 0x%x\n", 16534 ntohl(dst))); 16535 code = (char *)&ipha->ipha_dst - 16536 (char *)ipha; 16537 goto param_prob; /* RouterReq's */ 16538 } 16539 ip2dbg(("ip_rput_options: " 16540 "not next source route 0x%x\n", 16541 ntohl(dst))); 16542 break; 16543 } 16544 ire_refrele(ire); 16545 16546 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 16547 ip1dbg(( 16548 "ip_rput_options: bad option offset\n")); 16549 code = (char *)&opt[IPOPT_OLEN] - 16550 (char *)ipha; 16551 goto param_prob; 16552 } 16553 off = opt[IPOPT_OFFSET]; 16554 off--; 16555 redo_srr: 16556 if (optlen < IP_ADDR_LEN || 16557 off > optlen - IP_ADDR_LEN) { 16558 /* End of source route */ 16559 ip1dbg(("ip_rput_options: end of SR\n")); 16560 break; 16561 } 16562 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 16563 ip1dbg(("ip_rput_options: next hop 0x%x\n", 16564 ntohl(dst))); 16565 16566 /* 16567 * Check if our address is present more than 16568 * once as 
consecutive hops in source route. 16569 * XXX verify per-interface ip_forwarding 16570 * for source route? 16571 */ 16572 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, 16573 ALL_ZONES, NULL, MATCH_IRE_TYPE); 16574 16575 if (ire != NULL) { 16576 ire_refrele(ire); 16577 off += IP_ADDR_LEN; 16578 goto redo_srr; 16579 } 16580 16581 if (dst == htonl(INADDR_LOOPBACK)) { 16582 ip1dbg(("ip_rput_options: loopback addr in " 16583 "source route!\n")); 16584 goto bad_src_route; 16585 } 16586 /* 16587 * For strict: verify that dst is directly 16588 * reachable. 16589 */ 16590 if (optval == IPOPT_SSRR) { 16591 ire = ire_ftable_lookup(dst, 0, 0, 16592 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 16593 MBLK_GETLABEL(mp), 16594 MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 16595 if (ire == NULL) { 16596 ip1dbg(("ip_rput_options: SSRR not " 16597 "directly reachable: 0x%x\n", 16598 ntohl(dst))); 16599 goto bad_src_route; 16600 } 16601 ire_refrele(ire); 16602 } 16603 /* 16604 * Defer update of the offset and the record route 16605 * until the packet is forwarded. 16606 */ 16607 break; 16608 case IPOPT_RR: 16609 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 16610 ip1dbg(( 16611 "ip_rput_options: bad option offset\n")); 16612 code = (char *)&opt[IPOPT_OLEN] - 16613 (char *)ipha; 16614 goto param_prob; 16615 } 16616 break; 16617 case IPOPT_TS: 16618 /* 16619 * Verify that length >= 5 and that there is either 16620 * room for another timestamp or that the overflow 16621 * counter is not maxed out. 
16622 */ 16623 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 16624 if (optlen < IPOPT_MINLEN_IT) { 16625 goto param_prob; 16626 } 16627 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 16628 ip1dbg(( 16629 "ip_rput_options: bad option offset\n")); 16630 code = (char *)&opt[IPOPT_OFFSET] - 16631 (char *)ipha; 16632 goto param_prob; 16633 } 16634 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 16635 case IPOPT_TS_TSONLY: 16636 off = IPOPT_TS_TIMELEN; 16637 break; 16638 case IPOPT_TS_TSANDADDR: 16639 case IPOPT_TS_PRESPEC: 16640 case IPOPT_TS_PRESPEC_RFC791: 16641 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 16642 break; 16643 default: 16644 code = (char *)&opt[IPOPT_POS_OV_FLG] - 16645 (char *)ipha; 16646 goto param_prob; 16647 } 16648 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 16649 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 16650 /* 16651 * No room and the overflow counter is 15 16652 * already. 16653 */ 16654 goto param_prob; 16655 } 16656 break; 16657 } 16658 } 16659 16660 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { 16661 *dstp = dst; 16662 return (0); 16663 } 16664 16665 ip1dbg(("ip_rput_options: error processing IP options.")); 16666 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 16667 16668 param_prob: 16669 q = WR(q); 16670 /* make sure we clear any indication of a hardware checksum */ 16671 DB_CKSUMFLAGS(mp) = 0; 16672 icmp_param_problem(q, mp, (uint8_t)code); 16673 return (-1); 16674 16675 bad_src_route: 16676 q = WR(q); 16677 /* make sure we clear any indication of a hardware checksum */ 16678 DB_CKSUMFLAGS(mp) = 0; 16679 icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED); 16680 return (-1); 16681 } 16682 16683 /* 16684 * IP & ICMP info in >=14 msg's ... 
16685 * - ip fixed part (mib2_ip_t) 16686 * - icmp fixed part (mib2_icmp_t) 16687 * - ipAddrEntryTable (ip 20) all IPv4 ipifs 16688 * - ipRouteEntryTable (ip 21) all IPv4 IREs 16689 * - ipNetToMediaEntryTable (ip 22) IPv4 IREs for on-link destinations 16690 * - ipRouteAttributeTable (ip 102) labeled routes 16691 * - ip multicast membership (ip_member_t) 16692 * - ip multicast source filtering (ip_grpsrc_t) 16693 * - igmp fixed part (struct igmpstat) 16694 * - multicast routing stats (struct mrtstat) 16695 * - multicast routing vifs (array of struct vifctl) 16696 * - multicast routing routes (array of struct mfcctl) 16697 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t) 16698 * One per ill plus one generic 16699 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t) 16700 * One per ill plus one generic 16701 * - ipv6RouteEntry all IPv6 IREs 16702 * - ipv6RouteAttributeTable (ip6 102) labeled routes 16703 * - ipv6NetToMediaEntry all Neighbor Cache entries 16704 * - ipv6AddrEntry all IPv6 ipifs 16705 * - ipv6 multicast membership (ipv6_member_t) 16706 * - ipv6 multicast source filtering (ipv6_grpsrc_t) 16707 * 16708 * IP_ROUTE and IP_MEDIA are augmented in arp to include arp cache entries not 16709 * already present. 16710 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is 16711 * already filled in by the caller. 16712 * Return value of 0 indicates that no messages were sent and caller 16713 * should free mpctl. 
16714 */ 16715 int 16716 ip_snmp_get(queue_t *q, mblk_t *mpctl) 16717 { 16718 16719 if (mpctl == NULL || mpctl->b_cont == NULL) { 16720 return (0); 16721 } 16722 16723 if ((mpctl = ip_snmp_get_mib2_ip(q, mpctl)) == NULL) { 16724 return (1); 16725 } 16726 16727 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl)) == NULL) { 16728 return (1); 16729 } 16730 16731 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl)) == NULL) { 16732 return (1); 16733 } 16734 16735 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl)) == NULL) { 16736 return (1); 16737 } 16738 16739 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl)) == NULL) { 16740 return (1); 16741 } 16742 16743 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl)) == NULL) { 16744 return (1); 16745 } 16746 16747 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl)) == NULL) { 16748 return (1); 16749 } 16750 16751 if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl)) == NULL) { 16752 return (1); 16753 } 16754 16755 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl)) == NULL) { 16756 return (1); 16757 } 16758 16759 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl)) == NULL) { 16760 return (1); 16761 } 16762 16763 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl)) == NULL) { 16764 return (1); 16765 } 16766 16767 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl)) == NULL) { 16768 return (1); 16769 } 16770 16771 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl)) == NULL) { 16772 return (1); 16773 } 16774 16775 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl)) == NULL) { 16776 return (1); 16777 } 16778 16779 if ((mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl)) == NULL) { 16780 return (1); 16781 } 16782 16783 if ((mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl)) == NULL) { 16784 return (1); 16785 } 16786 16787 if ((mpctl = sctp_snmp_get_mib2(q, mpctl)) == NULL) { 16788 return (1); 16789 } 16790 freemsg(mpctl); 16791 return (1); 16792 } 16793 16794 16795 /* Get global IPv4 statistics */ 16796 static mblk_t * 16797 ip_snmp_get_mib2_ip(queue_t 
*q, mblk_t *mpctl) 16798 { 16799 struct opthdr *optp; 16800 mblk_t *mp2ctl; 16801 16802 /* 16803 * make a copy of the original message 16804 */ 16805 mp2ctl = copymsg(mpctl); 16806 16807 /* fixed length IP structure... */ 16808 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16809 optp->level = MIB2_IP; 16810 optp->name = 0; 16811 SET_MIB(ip_mib.ipForwarding, 16812 (WE_ARE_FORWARDING ? 1 : 2)); 16813 SET_MIB(ip_mib.ipDefaultTTL, 16814 (uint32_t)ip_def_ttl); 16815 SET_MIB(ip_mib.ipReasmTimeout, 16816 ip_g_frag_timeout); 16817 SET_MIB(ip_mib.ipAddrEntrySize, 16818 sizeof (mib2_ipAddrEntry_t)); 16819 SET_MIB(ip_mib.ipRouteEntrySize, 16820 sizeof (mib2_ipRouteEntry_t)); 16821 SET_MIB(ip_mib.ipNetToMediaEntrySize, 16822 sizeof (mib2_ipNetToMediaEntry_t)); 16823 SET_MIB(ip_mib.ipMemberEntrySize, sizeof (ip_member_t)); 16824 SET_MIB(ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t)); 16825 SET_MIB(ip_mib.ipRouteAttributeSize, sizeof (mib2_ipAttributeEntry_t)); 16826 SET_MIB(ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); 16827 if (!snmp_append_data(mpctl->b_cont, (char *)&ip_mib, 16828 (int)sizeof (ip_mib))) { 16829 ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n", 16830 (uint_t)sizeof (ip_mib))); 16831 } 16832 16833 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 16834 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n", 16835 (int)optp->level, (int)optp->name, (int)optp->len)); 16836 qreply(q, mpctl); 16837 return (mp2ctl); 16838 } 16839 16840 /* Global IPv4 ICMP statistics */ 16841 static mblk_t * 16842 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl) 16843 { 16844 struct opthdr *optp; 16845 mblk_t *mp2ctl; 16846 16847 /* 16848 * Make a copy of the original message 16849 */ 16850 mp2ctl = copymsg(mpctl); 16851 16852 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16853 optp->level = MIB2_ICMP; 16854 optp->name = 0; 16855 if (!snmp_append_data(mpctl->b_cont, (char *)&icmp_mib, 16856 
(int)sizeof (icmp_mib))) { 16857 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n", 16858 (uint_t)sizeof (icmp_mib))); 16859 } 16860 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 16861 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n", 16862 (int)optp->level, (int)optp->name, (int)optp->len)); 16863 qreply(q, mpctl); 16864 return (mp2ctl); 16865 } 16866 16867 /* Global IPv4 IGMP statistics */ 16868 static mblk_t * 16869 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl) 16870 { 16871 struct opthdr *optp; 16872 mblk_t *mp2ctl; 16873 16874 /* 16875 * make a copy of the original message 16876 */ 16877 mp2ctl = copymsg(mpctl); 16878 16879 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16880 optp->level = EXPER_IGMP; 16881 optp->name = 0; 16882 if (!snmp_append_data(mpctl->b_cont, (char *)&igmpstat, 16883 (int)sizeof (igmpstat))) { 16884 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n", 16885 (uint_t)sizeof (igmpstat))); 16886 } 16887 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 16888 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n", 16889 (int)optp->level, (int)optp->name, (int)optp->len)); 16890 qreply(q, mpctl); 16891 return (mp2ctl); 16892 } 16893 16894 /* Global IPv4 Multicast Routing statistics */ 16895 static mblk_t * 16896 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl) 16897 { 16898 struct opthdr *optp; 16899 mblk_t *mp2ctl; 16900 16901 /* 16902 * make a copy of the original message 16903 */ 16904 mp2ctl = copymsg(mpctl); 16905 16906 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16907 optp->level = EXPER_DVMRP; 16908 optp->name = 0; 16909 if (!ip_mroute_stats(mpctl->b_cont)) { 16910 ip0dbg(("ip_mroute_stats: failed\n")); 16911 } 16912 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 16913 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n", 16914 (int)optp->level, (int)optp->name, (int)optp->len)); 16915 qreply(q, mpctl); 16916 return (mp2ctl); 
16917 } 16918 16919 /* IPv4 address information */ 16920 static mblk_t * 16921 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl) 16922 { 16923 struct opthdr *optp; 16924 mblk_t *mp2ctl; 16925 mblk_t *mp_tail = NULL; 16926 ill_t *ill; 16927 ipif_t *ipif; 16928 uint_t bitval; 16929 mib2_ipAddrEntry_t mae; 16930 zoneid_t zoneid; 16931 ill_walk_context_t ctx; 16932 16933 /* 16934 * make a copy of the original message 16935 */ 16936 mp2ctl = copymsg(mpctl); 16937 16938 /* ipAddrEntryTable */ 16939 16940 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16941 optp->level = MIB2_IP; 16942 optp->name = MIB2_IP_ADDR; 16943 zoneid = Q_TO_CONN(q)->conn_zoneid; 16944 16945 rw_enter(&ill_g_lock, RW_READER); 16946 ill = ILL_START_WALK_V4(&ctx); 16947 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 16948 for (ipif = ill->ill_ipif; ipif != NULL; 16949 ipif = ipif->ipif_next) { 16950 if (ipif->ipif_zoneid != zoneid && 16951 ipif->ipif_zoneid != ALL_ZONES) 16952 continue; 16953 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 16954 mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 16955 mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; 16956 16957 (void) ipif_get_name(ipif, 16958 mae.ipAdEntIfIndex.o_bytes, 16959 OCTET_LENGTH); 16960 mae.ipAdEntIfIndex.o_length = 16961 mi_strlen(mae.ipAdEntIfIndex.o_bytes); 16962 mae.ipAdEntAddr = ipif->ipif_lcl_addr; 16963 mae.ipAdEntNetMask = ipif->ipif_net_mask; 16964 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; 16965 mae.ipAdEntInfo.ae_subnet_len = 16966 ip_mask_to_plen(ipif->ipif_net_mask); 16967 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; 16968 for (bitval = 1; 16969 bitval && 16970 !(bitval & ipif->ipif_brd_addr); 16971 bitval <<= 1) 16972 noop; 16973 mae.ipAdEntBcastAddr = bitval; 16974 mae.ipAdEntReasmMaxSize = 65535; 16975 mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; 16976 mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; 16977 mae.ipAdEntInfo.ae_broadcast_addr = 16978 ipif->ipif_brd_addr; 16979 
mae.ipAdEntInfo.ae_pp_dst_addr = 16980 ipif->ipif_pp_dst_addr; 16981 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | 16982 ill->ill_flags | ill->ill_phyint->phyint_flags; 16983 16984 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 16985 (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { 16986 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to " 16987 "allocate %u bytes\n", 16988 (uint_t)sizeof (mib2_ipAddrEntry_t))); 16989 } 16990 } 16991 } 16992 rw_exit(&ill_g_lock); 16993 16994 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 16995 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n", 16996 (int)optp->level, (int)optp->name, (int)optp->len)); 16997 qreply(q, mpctl); 16998 return (mp2ctl); 16999 } 17000 17001 /* IPv6 address information */ 17002 static mblk_t * 17003 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl) 17004 { 17005 struct opthdr *optp; 17006 mblk_t *mp2ctl; 17007 mblk_t *mp_tail = NULL; 17008 ill_t *ill; 17009 ipif_t *ipif; 17010 mib2_ipv6AddrEntry_t mae6; 17011 zoneid_t zoneid; 17012 ill_walk_context_t ctx; 17013 17014 /* 17015 * make a copy of the original message 17016 */ 17017 mp2ctl = copymsg(mpctl); 17018 17019 /* ipv6AddrEntryTable */ 17020 17021 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17022 optp->level = MIB2_IP6; 17023 optp->name = MIB2_IP6_ADDR; 17024 zoneid = Q_TO_CONN(q)->conn_zoneid; 17025 17026 rw_enter(&ill_g_lock, RW_READER); 17027 ill = ILL_START_WALK_V6(&ctx); 17028 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17029 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 17030 if (ipif->ipif_zoneid != zoneid && 17031 ipif->ipif_zoneid != ALL_ZONES) 17032 continue; 17033 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; 17034 mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; 17035 mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; 17036 17037 (void) ipif_get_name(ipif, 17038 mae6.ipv6AddrIfIndex.o_bytes, 17039 OCTET_LENGTH); 17040 mae6.ipv6AddrIfIndex.o_length = 17041 
mi_strlen(mae6.ipv6AddrIfIndex.o_bytes); 17042 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr; 17043 mae6.ipv6AddrPfxLength = 17044 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 17045 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; 17046 mae6.ipv6AddrInfo.ae_subnet_len = 17047 mae6.ipv6AddrPfxLength; 17048 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; 17049 17050 /* Type: stateless(1), stateful(2), unknown(3) */ 17051 if (ipif->ipif_flags & IPIF_ADDRCONF) 17052 mae6.ipv6AddrType = 1; 17053 else 17054 mae6.ipv6AddrType = 2; 17055 /* Anycast: true(1), false(2) */ 17056 if (ipif->ipif_flags & IPIF_ANYCAST) 17057 mae6.ipv6AddrAnycastFlag = 1; 17058 else 17059 mae6.ipv6AddrAnycastFlag = 2; 17060 17061 /* 17062 * Address status: preferred(1), deprecated(2), 17063 * invalid(3), inaccessible(4), unknown(5) 17064 */ 17065 if (ipif->ipif_flags & IPIF_NOLOCAL) 17066 mae6.ipv6AddrStatus = 3; 17067 else if (ipif->ipif_flags & IPIF_DEPRECATED) 17068 mae6.ipv6AddrStatus = 2; 17069 else 17070 mae6.ipv6AddrStatus = 1; 17071 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; 17072 mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; 17073 mae6.ipv6AddrInfo.ae_pp_dst_addr = 17074 ipif->ipif_v6pp_dst_addr; 17075 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags | 17076 ill->ill_flags | ill->ill_phyint->phyint_flags; 17077 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 17078 (char *)&mae6, 17079 (int)sizeof (mib2_ipv6AddrEntry_t))) { 17080 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to " 17081 "allocate %u bytes\n", 17082 (uint_t)sizeof (mib2_ipv6AddrEntry_t))); 17083 } 17084 } 17085 } 17086 rw_exit(&ill_g_lock); 17087 17088 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17089 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n", 17090 (int)optp->level, (int)optp->name, (int)optp->len)); 17091 qreply(q, mpctl); 17092 return (mp2ctl); 17093 } 17094 17095 /* IPv4 multicast group membership. 
*/ 17096 static mblk_t * 17097 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl) 17098 { 17099 struct opthdr *optp; 17100 mblk_t *mp2ctl; 17101 ill_t *ill; 17102 ipif_t *ipif; 17103 ilm_t *ilm; 17104 ip_member_t ipm; 17105 mblk_t *mp_tail = NULL; 17106 ill_walk_context_t ctx; 17107 zoneid_t zoneid; 17108 17109 /* 17110 * make a copy of the original message 17111 */ 17112 mp2ctl = copymsg(mpctl); 17113 zoneid = Q_TO_CONN(q)->conn_zoneid; 17114 17115 /* ipGroupMember table */ 17116 optp = (struct opthdr *)&mpctl->b_rptr[ 17117 sizeof (struct T_optmgmt_ack)]; 17118 optp->level = MIB2_IP; 17119 optp->name = EXPER_IP_GROUP_MEMBERSHIP; 17120 17121 rw_enter(&ill_g_lock, RW_READER); 17122 ill = ILL_START_WALK_V4(&ctx); 17123 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17124 ILM_WALKER_HOLD(ill); 17125 for (ipif = ill->ill_ipif; ipif != NULL; 17126 ipif = ipif->ipif_next) { 17127 if (ipif->ipif_zoneid != zoneid && 17128 ipif->ipif_zoneid != ALL_ZONES) 17129 continue; /* not this zone */ 17130 (void) ipif_get_name(ipif, 17131 ipm.ipGroupMemberIfIndex.o_bytes, 17132 OCTET_LENGTH); 17133 ipm.ipGroupMemberIfIndex.o_length = 17134 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); 17135 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 17136 ASSERT(ilm->ilm_ipif != NULL); 17137 ASSERT(ilm->ilm_ill == NULL); 17138 if (ilm->ilm_ipif != ipif) 17139 continue; 17140 ipm.ipGroupMemberAddress = ilm->ilm_addr; 17141 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; 17142 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; 17143 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 17144 (char *)&ipm, (int)sizeof (ipm))) { 17145 ip1dbg(("ip_snmp_get_mib2_ip_group: " 17146 "failed to allocate %u bytes\n", 17147 (uint_t)sizeof (ipm))); 17148 } 17149 } 17150 } 17151 ILM_WALKER_RELE(ill); 17152 } 17153 rw_exit(&ill_g_lock); 17154 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17155 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 17156 (int)optp->level, (int)optp->name, (int)optp->len)); 17157 
qreply(q, mpctl); 17158 return (mp2ctl); 17159 } 17160 17161 /* IPv6 multicast group membership. */ 17162 static mblk_t * 17163 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl) 17164 { 17165 struct opthdr *optp; 17166 mblk_t *mp2ctl; 17167 ill_t *ill; 17168 ilm_t *ilm; 17169 ipv6_member_t ipm6; 17170 mblk_t *mp_tail = NULL; 17171 ill_walk_context_t ctx; 17172 zoneid_t zoneid; 17173 17174 /* 17175 * make a copy of the original message 17176 */ 17177 mp2ctl = copymsg(mpctl); 17178 zoneid = Q_TO_CONN(q)->conn_zoneid; 17179 17180 /* ip6GroupMember table */ 17181 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17182 optp->level = MIB2_IP6; 17183 optp->name = EXPER_IP6_GROUP_MEMBERSHIP; 17184 17185 rw_enter(&ill_g_lock, RW_READER); 17186 ill = ILL_START_WALK_V6(&ctx); 17187 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17188 ILM_WALKER_HOLD(ill); 17189 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; 17190 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 17191 ASSERT(ilm->ilm_ipif == NULL); 17192 ASSERT(ilm->ilm_ill != NULL); 17193 if (ilm->ilm_zoneid != zoneid) 17194 continue; /* not this zone */ 17195 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; 17196 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; 17197 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode; 17198 if (!snmp_append_data2(mpctl->b_cont, 17199 &mp_tail, 17200 (char *)&ipm6, (int)sizeof (ipm6))) { 17201 ip1dbg(("ip_snmp_get_mib2_ip6_group: " 17202 "failed to allocate %u bytes\n", 17203 (uint_t)sizeof (ipm6))); 17204 } 17205 } 17206 ILM_WALKER_RELE(ill); 17207 } 17208 rw_exit(&ill_g_lock); 17209 17210 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17211 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 17212 (int)optp->level, (int)optp->name, (int)optp->len)); 17213 qreply(q, mpctl); 17214 return (mp2ctl); 17215 } 17216 17217 /* IP multicast filtered sources */ 17218 static mblk_t * 17219 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl) 17220 { 
17221 struct opthdr *optp; 17222 mblk_t *mp2ctl; 17223 ill_t *ill; 17224 ipif_t *ipif; 17225 ilm_t *ilm; 17226 ip_grpsrc_t ips; 17227 mblk_t *mp_tail = NULL; 17228 ill_walk_context_t ctx; 17229 zoneid_t zoneid; 17230 int i; 17231 slist_t *sl; 17232 17233 /* 17234 * make a copy of the original message 17235 */ 17236 mp2ctl = copymsg(mpctl); 17237 zoneid = Q_TO_CONN(q)->conn_zoneid; 17238 17239 /* ipGroupSource table */ 17240 optp = (struct opthdr *)&mpctl->b_rptr[ 17241 sizeof (struct T_optmgmt_ack)]; 17242 optp->level = MIB2_IP; 17243 optp->name = EXPER_IP_GROUP_SOURCES; 17244 17245 rw_enter(&ill_g_lock, RW_READER); 17246 ill = ILL_START_WALK_V4(&ctx); 17247 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17248 ILM_WALKER_HOLD(ill); 17249 for (ipif = ill->ill_ipif; ipif != NULL; 17250 ipif = ipif->ipif_next) { 17251 if (ipif->ipif_zoneid != zoneid) 17252 continue; /* not this zone */ 17253 (void) ipif_get_name(ipif, 17254 ips.ipGroupSourceIfIndex.o_bytes, 17255 OCTET_LENGTH); 17256 ips.ipGroupSourceIfIndex.o_length = 17257 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); 17258 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 17259 ASSERT(ilm->ilm_ipif != NULL); 17260 ASSERT(ilm->ilm_ill == NULL); 17261 sl = ilm->ilm_filter; 17262 if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) 17263 continue; 17264 ips.ipGroupSourceGroup = ilm->ilm_addr; 17265 for (i = 0; i < sl->sl_numsrc; i++) { 17266 if (!IN6_IS_ADDR_V4MAPPED( 17267 &sl->sl_addr[i])) 17268 continue; 17269 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], 17270 ips.ipGroupSourceAddress); 17271 if (snmp_append_data2(mpctl->b_cont, 17272 &mp_tail, (char *)&ips, 17273 (int)sizeof (ips)) == 0) { 17274 ip1dbg(("ip_snmp_get_mib2_" 17275 "ip_group_src: failed to " 17276 "allocate %u bytes\n", 17277 (uint_t)sizeof (ips))); 17278 } 17279 } 17280 } 17281 } 17282 ILM_WALKER_RELE(ill); 17283 } 17284 rw_exit(&ill_g_lock); 17285 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17286 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 
17287 (int)optp->level, (int)optp->name, (int)optp->len)); 17288 qreply(q, mpctl); 17289 return (mp2ctl); 17290 } 17291 17292 /* IPv6 multicast filtered sources. */ 17293 static mblk_t * 17294 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl) 17295 { 17296 struct opthdr *optp; 17297 mblk_t *mp2ctl; 17298 ill_t *ill; 17299 ilm_t *ilm; 17300 ipv6_grpsrc_t ips6; 17301 mblk_t *mp_tail = NULL; 17302 ill_walk_context_t ctx; 17303 zoneid_t zoneid; 17304 int i; 17305 slist_t *sl; 17306 17307 /* 17308 * make a copy of the original message 17309 */ 17310 mp2ctl = copymsg(mpctl); 17311 zoneid = Q_TO_CONN(q)->conn_zoneid; 17312 17313 /* ip6GroupMember table */ 17314 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17315 optp->level = MIB2_IP6; 17316 optp->name = EXPER_IP6_GROUP_SOURCES; 17317 17318 rw_enter(&ill_g_lock, RW_READER); 17319 ill = ILL_START_WALK_V6(&ctx); 17320 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17321 ILM_WALKER_HOLD(ill); 17322 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; 17323 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { 17324 ASSERT(ilm->ilm_ipif == NULL); 17325 ASSERT(ilm->ilm_ill != NULL); 17326 sl = ilm->ilm_filter; 17327 if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) 17328 continue; 17329 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; 17330 for (i = 0; i < sl->sl_numsrc; i++) { 17331 ips6.ipv6GroupSourceAddress = sl->sl_addr[i]; 17332 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 17333 (char *)&ips6, (int)sizeof (ips6))) { 17334 ip1dbg(("ip_snmp_get_mib2_ip6_" 17335 "group_src: failed to allocate " 17336 "%u bytes\n", 17337 (uint_t)sizeof (ips6))); 17338 } 17339 } 17340 } 17341 ILM_WALKER_RELE(ill); 17342 } 17343 rw_exit(&ill_g_lock); 17344 17345 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17346 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n", 17347 (int)optp->level, (int)optp->name, (int)optp->len)); 17348 qreply(q, mpctl); 17349 return (mp2ctl); 17350 } 17351 17352 /* 
Multicast routing virtual interface table. */ 17353 static mblk_t * 17354 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl) 17355 { 17356 struct opthdr *optp; 17357 mblk_t *mp2ctl; 17358 17359 /* 17360 * make a copy of the original message 17361 */ 17362 mp2ctl = copymsg(mpctl); 17363 17364 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17365 optp->level = EXPER_DVMRP; 17366 optp->name = EXPER_DVMRP_VIF; 17367 if (!ip_mroute_vif(mpctl->b_cont)) { 17368 ip0dbg(("ip_mroute_vif: failed\n")); 17369 } 17370 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17371 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n", 17372 (int)optp->level, (int)optp->name, (int)optp->len)); 17373 qreply(q, mpctl); 17374 return (mp2ctl); 17375 } 17376 17377 /* Multicast routing table. */ 17378 static mblk_t * 17379 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl) 17380 { 17381 struct opthdr *optp; 17382 mblk_t *mp2ctl; 17383 17384 /* 17385 * make a copy of the original message 17386 */ 17387 mp2ctl = copymsg(mpctl); 17388 17389 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17390 optp->level = EXPER_DVMRP; 17391 optp->name = EXPER_DVMRP_MRT; 17392 if (!ip_mroute_mrt(mpctl->b_cont)) { 17393 ip0dbg(("ip_mroute_mrt: failed\n")); 17394 } 17395 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17396 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n", 17397 (int)optp->level, (int)optp->name, (int)optp->len)); 17398 qreply(q, mpctl); 17399 return (mp2ctl); 17400 } 17401 17402 /* 17403 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable 17404 * in one IRE walk. 
17405 */ 17406 static mblk_t * 17407 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl) 17408 { 17409 struct opthdr *optp; 17410 mblk_t *mp2ctl; /* Returned */ 17411 mblk_t *mp3ctl; /* nettomedia */ 17412 mblk_t *mp4ctl; /* routeattrs */ 17413 iproutedata_t ird; 17414 zoneid_t zoneid; 17415 17416 /* 17417 * make copies of the original message 17418 * - mp2ctl is returned unchanged to the caller for his use 17419 * - mpctl is sent upstream as ipRouteEntryTable 17420 * - mp3ctl is sent upstream as ipNetToMediaEntryTable 17421 * - mp4ctl is sent upstream as ipRouteAttributeTable 17422 */ 17423 mp2ctl = copymsg(mpctl); 17424 mp3ctl = copymsg(mpctl); 17425 mp4ctl = copymsg(mpctl); 17426 if (mp3ctl == NULL || mp4ctl == NULL) { 17427 freemsg(mp4ctl); 17428 freemsg(mp3ctl); 17429 freemsg(mp2ctl); 17430 freemsg(mpctl); 17431 return (NULL); 17432 } 17433 17434 bzero(&ird, sizeof (ird)); 17435 17436 ird.ird_route.lp_head = mpctl->b_cont; 17437 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 17438 ird.ird_attrs.lp_head = mp4ctl->b_cont; 17439 17440 zoneid = Q_TO_CONN(q)->conn_zoneid; 17441 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid); 17442 if (zoneid == GLOBAL_ZONEID) { 17443 /* 17444 * Those IREs are used by Mobile-IP; since mipagent(1M) requires 17445 * the sys_net_config privilege, it can only run in the global 17446 * zone, so we don't display these IREs in the other zones. 
17447 */ 17448 ire_walk_srcif_table_v4(ip_snmp_get2_v4, &ird); 17449 ire_walk_ill_mrtun(0, 0, ip_snmp_get2_v4, &ird, NULL); 17450 } 17451 17452 /* ipRouteEntryTable in mpctl */ 17453 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17454 optp->level = MIB2_IP; 17455 optp->name = MIB2_IP_ROUTE; 17456 optp->len = msgdsize(ird.ird_route.lp_head); 17457 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 17458 (int)optp->level, (int)optp->name, (int)optp->len)); 17459 qreply(q, mpctl); 17460 17461 /* ipNetToMediaEntryTable in mp3ctl */ 17462 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17463 optp->level = MIB2_IP; 17464 optp->name = MIB2_IP_MEDIA; 17465 optp->len = msgdsize(ird.ird_netmedia.lp_head); 17466 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 17467 (int)optp->level, (int)optp->name, (int)optp->len)); 17468 qreply(q, mp3ctl); 17469 17470 /* ipRouteAttributeTable in mp4ctl */ 17471 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17472 optp->level = MIB2_IP; 17473 optp->name = EXPER_IP_RTATTR; 17474 optp->len = msgdsize(ird.ird_attrs.lp_head); 17475 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n", 17476 (int)optp->level, (int)optp->name, (int)optp->len)); 17477 if (optp->len == 0) 17478 freemsg(mp4ctl); 17479 else 17480 qreply(q, mp4ctl); 17481 17482 return (mp2ctl); 17483 } 17484 17485 /* 17486 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and 17487 * ipv6NetToMediaEntryTable in an NDP walk. 
17488 */ 17489 static mblk_t * 17490 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl) 17491 { 17492 struct opthdr *optp; 17493 mblk_t *mp2ctl; /* Returned */ 17494 mblk_t *mp3ctl; /* nettomedia */ 17495 mblk_t *mp4ctl; /* routeattrs */ 17496 iproutedata_t ird; 17497 zoneid_t zoneid; 17498 17499 /* 17500 * make copies of the original message 17501 * - mp2ctl is returned unchanged to the caller for his use 17502 * - mpctl is sent upstream as ipv6RouteEntryTable 17503 * - mp3ctl is sent upstream as ipv6NetToMediaEntryTable 17504 * - mp4ctl is sent upstream as ipv6RouteAttributeTable 17505 */ 17506 mp2ctl = copymsg(mpctl); 17507 mp3ctl = copymsg(mpctl); 17508 mp4ctl = copymsg(mpctl); 17509 if (mp3ctl == NULL || mp4ctl == NULL) { 17510 freemsg(mp4ctl); 17511 freemsg(mp3ctl); 17512 freemsg(mp2ctl); 17513 freemsg(mpctl); 17514 return (NULL); 17515 } 17516 17517 bzero(&ird, sizeof (ird)); 17518 17519 ird.ird_route.lp_head = mpctl->b_cont; 17520 ird.ird_netmedia.lp_head = mp3ctl->b_cont; 17521 ird.ird_attrs.lp_head = mp4ctl->b_cont; 17522 17523 zoneid = Q_TO_CONN(q)->conn_zoneid; 17524 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid); 17525 17526 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17527 optp->level = MIB2_IP6; 17528 optp->name = MIB2_IP6_ROUTE; 17529 optp->len = msgdsize(ird.ird_route.lp_head); 17530 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 17531 (int)optp->level, (int)optp->name, (int)optp->len)); 17532 qreply(q, mpctl); 17533 17534 /* ipv6NetToMediaEntryTable in mp3ctl */ 17535 ndp_walk(NULL, ip_snmp_get2_v6_media, &ird); 17536 17537 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17538 optp->level = MIB2_IP6; 17539 optp->name = MIB2_IP6_MEDIA; 17540 optp->len = msgdsize(ird.ird_netmedia.lp_head); 17541 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 17542 (int)optp->level, (int)optp->name, (int)optp->len)); 17543 qreply(q, mp3ctl); 17544 17545 
/* ipv6RouteAttributeTable in mp4ctl */ 17546 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17547 optp->level = MIB2_IP6; 17548 optp->name = EXPER_IP_RTATTR; 17549 optp->len = msgdsize(ird.ird_attrs.lp_head); 17550 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n", 17551 (int)optp->level, (int)optp->name, (int)optp->len)); 17552 if (optp->len == 0) 17553 freemsg(mp4ctl); 17554 else 17555 qreply(q, mp4ctl); 17556 17557 return (mp2ctl); 17558 } 17559 17560 /* 17561 * ICMPv6 mib: One per ill 17562 */ 17563 static mblk_t * 17564 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl) 17565 { 17566 struct opthdr *optp; 17567 mblk_t *mp2ctl; 17568 ill_t *ill; 17569 ill_walk_context_t ctx; 17570 mblk_t *mp_tail = NULL; 17571 17572 /* 17573 * Make a copy of the original message 17574 */ 17575 mp2ctl = copymsg(mpctl); 17576 17577 /* fixed length IPv6 structure ... */ 17578 17579 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17580 optp->level = MIB2_IP6; 17581 optp->name = 0; 17582 /* Include "unknown interface" ip6_mib */ 17583 ip6_mib.ipv6IfIndex = 0; /* Flag to netstat */ 17584 SET_MIB(ip6_mib.ipv6Forwarding, ipv6_forward ? 
1 : 2); 17585 SET_MIB(ip6_mib.ipv6DefaultHopLimit, ipv6_def_hops); 17586 SET_MIB(ip6_mib.ipv6IfStatsEntrySize, 17587 sizeof (mib2_ipv6IfStatsEntry_t)); 17588 SET_MIB(ip6_mib.ipv6AddrEntrySize, sizeof (mib2_ipv6AddrEntry_t)); 17589 SET_MIB(ip6_mib.ipv6RouteEntrySize, sizeof (mib2_ipv6RouteEntry_t)); 17590 SET_MIB(ip6_mib.ipv6NetToMediaEntrySize, 17591 sizeof (mib2_ipv6NetToMediaEntry_t)); 17592 SET_MIB(ip6_mib.ipv6MemberEntrySize, sizeof (ipv6_member_t)); 17593 SET_MIB(ip6_mib.ipv6GroupSourceEntrySize, sizeof (ipv6_grpsrc_t)); 17594 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&ip6_mib, 17595 (int)sizeof (ip6_mib))) { 17596 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n", 17597 (uint_t)sizeof (ip6_mib))); 17598 } 17599 17600 rw_enter(&ill_g_lock, RW_READER); 17601 ill = ILL_START_WALK_V6(&ctx); 17602 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17603 ill->ill_ip6_mib->ipv6IfIndex = 17604 ill->ill_phyint->phyint_ifindex; 17605 SET_MIB(ill->ill_ip6_mib->ipv6Forwarding, 17606 ipv6_forward ? 
1 : 2); 17607 SET_MIB(ill->ill_ip6_mib->ipv6DefaultHopLimit, 17608 ill->ill_max_hops); 17609 SET_MIB(ill->ill_ip6_mib->ipv6IfStatsEntrySize, 17610 sizeof (mib2_ipv6IfStatsEntry_t)); 17611 SET_MIB(ill->ill_ip6_mib->ipv6AddrEntrySize, 17612 sizeof (mib2_ipv6AddrEntry_t)); 17613 SET_MIB(ill->ill_ip6_mib->ipv6RouteEntrySize, 17614 sizeof (mib2_ipv6RouteEntry_t)); 17615 SET_MIB(ill->ill_ip6_mib->ipv6NetToMediaEntrySize, 17616 sizeof (mib2_ipv6NetToMediaEntry_t)); 17617 SET_MIB(ill->ill_ip6_mib->ipv6MemberEntrySize, 17618 sizeof (ipv6_member_t)); 17619 17620 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 17621 (char *)ill->ill_ip6_mib, 17622 (int)sizeof (*ill->ill_ip6_mib))) { 17623 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate " 17624 "%u bytes\n", 17625 (uint_t)sizeof (*ill->ill_ip6_mib))); 17626 } 17627 } 17628 rw_exit(&ill_g_lock); 17629 17630 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17631 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n", 17632 (int)optp->level, (int)optp->name, (int)optp->len)); 17633 qreply(q, mpctl); 17634 return (mp2ctl); 17635 } 17636 17637 /* 17638 * ICMPv6 mib: One per ill 17639 */ 17640 static mblk_t * 17641 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl) 17642 { 17643 struct opthdr *optp; 17644 mblk_t *mp2ctl; 17645 ill_t *ill; 17646 ill_walk_context_t ctx; 17647 mblk_t *mp_tail = NULL; 17648 /* 17649 * Make a copy of the original message 17650 */ 17651 mp2ctl = copymsg(mpctl); 17652 17653 /* fixed length ICMPv6 structure ... 
*/ 17654 17655 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 17656 optp->level = MIB2_ICMP6; 17657 optp->name = 0; 17658 /* Include "unknown interface" icmp6_mib */ 17659 icmp6_mib.ipv6IfIcmpIfIndex = 0; /* Flag to netstat */ 17660 icmp6_mib.ipv6IfIcmpEntrySize = sizeof (mib2_ipv6IfIcmpEntry_t); 17661 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&icmp6_mib, 17662 (int)sizeof (icmp6_mib))) { 17663 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n", 17664 (uint_t)sizeof (icmp6_mib))); 17665 } 17666 17667 rw_enter(&ill_g_lock, RW_READER); 17668 ill = ILL_START_WALK_V6(&ctx); 17669 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 17670 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 17671 ill->ill_phyint->phyint_ifindex; 17672 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 17673 sizeof (mib2_ipv6IfIcmpEntry_t); 17674 if (!snmp_append_data2(mpctl->b_cont, &mp_tail, 17675 (char *)ill->ill_icmp6_mib, 17676 (int)sizeof (*ill->ill_icmp6_mib))) { 17677 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate " 17678 "%u bytes\n", 17679 (uint_t)sizeof (*ill->ill_icmp6_mib))); 17680 } 17681 } 17682 rw_exit(&ill_g_lock); 17683 17684 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); 17685 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n", 17686 (int)optp->level, (int)optp->name, (int)optp->len)); 17687 qreply(q, mpctl); 17688 return (mp2ctl); 17689 } 17690 17691 /* 17692 * ire_walk routine to create both ipRouteEntryTable and 17693 * ipNetToMediaEntryTable in one IRE walk 17694 */ 17695 static void 17696 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) 17697 { 17698 ill_t *ill; 17699 ipif_t *ipif; 17700 mblk_t *llmp; 17701 dl_unitdata_req_t *dlup; 17702 mib2_ipRouteEntry_t *re; 17703 mib2_ipNetToMediaEntry_t ntme; 17704 mib2_ipAttributeEntry_t *iae, *iaeptr; 17705 ipaddr_t gw_addr; 17706 tsol_ire_gw_secattr_t *attrp; 17707 tsol_gc_t *gc = NULL; 17708 tsol_gcgrp_t *gcgrp = NULL; 17709 uint_t sacnt = 0; 17710 int i; 17711 17712 
ASSERT(ire->ire_ipversion == IPV4_VERSION); 17713 17714 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 17715 return; 17716 17717 if ((attrp = ire->ire_gw_secattr) != NULL) { 17718 mutex_enter(&attrp->igsa_lock); 17719 if ((gc = attrp->igsa_gc) != NULL) { 17720 gcgrp = gc->gc_grp; 17721 ASSERT(gcgrp != NULL); 17722 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 17723 sacnt = 1; 17724 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 17725 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 17726 gc = gcgrp->gcgrp_head; 17727 sacnt = gcgrp->gcgrp_count; 17728 } 17729 mutex_exit(&attrp->igsa_lock); 17730 17731 /* do nothing if there's no gc to report */ 17732 if (gc == NULL) { 17733 ASSERT(sacnt == 0); 17734 if (gcgrp != NULL) { 17735 /* we might as well drop the lock now */ 17736 rw_exit(&gcgrp->gcgrp_rwlock); 17737 gcgrp = NULL; 17738 } 17739 attrp = NULL; 17740 } 17741 17742 ASSERT(gc == NULL || (gcgrp != NULL && 17743 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 17744 } 17745 ASSERT(sacnt == 0 || gc != NULL); 17746 17747 if (sacnt != 0 && 17748 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 17749 kmem_free(re, sizeof (*re)); 17750 rw_exit(&gcgrp->gcgrp_rwlock); 17751 return; 17752 } 17753 17754 /* 17755 * Return all IRE types for route table... let caller pick and choose 17756 */ 17757 re->ipRouteDest = ire->ire_addr; 17758 ipif = ire->ire_ipif; 17759 re->ipRouteIfIndex.o_length = 0; 17760 if (ire->ire_type == IRE_CACHE) { 17761 ill = (ill_t *)ire->ire_stq->q_ptr; 17762 re->ipRouteIfIndex.o_length = 17763 ill->ill_name_length == 0 ? 
0 : 17764 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 17765 bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes, 17766 re->ipRouteIfIndex.o_length); 17767 } else if (ipif != NULL) { 17768 (void) ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, 17769 OCTET_LENGTH); 17770 re->ipRouteIfIndex.o_length = 17771 mi_strlen(re->ipRouteIfIndex.o_bytes); 17772 } 17773 re->ipRouteMetric1 = -1; 17774 re->ipRouteMetric2 = -1; 17775 re->ipRouteMetric3 = -1; 17776 re->ipRouteMetric4 = -1; 17777 17778 gw_addr = ire->ire_gateway_addr; 17779 17780 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) 17781 re->ipRouteNextHop = ire->ire_src_addr; 17782 else 17783 re->ipRouteNextHop = gw_addr; 17784 /* indirect(4), direct(3), or invalid(2) */ 17785 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 17786 re->ipRouteType = 2; 17787 else 17788 re->ipRouteType = (gw_addr != 0) ? 4 : 3; 17789 re->ipRouteProto = -1; 17790 re->ipRouteAge = gethrestime_sec() - ire->ire_create_time; 17791 re->ipRouteMask = ire->ire_mask; 17792 re->ipRouteMetric5 = -1; 17793 re->ipRouteInfo.re_max_frag = ire->ire_max_frag; 17794 re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag; 17795 re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 17796 llmp = ire->ire_dlureq_mp; 17797 re->ipRouteInfo.re_ref = ire->ire_refcnt; 17798 re->ipRouteInfo.re_src_addr = ire->ire_src_addr; 17799 re->ipRouteInfo.re_ire_type = ire->ire_type; 17800 re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count; 17801 re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 17802 re->ipRouteInfo.re_flags = ire->ire_flags; 17803 re->ipRouteInfo.re_in_ill.o_length = 0; 17804 if (ire->ire_in_ill != NULL) { 17805 re->ipRouteInfo.re_in_ill.o_length = 17806 ire->ire_in_ill->ill_name_length == 0 ? 
0 : 17807 MIN(OCTET_LENGTH, ire->ire_in_ill->ill_name_length - 1); 17808 bcopy(ire->ire_in_ill->ill_name, 17809 re->ipRouteInfo.re_in_ill.o_bytes, 17810 re->ipRouteInfo.re_in_ill.o_length); 17811 } 17812 re->ipRouteInfo.re_in_src_addr = ire->ire_in_src_addr; 17813 17814 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 17815 (char *)re, (int)sizeof (*re))) { 17816 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 17817 (uint_t)sizeof (*re))); 17818 } 17819 17820 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 17821 iaeptr->iae_routeidx = ird->ird_idx; 17822 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 17823 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 17824 } 17825 17826 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 17827 (char *)iae, sacnt * sizeof (*iae))) { 17828 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 17829 (unsigned)(sacnt * sizeof (*iae)))); 17830 } 17831 17832 if (ire->ire_type != IRE_CACHE || gw_addr != 0) 17833 goto done; 17834 /* 17835 * only IRE_CACHE entries that are for a directly connected subnet 17836 * get appended to net -> phys addr table 17837 * (others in arp) 17838 */ 17839 ntme.ipNetToMediaIfIndex.o_length = 0; 17840 ill = ire_to_ill(ire); 17841 ASSERT(ill != NULL); 17842 ntme.ipNetToMediaIfIndex.o_length = 17843 ill->ill_name_length == 0 ? 
0 : 17844 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 17845 bcopy(ill->ill_name, ntme.ipNetToMediaIfIndex.o_bytes, 17846 ntme.ipNetToMediaIfIndex.o_length); 17847 17848 ntme.ipNetToMediaPhysAddress.o_length = 0; 17849 if (llmp) { 17850 uchar_t *addr; 17851 17852 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 17853 /* Remove sap from address */ 17854 if (ill->ill_sap_length < 0) 17855 addr = llmp->b_rptr + dlup->dl_dest_addr_offset; 17856 else 17857 addr = llmp->b_rptr + dlup->dl_dest_addr_offset + 17858 ill->ill_sap_length; 17859 17860 ntme.ipNetToMediaPhysAddress.o_length = 17861 MIN(OCTET_LENGTH, ill->ill_phys_addr_length); 17862 bcopy(addr, ntme.ipNetToMediaPhysAddress.o_bytes, 17863 ntme.ipNetToMediaPhysAddress.o_length); 17864 } 17865 ntme.ipNetToMediaNetAddress = ire->ire_addr; 17866 /* assume dynamic (may be changed in arp) */ 17867 ntme.ipNetToMediaType = 3; 17868 ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (uint32_t); 17869 bcopy(&ire->ire_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes, 17870 ntme.ipNetToMediaInfo.ntm_mask.o_length); 17871 ntme.ipNetToMediaInfo.ntm_flags = ACE_F_RESOLVED; 17872 if (!snmp_append_data2(ird->ird_netmedia.lp_head, 17873 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { 17874 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", 17875 (uint_t)sizeof (ntme))); 17876 } 17877 done: 17878 /* bump route index for next pass */ 17879 ird->ird_idx++; 17880 17881 kmem_free(re, sizeof (*re)); 17882 if (sacnt != 0) 17883 kmem_free(iae, sacnt * sizeof (*iae)); 17884 17885 if (gcgrp != NULL) 17886 rw_exit(&gcgrp->gcgrp_rwlock); 17887 } 17888 17889 /* 17890 * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable. 
17891 */ 17892 static void 17893 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) 17894 { 17895 ill_t *ill; 17896 ipif_t *ipif; 17897 mib2_ipv6RouteEntry_t *re; 17898 mib2_ipAttributeEntry_t *iae, *iaeptr; 17899 in6_addr_t gw_addr_v6; 17900 tsol_ire_gw_secattr_t *attrp; 17901 tsol_gc_t *gc = NULL; 17902 tsol_gcgrp_t *gcgrp = NULL; 17903 uint_t sacnt = 0; 17904 int i; 17905 17906 ASSERT(ire->ire_ipversion == IPV6_VERSION); 17907 17908 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) 17909 return; 17910 17911 if ((attrp = ire->ire_gw_secattr) != NULL) { 17912 mutex_enter(&attrp->igsa_lock); 17913 if ((gc = attrp->igsa_gc) != NULL) { 17914 gcgrp = gc->gc_grp; 17915 ASSERT(gcgrp != NULL); 17916 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 17917 sacnt = 1; 17918 } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { 17919 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); 17920 gc = gcgrp->gcgrp_head; 17921 sacnt = gcgrp->gcgrp_count; 17922 } 17923 mutex_exit(&attrp->igsa_lock); 17924 17925 /* do nothing if there's no gc to report */ 17926 if (gc == NULL) { 17927 ASSERT(sacnt == 0); 17928 if (gcgrp != NULL) { 17929 /* we might as well drop the lock now */ 17930 rw_exit(&gcgrp->gcgrp_rwlock); 17931 gcgrp = NULL; 17932 } 17933 attrp = NULL; 17934 } 17935 17936 ASSERT(gc == NULL || (gcgrp != NULL && 17937 RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); 17938 } 17939 ASSERT(sacnt == 0 || gc != NULL); 17940 17941 if (sacnt != 0 && 17942 (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { 17943 kmem_free(re, sizeof (*re)); 17944 rw_exit(&gcgrp->gcgrp_rwlock); 17945 return; 17946 } 17947 17948 /* 17949 * Return all IRE types for route table... 
let caller pick and choose 17950 */ 17951 re->ipv6RouteDest = ire->ire_addr_v6; 17952 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); 17953 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ 17954 re->ipv6RouteIfIndex.o_length = 0; 17955 ipif = ire->ire_ipif; 17956 if (ire->ire_type == IRE_CACHE) { 17957 ill = (ill_t *)ire->ire_stq->q_ptr; 17958 re->ipv6RouteIfIndex.o_length = 17959 ill->ill_name_length == 0 ? 0 : 17960 MIN(OCTET_LENGTH, ill->ill_name_length - 1); 17961 bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, 17962 re->ipv6RouteIfIndex.o_length); 17963 } else if (ipif != NULL) { 17964 (void) ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, 17965 OCTET_LENGTH); 17966 re->ipv6RouteIfIndex.o_length = 17967 mi_strlen(re->ipv6RouteIfIndex.o_bytes); 17968 } 17969 17970 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 17971 17972 mutex_enter(&ire->ire_lock); 17973 gw_addr_v6 = ire->ire_gateway_addr_v6; 17974 mutex_exit(&ire->ire_lock); 17975 17976 if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) 17977 re->ipv6RouteNextHop = ire->ire_src_addr_v6; 17978 else 17979 re->ipv6RouteNextHop = gw_addr_v6; 17980 17981 /* remote(4), local(3), or discard(2) */ 17982 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) 17983 re->ipv6RouteType = 2; 17984 else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) 17985 re->ipv6RouteType = 3; 17986 else 17987 re->ipv6RouteType = 4; 17988 17989 re->ipv6RouteProtocol = -1; 17990 re->ipv6RoutePolicy = 0; 17991 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time; 17992 re->ipv6RouteNextHopRDI = 0; 17993 re->ipv6RouteWeight = 0; 17994 re->ipv6RouteMetric = 0; 17995 re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; 17996 re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; 17997 re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; 17998 re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; 17999 re->ipv6RouteInfo.re_ire_type = ire->ire_type; 18000 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; 18001 
re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; 18002 re->ipv6RouteInfo.re_ref = ire->ire_refcnt; 18003 re->ipv6RouteInfo.re_flags = ire->ire_flags; 18004 18005 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail, 18006 (char *)re, (int)sizeof (*re))) { 18007 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 18008 (uint_t)sizeof (*re))); 18009 } 18010 18011 for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { 18012 iaeptr->iae_routeidx = ird->ird_idx; 18013 iaeptr->iae_doi = gc->gc_db->gcdb_doi; 18014 iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; 18015 } 18016 18017 if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, 18018 (char *)iae, sacnt * sizeof (*iae))) { 18019 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", 18020 (unsigned)(sacnt * sizeof (*iae)))); 18021 } 18022 18023 /* bump route index for next pass */ 18024 ird->ird_idx++; 18025 18026 kmem_free(re, sizeof (*re)); 18027 if (sacnt != 0) 18028 kmem_free(iae, sacnt * sizeof (*iae)); 18029 18030 if (gcgrp != NULL) 18031 rw_exit(&gcgrp->gcgrp_rwlock); 18032 } 18033 18034 /* 18035 * ndp_walk routine to create ipv6NetToMediaEntryTable 18036 */ 18037 static int 18038 ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) 18039 { 18040 ill_t *ill; 18041 mib2_ipv6NetToMediaEntry_t ntme; 18042 dl_unitdata_req_t *dl; 18043 18044 ill = nce->nce_ill; 18045 ASSERT(ill->ill_isv6); 18046 18047 /* 18048 * Neighbor cache entry attached to IRE with on-link 18049 * destination. 
18050 */ 18051 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; 18052 ntme.ipv6NetToMediaNetAddress = nce->nce_addr; 18053 if ((ill->ill_flags & ILLF_XRESOLV) && 18054 (nce->nce_res_mp != NULL)) { 18055 dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); 18056 ntme.ipv6NetToMediaPhysAddress.o_length = 18057 dl->dl_dest_addr_length; 18058 } else { 18059 ntme.ipv6NetToMediaPhysAddress.o_length = 18060 ill->ill_phys_addr_length; 18061 } 18062 if (nce->nce_res_mp != NULL) { 18063 bcopy((char *)nce->nce_res_mp->b_rptr + 18064 NCE_LL_ADDR_OFFSET(ill), 18065 ntme.ipv6NetToMediaPhysAddress.o_bytes, 18066 ntme.ipv6NetToMediaPhysAddress.o_length); 18067 } else { 18068 bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, 18069 ill->ill_phys_addr_length); 18070 } 18071 /* 18072 * Note: Returns ND_* states. Should be: 18073 * reachable(1), stale(2), delay(3), probe(4), 18074 * invalid(5), unknown(6) 18075 */ 18076 ntme.ipv6NetToMediaState = nce->nce_state; 18077 ntme.ipv6NetToMediaLastUpdated = 0; 18078 18079 /* other(1), dynamic(2), static(3), local(4) */ 18080 if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { 18081 ntme.ipv6NetToMediaType = 4; 18082 } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { 18083 ntme.ipv6NetToMediaType = 1; 18084 } else { 18085 ntme.ipv6NetToMediaType = 2; 18086 } 18087 18088 if (!snmp_append_data2(ird->ird_netmedia.lp_head, 18089 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { 18090 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n", 18091 (uint_t)sizeof (ntme))); 18092 } 18093 return (0); 18094 } 18095 18096 /* 18097 * return (0) if invalid set request, 1 otherwise, including non-tcp requests 18098 */ 18099 /* ARGSUSED */ 18100 int 18101 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 18102 { 18103 switch (level) { 18104 case MIB2_IP: 18105 case MIB2_ICMP: 18106 switch (name) { 18107 default: 18108 break; 18109 } 18110 return (1); 18111 default: 18112 return (1); 18113 } 18114 } 18115 18116 /* 18117 * 
Called before the options are updated to check if this packet will
 * be source routed from here.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 */
static boolean_t
ip_source_routed(ipha_t *ipha)
{
	ipoptp_t	walker;
	uchar_t		*cur;
	uint8_t		type;
	uint8_t		len;
	uint32_t	ptr_off;
	ipaddr_t	dst;
	ire_t		*local_ire;

	/* A header without options cannot carry a source route. */
	if (IS_SIMPLE_IPH(ipha)) {
		ip2dbg(("not source routed\n"));
		return (B_FALSE);
	}

	dst = ipha->ipha_dst;
	type = ipoptp_first(&walker, ipha);
	while (type != IPOPT_EOL) {
		ASSERT((walker.ipoptp_flags & IPOPTP_ERROR) == 0);
		cur = walker.ipoptp_cur;
		len = walker.ipoptp_len;
		ip2dbg(("ip_source_routed: opt %d, len %d\n",
		    type, len));

		if (type == IPOPT_SSRR || type == IPOPT_LSRR) {
			/*
			 * If dst is one of our addresses and there are some
			 * entries left in the source route return (true).
			 */
			local_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
			if (local_ire == NULL) {
				ip2dbg(("ip_source_routed: not next"
				    " source route 0x%x\n",
				    ntohl(dst)));
				return (B_FALSE);
			}
			ire_refrele(local_ire);

			/* The option's pointer field is 1-based. */
			ptr_off = cur[IPOPT_OFFSET];
			ptr_off--;
			if (len < IP_ADDR_LEN ||
			    ptr_off > len - IP_ADDR_LEN) {
				/* End of source route */
				ip1dbg(("ip_source_routed: end of SR\n"));
				return (B_FALSE);
			}
			return (B_TRUE);
		}

		type = ipoptp_next(&walker);
	}

	ip2dbg(("not source routed\n"));
	return (B_FALSE);
}

/*
 * Check if the packet contains any source route.
18179 */ 18180 static boolean_t 18181 ip_source_route_included(ipha_t *ipha) 18182 { 18183 ipoptp_t opts; 18184 uint8_t optval; 18185 18186 if (IS_SIMPLE_IPH(ipha)) 18187 return (B_FALSE); 18188 for (optval = ipoptp_first(&opts, ipha); 18189 optval != IPOPT_EOL; 18190 optval = ipoptp_next(&opts)) { 18191 switch (optval) { 18192 case IPOPT_SSRR: 18193 case IPOPT_LSRR: 18194 return (B_TRUE); 18195 } 18196 } 18197 return (B_FALSE); 18198 } 18199 18200 /* 18201 * Called when the IRE expiration timer fires. 18202 */ 18203 /* ARGSUSED */ 18204 void 18205 ip_trash_timer_expire(void *args) 18206 { 18207 int flush_flag = 0; 18208 18209 /* 18210 * ip_ire_expire_id is protected by ip_trash_timer_lock. 18211 * This lock makes sure that a new invocation of this function 18212 * that occurs due to an almost immediate timer firing will not 18213 * progress beyond this point until the current invocation is done 18214 */ 18215 mutex_enter(&ip_trash_timer_lock); 18216 ip_ire_expire_id = 0; 18217 mutex_exit(&ip_trash_timer_lock); 18218 18219 /* Periodic timer */ 18220 if (ip_ire_arp_time_elapsed >= ip_ire_arp_interval) { 18221 /* 18222 * Remove all IRE_CACHE entries since they might 18223 * contain arp information. 
18224 */ 18225 flush_flag |= FLUSH_ARP_TIME; 18226 ip_ire_arp_time_elapsed = 0; 18227 IP_STAT(ip_ire_arp_timer_expired); 18228 } 18229 if (ip_ire_rd_time_elapsed >= ip_ire_redir_interval) { 18230 /* Remove all redirects */ 18231 flush_flag |= FLUSH_REDIRECT_TIME; 18232 ip_ire_rd_time_elapsed = 0; 18233 IP_STAT(ip_ire_redirect_timer_expired); 18234 } 18235 if (ip_ire_pmtu_time_elapsed >= ip_ire_pathmtu_interval) { 18236 /* Increase path mtu */ 18237 flush_flag |= FLUSH_MTU_TIME; 18238 ip_ire_pmtu_time_elapsed = 0; 18239 IP_STAT(ip_ire_pmtu_timer_expired); 18240 } 18241 if (flush_flag != 0) { 18242 /* Walk all IPv4 IRE's and update them */ 18243 ire_walk_v4(ire_expire, (char *)(uintptr_t)flush_flag, 18244 ALL_ZONES); 18245 } 18246 if (flush_flag & FLUSH_MTU_TIME) { 18247 /* 18248 * Walk all IPv6 IRE's and update them 18249 * Note that ARP and redirect timers are not 18250 * needed since NUD handles stale entries. 18251 */ 18252 flush_flag = FLUSH_MTU_TIME; 18253 ire_walk_v6(ire_expire, (char *)(uintptr_t)flush_flag, 18254 ALL_ZONES); 18255 } 18256 18257 ip_ire_arp_time_elapsed += ip_timer_interval; 18258 ip_ire_rd_time_elapsed += ip_timer_interval; 18259 ip_ire_pmtu_time_elapsed += ip_timer_interval; 18260 18261 /* 18262 * Hold the lock to serialize timeout calls and prevent 18263 * stale values in ip_ire_expire_id. Otherwise it is possible 18264 * for the timer to fire and a new invocation of this function 18265 * to start before the return value of timeout has been stored 18266 * in ip_ire_expire_id by the current invocation. 18267 */ 18268 mutex_enter(&ip_trash_timer_lock); 18269 ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL, 18270 MSEC_TO_TICK(ip_timer_interval)); 18271 mutex_exit(&ip_trash_timer_lock); 18272 } 18273 18274 /* 18275 * Called by the memory allocator subsystem directly, when the system 18276 * is running low on memory. 
18277 */ 18278 /* ARGSUSED */ 18279 void 18280 ip_trash_ire_reclaim(void *args) 18281 { 18282 ire_cache_count_t icc; 18283 ire_cache_reclaim_t icr; 18284 ncc_cache_count_t ncc; 18285 nce_cache_reclaim_t ncr; 18286 uint_t delete_cnt; 18287 /* 18288 * Memory reclaim call back. 18289 * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. 18290 * Then, with a target of freeing 1/Nth of IRE_CACHE 18291 * entries, determine what fraction to free for 18292 * each category of IRE_CACHE entries giving absolute priority 18293 * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu 18294 * entry will be freed unless all offlink entries are freed). 18295 */ 18296 icc.icc_total = 0; 18297 icc.icc_unused = 0; 18298 icc.icc_offlink = 0; 18299 icc.icc_pmtu = 0; 18300 icc.icc_onlink = 0; 18301 ire_walk(ire_cache_count, (char *)&icc); 18302 18303 /* 18304 * Free NCEs for IPv6 like the onlink ires. 18305 */ 18306 ncc.ncc_total = 0; 18307 ncc.ncc_host = 0; 18308 ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc); 18309 18310 ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + 18311 icc.icc_pmtu + icc.icc_onlink); 18312 delete_cnt = icc.icc_total/ip_ire_reclaim_fraction; 18313 IP_STAT(ip_trash_ire_reclaim_calls); 18314 if (delete_cnt == 0) 18315 return; 18316 IP_STAT(ip_trash_ire_reclaim_success); 18317 /* Always delete all unused offlink entries */ 18318 icr.icr_unused = 1; 18319 if (delete_cnt <= icc.icc_unused) { 18320 /* 18321 * Only need to free unused entries. In other words, 18322 * there are enough unused entries to free to meet our 18323 * target number of freed ire cache entries. 18324 */ 18325 icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; 18326 ncr.ncr_host = 0; 18327 } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { 18328 /* 18329 * Only need to free unused entries, plus a fraction of offlink 18330 * entries. It follows from the first if statement that 18331 * icc_offlink is non-zero, and that delete_cnt != icc_unused. 
18332 */ 18333 delete_cnt -= icc.icc_unused; 18334 /* Round up # deleted by truncating fraction */ 18335 icr.icr_offlink = icc.icc_offlink / delete_cnt; 18336 icr.icr_pmtu = icr.icr_onlink = 0; 18337 ncr.ncr_host = 0; 18338 } else if (delete_cnt <= 18339 icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { 18340 /* 18341 * Free all unused and offlink entries, plus a fraction of 18342 * pmtu entries. It follows from the previous if statement 18343 * that icc_pmtu is non-zero, and that 18344 * delete_cnt != icc_unused + icc_offlink. 18345 */ 18346 icr.icr_offlink = 1; 18347 delete_cnt -= icc.icc_unused + icc.icc_offlink; 18348 /* Round up # deleted by truncating fraction */ 18349 icr.icr_pmtu = icc.icc_pmtu / delete_cnt; 18350 icr.icr_onlink = 0; 18351 ncr.ncr_host = 0; 18352 } else { 18353 /* 18354 * Free all unused, offlink, and pmtu entries, plus a fraction 18355 * of onlink entries. If we're here, then we know that 18356 * icc_onlink is non-zero, and that 18357 * delete_cnt != icc_unused + icc_offlink + icc_pmtu. 
18358 */ 18359 icr.icr_offlink = icr.icr_pmtu = 1; 18360 delete_cnt -= icc.icc_unused + icc.icc_offlink + 18361 icc.icc_pmtu; 18362 /* Round up # deleted by truncating fraction */ 18363 icr.icr_onlink = icc.icc_onlink / delete_cnt; 18364 /* Using the same delete fraction as for onlink IREs */ 18365 ncr.ncr_host = ncc.ncc_host / delete_cnt; 18366 } 18367 #ifdef DEBUG 18368 ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " 18369 "fractions %d/%d/%d/%d\n", 18370 icc.icc_total/ip_ire_reclaim_fraction, icc.icc_total, 18371 icc.icc_unused, icc.icc_offlink, 18372 icc.icc_pmtu, icc.icc_onlink, 18373 icr.icr_unused, icr.icr_offlink, 18374 icr.icr_pmtu, icr.icr_onlink)); 18375 #endif 18376 ire_walk(ire_cache_reclaim, (char *)&icr); 18377 if (ncr.ncr_host != 0) 18378 ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, 18379 (uchar_t *)&ncr); 18380 #ifdef DEBUG 18381 icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; 18382 icc.icc_pmtu = 0; icc.icc_onlink = 0; 18383 ire_walk(ire_cache_count, (char *)&icc); 18384 ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", 18385 icc.icc_total, icc.icc_unused, icc.icc_offlink, 18386 icc.icc_pmtu, icc.icc_onlink)); 18387 #endif 18388 } 18389 18390 /* 18391 * ip_unbind is called when a copy of an unbind request is received from the 18392 * upper level protocol. We remove this conn from any fanout hash list it is 18393 * on, and zero out the bind information. No reply is expected up above. 
18394 */ 18395 mblk_t * 18396 ip_unbind(queue_t *q, mblk_t *mp) 18397 { 18398 conn_t *connp = Q_TO_CONN(q); 18399 18400 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 18401 18402 if (is_system_labeled() && connp->conn_anon_port) { 18403 (void) tsol_mlp_anon(crgetzone(connp->conn_cred), 18404 connp->conn_mlp_type, connp->conn_ulp, 18405 ntohs(connp->conn_lport), B_FALSE); 18406 connp->conn_anon_port = 0; 18407 } 18408 connp->conn_mlp_type = mlptSingle; 18409 18410 ipcl_hash_remove(connp); 18411 18412 ASSERT(mp->b_cont == NULL); 18413 /* 18414 * Convert mp into a T_OK_ACK 18415 */ 18416 mp = mi_tpi_ok_ack_alloc(mp); 18417 18418 /* 18419 * should not happen in practice... T_OK_ACK is smaller than the 18420 * original message. 18421 */ 18422 if (mp == NULL) 18423 return (NULL); 18424 18425 /* 18426 * Don't bzero the ports if its TCP since TCP still needs the 18427 * lport to remove it from its own bind hash. TCP will do the 18428 * cleanup. 18429 */ 18430 if (!IPCL_IS_TCP(connp)) 18431 bzero(&connp->u_port, sizeof (connp->u_port)); 18432 18433 return (mp); 18434 } 18435 18436 /* 18437 * Write side put procedure. Outbound data, IOCTLs, responses from 18438 * resolvers, etc, come down through here. 18439 */ 18440 void 18441 ip_output(void *arg, mblk_t *mp, void *arg2, int caller) 18442 { 18443 conn_t *connp = NULL; 18444 queue_t *q = (queue_t *)arg2; 18445 ipha_t *ipha; 18446 #define rptr ((uchar_t *)ipha) 18447 ire_t *ire = NULL; 18448 ire_t *sctp_ire = NULL; 18449 uint32_t v_hlen_tos_len; 18450 ipaddr_t dst; 18451 mblk_t *first_mp = NULL; 18452 boolean_t mctl_present; 18453 ipsec_out_t *io; 18454 int match_flags; 18455 ill_t *attach_ill = NULL; 18456 /* Bind to IPIF_NOFAILOVER ill etc. */ 18457 ill_t *xmit_ill = NULL; /* IP_XMIT_IF etc. 
*/ 18458 ipif_t *dst_ipif; 18459 boolean_t multirt_need_resolve = B_FALSE; 18460 mblk_t *copy_mp = NULL; 18461 int err; 18462 zoneid_t zoneid; 18463 int adjust; 18464 uint16_t iplen; 18465 boolean_t need_decref = B_FALSE; 18466 boolean_t ignore_dontroute = B_FALSE; 18467 boolean_t ignore_nexthop = B_FALSE; 18468 boolean_t ip_nexthop = B_FALSE; 18469 ipaddr_t nexthop_addr; 18470 18471 #ifdef _BIG_ENDIAN 18472 #define V_HLEN (v_hlen_tos_len >> 24) 18473 #else 18474 #define V_HLEN (v_hlen_tos_len & 0xFF) 18475 #endif 18476 18477 TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, 18478 "ip_wput_start: q %p", q); 18479 18480 /* 18481 * ip_wput fast path 18482 */ 18483 18484 /* is packet from ARP ? */ 18485 if (q->q_next != NULL) 18486 goto qnext; 18487 18488 connp = (conn_t *)arg; 18489 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 18490 18491 /* is queue flow controlled? */ 18492 if ((q->q_first != NULL || connp->conn_draining) && 18493 (caller == IP_WPUT)) { 18494 ASSERT(!need_decref); 18495 (void) putq(q, mp); 18496 return; 18497 } 18498 18499 /* Multidata transmit? */ 18500 if (DB_TYPE(mp) == M_MULTIDATA) { 18501 /* 18502 * We should never get here, since all Multidata messages 18503 * originating from tcp should have been directed over to 18504 * tcp_multisend() in the first place. 
18505 */ 18506 BUMP_MIB(&ip_mib, ipOutDiscards); 18507 freemsg(mp); 18508 return; 18509 } else if (DB_TYPE(mp) != M_DATA) 18510 goto notdata; 18511 18512 if (mp->b_flag & MSGHASREF) { 18513 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18514 mp->b_flag &= ~MSGHASREF; 18515 SCTP_EXTRACT_IPINFO(mp, sctp_ire); 18516 need_decref = B_TRUE; 18517 } 18518 ipha = (ipha_t *)mp->b_rptr; 18519 18520 /* is IP header non-aligned or mblk smaller than basic IP header */ 18521 #ifndef SAFETY_BEFORE_SPEED 18522 if (!OK_32PTR(rptr) || 18523 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) 18524 goto hdrtoosmall; 18525 #endif 18526 18527 ASSERT(OK_32PTR(ipha)); 18528 18529 /* 18530 * This function assumes that mp points to an IPv4 packet. If it's the 18531 * wrong version, we'll catch it again in ip_output_v6. 18532 * 18533 * Note that this is *only* locally-generated output here, and never 18534 * forwarded data, and that we need to deal only with transports that 18535 * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to 18536 * label.) 18537 */ 18538 if (is_system_labeled() && 18539 (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && 18540 !connp->conn_ulp_labeled) { 18541 err = tsol_check_label(BEST_CRED(mp, connp), &mp, &adjust, 18542 connp->conn_mac_exempt); 18543 ipha = (ipha_t *)mp->b_rptr; 18544 if (err != 0) { 18545 first_mp = mp; 18546 if (err == EINVAL) 18547 goto icmp_parameter_problem; 18548 ip2dbg(("ip_wput: label check failed (%d)\n", err)); 18549 goto drop_pkt; 18550 } 18551 iplen = ntohs(ipha->ipha_length) + adjust; 18552 ipha->ipha_length = htons(iplen); 18553 } 18554 18555 /* 18556 * If there is a policy, try to attach an ipsec_out in 18557 * the front. At the end, first_mp either points to a 18558 * M_DATA message or IPSEC_OUT message linked to a 18559 * M_DATA message. We have to do it now as we might 18560 * lose the "conn" if we go through ip_newroute. 
18561 */ 18562 if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { 18563 if (((mp = ipsec_attach_ipsec_out(mp, connp, NULL, 18564 ipha->ipha_protocol)) == NULL)) { 18565 if (need_decref) 18566 CONN_DEC_REF(connp); 18567 return; 18568 } else { 18569 ASSERT(mp->b_datap->db_type == M_CTL); 18570 first_mp = mp; 18571 mp = mp->b_cont; 18572 mctl_present = B_TRUE; 18573 } 18574 } else { 18575 first_mp = mp; 18576 mctl_present = B_FALSE; 18577 } 18578 18579 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 18580 18581 /* is wrong version or IP options present */ 18582 if (V_HLEN != IP_SIMPLE_HDR_VERSION) 18583 goto version_hdrlen_check; 18584 dst = ipha->ipha_dst; 18585 18586 if (connp->conn_nofailover_ill != NULL) { 18587 attach_ill = conn_get_held_ill(connp, 18588 &connp->conn_nofailover_ill, &err); 18589 if (err == ILL_LOOKUP_FAILED) { 18590 if (need_decref) 18591 CONN_DEC_REF(connp); 18592 freemsg(first_mp); 18593 return; 18594 } 18595 } 18596 18597 /* is packet multicast? */ 18598 if (CLASSD(dst)) 18599 goto multicast; 18600 18601 if ((connp->conn_dontroute) || (connp->conn_xmit_if_ill != NULL) || 18602 (connp->conn_nexthop_set)) { 18603 /* 18604 * If the destination is a broadcast or a loopback 18605 * address, SO_DONTROUTE, IP_XMIT_IF and IP_NEXTHOP go 18606 * through the standard path. But in the case of local 18607 * destination only SO_DONTROUTE and IP_NEXTHOP go through 18608 * the standard path not IP_XMIT_IF. 18609 */ 18610 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18611 if ((ire == NULL) || ((ire->ire_type != IRE_BROADCAST) && 18612 (ire->ire_type != IRE_LOOPBACK))) { 18613 if ((connp->conn_dontroute || 18614 connp->conn_nexthop_set) && (ire != NULL) && 18615 (ire->ire_type == IRE_LOCAL)) 18616 goto standard_path; 18617 18618 if (ire != NULL) { 18619 ire_refrele(ire); 18620 /* No more access to ire */ 18621 ire = NULL; 18622 } 18623 /* 18624 * bypass routing checks and go directly to 18625 * interface. 
18626 */ 18627 if (connp->conn_dontroute) { 18628 goto dontroute; 18629 } else if (connp->conn_nexthop_set) { 18630 ip_nexthop = B_TRUE; 18631 nexthop_addr = connp->conn_nexthop_v4; 18632 goto send_from_ill; 18633 } 18634 18635 /* 18636 * If IP_XMIT_IF socket option is set, 18637 * then we allow unicast and multicast 18638 * packets to go through the ill. It is 18639 * quite possible that the destination 18640 * is not in the ire cache table and we 18641 * do not want to go to ip_newroute() 18642 * instead we call ip_newroute_ipif. 18643 */ 18644 xmit_ill = conn_get_held_ill(connp, 18645 &connp->conn_xmit_if_ill, &err); 18646 if (err == ILL_LOOKUP_FAILED) { 18647 if (attach_ill != NULL) 18648 ill_refrele(attach_ill); 18649 if (need_decref) 18650 CONN_DEC_REF(connp); 18651 freemsg(first_mp); 18652 return; 18653 } 18654 goto send_from_ill; 18655 } 18656 standard_path: 18657 /* Must be a broadcast, a loopback or a local ire */ 18658 if (ire != NULL) { 18659 ire_refrele(ire); 18660 /* No more access to ire */ 18661 ire = NULL; 18662 } 18663 } 18664 18665 if (attach_ill != NULL) 18666 goto send_from_ill; 18667 18668 /* 18669 * We cache IRE_CACHEs to avoid lookups. We don't do 18670 * this for the tcp global queue and listen end point 18671 * as it does not really have a real destination to 18672 * talk to. This is also true for SCTP. 18673 */ 18674 if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && 18675 !connp->conn_fully_bound) { 18676 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18677 if (ire == NULL) 18678 goto noirefound; 18679 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18680 "ip_wput_end: q %p (%S)", q, "end"); 18681 18682 /* 18683 * Check if the ire has the RTF_MULTIRT flag, inherited 18684 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18685 */ 18686 if (ire->ire_flags & RTF_MULTIRT) { 18687 18688 /* 18689 * Force the TTL of multirouted packets if required. 18690 * The TTL of such packets is bounded by the 18691 * ip_multirt_ttl ndd variable. 
18692 */ 18693 if ((ip_multirt_ttl > 0) && 18694 (ipha->ipha_ttl > ip_multirt_ttl)) { 18695 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18696 "(was %d), dst 0x%08x\n", 18697 ip_multirt_ttl, ipha->ipha_ttl, 18698 ntohl(ire->ire_addr))); 18699 ipha->ipha_ttl = ip_multirt_ttl; 18700 } 18701 /* 18702 * We look at this point if there are pending 18703 * unresolved routes. ire_multirt_resolvable() 18704 * checks in O(n) that all IRE_OFFSUBNET ire 18705 * entries for the packet's destination and 18706 * flagged RTF_MULTIRT are currently resolved. 18707 * If some remain unresolved, we make a copy 18708 * of the current message. It will be used 18709 * to initiate additional route resolutions. 18710 */ 18711 multirt_need_resolve = 18712 ire_multirt_need_resolve(ire->ire_addr, 18713 MBLK_GETLABEL(first_mp)); 18714 ip2dbg(("ip_wput[TCP]: ire %p, " 18715 "multirt_need_resolve %d, first_mp %p\n", 18716 (void *)ire, multirt_need_resolve, 18717 (void *)first_mp)); 18718 if (multirt_need_resolve) { 18719 copy_mp = copymsg(first_mp); 18720 if (copy_mp != NULL) { 18721 MULTIRT_DEBUG_TAG(copy_mp); 18722 } 18723 } 18724 } 18725 18726 ip_wput_ire(q, first_mp, ire, connp, caller); 18727 18728 /* 18729 * Try to resolve another multiroute if 18730 * ire_multirt_need_resolve() deemed it necessary. 18731 */ 18732 if (copy_mp != NULL) { 18733 ip_newroute(q, copy_mp, dst, NULL, connp); 18734 } 18735 if (need_decref) 18736 CONN_DEC_REF(connp); 18737 return; 18738 } 18739 18740 /* 18741 * Access to conn_ire_cache. (protected by conn_lock) 18742 * 18743 * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab 18744 * the ire bucket lock here to check for CONDEMNED as it is okay to 18745 * send a packet or two with the IRE_CACHE that is going away. 18746 * Access to the ire requires an ire refhold on the ire prior to 18747 * its use since an interface unplumb thread may delete the cached 18748 * ire and release the refhold at any time. 
18749 * 18750 * Caching an ire in the conn_ire_cache 18751 * 18752 * o Caching an ire pointer in the conn requires a strict check for 18753 * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant 18754 * ires before cleaning up the conns. So the caching of an ire pointer 18755 * in the conn is done after making sure under the bucket lock that the 18756 * ire has not yet been marked CONDEMNED. Otherwise we will end up 18757 * caching an ire after the unplumb thread has cleaned up the conn. 18758 * If the conn does not send a packet subsequently the unplumb thread 18759 * will be hanging waiting for the ire count to drop to zero. 18760 * 18761 * o We also need to atomically test for a null conn_ire_cache and 18762 * set the conn_ire_cache under the the protection of the conn_lock 18763 * to avoid races among concurrent threads trying to simultaneously 18764 * cache an ire in the conn_ire_cache. 18765 */ 18766 mutex_enter(&connp->conn_lock); 18767 ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; 18768 18769 if (ire != NULL && ire->ire_addr == dst && 18770 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18771 18772 IRE_REFHOLD(ire); 18773 mutex_exit(&connp->conn_lock); 18774 18775 } else { 18776 boolean_t cached = B_FALSE; 18777 connp->conn_ire_cache = NULL; 18778 mutex_exit(&connp->conn_lock); 18779 /* Release the old ire */ 18780 if (ire != NULL && sctp_ire == NULL) 18781 IRE_REFRELE_NOTR(ire); 18782 18783 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 18784 if (ire == NULL) 18785 goto noirefound; 18786 IRE_REFHOLD_NOTR(ire); 18787 18788 mutex_enter(&connp->conn_lock); 18789 if (!(connp->conn_state_flags & CONN_CLOSING) && 18790 connp->conn_ire_cache == NULL) { 18791 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18792 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18793 connp->conn_ire_cache = ire; 18794 cached = B_TRUE; 18795 } 18796 rw_exit(&ire->ire_bucket->irb_lock); 18797 } 18798 mutex_exit(&connp->conn_lock); 18799 18800 /* 18801 * 
We can continue to use the ire but since it was 18802 * not cached, we should drop the extra reference. 18803 */ 18804 if (!cached) 18805 IRE_REFRELE_NOTR(ire); 18806 } 18807 18808 18809 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18810 "ip_wput_end: q %p (%S)", q, "end"); 18811 18812 /* 18813 * Check if the ire has the RTF_MULTIRT flag, inherited 18814 * from an IRE_OFFSUBNET ire entry in ip_newroute(). 18815 */ 18816 if (ire->ire_flags & RTF_MULTIRT) { 18817 18818 /* 18819 * Force the TTL of multirouted packets if required. 18820 * The TTL of such packets is bounded by the 18821 * ip_multirt_ttl ndd variable. 18822 */ 18823 if ((ip_multirt_ttl > 0) && 18824 (ipha->ipha_ttl > ip_multirt_ttl)) { 18825 ip2dbg(("ip_wput: forcing multirt TTL to %d " 18826 "(was %d), dst 0x%08x\n", 18827 ip_multirt_ttl, ipha->ipha_ttl, 18828 ntohl(ire->ire_addr))); 18829 ipha->ipha_ttl = ip_multirt_ttl; 18830 } 18831 18832 /* 18833 * At this point, we check to see if there are any pending 18834 * unresolved routes. ire_multirt_resolvable() 18835 * checks in O(n) that all IRE_OFFSUBNET ire 18836 * entries for the packet's destination and 18837 * flagged RTF_MULTIRT are currently resolved. 18838 * If some remain unresolved, we make a copy 18839 * of the current message. It will be used 18840 * to initiate additional route resolutions. 
18841 */ 18842 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 18843 MBLK_GETLABEL(first_mp)); 18844 ip2dbg(("ip_wput[not TCP]: ire %p, " 18845 "multirt_need_resolve %d, first_mp %p\n", 18846 (void *)ire, multirt_need_resolve, (void *)first_mp)); 18847 if (multirt_need_resolve) { 18848 copy_mp = copymsg(first_mp); 18849 if (copy_mp != NULL) { 18850 MULTIRT_DEBUG_TAG(copy_mp); 18851 } 18852 } 18853 } 18854 18855 ip_wput_ire(q, first_mp, ire, connp, caller); 18856 18857 /* 18858 * Try to resolve another multiroute if 18859 * ire_multirt_resolvable() deemed it necessary 18860 */ 18861 if (copy_mp != NULL) { 18862 ip_newroute(q, copy_mp, dst, NULL, connp); 18863 } 18864 if (need_decref) 18865 CONN_DEC_REF(connp); 18866 return; 18867 18868 qnext: 18869 /* 18870 * Upper Level Protocols pass down complete IP datagrams 18871 * as M_DATA messages. Everything else is a sideshow. 18872 * 18873 * 1) We could be re-entering ip_wput because of ip_neworute 18874 * in which case we could have a IPSEC_OUT message. We 18875 * need to pass through ip_wput like other datagrams and 18876 * hence cannot branch to ip_wput_nondata. 18877 * 18878 * 2) ARP, AH, ESP, and other clients who are on the module 18879 * instance of IP stream, give us something to deal with. 18880 * We will handle AH and ESP here and rest in ip_wput_nondata. 18881 * 18882 * 3) ICMP replies also could come here. 18883 */ 18884 if (DB_TYPE(mp) != M_DATA) { 18885 notdata: 18886 if (DB_TYPE(mp) == M_CTL) { 18887 /* 18888 * M_CTL messages are used by ARP, AH and ESP to 18889 * communicate with IP. We deal with IPSEC_IN and 18890 * IPSEC_OUT here. ip_wput_nondata handles other 18891 * cases. 
18892 */ 18893 ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; 18894 if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { 18895 first_mp = mp->b_cont; 18896 first_mp->b_flag &= ~MSGHASREF; 18897 ASSERT(connp->conn_ulp == IPPROTO_SCTP); 18898 SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); 18899 CONN_DEC_REF(connp); 18900 connp = NULL; 18901 } 18902 if (ii->ipsec_info_type == IPSEC_IN) { 18903 /* 18904 * Either this message goes back to 18905 * IPSEC for further processing or to 18906 * ULP after policy checks. 18907 */ 18908 ip_fanout_proto_again(mp, NULL, NULL, NULL); 18909 return; 18910 } else if (ii->ipsec_info_type == IPSEC_OUT) { 18911 io = (ipsec_out_t *)ii; 18912 if (io->ipsec_out_proc_begin) { 18913 /* 18914 * IPSEC processing has already started. 18915 * Complete it. 18916 * IPQoS notes: We don't care what is 18917 * in ipsec_out_ill_index since this 18918 * won't be processed for IPQoS policies 18919 * in ipsec_out_process. 18920 */ 18921 ipsec_out_process(q, mp, NULL, 18922 io->ipsec_out_ill_index); 18923 return; 18924 } else { 18925 connp = (q->q_next != NULL) ? 18926 NULL : Q_TO_CONN(q); 18927 first_mp = mp; 18928 mp = mp->b_cont; 18929 mctl_present = B_TRUE; 18930 } 18931 zoneid = io->ipsec_out_zoneid; 18932 ASSERT(zoneid != ALL_ZONES); 18933 } else if (ii->ipsec_info_type == IPSEC_CTL) { 18934 /* 18935 * It's an IPsec control message requesting 18936 * an SADB update to be sent to the IPsec 18937 * hardware acceleration capable ills. 18938 */ 18939 ipsec_ctl_t *ipsec_ctl = 18940 (ipsec_ctl_t *)mp->b_rptr; 18941 ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; 18942 uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; 18943 mblk_t *cmp = mp->b_cont; 18944 18945 ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); 18946 ASSERT(cmp != NULL); 18947 18948 freeb(mp); 18949 ill_ipsec_capab_send_all(satype, cmp, sa); 18950 return; 18951 } else { 18952 /* 18953 * This must be ARP or special TSOL signaling. 
18954 */ 18955 ip_wput_nondata(NULL, q, mp, NULL); 18956 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18957 "ip_wput_end: q %p (%S)", q, "nondata"); 18958 return; 18959 } 18960 } else { 18961 /* 18962 * This must be non-(ARP/AH/ESP) messages. 18963 */ 18964 ASSERT(!need_decref); 18965 ip_wput_nondata(NULL, q, mp, NULL); 18966 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 18967 "ip_wput_end: q %p (%S)", q, "nondata"); 18968 return; 18969 } 18970 } else { 18971 first_mp = mp; 18972 mctl_present = B_FALSE; 18973 } 18974 18975 ASSERT(first_mp != NULL); 18976 /* 18977 * ICMP echo replies attach an ipsec_out and set ipsec_out_attach_if 18978 * to make sure that this packet goes out on the same interface it 18979 * came in. We handle that here. 18980 */ 18981 if (mctl_present) { 18982 uint_t ifindex; 18983 18984 io = (ipsec_out_t *)first_mp->b_rptr; 18985 if (io->ipsec_out_attach_if || 18986 io->ipsec_out_xmit_if || 18987 io->ipsec_out_ip_nexthop) { 18988 ill_t *ill; 18989 18990 /* 18991 * We may have lost the conn context if we are 18992 * coming here from ip_newroute(). Copy the 18993 * nexthop information. 18994 */ 18995 if (io->ipsec_out_ip_nexthop) { 18996 ip_nexthop = B_TRUE; 18997 nexthop_addr = io->ipsec_out_nexthop_addr; 18998 18999 ipha = (ipha_t *)mp->b_rptr; 19000 dst = ipha->ipha_dst; 19001 goto send_from_ill; 19002 } else { 19003 ASSERT(io->ipsec_out_ill_index != 0); 19004 ifindex = io->ipsec_out_ill_index; 19005 ill = ill_lookup_on_ifindex(ifindex, B_FALSE, 19006 NULL, NULL, NULL, NULL); 19007 /* 19008 * ipsec_out_xmit_if bit is used to tell 19009 * ip_wput to use the ill to send outgoing data 19010 * as we have no conn when data comes from ICMP 19011 * error msg routines. Currently this feature is 19012 * only used by ip_mrtun_forward routine. 
19013 */ 19014 if (io->ipsec_out_xmit_if) { 19015 xmit_ill = ill; 19016 if (xmit_ill == NULL) { 19017 ip1dbg(("ip_output:bad ifindex " 19018 "for xmit_ill %d\n", 19019 ifindex)); 19020 freemsg(first_mp); 19021 BUMP_MIB(&ip_mib, 19022 ipOutDiscards); 19023 ASSERT(!need_decref); 19024 return; 19025 } 19026 /* Free up the ipsec_out_t mblk */ 19027 ASSERT(first_mp->b_cont == mp); 19028 first_mp->b_cont = NULL; 19029 freeb(first_mp); 19030 /* Just send the IP header+ICMP+data */ 19031 first_mp = mp; 19032 ipha = (ipha_t *)mp->b_rptr; 19033 dst = ipha->ipha_dst; 19034 goto send_from_ill; 19035 } else { 19036 attach_ill = ill; 19037 } 19038 19039 if (attach_ill == NULL) { 19040 ASSERT(xmit_ill == NULL); 19041 ip1dbg(("ip_output: bad ifindex for " 19042 "(BIND TO IPIF_NOFAILOVER) %d\n", 19043 ifindex)); 19044 freemsg(first_mp); 19045 BUMP_MIB(&ip_mib, ipOutDiscards); 19046 ASSERT(!need_decref); 19047 return; 19048 } 19049 } 19050 } 19051 } 19052 19053 ASSERT(xmit_ill == NULL); 19054 19055 /* We have a complete IP datagram heading outbound. */ 19056 ipha = (ipha_t *)mp->b_rptr; 19057 19058 #ifndef SPEED_BEFORE_SAFETY 19059 /* 19060 * Make sure we have a full-word aligned message and that at least 19061 * a simple IP header is accessible in the first message. If not, 19062 * try a pullup. 19063 */ 19064 if (!OK_32PTR(rptr) || 19065 (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) { 19066 hdrtoosmall: 19067 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 19068 BUMP_MIB(&ip_mib, ipOutDiscards); 19069 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19070 "ip_wput_end: q %p (%S)", q, "pullupfailed"); 19071 if (first_mp == NULL) 19072 first_mp = mp; 19073 goto drop_pkt; 19074 } 19075 19076 /* This function assumes that mp points to an IPv4 packet. 
*/ 19077 if (is_system_labeled() && q->q_next == NULL && 19078 (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && 19079 !connp->conn_ulp_labeled) { 19080 err = tsol_check_label(BEST_CRED(mp, connp), &mp, 19081 &adjust, connp->conn_mac_exempt); 19082 ipha = (ipha_t *)mp->b_rptr; 19083 if (first_mp != NULL) 19084 first_mp->b_cont = mp; 19085 if (err != 0) { 19086 if (first_mp == NULL) 19087 first_mp = mp; 19088 if (err == EINVAL) 19089 goto icmp_parameter_problem; 19090 ip2dbg(("ip_wput: label check failed (%d)\n", 19091 err)); 19092 goto drop_pkt; 19093 } 19094 iplen = ntohs(ipha->ipha_length) + adjust; 19095 ipha->ipha_length = htons(iplen); 19096 } 19097 19098 ipha = (ipha_t *)mp->b_rptr; 19099 if (first_mp == NULL) { 19100 ASSERT(attach_ill == NULL && xmit_ill == NULL); 19101 /* 19102 * If we got here because of "goto hdrtoosmall" 19103 * We need to attach a IPSEC_OUT. 19104 */ 19105 if (connp->conn_out_enforce_policy) { 19106 if (((mp = ipsec_attach_ipsec_out(mp, connp, 19107 NULL, ipha->ipha_protocol)) == NULL)) { 19108 if (need_decref) 19109 CONN_DEC_REF(connp); 19110 return; 19111 } else { 19112 ASSERT(mp->b_datap->db_type == M_CTL); 19113 first_mp = mp; 19114 mp = mp->b_cont; 19115 mctl_present = B_TRUE; 19116 } 19117 } else { 19118 first_mp = mp; 19119 mctl_present = B_FALSE; 19120 } 19121 } 19122 } 19123 #endif 19124 19125 /* Most of the code below is written for speed, not readability */ 19126 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 19127 19128 /* 19129 * If ip_newroute() fails, we're going to need a full 19130 * header for the icmp wraparound. 19131 */ 19132 if (V_HLEN != IP_SIMPLE_HDR_VERSION) { 19133 uint_t v_hlen; 19134 version_hdrlen_check: 19135 ASSERT(first_mp != NULL); 19136 v_hlen = V_HLEN; 19137 /* 19138 * siphon off IPv6 packets coming down from transport 19139 * layer modules here. 
19140 * Note: high-order bit carries NUD reachability confirmation 19141 */ 19142 if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { 19143 /* 19144 * XXX implement a IPv4 and IPv6 packet counter per 19145 * conn and switch when ratio exceeds e.g. 10:1 19146 */ 19147 #ifdef notyet 19148 if (q->q_next == NULL) /* Avoid ill queue */ 19149 ip_setqinfo(RD(q), B_TRUE, B_TRUE); 19150 #endif 19151 BUMP_MIB(&ip_mib, ipOutIPv6); 19152 ASSERT(xmit_ill == NULL); 19153 if (attach_ill != NULL) 19154 ill_refrele(attach_ill); 19155 if (need_decref) 19156 mp->b_flag |= MSGHASREF; 19157 (void) ip_output_v6(connp, first_mp, q, caller); 19158 return; 19159 } 19160 19161 if ((v_hlen >> 4) != IP_VERSION) { 19162 BUMP_MIB(&ip_mib, ipOutDiscards); 19163 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19164 "ip_wput_end: q %p (%S)", q, "badvers"); 19165 goto drop_pkt; 19166 } 19167 /* 19168 * Is the header length at least 20 bytes? 19169 * 19170 * Are there enough bytes accessible in the header? If 19171 * not, try a pullup. 19172 */ 19173 v_hlen &= 0xF; 19174 v_hlen <<= 2; 19175 if (v_hlen < IP_SIMPLE_HDR_LENGTH) { 19176 BUMP_MIB(&ip_mib, ipOutDiscards); 19177 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19178 "ip_wput_end: q %p (%S)", q, "badlen"); 19179 goto drop_pkt; 19180 } 19181 if (v_hlen > (mp->b_wptr - rptr)) { 19182 if (!pullupmsg(mp, v_hlen)) { 19183 BUMP_MIB(&ip_mib, ipOutDiscards); 19184 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19185 "ip_wput_end: q %p (%S)", q, "badpullup2"); 19186 goto drop_pkt; 19187 } 19188 ipha = (ipha_t *)mp->b_rptr; 19189 } 19190 /* 19191 * Move first entry from any source route into ipha_dst and 19192 * verify the options 19193 */ 19194 if (ip_wput_options(q, first_mp, ipha, mctl_present, zoneid)) { 19195 ASSERT(xmit_ill == NULL); 19196 if (attach_ill != NULL) 19197 ill_refrele(attach_ill); 19198 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19199 "ip_wput_end: q %p (%S)", q, "badopts"); 19200 if (need_decref) 19201 CONN_DEC_REF(connp); 19202 return; 19203 } 19204 } 19205 dst = ipha->ipha_dst; 
19206 19207 /* 19208 * Try to get an IRE_CACHE for the destination address. If we can't, 19209 * we have to run the packet through ip_newroute which will take 19210 * the appropriate action to arrange for an IRE_CACHE, such as querying 19211 * a resolver, or assigning a default gateway, etc. 19212 */ 19213 if (CLASSD(dst)) { 19214 ipif_t *ipif; 19215 uint32_t setsrc = 0; 19216 19217 multicast: 19218 ASSERT(first_mp != NULL); 19219 ASSERT(xmit_ill == NULL); 19220 ip2dbg(("ip_wput: CLASSD\n")); 19221 if (connp == NULL) { 19222 /* 19223 * Use the first good ipif on the ill. 19224 * XXX Should this ever happen? (Appears 19225 * to show up with just ppp and no ethernet due 19226 * to in.rdisc.) 19227 * However, ire_send should be able to 19228 * call ip_wput_ire directly. 19229 * 19230 * XXX Also, this can happen for ICMP and other packets 19231 * with multicast source addresses. Perhaps we should 19232 * fix things so that we drop the packet in question, 19233 * but for now, just run with it. 19234 */ 19235 ill_t *ill = (ill_t *)q->q_ptr; 19236 19237 /* 19238 * Don't honor attach_if for this case. If ill 19239 * is part of the group, ipif could belong to 19240 * any ill and we cannot maintain attach_ill 19241 * and ipif_ill same anymore and the assert 19242 * below would fail. 19243 */ 19244 if (mctl_present) { 19245 io->ipsec_out_ill_index = 0; 19246 io->ipsec_out_attach_if = B_FALSE; 19247 ASSERT(attach_ill != NULL); 19248 ill_refrele(attach_ill); 19249 attach_ill = NULL; 19250 } 19251 19252 ASSERT(attach_ill == NULL); 19253 ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); 19254 if (ipif == NULL) { 19255 if (need_decref) 19256 CONN_DEC_REF(connp); 19257 freemsg(first_mp); 19258 return; 19259 } 19260 ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", 19261 ntohl(dst), ill->ill_name)); 19262 } else { 19263 /* 19264 * If both IP_MULTICAST_IF and IP_XMIT_IF are set, 19265 * IP_XMIT_IF is honoured. 
19266 * Block comment above this function explains the 19267 * locking mechanism used here 19268 */ 19269 xmit_ill = conn_get_held_ill(connp, 19270 &connp->conn_xmit_if_ill, &err); 19271 if (err == ILL_LOOKUP_FAILED) { 19272 ip1dbg(("ip_wput: No ill for IP_XMIT_IF\n")); 19273 goto drop_pkt; 19274 } 19275 if (xmit_ill == NULL) { 19276 ipif = conn_get_held_ipif(connp, 19277 &connp->conn_multicast_ipif, &err); 19278 if (err == IPIF_LOOKUP_FAILED) { 19279 ip1dbg(("ip_wput: No ipif for " 19280 "multicast\n")); 19281 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19282 goto drop_pkt; 19283 } 19284 } 19285 if (xmit_ill != NULL) { 19286 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19287 if (ipif == NULL) { 19288 ip1dbg(("ip_wput: No ipif for " 19289 "IP_XMIT_IF\n")); 19290 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19291 goto drop_pkt; 19292 } 19293 } else if (ipif == NULL || ipif->ipif_isv6) { 19294 /* 19295 * We must do this ipif determination here 19296 * else we could pass through ip_newroute 19297 * and come back here without the conn context. 19298 * 19299 * Note: we do late binding i.e. we bind to 19300 * the interface when the first packet is sent. 19301 * For performance reasons we do not rebind on 19302 * each packet but keep the binding until the 19303 * next IP_MULTICAST_IF option. 19304 * 19305 * conn_multicast_{ipif,ill} are shared between 19306 * IPv4 and IPv6 and AF_INET6 sockets can 19307 * send both IPv4 and IPv6 packets. Hence 19308 * we have to check that "isv6" matches above. 
19309 */ 19310 if (ipif != NULL) 19311 ipif_refrele(ipif); 19312 ipif = ipif_lookup_group(dst, zoneid); 19313 if (ipif == NULL) { 19314 ip1dbg(("ip_wput: No ipif for " 19315 "multicast\n")); 19316 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19317 goto drop_pkt; 19318 } 19319 err = conn_set_held_ipif(connp, 19320 &connp->conn_multicast_ipif, ipif); 19321 if (err == IPIF_LOOKUP_FAILED) { 19322 ipif_refrele(ipif); 19323 ip1dbg(("ip_wput: No ipif for " 19324 "multicast\n")); 19325 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19326 goto drop_pkt; 19327 } 19328 } 19329 } 19330 ASSERT(!ipif->ipif_isv6); 19331 /* 19332 * As we may lose the conn by the time we reach ip_wput_ire, 19333 * we copy conn_multicast_loop and conn_dontroute on to an 19334 * ipsec_out. In case if this datagram goes out secure, 19335 * we need the ill_index also. Copy that also into the 19336 * ipsec_out. 19337 */ 19338 if (mctl_present) { 19339 io = (ipsec_out_t *)first_mp->b_rptr; 19340 ASSERT(first_mp->b_datap->db_type == M_CTL); 19341 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19342 } else { 19343 ASSERT(mp == first_mp); 19344 if ((first_mp = allocb(sizeof (ipsec_info_t), 19345 BPRI_HI)) == NULL) { 19346 ipif_refrele(ipif); 19347 first_mp = mp; 19348 goto drop_pkt; 19349 } 19350 first_mp->b_datap->db_type = M_CTL; 19351 first_mp->b_wptr += sizeof (ipsec_info_t); 19352 /* ipsec_out_secure is B_FALSE now */ 19353 bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); 19354 io = (ipsec_out_t *)first_mp->b_rptr; 19355 io->ipsec_out_type = IPSEC_OUT; 19356 io->ipsec_out_len = sizeof (ipsec_out_t); 19357 io->ipsec_out_use_global_policy = B_TRUE; 19358 first_mp->b_cont = mp; 19359 mctl_present = B_TRUE; 19360 } 19361 if (attach_ill != NULL) { 19362 ASSERT(attach_ill == ipif->ipif_ill); 19363 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19364 19365 /* 19366 * Check if we need an ire that will not be 19367 * looked up by anybody else i.e. HIDDEN. 
19368 */ 19369 if (ill_is_probeonly(attach_ill)) { 19370 match_flags |= MATCH_IRE_MARK_HIDDEN; 19371 } 19372 io->ipsec_out_ill_index = 19373 attach_ill->ill_phyint->phyint_ifindex; 19374 io->ipsec_out_attach_if = B_TRUE; 19375 } else { 19376 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 19377 io->ipsec_out_ill_index = 19378 ipif->ipif_ill->ill_phyint->phyint_ifindex; 19379 } 19380 if (connp != NULL) { 19381 io->ipsec_out_multicast_loop = 19382 connp->conn_multicast_loop; 19383 io->ipsec_out_dontroute = connp->conn_dontroute; 19384 io->ipsec_out_zoneid = connp->conn_zoneid; 19385 } 19386 /* 19387 * If the application uses IP_MULTICAST_IF with 19388 * different logical addresses of the same ILL, we 19389 * need to make sure that the soruce address of 19390 * the packet matches the logical IP address used 19391 * in the option. We do it by initializing ipha_src 19392 * here. This should keep IPSEC also happy as 19393 * when we return from IPSEC processing, we don't 19394 * have to worry about getting the right address on 19395 * the packet. Thus it is sufficient to look for 19396 * IRE_CACHE using MATCH_IRE_ILL rathen than 19397 * MATCH_IRE_IPIF. 19398 * 19399 * NOTE : We need to do it for non-secure case also as 19400 * this might go out secure if there is a global policy 19401 * match in ip_wput_ire. For bind to IPIF_NOFAILOVER 19402 * address, the source should be initialized already and 19403 * hence we won't be initializing here. 19404 * 19405 * As we do not have the ire yet, it is possible that 19406 * we set the source address here and then later discover 19407 * that the ire implies the source address to be assigned 19408 * through the RTF_SETSRC flag. 19409 * In that case, the setsrc variable will remind us 19410 * that overwritting the source address by the one 19411 * of the RTF_SETSRC-flagged ire is allowed. 
19412 */ 19413 if (ipha->ipha_src == INADDR_ANY && 19414 (connp == NULL || !connp->conn_unspec_src)) { 19415 ipha->ipha_src = ipif->ipif_src_addr; 19416 setsrc = RTF_SETSRC; 19417 } 19418 /* 19419 * Find an IRE which matches the destination and the outgoing 19420 * queue (i.e. the outgoing interface.) 19421 * For loopback use a unicast IP address for 19422 * the ire lookup. 19423 */ 19424 if (ipif->ipif_ill->ill_phyint->phyint_flags & 19425 PHYI_LOOPBACK) { 19426 dst = ipif->ipif_lcl_addr; 19427 } 19428 /* 19429 * If IP_XMIT_IF is set, we branch out to ip_newroute_ipif. 19430 * We don't need to lookup ire in ctable as the packet 19431 * needs to be sent to the destination through the specified 19432 * ill irrespective of ires in the cache table. 19433 */ 19434 ire = NULL; 19435 if (xmit_ill == NULL) { 19436 ire = ire_ctable_lookup(dst, 0, 0, ipif, 19437 zoneid, MBLK_GETLABEL(mp), match_flags); 19438 } 19439 19440 /* 19441 * refrele attach_ill as its not needed anymore. 19442 */ 19443 if (attach_ill != NULL) { 19444 ill_refrele(attach_ill); 19445 attach_ill = NULL; 19446 } 19447 19448 if (ire == NULL) { 19449 /* 19450 * Multicast loopback and multicast forwarding is 19451 * done in ip_wput_ire. 19452 * 19453 * Mark this packet to make it be delivered to 19454 * ip_wput_ire after the new ire has been 19455 * created. 19456 * 19457 * The call to ip_newroute_ipif takes into account 19458 * the setsrc reminder. In any case, we take care 19459 * of the RTF_MULTIRT flag. 
19460 */ 19461 mp->b_prev = mp->b_next = NULL; 19462 if (xmit_ill == NULL || 19463 xmit_ill->ill_ipif_up_count > 0) { 19464 ip_newroute_ipif(q, first_mp, ipif, dst, connp, 19465 setsrc | RTF_MULTIRT); 19466 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19467 "ip_wput_end: q %p (%S)", q, "noire"); 19468 } else { 19469 freemsg(first_mp); 19470 } 19471 ipif_refrele(ipif); 19472 if (xmit_ill != NULL) 19473 ill_refrele(xmit_ill); 19474 if (need_decref) 19475 CONN_DEC_REF(connp); 19476 return; 19477 } 19478 19479 ipif_refrele(ipif); 19480 ipif = NULL; 19481 ASSERT(xmit_ill == NULL); 19482 19483 /* 19484 * Honor the RTF_SETSRC flag for multicast packets, 19485 * if allowed by the setsrc reminder. 19486 */ 19487 if ((ire->ire_flags & RTF_SETSRC) && setsrc) { 19488 ipha->ipha_src = ire->ire_src_addr; 19489 } 19490 19491 /* 19492 * Unconditionally force the TTL to 1 for 19493 * multirouted multicast packets: 19494 * multirouted multicast should not cross 19495 * multicast routers. 19496 */ 19497 if (ire->ire_flags & RTF_MULTIRT) { 19498 if (ipha->ipha_ttl > 1) { 19499 ip2dbg(("ip_wput: forcing multicast " 19500 "multirt TTL to 1 (was %d), dst 0x%08x\n", 19501 ipha->ipha_ttl, ntohl(ire->ire_addr))); 19502 ipha->ipha_ttl = 1; 19503 } 19504 } 19505 } else { 19506 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19507 if ((ire != NULL) && (ire->ire_type & 19508 (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { 19509 ignore_dontroute = B_TRUE; 19510 ignore_nexthop = B_TRUE; 19511 } 19512 if (ire != NULL) { 19513 ire_refrele(ire); 19514 ire = NULL; 19515 } 19516 /* 19517 * Guard against coming in from arp in which case conn is NULL. 19518 * Also guard against non M_DATA with dontroute set but 19519 * destined to local, loopback or broadcast addresses. 
19520 */ 19521 if (connp != NULL && connp->conn_dontroute && 19522 !ignore_dontroute) { 19523 dontroute: 19524 /* 19525 * Set TTL to 1 if SO_DONTROUTE is set to prevent 19526 * routing protocols from seeing false direct 19527 * connectivity. 19528 */ 19529 ipha->ipha_ttl = 1; 19530 /* 19531 * If IP_XMIT_IF is also set (conn_xmit_if_ill != NULL) 19532 * along with SO_DONTROUTE, higher precedence is 19533 * given to IP_XMIT_IF and the IP_XMIT_IF ipif is used. 19534 */ 19535 if (connp->conn_xmit_if_ill == NULL) { 19536 /* If suitable ipif not found, drop packet */ 19537 dst_ipif = ipif_lookup_onlink_addr(dst, zoneid); 19538 if (dst_ipif == NULL) { 19539 ip1dbg(("ip_wput: no route for " 19540 "dst using SO_DONTROUTE\n")); 19541 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19542 mp->b_prev = mp->b_next = NULL; 19543 if (first_mp == NULL) 19544 first_mp = mp; 19545 goto drop_pkt; 19546 } else { 19547 /* 19548 * If suitable ipif has been found, set 19549 * xmit_ill to the corresponding 19550 * ipif_ill because we'll be following 19551 * the IP_XMIT_IF logic. 19552 */ 19553 ASSERT(xmit_ill == NULL); 19554 xmit_ill = dst_ipif->ipif_ill; 19555 mutex_enter(&xmit_ill->ill_lock); 19556 if (!ILL_CAN_LOOKUP(xmit_ill)) { 19557 mutex_exit(&xmit_ill->ill_lock); 19558 xmit_ill = NULL; 19559 ipif_refrele(dst_ipif); 19560 ip1dbg(("ip_wput: no route for" 19561 " dst using" 19562 " SO_DONTROUTE\n")); 19563 BUMP_MIB(&ip_mib, 19564 ipOutNoRoutes); 19565 mp->b_prev = mp->b_next = NULL; 19566 if (first_mp == NULL) 19567 first_mp = mp; 19568 goto drop_pkt; 19569 } 19570 ill_refhold_locked(xmit_ill); 19571 mutex_exit(&xmit_ill->ill_lock); 19572 ipif_refrele(dst_ipif); 19573 } 19574 } 19575 19576 } 19577 /* 19578 * If we are bound to IPIF_NOFAILOVER address, look for 19579 * an IRE_CACHE matching the ill. 
19580 */ 19581 send_from_ill: 19582 if (attach_ill != NULL) { 19583 ipif_t *attach_ipif; 19584 19585 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 19586 19587 /* 19588 * Check if we need an ire that will not be 19589 * looked up by anybody else i.e. HIDDEN. 19590 */ 19591 if (ill_is_probeonly(attach_ill)) { 19592 match_flags |= MATCH_IRE_MARK_HIDDEN; 19593 } 19594 19595 attach_ipif = ipif_get_next_ipif(NULL, attach_ill); 19596 if (attach_ipif == NULL) { 19597 ip1dbg(("ip_wput: No ipif for attach_ill\n")); 19598 goto drop_pkt; 19599 } 19600 ire = ire_ctable_lookup(dst, 0, 0, attach_ipif, 19601 zoneid, MBLK_GETLABEL(mp), match_flags); 19602 ipif_refrele(attach_ipif); 19603 } else if (xmit_ill != NULL || (connp != NULL && 19604 connp->conn_xmit_if_ill != NULL)) { 19605 /* 19606 * Mark this packet as originated locally 19607 */ 19608 mp->b_prev = mp->b_next = NULL; 19609 /* 19610 * xmit_ill could be NULL if SO_DONTROUTE 19611 * is also set. 19612 */ 19613 if (xmit_ill == NULL) { 19614 xmit_ill = conn_get_held_ill(connp, 19615 &connp->conn_xmit_if_ill, &err); 19616 if (err == ILL_LOOKUP_FAILED) { 19617 if (need_decref) 19618 CONN_DEC_REF(connp); 19619 freemsg(first_mp); 19620 return; 19621 } 19622 if (xmit_ill == NULL) { 19623 if (connp->conn_dontroute) 19624 goto dontroute; 19625 goto send_from_ill; 19626 } 19627 } 19628 /* 19629 * could be SO_DONTROUTE case also. 
19630 * check at least one interface is UP as 19631 * spcified by this ILL, and then call 19632 * ip_newroute_ipif() 19633 */ 19634 if (xmit_ill->ill_ipif_up_count > 0) { 19635 ipif_t *ipif; 19636 19637 ipif = ipif_get_next_ipif(NULL, xmit_ill); 19638 if (ipif != NULL) { 19639 ip_newroute_ipif(q, first_mp, ipif, 19640 dst, connp, 0); 19641 ipif_refrele(ipif); 19642 ip1dbg(("ip_wput: ip_unicast_if\n")); 19643 } 19644 } else { 19645 freemsg(first_mp); 19646 } 19647 ill_refrele(xmit_ill); 19648 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19649 "ip_wput_end: q %p (%S)", q, "unicast_if"); 19650 if (need_decref) 19651 CONN_DEC_REF(connp); 19652 return; 19653 } else if (ip_nexthop || (connp != NULL && 19654 (connp->conn_nexthop_set)) && !ignore_nexthop) { 19655 if (!ip_nexthop) { 19656 ip_nexthop = B_TRUE; 19657 nexthop_addr = connp->conn_nexthop_v4; 19658 } 19659 match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | 19660 MATCH_IRE_GW; 19661 ire = ire_ctable_lookup(dst, nexthop_addr, 0, 19662 NULL, zoneid, MBLK_GETLABEL(mp), match_flags); 19663 } else { 19664 ire = ire_cache_lookup(dst, zoneid, MBLK_GETLABEL(mp)); 19665 } 19666 if (!ire) { 19667 /* 19668 * Make sure we don't load spread if this 19669 * is IPIF_NOFAILOVER case. 
19670 */ 19671 if ((attach_ill != NULL) || 19672 (ip_nexthop && !ignore_nexthop)) { 19673 if (mctl_present) { 19674 io = (ipsec_out_t *)first_mp->b_rptr; 19675 ASSERT(first_mp->b_datap->db_type == 19676 M_CTL); 19677 ASSERT(io->ipsec_out_type == IPSEC_OUT); 19678 } else { 19679 ASSERT(mp == first_mp); 19680 first_mp = allocb( 19681 sizeof (ipsec_info_t), BPRI_HI); 19682 if (first_mp == NULL) { 19683 first_mp = mp; 19684 goto drop_pkt; 19685 } 19686 first_mp->b_datap->db_type = M_CTL; 19687 first_mp->b_wptr += 19688 sizeof (ipsec_info_t); 19689 /* ipsec_out_secure is B_FALSE now */ 19690 bzero(first_mp->b_rptr, 19691 sizeof (ipsec_info_t)); 19692 io = (ipsec_out_t *)first_mp->b_rptr; 19693 io->ipsec_out_type = IPSEC_OUT; 19694 io->ipsec_out_len = 19695 sizeof (ipsec_out_t); 19696 io->ipsec_out_use_global_policy = 19697 B_TRUE; 19698 first_mp->b_cont = mp; 19699 mctl_present = B_TRUE; 19700 } 19701 if (attach_ill != NULL) { 19702 io->ipsec_out_ill_index = attach_ill-> 19703 ill_phyint->phyint_ifindex; 19704 io->ipsec_out_attach_if = B_TRUE; 19705 } else { 19706 io->ipsec_out_ip_nexthop = ip_nexthop; 19707 io->ipsec_out_nexthop_addr = 19708 nexthop_addr; 19709 } 19710 } 19711 noirefound: 19712 /* 19713 * Mark this packet as having originated on 19714 * this machine. This will be noted in 19715 * ire_add_then_send, which needs to know 19716 * whether to run it back through ip_wput or 19717 * ip_rput following successful resolution. 19718 */ 19719 mp->b_prev = NULL; 19720 mp->b_next = NULL; 19721 ip_newroute(q, first_mp, dst, NULL, connp); 19722 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19723 "ip_wput_end: q %p (%S)", q, "newroute"); 19724 if (attach_ill != NULL) 19725 ill_refrele(attach_ill); 19726 if (xmit_ill != NULL) 19727 ill_refrele(xmit_ill); 19728 if (need_decref) 19729 CONN_DEC_REF(connp); 19730 return; 19731 } 19732 } 19733 19734 /* We now know where we are going with it. 
*/ 19735 19736 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19737 "ip_wput_end: q %p (%S)", q, "end"); 19738 19739 /* 19740 * Check if the ire has the RTF_MULTIRT flag, inherited 19741 * from an IRE_OFFSUBNET ire entry in ip_newroute. 19742 */ 19743 if (ire->ire_flags & RTF_MULTIRT) { 19744 /* 19745 * Force the TTL of multirouted packets if required. 19746 * The TTL of such packets is bounded by the 19747 * ip_multirt_ttl ndd variable. 19748 */ 19749 if ((ip_multirt_ttl > 0) && 19750 (ipha->ipha_ttl > ip_multirt_ttl)) { 19751 ip2dbg(("ip_wput: forcing multirt TTL to %d " 19752 "(was %d), dst 0x%08x\n", 19753 ip_multirt_ttl, ipha->ipha_ttl, 19754 ntohl(ire->ire_addr))); 19755 ipha->ipha_ttl = ip_multirt_ttl; 19756 } 19757 /* 19758 * At this point, we check to see if there are any pending 19759 * unresolved routes. ire_multirt_resolvable() 19760 * checks in O(n) that all IRE_OFFSUBNET ire 19761 * entries for the packet's destination and 19762 * flagged RTF_MULTIRT are currently resolved. 19763 * If some remain unresolved, we make a copy 19764 * of the current message. It will be used 19765 * to initiate additional route resolutions. 19766 */ 19767 multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, 19768 MBLK_GETLABEL(first_mp)); 19769 ip2dbg(("ip_wput[noirefound]: ire %p, " 19770 "multirt_need_resolve %d, first_mp %p\n", 19771 (void *)ire, multirt_need_resolve, (void *)first_mp)); 19772 if (multirt_need_resolve) { 19773 copy_mp = copymsg(first_mp); 19774 if (copy_mp != NULL) { 19775 MULTIRT_DEBUG_TAG(copy_mp); 19776 } 19777 } 19778 } 19779 19780 ip_wput_ire(q, first_mp, ire, connp, caller); 19781 /* 19782 * Try to resolve another multiroute if 19783 * ire_multirt_resolvable() deemed it necessary. 19784 * At this point, we need to distinguish 19785 * multicasts from other packets. For multicasts, 19786 * we call ip_newroute_ipif() and request that both 19787 * multirouting and setsrc flags are checked. 
19788 */ 19789 if (copy_mp != NULL) { 19790 if (CLASSD(dst)) { 19791 ipif_t *ipif = ipif_lookup_group(dst, zoneid); 19792 if (ipif) { 19793 ip_newroute_ipif(q, copy_mp, ipif, dst, connp, 19794 RTF_SETSRC | RTF_MULTIRT); 19795 ipif_refrele(ipif); 19796 } else { 19797 MULTIRT_DEBUG_UNTAG(copy_mp); 19798 freemsg(copy_mp); 19799 copy_mp = NULL; 19800 } 19801 } else { 19802 ip_newroute(q, copy_mp, dst, NULL, connp); 19803 } 19804 } 19805 if (attach_ill != NULL) 19806 ill_refrele(attach_ill); 19807 if (xmit_ill != NULL) 19808 ill_refrele(xmit_ill); 19809 if (need_decref) 19810 CONN_DEC_REF(connp); 19811 return; 19812 19813 icmp_parameter_problem: 19814 /* could not have originated externally */ 19815 ASSERT(mp->b_prev == NULL); 19816 if (ip_hdr_complete(ipha, zoneid) == 0) { 19817 BUMP_MIB(&ip_mib, ipOutNoRoutes); 19818 /* it's the IP header length that's in trouble */ 19819 icmp_param_problem(q, first_mp, 0); 19820 first_mp = NULL; 19821 } 19822 19823 drop_pkt: 19824 ip1dbg(("ip_wput: dropped packet\n")); 19825 if (ire != NULL) 19826 ire_refrele(ire); 19827 if (need_decref) 19828 CONN_DEC_REF(connp); 19829 freemsg(first_mp); 19830 if (attach_ill != NULL) 19831 ill_refrele(attach_ill); 19832 if (xmit_ill != NULL) 19833 ill_refrele(xmit_ill); 19834 TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, 19835 "ip_wput_end: q %p (%S)", q, "droppkt"); 19836 } 19837 19838 void 19839 ip_wput(queue_t *q, mblk_t *mp) 19840 { 19841 ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); 19842 } 19843 19844 /* 19845 * 19846 * The following rules must be observed when accessing any ipif or ill 19847 * that has been cached in the conn. Typically conn_nofailover_ill, 19848 * conn_xmit_if_ill, conn_multicast_ipif and conn_multicast_ill. 19849 * 19850 * Access: The ipif or ill pointed to from the conn can be accessed under 19851 * the protection of the conn_lock or after it has been refheld under the 19852 * protection of the conn lock. 
In addition the IPIF_CAN_LOOKUP or
 * ILL_CAN_LOOKUP macros must be used before actually doing the refhold.
 * The reason for this is that a concurrent unplumb could actually be
 * cleaning up these cached pointers by walking the conns and might have
 * finished cleaning up the conn in question. The macros check that an
 * unplumb has not yet started on the ipif or ill.
 *
 * Caching: An ipif or ill pointer may be cached in the conn only after
 * making sure that an unplumb has not started. So the caching is done
 * while holding both the conn_lock and the ill_lock and after using the
 * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED
 * flag before starting the cleanup of conns.
 *
 * The list of ipifs hanging off the ill is protected by ill_g_lock and
 * ill_lock. On the other hand, to access ipif->ipif_ill, we need one of
 * either ill_g_lock or a reference to the ipif or a reference to an ire
 * that references the ipif. An ipif does not change its ill except for
 * failover/failback. Since failover/failback happens only after bringing
 * down the ipif, making sure the ipif refcnt has gone to zero, and holding
 * the ill_g_lock and ill_lock, the above holds.
19872 */ 19873 ipif_t * 19874 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) 19875 { 19876 ipif_t *ipif; 19877 ill_t *ill; 19878 19879 *err = 0; 19880 rw_enter(&ill_g_lock, RW_READER); 19881 mutex_enter(&connp->conn_lock); 19882 ipif = *ipifp; 19883 if (ipif != NULL) { 19884 ill = ipif->ipif_ill; 19885 mutex_enter(&ill->ill_lock); 19886 if (IPIF_CAN_LOOKUP(ipif)) { 19887 ipif_refhold_locked(ipif); 19888 mutex_exit(&ill->ill_lock); 19889 mutex_exit(&connp->conn_lock); 19890 rw_exit(&ill_g_lock); 19891 return (ipif); 19892 } else { 19893 *err = IPIF_LOOKUP_FAILED; 19894 } 19895 mutex_exit(&ill->ill_lock); 19896 } 19897 mutex_exit(&connp->conn_lock); 19898 rw_exit(&ill_g_lock); 19899 return (NULL); 19900 } 19901 19902 ill_t * 19903 conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) 19904 { 19905 ill_t *ill; 19906 19907 *err = 0; 19908 mutex_enter(&connp->conn_lock); 19909 ill = *illp; 19910 if (ill != NULL) { 19911 mutex_enter(&ill->ill_lock); 19912 if (ILL_CAN_LOOKUP(ill)) { 19913 ill_refhold_locked(ill); 19914 mutex_exit(&ill->ill_lock); 19915 mutex_exit(&connp->conn_lock); 19916 return (ill); 19917 } else { 19918 *err = ILL_LOOKUP_FAILED; 19919 } 19920 mutex_exit(&ill->ill_lock); 19921 } 19922 mutex_exit(&connp->conn_lock); 19923 return (NULL); 19924 } 19925 19926 static int 19927 conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) 19928 { 19929 ill_t *ill; 19930 19931 ill = ipif->ipif_ill; 19932 mutex_enter(&connp->conn_lock); 19933 mutex_enter(&ill->ill_lock); 19934 if (IPIF_CAN_LOOKUP(ipif)) { 19935 *ipifp = ipif; 19936 mutex_exit(&ill->ill_lock); 19937 mutex_exit(&connp->conn_lock); 19938 return (0); 19939 } 19940 mutex_exit(&ill->ill_lock); 19941 mutex_exit(&connp->conn_lock); 19942 return (IPIF_LOOKUP_FAILED); 19943 } 19944 19945 /* 19946 * This is called if the outbound datagram needs fragmentation. 19947 * 19948 * NOTE : This function does not ire_refrele the ire argument passed in. 
19949 */ 19950 static void 19951 ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire) 19952 { 19953 ipha_t *ipha; 19954 mblk_t *mp; 19955 uint32_t v_hlen_tos_len; 19956 uint32_t max_frag; 19957 uint32_t frag_flag; 19958 boolean_t dont_use; 19959 19960 if (ipsec_mp->b_datap->db_type == M_CTL) { 19961 mp = ipsec_mp->b_cont; 19962 } else { 19963 mp = ipsec_mp; 19964 } 19965 19966 ipha = (ipha_t *)mp->b_rptr; 19967 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 19968 19969 #ifdef _BIG_ENDIAN 19970 #define V_HLEN (v_hlen_tos_len >> 24) 19971 #define LENGTH (v_hlen_tos_len & 0xFFFF) 19972 #else 19973 #define V_HLEN (v_hlen_tos_len & 0xFF) 19974 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 19975 #endif 19976 19977 #ifndef SPEED_BEFORE_SAFETY 19978 /* 19979 * Check that ipha_length is consistent with 19980 * the mblk length 19981 */ 19982 if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { 19983 ip0dbg(("Packet length mismatch: %d, %ld\n", 19984 LENGTH, msgdsize(mp))); 19985 freemsg(ipsec_mp); 19986 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 19987 "ip_wput_ire_fragmentit: mp %p (%S)", mp, 19988 "packet length mismatch"); 19989 return; 19990 } 19991 #endif 19992 /* 19993 * Don't use frag_flag if pre-built packet or source 19994 * routed or if multicast (since multicast packets do not solicit 19995 * ICMP "packet too big" messages). Get the values of 19996 * max_frag and frag_flag atomically by acquiring the 19997 * ire_lock. 19998 */ 19999 mutex_enter(&ire->ire_lock); 20000 max_frag = ire->ire_max_frag; 20001 frag_flag = ire->ire_frag_flag; 20002 mutex_exit(&ire->ire_lock); 20003 20004 dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || 20005 (V_HLEN != IP_SIMPLE_HDR_VERSION && 20006 ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); 20007 20008 ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, 20009 (dont_use ? 0 : frag_flag)); 20010 } 20011 20012 /* 20013 * Used for deciding the MSS size for the upper layer. 
Thus 20014 * we need to check the outbound policy values in the conn. 20015 */ 20016 int 20017 conn_ipsec_length(conn_t *connp) 20018 { 20019 ipsec_latch_t *ipl; 20020 20021 ipl = connp->conn_latch; 20022 if (ipl == NULL) 20023 return (0); 20024 20025 if (ipl->ipl_out_policy == NULL) 20026 return (0); 20027 20028 return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); 20029 } 20030 20031 /* 20032 * Returns an estimate of the IPSEC headers size. This is used if 20033 * we don't want to call into IPSEC to get the exact size. 20034 */ 20035 int 20036 ipsec_out_extra_length(mblk_t *ipsec_mp) 20037 { 20038 ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; 20039 ipsec_action_t *a; 20040 20041 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20042 if (!io->ipsec_out_secure) 20043 return (0); 20044 20045 a = io->ipsec_out_act; 20046 20047 if (a == NULL) { 20048 ASSERT(io->ipsec_out_policy != NULL); 20049 a = io->ipsec_out_policy->ipsp_act; 20050 } 20051 ASSERT(a != NULL); 20052 20053 return (a->ipa_ovhd); 20054 } 20055 20056 /* 20057 * Returns an estimate of the IPSEC headers size. This is used if 20058 * we don't want to call into IPSEC to get the exact size. 20059 */ 20060 int 20061 ipsec_in_extra_length(mblk_t *ipsec_mp) 20062 { 20063 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 20064 ipsec_action_t *a; 20065 20066 ASSERT(ii->ipsec_in_type == IPSEC_IN); 20067 20068 a = ii->ipsec_in_action; 20069 return (a == NULL ? 0 : a->ipa_ovhd); 20070 } 20071 20072 /* 20073 * If there are any source route options, return the true final 20074 * destination. Otherwise, return the destination. 
20075 */ 20076 ipaddr_t 20077 ip_get_dst(ipha_t *ipha) 20078 { 20079 ipoptp_t opts; 20080 uchar_t *opt; 20081 uint8_t optval; 20082 uint8_t optlen; 20083 ipaddr_t dst; 20084 uint32_t off; 20085 20086 dst = ipha->ipha_dst; 20087 20088 if (IS_SIMPLE_IPH(ipha)) 20089 return (dst); 20090 20091 for (optval = ipoptp_first(&opts, ipha); 20092 optval != IPOPT_EOL; 20093 optval = ipoptp_next(&opts)) { 20094 opt = opts.ipoptp_cur; 20095 optlen = opts.ipoptp_len; 20096 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 20097 switch (optval) { 20098 case IPOPT_SSRR: 20099 case IPOPT_LSRR: 20100 off = opt[IPOPT_OFFSET]; 20101 /* 20102 * If one of the conditions is true, it means 20103 * end of options and dst already has the right 20104 * value. 20105 */ 20106 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) { 20107 off = optlen - IP_ADDR_LEN; 20108 bcopy(&opt[off], &dst, IP_ADDR_LEN); 20109 } 20110 return (dst); 20111 default: 20112 break; 20113 } 20114 } 20115 20116 return (dst); 20117 } 20118 20119 mblk_t * 20120 ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, 20121 conn_t *connp, boolean_t unspec_src) 20122 { 20123 ipsec_out_t *io; 20124 mblk_t *first_mp; 20125 boolean_t policy_present; 20126 20127 first_mp = mp; 20128 if (mp->b_datap->db_type == M_CTL) { 20129 io = (ipsec_out_t *)first_mp->b_rptr; 20130 /* 20131 * ip_wput[_v6] attaches an IPSEC_OUT in two cases. 20132 * 20133 * 1) There is per-socket policy (including cached global 20134 * policy). 20135 * 2) There is no per-socket policy, but it is 20136 * a multicast packet that needs to go out 20137 * on a specific interface. This is the case 20138 * where (ip_wput and ip_wput_multicast) attaches 20139 * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. 20140 * 20141 * In case (2) we check with global policy to 20142 * see if there is a match and set the ill_index 20143 * appropriately so that we can lookup the ire 20144 * properly in ip_wput_ipsec_out. 
20145 */ 20146 20147 /* 20148 * ipsec_out_use_global_policy is set to B_FALSE 20149 * in ipsec_in_to_out(). Refer to that function for 20150 * details. 20151 */ 20152 if ((io->ipsec_out_latch == NULL) && 20153 (io->ipsec_out_use_global_policy)) { 20154 return (ip_wput_attach_policy(first_mp, ipha, ip6h, 20155 ire, connp, unspec_src)); 20156 } 20157 if (!io->ipsec_out_secure) { 20158 /* 20159 * If this is not a secure packet, drop 20160 * the IPSEC_OUT mp and treat it as a clear 20161 * packet. This happens when we are sending 20162 * a ICMP reply back to a clear packet. See 20163 * ipsec_in_to_out() for details. 20164 */ 20165 mp = first_mp->b_cont; 20166 freeb(first_mp); 20167 } 20168 return (mp); 20169 } 20170 /* 20171 * See whether we need to attach a global policy here. We 20172 * don't depend on the conn (as it could be null) for deciding 20173 * what policy this datagram should go through because it 20174 * should have happened in ip_wput if there was some 20175 * policy. This normally happens for connections which are not 20176 * fully bound preventing us from caching policies in 20177 * ip_bind. Packets coming from the TCP listener/global queue 20178 * - which are non-hard_bound - could also be affected by 20179 * applying policy here. 20180 * 20181 * If this packet is coming from tcp global queue or listener, 20182 * we will be applying policy here. This may not be *right* 20183 * if these packets are coming from the detached connection as 20184 * it could have gone in clear before. This happens only if a 20185 * TCP connection started when there is no policy and somebody 20186 * added policy before it became detached. Thus packets of the 20187 * detached connection could go out secure and the other end 20188 * would drop it because it will be expecting in clear. 
The 20189 * converse is not true i.e if somebody starts a TCP 20190 * connection and deletes the policy, all the packets will 20191 * still go out with the policy that existed before deleting 20192 * because ip_unbind sends up policy information which is used 20193 * by TCP on subsequent ip_wputs. The right solution is to fix 20194 * TCP to attach a dummy IPSEC_OUT and set 20195 * ipsec_out_use_global_policy to B_FALSE. As this might 20196 * affect performance for normal cases, we are not doing it. 20197 * Thus, set policy before starting any TCP connections. 20198 * 20199 * NOTE - We might apply policy even for a hard bound connection 20200 * - for which we cached policy in ip_bind - if somebody added 20201 * global policy after we inherited the policy in ip_bind. 20202 * This means that the packets that were going out in clear 20203 * previously would start going secure and hence get dropped 20204 * on the other side. To fix this, TCP attaches a dummy 20205 * ipsec_out and make sure that we don't apply global policy. 
20206 */ 20207 if (ipha != NULL) 20208 policy_present = ipsec_outbound_v4_policy_present; 20209 else 20210 policy_present = ipsec_outbound_v6_policy_present; 20211 if (!policy_present) 20212 return (mp); 20213 20214 return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src)); 20215 } 20216 20217 ire_t * 20218 conn_set_outgoing_ill(conn_t *connp, ire_t *ire, ill_t **conn_outgoing_ill) 20219 { 20220 ipaddr_t addr; 20221 ire_t *save_ire; 20222 irb_t *irb; 20223 ill_group_t *illgrp; 20224 int err; 20225 20226 save_ire = ire; 20227 addr = ire->ire_addr; 20228 20229 ASSERT(ire->ire_type == IRE_BROADCAST); 20230 20231 illgrp = connp->conn_outgoing_ill->ill_group; 20232 if (illgrp == NULL) { 20233 *conn_outgoing_ill = conn_get_held_ill(connp, 20234 &connp->conn_outgoing_ill, &err); 20235 if (err == ILL_LOOKUP_FAILED) { 20236 ire_refrele(save_ire); 20237 return (NULL); 20238 } 20239 return (save_ire); 20240 } 20241 /* 20242 * If IP_BOUND_IF has been done, conn_outgoing_ill will be set. 20243 * If it is part of the group, we need to send on the ire 20244 * that has been cleared of IRE_MARK_NORECV and that belongs 20245 * to this group. This is okay as IP_BOUND_IF really means 20246 * any ill in the group. We depend on the fact that the 20247 * first ire in the group is always cleared of IRE_MARK_NORECV 20248 * if such an ire exists. This is possible only if you have 20249 * at least one ill in the group that has not failed. 20250 * 20251 * First get to the ire that matches the address and group. 20252 * 20253 * We don't look for an ire with a matching zoneid because a given zone 20254 * won't always have broadcast ires on all ills in the group. 
20255 */ 20256 irb = ire->ire_bucket; 20257 rw_enter(&irb->irb_lock, RW_READER); 20258 if (ire->ire_marks & IRE_MARK_NORECV) { 20259 /* 20260 * If the current zone only has an ire broadcast for this 20261 * address marked NORECV, the ire we want is ahead in the 20262 * bucket, so we look it up deliberately ignoring the zoneid. 20263 */ 20264 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 20265 if (ire->ire_addr != addr) 20266 continue; 20267 /* skip over deleted ires */ 20268 if (ire->ire_marks & IRE_MARK_CONDEMNED) 20269 continue; 20270 } 20271 } 20272 while (ire != NULL) { 20273 /* 20274 * If a new interface is coming up, we could end up 20275 * seeing the loopback ire and the non-loopback ire 20276 * may not have been added yet. So check for ire_stq 20277 */ 20278 if (ire->ire_stq != NULL && (ire->ire_addr != addr || 20279 ire->ire_ipif->ipif_ill->ill_group == illgrp)) { 20280 break; 20281 } 20282 ire = ire->ire_next; 20283 } 20284 if (ire != NULL && ire->ire_addr == addr && 20285 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 20286 IRE_REFHOLD(ire); 20287 rw_exit(&irb->irb_lock); 20288 ire_refrele(save_ire); 20289 *conn_outgoing_ill = ire_to_ill(ire); 20290 /* 20291 * Refhold the ill to make the conn_outgoing_ill 20292 * independent of the ire. ip_wput_ire goes in a loop 20293 * and may refrele the ire. Since we have an ire at this 20294 * point we don't need to use ILL_CAN_LOOKUP on the ill. 20295 */ 20296 ill_refhold(*conn_outgoing_ill); 20297 return (ire); 20298 } 20299 rw_exit(&irb->irb_lock); 20300 ip1dbg(("conn_set_outgoing_ill: No matching ire\n")); 20301 /* 20302 * If we can't find a suitable ire, return the original ire. 20303 */ 20304 return (save_ire); 20305 } 20306 20307 /* 20308 * This function does the ire_refrele of the ire passed in as the 20309 * argument. As this function looks up more ires i.e broadcast ires, 20310 * it needs to REFRELE them. 
Currently, for simplicity we don't 20311 * differentiate the one passed in and looked up here. We always 20312 * REFRELE. 20313 * IPQoS Notes: 20314 * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for 20315 * IPSec packets are done in ipsec_out_process. 20316 * 20317 */ 20318 void 20319 ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller) 20320 { 20321 ipha_t *ipha; 20322 #define rptr ((uchar_t *)ipha) 20323 mblk_t *mp1; 20324 queue_t *stq; 20325 #define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) 20326 uint32_t v_hlen_tos_len; 20327 uint32_t ttl_protocol; 20328 ipaddr_t src; 20329 ipaddr_t dst; 20330 uint32_t cksum; 20331 ipaddr_t orig_src; 20332 ire_t *ire1; 20333 mblk_t *next_mp; 20334 uint_t hlen; 20335 uint16_t *up; 20336 uint32_t max_frag = ire->ire_max_frag; 20337 ill_t *ill = ire_to_ill(ire); 20338 int clusterwide; 20339 uint16_t ip_hdr_included; /* IP header included by ULP? */ 20340 int ipsec_len; 20341 mblk_t *first_mp; 20342 ipsec_out_t *io; 20343 boolean_t conn_dontroute; /* conn value for multicast */ 20344 boolean_t conn_multicast_loop; /* conn value for multicast */ 20345 boolean_t multicast_forward; /* Should we forward ? */ 20346 boolean_t unspec_src; 20347 ill_t *conn_outgoing_ill = NULL; 20348 ill_t *ire_ill; 20349 ill_t *ire1_ill; 20350 uint32_t ill_index = 0; 20351 boolean_t multirt_send = B_FALSE; 20352 int err; 20353 zoneid_t zoneid; 20354 20355 TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, 20356 "ip_wput_ire_start: q %p", q); 20357 20358 multicast_forward = B_FALSE; 20359 unspec_src = (connp != NULL && connp->conn_unspec_src); 20360 20361 if (ire->ire_flags & RTF_MULTIRT) { 20362 /* 20363 * Multirouting case. The bucket where ire is stored 20364 * probably holds other RTF_MULTIRT flagged ire 20365 * to the destination. In this call to ip_wput_ire, 20366 * we attempt to send the packet through all 20367 * those ires. 
Thus, we first ensure that ire is the 20368 * first RTF_MULTIRT ire in the bucket, 20369 * before walking the ire list. 20370 */ 20371 ire_t *first_ire; 20372 irb_t *irb = ire->ire_bucket; 20373 ASSERT(irb != NULL); 20374 20375 /* Make sure we do not omit any multiroute ire. */ 20376 IRB_REFHOLD(irb); 20377 for (first_ire = irb->irb_ire; 20378 first_ire != NULL; 20379 first_ire = first_ire->ire_next) { 20380 if ((first_ire->ire_flags & RTF_MULTIRT) && 20381 (first_ire->ire_addr == ire->ire_addr) && 20382 !(first_ire->ire_marks & 20383 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 20384 break; 20385 } 20386 20387 if ((first_ire != NULL) && (first_ire != ire)) { 20388 IRE_REFHOLD(first_ire); 20389 ire_refrele(ire); 20390 ire = first_ire; 20391 ill = ire_to_ill(ire); 20392 } 20393 IRB_REFRELE(irb); 20394 } 20395 20396 /* 20397 * conn_outgoing_ill is used only in the broadcast loop. 20398 * for performance we don't grab the mutexs in the fastpath 20399 */ 20400 if ((connp != NULL) && 20401 (connp->conn_xmit_if_ill == NULL) && 20402 (ire->ire_type == IRE_BROADCAST) && 20403 ((connp->conn_nofailover_ill != NULL) || 20404 (connp->conn_outgoing_ill != NULL))) { 20405 /* 20406 * Bind to IPIF_NOFAILOVER address overrides IP_BOUND_IF 20407 * option. So, see if this endpoint is bound to a 20408 * IPIF_NOFAILOVER address. If so, honor it. This implies 20409 * that if the interface is failed, we will still send 20410 * the packet on the same ill which is what we want. 20411 */ 20412 conn_outgoing_ill = conn_get_held_ill(connp, 20413 &connp->conn_nofailover_ill, &err); 20414 if (err == ILL_LOOKUP_FAILED) { 20415 ire_refrele(ire); 20416 freemsg(mp); 20417 return; 20418 } 20419 if (conn_outgoing_ill == NULL) { 20420 /* 20421 * Choose a good ill in the group to send the 20422 * packets on. 
20423 */ 20424 ire = conn_set_outgoing_ill(connp, ire, 20425 &conn_outgoing_ill); 20426 if (ire == NULL) { 20427 freemsg(mp); 20428 return; 20429 } 20430 } 20431 } 20432 20433 if (mp->b_datap->db_type != M_CTL) { 20434 ipha = (ipha_t *)mp->b_rptr; 20435 zoneid = (connp != NULL ? connp->conn_zoneid : ALL_ZONES); 20436 } else { 20437 io = (ipsec_out_t *)mp->b_rptr; 20438 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20439 zoneid = io->ipsec_out_zoneid; 20440 ASSERT(zoneid != ALL_ZONES); 20441 ipha = (ipha_t *)mp->b_cont->b_rptr; 20442 dst = ipha->ipha_dst; 20443 /* 20444 * For the multicast case, ipsec_out carries conn_dontroute and 20445 * conn_multicast_loop as conn may not be available here. We 20446 * need this for multicast loopback and forwarding which is done 20447 * later in the code. 20448 */ 20449 if (CLASSD(dst)) { 20450 conn_dontroute = io->ipsec_out_dontroute; 20451 conn_multicast_loop = io->ipsec_out_multicast_loop; 20452 /* 20453 * If conn_dontroute is not set or conn_multicast_loop 20454 * is set, we need to do forwarding/loopback. For 20455 * datagrams from ip_wput_multicast, conn_dontroute is 20456 * set to B_TRUE and conn_multicast_loop is set to 20457 * B_FALSE so that we neither do forwarding nor 20458 * loopback. 20459 */ 20460 if (!conn_dontroute || conn_multicast_loop) 20461 multicast_forward = B_TRUE; 20462 } 20463 } 20464 20465 if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && 20466 ire->ire_zoneid != ALL_ZONES) { 20467 /* 20468 * When a zone sends a packet to another zone, we try to deliver 20469 * the packet under the same conditions as if the destination 20470 * was a real node on the network. To do so, we look for a 20471 * matching route in the forwarding table. 20472 * RTF_REJECT and RTF_BLACKHOLE are handled just like 20473 * ip_newroute() does. 
20474 */ 20475 ire_t *src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, 20476 NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | 20477 MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE)); 20478 if (src_ire != NULL && 20479 !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) { 20480 if (ipha->ipha_src == INADDR_ANY && !unspec_src) 20481 ipha->ipha_src = src_ire->ire_src_addr; 20482 ire_refrele(src_ire); 20483 } else { 20484 ire_refrele(ire); 20485 if (conn_outgoing_ill != NULL) 20486 ill_refrele(conn_outgoing_ill); 20487 BUMP_MIB(&ip_mib, ipOutNoRoutes); 20488 if (src_ire != NULL) { 20489 if (src_ire->ire_flags & RTF_BLACKHOLE) { 20490 ire_refrele(src_ire); 20491 freemsg(mp); 20492 return; 20493 } 20494 ire_refrele(src_ire); 20495 } 20496 if (ip_hdr_complete(ipha, zoneid)) { 20497 /* Failed */ 20498 freemsg(mp); 20499 return; 20500 } 20501 icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE); 20502 return; 20503 } 20504 } 20505 20506 if (mp->b_datap->db_type == M_CTL || 20507 ipsec_outbound_v4_policy_present) { 20508 mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, 20509 unspec_src); 20510 if (mp == NULL) { 20511 ire_refrele(ire); 20512 if (conn_outgoing_ill != NULL) 20513 ill_refrele(conn_outgoing_ill); 20514 return; 20515 } 20516 } 20517 20518 first_mp = mp; 20519 ipsec_len = 0; 20520 20521 if (first_mp->b_datap->db_type == M_CTL) { 20522 io = (ipsec_out_t *)first_mp->b_rptr; 20523 ASSERT(io->ipsec_out_type == IPSEC_OUT); 20524 mp = first_mp->b_cont; 20525 ipsec_len = ipsec_out_extra_length(first_mp); 20526 ASSERT(ipsec_len >= 0); 20527 zoneid = io->ipsec_out_zoneid; 20528 ASSERT(zoneid != ALL_ZONES); 20529 20530 /* 20531 * Drop M_CTL here if IPsec processing is not needed. 20532 * (Non-IPsec use of M_CTL extracted any information it 20533 * needed above). 
20534 */ 20535 if (ipsec_len == 0) { 20536 freeb(first_mp); 20537 first_mp = mp; 20538 } 20539 } 20540 20541 /* 20542 * Fast path for ip_wput_ire 20543 */ 20544 20545 ipha = (ipha_t *)mp->b_rptr; 20546 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 20547 dst = ipha->ipha_dst; 20548 20549 /* 20550 * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED 20551 * if the socket is a SOCK_RAW type. The transport checksum should 20552 * be provided in the pre-built packet, so we don't need to compute it. 20553 * Also, other application set flags, like DF, should not be altered. 20554 * Other transport MUST pass down zero. 20555 */ 20556 ip_hdr_included = ipha->ipha_ident; 20557 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 20558 20559 if (CLASSD(dst)) { 20560 ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", 20561 ntohl(dst), 20562 ip_nv_lookup(ire_nv_tbl, ire->ire_type), 20563 ntohl(ire->ire_addr))); 20564 } 20565 20566 /* Macros to extract header fields from data already in registers */ 20567 #ifdef _BIG_ENDIAN 20568 #define V_HLEN (v_hlen_tos_len >> 24) 20569 #define LENGTH (v_hlen_tos_len & 0xFFFF) 20570 #define PROTO (ttl_protocol & 0xFF) 20571 #else 20572 #define V_HLEN (v_hlen_tos_len & 0xFF) 20573 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 20574 #define PROTO (ttl_protocol >> 8) 20575 #endif 20576 20577 20578 orig_src = src = ipha->ipha_src; 20579 /* (The loop back to "another" is explained down below.) */ 20580 another:; 20581 /* 20582 * Assign an ident value for this packet. We assign idents on 20583 * a per destination basis out of the IRE. There could be 20584 * other threads targeting the same destination, so we have to 20585 * arrange for a atomic increment. Note that we use a 32-bit 20586 * atomic add because it has better performance than its 20587 * 16-bit sibling. 
20588 * 20589 * If running in cluster mode and if the source address 20590 * belongs to a replicated service then vector through 20591 * cl_inet_ipident vector to allocate ip identifier 20592 * NOTE: This is a contract private interface with the 20593 * clustering group. 20594 */ 20595 clusterwide = 0; 20596 if (cl_inet_ipident) { 20597 ASSERT(cl_inet_isclusterwide); 20598 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 20599 AF_INET, (uint8_t *)(uintptr_t)src)) { 20600 ipha->ipha_ident = (*cl_inet_ipident)(IPPROTO_IP, 20601 AF_INET, (uint8_t *)(uintptr_t)src, 20602 (uint8_t *)(uintptr_t)dst); 20603 clusterwide = 1; 20604 } 20605 } 20606 if (!clusterwide) { 20607 ipha->ipha_ident = 20608 (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 20609 } 20610 20611 #ifndef _BIG_ENDIAN 20612 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 20613 #endif 20614 20615 /* 20616 * Set source address unless sent on an ill or conn_unspec_src is set. 20617 * This is needed to obey conn_unspec_src when packets go through 20618 * ip_newroute + arp. 20619 * Assumes ip_newroute{,_multi} sets the source address as well. 20620 */ 20621 if (src == INADDR_ANY && !unspec_src) { 20622 /* 20623 * Assign the appropriate source address from the IRE if none 20624 * was specified. 20625 */ 20626 ASSERT(ire->ire_ipversion == IPV4_VERSION); 20627 20628 /* 20629 * With IP multipathing, broadcast packets are sent on the ire 20630 * that has been cleared of IRE_MARK_NORECV and that belongs to 20631 * the group. However, this ire might not be in the same zone so 20632 * we can't always use its source address. We look for a 20633 * broadcast ire in the same group and in the right zone. 
20634 */ 20635 if (ire->ire_type == IRE_BROADCAST && 20636 ire->ire_zoneid != zoneid) { 20637 ire_t *src_ire = ire_ctable_lookup(dst, 0, 20638 IRE_BROADCAST, ire->ire_ipif, zoneid, NULL, 20639 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP)); 20640 if (src_ire != NULL) { 20641 src = src_ire->ire_src_addr; 20642 ire_refrele(src_ire); 20643 } else { 20644 ire_refrele(ire); 20645 if (conn_outgoing_ill != NULL) 20646 ill_refrele(conn_outgoing_ill); 20647 freemsg(first_mp); 20648 BUMP_MIB(&ip_mib, ipOutDiscards); 20649 return; 20650 } 20651 } else { 20652 src = ire->ire_src_addr; 20653 } 20654 20655 if (connp == NULL) { 20656 ip1dbg(("ip_wput_ire: no connp and no src " 20657 "address for dst 0x%x, using src 0x%x\n", 20658 ntohl(dst), 20659 ntohl(src))); 20660 } 20661 ipha->ipha_src = src; 20662 } 20663 stq = ire->ire_stq; 20664 20665 /* 20666 * We only allow ire chains for broadcasts since there will 20667 * be multiple IRE_CACHE entries for the same multicast 20668 * address (one per ipif). 20669 */ 20670 next_mp = NULL; 20671 20672 /* broadcast packet */ 20673 if (ire->ire_type == IRE_BROADCAST) 20674 goto broadcast; 20675 20676 /* loopback ? 
*/ 20677 if (stq == NULL) 20678 goto nullstq; 20679 20680 /* The ill_index for outbound ILL */ 20681 ill_index = Q_TO_INDEX(stq); 20682 20683 BUMP_MIB(&ip_mib, ipOutRequests); 20684 ttl_protocol = ((uint16_t *)ipha)[4]; 20685 20686 /* pseudo checksum (do it in parts for IP header checksum) */ 20687 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 20688 20689 if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { 20690 queue_t *dev_q = stq->q_next; 20691 20692 /* flow controlled */ 20693 if ((dev_q->q_next || dev_q->q_first) && 20694 !canput(dev_q)) 20695 goto blocked; 20696 if ((PROTO == IPPROTO_UDP) && 20697 (ip_hdr_included != IP_HDR_INCLUDED)) { 20698 hlen = (V_HLEN & 0xF) << 2; 20699 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 20700 if (*up != 0) { 20701 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, 20702 hlen, LENGTH, max_frag, ipsec_len, cksum); 20703 /* Software checksum? */ 20704 if (DB_CKSUMFLAGS(mp) == 0) { 20705 IP_STAT(ip_out_sw_cksum); 20706 IP_STAT_UPDATE( 20707 ip_udp_out_sw_cksum_bytes, 20708 LENGTH - hlen); 20709 } 20710 } 20711 } 20712 } else if (ip_hdr_included != IP_HDR_INCLUDED) { 20713 hlen = (V_HLEN & 0xF) << 2; 20714 if (PROTO == IPPROTO_TCP) { 20715 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 20716 /* 20717 * The packet header is processed once and for all, even 20718 * in the multirouting case. We disable hardware 20719 * checksum if the packet is multirouted, as it will be 20720 * replicated via several interfaces, and not all of 20721 * them may have this capability. 20722 */ 20723 IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, 20724 LENGTH, max_frag, ipsec_len, cksum); 20725 /* Software checksum? 
*/ 20726 if (DB_CKSUMFLAGS(mp) == 0) { 20727 IP_STAT(ip_out_sw_cksum); 20728 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 20729 LENGTH - hlen); 20730 } 20731 } else { 20732 sctp_hdr_t *sctph; 20733 20734 ASSERT(PROTO == IPPROTO_SCTP); 20735 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 20736 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 20737 /* 20738 * Zero out the checksum field to ensure proper 20739 * checksum calculation. 20740 */ 20741 sctph->sh_chksum = 0; 20742 #ifdef DEBUG 20743 if (!skip_sctp_cksum) 20744 #endif 20745 sctph->sh_chksum = sctp_cksum(mp, hlen); 20746 } 20747 } 20748 20749 /* 20750 * If this is a multicast packet and originated from ip_wput 20751 * we need to do loopback and forwarding checks. If it comes 20752 * from ip_wput_multicast, we SHOULD not do this. 20753 */ 20754 if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; 20755 20756 /* checksum */ 20757 cksum += ttl_protocol; 20758 20759 /* fragment the packet */ 20760 if (max_frag < (uint_t)(LENGTH + ipsec_len)) 20761 goto fragmentit; 20762 /* 20763 * Don't use frag_flag if packet is pre-built or source 20764 * routed or if multicast (since multicast packets do 20765 * not solicit ICMP "packet too big" messages). 
20766 */ 20767 if ((ip_hdr_included != IP_HDR_INCLUDED) && 20768 (V_HLEN == IP_SIMPLE_HDR_VERSION || 20769 !ip_source_route_included(ipha)) && 20770 !CLASSD(ipha->ipha_dst)) 20771 ipha->ipha_fragment_offset_and_flags |= 20772 htons(ire->ire_frag_flag); 20773 20774 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 20775 /* calculate IP header checksum */ 20776 cksum += ipha->ipha_ident; 20777 cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); 20778 cksum += ipha->ipha_fragment_offset_and_flags; 20779 20780 /* IP options present */ 20781 hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; 20782 if (hlen) 20783 goto checksumoptions; 20784 20785 /* calculate hdr checksum */ 20786 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 20787 cksum = ~(cksum + (cksum >> 16)); 20788 ipha->ipha_hdr_checksum = (uint16_t)cksum; 20789 } 20790 if (ipsec_len != 0) { 20791 /* 20792 * We will do the rest of the processing after 20793 * we come back from IPSEC in ip_wput_ipsec_out(). 20794 */ 20795 ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); 20796 20797 io = (ipsec_out_t *)first_mp->b_rptr; 20798 io->ipsec_out_ill_index = ((ill_t *)stq->q_ptr)-> 20799 ill_phyint->phyint_ifindex; 20800 20801 ipsec_out_process(q, first_mp, ire, ill_index); 20802 ire_refrele(ire); 20803 if (conn_outgoing_ill != NULL) 20804 ill_refrele(conn_outgoing_ill); 20805 return; 20806 } 20807 20808 /* 20809 * In most cases, the emission loop below is entered only 20810 * once. Only in the case where the ire holds the 20811 * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT 20812 * flagged ires in the bucket, and send the packet 20813 * through all crossed RTF_MULTIRT routes. 20814 */ 20815 if (ire->ire_flags & RTF_MULTIRT) { 20816 multirt_send = B_TRUE; 20817 } 20818 do { 20819 if (multirt_send) { 20820 irb_t *irb; 20821 /* 20822 * We are in a multiple send case, need to get 20823 * the next ire and make a duplicate of the packet. 20824 * ire1 holds here the next ire to process in the 20825 * bucket. 
If multirouting is expected, 20826 * any non-RTF_MULTIRT ire that has the 20827 * right destination address is ignored. 20828 */ 20829 irb = ire->ire_bucket; 20830 ASSERT(irb != NULL); 20831 20832 IRB_REFHOLD(irb); 20833 for (ire1 = ire->ire_next; 20834 ire1 != NULL; 20835 ire1 = ire1->ire_next) { 20836 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 20837 continue; 20838 if (ire1->ire_addr != ire->ire_addr) 20839 continue; 20840 if (ire1->ire_marks & 20841 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 20842 continue; 20843 20844 /* Got one */ 20845 IRE_REFHOLD(ire1); 20846 break; 20847 } 20848 IRB_REFRELE(irb); 20849 20850 if (ire1 != NULL) { 20851 next_mp = copyb(mp); 20852 if ((next_mp == NULL) || 20853 ((mp->b_cont != NULL) && 20854 ((next_mp->b_cont = 20855 dupmsg(mp->b_cont)) == NULL))) { 20856 freemsg(next_mp); 20857 next_mp = NULL; 20858 ire_refrele(ire1); 20859 ire1 = NULL; 20860 } 20861 } 20862 20863 /* Last multiroute ire; don't loop anymore. */ 20864 if (ire1 == NULL) { 20865 multirt_send = B_FALSE; 20866 } 20867 } 20868 mp = ip_wput_attach_llhdr(mp, ire, IPP_LOCAL_OUT, ill_index); 20869 if (mp == NULL) { 20870 BUMP_MIB(&ip_mib, ipOutDiscards); 20871 ip2dbg(("ip_wput_ire: fastpath wput pkt dropped "\ 20872 "during IPPF processing\n")); 20873 ire_refrele(ire); 20874 if (next_mp != NULL) { 20875 freemsg(next_mp); 20876 ire_refrele(ire1); 20877 } 20878 if (conn_outgoing_ill != NULL) 20879 ill_refrele(conn_outgoing_ill); 20880 return; 20881 } 20882 UPDATE_OB_PKT_COUNT(ire); 20883 ire->ire_last_used_time = lbolt; 20884 20885 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 20886 "ip_wput_ire_end: q %p (%S)", 20887 q, "last copy out"); 20888 putnext(stq, mp); 20889 IRE_REFRELE(ire); 20890 20891 if (multirt_send) { 20892 ASSERT(ire1); 20893 /* 20894 * Proceed with the next RTF_MULTIRT ire, 20895 * Also set up the send-to queue accordingly. 
20896 */ 20897 ire = ire1; 20898 ire1 = NULL; 20899 stq = ire->ire_stq; 20900 mp = next_mp; 20901 next_mp = NULL; 20902 ipha = (ipha_t *)mp->b_rptr; 20903 ill_index = Q_TO_INDEX(stq); 20904 } 20905 } while (multirt_send); 20906 if (conn_outgoing_ill != NULL) 20907 ill_refrele(conn_outgoing_ill); 20908 return; 20909 20910 /* 20911 * ire->ire_type == IRE_BROADCAST (minimize diffs) 20912 */ 20913 broadcast: 20914 { 20915 /* 20916 * Avoid broadcast storms by setting the ttl to 1 20917 * for broadcasts. This parameter can be set 20918 * via ndd, so make sure that for the SO_DONTROUTE 20919 * case that ipha_ttl is always set to 1. 20920 * In the event that we are replying to incoming 20921 * ICMP packets, conn could be NULL. 20922 */ 20923 if ((connp != NULL) && connp->conn_dontroute) 20924 ipha->ipha_ttl = 1; 20925 else 20926 ipha->ipha_ttl = ip_broadcast_ttl; 20927 20928 /* 20929 * Note that we are not doing a IRB_REFHOLD here. 20930 * Actually we don't care if the list changes i.e 20931 * if somebody deletes an IRE from the list while 20932 * we drop the lock, the next time we come around 20933 * ire_next will be NULL and hence we won't send 20934 * out multiple copies which is fine. 
20935 */ 20936 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 20937 ire1 = ire->ire_next; 20938 if (conn_outgoing_ill != NULL) { 20939 while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { 20940 ASSERT(ire1 == ire->ire_next); 20941 if (ire1 != NULL && ire1->ire_addr == dst) { 20942 ire_refrele(ire); 20943 ire = ire1; 20944 IRE_REFHOLD(ire); 20945 ire1 = ire->ire_next; 20946 continue; 20947 } 20948 rw_exit(&ire->ire_bucket->irb_lock); 20949 /* Did not find a matching ill */ 20950 ip1dbg(("ip_wput_ire: broadcast with no " 20951 "matching IP_BOUND_IF ill %s\n", 20952 conn_outgoing_ill->ill_name)); 20953 freemsg(first_mp); 20954 if (ire != NULL) 20955 ire_refrele(ire); 20956 ill_refrele(conn_outgoing_ill); 20957 return; 20958 } 20959 } else if (ire1 != NULL && ire1->ire_addr == dst) { 20960 /* 20961 * If the next IRE has the same address and is not one 20962 * of the two copies that we need to send, try to see 20963 * whether this copy should be sent at all. This 20964 * assumes that we insert loopbacks first and then 20965 * non-loopbacks. This is acheived by inserting the 20966 * loopback always before non-loopback. 20967 * This is used to send a single copy of a broadcast 20968 * packet out all physical interfaces that have an 20969 * matching IRE_BROADCAST while also looping 20970 * back one copy (to ip_wput_local) for each 20971 * matching physical interface. However, we avoid 20972 * sending packets out different logical that match by 20973 * having ipif_up/ipif_down supress duplicate 20974 * IRE_BROADCASTS. 20975 * 20976 * This feature is currently used to get broadcasts 20977 * sent to multiple interfaces, when the broadcast 20978 * address being used applies to multiple interfaces. 20979 * For example, a whole net broadcast will be 20980 * replicated on every connected subnet of 20981 * the target net. 
20982 * 20983 * Each zone has its own set of IRE_BROADCASTs, so that 20984 * we're able to distribute inbound packets to multiple 20985 * zones who share a broadcast address. We avoid looping 20986 * back outbound packets in different zones but on the 20987 * same ill, as the application would see duplicates. 20988 * 20989 * If the interfaces are part of the same group, 20990 * we would want to send only one copy out for 20991 * whole group. 20992 * 20993 * This logic assumes that ire_add_v4() groups the 20994 * IRE_BROADCAST entries so that those with the same 20995 * ire_addr and ill_group are kept together. 20996 */ 20997 ire_ill = ire->ire_ipif->ipif_ill; 20998 if (ire->ire_stq == NULL && ire1->ire_stq != NULL) { 20999 if (ire_ill->ill_group != NULL && 21000 (ire->ire_marks & IRE_MARK_NORECV)) { 21001 /* 21002 * If the current zone only has an ire 21003 * broadcast for this address marked 21004 * NORECV, the ire we want is ahead in 21005 * the bucket, so we look it up 21006 * deliberately ignoring the zoneid. 
21007 */ 21008 for (ire1 = ire->ire_bucket->irb_ire; 21009 ire1 != NULL; 21010 ire1 = ire1->ire_next) { 21011 ire1_ill = 21012 ire1->ire_ipif->ipif_ill; 21013 if (ire1->ire_addr != dst) 21014 continue; 21015 /* skip over the current ire */ 21016 if (ire1 == ire) 21017 continue; 21018 /* skip over deleted ires */ 21019 if (ire1->ire_marks & 21020 IRE_MARK_CONDEMNED) 21021 continue; 21022 /* 21023 * non-loopback ire in our 21024 * group: use it for the next 21025 * pass in the loop 21026 */ 21027 if (ire1->ire_stq != NULL && 21028 ire1_ill->ill_group == 21029 ire_ill->ill_group) 21030 break; 21031 } 21032 } 21033 } else { 21034 while (ire1 != NULL && ire1->ire_addr == dst) { 21035 ire1_ill = ire1->ire_ipif->ipif_ill; 21036 /* 21037 * We can have two broadcast ires on the 21038 * same ill in different zones; here 21039 * we'll send a copy of the packet on 21040 * each ill and the fanout code will 21041 * call conn_wantpacket() to check that 21042 * the zone has the broadcast address 21043 * configured on the ill. If the two 21044 * ires are in the same group we only 21045 * send one copy up. 21046 */ 21047 if (ire1_ill != ire_ill && 21048 (ire1_ill->ill_group == NULL || 21049 ire_ill->ill_group == NULL || 21050 ire1_ill->ill_group != 21051 ire_ill->ill_group)) { 21052 break; 21053 } 21054 ire1 = ire1->ire_next; 21055 } 21056 } 21057 } 21058 ASSERT(multirt_send == B_FALSE); 21059 if (ire1 != NULL && ire1->ire_addr == dst) { 21060 if ((ire->ire_flags & RTF_MULTIRT) && 21061 (ire1->ire_flags & RTF_MULTIRT)) { 21062 /* 21063 * We are in the multirouting case. 21064 * The message must be sent at least 21065 * on both ires. These ires have been 21066 * inserted AFTER the standard ones 21067 * in ip_rt_add(). There are thus no 21068 * other ire entries for the destination 21069 * address in the rest of the bucket 21070 * that do not have the RTF_MULTIRT 21071 * flag. We don't process a copy 21072 * of the message here. This will be 21073 * done in the final sending loop. 
21074 */ 21075 multirt_send = B_TRUE; 21076 } else { 21077 next_mp = ip_copymsg(first_mp); 21078 if (next_mp != NULL) 21079 IRE_REFHOLD(ire1); 21080 } 21081 } 21082 rw_exit(&ire->ire_bucket->irb_lock); 21083 } 21084 21085 if (stq) { 21086 /* 21087 * A non-NULL send-to queue means this packet is going 21088 * out of this machine. 21089 */ 21090 21091 BUMP_MIB(&ip_mib, ipOutRequests); 21092 ttl_protocol = ((uint16_t *)ipha)[4]; 21093 /* 21094 * We accumulate the pseudo header checksum in cksum. 21095 * This is pretty hairy code, so watch close. One 21096 * thing to keep in mind is that UDP and TCP have 21097 * stored their respective datagram lengths in their 21098 * checksum fields. This lines things up real nice. 21099 */ 21100 cksum = (dst >> 16) + (dst & 0xFFFF) + 21101 (src >> 16) + (src & 0xFFFF); 21102 /* 21103 * We assume the udp checksum field contains the 21104 * length, so to compute the pseudo header checksum, 21105 * all we need is the protocol number and src/dst. 21106 */ 21107 /* Provide the checksums for UDP and TCP. 
*/ 21108 if ((PROTO == IPPROTO_TCP) && 21109 (ip_hdr_included != IP_HDR_INCLUDED)) { 21110 /* hlen gets the number of uchar_ts in the IP header */ 21111 hlen = (V_HLEN & 0xF) << 2; 21112 up = IPH_TCPH_CHECKSUMP(ipha, hlen); 21113 IP_STAT(ip_out_sw_cksum); 21114 IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, 21115 LENGTH - hlen); 21116 *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); 21117 if (*up == 0) 21118 *up = 0xFFFF; 21119 } else if (PROTO == IPPROTO_SCTP && 21120 (ip_hdr_included != IP_HDR_INCLUDED)) { 21121 sctp_hdr_t *sctph; 21122 21123 hlen = (V_HLEN & 0xF) << 2; 21124 ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); 21125 sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); 21126 sctph->sh_chksum = 0; 21127 #ifdef DEBUG 21128 if (!skip_sctp_cksum) 21129 #endif 21130 sctph->sh_chksum = sctp_cksum(mp, hlen); 21131 } else { 21132 queue_t *dev_q = stq->q_next; 21133 21134 if ((dev_q->q_next || dev_q->q_first) && 21135 !canput(dev_q)) { 21136 blocked: 21137 ipha->ipha_ident = ip_hdr_included; 21138 /* 21139 * If we don't have a conn to apply 21140 * backpressure, free the message. 21141 * In the ire_send path, we don't know 21142 * the position to requeue the packet. Rather 21143 * than reorder packets, we just drop this 21144 * packet. 21145 */ 21146 if (ip_output_queue && connp != NULL && 21147 caller != IRE_SEND) { 21148 if (caller == IP_WSRV) { 21149 connp->conn_did_putbq = 1; 21150 (void) putbq(connp->conn_wq, 21151 first_mp); 21152 conn_drain_insert(connp); 21153 /* 21154 * This is the service thread, 21155 * and the queue is already 21156 * noenabled. The check for 21157 * canput and the putbq is not 21158 * atomic. So we need to check 21159 * again. 21160 */ 21161 if (canput(stq->q_next)) 21162 connp->conn_did_putbq 21163 = 0; 21164 IP_STAT(ip_conn_flputbq); 21165 } else { 21166 /* 21167 * We are not the service proc. 21168 * ip_wsrv will be scheduled or 21169 * is already running. 
21170 */ 21171 (void) putq(connp->conn_wq, 21172 first_mp); 21173 } 21174 } else { 21175 BUMP_MIB(&ip_mib, ipOutDiscards); 21176 freemsg(first_mp); 21177 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21178 "ip_wput_ire_end: q %p (%S)", 21179 q, "discard"); 21180 } 21181 ire_refrele(ire); 21182 if (next_mp) { 21183 ire_refrele(ire1); 21184 freemsg(next_mp); 21185 } 21186 if (conn_outgoing_ill != NULL) 21187 ill_refrele(conn_outgoing_ill); 21188 return; 21189 } 21190 if ((PROTO == IPPROTO_UDP) && 21191 (ip_hdr_included != IP_HDR_INCLUDED)) { 21192 /* 21193 * hlen gets the number of uchar_ts in the 21194 * IP header 21195 */ 21196 hlen = (V_HLEN & 0xF) << 2; 21197 up = IPH_UDPH_CHECKSUMP(ipha, hlen); 21198 max_frag = ire->ire_max_frag; 21199 if (*up != 0) { 21200 IP_CKSUM_XMIT(ire_ill, ire, mp, ipha, 21201 up, PROTO, hlen, LENGTH, max_frag, 21202 ipsec_len, cksum); 21203 /* Software checksum? */ 21204 if (DB_CKSUMFLAGS(mp) == 0) { 21205 IP_STAT(ip_out_sw_cksum); 21206 IP_STAT_UPDATE( 21207 ip_udp_out_sw_cksum_bytes, 21208 LENGTH - hlen); 21209 } 21210 } 21211 } 21212 } 21213 /* 21214 * Need to do this even when fragmenting. The local 21215 * loopback can be done without computing checksums 21216 * but forwarding out other interface must be done 21217 * after the IP checksum (and ULP checksums) have been 21218 * computed. 21219 * 21220 * NOTE : multicast_forward is set only if this packet 21221 * originated from ip_wput. For packets originating from 21222 * ip_wput_multicast, it is not set. 21223 */ 21224 if (CLASSD(ipha->ipha_dst) && multicast_forward) { 21225 multi_loopback: 21226 ip2dbg(("ip_wput: multicast, loop %d\n", 21227 conn_multicast_loop)); 21228 21229 /* Forget header checksum offload */ 21230 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 21231 21232 /* 21233 * Local loopback of multicasts? Check the 21234 * ill. 
21235 * 21236 * Note that the loopback function will not come 21237 * in through ip_rput - it will only do the 21238 * client fanout thus we need to do an mforward 21239 * as well. The is different from the BSD 21240 * logic. 21241 */ 21242 if (ill != NULL) { 21243 ilm_t *ilm; 21244 21245 ILM_WALKER_HOLD(ill); 21246 ilm = ilm_lookup_ill(ill, ipha->ipha_dst, 21247 ALL_ZONES); 21248 ILM_WALKER_RELE(ill); 21249 if (ilm != NULL) { 21250 /* 21251 * Pass along the virtual output q. 21252 * ip_wput_local() will distribute the 21253 * packet to all the matching zones, 21254 * except the sending zone when 21255 * IP_MULTICAST_LOOP is false. 21256 */ 21257 ip_multicast_loopback(q, ill, first_mp, 21258 conn_multicast_loop ? 0 : 21259 IP_FF_NO_MCAST_LOOP, zoneid); 21260 } 21261 } 21262 if (ipha->ipha_ttl == 0) { 21263 /* 21264 * 0 => only to this host i.e. we are 21265 * done. We are also done if this was the 21266 * loopback interface since it is sufficient 21267 * to loopback one copy of a multicast packet. 21268 */ 21269 freemsg(first_mp); 21270 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21271 "ip_wput_ire_end: q %p (%S)", 21272 q, "loopback"); 21273 ire_refrele(ire); 21274 if (conn_outgoing_ill != NULL) 21275 ill_refrele(conn_outgoing_ill); 21276 return; 21277 } 21278 /* 21279 * ILLF_MULTICAST is checked in ip_newroute 21280 * i.e. we don't need to check it here since 21281 * all IRE_CACHEs come from ip_newroute. 21282 * For multicast traffic, SO_DONTROUTE is interpreted 21283 * to mean only send the packet out the interface 21284 * (optionally specified with IP_MULTICAST_IF) 21285 * and do not forward it out additional interfaces. 21286 * RSVP and the rsvp daemon is an example of a 21287 * protocol and user level process that 21288 * handles it's own routing. Hence, it uses the 21289 * SO_DONTROUTE option to accomplish this. 
21290 */ 21291 21292 if (ip_g_mrouter && !conn_dontroute && ill != NULL) { 21293 /* Unconditionally redo the checksum */ 21294 ipha->ipha_hdr_checksum = 0; 21295 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 21296 21297 /* 21298 * If this needs to go out secure, we need 21299 * to wait till we finish the IPSEC 21300 * processing. 21301 */ 21302 if (ipsec_len == 0 && 21303 ip_mforward(ill, ipha, mp)) { 21304 freemsg(first_mp); 21305 ip1dbg(("ip_wput: mforward failed\n")); 21306 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21307 "ip_wput_ire_end: q %p (%S)", 21308 q, "mforward failed"); 21309 ire_refrele(ire); 21310 if (conn_outgoing_ill != NULL) 21311 ill_refrele(conn_outgoing_ill); 21312 return; 21313 } 21314 } 21315 } 21316 max_frag = ire->ire_max_frag; 21317 cksum += ttl_protocol; 21318 if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { 21319 /* No fragmentation required for this one. */ 21320 /* 21321 * Don't use frag_flag if packet is pre-built or source 21322 * routed or if multicast (since multicast packets do 21323 * not solicit ICMP "packet too big" messages). 21324 */ 21325 if ((ip_hdr_included != IP_HDR_INCLUDED) && 21326 (V_HLEN == IP_SIMPLE_HDR_VERSION || 21327 !ip_source_route_included(ipha)) && 21328 !CLASSD(ipha->ipha_dst)) 21329 ipha->ipha_fragment_offset_and_flags |= 21330 htons(ire->ire_frag_flag); 21331 21332 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 21333 /* Complete the IP header checksum. */ 21334 cksum += ipha->ipha_ident; 21335 cksum += (v_hlen_tos_len >> 16)+ 21336 (v_hlen_tos_len & 0xFFFF); 21337 cksum += ipha->ipha_fragment_offset_and_flags; 21338 hlen = (V_HLEN & 0xF) - 21339 IP_SIMPLE_HDR_LENGTH_IN_WORDS; 21340 if (hlen) { 21341 checksumoptions: 21342 /* 21343 * Account for the IP Options in the IP 21344 * header checksum. 
21345 */ 21346 up = (uint16_t *)(rptr+ 21347 IP_SIMPLE_HDR_LENGTH); 21348 do { 21349 cksum += up[0]; 21350 cksum += up[1]; 21351 up += 2; 21352 } while (--hlen); 21353 } 21354 cksum = ((cksum & 0xFFFF) + (cksum >> 16)); 21355 cksum = ~(cksum + (cksum >> 16)); 21356 ipha->ipha_hdr_checksum = (uint16_t)cksum; 21357 } 21358 if (ipsec_len != 0) { 21359 ipsec_out_process(q, first_mp, ire, ill_index); 21360 if (!next_mp) { 21361 ire_refrele(ire); 21362 if (conn_outgoing_ill != NULL) 21363 ill_refrele(conn_outgoing_ill); 21364 return; 21365 } 21366 goto next; 21367 } 21368 21369 /* 21370 * multirt_send has already been handled 21371 * for broadcast, but not yet for multicast 21372 * or IP options. 21373 */ 21374 if (next_mp == NULL) { 21375 if (ire->ire_flags & RTF_MULTIRT) { 21376 multirt_send = B_TRUE; 21377 } 21378 } 21379 21380 /* 21381 * In most cases, the emission loop below is 21382 * entered only once. Only in the case where 21383 * the ire holds the RTF_MULTIRT flag, do we loop 21384 * to process all RTF_MULTIRT ires in the bucket, 21385 * and send the packet through all crossed 21386 * RTF_MULTIRT routes. 21387 */ 21388 do { 21389 if (multirt_send) { 21390 irb_t *irb; 21391 21392 irb = ire->ire_bucket; 21393 ASSERT(irb != NULL); 21394 /* 21395 * We are in a multiple send case, 21396 * need to get the next IRE and make 21397 * a duplicate of the packet. 
21398 */ 21399 IRB_REFHOLD(irb); 21400 for (ire1 = ire->ire_next; 21401 ire1 != NULL; 21402 ire1 = ire1->ire_next) { 21403 if (!(ire1->ire_flags & 21404 RTF_MULTIRT)) 21405 continue; 21406 if (ire1->ire_addr != 21407 ire->ire_addr) 21408 continue; 21409 if (ire1->ire_marks & 21410 (IRE_MARK_CONDEMNED| 21411 IRE_MARK_HIDDEN)) 21412 continue; 21413 21414 /* Got one */ 21415 IRE_REFHOLD(ire1); 21416 break; 21417 } 21418 IRB_REFRELE(irb); 21419 21420 if (ire1 != NULL) { 21421 next_mp = copyb(mp); 21422 if ((next_mp == NULL) || 21423 ((mp->b_cont != NULL) && 21424 ((next_mp->b_cont = 21425 dupmsg(mp->b_cont)) 21426 == NULL))) { 21427 freemsg(next_mp); 21428 next_mp = NULL; 21429 ire_refrele(ire1); 21430 ire1 = NULL; 21431 } 21432 } 21433 21434 /* 21435 * Last multiroute ire; don't loop 21436 * anymore. The emission is over 21437 * and next_mp is NULL. 21438 */ 21439 if (ire1 == NULL) { 21440 multirt_send = B_FALSE; 21441 } 21442 } 21443 21444 ASSERT(ipsec_len == 0); 21445 mp1 = ip_wput_attach_llhdr(mp, ire, 21446 IPP_LOCAL_OUT, ill_index); 21447 if (mp1 == NULL) { 21448 BUMP_MIB(&ip_mib, ipOutDiscards); 21449 if (next_mp) { 21450 freemsg(next_mp); 21451 ire_refrele(ire1); 21452 } 21453 ire_refrele(ire); 21454 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21455 "ip_wput_ire_end: q %p (%S)", 21456 q, "discard MDATA"); 21457 if (conn_outgoing_ill != NULL) 21458 ill_refrele(conn_outgoing_ill); 21459 return; 21460 } 21461 UPDATE_OB_PKT_COUNT(ire); 21462 ire->ire_last_used_time = lbolt; 21463 21464 if (multirt_send) { 21465 /* 21466 * We are in a multiple send case, 21467 * need to re-enter the sending loop 21468 * using the next ire. 21469 */ 21470 putnext(stq, mp1); 21471 ire_refrele(ire); 21472 ire = ire1; 21473 stq = ire->ire_stq; 21474 mp = next_mp; 21475 next_mp = NULL; 21476 ipha = (ipha_t *)mp->b_rptr; 21477 ill_index = Q_TO_INDEX(stq); 21478 } 21479 } while (multirt_send); 21480 21481 if (!next_mp) { 21482 /* 21483 * Last copy going out (the ultra-common 21484 * case). 
Note that we intentionally replicate 21485 * the putnext rather than calling it before 21486 * the next_mp check in hopes of a little 21487 * tail-call action out of the compiler. 21488 */ 21489 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21490 "ip_wput_ire_end: q %p (%S)", 21491 q, "last copy out(1)"); 21492 putnext(stq, mp1); 21493 ire_refrele(ire); 21494 if (conn_outgoing_ill != NULL) 21495 ill_refrele(conn_outgoing_ill); 21496 return; 21497 } 21498 /* More copies going out below. */ 21499 putnext(stq, mp1); 21500 } else { 21501 int offset; 21502 fragmentit: 21503 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 21504 /* 21505 * If this would generate a icmp_frag_needed message, 21506 * we need to handle it before we do the IPSEC 21507 * processing. Otherwise, we need to strip the IPSEC 21508 * headers before we send up the message to the ULPs 21509 * which becomes messy and difficult. 21510 */ 21511 if (ipsec_len != 0) { 21512 if ((max_frag < (unsigned int)(LENGTH + 21513 ipsec_len)) && (offset & IPH_DF)) { 21514 21515 BUMP_MIB(&ip_mib, ipFragFails); 21516 ipha->ipha_hdr_checksum = 0; 21517 ipha->ipha_hdr_checksum = 21518 (uint16_t)ip_csum_hdr(ipha); 21519 icmp_frag_needed(ire->ire_stq, first_mp, 21520 max_frag); 21521 if (!next_mp) { 21522 ire_refrele(ire); 21523 if (conn_outgoing_ill != NULL) { 21524 ill_refrele( 21525 conn_outgoing_ill); 21526 } 21527 return; 21528 } 21529 } else { 21530 /* 21531 * This won't cause a icmp_frag_needed 21532 * message. to be gnerated. Send it on 21533 * the wire. Note that this could still 21534 * cause fragmentation and all we 21535 * do is the generation of the message 21536 * to the ULP if needed before IPSEC. 
21537 */ 21538 if (!next_mp) { 21539 ipsec_out_process(q, first_mp, 21540 ire, ill_index); 21541 TRACE_2(TR_FAC_IP, 21542 TR_IP_WPUT_IRE_END, 21543 "ip_wput_ire_end: q %p " 21544 "(%S)", q, 21545 "last ipsec_out_process"); 21546 ire_refrele(ire); 21547 if (conn_outgoing_ill != NULL) { 21548 ill_refrele( 21549 conn_outgoing_ill); 21550 } 21551 return; 21552 } 21553 ipsec_out_process(q, first_mp, 21554 ire, ill_index); 21555 } 21556 } else { 21557 /* Initiate IPPF processing */ 21558 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21559 ip_process(IPP_LOCAL_OUT, &mp, 21560 ill_index); 21561 if (mp == NULL) { 21562 BUMP_MIB(&ip_mib, 21563 ipOutDiscards); 21564 if (next_mp != NULL) { 21565 freemsg(next_mp); 21566 ire_refrele(ire1); 21567 } 21568 ire_refrele(ire); 21569 TRACE_2(TR_FAC_IP, 21570 TR_IP_WPUT_IRE_END, 21571 "ip_wput_ire: q %p (%S)", 21572 q, "discard MDATA"); 21573 if (conn_outgoing_ill != NULL) { 21574 ill_refrele( 21575 conn_outgoing_ill); 21576 } 21577 return; 21578 } 21579 } 21580 if (!next_mp) { 21581 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21582 "ip_wput_ire_end: q %p (%S)", 21583 q, "last fragmentation"); 21584 ip_wput_ire_fragmentit(mp, ire); 21585 ire_refrele(ire); 21586 if (conn_outgoing_ill != NULL) 21587 ill_refrele(conn_outgoing_ill); 21588 return; 21589 } 21590 ip_wput_ire_fragmentit(mp, ire); 21591 } 21592 } 21593 } else { 21594 nullstq: 21595 /* A NULL stq means the destination address is local. 
*/ 21596 UPDATE_OB_PKT_COUNT(ire); 21597 ire->ire_last_used_time = lbolt; 21598 ASSERT(ire->ire_ipif != NULL); 21599 if (!next_mp) { 21600 TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, 21601 "ip_wput_ire_end: q %p (%S)", 21602 q, "local address"); 21603 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, 21604 first_mp, ire, 0, ire->ire_zoneid); 21605 ire_refrele(ire); 21606 if (conn_outgoing_ill != NULL) 21607 ill_refrele(conn_outgoing_ill); 21608 return; 21609 } 21610 ip_wput_local(q, ire->ire_ipif->ipif_ill, ipha, first_mp, 21611 ire, 0, ire->ire_zoneid); 21612 } 21613 next: 21614 /* 21615 * More copies going out to additional interfaces. 21616 * ire1 has already been held. We don't need the 21617 * "ire" anymore. 21618 */ 21619 ire_refrele(ire); 21620 ire = ire1; 21621 ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); 21622 mp = next_mp; 21623 ASSERT(ire->ire_ipversion == IPV4_VERSION); 21624 ill = ire_to_ill(ire); 21625 first_mp = mp; 21626 if (ipsec_len != 0) { 21627 ASSERT(first_mp->b_datap->db_type == M_CTL); 21628 mp = mp->b_cont; 21629 } 21630 dst = ire->ire_addr; 21631 ipha = (ipha_t *)mp->b_rptr; 21632 /* 21633 * Restore src so that we will pick up ire->ire_src_addr if src was 0. 21634 * Restore ipha_ident "no checksum" flag. 21635 */ 21636 src = orig_src; 21637 ipha->ipha_ident = ip_hdr_included; 21638 goto another; 21639 21640 #undef rptr 21641 #undef Q_TO_INDEX 21642 } 21643 21644 /* 21645 * Routine to allocate a message that is used to notify the ULP about MDT. 21646 * The caller may provide a pointer to the link-layer MDT capabilities, 21647 * or NULL if MDT is to be disabled on the stream. 
21648 */ 21649 mblk_t * 21650 ip_mdinfo_alloc(ill_mdt_capab_t *isrc) 21651 { 21652 mblk_t *mp; 21653 ip_mdt_info_t *mdti; 21654 ill_mdt_capab_t *idst; 21655 21656 if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { 21657 DB_TYPE(mp) = M_CTL; 21658 mp->b_wptr = mp->b_rptr + sizeof (*mdti); 21659 mdti = (ip_mdt_info_t *)mp->b_rptr; 21660 mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; 21661 idst = &(mdti->mdt_capab); 21662 21663 /* 21664 * If the caller provides us with the capability, copy 21665 * it over into our notification message; otherwise 21666 * we zero out the capability portion. 21667 */ 21668 if (isrc != NULL) 21669 bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); 21670 else 21671 bzero((caddr_t)idst, sizeof (*idst)); 21672 } 21673 return (mp); 21674 } 21675 21676 /* 21677 * Routine which determines whether MDT can be enabled on the destination 21678 * IRE and IPC combination, and if so, allocates and returns the MDT 21679 * notification mblk that may be used by ULP. We also check if we need to 21680 * turn MDT back to 'on' when certain restrictions prohibiting us to allow 21681 * MDT usage in the past have been lifted. This gets called during IP 21682 * and ULP binding. 21683 */ 21684 mblk_t * 21685 ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, 21686 ill_mdt_capab_t *mdt_cap) 21687 { 21688 mblk_t *mp; 21689 boolean_t rc = B_FALSE; 21690 21691 ASSERT(dst_ire != NULL); 21692 ASSERT(connp != NULL); 21693 ASSERT(mdt_cap != NULL); 21694 21695 /* 21696 * Currently, we only support simple TCP/{IPv4,IPv6} with 21697 * Multidata, which is handled in tcp_multisend(). This 21698 * is the reason why we do all these checks here, to ensure 21699 * that we don't enable Multidata for the cases which we 21700 * can't handle at the moment. 21701 */ 21702 do { 21703 /* Only do TCP at the moment */ 21704 if (connp->conn_ulp != IPPROTO_TCP) 21705 break; 21706 21707 /* 21708 * IPSEC outbound policy present? 
Note that we get here 21709 * after calling ipsec_conn_cache_policy() where the global 21710 * policy checking is performed. conn_latch will be 21711 * non-NULL as long as there's a policy defined, 21712 * i.e. conn_out_enforce_policy may be NULL in such case 21713 * when the connection is non-secure, and hence we check 21714 * further if the latch refers to an outbound policy. 21715 */ 21716 if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) 21717 break; 21718 21719 /* CGTP (multiroute) is enabled? */ 21720 if (dst_ire->ire_flags & RTF_MULTIRT) 21721 break; 21722 21723 /* Outbound IPQoS enabled? */ 21724 if (IPP_ENABLED(IPP_LOCAL_OUT)) { 21725 /* 21726 * In this case, we disable MDT for this and all 21727 * future connections going over the interface. 21728 */ 21729 mdt_cap->ill_mdt_on = 0; 21730 break; 21731 } 21732 21733 /* socket option(s) present? */ 21734 if (!CONN_IS_MD_FASTPATH(connp)) 21735 break; 21736 21737 rc = B_TRUE; 21738 /* CONSTCOND */ 21739 } while (0); 21740 21741 /* Remember the result */ 21742 connp->conn_mdt_ok = rc; 21743 21744 if (!rc) 21745 return (NULL); 21746 else if (!mdt_cap->ill_mdt_on) { 21747 /* 21748 * If MDT has been previously turned off in the past, and we 21749 * currently can do MDT (due to IPQoS policy removal, etc.) 21750 * then enable it for this interface. 21751 */ 21752 mdt_cap->ill_mdt_on = 1; 21753 ip1dbg(("ip_mdinfo_return: reenabling MDT for " 21754 "interface %s\n", ill_name)); 21755 } 21756 21757 /* Allocate the MDT info mblk */ 21758 if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { 21759 ip0dbg(("ip_mdinfo_return: can't enable Multidata for " 21760 "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); 21761 return (NULL); 21762 } 21763 return (mp); 21764 } 21765 21766 /* 21767 * Create destination address attribute, and fill it with the physical 21768 * destination address and SAP taken from the template DL_UNITDATA_REQ 21769 * message block. 
21770 */ 21771 boolean_t 21772 ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) 21773 { 21774 dl_unitdata_req_t *dlurp; 21775 pattr_t *pa; 21776 pattrinfo_t pa_info; 21777 pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; 21778 uint_t das_len, das_off; 21779 21780 ASSERT(dlmp != NULL); 21781 21782 dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; 21783 das_len = dlurp->dl_dest_addr_length; 21784 das_off = dlurp->dl_dest_addr_offset; 21785 21786 pa_info.type = PATTR_DSTADDRSAP; 21787 pa_info.len = sizeof (**das) + das_len - 1; 21788 21789 /* create and associate the attribute */ 21790 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21791 if (pa != NULL) { 21792 ASSERT(*das != NULL); 21793 (*das)->addr_is_group = 0; 21794 (*das)->addr_len = (uint8_t)das_len; 21795 bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); 21796 } 21797 21798 return (pa != NULL); 21799 } 21800 21801 /* 21802 * Create hardware checksum attribute and fill it with the values passed. 21803 */ 21804 boolean_t 21805 ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, 21806 uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) 21807 { 21808 pattr_t *pa; 21809 pattrinfo_t pa_info; 21810 21811 ASSERT(mmd != NULL); 21812 21813 pa_info.type = PATTR_HCKSUM; 21814 pa_info.len = sizeof (pattr_hcksum_t); 21815 21816 /* create and associate the attribute */ 21817 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21818 if (pa != NULL) { 21819 pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; 21820 21821 hck->hcksum_start_offset = start_offset; 21822 hck->hcksum_stuff_offset = stuff_offset; 21823 hck->hcksum_end_offset = end_offset; 21824 hck->hcksum_flags = flags; 21825 } 21826 return (pa != NULL); 21827 } 21828 21829 /* 21830 * Create zerocopy attribute and fill it with the specified flags 21831 */ 21832 boolean_t 21833 ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) 21834 { 21835 pattr_t *pa; 21836 pattrinfo_t pa_info; 21837 
21838 ASSERT(mmd != NULL); 21839 pa_info.type = PATTR_ZCOPY; 21840 pa_info.len = sizeof (pattr_zcopy_t); 21841 21842 /* create and associate the attribute */ 21843 pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); 21844 if (pa != NULL) { 21845 pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; 21846 21847 zcopy->zcopy_flags = flags; 21848 } 21849 return (pa != NULL); 21850 } 21851 21852 /* 21853 * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message 21854 * block chain. We could rewrite to handle arbitrary message block chains but 21855 * that would make the code complicated and slow. Right now there three 21856 * restrictions: 21857 * 21858 * 1. The first message block must contain the complete IP header and 21859 * at least 1 byte of payload data. 21860 * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed 21861 * so that we can use a single Multidata message. 21862 * 3. No frag must be distributed over two or more message blocks so 21863 * that we don't need more than two packet descriptors per frag. 21864 * 21865 * The above restrictions allow us to support userland applications (which 21866 * will send down a single message block) and NFS over UDP (which will 21867 * send down a chain of at most three message blocks). 21868 * 21869 * We also don't use MDT for payloads with less than or equal to 21870 * ip_wput_frag_mdt_min bytes because it would cause too much overhead. 21871 */ 21872 boolean_t 21873 ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) 21874 { 21875 int blocks; 21876 ssize_t total, missing, size; 21877 21878 ASSERT(mp != NULL); 21879 ASSERT(hdr_len > 0); 21880 21881 size = MBLKL(mp) - hdr_len; 21882 if (size <= 0) 21883 return (B_FALSE); 21884 21885 /* The first mblk contains the header and some payload. */ 21886 blocks = 1; 21887 total = size; 21888 size %= len; 21889 missing = (size == 0) ? 
0 : (len - size); 21890 mp = mp->b_cont; 21891 21892 while (mp != NULL) { 21893 /* 21894 * Give up if we encounter a zero length message block. 21895 * In practice, this should rarely happen and therefore 21896 * not worth the trouble of freeing and re-linking the 21897 * mblk from the chain to handle such case. 21898 */ 21899 if ((size = MBLKL(mp)) == 0) 21900 return (B_FALSE); 21901 21902 /* Too many payload buffers for a single Multidata message? */ 21903 if (++blocks > MULTIDATA_MAX_PBUFS) 21904 return (B_FALSE); 21905 21906 total += size; 21907 /* Is a frag distributed over two or more message blocks? */ 21908 if (missing > size) 21909 return (B_FALSE); 21910 size -= missing; 21911 21912 size %= len; 21913 missing = (size == 0) ? 0 : (len - size); 21914 21915 mp = mp->b_cont; 21916 } 21917 21918 return (total > ip_wput_frag_mdt_min); 21919 } 21920 21921 /* 21922 * Outbound IPv4 fragmentation routine using MDT. 21923 */ 21924 static void 21925 ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, 21926 uint32_t frag_flag, int offset) 21927 { 21928 ipha_t *ipha_orig; 21929 int i1, ip_data_end; 21930 uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; 21931 mblk_t *hdr_mp, *md_mp = NULL; 21932 unsigned char *hdr_ptr, *pld_ptr; 21933 multidata_t *mmd; 21934 ip_pdescinfo_t pdi; 21935 21936 ASSERT(DB_TYPE(mp) == M_DATA); 21937 ASSERT(MBLKL(mp) > sizeof (ipha_t)); 21938 21939 ipha_orig = (ipha_t *)mp->b_rptr; 21940 mp->b_rptr += sizeof (ipha_t); 21941 21942 /* Calculate how many packets we will send out */ 21943 i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); 21944 pkts = (i1 + len - 1) / len; 21945 ASSERT(pkts > 1); 21946 21947 /* Allocate a message block which will hold all the IP Headers. */ 21948 wroff = ip_wroff_extra; 21949 hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; 21950 21951 i1 = pkts * hdr_chunk_len; 21952 /* 21953 * Create the header buffer, Multidata and destination address 21954 * and SAP attribute that should be associated with it. 
21955 */ 21956 if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || 21957 ((hdr_mp->b_wptr += i1), 21958 (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || 21959 !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) { 21960 freemsg(mp); 21961 if (md_mp == NULL) { 21962 freemsg(hdr_mp); 21963 } else { 21964 free_mmd: IP_STAT(ip_frag_mdt_discarded); 21965 freemsg(md_mp); 21966 } 21967 IP_STAT(ip_frag_mdt_allocfail); 21968 UPDATE_MIB(&ip_mib, ipOutDiscards, pkts); 21969 return; 21970 } 21971 IP_STAT(ip_frag_mdt_allocd); 21972 21973 /* 21974 * Add a payload buffer to the Multidata; this operation must not 21975 * fail, or otherwise our logic in this routine is broken. There 21976 * is no memory allocation done by the routine, so any returned 21977 * failure simply tells us that we've done something wrong. 21978 * 21979 * A failure tells us that either we're adding the same payload 21980 * buffer more than once, or we're trying to add more buffers than 21981 * allowed. None of the above cases should happen, and we panic 21982 * because either there's horrible heap corruption, and/or 21983 * programming mistake. 21984 */ 21985 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 21986 goto pbuf_panic; 21987 21988 hdr_ptr = hdr_mp->b_rptr; 21989 pld_ptr = mp->b_rptr; 21990 21991 /* Establish the ending byte offset, based on the starting offset. */ 21992 offset <<= 3; 21993 ip_data_end = offset + ntohs(ipha_orig->ipha_length) - 21994 IP_SIMPLE_HDR_LENGTH; 21995 21996 pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; 21997 21998 while (pld_ptr < mp->b_wptr) { 21999 ipha_t *ipha; 22000 uint16_t offset_and_flags; 22001 uint16_t ip_len; 22002 int error; 22003 22004 ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); 22005 ipha = (ipha_t *)(hdr_ptr + wroff); 22006 ASSERT(OK_32PTR(ipha)); 22007 *ipha = *ipha_orig; 22008 22009 if (ip_data_end - offset > len) { 22010 offset_and_flags = IPH_MF; 22011 } else { 22012 /* 22013 * Last frag. Set len to the length of this last piece. 
22014 */ 22015 len = ip_data_end - offset; 22016 /* A frag of a frag might have IPH_MF non-zero */ 22017 offset_and_flags = 22018 ntohs(ipha->ipha_fragment_offset_and_flags) & 22019 IPH_MF; 22020 } 22021 offset_and_flags |= (uint16_t)(offset >> 3); 22022 offset_and_flags |= (uint16_t)frag_flag; 22023 /* Store the offset and flags in the IP header. */ 22024 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22025 22026 /* Store the length in the IP header. */ 22027 ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); 22028 ipha->ipha_length = htons(ip_len); 22029 22030 /* 22031 * Set the IP header checksum. Note that mp is just 22032 * the header, so this is easy to pass to ip_csum. 22033 */ 22034 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22035 22036 /* 22037 * Record offset and size of header and data of the next packet 22038 * in the multidata message. 22039 */ 22040 PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); 22041 PDESC_PLD_INIT(&pdi); 22042 i1 = MIN(mp->b_wptr - pld_ptr, len); 22043 ASSERT(i1 > 0); 22044 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); 22045 if (i1 == len) { 22046 pld_ptr += len; 22047 } else { 22048 i1 = len - i1; 22049 mp = mp->b_cont; 22050 ASSERT(mp != NULL); 22051 ASSERT(MBLKL(mp) >= i1); 22052 /* 22053 * Attach the next payload message block to the 22054 * multidata message. 22055 */ 22056 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22057 goto pbuf_panic; 22058 PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); 22059 pld_ptr = mp->b_rptr + i1; 22060 } 22061 22062 if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, 22063 KM_NOSLEEP)) == NULL) { 22064 /* 22065 * Any failure other than ENOMEM indicates that we 22066 * have passed in invalid pdesc info or parameters 22067 * to mmd_addpdesc, which must not happen. 22068 * 22069 * EINVAL is a result of failure on boundary checks 22070 * against the pdesc info contents. 
It should not 22071 * happen, and we panic because either there's 22072 * horrible heap corruption, and/or programming 22073 * mistake. 22074 */ 22075 if (error != ENOMEM) { 22076 cmn_err(CE_PANIC, "ip_wput_frag_mdt: " 22077 "pdesc logic error detected for " 22078 "mmd %p pinfo %p (%d)\n", 22079 (void *)mmd, (void *)&pdi, error); 22080 /* NOTREACHED */ 22081 } 22082 IP_STAT(ip_frag_mdt_addpdescfail); 22083 /* Free unattached payload message blocks as well */ 22084 md_mp->b_cont = mp->b_cont; 22085 goto free_mmd; 22086 } 22087 22088 /* Advance fragment offset. */ 22089 offset += len; 22090 22091 /* Advance to location for next header in the buffer. */ 22092 hdr_ptr += hdr_chunk_len; 22093 22094 /* Did we reach the next payload message block? */ 22095 if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { 22096 mp = mp->b_cont; 22097 /* 22098 * Attach the next message block with payload 22099 * data to the multidata message. 22100 */ 22101 if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) 22102 goto pbuf_panic; 22103 pld_ptr = mp->b_rptr; 22104 } 22105 } 22106 22107 ASSERT(hdr_mp->b_wptr == hdr_ptr); 22108 ASSERT(mp->b_wptr == pld_ptr); 22109 22110 /* Update IP statistics */ 22111 UPDATE_MIB(&ip_mib, ipFragCreates, pkts); 22112 BUMP_MIB(&ip_mib, ipFragOKs); 22113 IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts); 22114 22115 if (pkt_type == OB_PKT) { 22116 ire->ire_ob_pkt_count += pkts; 22117 if (ire->ire_ipif != NULL) 22118 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); 22119 } else { 22120 /* 22121 * The type is IB_PKT in the forwarding path and in 22122 * the mobile IP case when the packet is being reverse- 22123 * tunneled to the home agent. 
22124 */ 22125 ire->ire_ib_pkt_count += pkts; 22126 ASSERT(!IRE_IS_LOCAL(ire)); 22127 if (ire->ire_type & IRE_BROADCAST) 22128 atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); 22129 else 22130 atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); 22131 } 22132 ire->ire_last_used_time = lbolt; 22133 /* Send it down */ 22134 putnext(ire->ire_stq, md_mp); 22135 return; 22136 22137 pbuf_panic: 22138 cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " 22139 "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, 22140 pbuf_idx); 22141 /* NOTREACHED */ 22142 } 22143 22144 /* 22145 * Outbound IP fragmentation routine. 22146 * 22147 * NOTE : This routine does not ire_refrele the ire that is passed in 22148 * as the argument. 22149 */ 22150 static void 22151 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, 22152 uint32_t frag_flag) 22153 { 22154 int i1; 22155 mblk_t *ll_hdr_mp; 22156 int ll_hdr_len; 22157 int hdr_len; 22158 mblk_t *hdr_mp; 22159 ipha_t *ipha; 22160 int ip_data_end; 22161 int len; 22162 mblk_t *mp = mp_orig; 22163 int offset; 22164 queue_t *q; 22165 uint32_t v_hlen_tos_len; 22166 mblk_t *first_mp; 22167 boolean_t mctl_present; 22168 ill_t *ill; 22169 mblk_t *xmit_mp; 22170 mblk_t *carve_mp; 22171 ire_t *ire1 = NULL; 22172 ire_t *save_ire = NULL; 22173 mblk_t *next_mp = NULL; 22174 boolean_t last_frag = B_FALSE; 22175 boolean_t multirt_send = B_FALSE; 22176 ire_t *first_ire = NULL; 22177 irb_t *irb = NULL; 22178 22179 TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, 22180 "ip_wput_frag_start:"); 22181 22182 if (mp->b_datap->db_type == M_CTL) { 22183 first_mp = mp; 22184 mp_orig = mp = mp->b_cont; 22185 mctl_present = B_TRUE; 22186 } else { 22187 first_mp = mp; 22188 mctl_present = B_FALSE; 22189 } 22190 22191 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 22192 ipha = (ipha_t *)mp->b_rptr; 22193 22194 /* 22195 * If the Don't Fragment flag is on, generate an ICMP destination 22196 * unreachable, fragmentation needed. 
22197 */ 22198 offset = ntohs(ipha->ipha_fragment_offset_and_flags); 22199 if (offset & IPH_DF) { 22200 BUMP_MIB(&ip_mib, ipFragFails); 22201 /* 22202 * Need to compute hdr checksum if called from ip_wput_ire. 22203 * Note that ip_rput_forward verifies the checksum before 22204 * calling this routine so in that case this is a noop. 22205 */ 22206 ipha->ipha_hdr_checksum = 0; 22207 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22208 icmp_frag_needed(ire->ire_stq, first_mp, max_frag); 22209 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22210 "ip_wput_frag_end:(%S)", 22211 "don't fragment"); 22212 return; 22213 } 22214 if (mctl_present) 22215 freeb(first_mp); 22216 /* 22217 * Establish the starting offset. May not be zero if we are fragging 22218 * a fragment that is being forwarded. 22219 */ 22220 offset = offset & IPH_OFFSET; 22221 22222 /* TODO why is this test needed? */ 22223 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 22224 if (((max_frag - LENGTH) & ~7) < 8) { 22225 /* TODO: notify ulp somehow */ 22226 BUMP_MIB(&ip_mib, ipFragFails); 22227 freemsg(mp); 22228 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22229 "ip_wput_frag_end:(%S)", 22230 "len < 8"); 22231 return; 22232 } 22233 22234 hdr_len = (V_HLEN & 0xF) << 2; 22235 22236 ipha->ipha_hdr_checksum = 0; 22237 22238 /* 22239 * Establish the number of bytes maximum per frag, after putting 22240 * in the header. 22241 */ 22242 len = (max_frag - hdr_len) & ~7; 22243 22244 /* Check if we can use MDT to send out the frags. 
*/ 22245 ASSERT(!IRE_IS_LOCAL(ire)); 22246 if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound && 22247 !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) && 22248 (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) && 22249 IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { 22250 ASSERT(ill->ill_mdt_capab != NULL); 22251 if (!ill->ill_mdt_capab->ill_mdt_on) { 22252 /* 22253 * If MDT has been previously turned off in the past, 22254 * and we currently can do MDT (due to IPQoS policy 22255 * removal, etc.) then enable it for this interface. 22256 */ 22257 ill->ill_mdt_capab->ill_mdt_on = 1; 22258 ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", 22259 ill->ill_name)); 22260 } 22261 ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, 22262 offset); 22263 return; 22264 } 22265 22266 /* Get a copy of the header for the trailing frags */ 22267 hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset); 22268 if (!hdr_mp) { 22269 BUMP_MIB(&ip_mib, ipOutDiscards); 22270 freemsg(mp); 22271 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22272 "ip_wput_frag_end:(%S)", 22273 "couldn't copy hdr"); 22274 return; 22275 } 22276 if (DB_CRED(mp) != NULL) 22277 mblk_setcred(hdr_mp, DB_CRED(mp)); 22278 22279 /* Store the starting offset, with the MoreFrags flag. */ 22280 i1 = offset | IPH_MF | frag_flag; 22281 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1); 22282 22283 /* Establish the ending byte offset, based on the starting offset. */ 22284 offset <<= 3; 22285 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; 22286 22287 /* Store the length of the first fragment in the IP header. */ 22288 i1 = len + hdr_len; 22289 ASSERT(i1 <= IP_MAXPACKET); 22290 ipha->ipha_length = htons((uint16_t)i1); 22291 22292 /* 22293 * Compute the IP header checksum for the first frag. We have to 22294 * watch out that we stop at the end of the header. 
22295 */ 22296 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22297 22298 /* 22299 * Now carve off the first frag. Note that this will include the 22300 * original IP header. 22301 */ 22302 if (!(mp = ip_carve_mp(&mp_orig, i1))) { 22303 BUMP_MIB(&ip_mib, ipOutDiscards); 22304 freeb(hdr_mp); 22305 freemsg(mp_orig); 22306 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22307 "ip_wput_frag_end:(%S)", 22308 "couldn't carve first"); 22309 return; 22310 } 22311 22312 /* 22313 * Multirouting case. Each fragment is replicated 22314 * via all non-condemned RTF_MULTIRT routes 22315 * currently resolved. 22316 * We ensure that first_ire is the first RTF_MULTIRT 22317 * ire in the bucket. 22318 */ 22319 if (ire->ire_flags & RTF_MULTIRT) { 22320 irb = ire->ire_bucket; 22321 ASSERT(irb != NULL); 22322 22323 multirt_send = B_TRUE; 22324 22325 /* Make sure we do not omit any multiroute ire. */ 22326 IRB_REFHOLD(irb); 22327 for (first_ire = irb->irb_ire; 22328 first_ire != NULL; 22329 first_ire = first_ire->ire_next) { 22330 if ((first_ire->ire_flags & RTF_MULTIRT) && 22331 (first_ire->ire_addr == ire->ire_addr) && 22332 !(first_ire->ire_marks & 22333 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 22334 break; 22335 } 22336 22337 if (first_ire != NULL) { 22338 if (first_ire != ire) { 22339 IRE_REFHOLD(first_ire); 22340 /* 22341 * Do not release the ire passed in 22342 * as the argument. 22343 */ 22344 ire = first_ire; 22345 } else { 22346 first_ire = NULL; 22347 } 22348 } 22349 IRB_REFRELE(irb); 22350 22351 /* 22352 * Save the first ire; we will need to restore it 22353 * for the trailing frags. 22354 * We REFHOLD save_ire, as each iterated ire will be 22355 * REFRELEd. 22356 */ 22357 save_ire = ire; 22358 IRE_REFHOLD(save_ire); 22359 } 22360 22361 /* 22362 * First fragment emission loop. 22363 * In most cases, the emission loop below is entered only 22364 * once. 
Only in the case where the ire holds the RTF_MULTIRT 22365 * flag, do we loop to process all RTF_MULTIRT ires in the 22366 * bucket, and send the fragment through all crossed 22367 * RTF_MULTIRT routes. 22368 */ 22369 do { 22370 if (ire->ire_flags & RTF_MULTIRT) { 22371 /* 22372 * We are in a multiple send case, need to get 22373 * the next ire and make a copy of the packet. 22374 * ire1 holds here the next ire to process in the 22375 * bucket. If multirouting is expected, 22376 * any non-RTF_MULTIRT ire that has the 22377 * right destination address is ignored. 22378 * 22379 * We have to take into account the MTU of 22380 * each walked ire. max_frag is set by the 22381 * the caller and generally refers to 22382 * the primary ire entry. Here we ensure that 22383 * no route with a lower MTU will be used, as 22384 * fragments are carved once for all ires, 22385 * then replicated. 22386 */ 22387 ASSERT(irb != NULL); 22388 IRB_REFHOLD(irb); 22389 for (ire1 = ire->ire_next; 22390 ire1 != NULL; 22391 ire1 = ire1->ire_next) { 22392 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 22393 continue; 22394 if (ire1->ire_addr != ire->ire_addr) 22395 continue; 22396 if (ire1->ire_marks & 22397 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 22398 continue; 22399 /* 22400 * Ensure we do not exceed the MTU 22401 * of the next route. 22402 */ 22403 if (ire1->ire_max_frag < max_frag) { 22404 ip_multirt_bad_mtu(ire1, max_frag); 22405 continue; 22406 } 22407 22408 /* Got one. */ 22409 IRE_REFHOLD(ire1); 22410 break; 22411 } 22412 IRB_REFRELE(irb); 22413 22414 if (ire1 != NULL) { 22415 next_mp = copyb(mp); 22416 if ((next_mp == NULL) || 22417 ((mp->b_cont != NULL) && 22418 ((next_mp->b_cont = 22419 dupmsg(mp->b_cont)) == NULL))) { 22420 freemsg(next_mp); 22421 next_mp = NULL; 22422 ire_refrele(ire1); 22423 ire1 = NULL; 22424 } 22425 } 22426 22427 /* Last multiroute ire; don't loop anymore. 
*/ 22428 if (ire1 == NULL) { 22429 multirt_send = B_FALSE; 22430 } 22431 } 22432 22433 ll_hdr_len = 0; 22434 LOCK_IRE_FP_MP(ire); 22435 ll_hdr_mp = ire->ire_fp_mp; 22436 if (ll_hdr_mp != NULL) { 22437 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22438 ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; 22439 } else { 22440 ll_hdr_mp = ire->ire_dlureq_mp; 22441 } 22442 22443 /* If there is a transmit header, get a copy for this frag. */ 22444 /* 22445 * TODO: should check db_ref before calling ip_carve_mp since 22446 * it might give us a dup. 22447 */ 22448 if (!ll_hdr_mp) { 22449 /* No xmit header. */ 22450 xmit_mp = mp; 22451 } else if (mp->b_datap->db_ref == 1 && 22452 ll_hdr_len != 0 && 22453 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22454 /* M_DATA fastpath */ 22455 mp->b_rptr -= ll_hdr_len; 22456 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); 22457 xmit_mp = mp; 22458 } else if (!(xmit_mp = copyb(ll_hdr_mp))) { 22459 UNLOCK_IRE_FP_MP(ire); 22460 BUMP_MIB(&ip_mib, ipOutDiscards); 22461 freeb(hdr_mp); 22462 freemsg(mp); 22463 freemsg(mp_orig); 22464 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22465 "ip_wput_frag_end:(%S)", 22466 "discard"); 22467 22468 if (multirt_send) { 22469 ASSERT(ire1); 22470 ASSERT(next_mp); 22471 22472 freemsg(next_mp); 22473 ire_refrele(ire1); 22474 } 22475 if (save_ire != NULL) 22476 IRE_REFRELE(save_ire); 22477 22478 if (first_ire != NULL) 22479 ire_refrele(first_ire); 22480 return; 22481 } else { 22482 xmit_mp->b_cont = mp; 22483 if (DB_CRED(mp) != NULL) 22484 mblk_setcred(xmit_mp, DB_CRED(mp)); 22485 /* Get priority marking, if any. */ 22486 if (DB_TYPE(xmit_mp) == M_DATA) 22487 xmit_mp->b_band = mp->b_band; 22488 } 22489 UNLOCK_IRE_FP_MP(ire); 22490 q = ire->ire_stq; 22491 BUMP_MIB(&ip_mib, ipFragCreates); 22492 putnext(q, xmit_mp); 22493 if (pkt_type != OB_PKT) { 22494 /* 22495 * Update the packet count of trailing 22496 * RTF_MULTIRT ires. 
22497 */ 22498 UPDATE_OB_PKT_COUNT(ire); 22499 } 22500 22501 if (multirt_send) { 22502 /* 22503 * We are in a multiple send case; look for 22504 * the next ire and re-enter the loop. 22505 */ 22506 ASSERT(ire1); 22507 ASSERT(next_mp); 22508 /* REFRELE the current ire before looping */ 22509 ire_refrele(ire); 22510 ire = ire1; 22511 ire1 = NULL; 22512 mp = next_mp; 22513 next_mp = NULL; 22514 } 22515 } while (multirt_send); 22516 22517 ASSERT(ire1 == NULL); 22518 22519 /* Restore the original ire; we need it for the trailing frags */ 22520 if (save_ire != NULL) { 22521 /* REFRELE the last iterated ire */ 22522 ire_refrele(ire); 22523 /* save_ire has been REFHOLDed */ 22524 ire = save_ire; 22525 save_ire = NULL; 22526 q = ire->ire_stq; 22527 } 22528 22529 if (pkt_type == OB_PKT) { 22530 UPDATE_OB_PKT_COUNT(ire); 22531 } else { 22532 UPDATE_IB_PKT_COUNT(ire); 22533 } 22534 22535 /* Advance the offset to the second frag starting point. */ 22536 offset += len; 22537 /* 22538 * Update hdr_len from the copied header - there might be less options 22539 * in the later fragments. 22540 */ 22541 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr); 22542 /* Loop until done. */ 22543 for (;;) { 22544 uint16_t offset_and_flags; 22545 uint16_t ip_len; 22546 22547 if (ip_data_end - offset > len) { 22548 /* 22549 * Carve off the appropriate amount from the original 22550 * datagram. 22551 */ 22552 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22553 mp = NULL; 22554 break; 22555 } 22556 /* 22557 * More frags after this one. Get another copy 22558 * of the header. 
22559 */ 22560 if (carve_mp->b_datap->db_ref == 1 && 22561 hdr_mp->b_wptr - hdr_mp->b_rptr < 22562 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22563 /* Inline IP header */ 22564 carve_mp->b_rptr -= hdr_mp->b_wptr - 22565 hdr_mp->b_rptr; 22566 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22567 hdr_mp->b_wptr - hdr_mp->b_rptr); 22568 mp = carve_mp; 22569 } else { 22570 if (!(mp = copyb(hdr_mp))) { 22571 freemsg(carve_mp); 22572 break; 22573 } 22574 /* Get priority marking, if any. */ 22575 mp->b_band = carve_mp->b_band; 22576 mp->b_cont = carve_mp; 22577 } 22578 ipha = (ipha_t *)mp->b_rptr; 22579 offset_and_flags = IPH_MF; 22580 } else { 22581 /* 22582 * Last frag. Consume the header. Set len to 22583 * the length of this last piece. 22584 */ 22585 len = ip_data_end - offset; 22586 22587 /* 22588 * Carve off the appropriate amount from the original 22589 * datagram. 22590 */ 22591 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) { 22592 mp = NULL; 22593 break; 22594 } 22595 if (carve_mp->b_datap->db_ref == 1 && 22596 hdr_mp->b_wptr - hdr_mp->b_rptr < 22597 carve_mp->b_rptr - carve_mp->b_datap->db_base) { 22598 /* Inline IP header */ 22599 carve_mp->b_rptr -= hdr_mp->b_wptr - 22600 hdr_mp->b_rptr; 22601 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr, 22602 hdr_mp->b_wptr - hdr_mp->b_rptr); 22603 mp = carve_mp; 22604 freeb(hdr_mp); 22605 hdr_mp = mp; 22606 } else { 22607 mp = hdr_mp; 22608 /* Get priority marking, if any. */ 22609 mp->b_band = carve_mp->b_band; 22610 mp->b_cont = carve_mp; 22611 } 22612 ipha = (ipha_t *)mp->b_rptr; 22613 /* A frag of a frag might have IPH_MF non-zero */ 22614 offset_and_flags = 22615 ntohs(ipha->ipha_fragment_offset_and_flags) & 22616 IPH_MF; 22617 } 22618 offset_and_flags |= (uint16_t)(offset >> 3); 22619 offset_and_flags |= (uint16_t)frag_flag; 22620 /* Store the offset and flags in the IP header. */ 22621 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); 22622 22623 /* Store the length in the IP header. 
*/ 22624 ip_len = (uint16_t)(len + hdr_len); 22625 ipha->ipha_length = htons(ip_len); 22626 22627 /* 22628 * Set the IP header checksum. Note that mp is just 22629 * the header, so this is easy to pass to ip_csum. 22630 */ 22631 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 22632 22633 /* Attach a transmit header, if any, and ship it. */ 22634 if (pkt_type == OB_PKT) { 22635 UPDATE_OB_PKT_COUNT(ire); 22636 } else { 22637 UPDATE_IB_PKT_COUNT(ire); 22638 } 22639 22640 if (ire->ire_flags & RTF_MULTIRT) { 22641 irb = ire->ire_bucket; 22642 ASSERT(irb != NULL); 22643 22644 multirt_send = B_TRUE; 22645 22646 /* 22647 * Save the original ire; we will need to restore it 22648 * for the tailing frags. 22649 */ 22650 save_ire = ire; 22651 IRE_REFHOLD(save_ire); 22652 } 22653 /* 22654 * Emission loop for this fragment, similar 22655 * to what is done for the first fragment. 22656 */ 22657 do { 22658 if (multirt_send) { 22659 /* 22660 * We are in a multiple send case, need to get 22661 * the next ire and make a copy of the packet. 22662 */ 22663 ASSERT(irb != NULL); 22664 IRB_REFHOLD(irb); 22665 for (ire1 = ire->ire_next; 22666 ire1 != NULL; 22667 ire1 = ire1->ire_next) { 22668 if (!(ire1->ire_flags & RTF_MULTIRT)) 22669 continue; 22670 if (ire1->ire_addr != ire->ire_addr) 22671 continue; 22672 if (ire1->ire_marks & 22673 (IRE_MARK_CONDEMNED| 22674 IRE_MARK_HIDDEN)) 22675 continue; 22676 /* 22677 * Ensure we do not exceed the MTU 22678 * of the next route. 22679 */ 22680 if (ire1->ire_max_frag < max_frag) { 22681 ip_multirt_bad_mtu(ire1, 22682 max_frag); 22683 continue; 22684 } 22685 22686 /* Got one. 
*/ 22687 IRE_REFHOLD(ire1); 22688 break; 22689 } 22690 IRB_REFRELE(irb); 22691 22692 if (ire1 != NULL) { 22693 next_mp = copyb(mp); 22694 if ((next_mp == NULL) || 22695 ((mp->b_cont != NULL) && 22696 ((next_mp->b_cont = 22697 dupmsg(mp->b_cont)) == NULL))) { 22698 freemsg(next_mp); 22699 next_mp = NULL; 22700 ire_refrele(ire1); 22701 ire1 = NULL; 22702 } 22703 } 22704 22705 /* Last multiroute ire; don't loop anymore. */ 22706 if (ire1 == NULL) { 22707 multirt_send = B_FALSE; 22708 } 22709 } 22710 22711 /* Update transmit header */ 22712 ll_hdr_len = 0; 22713 LOCK_IRE_FP_MP(ire); 22714 ll_hdr_mp = ire->ire_fp_mp; 22715 if (ll_hdr_mp != NULL) { 22716 ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); 22717 ll_hdr_len = MBLKL(ll_hdr_mp); 22718 } else { 22719 ll_hdr_mp = ire->ire_dlureq_mp; 22720 } 22721 22722 if (!ll_hdr_mp) { 22723 xmit_mp = mp; 22724 } else if (mp->b_datap->db_ref == 1 && 22725 ll_hdr_len != 0 && 22726 ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { 22727 /* M_DATA fastpath */ 22728 mp->b_rptr -= ll_hdr_len; 22729 bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, 22730 ll_hdr_len); 22731 xmit_mp = mp; 22732 } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { 22733 xmit_mp->b_cont = mp; 22734 if (DB_CRED(mp) != NULL) 22735 mblk_setcred(xmit_mp, DB_CRED(mp)); 22736 /* Get priority marking, if any. */ 22737 if (DB_TYPE(xmit_mp) == M_DATA) 22738 xmit_mp->b_band = mp->b_band; 22739 } else { 22740 /* 22741 * Exit both the replication and 22742 * fragmentation loops. 22743 */ 22744 UNLOCK_IRE_FP_MP(ire); 22745 goto drop_pkt; 22746 } 22747 UNLOCK_IRE_FP_MP(ire); 22748 BUMP_MIB(&ip_mib, ipFragCreates); 22749 putnext(q, xmit_mp); 22750 22751 if (pkt_type != OB_PKT) { 22752 /* 22753 * Update the packet count of trailing 22754 * RTF_MULTIRT ires. 22755 */ 22756 UPDATE_OB_PKT_COUNT(ire); 22757 } 22758 22759 /* All done if we just consumed the hdr_mp. 
*/ 22760 if (mp == hdr_mp) { 22761 last_frag = B_TRUE; 22762 } 22763 22764 if (multirt_send) { 22765 /* 22766 * We are in a multiple send case; look for 22767 * the next ire and re-enter the loop. 22768 */ 22769 ASSERT(ire1); 22770 ASSERT(next_mp); 22771 /* REFRELE the current ire before looping */ 22772 ire_refrele(ire); 22773 ire = ire1; 22774 ire1 = NULL; 22775 q = ire->ire_stq; 22776 mp = next_mp; 22777 next_mp = NULL; 22778 } 22779 } while (multirt_send); 22780 /* 22781 * Restore the original ire; we need it for the 22782 * trailing frags 22783 */ 22784 if (save_ire != NULL) { 22785 ASSERT(ire1 == NULL); 22786 /* REFRELE the last iterated ire */ 22787 ire_refrele(ire); 22788 /* save_ire has been REFHOLDed */ 22789 ire = save_ire; 22790 q = ire->ire_stq; 22791 save_ire = NULL; 22792 } 22793 22794 if (last_frag) { 22795 BUMP_MIB(&ip_mib, ipFragOKs); 22796 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22797 "ip_wput_frag_end:(%S)", 22798 "consumed hdr_mp"); 22799 22800 if (first_ire != NULL) 22801 ire_refrele(first_ire); 22802 return; 22803 } 22804 /* Otherwise, advance and loop. */ 22805 offset += len; 22806 } 22807 22808 drop_pkt: 22809 /* Clean up following allocation failure. 
*/ 22810 BUMP_MIB(&ip_mib, ipOutDiscards); 22811 freemsg(mp); 22812 if (mp != hdr_mp) 22813 freeb(hdr_mp); 22814 if (mp != mp_orig) 22815 freemsg(mp_orig); 22816 22817 if (save_ire != NULL) 22818 IRE_REFRELE(save_ire); 22819 if (first_ire != NULL) 22820 ire_refrele(first_ire); 22821 22822 TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, 22823 "ip_wput_frag_end:(%S)", 22824 "end--alloc failure"); 22825 } 22826 22827 /* 22828 * Copy the header plus those options which have the copy bit set 22829 */ 22830 static mblk_t * 22831 ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset) 22832 { 22833 mblk_t *mp; 22834 uchar_t *up; 22835 22836 /* 22837 * Quick check if we need to look for options without the copy bit 22838 * set 22839 */ 22840 mp = allocb(ip_wroff_extra + hdr_len, BPRI_HI); 22841 if (!mp) 22842 return (mp); 22843 mp->b_rptr += ip_wroff_extra; 22844 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) { 22845 bcopy(rptr, mp->b_rptr, hdr_len); 22846 mp->b_wptr += hdr_len + ip_wroff_extra; 22847 return (mp); 22848 } 22849 up = mp->b_rptr; 22850 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH); 22851 up += IP_SIMPLE_HDR_LENGTH; 22852 rptr += IP_SIMPLE_HDR_LENGTH; 22853 hdr_len -= IP_SIMPLE_HDR_LENGTH; 22854 while (hdr_len > 0) { 22855 uint32_t optval; 22856 uint32_t optlen; 22857 22858 optval = *rptr; 22859 if (optval == IPOPT_EOL) 22860 break; 22861 if (optval == IPOPT_NOP) 22862 optlen = 1; 22863 else 22864 optlen = rptr[1]; 22865 if (optval & IPOPT_COPY) { 22866 bcopy(rptr, up, optlen); 22867 up += optlen; 22868 } 22869 rptr += optlen; 22870 hdr_len -= optlen; 22871 } 22872 /* 22873 * Make sure that we drop an even number of words by filling 22874 * with EOL to the next word boundary. 
/*
 * Delivery to local recipients including fanout to multiple recipients.
 * Does not do checksumming of UDP/TCP.
 * Note: q should be the read side queue for either the ill or conn.
 * Note: rq should be the read side q for the lower (ill) stream.
 * NOTE(review): there is no rq parameter in this signature; the note
 * above looks stale -- confirm.
 * We don't send packets to IPPF processing, thus the last argument
 * to all the fanout calls are B_FALSE.
 *
 * mp may be led by an M_CTL (ipsec_out_t).  ire may be NULL, in which
 * case the destination is asserted to be multicast.  zoneid is the zone
 * that receives the final (non-copied) delivery in the ICMP case and is
 * passed through to the other fanout routines.
 */
void
ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
    int fanout_flags, zoneid_t zoneid)
{
	uint32_t	protocol;
	mblk_t		*first_mp;
	boolean_t	mctl_present;
	int		ire_type;
/* rptr always tracks the current value of ipha, including reassignments */
#define	rptr	((uchar_t *)ipha)

	TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START,
	    "ip_wput_local_start: q %p", q);

	if (ire != NULL) {
		ire_type = ire->ire_type;
	} else {
		/*
		 * Only ip_multicast_loopback() calls us with a NULL ire. If the
		 * packet is not multicast, we can't tell the ire type.
		 */
		ASSERT(CLASSD(ipha->ipha_dst));
		ire_type = IRE_BROADCAST;
	}

	/* Strip or convert a leading M_CTL; mp ends up at the data mblk. */
	first_mp = mp;
	if (first_mp->b_datap->db_type == M_CTL) {
		ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr;
		if (!io->ipsec_out_secure) {
			/*
			 * This ipsec_out_t was allocated in ip_wput
			 * for multicast packets to store the ill_index.
			 * As this is being delivered locally, we don't
			 * need this anymore.
			 */
			mp = first_mp->b_cont;
			freeb(first_mp);
			first_mp = mp;
			mctl_present = B_FALSE;
		} else {
			mctl_present = B_TRUE;
			mp = first_mp->b_cont;
			ASSERT(mp != NULL);
			ipsec_out_to_in(first_mp);
		}
	} else {
		mctl_present = B_FALSE;
	}

	loopback_packets++;

	ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n",
	    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid));
	if (!IS_SIMPLE_IPH(ipha)) {
		ip_wput_local_options(ipha);
	}

	protocol = ipha->ipha_protocol;
	switch (protocol) {
	case IPPROTO_ICMP: {
		ire_t		*ire_zone;
		ilm_t		*ilm;
		mblk_t		*mp1;
		zoneid_t	last_zoneid;

		if (CLASSD(ipha->ipha_dst) &&
		    !(ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) {
			ASSERT(ire_type == IRE_BROADCAST);
			/*
			 * In the multicast case, applications may have joined
			 * the group from different zones, so we need to deliver
			 * the packet to each of them. Loop through the
			 * multicast memberships structures (ilm) on the receive
			 * ill and send a copy of the packet up each matching
			 * one. However, we don't do this for multicasts sent on
			 * the loopback interface (PHYI_LOOPBACK flag set) as
			 * they must stay in the sender's zone.
			 *
			 * ilm_add_v6() ensures that ilms in the same zone are
			 * contiguous in the ill_ilm list. We use this property
			 * to avoid sending duplicates needed when two
			 * applications in the same zone join the same group on
			 * different logical interfaces: we ignore the ilm if
			 * its zoneid is the same as the last matching one.
			 * In addition, the sending of the packet for
			 * ire_zoneid is delayed until all of the other ilms
			 * have been exhausted.
			 */
			last_zoneid = -1;
			ILM_WALKER_HOLD(ill);
			for (ilm = ill->ill_ilm; ilm != NULL;
			    ilm = ilm->ilm_next) {
				if ((ilm->ilm_flags & ILM_DELETED) ||
				    ipha->ipha_dst != ilm->ilm_addr ||
				    ilm->ilm_zoneid == last_zoneid ||
				    ilm->ilm_zoneid == zoneid ||
				    !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
					continue;
				/* A failed copy just skips this zone. */
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;
				icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
				    mctl_present, B_FALSE, ill,
				    ilm->ilm_zoneid);
				last_zoneid = ilm->ilm_zoneid;
			}
			ILM_WALKER_RELE(ill);
			/*
			 * Loopback case: the sending endpoint has
			 * IP_MULTICAST_LOOP disabled, therefore we don't
			 * dispatch the multicast packet to the sending zone.
			 */
			if (fanout_flags & IP_FF_NO_MCAST_LOOP) {
				freemsg(first_mp);
				return;
			}
		} else if (ire_type == IRE_BROADCAST) {
			/*
			 * In the broadcast case, there may be many zones
			 * which need a copy of the packet delivered to them.
			 * There is one IRE_BROADCAST per broadcast address
			 * and per zone; we walk those using a helper function.
			 * In addition, the sending of the packet for zoneid is
			 * delayed until all of the other ires have been
			 * processed.
			 *
			 * NOTE(review): ire is dereferenced here; this relies
			 * on ire being non-NULL whenever this branch is
			 * reached -- confirm against the NULL-ire caller.
			 */
			IRB_REFHOLD(ire->ire_bucket);
			ire_zone = NULL;
			while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
			    ire)) != NULL) {
				mp1 = ip_copymsg(first_mp);
				if (mp1 == NULL)
					continue;

				UPDATE_IB_PKT_COUNT(ire_zone);
				ire_zone->ire_last_used_time = lbolt;
				icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
				    mctl_present, B_FALSE, ill,
				    ire_zone->ire_zoneid);
			}
			IRB_REFRELE(ire->ire_bucket);
		}
		/* Final delivery of the original message, to zoneid itself. */
		icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0,
		    0, mctl_present, B_FALSE, ill, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)",
		    q, "icmp");
		return;
	}
	case IPPROTO_IGMP:
		if (igmp_input(q, mp, ill)) {
			/* Bad packet - discarded by igmp_input */
			TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
			    "ip_wput_local_end: q %p (%S)",
			    q, "igmp_input--bad packet");
			/* Only the leading M_CTL is left for us to free. */
			if (mctl_present)
				freeb(first_mp);
			return;
		}
		/*
		 * igmp_input() may have pulled up the message so ipha needs to
		 * be reinitialized.
		 */
		ipha = (ipha_t *)mp->b_rptr;
		/* deliver to local raw users */
		break;
	case IPPROTO_ENCAP:
		/*
		 * This case is covered by either ip_fanout_proto, or by
		 * the above security processing for self-tunneled packets.
		 */
		break;
	case IPPROTO_UDP: {
		uint16_t	*up;
		uint32_t	ports;

		up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) +
		    UDP_PORTS_OFFSET);
		/* Force a 'valid' checksum. */
		up[3] = 0;

		/* Both 16-bit ports are read as a single 32-bit word. */
		ports = *(uint32_t *)up;
		ip_fanout_udp(q, first_mp, ill, ipha, ports,
		    (ire_type == IRE_BROADCAST),
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_SEND_SLLA | IP_FF_IP6INFO, mctl_present, B_FALSE,
		    ill, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp");
		return;
	}
	case IPPROTO_TCP: {

		/*
		 * For TCP, discard broadcast packets.
		 */
		if ((ushort_t)ire_type == IRE_BROADCAST) {
			freemsg(first_mp);
			BUMP_MIB(&ip_mib, ipInDiscards);
			ip2dbg(("ip_wput_local: discard broadcast\n"));
			return;
		}

		if (mp->b_datap->db_type == M_DATA) {
			/*
			 * M_DATA mblk, so init mblk (chain) for no struio().
			 */
			mblk_t	*mp1 = mp;

			do
				mp1->b_datap->db_struioflag = 0;
			while ((mp1 = mp1->b_cont) != NULL);
		}
		/* The TCP ports must lie within the first mblk. */
		ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4)
		    <= mp->b_wptr);
		ip_fanout_tcp(q, first_mp, ill, ipha,
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_SYN_ADDIRE | IP_FF_IP6INFO,
		    mctl_present, B_FALSE, zoneid);
		TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
		    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp");
		return;
	}
	case IPPROTO_SCTP:
	{
		uint32_t	ports;

		/* Copy out the 4 bytes that follow the IP header. */
		bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports));
		ip_fanout_sctp(first_mp, ill, ipha, ports,
		    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
		    IP_FF_IP6INFO,
		    mctl_present, B_FALSE, 0, zoneid);
		return;
	}

	default:
		break;
	}
	/*
	 * Find a client for some other protocol.  We give
	 * copies to multiple clients, if more than one is
	 * bound.
	 */
	ip_fanout_proto(q, first_mp, ill, ipha,
	    fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP,
	    mctl_present, B_FALSE, ill, zoneid);
	TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
	    "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto");
#undef	rptr
}
23134 */ 23135 ip_fanout_proto(q, first_mp, ill, ipha, 23136 fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, 23137 mctl_present, B_FALSE, ill, zoneid); 23138 TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, 23139 "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); 23140 #undef rptr 23141 } 23142 23143 /* 23144 * Update any source route, record route, or timestamp options. 23145 * Check that we are at end of strict source route. 23146 * The options have been sanity checked by ip_wput_options(). 23147 */ 23148 static void 23149 ip_wput_local_options(ipha_t *ipha) 23150 { 23151 ipoptp_t opts; 23152 uchar_t *opt; 23153 uint8_t optval; 23154 uint8_t optlen; 23155 ipaddr_t dst; 23156 uint32_t ts; 23157 ire_t *ire; 23158 timestruc_t now; 23159 23160 ip2dbg(("ip_wput_local_options\n")); 23161 for (optval = ipoptp_first(&opts, ipha); 23162 optval != IPOPT_EOL; 23163 optval = ipoptp_next(&opts)) { 23164 opt = opts.ipoptp_cur; 23165 optlen = opts.ipoptp_len; 23166 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 23167 switch (optval) { 23168 uint32_t off; 23169 case IPOPT_SSRR: 23170 case IPOPT_LSRR: 23171 off = opt[IPOPT_OFFSET]; 23172 off--; 23173 if (optlen < IP_ADDR_LEN || 23174 off > optlen - IP_ADDR_LEN) { 23175 /* End of source route */ 23176 break; 23177 } 23178 /* 23179 * This will only happen if two consecutive entries 23180 * in the source route contains our address or if 23181 * it is a packet with a loose source route which 23182 * reaches us before consuming the whole source route 23183 */ 23184 ip1dbg(("ip_wput_local_options: not end of SR\n")); 23185 if (optval == IPOPT_SSRR) { 23186 return; 23187 } 23188 /* 23189 * Hack: instead of dropping the packet truncate the 23190 * source route to what has been used by filling the 23191 * rest with IPOPT_NOP. 
23192 */ 23193 opt[IPOPT_OLEN] = (uint8_t)off; 23194 while (off < optlen) { 23195 opt[off++] = IPOPT_NOP; 23196 } 23197 break; 23198 case IPOPT_RR: 23199 off = opt[IPOPT_OFFSET]; 23200 off--; 23201 if (optlen < IP_ADDR_LEN || 23202 off > optlen - IP_ADDR_LEN) { 23203 /* No more room - ignore */ 23204 ip1dbg(( 23205 "ip_wput_forward_options: end of RR\n")); 23206 break; 23207 } 23208 dst = htonl(INADDR_LOOPBACK); 23209 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 23210 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 23211 break; 23212 case IPOPT_TS: 23213 /* Insert timestamp if there is romm */ 23214 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 23215 case IPOPT_TS_TSONLY: 23216 off = IPOPT_TS_TIMELEN; 23217 break; 23218 case IPOPT_TS_PRESPEC: 23219 case IPOPT_TS_PRESPEC_RFC791: 23220 /* Verify that the address matched */ 23221 off = opt[IPOPT_OFFSET] - 1; 23222 bcopy((char *)opt + off, &dst, IP_ADDR_LEN); 23223 ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, 23224 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 23225 if (ire == NULL) { 23226 /* Not for us */ 23227 break; 23228 } 23229 ire_refrele(ire); 23230 /* FALLTHRU */ 23231 case IPOPT_TS_TSANDADDR: 23232 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 23233 break; 23234 default: 23235 /* 23236 * ip_*put_options should have already 23237 * dropped this packet. 
23238 */ 23239 cmn_err(CE_PANIC, "ip_wput_local_options: " 23240 "unknown IT - bug in ip_wput_options?\n"); 23241 return; /* Keep "lint" happy */ 23242 } 23243 if (opt[IPOPT_OFFSET] - 1 + off > optlen) { 23244 /* Increase overflow counter */ 23245 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1; 23246 opt[IPOPT_POS_OV_FLG] = (uint8_t) 23247 (opt[IPOPT_POS_OV_FLG] & 0x0F) | 23248 (off << 4); 23249 break; 23250 } 23251 off = opt[IPOPT_OFFSET] - 1; 23252 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 23253 case IPOPT_TS_PRESPEC: 23254 case IPOPT_TS_PRESPEC_RFC791: 23255 case IPOPT_TS_TSANDADDR: 23256 dst = htonl(INADDR_LOOPBACK); 23257 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN); 23258 opt[IPOPT_OFFSET] += IP_ADDR_LEN; 23259 /* FALLTHRU */ 23260 case IPOPT_TS_TSONLY: 23261 off = opt[IPOPT_OFFSET] - 1; 23262 /* Compute # of milliseconds since midnight */ 23263 gethrestime(&now); 23264 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 23265 now.tv_nsec / (NANOSEC / MILLISEC); 23266 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN); 23267 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN; 23268 break; 23269 } 23270 break; 23271 } 23272 } 23273 } 23274 23275 /* 23276 * Send out a multicast packet on interface ipif. 23277 * The sender does not have an conn. 23278 * Caller verifies that this isn't a PHYI_LOOPBACK. 23279 */ 23280 void 23281 ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif) 23282 { 23283 ipha_t *ipha; 23284 ire_t *ire; 23285 ipaddr_t dst; 23286 mblk_t *first_mp; 23287 23288 /* igmp_sendpkt always allocates a ipsec_out_t */ 23289 ASSERT(mp->b_datap->db_type == M_CTL); 23290 ASSERT(!ipif->ipif_isv6); 23291 ASSERT(!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)); 23292 23293 first_mp = mp; 23294 mp = first_mp->b_cont; 23295 ASSERT(mp->b_datap->db_type == M_DATA); 23296 ipha = (ipha_t *)mp->b_rptr; 23297 23298 /* 23299 * Find an IRE which matches the destination and the outgoing 23300 * queue (i.e. the outgoing interface.) 
/*
 * NOTE : This function does not ire_refrele the ire argument passed in.
 *
 * Copy the link layer header and do IPQoS if needed. Frees the mblk on
 * failure. The ire_fp_mp can vanish any time in the case of IRE_MIPRTUN
 * and IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold
 * the ire_lock to access the ire_fp_mp in this case.
 * IPQoS assumes that the first M_DATA contains the IP header. So, if we are
 * prepending a fastpath message IPQoS processing must precede it, we also set
 * the b_band of the fastpath message to that of the mblk returned by IPQoS
 * (IPQoS might have set the b_band for CoS marking).
 * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing
 * must follow it so that IPQoS can mark the dl_priority field for CoS
 * marking, if needed.
 *
 * Returns the message with the link layer header in front (either mp
 * itself with b_rptr moved back, or a new mblk chained to mp), or NULL
 * when a copy fails or ip_process() consumes the packet.
 */
static mblk_t *
ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, uint32_t ill_index)
{
	uint_t	hlen;
	ipha_t *ipha;
	mblk_t *mp1;
	boolean_t qos_done = B_FALSE;
	uchar_t	*ll_hdr;

/* rptr follows ipha, so it changes whenever ipha is reassigned below */
#define	rptr	((uchar_t *)ipha)

	ipha = (ipha_t *)mp->b_rptr;
	hlen = 0;
	LOCK_IRE_FP_MP(ire);
	if ((mp1 = ire->ire_fp_mp) != NULL) {
		ASSERT(DB_TYPE(mp1) == M_DATA);
		/* Initiate IPPF processing */
		if ((proc != 0) && IPP_ENABLED(proc)) {
			/* The lock is dropped across ip_process(). */
			UNLOCK_IRE_FP_MP(ire);
			ip_process(proc, &mp, ill_index);
			if (mp == NULL)
				return (NULL);

			ipha = (ipha_t *)mp->b_rptr;
			LOCK_IRE_FP_MP(ire);
			/*
			 * Re-read ire_fp_mp now that the lock is held again;
			 * if it went away, fall back to the ire_dlureq_mp
			 * path and remember that IPQoS has already run.
			 */
			if ((mp1 = ire->ire_fp_mp) == NULL) {
				qos_done = B_TRUE;
				goto no_fp_mp;
			}
			ASSERT(DB_TYPE(mp1) == M_DATA);
		}
		hlen = MBLKL(mp1);
		/*
		 * Check if we have enough room to prepend fastpath
		 * header
		 */
		if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
			ll_hdr = rptr - hlen;
			bcopy(mp1->b_rptr, ll_hdr, hlen);
			/* XXX ipha is not aligned here */
			ipha = (ipha_t *)(rptr - hlen);
			/*
			 * Set the b_rptr to the start of the link layer
			 * header
			 * (rptr expands to the ipha just reassigned, i.e.
			 * the start of the copied header).
			 */
			mp->b_rptr = rptr;
			mp1 = mp;
		} else {
			/* No headroom: prepend a private copy instead. */
			mp1 = copyb(mp1);
			if (mp1 == NULL)
				goto unlock_err;
			mp1->b_band = mp->b_band;
			mp1->b_cont = mp;
			/*
			 * certain system generated traffic may not
			 * have cred/label in ip header block. This
			 * is true even for a labeled system. But for
			 * labeled traffic, inherit the label in the
			 * new header.
			 */
			if (DB_CRED(mp) != NULL)
				mblk_setcred(mp1, DB_CRED(mp));
			/*
			 * XXX disable ICK_VALID and compute checksum
			 * here; can happen if ire_fp_mp changes and
			 * it can't be copied now due to insufficient
			 * space. (unlikely, fp mp can change, but it
			 * does not increase in length)
			 */
		}
		UNLOCK_IRE_FP_MP(ire);
	} else {
		/* Reached with the lock held, also via goto from above. */
no_fp_mp:
		mp1 = copyb(ire->ire_dlureq_mp);
		if (mp1 == NULL) {
			/* Shared failure exit: unlock, free mp, return NULL. */
unlock_err:
			UNLOCK_IRE_FP_MP(ire);
			freemsg(mp);
			return (NULL);
		}
		UNLOCK_IRE_FP_MP(ire);
		mp1->b_cont = mp;
		/*
		 * certain system generated traffic may not
		 * have cred/label in ip header block. This
		 * is true even for a labeled system. But for
		 * labeled traffic, inherit the label in the
		 * new header.
		 */
		if (DB_CRED(mp) != NULL)
			mblk_setcred(mp1, DB_CRED(mp));
		/* IPQoS runs after the prepend here, unless already done. */
		if (!qos_done && (proc != 0) && IPP_ENABLED(proc)) {
			ip_process(proc, &mp1, ill_index);
			if (mp1 == NULL)
				return (NULL);
		}
	}
	return (mp1);
#undef rptr
}
mp->b_band; 23414 mp1->b_cont = mp; 23415 /* 23416 * certain system generated traffic may not 23417 * have cred/label in ip header block. This 23418 * is true even for a labeled system. But for 23419 * labeled traffic, inherit the label in the 23420 * new header. 23421 */ 23422 if (DB_CRED(mp) != NULL) 23423 mblk_setcred(mp1, DB_CRED(mp)); 23424 /* 23425 * XXX disable ICK_VALID and compute checksum 23426 * here; can happen if ire_fp_mp changes and 23427 * it can't be copied now due to insufficient 23428 * space. (unlikely, fp mp can change, but it 23429 * does not increase in length) 23430 */ 23431 } 23432 UNLOCK_IRE_FP_MP(ire); 23433 } else { 23434 no_fp_mp: 23435 mp1 = copyb(ire->ire_dlureq_mp); 23436 if (mp1 == NULL) { 23437 unlock_err: 23438 UNLOCK_IRE_FP_MP(ire); 23439 freemsg(mp); 23440 return (NULL); 23441 } 23442 UNLOCK_IRE_FP_MP(ire); 23443 mp1->b_cont = mp; 23444 /* 23445 * certain system generated traffic may not 23446 * have cred/label in ip header block. This 23447 * is true even for a labeled system. But for 23448 * labeled traffic, inherit the label in the 23449 * new header. 23450 */ 23451 if (DB_CRED(mp) != NULL) 23452 mblk_setcred(mp1, DB_CRED(mp)); 23453 if (!qos_done && (proc != 0) && IPP_ENABLED(proc)) { 23454 ip_process(proc, &mp1, ill_index); 23455 if (mp1 == NULL) 23456 return (NULL); 23457 } 23458 } 23459 return (mp1); 23460 #undef rptr 23461 } 23462 23463 /* 23464 * Finish the outbound IPsec processing for an IPv6 packet. This function 23465 * is called from ipsec_out_process() if the IPsec packet was processed 23466 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 23467 * asynchronously. 
23468 */ 23469 void 23470 ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, 23471 ire_t *ire_arg) 23472 { 23473 in6_addr_t *v6dstp; 23474 ire_t *ire; 23475 mblk_t *mp; 23476 uint_t ill_index; 23477 ipsec_out_t *io; 23478 boolean_t attach_if, hwaccel; 23479 uint32_t flags = IP6_NO_IPPOLICY; 23480 int match_flags; 23481 zoneid_t zoneid; 23482 boolean_t ill_need_rele = B_FALSE; 23483 boolean_t ire_need_rele = B_FALSE; 23484 23485 mp = ipsec_mp->b_cont; 23486 io = (ipsec_out_t *)ipsec_mp->b_rptr; 23487 ill_index = io->ipsec_out_ill_index; 23488 if (io->ipsec_out_reachable) { 23489 flags |= IPV6_REACHABILITY_CONFIRMATION; 23490 } 23491 attach_if = io->ipsec_out_attach_if; 23492 hwaccel = io->ipsec_out_accelerated; 23493 zoneid = io->ipsec_out_zoneid; 23494 ASSERT(zoneid != ALL_ZONES); 23495 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 23496 /* Multicast addresses should have non-zero ill_index. */ 23497 v6dstp = &ip6h->ip6_dst; 23498 ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); 23499 ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); 23500 ASSERT(!attach_if || ill_index != 0); 23501 if (ill_index != 0) { 23502 if (ill == NULL) { 23503 ill = ip_grab_attach_ill(NULL, ipsec_mp, ill_index, 23504 B_TRUE); 23505 23506 /* Failure case frees things for us. */ 23507 if (ill == NULL) 23508 return; 23509 23510 ill_need_rele = B_TRUE; 23511 } 23512 /* 23513 * If this packet needs to go out on a particular interface 23514 * honor it. 23515 */ 23516 if (attach_if) { 23517 match_flags = MATCH_IRE_ILL; 23518 23519 /* 23520 * Check if we need an ire that will not be 23521 * looked up by anybody else i.e. HIDDEN. 23522 */ 23523 if (ill_is_probeonly(ill)) { 23524 match_flags |= MATCH_IRE_MARK_HIDDEN; 23525 } 23526 } 23527 } 23528 ASSERT(mp != NULL); 23529 23530 if (IN6_IS_ADDR_MULTICAST(v6dstp)) { 23531 boolean_t unspec_src; 23532 ipif_t *ipif; 23533 23534 /* 23535 * Use the ill_index to get the right ill. 
23536 */ 23537 unspec_src = io->ipsec_out_unspec_src; 23538 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 23539 if (ipif == NULL) { 23540 if (ill_need_rele) 23541 ill_refrele(ill); 23542 freemsg(ipsec_mp); 23543 return; 23544 } 23545 23546 if (ire_arg != NULL) { 23547 ire = ire_arg; 23548 } else { 23549 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 23550 zoneid, MBLK_GETLABEL(mp), match_flags); 23551 ire_need_rele = B_TRUE; 23552 } 23553 if (ire != NULL) { 23554 ipif_refrele(ipif); 23555 /* 23556 * XXX Do the multicast forwarding now, as the IPSEC 23557 * processing has been done. 23558 */ 23559 goto send; 23560 } 23561 23562 ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); 23563 mp->b_prev = NULL; 23564 mp->b_next = NULL; 23565 23566 /* 23567 * If the IPsec packet was processed asynchronously, 23568 * drop it now. 23569 */ 23570 if (q == NULL) { 23571 if (ill_need_rele) 23572 ill_refrele(ill); 23573 freemsg(ipsec_mp); 23574 return; 23575 } 23576 23577 ip_newroute_ipif_v6(q, ipsec_mp, ipif, *v6dstp, 23578 unspec_src, zoneid); 23579 ipif_refrele(ipif); 23580 } else { 23581 if (attach_if) { 23582 ipif_t *ipif; 23583 23584 ipif = ipif_get_next_ipif(NULL, ill); 23585 if (ipif == NULL) { 23586 if (ill_need_rele) 23587 ill_refrele(ill); 23588 freemsg(ipsec_mp); 23589 return; 23590 } 23591 ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, 23592 zoneid, MBLK_GETLABEL(mp), match_flags); 23593 ire_need_rele = B_TRUE; 23594 ipif_refrele(ipif); 23595 } else { 23596 if (ire_arg != NULL) { 23597 ire = ire_arg; 23598 } else { 23599 ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL); 23600 ire_need_rele = B_TRUE; 23601 } 23602 } 23603 if (ire != NULL) 23604 goto send; 23605 /* 23606 * ire disappeared underneath. 23607 * 23608 * What we need to do here is the ip_newroute 23609 * logic to get the ire without doing the IPSEC 23610 * processing. Follow the same old path. 
But this 23611 * time, ip_wput or ire_add_then_send will call us 23612 * directly as all the IPSEC operations are done. 23613 */ 23614 ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); 23615 mp->b_prev = NULL; 23616 mp->b_next = NULL; 23617 23618 /* 23619 * If the IPsec packet was processed asynchronously, 23620 * drop it now. 23621 */ 23622 if (q == NULL) { 23623 if (ill_need_rele) 23624 ill_refrele(ill); 23625 freemsg(ipsec_mp); 23626 return; 23627 } 23628 23629 ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, 23630 zoneid); 23631 } 23632 if (ill != NULL && ill_need_rele) 23633 ill_refrele(ill); 23634 return; 23635 send: 23636 if (ill != NULL && ill_need_rele) 23637 ill_refrele(ill); 23638 23639 /* Local delivery */ 23640 if (ire->ire_stq == NULL) { 23641 ASSERT(q != NULL); 23642 ip_wput_local_v6(RD(q), ire->ire_ipif->ipif_ill, ip6h, ipsec_mp, 23643 ire, 0); 23644 if (ire_need_rele) 23645 ire_refrele(ire); 23646 return; 23647 } 23648 /* 23649 * Everything is done. Send it out on the wire. 23650 * We force the insertion of a fragment header using the 23651 * IPH_FRAG_HDR flag in two cases: 23652 * - after reception of an ICMPv6 "packet too big" message 23653 * with a MTU < 1280 (cf. RFC 2460 section 5) 23654 * - for multirouted IPv6 packets, so that the receiver can 23655 * discard duplicates according to their fragment identifier 23656 */ 23657 /* XXX fix flow control problems. */ 23658 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || 23659 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 23660 if (hwaccel) { 23661 /* 23662 * hardware acceleration does not handle these 23663 * "slow path" cases. 23664 */ 23665 /* IPsec KSTATS: should bump bean counter here. */ 23666 if (ire_need_rele) 23667 ire_refrele(ire); 23668 freemsg(ipsec_mp); 23669 return; 23670 } 23671 if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != 23672 (mp->b_cont ? msgdsize(mp) : 23673 mp->b_wptr - (uchar_t *)ip6h)) { 23674 /* IPsec KSTATS: should bump bean counter here. 
*/ 23675 ip0dbg(("Packet length mismatch: %d, %ld\n", 23676 ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, 23677 msgdsize(mp))); 23678 if (ire_need_rele) 23679 ire_refrele(ire); 23680 freemsg(ipsec_mp); 23681 return; 23682 } 23683 ASSERT(mp->b_prev == NULL); 23684 ip2dbg(("Fragmenting Size = %d, mtu = %d\n", 23685 ntohs(ip6h->ip6_plen) + 23686 IPV6_HDR_LEN, ire->ire_max_frag)); 23687 ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, 23688 ire->ire_max_frag); 23689 } else { 23690 UPDATE_OB_PKT_COUNT(ire); 23691 ire->ire_last_used_time = lbolt; 23692 ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); 23693 } 23694 if (ire_need_rele) 23695 ire_refrele(ire); 23696 freeb(ipsec_mp); 23697 } 23698 23699 void 23700 ipsec_hw_putnext(queue_t *q, mblk_t *mp) 23701 { 23702 mblk_t *hada_mp; /* attributes M_CTL mblk */ 23703 da_ipsec_t *hada; /* data attributes */ 23704 ill_t *ill = (ill_t *)q->q_ptr; 23705 23706 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); 23707 23708 if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { 23709 /* IPsec KSTATS: Bump lose counter here! */ 23710 freemsg(mp); 23711 return; 23712 } 23713 23714 /* 23715 * It's an IPsec packet that must be 23716 * accelerated by the Provider, and the 23717 * outbound ill is IPsec acceleration capable. 23718 * Prepends the mblk with an IPHADA_M_CTL, and ship it 23719 * to the ill. 23720 * IPsec KSTATS: should bump packet counter here. 23721 */ 23722 23723 hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); 23724 if (hada_mp == NULL) { 23725 /* IPsec KSTATS: should bump packet counter here. 
*/ 23726 freemsg(mp); 23727 return; 23728 } 23729 23730 hada_mp->b_datap->db_type = M_CTL; 23731 hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); 23732 hada_mp->b_cont = mp; 23733 23734 hada = (da_ipsec_t *)hada_mp->b_rptr; 23735 bzero(hada, sizeof (da_ipsec_t)); 23736 hada->da_type = IPHADA_M_CTL; 23737 23738 putnext(q, hada_mp); 23739 } 23740 23741 /* 23742 * Finish the outbound IPsec processing. This function is called from 23743 * ipsec_out_process() if the IPsec packet was processed 23744 * synchronously, or from {ah,esp}_kcf_callback() if it was processed 23745 * asynchronously. 23746 */ 23747 void 23748 ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, 23749 ire_t *ire_arg) 23750 { 23751 uint32_t v_hlen_tos_len; 23752 ipaddr_t dst; 23753 ipif_t *ipif = NULL; 23754 ire_t *ire; 23755 ire_t *ire1 = NULL; 23756 mblk_t *next_mp = NULL; 23757 uint32_t max_frag; 23758 boolean_t multirt_send = B_FALSE; 23759 mblk_t *mp; 23760 mblk_t *mp1; 23761 uint_t ill_index; 23762 ipsec_out_t *io; 23763 boolean_t attach_if; 23764 int match_flags, offset; 23765 irb_t *irb = NULL; 23766 boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; 23767 zoneid_t zoneid; 23768 uint32_t cksum; 23769 uint16_t *up; 23770 #ifdef _BIG_ENDIAN 23771 #define LENGTH (v_hlen_tos_len & 0xFFFF) 23772 #else 23773 #define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) 23774 #endif 23775 23776 mp = ipsec_mp->b_cont; 23777 ASSERT(mp != NULL); 23778 v_hlen_tos_len = ((uint32_t *)ipha)[0]; 23779 dst = ipha->ipha_dst; 23780 23781 io = (ipsec_out_t *)ipsec_mp->b_rptr; 23782 ill_index = io->ipsec_out_ill_index; 23783 attach_if = io->ipsec_out_attach_if; 23784 zoneid = io->ipsec_out_zoneid; 23785 ASSERT(zoneid != ALL_ZONES); 23786 match_flags = MATCH_IRE_ILL_GROUP | MATCH_IRE_SECATTR; 23787 if (ill_index != 0) { 23788 if (ill == NULL) { 23789 ill = ip_grab_attach_ill(NULL, ipsec_mp, 23790 ill_index, B_FALSE); 23791 23792 /* Failure case frees things for us. 
*/ 23793 if (ill == NULL) 23794 return; 23795 23796 ill_need_rele = B_TRUE; 23797 } 23798 /* 23799 * If this packet needs to go out on a particular interface 23800 * honor it. 23801 */ 23802 if (attach_if) { 23803 match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; 23804 23805 /* 23806 * Check if we need an ire that will not be 23807 * looked up by anybody else i.e. HIDDEN. 23808 */ 23809 if (ill_is_probeonly(ill)) { 23810 match_flags |= MATCH_IRE_MARK_HIDDEN; 23811 } 23812 } 23813 } 23814 23815 if (CLASSD(dst)) { 23816 boolean_t conn_dontroute; 23817 /* 23818 * Use the ill_index to get the right ipif. 23819 */ 23820 conn_dontroute = io->ipsec_out_dontroute; 23821 if (ill_index == 0) 23822 ipif = ipif_lookup_group(dst, zoneid); 23823 else 23824 (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); 23825 if (ipif == NULL) { 23826 ip1dbg(("ip_wput_ipsec_out: No ipif for" 23827 " multicast\n")); 23828 BUMP_MIB(&ip_mib, ipOutNoRoutes); 23829 freemsg(ipsec_mp); 23830 goto done; 23831 } 23832 /* 23833 * ipha_src has already been intialized with the 23834 * value of the ipif in ip_wput. All we need now is 23835 * an ire to send this downstream. 23836 */ 23837 ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, 23838 MBLK_GETLABEL(mp), match_flags); 23839 if (ire != NULL) { 23840 ill_t *ill1; 23841 /* 23842 * Do the multicast forwarding now, as the IPSEC 23843 * processing has been done. 23844 */ 23845 if (ip_g_mrouter && !conn_dontroute && 23846 (ill1 = ire_to_ill(ire))) { 23847 if (ip_mforward(ill1, ipha, mp)) { 23848 freemsg(ipsec_mp); 23849 ip1dbg(("ip_wput_ipsec_out: mforward " 23850 "failed\n")); 23851 ire_refrele(ire); 23852 goto done; 23853 } 23854 } 23855 goto send; 23856 } 23857 23858 ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); 23859 mp->b_prev = NULL; 23860 mp->b_next = NULL; 23861 23862 /* 23863 * If the IPsec packet was processed asynchronously, 23864 * drop it now. 
23865 */ 23866 if (q == NULL) { 23867 freemsg(ipsec_mp); 23868 goto done; 23869 } 23870 23871 /* 23872 * We may be using a wrong ipif to create the ire. 23873 * But it is okay as the source address is assigned 23874 * for the packet already. Next outbound packet would 23875 * create the IRE with the right IPIF in ip_wput. 23876 * 23877 * Also handle RTF_MULTIRT routes. 23878 */ 23879 ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT); 23880 } else { 23881 if (attach_if) { 23882 ire = ire_ctable_lookup(dst, 0, 0, ill->ill_ipif, 23883 zoneid, MBLK_GETLABEL(mp), match_flags); 23884 } else { 23885 if (ire_arg != NULL) { 23886 ire = ire_arg; 23887 ire_need_rele = B_FALSE; 23888 } else { 23889 ire = ire_cache_lookup(dst, zoneid, 23890 MBLK_GETLABEL(mp)); 23891 } 23892 } 23893 if (ire != NULL) { 23894 goto send; 23895 } 23896 23897 /* 23898 * ire disappeared underneath. 23899 * 23900 * What we need to do here is the ip_newroute 23901 * logic to get the ire without doing the IPSEC 23902 * processing. Follow the same old path. But this 23903 * time, ip_wput or ire_add_then_put will call us 23904 * directly as all the IPSEC operations are done. 23905 */ 23906 ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); 23907 mp->b_prev = NULL; 23908 mp->b_next = NULL; 23909 23910 /* 23911 * If the IPsec packet was processed asynchronously, 23912 * drop it now. 23913 */ 23914 if (q == NULL) { 23915 freemsg(ipsec_mp); 23916 goto done; 23917 } 23918 23919 /* 23920 * Since we're going through ip_newroute() again, we 23921 * need to make sure we don't: 23922 * 23923 * 1.) Trigger the ASSERT() with the ipha_ident 23924 * overloading. 23925 * 2.) Redo transport-layer checksumming, since we've 23926 * already done all that to get this far. 23927 * 23928 * The easiest way not do either of the above is to set 23929 * the ipha_ident field to IP_HDR_INCLUDED. 23930 */ 23931 ipha->ipha_ident = IP_HDR_INCLUDED; 23932 ip_newroute(q, ipsec_mp, dst, NULL, 23933 (CONN_Q(q) ? 
Q_TO_CONN(q) : NULL)); 23934 } 23935 goto done; 23936 send: 23937 if (ipha->ipha_protocol == IPPROTO_UDP && udp_compute_checksum()) { 23938 /* 23939 * ESP NAT-Traversal packet. 23940 * 23941 * Just do software checksum for now. 23942 */ 23943 23944 offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET; 23945 IP_STAT(ip_out_sw_cksum); 23946 IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes, 23947 ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH)); 23948 #define iphs ((uint16_t *)ipha) 23949 cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + 23950 iphs[9] + ntohs(htons(ipha->ipha_length) - 23951 IP_SIMPLE_HDR_LENGTH); 23952 #undef iphs 23953 if ((cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH, cksum)) == 0) 23954 cksum = 0xFFFF; 23955 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) 23956 if (mp1->b_wptr - mp1->b_rptr >= 23957 offset + sizeof (uint16_t)) { 23958 up = (uint16_t *)(mp1->b_rptr + offset); 23959 *up = cksum; 23960 break; /* out of for loop */ 23961 } else { 23962 offset -= (mp->b_wptr - mp->b_rptr); 23963 } 23964 } /* Otherwise, just keep the all-zero checksum. */ 23965 23966 if (ire->ire_stq == NULL) { 23967 /* 23968 * Loopbacks go through ip_wput_local except for one case. 23969 * We come here if we generate a icmp_frag_needed message 23970 * after IPSEC processing is over. When this function calls 23971 * ip_wput_ire_fragmentit, ip_wput_frag might end up calling 23972 * icmp_frag_needed. The message generated comes back here 23973 * through icmp_frag_needed -> icmp_pkt -> ip_wput -> 23974 * ipsec_out_process -> ip_wput_ipsec_out. We need to set the 23975 * source address as it is usually set in ip_wput_ire. As 23976 * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process 23977 * and we end up here. We can't enter ip_wput_ire once the 23978 * IPSEC processing is over and hence we need to do it here. 
23979 */ 23980 ASSERT(q != NULL); 23981 UPDATE_OB_PKT_COUNT(ire); 23982 ire->ire_last_used_time = lbolt; 23983 if (ipha->ipha_src == 0) 23984 ipha->ipha_src = ire->ire_src_addr; 23985 ip_wput_local(RD(q), ire->ire_ipif->ipif_ill, ipha, ipsec_mp, 23986 ire, 0, zoneid); 23987 if (ire_need_rele) 23988 ire_refrele(ire); 23989 goto done; 23990 } 23991 23992 if (ire->ire_max_frag < (unsigned int)LENGTH) { 23993 /* 23994 * We are through with IPSEC processing. 23995 * Fragment this and send it on the wire. 23996 */ 23997 if (io->ipsec_out_accelerated) { 23998 /* 23999 * The packet has been accelerated but must 24000 * be fragmented. This should not happen 24001 * since AH and ESP must not accelerate 24002 * packets that need fragmentation, however 24003 * the configuration could have changed 24004 * since the AH or ESP processing. 24005 * Drop packet. 24006 * IPsec KSTATS: bump bean counter here. 24007 */ 24008 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " 24009 "fragmented accelerated packet!\n")); 24010 freemsg(ipsec_mp); 24011 } else { 24012 ip_wput_ire_fragmentit(ipsec_mp, ire); 24013 } 24014 if (ire_need_rele) 24015 ire_refrele(ire); 24016 goto done; 24017 } 24018 24019 ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " 24020 "ipif %p\n", (void *)ipsec_mp, (void *)ire, 24021 (void *)ire->ire_ipif, (void *)ipif)); 24022 24023 /* 24024 * Multiroute the secured packet, unless IPsec really 24025 * requires the packet to go out only through a particular 24026 * interface. 24027 */ 24028 if ((ire->ire_flags & RTF_MULTIRT) && !attach_if) { 24029 ire_t *first_ire; 24030 irb = ire->ire_bucket; 24031 ASSERT(irb != NULL); 24032 /* 24033 * This ire has been looked up as the one that 24034 * goes through the given ipif; 24035 * make sure we do not omit any other multiroute ire 24036 * that may be present in the bucket before this one. 
24037 */ 24038 IRB_REFHOLD(irb); 24039 for (first_ire = irb->irb_ire; 24040 first_ire != NULL; 24041 first_ire = first_ire->ire_next) { 24042 if ((first_ire->ire_flags & RTF_MULTIRT) && 24043 (first_ire->ire_addr == ire->ire_addr) && 24044 !(first_ire->ire_marks & 24045 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))) 24046 break; 24047 } 24048 24049 if ((first_ire != NULL) && (first_ire != ire)) { 24050 /* 24051 * Don't change the ire if the packet must 24052 * be fragmented if sent via this new one. 24053 */ 24054 if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { 24055 IRE_REFHOLD(first_ire); 24056 if (ire_need_rele) 24057 ire_refrele(ire); 24058 else 24059 ire_need_rele = B_TRUE; 24060 ire = first_ire; 24061 } 24062 } 24063 IRB_REFRELE(irb); 24064 24065 multirt_send = B_TRUE; 24066 max_frag = ire->ire_max_frag; 24067 } else { 24068 if ((ire->ire_flags & RTF_MULTIRT) && attach_if) { 24069 ip1dbg(("ip_wput_ipsec_out: ignoring multirouting " 24070 "flag, attach_if %d\n", attach_if)); 24071 } 24072 } 24073 24074 /* 24075 * In most cases, the emission loop below is entered only once. 24076 * Only in the case where the ire holds the RTF_MULTIRT 24077 * flag, we loop to process all RTF_MULTIRT ires in the 24078 * bucket, and send the packet through all crossed 24079 * RTF_MULTIRT routes. 24080 */ 24081 do { 24082 if (multirt_send) { 24083 /* 24084 * ire1 holds here the next ire to process in the 24085 * bucket. If multirouting is expected, 24086 * any non-RTF_MULTIRT ire that has the 24087 * right destination address is ignored. 
24088 */ 24089 ASSERT(irb != NULL); 24090 IRB_REFHOLD(irb); 24091 for (ire1 = ire->ire_next; 24092 ire1 != NULL; 24093 ire1 = ire1->ire_next) { 24094 if ((ire1->ire_flags & RTF_MULTIRT) == 0) 24095 continue; 24096 if (ire1->ire_addr != ire->ire_addr) 24097 continue; 24098 if (ire1->ire_marks & 24099 (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN)) 24100 continue; 24101 /* No loopback here */ 24102 if (ire1->ire_stq == NULL) 24103 continue; 24104 /* 24105 * Ensure we do not exceed the MTU 24106 * of the next route. 24107 */ 24108 if (ire1->ire_max_frag < (unsigned int)LENGTH) { 24109 ip_multirt_bad_mtu(ire1, max_frag); 24110 continue; 24111 } 24112 24113 IRE_REFHOLD(ire1); 24114 break; 24115 } 24116 IRB_REFRELE(irb); 24117 if (ire1 != NULL) { 24118 /* 24119 * We are in a multiple send case, need to 24120 * make a copy of the packet. 24121 */ 24122 next_mp = copymsg(ipsec_mp); 24123 if (next_mp == NULL) { 24124 ire_refrele(ire1); 24125 ire1 = NULL; 24126 } 24127 } 24128 } 24129 24130 /* Everything is done. Send it out on the wire */ 24131 mp1 = ip_wput_attach_llhdr(mp, ire, 0, 0); 24132 if (mp1 == NULL) { 24133 BUMP_MIB(&ip_mib, ipOutDiscards); 24134 freemsg(ipsec_mp); 24135 if (ire_need_rele) 24136 ire_refrele(ire); 24137 if (ire1 != NULL) { 24138 ire_refrele(ire1); 24139 freemsg(next_mp); 24140 } 24141 goto done; 24142 } 24143 UPDATE_OB_PKT_COUNT(ire); 24144 ire->ire_last_used_time = lbolt; 24145 if (!io->ipsec_out_accelerated) { 24146 putnext(ire->ire_stq, mp1); 24147 } else { 24148 /* 24149 * Safety Pup says: make sure this is going to 24150 * the right interface! 
24151 */ 24152 ill_t *ill1 = (ill_t *)ire->ire_stq->q_ptr; 24153 int ifindex = ill1->ill_phyint->phyint_ifindex; 24154 24155 if (ifindex != io->ipsec_out_capab_ill_index) { 24156 /* IPsec kstats: bump lose counter */ 24157 freemsg(mp1); 24158 } else { 24159 ipsec_hw_putnext(ire->ire_stq, mp1); 24160 } 24161 } 24162 24163 freeb(ipsec_mp); 24164 if (ire_need_rele) 24165 ire_refrele(ire); 24166 24167 if (ire1 != NULL) { 24168 ire = ire1; 24169 ire_need_rele = B_TRUE; 24170 ASSERT(next_mp); 24171 ipsec_mp = next_mp; 24172 mp = ipsec_mp->b_cont; 24173 ire1 = NULL; 24174 next_mp = NULL; 24175 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24176 } else { 24177 multirt_send = B_FALSE; 24178 } 24179 } while (multirt_send); 24180 done: 24181 if (ill != NULL && ill_need_rele) 24182 ill_refrele(ill); 24183 if (ipif != NULL) 24184 ipif_refrele(ipif); 24185 } 24186 24187 /* 24188 * Get the ill corresponding to the specified ire, and compare its 24189 * capabilities with the protocol and algorithms specified by the 24190 * the SA obtained from ipsec_out. If they match, annotate the 24191 * ipsec_out structure to indicate that the packet needs acceleration. 24192 * 24193 * 24194 * A packet is eligible for outbound hardware acceleration if the 24195 * following conditions are satisfied: 24196 * 24197 * 1. the packet will not be fragmented 24198 * 2. the provider supports the algorithm 24199 * 3. there is no pending control message being exchanged 24200 * 4. snoop is not attached 24201 * 5. the destination address is not a broadcast or multicast address. 24202 * 24203 * Rationale: 24204 * - Hardware drivers do not support fragmentation with 24205 * the current interface. 24206 * - snoop, multicast, and broadcast may result in exposure of 24207 * a cleartext datagram. 24208 * We check all five of these conditions here. 
24209 * 24210 * XXX would like to nuke "ire_t *" parameter here; problem is that 24211 * IRE is only way to figure out if a v4 address is a broadcast and 24212 * thus ineligible for acceleration... 24213 */ 24214 static void 24215 ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) 24216 { 24217 ipsec_out_t *io; 24218 mblk_t *data_mp; 24219 uint_t plen, overhead; 24220 24221 if ((sa->ipsa_flags & IPSA_F_HW) == 0) 24222 return; 24223 24224 if (ill == NULL) 24225 return; 24226 24227 /* 24228 * Destination address is a broadcast or multicast. Punt. 24229 */ 24230 if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| 24231 IRE_LOCAL))) 24232 return; 24233 24234 data_mp = ipsec_mp->b_cont; 24235 24236 if (ill->ill_isv6) { 24237 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; 24238 24239 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 24240 return; 24241 24242 plen = ip6h->ip6_plen; 24243 } else { 24244 ipha_t *ipha = (ipha_t *)data_mp->b_rptr; 24245 24246 if (CLASSD(ipha->ipha_dst)) 24247 return; 24248 24249 plen = ipha->ipha_length; 24250 } 24251 /* 24252 * Is there a pending DLPI control message being exchanged 24253 * between IP/IPsec and the DLS Provider? If there is, it 24254 * could be a SADB update, and the state of the DLS Provider 24255 * SADB might not be in sync with the SADB maintained by 24256 * IPsec. To avoid dropping packets or using the wrong keying 24257 * material, we do not accelerate this packet. 24258 */ 24259 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 24260 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24261 "ill_dlpi_pending! don't accelerate packet\n")); 24262 return; 24263 } 24264 24265 /* 24266 * Is the Provider in promiscous mode? If it does, we don't 24267 * accelerate the packet since it will bounce back up to the 24268 * listeners in the clear. 
24269 */ 24270 if (ill->ill_promisc_on_phys) { 24271 IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " 24272 "ill in promiscous mode, don't accelerate packet\n")); 24273 return; 24274 } 24275 24276 /* 24277 * Will the packet require fragmentation? 24278 */ 24279 24280 /* 24281 * IPsec ESP note: this is a pessimistic estimate, but the same 24282 * as is used elsewhere. 24283 * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) 24284 * + 2-byte trailer 24285 */ 24286 overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : 24287 IPSEC_BASE_ESP_HDR_SIZE(sa); 24288 24289 if ((plen + overhead) > ill->ill_max_mtu) 24290 return; 24291 24292 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24293 24294 /* 24295 * Can the ill accelerate this IPsec protocol and algorithm 24296 * specified by the SA? 24297 */ 24298 if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, 24299 ill->ill_isv6, sa)) { 24300 return; 24301 } 24302 24303 /* 24304 * Tell AH or ESP that the outbound ill is capable of 24305 * accelerating this packet. 24306 */ 24307 io->ipsec_out_is_capab_ill = B_TRUE; 24308 } 24309 24310 /* 24311 * Select which AH & ESP SA's to use (if any) for the outbound packet. 24312 * 24313 * If this function returns B_TRUE, the requested SA's have been filled 24314 * into the ipsec_out_*_sa pointers. 24315 * 24316 * If the function returns B_FALSE, the packet has been "consumed", most 24317 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. 24318 * 24319 * The SA references created by the protocol-specific "select" 24320 * function will be released when the ipsec_mp is freed, thanks to the 24321 * ipsec_out_free destructor -- see spd.c. 
24322 */ 24323 static boolean_t 24324 ipsec_out_select_sa(mblk_t *ipsec_mp) 24325 { 24326 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; 24327 ipsec_out_t *io; 24328 ipsec_policy_t *pp; 24329 ipsec_action_t *ap; 24330 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24331 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24332 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24333 24334 if (!io->ipsec_out_secure) { 24335 /* 24336 * We came here by mistake. 24337 * Don't bother with ipsec processing 24338 * We should "discourage" this path in the future. 24339 */ 24340 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 24341 return (B_FALSE); 24342 } 24343 ASSERT(io->ipsec_out_need_policy == B_FALSE); 24344 ASSERT((io->ipsec_out_policy != NULL) || 24345 (io->ipsec_out_act != NULL)); 24346 24347 ASSERT(io->ipsec_out_failed == B_FALSE); 24348 24349 /* 24350 * IPSEC processing has started. 24351 */ 24352 io->ipsec_out_proc_begin = B_TRUE; 24353 ap = io->ipsec_out_act; 24354 if (ap == NULL) { 24355 pp = io->ipsec_out_policy; 24356 ASSERT(pp != NULL); 24357 ap = pp->ipsp_act; 24358 ASSERT(ap != NULL); 24359 } 24360 24361 /* 24362 * We have an action. now, let's select SA's. 24363 * (In the future, we can cache this in the conn_t..) 24364 */ 24365 if (ap->ipa_want_esp) { 24366 if (io->ipsec_out_esp_sa == NULL) { 24367 need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, 24368 IPPROTO_ESP); 24369 } 24370 ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); 24371 } 24372 24373 if (ap->ipa_want_ah) { 24374 if (io->ipsec_out_ah_sa == NULL) { 24375 need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, 24376 IPPROTO_AH); 24377 } 24378 ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); 24379 /* 24380 * The ESP and AH processing order needs to be preserved 24381 * when both protocols are required (ESP should be applied 24382 * before AH for an outbound packet). Force an ESP ACQUIRE 24383 * when both ESP and AH are required, and an AH ACQUIRE 24384 * is needed. 
24385 */ 24386 if (ap->ipa_want_esp && need_ah_acquire) 24387 need_esp_acquire = B_TRUE; 24388 } 24389 24390 /* 24391 * Send an ACQUIRE (extended, regular, or both) if we need one. 24392 * Release SAs that got referenced, but will not be used until we 24393 * acquire _all_ of the SAs we need. 24394 */ 24395 if (need_ah_acquire || need_esp_acquire) { 24396 if (io->ipsec_out_ah_sa != NULL) { 24397 IPSA_REFRELE(io->ipsec_out_ah_sa); 24398 io->ipsec_out_ah_sa = NULL; 24399 } 24400 if (io->ipsec_out_esp_sa != NULL) { 24401 IPSA_REFRELE(io->ipsec_out_esp_sa); 24402 io->ipsec_out_esp_sa = NULL; 24403 } 24404 24405 sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); 24406 return (B_FALSE); 24407 } 24408 24409 return (B_TRUE); 24410 } 24411 24412 /* 24413 * Process an IPSEC_OUT message and see what you can 24414 * do with it. 24415 * IPQoS Notes: 24416 * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for 24417 * IPSec. 24418 * XXX would like to nuke ire_t. 24419 * XXX ill_index better be "real" 24420 */ 24421 void 24422 ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) 24423 { 24424 ipsec_out_t *io; 24425 ipsec_policy_t *pp; 24426 ipsec_action_t *ap; 24427 ipha_t *ipha; 24428 ip6_t *ip6h; 24429 mblk_t *mp; 24430 ill_t *ill; 24431 zoneid_t zoneid; 24432 ipsec_status_t ipsec_rc; 24433 boolean_t ill_need_rele = B_FALSE; 24434 24435 io = (ipsec_out_t *)ipsec_mp->b_rptr; 24436 ASSERT(io->ipsec_out_type == IPSEC_OUT); 24437 ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); 24438 mp = ipsec_mp->b_cont; 24439 24440 /* 24441 * Initiate IPPF processing. We do it here to account for packets 24442 * coming here that don't have any policy (i.e. !io->ipsec_out_secure). 24443 * We can check for ipsec_out_proc_begin even for such packets, as 24444 * they will always be false (asserted below). 
24445 */ 24446 if (IPP_ENABLED(IPP_LOCAL_OUT) && !io->ipsec_out_proc_begin) { 24447 ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? 24448 io->ipsec_out_ill_index : ill_index); 24449 if (mp == NULL) { 24450 ip2dbg(("ipsec_out_process: packet dropped "\ 24451 "during IPPF processing\n")); 24452 freeb(ipsec_mp); 24453 BUMP_MIB(&ip_mib, ipOutDiscards); 24454 return; 24455 } 24456 } 24457 24458 if (!io->ipsec_out_secure) { 24459 /* 24460 * We came here by mistake. 24461 * Don't bother with ipsec processing 24462 * Should "discourage" this path in the future. 24463 */ 24464 ASSERT(io->ipsec_out_proc_begin == B_FALSE); 24465 goto done; 24466 } 24467 ASSERT(io->ipsec_out_need_policy == B_FALSE); 24468 ASSERT((io->ipsec_out_policy != NULL) || 24469 (io->ipsec_out_act != NULL)); 24470 ASSERT(io->ipsec_out_failed == B_FALSE); 24471 24472 if (!ipsec_loaded()) { 24473 ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 24474 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 24475 BUMP_MIB(&ip_mib, ipOutDiscards); 24476 } else { 24477 BUMP_MIB(&ip6_mib, ipv6OutDiscards); 24478 } 24479 ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, 24480 &ipdrops_ip_ipsec_not_loaded, &ip_dropper); 24481 return; 24482 } 24483 24484 /* 24485 * IPSEC processing has started. 24486 */ 24487 io->ipsec_out_proc_begin = B_TRUE; 24488 ap = io->ipsec_out_act; 24489 if (ap == NULL) { 24490 pp = io->ipsec_out_policy; 24491 ASSERT(pp != NULL); 24492 ap = pp->ipsp_act; 24493 ASSERT(ap != NULL); 24494 } 24495 24496 /* 24497 * Save the outbound ill index. When the packet comes back 24498 * from IPsec, we make sure the ill hasn't changed or disappeared 24499 * before sending it the accelerated packet. 24500 */ 24501 if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { 24502 int ifindex; 24503 ill = ire_to_ill(ire); 24504 ifindex = ill->ill_phyint->phyint_ifindex; 24505 io->ipsec_out_capab_ill_index = ifindex; 24506 } 24507 24508 /* 24509 * The order of processing is first insert a IP header if needed. 
24510 * Then insert the ESP header and then the AH header. 24511 */ 24512 if ((io->ipsec_out_se_done == B_FALSE) && 24513 (ap->ipa_want_se)) { 24514 /* 24515 * First get the outer IP header before sending 24516 * it to ESP. 24517 */ 24518 ipha_t *oipha, *iipha; 24519 mblk_t *outer_mp, *inner_mp; 24520 24521 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { 24522 (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, 24523 "ipsec_out_process: " 24524 "Self-Encapsulation failed: Out of memory\n"); 24525 freemsg(ipsec_mp); 24526 BUMP_MIB(&ip_mib, ipOutDiscards); 24527 return; 24528 } 24529 inner_mp = ipsec_mp->b_cont; 24530 ASSERT(inner_mp->b_datap->db_type == M_DATA); 24531 oipha = (ipha_t *)outer_mp->b_rptr; 24532 iipha = (ipha_t *)inner_mp->b_rptr; 24533 *oipha = *iipha; 24534 outer_mp->b_wptr += sizeof (ipha_t); 24535 oipha->ipha_length = htons(ntohs(iipha->ipha_length) + 24536 sizeof (ipha_t)); 24537 oipha->ipha_protocol = IPPROTO_ENCAP; 24538 oipha->ipha_version_and_hdr_length = 24539 IP_SIMPLE_HDR_VERSION; 24540 oipha->ipha_hdr_checksum = 0; 24541 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); 24542 outer_mp->b_cont = inner_mp; 24543 ipsec_mp->b_cont = outer_mp; 24544 24545 io->ipsec_out_se_done = B_TRUE; 24546 io->ipsec_out_encaps = B_TRUE; 24547 } 24548 24549 if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || 24550 (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && 24551 !ipsec_out_select_sa(ipsec_mp)) 24552 return; 24553 24554 /* 24555 * By now, we know what SA's to use. Toss over to ESP & AH 24556 * to do the heavy lifting. 
24557 */ 24558 zoneid = io->ipsec_out_zoneid; 24559 ASSERT(zoneid != ALL_ZONES); 24560 if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { 24561 ASSERT(io->ipsec_out_esp_sa != NULL); 24562 io->ipsec_out_esp_done = B_TRUE; 24563 /* 24564 * Note that since hw accel can only apply one transform, 24565 * not two, we skip hw accel for ESP if we also have AH 24566 * This is an design limitation of the interface 24567 * which should be revisited. 24568 */ 24569 ASSERT(ire != NULL); 24570 if (io->ipsec_out_ah_sa == NULL) { 24571 ill = (ill_t *)ire->ire_stq->q_ptr; 24572 ipsec_out_is_accelerated(ipsec_mp, 24573 io->ipsec_out_esp_sa, ill, ire); 24574 } 24575 24576 ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); 24577 switch (ipsec_rc) { 24578 case IPSEC_STATUS_SUCCESS: 24579 break; 24580 case IPSEC_STATUS_FAILED: 24581 BUMP_MIB(&ip_mib, ipOutDiscards); 24582 /* FALLTHRU */ 24583 case IPSEC_STATUS_PENDING: 24584 return; 24585 } 24586 } 24587 24588 if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { 24589 ASSERT(io->ipsec_out_ah_sa != NULL); 24590 io->ipsec_out_ah_done = B_TRUE; 24591 if (ire == NULL) { 24592 int idx = io->ipsec_out_capab_ill_index; 24593 ill = ill_lookup_on_ifindex(idx, B_FALSE, 24594 NULL, NULL, NULL, NULL); 24595 ill_need_rele = B_TRUE; 24596 } else { 24597 ill = (ill_t *)ire->ire_stq->q_ptr; 24598 } 24599 ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, 24600 ire); 24601 24602 ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); 24603 switch (ipsec_rc) { 24604 case IPSEC_STATUS_SUCCESS: 24605 break; 24606 case IPSEC_STATUS_FAILED: 24607 BUMP_MIB(&ip_mib, ipOutDiscards); 24608 /* FALLTHRU */ 24609 case IPSEC_STATUS_PENDING: 24610 if (ill != NULL && ill_need_rele) 24611 ill_refrele(ill); 24612 return; 24613 } 24614 } 24615 /* 24616 * We are done with IPSEC processing. Send it over 24617 * the wire. 
24618 */ 24619 done: 24620 mp = ipsec_mp->b_cont; 24621 ipha = (ipha_t *)mp->b_rptr; 24622 if (IPH_HDR_VERSION(ipha) == IP_VERSION) { 24623 ip_wput_ipsec_out(q, ipsec_mp, ipha, ill, ire); 24624 } else { 24625 ip6h = (ip6_t *)ipha; 24626 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ill, ire); 24627 } 24628 if (ill != NULL && ill_need_rele) 24629 ill_refrele(ill); 24630 } 24631 24632 /* ARGSUSED */ 24633 void 24634 ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) 24635 { 24636 opt_restart_t *or; 24637 int err; 24638 conn_t *connp; 24639 24640 ASSERT(CONN_Q(q)); 24641 connp = Q_TO_CONN(q); 24642 24643 ASSERT(first_mp->b_datap->db_type == M_CTL); 24644 or = (opt_restart_t *)first_mp->b_rptr; 24645 /* 24646 * We don't need to pass any credentials here since this is just 24647 * a restart. The credentials are passed in when svr4_optcom_req 24648 * is called the first time (from ip_wput_nondata). 24649 */ 24650 if (or->or_type == T_SVR4_OPTMGMT_REQ) { 24651 err = svr4_optcom_req(q, first_mp, NULL, 24652 &ip_opt_obj); 24653 } else { 24654 ASSERT(or->or_type == T_OPTMGMT_REQ); 24655 err = tpi_optcom_req(q, first_mp, NULL, 24656 &ip_opt_obj); 24657 } 24658 if (err != EINPROGRESS) { 24659 /* operation is done */ 24660 CONN_OPER_PENDING_DONE(connp); 24661 } 24662 } 24663 24664 /* 24665 * ioctls that go through a down/up sequence may need to wait for the down 24666 * to complete. This involves waiting for the ire and ipif refcnts to go down 24667 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail. 
24668 */ 24669 /* ARGSUSED */ 24670 void 24671 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 24672 { 24673 struct iocblk *iocp; 24674 mblk_t *mp1; 24675 ipif_t *ipif; 24676 ip_ioctl_cmd_t *ipip; 24677 int err; 24678 sin_t *sin; 24679 struct lifreq *lifr; 24680 struct ifreq *ifr; 24681 24682 iocp = (struct iocblk *)mp->b_rptr; 24683 ASSERT(ipsq != NULL); 24684 /* Existence of mp1 verified in ip_wput_nondata */ 24685 mp1 = mp->b_cont->b_cont; 24686 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 24687 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) { 24688 ill_t *ill; 24689 /* 24690 * Special case where ipsq_current_ipif may not be set. 24691 * ill_phyint_reinit merged the v4 and v6 into a single ipsq. 24692 * ill could also have become part of a ipmp group in the 24693 * process, we are here as were not able to complete the 24694 * operation in ipif_set_values because we could not become 24695 * exclusive on the new ipsq, In such a case ipsq_current_ipif 24696 * will not be set so we need to set it. 24697 */ 24698 ill = (ill_t *)q->q_ptr; 24699 ipsq->ipsq_current_ipif = ill->ill_ipif; 24700 ipsq->ipsq_last_cmd = ipip->ipi_cmd; 24701 } 24702 24703 ipif = ipsq->ipsq_current_ipif; 24704 ASSERT(ipif != NULL); 24705 if (ipip->ipi_cmd_type == IF_CMD) { 24706 /* This a old style SIOC[GS]IF* command */ 24707 ifr = (struct ifreq *)mp1->b_rptr; 24708 sin = (sin_t *)&ifr->ifr_addr; 24709 } else if (ipip->ipi_cmd_type == LIF_CMD) { 24710 /* This a new style SIOC[GS]LIF* command */ 24711 lifr = (struct lifreq *)mp1->b_rptr; 24712 sin = (sin_t *)&lifr->lifr_addr; 24713 } else { 24714 sin = NULL; 24715 } 24716 24717 err = (*ipip->ipi_func_restart)(ipif, sin, q, mp, ipip, 24718 (void *)mp1->b_rptr); 24719 24720 /* SIOCLIFREMOVEIF could have removed the ipif */ 24721 ip_ioctl_finish(q, mp, err, 24722 ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT, 24723 ipip->ipi_cmd == SIOCLIFREMOVEIF ? 
NULL : ipif, ipsq); 24724 } 24725 24726 /* 24727 * ioctl processing 24728 * 24729 * ioctl processing starts with ip_sioctl_copyin_setup which looks up 24730 * the ioctl command in the ioctl tables and determines the copyin data size 24731 * from the ioctl property ipi_copyin_size, and does an mi_copyin() of that 24732 * size. 24733 * 24734 * ioctl processing then continues when the M_IOCDATA makes its way down. 24735 * Now the ioctl is looked up again in the ioctl table, and its properties are 24736 * extracted. The associated 'conn' is then refheld till the end of the ioctl 24737 * and the general ioctl processing function ip_process_ioctl is called. 24738 * ip_process_ioctl determines if the ioctl needs to be serialized, and if 24739 * so goes thru the serialization primitive ipsq_try_enter. Then the 24740 * appropriate function to handle the ioctl is called based on the entry in 24741 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish 24742 * which also refreleases the 'conn' that was refheld at the start of the 24743 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq. 24744 * ip_extract_lifreq_cmn extracts the interface name from the lifreq/ifreq 24745 * struct and looks up the ipif. ip_extract_tunreq handles the case of tunnel. 24746 * 24747 * Many exclusive ioctls go thru an internal down up sequence as part of 24748 * the operation. For example an attempt to change the IP address of an 24749 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface 24750 * does all the cleanup such as deleting all ires that use this address. 24751 * Then we need to wait till all references to the interface go away. 
24752 */ 24753 void 24754 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 24755 { 24756 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 24757 ip_ioctl_cmd_t *ipip = (ip_ioctl_cmd_t *)arg; 24758 cmd_info_t ci; 24759 int err; 24760 boolean_t entered_ipsq = B_FALSE; 24761 24762 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd)); 24763 24764 if (ipip == NULL) 24765 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 24766 24767 /* 24768 * SIOCLIFADDIF needs to go thru a special path since the 24769 * ill may not exist yet. This happens in the case of lo0 24770 * which is created using this ioctl. 24771 */ 24772 if (ipip->ipi_cmd == SIOCLIFADDIF) { 24773 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); 24774 ip_ioctl_finish(q, mp, err, 24775 ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT, 24776 NULL, NULL); 24777 return; 24778 } 24779 24780 ci.ci_ipif = NULL; 24781 switch (ipip->ipi_cmd_type) { 24782 case IF_CMD: 24783 case LIF_CMD: 24784 /* 24785 * ioctls that pass in a [l]ifreq appear here. 24786 * ip_extract_lifreq_cmn returns a refheld ipif in 24787 * ci.ci_ipif 24788 */ 24789 err = ip_extract_lifreq_cmn(q, mp, ipip->ipi_cmd_type, 24790 ipip->ipi_flags, &ci, ip_process_ioctl); 24791 if (err != 0) { 24792 ip_ioctl_finish(q, mp, err, 24793 ipip->ipi_flags & IPI_GET_CMD ? 24794 COPYOUT : NO_COPYOUT, NULL, NULL); 24795 return; 24796 } 24797 ASSERT(ci.ci_ipif != NULL); 24798 break; 24799 24800 case TUN_CMD: 24801 /* 24802 * SIOC[GS]TUNPARAM appear here. ip_extract_tunreq returns 24803 * a refheld ipif in ci.ci_ipif 24804 */ 24805 err = ip_extract_tunreq(q, mp, &ci.ci_ipif, ip_process_ioctl); 24806 if (err != 0) { 24807 ip_ioctl_finish(q, mp, err, 24808 ipip->ipi_flags & IPI_GET_CMD ? 24809 COPYOUT : NO_COPYOUT, NULL, NULL); 24810 return; 24811 } 24812 ASSERT(ci.ci_ipif != NULL); 24813 break; 24814 24815 case MISC_CMD: 24816 /* 24817 * ioctls that neither pass in [l]ifreq or iftun_req come here 24818 * For eg. SIOCGLIFCONF will appear here. 
24819 */ 24820 switch (ipip->ipi_cmd) { 24821 case IF_UNITSEL: 24822 /* ioctl comes down the ill */ 24823 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif; 24824 ipif_refhold(ci.ci_ipif); 24825 break; 24826 case SIOCGMSFILTER: 24827 case SIOCSMSFILTER: 24828 case SIOCGIPMSFILTER: 24829 case SIOCSIPMSFILTER: 24830 err = ip_extract_msfilter(q, mp, &ci.ci_ipif, 24831 ip_process_ioctl); 24832 if (err != 0) { 24833 ip_ioctl_finish(q, mp, err, 24834 ipip->ipi_flags & IPI_GET_CMD ? 24835 COPYOUT : NO_COPYOUT, NULL, NULL); 24836 return; 24837 } 24838 break; 24839 } 24840 err = 0; 24841 ci.ci_sin = NULL; 24842 ci.ci_sin6 = NULL; 24843 ci.ci_lifr = NULL; 24844 break; 24845 } 24846 24847 /* 24848 * If ipsq is non-null, we are already being called exclusively 24849 */ 24850 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 24851 if (!(ipip->ipi_flags & IPI_WR)) { 24852 /* 24853 * A return value of EINPROGRESS means the ioctl is 24854 * either queued and waiting for some reason or has 24855 * already completed. 24856 */ 24857 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 24858 ci.ci_lifr); 24859 if (ci.ci_ipif != NULL) 24860 ipif_refrele(ci.ci_ipif); 24861 ip_ioctl_finish(q, mp, err, 24862 ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT, 24863 NULL, NULL); 24864 return; 24865 } 24866 24867 ASSERT(ci.ci_ipif != NULL); 24868 24869 if (ipsq == NULL) { 24870 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, 24871 ip_process_ioctl, NEW_OP, B_TRUE); 24872 entered_ipsq = B_TRUE; 24873 } 24874 /* 24875 * Release the ipif so that ipif_down and friends that wait for 24876 * references to go away are not misled about the current ipif_refcnt 24877 * values. We are writer so we can access the ipif even after releasing 24878 * the ipif. 
24879 */ 24880 ipif_refrele(ci.ci_ipif); 24881 if (ipsq == NULL) 24882 return; 24883 24884 mutex_enter(&ipsq->ipsq_lock); 24885 ASSERT(ipsq->ipsq_current_ipif == NULL); 24886 ipsq->ipsq_current_ipif = ci.ci_ipif; 24887 ipsq->ipsq_last_cmd = ipip->ipi_cmd; 24888 mutex_exit(&ipsq->ipsq_lock); 24889 mutex_enter(&(ci.ci_ipif)->ipif_ill->ill_lock); 24890 /* 24891 * For most set ioctls that come here, this serves as a single point 24892 * where we set the IPIF_CHANGING flag. This ensures that there won't 24893 * be any new references to the ipif. This helps functions that go 24894 * through this path and end up trying to wait for the refcnts 24895 * associated with the ipif to go down to zero. Some exceptions are 24896 * Failover, Failback, and Groupname commands that operate on more than 24897 * just the ci.ci_ipif. These commands internally determine the 24898 * set of ipif's they operate on and set and clear the IPIF_CHANGING 24899 * flags on that set. Another exception is the Removeif command that 24900 * sets the IPIF_CONDEMNED flag internally after identifying the right 24901 * ipif to operate on. 24902 */ 24903 if (ipip->ipi_cmd != SIOCLIFREMOVEIF && 24904 ipip->ipi_cmd != SIOCLIFFAILOVER && 24905 ipip->ipi_cmd != SIOCLIFFAILBACK && 24906 ipip->ipi_cmd != SIOCSLIFGROUPNAME) 24907 (ci.ci_ipif)->ipif_state_flags |= IPIF_CHANGING; 24908 mutex_exit(&(ci.ci_ipif)->ipif_ill->ill_lock); 24909 24910 /* 24911 * A return value of EINPROGRESS means the ioctl is 24912 * either queued and waiting for some reason or has 24913 * already completed. 24914 */ 24915 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, 24916 ci.ci_lifr); 24917 24918 /* SIOCLIFREMOVEIF could have removed the ipif */ 24919 ip_ioctl_finish(q, mp, err, 24920 ipip->ipi_flags & IPI_GET_CMD ? COPYOUT : NO_COPYOUT, 24921 ipip->ipi_cmd == SIOCLIFREMOVEIF ? NULL : ci.ci_ipif, ipsq); 24922 24923 if (entered_ipsq) 24924 ipsq_exit(ipsq, B_TRUE, B_TRUE); 24925 } 24926 24927 /* 24928 * Complete the ioctl. 
Typically ioctls use the mi package and need to 24929 * do mi_copyout/mi_copy_done. 24930 */ 24931 void 24932 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, 24933 ipif_t *ipif, ipsq_t *ipsq) 24934 { 24935 conn_t *connp = NULL; 24936 24937 if (err == EINPROGRESS) 24938 return; 24939 24940 if (CONN_Q(q)) { 24941 connp = Q_TO_CONN(q); 24942 ASSERT(connp->conn_ref >= 2); 24943 } 24944 24945 switch (mode) { 24946 case COPYOUT: 24947 if (err == 0) 24948 mi_copyout(q, mp); 24949 else 24950 mi_copy_done(q, mp, err); 24951 break; 24952 24953 case NO_COPYOUT: 24954 mi_copy_done(q, mp, err); 24955 break; 24956 24957 default: 24958 /* An ioctl aborted through a conn close would take this path */ 24959 break; 24960 } 24961 24962 /* 24963 * The refhold placed at the start of the ioctl is released here. 24964 */ 24965 if (connp != NULL) 24966 CONN_OPER_PENDING_DONE(connp); 24967 24968 /* 24969 * If the ioctl were an exclusive ioctl it would have set 24970 * IPIF_CHANGING at the start of the ioctl which is undone here. 24971 */ 24972 if (ipif != NULL) { 24973 mutex_enter(&(ipif)->ipif_ill->ill_lock); 24974 ipif->ipif_state_flags &= ~IPIF_CHANGING; 24975 mutex_exit(&(ipif)->ipif_ill->ill_lock); 24976 } 24977 24978 /* 24979 * Clear the current ipif in the ipsq at the completion of the ioctl. 24980 * Note that a non-null ipsq_current_ipif prevents new ioctls from 24981 * entering the ipsq 24982 */ 24983 if (ipsq != NULL) { 24984 mutex_enter(&ipsq->ipsq_lock); 24985 ipsq->ipsq_current_ipif = NULL; 24986 mutex_exit(&ipsq->ipsq_lock); 24987 } 24988 } 24989 24990 /* 24991 * This is called from ip_wput_nondata to resume a deferred TCP bind. 
24992 */ 24993 /* ARGSUSED */ 24994 void 24995 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) 24996 { 24997 conn_t *connp = arg; 24998 tcp_t *tcp; 24999 25000 ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); 25001 tcp = connp->conn_tcp; 25002 25003 if (connp->conn_tcp->tcp_state == TCPS_CLOSED) 25004 freemsg(mp); 25005 else 25006 tcp_rput_other(tcp, mp); 25007 CONN_OPER_PENDING_DONE(connp); 25008 } 25009 25010 /* Called from ip_wput for all non data messages */ 25011 /* ARGSUSED */ 25012 void 25013 ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 25014 { 25015 mblk_t *mp1; 25016 ire_t *ire; 25017 ill_t *ill; 25018 struct iocblk *iocp; 25019 ip_ioctl_cmd_t *ipip; 25020 cred_t *cr; 25021 conn_t *connp = NULL; 25022 int cmd, err; 25023 25024 if (CONN_Q(q)) 25025 connp = Q_TO_CONN(q); 25026 25027 cr = DB_CREDDEF(mp, GET_QUEUE_CRED(q)); 25028 25029 /* Check if it is a queue to /dev/sctp. */ 25030 if (connp != NULL && connp->conn_ulp == IPPROTO_SCTP && 25031 connp->conn_rq == NULL) { 25032 sctp_wput(q, mp); 25033 return; 25034 } 25035 25036 switch (DB_TYPE(mp)) { 25037 case M_IOCTL: 25038 /* 25039 * IOCTL processing begins in ip_sioctl_copyin_setup which 25040 * will arrange to copy in associated control structures. 25041 */ 25042 ip_sioctl_copyin_setup(q, mp); 25043 return; 25044 case M_IOCDATA: 25045 /* 25046 * Ensure that this is associated with one of our trans- 25047 * parent ioctls. If it's not ours, discard it if we're 25048 * running as a driver, or pass it on if we're a module. 
25049 */ 25050 iocp = (struct iocblk *)mp->b_rptr; 25051 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 25052 if (ipip == NULL) { 25053 if (q->q_next == NULL) { 25054 goto nak; 25055 } else { 25056 putnext(q, mp); 25057 } 25058 return; 25059 } else if ((q->q_next != NULL) && 25060 !(ipip->ipi_flags & IPI_MODOK)) { 25061 /* 25062 * the ioctl is one we recognise, but is not 25063 * consumed by IP as a module, pass M_IOCDATA 25064 * for processing downstream, but only for 25065 * common Streams ioctls. 25066 */ 25067 if (ipip->ipi_flags & IPI_PASS_DOWN) { 25068 putnext(q, mp); 25069 return; 25070 } else { 25071 goto nak; 25072 } 25073 } 25074 25075 /* IOCTL continuation following copyin or copyout. */ 25076 if (mi_copy_state(q, mp, NULL) == -1) { 25077 /* 25078 * The copy operation failed. mi_copy_state already 25079 * cleaned up, so we're out of here. 25080 */ 25081 return; 25082 } 25083 /* 25084 * If we just completed a copy in, we become writer and 25085 * continue processing in ip_sioctl_copyin_done. If it 25086 * was a copy out, we call mi_copyout again. If there is 25087 * nothing more to copy out, it will complete the IOCTL. 25088 */ 25089 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) { 25090 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) { 25091 mi_copy_done(q, mp, EPROTO); 25092 return; 25093 } 25094 /* 25095 * Check for cases that need more copying. A return 25096 * value of 0 means a second copyin has been started, 25097 * so we return; a return value of 1 means no more 25098 * copying is needed, so we continue. 25099 */ 25100 cmd = iocp->ioc_cmd; 25101 if ((cmd == SIOCGMSFILTER || cmd == SIOCSMSFILTER || 25102 cmd == SIOCGIPMSFILTER || cmd == SIOCSIPMSFILTER) && 25103 MI_COPY_COUNT(mp) == 1) { 25104 if (ip_copyin_msfilter(q, mp) == 0) 25105 return; 25106 } 25107 /* 25108 * Refhold the conn, till the ioctl completes. This is 25109 * needed in case the ioctl ends up in the pending mp 25110 * list. 
Every mp in the ill_pending_mp list and 25111 * the ipsq_pending_mp must have a refhold on the conn 25112 * to resume processing. The refhold is released when 25113 * the ioctl completes. (normally or abnormally) 25114 * In all cases ip_ioctl_finish is called to finish 25115 * the ioctl. 25116 */ 25117 if (connp != NULL) { 25118 /* This is not a reentry */ 25119 ASSERT(ipsq == NULL); 25120 CONN_INC_REF(connp); 25121 } else { 25122 if (!(ipip->ipi_flags & IPI_MODOK)) { 25123 mi_copy_done(q, mp, EINVAL); 25124 return; 25125 } 25126 } 25127 25128 ip_process_ioctl(ipsq, q, mp, ipip); 25129 25130 } else { 25131 mi_copyout(q, mp); 25132 } 25133 return; 25134 nak: 25135 iocp->ioc_error = EINVAL; 25136 mp->b_datap->db_type = M_IOCNAK; 25137 iocp->ioc_count = 0; 25138 qreply(q, mp); 25139 return; 25140 25141 case M_IOCNAK: 25142 /* 25143 * The only way we could get here is if a resolver didn't like 25144 * an IOCTL we sent it. This shouldn't happen. 25145 */ 25146 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 25147 "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", 25148 ((struct iocblk *)mp->b_rptr)->ioc_cmd); 25149 freemsg(mp); 25150 return; 25151 case M_IOCACK: 25152 /* Finish socket ioctls passed through to ARP. */ 25153 ip_sioctl_iocack(q, mp); 25154 return; 25155 case M_FLUSH: 25156 if (*mp->b_rptr & FLUSHW) 25157 flushq(q, FLUSHALL); 25158 if (q->q_next) { 25159 /* 25160 * M_FLUSH is sent up to IP by some drivers during 25161 * unbind. ip_rput has already replied to it. We are 25162 * here for the M_FLUSH that we originated in IP 25163 * before sending the unbind request to the driver. 25164 * Just free it as we don't queue packets in IP 25165 * on the write side of the device instance. 25166 */ 25167 freemsg(mp); 25168 return; 25169 } 25170 if (*mp->b_rptr & FLUSHR) { 25171 *mp->b_rptr &= ~FLUSHW; 25172 qreply(q, mp); 25173 return; 25174 } 25175 freemsg(mp); 25176 return; 25177 case IRE_DB_REQ_TYPE: 25178 /* An Upper Level Protocol wants a copy of an IRE. 
*/ 25179 ip_ire_req(q, mp); 25180 return; 25181 case M_CTL: 25182 if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) 25183 break; 25184 25185 if (connp != NULL && *(uint32_t *)mp->b_rptr == 25186 IP_ULP_OUT_LABELED) { 25187 out_labeled_t *olp; 25188 25189 if (mp->b_wptr - mp->b_rptr != sizeof (*olp)) 25190 break; 25191 olp = (out_labeled_t *)mp->b_rptr; 25192 connp->conn_ulp_labeled = olp->out_qnext == q; 25193 freemsg(mp); 25194 return; 25195 } 25196 25197 /* M_CTL messages are used by ARP to tell us things. */ 25198 if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) 25199 break; 25200 switch (((arc_t *)mp->b_rptr)->arc_cmd) { 25201 case AR_ENTRY_SQUERY: 25202 ip_wput_ctl(q, mp); 25203 return; 25204 case AR_CLIENT_NOTIFY: 25205 ip_arp_news(q, mp); 25206 return; 25207 case AR_DLPIOP_DONE: 25208 ASSERT(q->q_next != NULL); 25209 ill = (ill_t *)q->q_ptr; 25210 /* qwriter_ip releases the refhold */ 25211 /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ 25212 ill_refhold(ill); 25213 (void) qwriter_ip(NULL, ill, q, mp, ip_arp_done, 25214 CUR_OP, B_FALSE); 25215 return; 25216 case AR_ARP_CLOSING: 25217 /* 25218 * ARP (above us) is closing. If no ARP bringup is 25219 * currently pending, ack the message so that ARP 25220 * can complete its close. Also mark ill_arp_closing 25221 * so that new ARP bringups will fail. If any 25222 * ARP bringup is currently in progress, we will 25223 * ack this when the current ARP bringup completes. 25224 */ 25225 ASSERT(q->q_next != NULL); 25226 ill = (ill_t *)q->q_ptr; 25227 mutex_enter(&ill->ill_lock); 25228 ill->ill_arp_closing = 1; 25229 if (!ill->ill_arp_bringup_pending) { 25230 mutex_exit(&ill->ill_lock); 25231 qreply(q, mp); 25232 } else { 25233 mutex_exit(&ill->ill_lock); 25234 freemsg(mp); 25235 } 25236 return; 25237 default: 25238 break; 25239 } 25240 break; 25241 case M_PROTO: 25242 case M_PCPROTO: 25243 /* 25244 * The only PROTO messages we expect are ULP binds and 25245 * copies of option negotiation acknowledgements. 
25246 */ 25247 switch (((union T_primitives *)mp->b_rptr)->type) { 25248 case O_T_BIND_REQ: 25249 case T_BIND_REQ: { 25250 /* Request can get queued in bind */ 25251 ASSERT(connp != NULL); 25252 /* 25253 * Both TCP and UDP call ip_bind_{v4,v6}() directly 25254 * instead of going through this path. We only get 25255 * here in the following cases: 25256 * 25257 * a. Bind retries, where ipsq is non-NULL. 25258 * b. T_BIND_REQ is issued from non TCP/UDP 25259 * transport, e.g. icmp for raw socket, 25260 * in which case ipsq will be NULL. 25261 */ 25262 ASSERT(ipsq != NULL || 25263 (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp))); 25264 25265 /* Don't increment refcnt if this is a re-entry */ 25266 if (ipsq == NULL) 25267 CONN_INC_REF(connp); 25268 mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp, 25269 connp, NULL) : ip_bind_v4(q, mp, connp); 25270 if (mp == NULL) 25271 return; 25272 if (IPCL_IS_TCP(connp)) { 25273 /* 25274 * In the case of TCP endpoint we 25275 * come here only for bind retries 25276 */ 25277 ASSERT(ipsq != NULL); 25278 CONN_INC_REF(connp); 25279 squeue_fill(connp->conn_sqp, mp, 25280 ip_resume_tcp_bind, connp, 25281 SQTAG_BIND_RETRY); 25282 return; 25283 } else if (IPCL_IS_UDP(connp)) { 25284 /* 25285 * In the case of UDP endpoint we 25286 * come here only for bind retries 25287 */ 25288 ASSERT(ipsq != NULL); 25289 udp_resume_bind(connp, mp); 25290 return; 25291 } 25292 qreply(q, mp); 25293 CONN_OPER_PENDING_DONE(connp); 25294 return; 25295 } 25296 case T_SVR4_OPTMGMT_REQ: 25297 ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", 25298 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); 25299 25300 ASSERT(connp != NULL); 25301 if (!snmpcom_req(q, mp, ip_snmp_set, 25302 ip_snmp_get, cr)) { 25303 /* 25304 * Call svr4_optcom_req so that it can 25305 * generate the ack. We don't come here 25306 * if this operation is being restarted. 25307 * ip_restart_optmgmt will drop the conn ref. 
25308 * In the case of ipsec option after the ipsec 25309 * load is complete conn_restart_ipsec_waiter 25310 * drops the conn ref. 25311 */ 25312 ASSERT(ipsq == NULL); 25313 CONN_INC_REF(connp); 25314 if (ip_check_for_ipsec_opt(q, mp)) 25315 return; 25316 err = svr4_optcom_req(q, mp, cr, &ip_opt_obj); 25317 if (err != EINPROGRESS) { 25318 /* Operation is done */ 25319 CONN_OPER_PENDING_DONE(connp); 25320 } 25321 } 25322 return; 25323 case T_OPTMGMT_REQ: 25324 ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); 25325 /* 25326 * Note: No snmpcom_req support through new 25327 * T_OPTMGMT_REQ. 25328 * Call tpi_optcom_req so that it can 25329 * generate the ack. 25330 */ 25331 ASSERT(connp != NULL); 25332 ASSERT(ipsq == NULL); 25333 /* 25334 * We don't come here for restart. ip_restart_optmgmt 25335 * will drop the conn ref. In the case of ipsec option 25336 * after the ipsec load is complete 25337 * conn_restart_ipsec_waiter drops the conn ref. 25338 */ 25339 CONN_INC_REF(connp); 25340 if (ip_check_for_ipsec_opt(q, mp)) 25341 return; 25342 err = tpi_optcom_req(q, mp, cr, &ip_opt_obj); 25343 if (err != EINPROGRESS) { 25344 /* Operation is done */ 25345 CONN_OPER_PENDING_DONE(connp); 25346 } 25347 return; 25348 case T_UNBIND_REQ: 25349 mp = ip_unbind(q, mp); 25350 qreply(q, mp); 25351 return; 25352 default: 25353 /* 25354 * Have to drop any DLPI messages coming down from 25355 * arp (such as an info_req which would cause ip 25356 * to receive an extra info_ack if it was passed 25357 * through. 25358 */ 25359 ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", 25360 (int)*(uint_t *)mp->b_rptr)); 25361 freemsg(mp); 25362 return; 25363 } 25364 /* NOTREACHED */ 25365 case IRE_DB_TYPE: { 25366 nce_t *nce; 25367 ill_t *ill; 25368 in6_addr_t gw_addr_v6; 25369 25370 25371 /* 25372 * This is a response back from a resolver. It 25373 * consists of a message chain containing: 25374 * IRE_MBLK-->LL_HDR_MBLK->pkt 25375 * The IRE_MBLK is the one we allocated in ip_newroute. 
25376 * The LL_HDR_MBLK is the DLPI header to use to get 25377 * the attached packet, and subsequent ones for the 25378 * same destination, transmitted. 25379 */ 25380 if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ 25381 break; 25382 /* 25383 * First, check to make sure the resolution succeeded. 25384 * If it failed, the second mblk will be empty. 25385 * If it is, free the chain, dropping the packet. 25386 * (We must ire_delete the ire; that frees the ire mblk) 25387 * We're doing this now to support PVCs for ATM; it's 25388 * a partial xresolv implementation. When we fully implement 25389 * xresolv interfaces, instead of freeing everything here 25390 * we'll initiate neighbor discovery. 25391 * 25392 * For v4 (ARP and other external resolvers) the resolver 25393 * frees the message, so no check is needed. This check 25394 * is required, though, for a full xresolve implementation. 25395 * Including this code here now both shows how external 25396 * resolvers can NACK a resolution request using an 25397 * existing design that has no specific provisions for NACKs, 25398 * and also takes into account that the current non-ARP 25399 * external resolver has been coded to use this method of 25400 * NACKing for all IPv6 (xresolv) cases, 25401 * whether our xresolv implementation is complete or not. 25402 * 25403 */ 25404 ire = (ire_t *)mp->b_rptr; 25405 ill = ire_to_ill(ire); 25406 mp1 = mp->b_cont; /* dl_unitdata_req */ 25407 if (mp1->b_rptr == mp1->b_wptr) { 25408 if (ire->ire_ipversion == IPV6_VERSION) { 25409 /* 25410 * XRESOLV interface. 
25411 */ 25412 ASSERT(ill->ill_flags & ILLF_XRESOLV); 25413 mutex_enter(&ire->ire_lock); 25414 gw_addr_v6 = ire->ire_gateway_addr_v6; 25415 mutex_exit(&ire->ire_lock); 25416 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 25417 nce = ndp_lookup(ill, 25418 &ire->ire_addr_v6, B_FALSE); 25419 } else { 25420 nce = ndp_lookup(ill, &gw_addr_v6, 25421 B_FALSE); 25422 } 25423 if (nce != NULL) { 25424 nce_resolv_failed(nce); 25425 ndp_delete(nce); 25426 NCE_REFRELE(nce); 25427 } 25428 } 25429 mp->b_cont = NULL; 25430 freemsg(mp1); /* frees the pkt as well */ 25431 ire_delete((ire_t *)mp->b_rptr); 25432 return; 25433 } 25434 /* 25435 * Split them into IRE_MBLK and pkt and feed it into 25436 * ire_add_then_send. Then in ire_add_then_send 25437 * the IRE will be added, and then the packet will be 25438 * run back through ip_wput. This time it will make 25439 * it to the wire. 25440 */ 25441 mp->b_cont = NULL; 25442 mp = mp1->b_cont; /* now, mp points to pkt */ 25443 mp1->b_cont = NULL; 25444 ip1dbg(("ip_wput_nondata: reply from external resolver \n")); 25445 if (ire->ire_ipversion == IPV6_VERSION) { 25446 /* 25447 * XRESOLV interface. Find the nce and put a copy 25448 * of the dl_unitdata_req in nce_res_mp 25449 */ 25450 ASSERT(ill->ill_flags & ILLF_XRESOLV); 25451 mutex_enter(&ire->ire_lock); 25452 gw_addr_v6 = ire->ire_gateway_addr_v6; 25453 mutex_exit(&ire->ire_lock); 25454 if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { 25455 nce = ndp_lookup(ill, &ire->ire_addr_v6, 25456 B_FALSE); 25457 } else { 25458 nce = ndp_lookup(ill, &gw_addr_v6, B_FALSE); 25459 } 25460 if (nce != NULL) { 25461 /* 25462 * We have to protect nce_res_mp here 25463 * from being accessed by other threads 25464 * while we change the mblk pointer. 25465 * Other functions will also lock the nce when 25466 * accessing nce_res_mp. 
25467 * 25468 * The reason we change the mblk pointer 25469 * here rather than copying the resolved address 25470 * into the template is that, unlike with 25471 * ethernet, we have no guarantee that the 25472 * resolved address length will be 25473 * smaller than or equal to the lla length 25474 * with which the template was allocated, 25475 * (for ethernet, they're equal) 25476 * so we have to use the actual resolved 25477 * address mblk - which holds the real 25478 * dl_unitdata_req with the resolved address. 25479 * 25480 * Doing this is the same behavior as was 25481 * previously used in the v4 ARP case. 25482 */ 25483 mutex_enter(&nce->nce_lock); 25484 if (nce->nce_res_mp != NULL) 25485 freemsg(nce->nce_res_mp); 25486 nce->nce_res_mp = mp1; 25487 mutex_exit(&nce->nce_lock); 25488 /* 25489 * We do a fastpath probe here because 25490 * we have resolved the address without 25491 * using Neighbor Discovery. 25492 * In the non-XRESOLV v6 case, the fastpath 25493 * probe is done right after neighbor 25494 * discovery completes. 25495 */ 25496 if (nce->nce_res_mp != NULL) { 25497 int res; 25498 nce_fastpath_list_add(nce); 25499 res = ill_fastpath_probe(ill, 25500 nce->nce_res_mp); 25501 if (res != 0 && res != EAGAIN) 25502 nce_fastpath_list_delete(nce); 25503 } 25504 25505 ire_add_then_send(q, ire, mp); 25506 /* 25507 * Now we have to clean out any packets 25508 * that may have been queued on the nce 25509 * while it was waiting for address resolution 25510 * to complete. 
25511 */ 25512 mutex_enter(&nce->nce_lock); 25513 mp1 = nce->nce_qd_mp; 25514 nce->nce_qd_mp = NULL; 25515 mutex_exit(&nce->nce_lock); 25516 while (mp1 != NULL) { 25517 mblk_t *nxt_mp; 25518 queue_t *fwdq = NULL; 25519 ill_t *inbound_ill; 25520 uint_t ifindex; 25521 25522 nxt_mp = mp1->b_next; 25523 mp1->b_next = NULL; 25524 /* 25525 * Retrieve ifindex stored in 25526 * ip_rput_data_v6() 25527 */ 25528 ifindex = 25529 (uint_t)(uintptr_t)mp1->b_prev; 25530 inbound_ill = 25531 ill_lookup_on_ifindex(ifindex, 25532 B_TRUE, NULL, NULL, NULL, 25533 NULL); 25534 mp1->b_prev = NULL; 25535 if (inbound_ill != NULL) 25536 fwdq = inbound_ill->ill_rq; 25537 25538 if (fwdq != NULL) { 25539 put(fwdq, mp1); 25540 ill_refrele(inbound_ill); 25541 } else 25542 put(WR(ill->ill_rq), mp1); 25543 mp1 = nxt_mp; 25544 } 25545 NCE_REFRELE(nce); 25546 } else { /* nce is NULL; clean up */ 25547 ire_delete(ire); 25548 freemsg(mp); 25549 freemsg(mp1); 25550 return; 25551 } 25552 } else { 25553 ire->ire_dlureq_mp = mp1; 25554 ire_add_then_send(q, ire, mp); 25555 } 25556 return; /* All is well, the packet has been sent. */ 25557 } 25558 default: 25559 break; 25560 } 25561 if (q->q_next) { 25562 putnext(q, mp); 25563 } else 25564 freemsg(mp); 25565 } 25566 25567 /* 25568 * Process IP options in an outbound packet. Modify the destination if there 25569 * is a source route option. 25570 * Returns non-zero if something fails in which case an ICMP error has been 25571 * sent and mp freed. 
25572 */ 25573 static int 25574 ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, 25575 boolean_t mctl_present, zoneid_t zoneid) 25576 { 25577 ipoptp_t opts; 25578 uchar_t *opt; 25579 uint8_t optval; 25580 uint8_t optlen; 25581 ipaddr_t dst; 25582 intptr_t code = 0; 25583 mblk_t *mp; 25584 ire_t *ire = NULL; 25585 25586 ip2dbg(("ip_wput_options\n")); 25587 mp = ipsec_mp; 25588 if (mctl_present) { 25589 mp = ipsec_mp->b_cont; 25590 } 25591 25592 dst = ipha->ipha_dst; 25593 for (optval = ipoptp_first(&opts, ipha); 25594 optval != IPOPT_EOL; 25595 optval = ipoptp_next(&opts)) { 25596 opt = opts.ipoptp_cur; 25597 optlen = opts.ipoptp_len; 25598 ip2dbg(("ip_wput_options: opt %d, len %d\n", 25599 optval, optlen)); 25600 switch (optval) { 25601 uint32_t off; 25602 case IPOPT_SSRR: 25603 case IPOPT_LSRR: 25604 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 25605 ip1dbg(( 25606 "ip_wput_options: bad option offset\n")); 25607 code = (char *)&opt[IPOPT_OLEN] - 25608 (char *)ipha; 25609 goto param_prob; 25610 } 25611 off = opt[IPOPT_OFFSET]; 25612 ip1dbg(("ip_wput_options: next hop 0x%x\n", 25613 ntohl(dst))); 25614 /* 25615 * For strict: verify that dst is directly 25616 * reachable. 
25617 */ 25618 if (optval == IPOPT_SSRR) { 25619 ire = ire_ftable_lookup(dst, 0, 0, 25620 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, 25621 MBLK_GETLABEL(mp), 25622 MATCH_IRE_TYPE | MATCH_IRE_SECATTR); 25623 if (ire == NULL) { 25624 ip1dbg(("ip_wput_options: SSRR not" 25625 " directly reachable: 0x%x\n", 25626 ntohl(dst))); 25627 goto bad_src_route; 25628 } 25629 ire_refrele(ire); 25630 } 25631 break; 25632 case IPOPT_RR: 25633 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 25634 ip1dbg(( 25635 "ip_wput_options: bad option offset\n")); 25636 code = (char *)&opt[IPOPT_OLEN] - 25637 (char *)ipha; 25638 goto param_prob; 25639 } 25640 break; 25641 case IPOPT_TS: 25642 /* 25643 * Verify that length >=5 and that there is either 25644 * room for another timestamp or that the overflow 25645 * counter is not maxed out. 25646 */ 25647 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; 25648 if (optlen < IPOPT_MINLEN_IT) { 25649 goto param_prob; 25650 } 25651 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 25652 ip1dbg(( 25653 "ip_wput_options: bad option offset\n")); 25654 code = (char *)&opt[IPOPT_OFFSET] - 25655 (char *)ipha; 25656 goto param_prob; 25657 } 25658 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) { 25659 case IPOPT_TS_TSONLY: 25660 off = IPOPT_TS_TIMELEN; 25661 break; 25662 case IPOPT_TS_TSANDADDR: 25663 case IPOPT_TS_PRESPEC: 25664 case IPOPT_TS_PRESPEC_RFC791: 25665 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; 25666 break; 25667 default: 25668 code = (char *)&opt[IPOPT_POS_OV_FLG] - 25669 (char *)ipha; 25670 goto param_prob; 25671 } 25672 if (opt[IPOPT_OFFSET] - 1 + off > optlen && 25673 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) { 25674 /* 25675 * No room and the overflow counter is 15 25676 * already. 
25677 */ 25678 goto param_prob; 25679 } 25680 break; 25681 } 25682 } 25683 25684 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) 25685 return (0); 25686 25687 ip1dbg(("ip_wput_options: error processing IP options.")); 25688 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 25689 25690 param_prob: 25691 /* 25692 * Since ip_wput() isn't close to finished, we fill 25693 * in enough of the header for credible error reporting. 25694 */ 25695 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) { 25696 /* Failed */ 25697 freemsg(ipsec_mp); 25698 return (-1); 25699 } 25700 icmp_param_problem(q, ipsec_mp, (uint8_t)code); 25701 return (-1); 25702 25703 bad_src_route: 25704 /* 25705 * Since ip_wput() isn't close to finished, we fill 25706 * in enough of the header for credible error reporting. 25707 */ 25708 if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid)) { 25709 /* Failed */ 25710 freemsg(ipsec_mp); 25711 return (-1); 25712 } 25713 icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED); 25714 return (-1); 25715 } 25716 25717 /* 25718 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT. 25719 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads 25720 * thru /etc/system. 25721 */ 25722 #define CONN_MAXDRAINCNT 64 25723 25724 static void 25725 conn_drain_init(void) 25726 { 25727 int i; 25728 25729 conn_drain_list_cnt = conn_drain_nthreads; 25730 25731 if ((conn_drain_list_cnt == 0) || 25732 (conn_drain_list_cnt > CONN_MAXDRAINCNT)) { 25733 /* 25734 * Default value of the number of drainers is the 25735 * number of cpus, subject to maximum of 8 drainers. 
25736 */ 25737 if (boot_max_ncpus != -1) 25738 conn_drain_list_cnt = MIN(boot_max_ncpus, 8); 25739 else 25740 conn_drain_list_cnt = MIN(max_ncpus, 8); 25741 } 25742 25743 conn_drain_list = kmem_zalloc(conn_drain_list_cnt * sizeof (idl_t), 25744 KM_SLEEP); 25745 25746 for (i = 0; i < conn_drain_list_cnt; i++) { 25747 mutex_init(&conn_drain_list[i].idl_lock, NULL, 25748 MUTEX_DEFAULT, NULL); 25749 } 25750 } 25751 25752 static void 25753 conn_drain_fini(void) 25754 { 25755 int i; 25756 25757 for (i = 0; i < conn_drain_list_cnt; i++) 25758 mutex_destroy(&conn_drain_list[i].idl_lock); 25759 kmem_free(conn_drain_list, conn_drain_list_cnt * sizeof (idl_t)); 25760 conn_drain_list = NULL; 25761 } 25762 25763 /* 25764 * Note: For an overview of how flowcontrol is handled in IP please see the 25765 * IP Flowcontrol notes at the top of this file. 25766 * 25767 * Flow control has blocked us from proceeding. Insert the given conn in one 25768 * of the conn drain lists. These conn wq's will be qenabled later on when 25769 * STREAMS flow control does a backenable. conn_walk_drain will enable 25770 * the first conn in each of these drain lists. Each of these qenabled conns 25771 * in turn enables the next in the list, after it runs, or when it closes, 25772 * thus sustaining the drain process. 25773 * 25774 * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> 25775 * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert 25776 * running at any time, on a given conn, since there can be only 1 service proc 25777 * running on a queue at any time. 25778 */ 25779 void 25780 conn_drain_insert(conn_t *connp) 25781 { 25782 idl_t *idl; 25783 uint_t index; 25784 25785 mutex_enter(&connp->conn_lock); 25786 if (connp->conn_state_flags & CONN_CLOSING) { 25787 /* 25788 * The conn is closing as a result of which CONN_CLOSING 25789 * is set. Return. 
25790 */ 25791 mutex_exit(&connp->conn_lock); 25792 return; 25793 } else if (connp->conn_idl == NULL) { 25794 /* 25795 * Assign the next drain list round robin. We dont' use 25796 * a lock, and thus it may not be strictly round robin. 25797 * Atomicity of load/stores is enough to make sure that 25798 * conn_drain_list_index is always within bounds. 25799 */ 25800 index = conn_drain_list_index; 25801 ASSERT(index < conn_drain_list_cnt); 25802 connp->conn_idl = &conn_drain_list[index]; 25803 index++; 25804 if (index == conn_drain_list_cnt) 25805 index = 0; 25806 conn_drain_list_index = index; 25807 } 25808 mutex_exit(&connp->conn_lock); 25809 25810 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 25811 if ((connp->conn_drain_prev != NULL) || 25812 (connp->conn_state_flags & CONN_CLOSING)) { 25813 /* 25814 * The conn is already in the drain list, OR 25815 * the conn is closing. We need to check again for 25816 * the closing case again since close can happen 25817 * after we drop the conn_lock, and before we 25818 * acquire the CONN_DRAIN_LIST_LOCK. 25819 */ 25820 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 25821 return; 25822 } else { 25823 idl = connp->conn_idl; 25824 } 25825 25826 /* 25827 * The conn is not in the drain list. Insert it at the 25828 * tail of the drain list. The drain list is circular 25829 * and doubly linked. idl_conn points to the 1st element 25830 * in the list. 25831 */ 25832 if (idl->idl_conn == NULL) { 25833 idl->idl_conn = connp; 25834 connp->conn_drain_next = connp; 25835 connp->conn_drain_prev = connp; 25836 } else { 25837 conn_t *head = idl->idl_conn; 25838 25839 connp->conn_drain_next = head; 25840 connp->conn_drain_prev = head->conn_drain_prev; 25841 head->conn_drain_prev->conn_drain_next = connp; 25842 head->conn_drain_prev = connp; 25843 } 25844 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 25845 } 25846 25847 /* 25848 * This conn is closing, and we are called from ip_close. 
OR 25849 * This conn has been serviced by ip_wsrv, and we need to do the tail 25850 * processing. 25851 * If this conn is part of the drain list, we may need to sustain the drain 25852 * process by qenabling the next conn in the drain list. We may also need to 25853 * remove this conn from the list, if it is done. 25854 */ 25855 static void 25856 conn_drain_tail(conn_t *connp, boolean_t closing) 25857 { 25858 idl_t *idl; 25859 25860 /* 25861 * connp->conn_idl is stable at this point, and no lock is needed 25862 * to check it. If we are called from ip_close, close has already 25863 * set CONN_CLOSING, thus freezing the value of conn_idl, and 25864 * called us only because conn_idl is non-null. If we are called thru 25865 * service, conn_idl could be null, but it cannot change because 25866 * service is single-threaded per queue, and there cannot be another 25867 * instance of service trying to call conn_drain_insert on this conn 25868 * now. 25869 */ 25870 ASSERT(!closing || (connp->conn_idl != NULL)); 25871 25872 /* 25873 * If connp->conn_idl is null, the conn has not been inserted into any 25874 * drain list even once since creation of the conn. Just return. 25875 */ 25876 if (connp->conn_idl == NULL) 25877 return; 25878 25879 mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); 25880 25881 if (connp->conn_drain_prev == NULL) { 25882 /* This conn is currently not in the drain list. */ 25883 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 25884 return; 25885 } 25886 idl = connp->conn_idl; 25887 if (idl->idl_conn_draining == connp) { 25888 /* 25889 * This conn is the current drainer. If this is the last conn 25890 * in the drain list, we need to do more checks, in the 'if' 25891 * below. Otherwwise we need to just qenable the next conn, 25892 * to sustain the draining, and is handled in the 'else' 25893 * below. 25894 */ 25895 if (connp->conn_drain_next == idl->idl_conn) { 25896 /* 25897 * This conn is the last in this list. This round 25898 * of draining is complete. 
If idl_repeat is set, 25899 * it means another flow enabling has happened from 25900 * the driver/streams and we need to another round 25901 * of draining. 25902 * If there are more than 2 conns in the drain list, 25903 * do a left rotate by 1, so that all conns except the 25904 * conn at the head move towards the head by 1, and the 25905 * the conn at the head goes to the tail. This attempts 25906 * a more even share for all queues that are being 25907 * drained. 25908 */ 25909 if ((connp->conn_drain_next != connp) && 25910 (idl->idl_conn->conn_drain_next != connp)) { 25911 idl->idl_conn = idl->idl_conn->conn_drain_next; 25912 } 25913 if (idl->idl_repeat) { 25914 qenable(idl->idl_conn->conn_wq); 25915 idl->idl_conn_draining = idl->idl_conn; 25916 idl->idl_repeat = 0; 25917 } else { 25918 idl->idl_conn_draining = NULL; 25919 } 25920 } else { 25921 /* 25922 * If the next queue that we are now qenable'ing, 25923 * is closing, it will remove itself from this list 25924 * and qenable the subsequent queue in ip_close(). 25925 * Serialization is acheived thru idl_lock. 25926 */ 25927 qenable(connp->conn_drain_next->conn_wq); 25928 idl->idl_conn_draining = connp->conn_drain_next; 25929 } 25930 } 25931 if (!connp->conn_did_putbq || closing) { 25932 /* 25933 * Remove ourself from the drain list, if we did not do 25934 * a putbq, or if the conn is closing. 25935 * Note: It is possible that q->q_first is non-null. It means 25936 * that these messages landed after we did a enableok() in 25937 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to 25938 * service them. 
25939 */ 25940 if (connp->conn_drain_next == connp) { 25941 /* Singleton in the list */ 25942 ASSERT(connp->conn_drain_prev == connp); 25943 idl->idl_conn = NULL; 25944 idl->idl_conn_draining = NULL; 25945 } else { 25946 connp->conn_drain_prev->conn_drain_next = 25947 connp->conn_drain_next; 25948 connp->conn_drain_next->conn_drain_prev = 25949 connp->conn_drain_prev; 25950 if (idl->idl_conn == connp) 25951 idl->idl_conn = connp->conn_drain_next; 25952 ASSERT(idl->idl_conn_draining != connp); 25953 25954 } 25955 connp->conn_drain_next = NULL; 25956 connp->conn_drain_prev = NULL; 25957 } 25958 mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); 25959 } 25960 25961 /* 25962 * Write service routine. Shared perimeter entry point. 25963 * ip_wsrv can be called in any of the following ways. 25964 * 1. The device queue's messages has fallen below the low water mark 25965 * and STREAMS has backenabled the ill_wq. We walk thru all the 25966 * the drain lists and backenable the first conn in each list. 25967 * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the 25968 * qenabled non-tcp upper layers. We start dequeing messages and call 25969 * ip_wput for each message. 25970 */ 25971 25972 void 25973 ip_wsrv(queue_t *q) 25974 { 25975 conn_t *connp; 25976 ill_t *ill; 25977 mblk_t *mp; 25978 25979 if (q->q_next) { 25980 ill = (ill_t *)q->q_ptr; 25981 if (ill->ill_state_flags == 0) { 25982 /* 25983 * The device flow control has opened up. 25984 * Walk through conn drain lists and qenable the 25985 * first conn in each list. This makes sense only 25986 * if the stream is fully plumbed and setup. 25987 * Hence the if check above. 25988 */ 25989 ip1dbg(("ip_wsrv: walking\n")); 25990 conn_walk_drain(); 25991 } 25992 return; 25993 } 25994 25995 connp = Q_TO_CONN(q); 25996 ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); 25997 25998 /* 25999 * 1. Set conn_draining flag to signal that service is active. 26000 * 26001 * 2. 
ip_output determines whether it has been called from service, 26002 * based on the last parameter. If it is IP_WSRV it concludes it 26003 * has been called from service. 26004 * 26005 * 3. Message ordering is preserved by the following logic. 26006 * i. A directly called ip_output (i.e. not thru service) will queue 26007 * the message at the tail, if conn_draining is set (i.e. service 26008 * is running) or if q->q_first is non-null. 26009 * 26010 * ii. If ip_output is called from service, and if ip_output cannot 26011 * putnext due to flow control, it does a putbq. 26012 * 26013 * 4. noenable the queue so that a putbq from ip_wsrv does not reenable 26014 * (causing an infinite loop). 26015 */ 26016 ASSERT(!connp->conn_did_putbq); 26017 while ((q->q_first != NULL) && !connp->conn_did_putbq) { 26018 connp->conn_draining = 1; 26019 noenable(q); 26020 while ((mp = getq(q)) != NULL) { 26021 ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); 26022 if (connp->conn_did_putbq) { 26023 /* ip_wput did a putbq */ 26024 break; 26025 } 26026 } 26027 /* 26028 * At this point, a thread coming down from top, calling 26029 * ip_wput, may end up queueing the message. We have not yet 26030 * enabled the queue, so ip_wsrv won't be called again. 26031 * To avoid this race, check q->q_first again (in the loop) 26032 * If the other thread queued the message before we call 26033 * enableok(), we will catch it in the q->q_first check. 26034 * If the other thread queues the message after we call 26035 * enableok(), ip_wsrv will be called again by STREAMS. 26036 */ 26037 connp->conn_draining = 0; 26038 enableok(q); 26039 } 26040 26041 /* Enable the next conn for draining */ 26042 conn_drain_tail(connp, B_FALSE); 26043 26044 connp->conn_did_putbq = 0; 26045 } 26046 26047 /* 26048 * Walk the list of all conn's calling the function provided with the 26049 * specified argument for each. Note that this only walks conn's that 26050 * have been bound. 26051 * Applies to both IPv4 and IPv6. 
26052 */ 26053 static void 26054 conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid) 26055 { 26056 conn_walk_fanout_table(ipcl_udp_fanout, ipcl_udp_fanout_size, 26057 func, arg, zoneid); 26058 conn_walk_fanout_table(ipcl_conn_fanout, ipcl_conn_fanout_size, 26059 func, arg, zoneid); 26060 conn_walk_fanout_table(ipcl_bind_fanout, ipcl_bind_fanout_size, 26061 func, arg, zoneid); 26062 conn_walk_fanout_table(ipcl_proto_fanout, 26063 A_CNT(ipcl_proto_fanout), func, arg, zoneid); 26064 conn_walk_fanout_table(ipcl_proto_fanout_v6, 26065 A_CNT(ipcl_proto_fanout_v6), func, arg, zoneid); 26066 } 26067 26068 /* 26069 * Flowcontrol has relieved, and STREAMS has backenabled us. For each list 26070 * of conns that need to be drained, check if drain is already in progress. 26071 * If so set the idl_repeat bit, indicating that the last conn in the list 26072 * needs to reinitiate the drain once again, for the list. If drain is not 26073 * in progress for the list, initiate the draining, by qenabling the 1st 26074 * conn in the list. The drain is self-sustaining, each qenabled conn will 26075 * in turn qenable the next conn, when it is done/blocked/closing. 26076 */ 26077 static void 26078 conn_walk_drain(void) 26079 { 26080 int i; 26081 idl_t *idl; 26082 26083 IP_STAT(ip_conn_walk_drain); 26084 26085 for (i = 0; i < conn_drain_list_cnt; i++) { 26086 idl = &conn_drain_list[i]; 26087 mutex_enter(&idl->idl_lock); 26088 if (idl->idl_conn == NULL) { 26089 mutex_exit(&idl->idl_lock); 26090 continue; 26091 } 26092 /* 26093 * If this list is not being drained currently by 26094 * an ip_wsrv thread, start the process. 26095 */ 26096 if (idl->idl_conn_draining == NULL) { 26097 ASSERT(idl->idl_repeat == 0); 26098 qenable(idl->idl_conn->conn_wq); 26099 idl->idl_conn_draining = idl->idl_conn; 26100 } else { 26101 idl->idl_repeat = 1; 26102 } 26103 mutex_exit(&idl->idl_lock); 26104 } 26105 } 26106 26107 /* 26108 * Walk an conn hash table of `count' buckets, calling func for each entry. 
26109 */ 26110 static void 26111 conn_walk_fanout_table(connf_t *connfp, uint_t count, pfv_t func, void *arg, 26112 zoneid_t zoneid) 26113 { 26114 conn_t *connp; 26115 26116 while (count-- > 0) { 26117 mutex_enter(&connfp->connf_lock); 26118 for (connp = connfp->connf_head; connp != NULL; 26119 connp = connp->conn_next) { 26120 if (zoneid == GLOBAL_ZONEID || 26121 zoneid == connp->conn_zoneid) { 26122 CONN_INC_REF(connp); 26123 mutex_exit(&connfp->connf_lock); 26124 (*func)(connp, arg); 26125 mutex_enter(&connfp->connf_lock); 26126 CONN_DEC_REF(connp); 26127 } 26128 } 26129 mutex_exit(&connfp->connf_lock); 26130 connfp++; 26131 } 26132 } 26133 26134 /* ipcl_walk routine invoked for ip_conn_report for each conn. */ 26135 static void 26136 conn_report1(conn_t *connp, void *mp) 26137 { 26138 char buf1[INET6_ADDRSTRLEN]; 26139 char buf2[INET6_ADDRSTRLEN]; 26140 uint_t print_len, buf_len; 26141 26142 ASSERT(connp != NULL); 26143 26144 buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr; 26145 if (buf_len <= 0) 26146 return; 26147 (void) inet_ntop(AF_INET6, &connp->conn_srcv6, buf1, sizeof (buf1)), 26148 (void) inet_ntop(AF_INET6, &connp->conn_remv6, buf2, sizeof (buf2)), 26149 print_len = snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len, 26150 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 26151 "%5d %s/%05d %s/%05d\n", 26152 (void *)connp, (void *)CONNP_TO_RQ(connp), 26153 (void *)CONNP_TO_WQ(connp), connp->conn_zoneid, 26154 buf1, connp->conn_lport, 26155 buf2, connp->conn_fport); 26156 if (print_len < buf_len) { 26157 ((mblk_t *)mp)->b_wptr += print_len; 26158 } else { 26159 ((mblk_t *)mp)->b_wptr += buf_len; 26160 } 26161 } 26162 26163 /* 26164 * Named Dispatch routine to produce a formatted report on all conns 26165 * that are listed in one of the fanout tables. 26166 * This report is accessed by using the ndd utility to "get" ND variable 26167 * "ip_conn_status". 
26168 */ 26169 /* ARGSUSED */ 26170 static int 26171 ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 26172 { 26173 (void) mi_mpprintf(mp, 26174 "CONN " MI_COL_HDRPAD_STR 26175 "rfq " MI_COL_HDRPAD_STR 26176 "stq " MI_COL_HDRPAD_STR 26177 " zone local remote"); 26178 26179 /* 26180 * Because of the ndd constraint, at most we can have 64K buffer 26181 * to put in all conn info. So to be more efficient, just 26182 * allocate a 64K buffer here, assuming we need that large buffer. 26183 * This should be OK as only privileged processes can do ndd /dev/ip. 26184 */ 26185 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 26186 /* The following may work even if we cannot get a large buf. */ 26187 (void) mi_mpprintf(mp, "<< Out of buffer >>\n"); 26188 return (0); 26189 } 26190 26191 conn_walk_fanout(conn_report1, mp->b_cont, Q_TO_CONN(q)->conn_zoneid); 26192 return (0); 26193 } 26194 26195 /* 26196 * Determine if the ill and multicast aspects of that packets 26197 * "matches" the conn. 26198 */ 26199 boolean_t 26200 conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, 26201 zoneid_t zoneid) 26202 { 26203 ill_t *in_ill; 26204 boolean_t found; 26205 ipif_t *ipif; 26206 ire_t *ire; 26207 ipaddr_t dst, src; 26208 26209 dst = ipha->ipha_dst; 26210 src = ipha->ipha_src; 26211 26212 /* 26213 * conn_incoming_ill is set by IP_BOUND_IF which limits 26214 * unicast, broadcast and multicast reception to 26215 * conn_incoming_ill. conn_wantpacket itself is called 26216 * only for BROADCAST and multicast. 26217 * 26218 * 1) ip_rput supresses duplicate broadcasts if the ill 26219 * is part of a group. Hence, we should be receiving 26220 * just one copy of broadcast for the whole group. 26221 * Thus, if it is part of the group the packet could 26222 * come on any ill of the group and hence we need a 26223 * match on the group. Otherwise, match on ill should 26224 * be sufficient. 
26225 * 26226 * 2) ip_rput does not suppress duplicate multicast packets. 26227 * If there are two interfaces in a ill group and we have 26228 * 2 applications (conns) joined a multicast group G on 26229 * both the interfaces, ilm_lookup_ill filter in ip_rput 26230 * will give us two packets because we join G on both the 26231 * interfaces rather than nominating just one interface 26232 * for receiving multicast like broadcast above. So, 26233 * we have to call ilg_lookup_ill to filter out duplicate 26234 * copies, if ill is part of a group. 26235 */ 26236 in_ill = connp->conn_incoming_ill; 26237 if (in_ill != NULL) { 26238 if (in_ill->ill_group == NULL) { 26239 if (in_ill != ill) 26240 return (B_FALSE); 26241 } else if (in_ill->ill_group != ill->ill_group) { 26242 return (B_FALSE); 26243 } 26244 } 26245 26246 if (!CLASSD(dst)) { 26247 if (connp->conn_zoneid == zoneid) 26248 return (B_TRUE); 26249 /* 26250 * The conn is in a different zone; we need to check that this 26251 * broadcast address is configured in the application's zone and 26252 * on one ill in the group. 26253 */ 26254 ipif = ipif_get_next_ipif(NULL, ill); 26255 if (ipif == NULL) 26256 return (B_FALSE); 26257 ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, 26258 connp->conn_zoneid, NULL, 26259 (MATCH_IRE_TYPE | MATCH_IRE_ILL_GROUP)); 26260 ipif_refrele(ipif); 26261 if (ire != NULL) { 26262 ire_refrele(ire); 26263 return (B_TRUE); 26264 } else { 26265 return (B_FALSE); 26266 } 26267 } 26268 26269 if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && 26270 connp->conn_zoneid == zoneid) { 26271 /* 26272 * Loopback case: the sending endpoint has IP_MULTICAST_LOOP 26273 * disabled, therefore we don't dispatch the multicast packet to 26274 * the sending zone. 
26275 */ 26276 return (B_FALSE); 26277 } 26278 26279 if ((ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) && 26280 connp->conn_zoneid != zoneid) { 26281 /* 26282 * Multicast packet on the loopback interface: we only match 26283 * conns who joined the group in the specified zone. 26284 */ 26285 return (B_FALSE); 26286 } 26287 26288 if (connp->conn_multi_router) { 26289 /* multicast packet and multicast router socket: send up */ 26290 return (B_TRUE); 26291 } 26292 26293 mutex_enter(&connp->conn_lock); 26294 found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); 26295 mutex_exit(&connp->conn_lock); 26296 return (found); 26297 } 26298 26299 /* 26300 * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 26301 */ 26302 /* ARGSUSED */ 26303 static void 26304 ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) 26305 { 26306 ill_t *ill = (ill_t *)q->q_ptr; 26307 mblk_t *mp1, *mp2; 26308 ipif_t *ipif; 26309 int err = 0; 26310 conn_t *connp = NULL; 26311 ipsq_t *ipsq; 26312 arc_t *arc; 26313 26314 ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); 26315 26316 ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); 26317 ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); 26318 26319 ASSERT(IAM_WRITER_ILL(ill)); 26320 mp2 = mp->b_cont; 26321 mp->b_cont = NULL; 26322 26323 /* 26324 * We have now received the arp bringup completion message 26325 * from ARP. Mark the arp bringup as done. Also if the arp 26326 * stream has already started closing, send up the AR_ARP_CLOSING 26327 * ack now since ARP is waiting in close for this ack. 
26328 */ 26329 mutex_enter(&ill->ill_lock); 26330 ill->ill_arp_bringup_pending = 0; 26331 if (ill->ill_arp_closing) { 26332 mutex_exit(&ill->ill_lock); 26333 /* Let's reuse the mp for sending the ack */ 26334 arc = (arc_t *)mp->b_rptr; 26335 mp->b_wptr = mp->b_rptr + sizeof (arc_t); 26336 arc->arc_cmd = AR_ARP_CLOSING; 26337 qreply(q, mp); 26338 } else { 26339 mutex_exit(&ill->ill_lock); 26340 freeb(mp); 26341 } 26342 26343 /* We should have an IOCTL waiting on this. */ 26344 ipsq = ill->ill_phyint->phyint_ipsq; 26345 ipif = ipsq->ipsq_pending_ipif; 26346 mp1 = ipsq_pending_mp_get(ipsq, &connp); 26347 ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); 26348 if (mp1 == NULL) { 26349 /* bringup was aborted by the user */ 26350 freemsg(mp2); 26351 return; 26352 } 26353 ASSERT(connp != NULL); 26354 q = CONNP_TO_WQ(connp); 26355 /* 26356 * If the DL_BIND_REQ fails, it is noted 26357 * in arc_name_offset. 26358 */ 26359 err = *((int *)mp2->b_rptr); 26360 if (err == 0) { 26361 if (ipif->ipif_isv6) { 26362 if ((err = ipif_up_done_v6(ipif)) != 0) 26363 ip0dbg(("ip_arp_done: init failed\n")); 26364 } else { 26365 if ((err = ipif_up_done(ipif)) != 0) 26366 ip0dbg(("ip_arp_done: init failed\n")); 26367 } 26368 } else { 26369 ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); 26370 } 26371 26372 freemsg(mp2); 26373 26374 if ((err == 0) && (ill->ill_up_ipifs)) { 26375 err = ill_up_ipifs(ill, q, mp1); 26376 if (err == EINPROGRESS) 26377 return; 26378 } 26379 26380 if (ill->ill_up_ipifs) { 26381 ill_group_cleanup(ill); 26382 } 26383 26384 /* 26385 * The ioctl must complete now without EINPROGRESS 26386 * since ipsq_pending_mp_get has removed the ioctl mblk 26387 * from ipsq_pending_mp. Otherwise the ioctl will be 26388 * stuck for ever in the ipsq. 
26389 */ 26390 ASSERT(err != EINPROGRESS); 26391 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipif, ipsq); 26392 } 26393 26394 /* Allocate the private structure */ 26395 static int 26396 ip_priv_alloc(void **bufp) 26397 { 26398 void *buf; 26399 26400 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL) 26401 return (ENOMEM); 26402 26403 *bufp = buf; 26404 return (0); 26405 } 26406 26407 /* Function to delete the private structure */ 26408 void 26409 ip_priv_free(void *buf) 26410 { 26411 ASSERT(buf != NULL); 26412 kmem_free(buf, sizeof (ip_priv_t)); 26413 } 26414 26415 /* 26416 * The entry point for IPPF processing. 26417 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the 26418 * routine just returns. 26419 * 26420 * When called, ip_process generates an ipp_packet_t structure 26421 * which holds the state information for this packet and invokes the 26422 * the classifier (via ipp_packet_process). The classification, depending on 26423 * configured filters, results in a list of actions for this packet. Invoking 26424 * an action may cause the packet to be dropped, in which case the resulting 26425 * mblk (*mpp) is NULL. proc indicates the callout position for 26426 * this packet and ill_index is the interface this packet on or will leave 26427 * on (inbound and outbound resp.). 
26428 */ 26429 void 26430 ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) 26431 { 26432 mblk_t *mp; 26433 ip_priv_t *priv; 26434 ipp_action_id_t aid; 26435 int rc = 0; 26436 ipp_packet_t *pp; 26437 #define IP_CLASS "ip" 26438 26439 /* If the classifier is not loaded, return */ 26440 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { 26441 return; 26442 } 26443 26444 mp = *mpp; 26445 ASSERT(mp != NULL); 26446 26447 /* Allocate the packet structure */ 26448 rc = ipp_packet_alloc(&pp, IP_CLASS, aid); 26449 if (rc != 0) { 26450 *mpp = NULL; 26451 freemsg(mp); 26452 return; 26453 } 26454 26455 /* Allocate the private structure */ 26456 rc = ip_priv_alloc((void **)&priv); 26457 if (rc != 0) { 26458 *mpp = NULL; 26459 freemsg(mp); 26460 ipp_packet_free(pp); 26461 return; 26462 } 26463 priv->proc = proc; 26464 priv->ill_index = ill_index; 26465 ipp_packet_set_private(pp, priv, ip_priv_free); 26466 ipp_packet_set_data(pp, mp); 26467 26468 /* Invoke the classifier */ 26469 rc = ipp_packet_process(&pp); 26470 if (pp != NULL) { 26471 mp = ipp_packet_get_data(pp); 26472 ipp_packet_free(pp); 26473 if (rc != 0) { 26474 freemsg(mp); 26475 *mpp = NULL; 26476 } 26477 } else { 26478 *mpp = NULL; 26479 } 26480 #undef IP_CLASS 26481 } 26482 26483 /* 26484 * Propagate a multicast group membership operation (add/drop) on 26485 * all the interfaces crossed by the related multirt routes. 26486 * The call is considered successful if the operation succeeds 26487 * on at least one interface. 
26488 */ 26489 static int 26490 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, 26491 uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, 26492 boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, 26493 mblk_t *first_mp) 26494 { 26495 ire_t *ire_gw; 26496 irb_t *irb; 26497 int error = 0; 26498 opt_restart_t *or; 26499 26500 irb = ire->ire_bucket; 26501 ASSERT(irb != NULL); 26502 26503 ASSERT(DB_TYPE(first_mp) == M_CTL); 26504 26505 or = (opt_restart_t *)first_mp->b_rptr; 26506 IRB_REFHOLD(irb); 26507 for (; ire != NULL; ire = ire->ire_next) { 26508 if ((ire->ire_flags & RTF_MULTIRT) == 0) 26509 continue; 26510 if (ire->ire_addr != group) 26511 continue; 26512 26513 ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, 26514 IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, 26515 MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE); 26516 /* No resolver exists for the gateway; skip this ire. */ 26517 if (ire_gw == NULL) 26518 continue; 26519 26520 /* 26521 * This function can return EINPROGRESS. If so the operation 26522 * will be restarted from ip_restart_optmgmt which will 26523 * call ip_opt_set and option processing will restart for 26524 * this option. So we may end up calling 'fn' more than once. 26525 * This requires that 'fn' is idempotent except for the 26526 * return value. The operation is considered a success if 26527 * it succeeds at least once on any one interface. 26528 */ 26529 error = fn(connp, checkonly, group, ire_gw->ire_src_addr, 26530 NULL, fmode, src, first_mp); 26531 if (error == 0) 26532 or->or_private = CGTP_MCAST_SUCCESS; 26533 26534 if (ip_debug > 0) { 26535 ulong_t off; 26536 char *ksym; 26537 ksym = kobj_getsymname((uintptr_t)fn, &off); 26538 ip2dbg(("ip_multirt_apply_membership: " 26539 "called %s, multirt group 0x%08x via itf 0x%08x, " 26540 "error %d [success %u]\n", 26541 ksym ? 
ksym : "?", 26542 ntohl(group), ntohl(ire_gw->ire_src_addr), 26543 error, or->or_private)); 26544 } 26545 26546 ire_refrele(ire_gw); 26547 if (error == EINPROGRESS) { 26548 IRB_REFRELE(irb); 26549 return (error); 26550 } 26551 } 26552 IRB_REFRELE(irb); 26553 /* 26554 * Consider the call as successful if we succeeded on at least 26555 * one interface. Otherwise, return the last encountered error. 26556 */ 26557 return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); 26558 } 26559 26560 26561 /* 26562 * Issue a warning regarding a route crossing an interface with an 26563 * incorrect MTU. Only one message every 'ip_multirt_log_interval' 26564 * amount of time is logged. 26565 */ 26566 static void 26567 ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) 26568 { 26569 hrtime_t current = gethrtime(); 26570 char buf[16]; 26571 26572 /* Convert interval in ms to hrtime in ns */ 26573 if (multirt_bad_mtu_last_time + 26574 ((hrtime_t)ip_multirt_log_interval * (hrtime_t)1000000) <= 26575 current) { 26576 cmn_err(CE_WARN, "ip: ignoring multiroute " 26577 "to %s, incorrect MTU %u (expected %u)\n", 26578 ip_dot_addr(ire->ire_addr, buf), 26579 ire->ire_max_frag, max_frag); 26580 26581 multirt_bad_mtu_last_time = current; 26582 } 26583 } 26584 26585 26586 /* 26587 * Get the CGTP (multirouting) filtering status. 26588 * If 0, the CGTP hooks are transparent. 26589 */ 26590 /* ARGSUSED */ 26591 static int 26592 ip_cgtp_filter_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 26593 { 26594 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 26595 26596 (void) mi_mpprintf(mp, "%d", (int)*ip_cgtp_filter_value); 26597 return (0); 26598 } 26599 26600 26601 /* 26602 * Set the CGTP (multirouting) filtering status. 26603 * If the status is changed from active to transparent 26604 * or from transparent to active, forward the new status 26605 * to the filtering module (if loaded). 
26606 */ 26607 /* ARGSUSED */ 26608 static int 26609 ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 26610 cred_t *ioc_cr) 26611 { 26612 long new_value; 26613 boolean_t *ip_cgtp_filter_value = (boolean_t *)cp; 26614 26615 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 26616 new_value < 0 || new_value > 1) { 26617 return (EINVAL); 26618 } 26619 26620 /* 26621 * Do not enable CGTP filtering - thus preventing the hooks 26622 * from being invoked - if the version number of the 26623 * filtering module hooks does not match. 26624 */ 26625 if ((ip_cgtp_filter_ops != NULL) && 26626 (ip_cgtp_filter_ops->cfo_filter_rev != CGTP_FILTER_REV)) { 26627 cmn_err(CE_WARN, "IP: CGTP filtering version mismatch " 26628 "(module hooks version %d, expecting %d)\n", 26629 ip_cgtp_filter_ops->cfo_filter_rev, CGTP_FILTER_REV); 26630 return (ENOTSUP); 26631 } 26632 26633 if ((!*ip_cgtp_filter_value) && new_value) { 26634 cmn_err(CE_NOTE, "IP: enabling CGTP filtering%s", 26635 ip_cgtp_filter_ops == NULL ? 26636 " (module not loaded)" : ""); 26637 } 26638 if (*ip_cgtp_filter_value && (!new_value)) { 26639 cmn_err(CE_NOTE, "IP: disabling CGTP filtering%s", 26640 ip_cgtp_filter_ops == NULL ? 26641 " (module not loaded)" : ""); 26642 } 26643 26644 if (ip_cgtp_filter_ops != NULL) { 26645 int res; 26646 if ((res = ip_cgtp_filter_ops->cfo_change_state(new_value))) { 26647 return (res); 26648 } 26649 } 26650 26651 *ip_cgtp_filter_value = (boolean_t)new_value; 26652 26653 return (0); 26654 } 26655 26656 26657 /* 26658 * Return the expected CGTP hooks version number. 26659 */ 26660 int 26661 ip_cgtp_filter_supported(void) 26662 { 26663 return (ip_cgtp_filter_rev); 26664 } 26665 26666 26667 /* 26668 * CGTP hooks can be registered by directly touching ip_cgtp_filter_ops 26669 * or by invoking this function. In the first case, the version number 26670 * of the registered structure is checked at hooks activation time 26671 * in ip_cgtp_filter_set(). 
26672 */ 26673 int 26674 ip_cgtp_filter_register(cgtp_filter_ops_t *ops) 26675 { 26676 if (ops->cfo_filter_rev != CGTP_FILTER_REV) 26677 return (ENOTSUP); 26678 26679 ip_cgtp_filter_ops = ops; 26680 return (0); 26681 } 26682 26683 static squeue_func_t 26684 ip_squeue_switch(int val) 26685 { 26686 squeue_func_t rval = squeue_fill; 26687 26688 switch (val) { 26689 case IP_SQUEUE_ENTER_NODRAIN: 26690 rval = squeue_enter_nodrain; 26691 break; 26692 case IP_SQUEUE_ENTER: 26693 rval = squeue_enter; 26694 break; 26695 default: 26696 break; 26697 } 26698 return (rval); 26699 } 26700 26701 /* ARGSUSED */ 26702 static int 26703 ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 26704 caddr_t addr, cred_t *cr) 26705 { 26706 int *v = (int *)addr; 26707 long new_value; 26708 26709 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 26710 return (EINVAL); 26711 26712 ip_input_proc = ip_squeue_switch(new_value); 26713 *v = new_value; 26714 return (0); 26715 } 26716 26717 /* ARGSUSED */ 26718 static int 26719 ip_int_set(queue_t *q, mblk_t *mp, char *value, 26720 caddr_t addr, cred_t *cr) 26721 { 26722 int *v = (int *)addr; 26723 long new_value; 26724 26725 if (ddi_strtol(value, NULL, 10, &new_value) != 0) 26726 return (EINVAL); 26727 26728 *v = new_value; 26729 return (0); 26730 } 26731 26732 static void 26733 ip_kstat_init(void) 26734 { 26735 ip_named_kstat_t template = { 26736 { "forwarding", KSTAT_DATA_UINT32, 0 }, 26737 { "defaultTTL", KSTAT_DATA_UINT32, 0 }, 26738 { "inReceives", KSTAT_DATA_UINT32, 0 }, 26739 { "inHdrErrors", KSTAT_DATA_UINT32, 0 }, 26740 { "inAddrErrors", KSTAT_DATA_UINT32, 0 }, 26741 { "forwDatagrams", KSTAT_DATA_UINT32, 0 }, 26742 { "inUnknownProtos", KSTAT_DATA_UINT32, 0 }, 26743 { "inDiscards", KSTAT_DATA_UINT32, 0 }, 26744 { "inDelivers", KSTAT_DATA_UINT32, 0 }, 26745 { "outRequests", KSTAT_DATA_UINT32, 0 }, 26746 { "outDiscards", KSTAT_DATA_UINT32, 0 }, 26747 { "outNoRoutes", KSTAT_DATA_UINT32, 0 }, 26748 { "reasmTimeout", KSTAT_DATA_UINT32, 0 }, 
26749 { "reasmReqds", KSTAT_DATA_UINT32, 0 }, 26750 { "reasmOKs", KSTAT_DATA_UINT32, 0 }, 26751 { "reasmFails", KSTAT_DATA_UINT32, 0 }, 26752 { "fragOKs", KSTAT_DATA_UINT32, 0 }, 26753 { "fragFails", KSTAT_DATA_UINT32, 0 }, 26754 { "fragCreates", KSTAT_DATA_UINT32, 0 }, 26755 { "addrEntrySize", KSTAT_DATA_INT32, 0 }, 26756 { "routeEntrySize", KSTAT_DATA_INT32, 0 }, 26757 { "netToMediaEntrySize", KSTAT_DATA_INT32, 0 }, 26758 { "routingDiscards", KSTAT_DATA_UINT32, 0 }, 26759 { "inErrs", KSTAT_DATA_UINT32, 0 }, 26760 { "noPorts", KSTAT_DATA_UINT32, 0 }, 26761 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 26762 { "reasmDuplicates", KSTAT_DATA_UINT32, 0 }, 26763 { "reasmPartDups", KSTAT_DATA_UINT32, 0 }, 26764 { "forwProhibits", KSTAT_DATA_UINT32, 0 }, 26765 { "udpInCksumErrs", KSTAT_DATA_UINT32, 0 }, 26766 { "udpInOverflows", KSTAT_DATA_UINT32, 0 }, 26767 { "rawipInOverflows", KSTAT_DATA_UINT32, 0 }, 26768 { "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 }, 26769 { "ipsecInFailed", KSTAT_DATA_INT32, 0 }, 26770 { "memberEntrySize", KSTAT_DATA_INT32, 0 }, 26771 { "inIPv6", KSTAT_DATA_UINT32, 0 }, 26772 { "outIPv6", KSTAT_DATA_UINT32, 0 }, 26773 { "outSwitchIPv6", KSTAT_DATA_UINT32, 0 }, 26774 }; 26775 26776 ip_mibkp = kstat_create("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED, 26777 NUM_OF_FIELDS(ip_named_kstat_t), 26778 0); 26779 if (!ip_mibkp) 26780 return; 26781 26782 template.forwarding.value.ui32 = WE_ARE_FORWARDING ? 
1:2; 26783 template.defaultTTL.value.ui32 = (uint32_t)ip_def_ttl; 26784 template.reasmTimeout.value.ui32 = ip_g_frag_timeout; 26785 template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t); 26786 template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t); 26787 26788 template.netToMediaEntrySize.value.i32 = 26789 sizeof (mib2_ipNetToMediaEntry_t); 26790 26791 template.memberEntrySize.value.i32 = sizeof (ipv6_member_t); 26792 26793 bcopy(&template, ip_mibkp->ks_data, sizeof (template)); 26794 26795 ip_mibkp->ks_update = ip_kstat_update; 26796 26797 kstat_install(ip_mibkp); 26798 } 26799 26800 static void 26801 ip_kstat_fini(void) 26802 { 26803 26804 if (ip_mibkp != NULL) { 26805 kstat_delete(ip_mibkp); 26806 ip_mibkp = NULL; 26807 } 26808 } 26809 26810 static int 26811 ip_kstat_update(kstat_t *kp, int rw) 26812 { 26813 ip_named_kstat_t *ipkp; 26814 26815 if (!kp || !kp->ks_data) 26816 return (EIO); 26817 26818 if (rw == KSTAT_WRITE) 26819 return (EACCES); 26820 26821 ipkp = (ip_named_kstat_t *)kp->ks_data; 26822 26823 ipkp->forwarding.value.ui32 = ip_mib.ipForwarding; 26824 ipkp->defaultTTL.value.ui32 = ip_mib.ipDefaultTTL; 26825 ipkp->inReceives.value.ui32 = ip_mib.ipInReceives; 26826 ipkp->inHdrErrors.value.ui32 = ip_mib.ipInHdrErrors; 26827 ipkp->inAddrErrors.value.ui32 = ip_mib.ipInAddrErrors; 26828 ipkp->forwDatagrams.value.ui32 = ip_mib.ipForwDatagrams; 26829 ipkp->inUnknownProtos.value.ui32 = ip_mib.ipInUnknownProtos; 26830 ipkp->inDiscards.value.ui32 = ip_mib.ipInDiscards; 26831 ipkp->inDelivers.value.ui32 = ip_mib.ipInDelivers; 26832 ipkp->outRequests.value.ui32 = ip_mib.ipOutRequests; 26833 ipkp->outDiscards.value.ui32 = ip_mib.ipOutDiscards; 26834 ipkp->outNoRoutes.value.ui32 = ip_mib.ipOutNoRoutes; 26835 ipkp->reasmTimeout.value.ui32 = ip_mib.ipReasmTimeout; 26836 ipkp->reasmReqds.value.ui32 = ip_mib.ipReasmReqds; 26837 ipkp->reasmOKs.value.ui32 = ip_mib.ipReasmOKs; 26838 ipkp->reasmFails.value.ui32 = ip_mib.ipReasmFails; 26839 
ipkp->fragOKs.value.ui32 = ip_mib.ipFragOKs; 26840 ipkp->fragFails.value.ui32 = ip_mib.ipFragFails; 26841 ipkp->fragCreates.value.ui32 = ip_mib.ipFragCreates; 26842 26843 ipkp->routingDiscards.value.ui32 = ip_mib.ipRoutingDiscards; 26844 ipkp->inErrs.value.ui32 = ip_mib.tcpInErrs; 26845 ipkp->noPorts.value.ui32 = ip_mib.udpNoPorts; 26846 ipkp->inCksumErrs.value.ui32 = ip_mib.ipInCksumErrs; 26847 ipkp->reasmDuplicates.value.ui32 = ip_mib.ipReasmDuplicates; 26848 ipkp->reasmPartDups.value.ui32 = ip_mib.ipReasmPartDups; 26849 ipkp->forwProhibits.value.ui32 = ip_mib.ipForwProhibits; 26850 ipkp->udpInCksumErrs.value.ui32 = ip_mib.udpInCksumErrs; 26851 ipkp->udpInOverflows.value.ui32 = ip_mib.udpInOverflows; 26852 ipkp->rawipInOverflows.value.ui32 = ip_mib.rawipInOverflows; 26853 ipkp->ipsecInSucceeded.value.ui32 = ip_mib.ipsecInSucceeded; 26854 ipkp->ipsecInFailed.value.i32 = ip_mib.ipsecInFailed; 26855 26856 ipkp->inIPv6.value.ui32 = ip_mib.ipInIPv6; 26857 ipkp->outIPv6.value.ui32 = ip_mib.ipOutIPv6; 26858 ipkp->outSwitchIPv6.value.ui32 = ip_mib.ipOutSwitchIPv6; 26859 26860 return (0); 26861 } 26862 26863 static void 26864 icmp_kstat_init(void) 26865 { 26866 icmp_named_kstat_t template = { 26867 { "inMsgs", KSTAT_DATA_UINT32 }, 26868 { "inErrors", KSTAT_DATA_UINT32 }, 26869 { "inDestUnreachs", KSTAT_DATA_UINT32 }, 26870 { "inTimeExcds", KSTAT_DATA_UINT32 }, 26871 { "inParmProbs", KSTAT_DATA_UINT32 }, 26872 { "inSrcQuenchs", KSTAT_DATA_UINT32 }, 26873 { "inRedirects", KSTAT_DATA_UINT32 }, 26874 { "inEchos", KSTAT_DATA_UINT32 }, 26875 { "inEchoReps", KSTAT_DATA_UINT32 }, 26876 { "inTimestamps", KSTAT_DATA_UINT32 }, 26877 { "inTimestampReps", KSTAT_DATA_UINT32 }, 26878 { "inAddrMasks", KSTAT_DATA_UINT32 }, 26879 { "inAddrMaskReps", KSTAT_DATA_UINT32 }, 26880 { "outMsgs", KSTAT_DATA_UINT32 }, 26881 { "outErrors", KSTAT_DATA_UINT32 }, 26882 { "outDestUnreachs", KSTAT_DATA_UINT32 }, 26883 { "outTimeExcds", KSTAT_DATA_UINT32 }, 26884 { "outParmProbs", KSTAT_DATA_UINT32 }, 
26885 { "outSrcQuenchs", KSTAT_DATA_UINT32 }, 26886 { "outRedirects", KSTAT_DATA_UINT32 }, 26887 { "outEchos", KSTAT_DATA_UINT32 }, 26888 { "outEchoReps", KSTAT_DATA_UINT32 }, 26889 { "outTimestamps", KSTAT_DATA_UINT32 }, 26890 { "outTimestampReps", KSTAT_DATA_UINT32 }, 26891 { "outAddrMasks", KSTAT_DATA_UINT32 }, 26892 { "outAddrMaskReps", KSTAT_DATA_UINT32 }, 26893 { "inChksumErrs", KSTAT_DATA_UINT32 }, 26894 { "inUnknowns", KSTAT_DATA_UINT32 }, 26895 { "inFragNeeded", KSTAT_DATA_UINT32 }, 26896 { "outFragNeeded", KSTAT_DATA_UINT32 }, 26897 { "outDrops", KSTAT_DATA_UINT32 }, 26898 { "inOverFlows", KSTAT_DATA_UINT32 }, 26899 { "inBadRedirects", KSTAT_DATA_UINT32 }, 26900 }; 26901 26902 icmp_mibkp = kstat_create("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED, 26903 NUM_OF_FIELDS(icmp_named_kstat_t), 26904 0); 26905 if (icmp_mibkp == NULL) 26906 return; 26907 26908 bcopy(&template, icmp_mibkp->ks_data, sizeof (template)); 26909 26910 icmp_mibkp->ks_update = icmp_kstat_update; 26911 26912 kstat_install(icmp_mibkp); 26913 } 26914 26915 static void 26916 icmp_kstat_fini(void) 26917 { 26918 26919 if (icmp_mibkp != NULL) { 26920 kstat_delete(icmp_mibkp); 26921 icmp_mibkp = NULL; 26922 } 26923 } 26924 26925 static int 26926 icmp_kstat_update(kstat_t *kp, int rw) 26927 { 26928 icmp_named_kstat_t *icmpkp; 26929 26930 if ((kp == NULL) || (kp->ks_data == NULL)) 26931 return (EIO); 26932 26933 if (rw == KSTAT_WRITE) 26934 return (EACCES); 26935 26936 icmpkp = (icmp_named_kstat_t *)kp->ks_data; 26937 26938 icmpkp->inMsgs.value.ui32 = icmp_mib.icmpInMsgs; 26939 icmpkp->inErrors.value.ui32 = icmp_mib.icmpInErrors; 26940 icmpkp->inDestUnreachs.value.ui32 = icmp_mib.icmpInDestUnreachs; 26941 icmpkp->inTimeExcds.value.ui32 = icmp_mib.icmpInTimeExcds; 26942 icmpkp->inParmProbs.value.ui32 = icmp_mib.icmpInParmProbs; 26943 icmpkp->inSrcQuenchs.value.ui32 = icmp_mib.icmpInSrcQuenchs; 26944 icmpkp->inRedirects.value.ui32 = icmp_mib.icmpInRedirects; 26945 icmpkp->inEchos.value.ui32 = 
icmp_mib.icmpInEchos; 26946 icmpkp->inEchoReps.value.ui32 = icmp_mib.icmpInEchoReps; 26947 icmpkp->inTimestamps.value.ui32 = icmp_mib.icmpInTimestamps; 26948 icmpkp->inTimestampReps.value.ui32 = icmp_mib.icmpInTimestampReps; 26949 icmpkp->inAddrMasks.value.ui32 = icmp_mib.icmpInAddrMasks; 26950 icmpkp->inAddrMaskReps.value.ui32 = icmp_mib.icmpInAddrMaskReps; 26951 icmpkp->outMsgs.value.ui32 = icmp_mib.icmpOutMsgs; 26952 icmpkp->outErrors.value.ui32 = icmp_mib.icmpOutErrors; 26953 icmpkp->outDestUnreachs.value.ui32 = icmp_mib.icmpOutDestUnreachs; 26954 icmpkp->outTimeExcds.value.ui32 = icmp_mib.icmpOutTimeExcds; 26955 icmpkp->outParmProbs.value.ui32 = icmp_mib.icmpOutParmProbs; 26956 icmpkp->outSrcQuenchs.value.ui32 = icmp_mib.icmpOutSrcQuenchs; 26957 icmpkp->outRedirects.value.ui32 = icmp_mib.icmpOutRedirects; 26958 icmpkp->outEchos.value.ui32 = icmp_mib.icmpOutEchos; 26959 icmpkp->outEchoReps.value.ui32 = icmp_mib.icmpOutEchoReps; 26960 icmpkp->outTimestamps.value.ui32 = icmp_mib.icmpOutTimestamps; 26961 icmpkp->outTimestampReps.value.ui32 = icmp_mib.icmpOutTimestampReps; 26962 icmpkp->outAddrMasks.value.ui32 = icmp_mib.icmpOutAddrMasks; 26963 icmpkp->outAddrMaskReps.value.ui32 = icmp_mib.icmpOutAddrMaskReps; 26964 icmpkp->inCksumErrs.value.ui32 = icmp_mib.icmpInCksumErrs; 26965 icmpkp->inUnknowns.value.ui32 = icmp_mib.icmpInUnknowns; 26966 icmpkp->inFragNeeded.value.ui32 = icmp_mib.icmpInFragNeeded; 26967 icmpkp->outFragNeeded.value.ui32 = icmp_mib.icmpOutFragNeeded; 26968 icmpkp->outDrops.value.ui32 = icmp_mib.icmpOutDrops; 26969 icmpkp->inOverflows.value.ui32 = icmp_mib.icmpInOverflows; 26970 icmpkp->inBadRedirects.value.ui32 = icmp_mib.icmpInBadRedirects; 26971 26972 return (0); 26973 } 26974 26975 /* 26976 * This is the fanout function for raw socket opened for SCTP. Note 26977 * that it is called after SCTP checks that there is no socket which 26978 * wants a packet. 
Then before SCTP handles this out of the blue packet, 26979 * this function is called to see if there is any raw socket for SCTP. 26980 * If there is and it is bound to the correct address, the packet will 26981 * be sent to that socket. Note that only one raw socket can be bound to 26982 * a port. This is assured in ipcl_sctp_hash_insert(); 26983 */ 26984 void 26985 ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, 26986 uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, 26987 uint_t ipif_seqid, zoneid_t zoneid) 26988 { 26989 conn_t *connp; 26990 queue_t *rq; 26991 mblk_t *first_mp; 26992 boolean_t secure; 26993 ip6_t *ip6h; 26994 26995 first_mp = mp; 26996 if (mctl_present) { 26997 mp = first_mp->b_cont; 26998 secure = ipsec_in_is_secure(first_mp); 26999 ASSERT(mp != NULL); 27000 } else { 27001 secure = B_FALSE; 27002 } 27003 ip6h = (isv4) ? NULL : (ip6_t *)ipha; 27004 27005 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha); 27006 if (connp == NULL) { 27007 sctp_ootb_input(first_mp, recv_ill, ipif_seqid, zoneid, 27008 mctl_present); 27009 return; 27010 } 27011 rq = connp->conn_rq; 27012 if (!canputnext(rq)) { 27013 CONN_DEC_REF(connp); 27014 BUMP_MIB(&ip_mib, rawipInOverflows); 27015 freemsg(first_mp); 27016 return; 27017 } 27018 if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp) : 27019 CONN_INBOUND_POLICY_PRESENT_V6(connp)) || secure) { 27020 first_mp = ipsec_check_inbound_policy(first_mp, connp, 27021 (isv4 ? ipha : NULL), ip6h, mctl_present); 27022 if (first_mp == NULL) { 27023 CONN_DEC_REF(connp); 27024 return; 27025 } 27026 } 27027 /* 27028 * We probably should not send M_CTL message up to 27029 * raw socket. 27030 */ 27031 if (mctl_present) 27032 freeb(first_mp); 27033 27034 /* Initiate IPPF processing here if needed. 
*/ 27035 if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN) && ip_policy) || 27036 (!isv4 && IP6_IN_IPP(flags))) { 27037 ip_process(IPP_LOCAL_IN, &mp, 27038 recv_ill->ill_phyint->phyint_ifindex); 27039 if (mp == NULL) { 27040 CONN_DEC_REF(connp); 27041 return; 27042 } 27043 } 27044 27045 if (connp->conn_recvif || connp->conn_recvslla || 27046 ((connp->conn_ipv6_recvpktinfo || 27047 (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && 27048 (flags & IP_FF_IP6INFO))) { 27049 int in_flags = 0; 27050 27051 if (connp->conn_recvif || connp->conn_ipv6_recvpktinfo) { 27052 in_flags = IPF_RECVIF; 27053 } 27054 if (connp->conn_recvslla) { 27055 in_flags |= IPF_RECVSLLA; 27056 } 27057 if (isv4) { 27058 mp = ip_add_info(mp, recv_ill, in_flags); 27059 } else { 27060 mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); 27061 if (mp == NULL) { 27062 CONN_DEC_REF(connp); 27063 return; 27064 } 27065 } 27066 } 27067 27068 BUMP_MIB(&ip_mib, ipInDelivers); 27069 /* 27070 * We are sending the IPSEC_IN message also up. Refer 27071 * to comments above this function. 
27072 */ 27073 putnext(rq, mp); 27074 CONN_DEC_REF(connp); 27075 } 27076 27077 /* 27078 * Martian Address Filtering [RFC 1812, Section 5.3.7] 27079 */ 27080 static boolean_t 27081 ip_no_forward(ipha_t *ipha, ill_t *ill) 27082 { 27083 ipaddr_t ip_src, ip_dst; 27084 ire_t *src_ire = NULL; 27085 27086 ip_src = ntohl(ipha->ipha_src); 27087 ip_dst = ntohl(ipha->ipha_dst); 27088 27089 if (ip_dst == INADDR_ANY) 27090 goto dont_forward; 27091 27092 if (IN_CLASSD(ip_src)) 27093 goto dont_forward; 27094 27095 if ((ip_src >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) 27096 goto dont_forward; 27097 27098 if (IN_BADCLASS(ip_dst)) 27099 goto dont_forward; 27100 27101 src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, 27102 ALL_ZONES, NULL, MATCH_IRE_TYPE); 27103 if (src_ire != NULL) { 27104 ire_refrele(src_ire); 27105 goto dont_forward; 27106 } 27107 27108 return (B_FALSE); 27109 27110 dont_forward: 27111 if (ip_debug > 2) { 27112 printf("ip_no_forward: dropping packet received on %s\n", 27113 ill->ill_name); 27114 pr_addr_dbg("ip_no_forward: from src %s\n", 27115 AF_INET, &ipha->ipha_src); 27116 pr_addr_dbg("ip_no_forward: to dst %s\n", 27117 AF_INET, &ipha->ipha_dst); 27118 } 27119 BUMP_MIB(&ip_mib, ipForwProhibits); 27120 return (B_TRUE); 27121 } 27122 27123 static boolean_t 27124 ip_loopback_src_or_dst(ipha_t *ipha, ill_t *ill) 27125 { 27126 if (((ntohl(ipha->ipha_src) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) || 27127 ((ntohl(ipha->ipha_dst) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { 27128 if (ip_debug > 2) { 27129 if (ill != NULL) { 27130 printf("ip_loopback_src_or_dst: " 27131 "dropping packet received on %s\n", 27132 ill->ill_name); 27133 } else { 27134 printf("ip_loopback_src_or_dst: " 27135 "dropping packet\n"); 27136 } 27137 27138 pr_addr_dbg( 27139 "ip_loopback_src_or_dst: from src %s\n", 27140 AF_INET, &ipha->ipha_src); 27141 pr_addr_dbg( 27142 "ip_loopback_src_or_dst: to dst %s\n", 27143 AF_INET, &ipha->ipha_dst); 27144 } 27145 27146 BUMP_MIB(&ip_mib, 
ipInAddrErrors); 27147 return (B_TRUE); 27148 } 27149 return (B_FALSE); 27150 } 27151 27152 /* 27153 * Return B_TRUE if the buffers differ in length or content. 27154 * This is used for comparing extension header buffers. 27155 * Note that an extension header would be declared different 27156 * even if all that changed was the next header value in that header i.e. 27157 * what really changed is the next extension header. 27158 */ 27159 boolean_t 27160 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf, 27161 uint_t blen) 27162 { 27163 if (!b_valid) 27164 blen = 0; 27165 27166 if (alen != blen) 27167 return (B_TRUE); 27168 if (alen == 0) 27169 return (B_FALSE); /* Both zero length */ 27170 return (bcmp(abuf, bbuf, alen)); 27171 } 27172 27173 /* 27174 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok. 27175 * Return B_FALSE if memory allocation fails - don't change any state! 27176 */ 27177 boolean_t 27178 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 27179 const void *src, uint_t srclen) 27180 { 27181 void *dst; 27182 27183 if (!src_valid) 27184 srclen = 0; 27185 27186 ASSERT(*dstlenp == 0); 27187 if (src != NULL && srclen != 0) { 27188 dst = mi_alloc(srclen, BPRI_MED); 27189 if (dst == NULL) 27190 return (B_FALSE); 27191 } else { 27192 dst = NULL; 27193 } 27194 if (*dstp != NULL) 27195 mi_free(*dstp); 27196 *dstp = dst; 27197 *dstlenp = dst == NULL ? 0 : srclen; 27198 return (B_TRUE); 27199 } 27200 27201 /* 27202 * Replace what is in *dst, *dstlen with the source. 27203 * Assumes ip_allocbuf has already been called. 27204 */ 27205 void 27206 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, 27207 const void *src, uint_t srclen) 27208 { 27209 if (!src_valid) 27210 srclen = 0; 27211 27212 ASSERT(*dstlenp == srclen); 27213 if (src != NULL && srclen != 0) 27214 bcopy(src, *dstp, srclen); 27215 } 27216 27217 /* 27218 * Free the storage pointed to by the members of an ip6_pkt_t. 
27219 */ 27220 void 27221 ip6_pkt_free(ip6_pkt_t *ipp) 27222 { 27223 ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); 27224 27225 if (ipp->ipp_fields & IPPF_HOPOPTS) { 27226 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); 27227 ipp->ipp_hopopts = NULL; 27228 ipp->ipp_hopoptslen = 0; 27229 } 27230 if (ipp->ipp_fields & IPPF_RTDSTOPTS) { 27231 kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 27232 ipp->ipp_rtdstopts = NULL; 27233 ipp->ipp_rtdstoptslen = 0; 27234 } 27235 if (ipp->ipp_fields & IPPF_DSTOPTS) { 27236 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); 27237 ipp->ipp_dstopts = NULL; 27238 ipp->ipp_dstoptslen = 0; 27239 } 27240 if (ipp->ipp_fields & IPPF_RTHDR) { 27241 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); 27242 ipp->ipp_rthdr = NULL; 27243 ipp->ipp_rthdrlen = 0; 27244 } 27245 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 27246 IPPF_RTHDR); 27247 } 27248